Merge branch 'devel'

2026-06-08 15:05:15 +02:00 · 2015-09-08 17:26:23 +01:00 · 2015-09-08 17:26:23 +01:00 · 90b966bc5b
commit 90b966bc5b
parent 4aa4a29891 cf2673632b
198 changed files with 65987 additions and 3344 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -17,8 +17,9 @@ before_install:
  - sudo ln -s /run/shm /dev/shm

 install:
-  - conda install --yes python=$TRAVIS_PYTHON_VERSION atlas numpy=1.7 scipy=0.12 matplotlib nose sphinx pip nose
-  - pip install . 
+  - conda install --yes python=$TRAVIS_PYTHON_VERSION atlas numpy=1.9 scipy=0.16 matplotlib nose sphinx pip nose
+  #- pip install . 
+  - python setup.py build_ext --inplace
  #--use-mirrors
  #
 # command to run tests, e.g. python setup.py test
--- a/AUTHORS.txt
+++ b/AUTHORS.txt
@ -5,3 +5,4 @@ Nicolas Durrande
 Alan Saul
 Max Zwiessele
 Neil D. Lawrence
+Zhenwen Dai
--- a/GPy/init.py
+++ b/GPy/init.py
@ -3,23 +3,23 @@
 import warnings
 warnings.filterwarnings("ignore", category=DeprecationWarning)

-import core
-from core.parameterization import transformations, priors
+from . import core
+from .core.parameterization import transformations, priors
 constraints = transformations
-import models
-import mappings
-import inference
-import util
-import examples
-import likelihoods
-import testing
+from . import models
+from . import mappings
+from . import inference
+from . import util
+from . import examples
+from . import likelihoods
+from . import testing
 from numpy.testing import Tester
-import kern
-import plotting
+from . import kern
+from . import plotting

 # Direct imports for convenience:
-from core import Model
-from core.parameterization import Param, Parameterized, ObsAr
+from .core import Model
+from .core.parameterization import Param, Parameterized, ObsAr

 #@nottest
 try:
--- a/GPy/core/init.py
+++ b/GPy/core/init.py
@ -1,11 +1,12 @@
 # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from model import *
-from parameterization.parameterized import adjust_name_for_printing, Parameterizable
-from parameterization.param import Param, ParamConcatenation
-from parameterization.observable_array import ObsAr
+from .model import *
+from .parameterization.parameterized import adjust_name_for_printing, Parameterizable
+from .parameterization.param import Param, ParamConcatenation
+from .parameterization.observable_array import ObsAr

-from gp import GP
-from sparse_gp import SparseGP
-from mapping import *
+from .gp import GP
+from .svgp import SVGP
+from .sparse_gp import SparseGP
+from .mapping import *
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@ -4,13 +4,15 @@
 import numpy as np
 import sys
 from .. import kern
-from model import Model
-from parameterization import ObsAr
+from .model import Model
+from .parameterization import ObsAr
+from .mapping import Mapping
 from .. import likelihoods
 from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
-from parameterization.variational import VariationalPosterior
+from .parameterization.variational import VariationalPosterior

 import logging
+import warnings
 from GPy.util.normalizer import MeanNorm
 logger = logging.getLogger("GP")

@ -34,7 +36,7 @@ class GP(Model):


    """
-    def __init__(self, X, Y, kernel, likelihood, inference_method=None, name='gp', Y_metadata=None, normalizer=False):
+    def __init__(self, X, Y, kernel, likelihood, mean_function=None, inference_method=None, name='gp', Y_metadata=None, normalizer=False):
        super(GP, self).__init__(name)

        assert X.ndim == 2
@ -58,14 +60,20 @@ class GP(Model):
            self.normalizer.scale_by(Y)
            self.Y_normalized = ObsAr(self.normalizer.normalize(Y))
            self.Y = Y
-        else:
+        elif isinstance(Y, np.ndarray):
            self.Y = ObsAr(Y)
            self.Y_normalized = self.Y
+        else:
+            self.Y = Y

-        assert Y.shape[0] == self.num_data
+        if Y.shape[0] != self.num_data:
+            #There can be cases where we want more inputs than outputs, for example if we have multiple latent
+            #function values
+            warnings.warn("There are more rows in your input data X, \
+                         than in your output data Y, be VERY sure this is what you want")
        _, self.output_dim = self.Y.shape

-        #TODO: check the type of this is okay?
+        assert ((Y_metadata is None) or isinstance(Y_metadata, dict))
        self.Y_metadata = Y_metadata

        assert isinstance(kernel, kern.Kern)
@ -75,6 +83,14 @@ class GP(Model):
        assert isinstance(likelihood, likelihoods.Likelihood)
        self.likelihood = likelihood

+        #handle the mean function
+        self.mean_function = mean_function
+        if mean_function is not None:
+            assert isinstance(self.mean_function, Mapping)
+            assert mean_function.input_dim == self.input_dim
+            assert mean_function.output_dim == self.output_dim
+            self.link_parameter(mean_function)
+
        #find a sensible inference method
        logger.info("initializing inference method")
        if inference_method is None:
@ -82,12 +98,27 @@ class GP(Model):
                inference_method = exact_gaussian_inference.ExactGaussianInference()
            else:
                inference_method = expectation_propagation.EP()
-                print "defaulting to ", inference_method, "for latent function inference"
+                print("defaulting to ", inference_method, "for latent function inference")
        self.inference_method = inference_method

        logger.info("adding kernel and likelihood as parameters")
        self.link_parameter(self.kern)
        self.link_parameter(self.likelihood)
+        self.posterior = None
+
+        # The predictive variable to be used to predict using the posterior object's
+        # woodbury_vector and woodbury_inv is defined as predictive_variable
+        # as long as the posterior has the right woodbury entries.
+        # It is the input variable used for the covariance between
+        # X_star and the posterior of the GP.
+        # This is usually just a link to self.X (full GP) or self.Z (sparse GP).
+        # Make sure to name this variable and the predict functions will "just work"
+        # In maths, the predictive covariance written in terms of the predictive variable p is:
+        #         K_{xx} - K_{xp}W_{pp}^{-1}K_{px}
+        #         W_{pp} := \texttt{Woodbury inv}
+        #         p := _predictive_variable
+        self._predictive_variable = self.X
+

    def set_XY(self, X=None, Y=None):
        """
@ -115,16 +146,15 @@ class GP(Model):
                    assert isinstance(X, type(self.X)), "The given X must have the same type as the X in the model!"
                    self.unlink_parameter(self.X)
                    self.X = X
-                    self.link_parameters(self.X)
+                    self.link_parameter(self.X)
                else:
                    self.unlink_parameter(self.X)
                    from ..core import Param
                    self.X = Param('latent mean',X)
-                    self.link_parameters(self.X)
+                    self.link_parameter(self.X)
            else:
                self.X = ObsAr(X)
        self.update_model(True)
-        self._trigger_params_changed()

    def set_X(self,X):
        """
@ -153,9 +183,11 @@ class GP(Model):
            This method is not designed to be called manually, the framework is set up to automatically call this method upon changes to parameters, if you call
            this method yourself, there may be unexpected consequences.
        """
-        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.Y_metadata)
+        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.mean_function, self.Y_metadata)
        self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
        self.kern.update_gradients_full(self.grad_dict['dL_dK'], self.X)
+        if self.mean_function is not None:
+            self.mean_function.update_gradients(self.grad_dict['dL_dm'], self.X)

    def log_likelihood(self):
        """
@ -163,7 +195,7 @@ class GP(Model):
        """
        return self._log_marginal_likelihood

-    def _raw_predict(self, _Xnew, full_cov=False, kern=None):
+    def _raw_predict(self, Xnew, full_cov=False, kern=None):
        """
        For making predictions, does not account for normalization or likelihood

@ -179,19 +211,33 @@ class GP(Model):
        if kern is None:
            kern = self.kern

-        Kx = kern.K(_Xnew, self.X).T
-        WiKx = np.dot(self.posterior.woodbury_inv, Kx)
+        Kx = kern.K(self._predictive_variable, Xnew)
        mu = np.dot(Kx.T, self.posterior.woodbury_vector)
+        if len(mu.shape)==1:
+            mu = mu.reshape(-1,1)
        if full_cov:
-            Kxx = kern.K(_Xnew)
-            var = Kxx - np.dot(Kx.T, WiKx)
+            Kxx = kern.K(Xnew)
+            if self.posterior.woodbury_inv.ndim == 2:
+                var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
+            elif self.posterior.woodbury_inv.ndim == 3: # Missing data
+                var = np.empty((Kxx.shape[0],Kxx.shape[1],self.posterior.woodbury_inv.shape[2]))
+                from ..util.linalg import mdot
+                for i in range(var.shape[2]):
+                    var[:, :, i] = (Kxx - mdot(Kx.T, self.posterior.woodbury_inv[:, :, i], Kx))
+            var = var
        else:
-            Kxx = kern.Kdiag(_Xnew)
-            var = Kxx - np.sum(WiKx*Kx, 0)
-            var = var.reshape(-1, 1)
+            Kxx = kern.Kdiag(Xnew)
+            if self.posterior.woodbury_inv.ndim == 2:
+                var = (Kxx - np.sum(np.dot(self.posterior.woodbury_inv.T, Kx) * Kx, 0))[:,None]
+            elif self.posterior.woodbury_inv.ndim == 3: # Missing data
+                var = np.empty((Kxx.shape[0],self.posterior.woodbury_inv.shape[2]))
+                for i in range(var.shape[1]):
+                    var[:, i] = (Kxx - (np.sum(np.dot(self.posterior.woodbury_inv[:, :, i].T, Kx) * Kx, 0)))
+            var = var
+        #add in the mean function
+        if self.mean_function is not None:
+            mu += self.mean_function.f(Xnew)

-        #force mu to be a column vector
-        if len(mu.shape)==1: mu = mu[:,None]
        return mu, var

    def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None):
@ -206,13 +252,14 @@ class GP(Model):
        :param Y_metadata: metadata about the predicting point to pass to the likelihood
        :param kern: The kernel to use for prediction (defaults to the model
                     kern). this is useful for examining e.g. subprocesses.
-        :returns: (mean, var, lower_upper):
+        :returns: (mean, var):
            mean: posterior mean, a Numpy array, Nnew x self.output_dim
            var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
-            lower_upper: lower and upper boundaries of the 95% confidence intervals, Numpy arrays,  Nnew x self.input_dim

           If full_cov and self.output_dim > 1, the return shape of var is Nnew x Nnew x self.output_dim. If self.output_dim == 1, the return shape is Nnew x Nnew.
           This is to allow for different normalizations of the output dimensions.
+
+        Note: If you want the predictive quantiles (e.g. 95% confidence interval) use :py:func:`~GPy.core.gp.GP.predict_quantiles`.
        """
        #predict the latent function values
        mu, var = self._raw_predict(Xnew, full_cov=full_cov, kern=kern)
@ -220,10 +267,10 @@ class GP(Model):
            mu, var = self.normalizer.inverse_mean(mu), self.normalizer.inverse_variance(var)

        # now push through likelihood
-        mean, var = self.likelihood.predictive_values(mu, var, full_cov, Y_metadata)
+        mean, var = self.likelihood.predictive_values(mu, var, full_cov, Y_metadata=Y_metadata)
        return mean, var

-    def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None):
+    def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None, kern=None):
        """
        Get the predictive quantiles around the prediction at X

@ -231,22 +278,26 @@ class GP(Model):
        :type X: np.ndarray (Xnew x self.input_dim)
        :param quantiles: tuple of quantiles, default is (2.5, 97.5) which is the 95% interval
        :type quantiles: tuple
+        :param kern: optional kernel to use for prediction
+        :type kern: :py:class:`~GPy.kern.Kern` or None
        :returns: list of quantiles for each X and predictive quantiles for interval combination
-        :rtype: [np.ndarray (Xnew x self.input_dim), np.ndarray (Xnew x self.input_dim)]
+        :rtype: [np.ndarray (Xnew x self.output_dim), np.ndarray (Xnew x self.output_dim)]
        """
-        m, v = self._raw_predict(X,  full_cov=False)
+        m, v = self._raw_predict(X,  full_cov=False, kern=kern)
        if self.normalizer is not None:
            m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
-        return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata)
+        return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata=Y_metadata)

    def predictive_gradients(self, Xnew):
        """
-        Compute the derivatives of the latent function with respect to X*
+        Compute the derivatives of the predicted latent function with respect to X*

        Given a set of points at which to predict X* (size [N*,Q]), compute the
        derivatives of the mean and variance. Resulting arrays are sized:
         dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP (usually one).

+        Note that this is not the same as computing the mean and variance of the derivative of the function!
+
         dv_dX*  -- [N*, Q],    (since all outputs have the same variance)
        :param X: The points at which to get the predictive gradients
        :type X: np.ndarray (Xnew x self.input_dim)
@ -266,6 +317,120 @@ class GP(Model):
        return dmu_dX, dv_dX


+    def predict_jacobian(self, Xnew, kern=None, full_cov=True):
+        """
+        Compute the derivatives of the posterior of the GP.
+
+        Given a set of points at which to predict X* (size [N*,Q]), compute the
+        mean and variance of the derivative. Resulting arrays are sized:
+
+         dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP (usually one).
+          Note that this is the mean and variance of the derivative,
+          not the derivative of the mean and variance! (See predictive_gradients for that)
+
+         dv_dX*  -- [N*, Q],    (since all outputs have the same variance)
+          If there is missing data, it is not implemented for now, but
+          there will be one output variance per output dimension.
+
+        :param Xnew: The points at which to get the predictive gradients.
+        :type Xnew: np.ndarray (Xnew x self.input_dim)
+        :param kern: The kernel to compute the jacobian for.
+        :param boolean full_cov: whether to return the full covariance of the jacobian.
+
+        :returns: dmu_dX, dv_dX
+        :rtype: [np.ndarray (N*, Q ,D), np.ndarray (N*,Q,(D)) ]
+
+        Note: We always return sum in input_dim gradients, as the off-diagonals
+        in the input_dim are not needed for further calculations.
+        This is a compromise for increase in speed. Mathematically the jacobian would
+        have another dimension in Q.
+        """
+        if kern is None:
+            kern = self.kern
+
+        mean_jac = np.empty((Xnew.shape[0],Xnew.shape[1],self.output_dim))
+
+        for i in range(self.output_dim):
+            mean_jac[:,:,i] = kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1].T, Xnew, self._predictive_variable)
+
+        dK_dXnew_full = np.empty((self._predictive_variable.shape[0], Xnew.shape[0], Xnew.shape[1]))
+        for i in range(self._predictive_variable.shape[0]):
+            dK_dXnew_full[i] = kern.gradients_X([[1.]], Xnew, self._predictive_variable[[i]])
+
+        if full_cov:
+            dK2_dXdX = kern.gradients_XX([[1.]], Xnew)
+        else:
+            dK2_dXdX = kern.gradients_XX_diag([[1.]], Xnew)
+
+        def compute_cov_inner(wi):
+            if full_cov:
+                # full covariance gradients:
+                var_jac = dK2_dXdX - np.einsum('qnm,miq->niq', dK_dXnew_full.T.dot(wi), dK_dXnew_full)
+            else:
+                var_jac = dK2_dXdX - np.einsum('qim,miq->iq', dK_dXnew_full.T.dot(wi), dK_dXnew_full)
+            return var_jac
+
+        if self.posterior.woodbury_inv.ndim == 3: # Missing data:
+            if full_cov:
+                var_jac = np.empty((Xnew.shape[0],Xnew.shape[0],Xnew.shape[1],self.output_dim))
+                for d in range(self.posterior.woodbury_inv.shape[2]):
+                    var_jac[:, :, :, d] = compute_cov_inner(self.posterior.woodbury_inv[:, :, d])
+            else:
+                var_jac = np.empty((Xnew.shape[0],Xnew.shape[1],self.output_dim))
+                for d in range(self.posterior.woodbury_inv.shape[2]):
+                    var_jac[:, :, d] = compute_cov_inner(self.posterior.woodbury_inv[:, :, d])
+        else:
+            var_jac = compute_cov_inner(self.posterior.woodbury_inv)
+        return mean_jac, var_jac
+
+    def predict_wishard_embedding(self, Xnew, kern=None, mean=True, covariance=True):
+        """
+        Predict the Wishart embedding G of the GP. This is the density of the
+        input of the GP defined by the probabilistic function mapping f.
+        G = J_mean.T*J_mean + output_dim*J_cov.
+
+        :param array-like Xnew: The points at which to evaluate the magnification.
+        :param :py:class:`~GPy.kern.Kern` kern: The kernel to use for the magnification.
+
+        Supplying only a part of the learning kernel gives insights into the density
+        of the specific kernel part of the input function. E.g. one can see how dense the
+        linear part of a kernel is compared to the non-linear part etc.
+        """
+        if kern is None:
+            kern = self.kern
+
+        mu_jac, var_jac = self.predict_jacobian(Xnew, kern, full_cov=False)
+        mumuT = np.einsum('iqd,ipd->iqp', mu_jac, mu_jac)
+        Sigma = np.zeros(mumuT.shape)
+        if var_jac.ndim == 3:
+            Sigma[(slice(None), )+np.diag_indices(Xnew.shape[1], 2)] = var_jac.sum(-1)
+        else:
+            Sigma[(slice(None), )+np.diag_indices(Xnew.shape[1], 2)] = self.output_dim*var_jac
+        G = 0.
+        if mean:
+            G += mumuT
+        if covariance:
+            G += Sigma
+        return G
+
+    def predict_magnification(self, Xnew, kern=None, mean=True, covariance=True):
+        """
+        Predict the magnification factor as
+
+        sqrt(det(G))
+
+        for each point N in Xnew
+        """
+        G = self.predict_wishard_embedding(Xnew, kern, mean, covariance)
+        from ..util.linalg import jitchol
+        mag = np.empty(Xnew.shape[0])
+        for n in range(Xnew.shape[0]):
+            try:
+                mag[n] = np.sqrt(np.exp(2*np.sum(np.log(np.diag(jitchol(G[n, :, :]))))))
+            except:
+                mag[n] = np.sqrt(np.linalg.det(G[n, :, :]))
+        return mag
+
    def posterior_samples_f(self,X,size=10, full_cov=True):
        """
        Samples the posterior GP at the points X.
@ -276,7 +441,7 @@ class GP(Model):
        :type size: int.
        :param full_cov: whether to return the full covariance matrix, or just the diagonal.
        :type full_cov: bool.
-        :returns: Ysim: set of simulations
+        :returns: fsim: set of simulations
        :rtype: np.ndarray (N x samples)
        """
        m, v = self._raw_predict(X,  full_cov=full_cov)
@ -284,11 +449,11 @@ class GP(Model):
            m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
        v = v.reshape(m.size,-1) if len(v.shape)==3 else v
        if not full_cov:
-            Ysim = np.random.multivariate_normal(m.flatten(), np.diag(v.flatten()), size).T
+            fsim = np.random.multivariate_normal(m.flatten(), np.diag(v.flatten()), size).T
        else:
-            Ysim = np.random.multivariate_normal(m.flatten(), v, size).T
+            fsim = np.random.multivariate_normal(m.flatten(), v, size).T

-        return Ysim
+        return fsim

    def posterior_samples(self, X, size=10, full_cov=False, Y_metadata=None):
        """
@ -304,16 +469,16 @@ class GP(Model):
        :type noise_model: integer.
        :returns: Ysim: set of simulations, a Numpy array (N x samples).
        """
-        Ysim = self.posterior_samples_f(X, size, full_cov=full_cov)
-        Ysim = self.likelihood.samples(Ysim, Y_metadata)
-
+        fsim = self.posterior_samples_f(X, size, full_cov=full_cov)
+        Ysim = self.likelihood.samples(fsim, Y_metadata=Y_metadata)
        return Ysim

    def plot_f(self, plot_limits=None, which_data_rows='all',
        which_data_ycols='all', fixed_inputs=[],
        levels=20, samples=0, fignum=None, ax=None, resolution=None,
        plot_raw=True,
-        linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx'):
+        linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx',
+        apply_link=False):
        """
        Plot the GP's view of the world, where the data is normalized and before applying a likelihood.
        This is a call to plot with plot_raw=True.
@ -350,6 +515,8 @@ class GP(Model):
        :type Y_metadata: dict
        :param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx')
        :type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib.
+        :param apply_link: if there is a link function of the likelihood, plot the link(f*) rather than f*
+        :type apply_link: boolean
        """
        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
        from ..plotting.matplot_dep import models_plots
@ -362,13 +529,13 @@ class GP(Model):
                                     which_data_ycols, fixed_inputs,
                                     levels, samples, fignum, ax, resolution,
                                     plot_raw=plot_raw, Y_metadata=Y_metadata,
-                                     data_symbol=data_symbol, **kw)
+                                     data_symbol=data_symbol, apply_link=apply_link, **kw)

    def plot(self, plot_limits=None, which_data_rows='all',
        which_data_ycols='all', fixed_inputs=[],
        levels=20, samples=0, fignum=None, ax=None, resolution=None,
-        plot_raw=False,
-        linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx'):
+        plot_raw=False, linecol=None,fillcol=None, Y_metadata=None,
+        data_symbol='kx', predict_kw=None, plot_training_data=True):
        """
        Plot the posterior of the GP.
          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
@ -405,6 +572,8 @@ class GP(Model):
        :type Y_metadata: dict
        :param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx')
        :type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib.
+        :param plot_training_data: whether or not to plot the training points
+        :type plot_training_data: boolean
        """
        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
        from ..plotting.matplot_dep import models_plots
@ -417,7 +586,103 @@ class GP(Model):
                                     which_data_ycols, fixed_inputs,
                                     levels, samples, fignum, ax, resolution,
                                     plot_raw=plot_raw, Y_metadata=Y_metadata,
-                                     data_symbol=data_symbol, **kw)
+                                     data_symbol=data_symbol, predict_kw=predict_kw,
+                                     plot_training_data=plot_training_data, **kw)
+
+
+    def plot_data(self, which_data_rows='all',
+        which_data_ycols='all', visible_dims=None,
+        fignum=None, ax=None, data_symbol='kx'):
+        """
+        Plot the training data
+          - For higher dimensions than two, use fixed_inputs to plot the data points with some of the inputs fixed.
+
+        Can plot only part of the data
+        using which_data_rows and which_data_ycols.
+
+        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
+        :type plot_limits: np.array
+        :param which_data_rows: which of the training data to plot (default all)
+        :type which_data_rows: 'all' or a slice object to slice model.X, model.Y
+        :param which_data_ycols: when the data has several columns (independent outputs), only plot these
+        :type which_data_ycols: 'all' or a list of integers
+        :param visible_dims: an array specifying the input dimensions to plot (maximum two)
+        :type visible_dims: a numpy array
+        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
+        :type resolution: int
+        :param levels: number of levels to plot in a contour plot.
+        :param levels: for 2D plotting, the number of contour levels to use; if ax is None, create a new figure
+        :type levels: int
+        :param samples: the number of a posteriori samples to plot
+        :type samples: int
+        :param fignum: figure to plot on.
+        :type fignum: figure number
+        :param ax: axes to plot on.
+        :type ax: axes handle
+        :param linecol: color of line to plot [Tango.colorsHex['darkBlue']]
+        :type linecol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib
+        :param fillcol: color of fill [Tango.colorsHex['lightBlue']]
+        :type fillcol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib
+        :param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx')
+        :type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib.
+        """
+        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
+        from ..plotting.matplot_dep import models_plots
+        kw = {}
+        return models_plots.plot_data(self, which_data_rows,
+                                     which_data_ycols, visible_dims,
+                                     fignum, ax, data_symbol, **kw)
+
+
+    def errorbars_trainset(self, which_data_rows='all',
+            which_data_ycols='all', fixed_inputs=[], fignum=None, ax=None,
+            linecol=None, data_symbol='kx', predict_kw=None, plot_training_data=True,lw=None):
+
+        """
+        Plot the posterior error bars corresponding to the training data
+          - For higher dimensions than two, use fixed_inputs to plot the data points with some of the inputs fixed.
+
+        Can plot only part of the data
+        using which_data_rows and which_data_ycols.
+
+        :param which_data_rows: which of the training data to plot (default all)
+        :type which_data_rows: 'all' or a slice object to slice model.X, model.Y
+        :param which_data_ycols: when the data has several columns (independent outputs), only plot these
+        :type which_data_ycols: 'all' or a list of integers
+        :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
+        :type fixed_inputs: a list of tuples
+        :param fignum: figure to plot on.
+        :type fignum: figure number
+        :param ax: axes to plot on.
+        :type ax: axes handle
+        :param plot_training_data: whether or not to plot the training points
+        :type plot_training_data: boolean
+        """
+        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
+        from ..plotting.matplot_dep import models_plots
+        kw = {}
+        if lw is not None:
+            kw['lw'] = lw
+        return models_plots.errorbars_trainset(self, which_data_rows, which_data_ycols, fixed_inputs,
+                                    fignum, ax, linecol, data_symbol,
+                                    predict_kw, plot_training_data, **kw)
+
+
+    def plot_magnification(self, labels=None, which_indices=None,
+                resolution=50, ax=None, marker='o', s=40,
+                fignum=None, legend=True,
+                plot_limits=None,
+                aspect='auto', updates=False, plot_inducing=True, kern=None, **kwargs):
+
+        import sys
+        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
+        from ..plotting.matplot_dep import dim_reduction_plots
+
+        return dim_reduction_plots.plot_magnification(self, labels, which_indices,
+                resolution, ax, marker, s,
+                fignum, plot_inducing, legend,
+                plot_limits, aspect, updates, **kwargs)
+

    def input_sensitivity(self, summarize=True):
        """
@ -441,20 +706,55 @@ class GP(Model):
        try:
            super(GP, self).optimize(optimizer, start, **kwargs)
        except KeyboardInterrupt:
-            print "KeyboardInterrupt caught, calling on_optimization_end() to round things up"
+            print("KeyboardInterrupt caught, calling on_optimization_end() to round things up")
            self.inference_method.on_optimization_end()
            raise

-    def infer_newX(self, Y_new, optimize=True, ):
+    def infer_newX(self, Y_new, optimize=True):
        """
-        Infer the distribution of X for the new observed data *Y_new*.
+        Infer X for the new observed data *Y_new*.

        :param Y_new: the new observed data for inference
        :type Y_new: numpy.ndarray
        :param optimize: whether to optimize the location of new X (True by default)
        :type optimize: boolean
        :return: a tuple containing the posterior estimation of X and the model that optimize X
-        :rtype: (:class:`~GPy.core.parameterization.variational.VariationalPosterior` or numpy.ndarray, :class:`~GPy.core.model.Model`)
+        :rtype: (:class:`~GPy.core.parameterization.variational.VariationalPosterior` and numpy.ndarray, :class:`~GPy.core.model.Model`)
        """
        from ..inference.latent_function_inference.inferenceX import infer_newX
        return infer_newX(self, Y_new, optimize=optimize)
+
+    def log_predictive_density(self, x_test, y_test, Y_metadata=None):
+        """
+        Calculation of the log predictive density
+
+        .. math::
+            p(y_{*}|D) = \\int p(y_{*}|f_{*})p(f_{*}|\\mu_{*}, \\sigma^{2}_{*}) df_{*}
+
+        :param x_test: test locations (x_{*})
+        :type x_test: (Nx1) array
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param Y_metadata: metadata associated with the test points
+        """
+        mu_star, var_star = self._raw_predict(x_test)
+        return self.likelihood.log_predictive_density(y_test, mu_star, var_star, Y_metadata=Y_metadata)
+
+    def log_predictive_density_sampling(self, x_test, y_test, Y_metadata=None, num_samples=1000):
+        """
+        Calculation of the log predictive density by sampling
+
+        .. math::
+            p(y_{*}|D) = \\int p(y_{*}|f_{*})p(f_{*}|\\mu_{*}, \\sigma^{2}_{*}) df_{*}
+
+        :param x_test: test locations (x_{*})
+        :type x_test: (Nx1) array
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param Y_metadata: metadata associated with the test points
+        :param num_samples: number of samples to use in monte carlo integration
+        :type num_samples: int
+        """
+        mu_star, var_star = self._raw_predict(x_test)
+        return self.likelihood.log_predictive_density_sampling(y_test, mu_star, var_star, Y_metadata=Y_metadata, num_samples=num_samples)
+
--- a/GPy/core/mapping.py
+++ b/GPy/core/mapping.py
@ -1,13 +1,14 @@
 # Copyright (c) 2013,2014, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2015, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import sys
-from parameterization import Parameterized
+from .parameterization import Parameterized
 import numpy as np

 class Mapping(Parameterized):
    """
-    Base model for shared behavior between models that can act like a mapping.
+    Base model for shared mapping behaviours
    """

    def __init__(self, input_dim, output_dim, name='mapping'):
@ -18,49 +19,12 @@ class Mapping(Parameterized):
    def f(self, X):
        raise NotImplementedError

-    def df_dX(self, dL_df, X):
-        """Evaluate derivatives of mapping outputs with respect to inputs.
-
-        :param dL_df: gradient of the objective with respect to the function.
-        :type dL_df: ndarray (num_data x output_dim)
-        :param X: the input locations where derivatives are to be evaluated.
-        :type X: ndarray (num_data x input_dim)
-        :returns: matrix containing gradients of the function with respect to the inputs.
-        """
+    def gradients_X(self, dL_dF, X):
        raise NotImplementedError

-    def df_dtheta(self, dL_df, X):
-        """The gradient of the outputs of the mapping with respect to each of the parameters.
-
-        :param dL_df: gradient of the objective with respect to the function.
-        :type dL_df: ndarray (num_data x output_dim)
-        :param X: input locations where the function is evaluated.
-        :type X: ndarray (num_data x input_dim)
-        :returns: Matrix containing gradients with respect to parameters of each output for each input data.
-        :rtype: ndarray (num_params length)
-        """
-
+    def update_gradients(self, dL_dF, X):
        raise NotImplementedError

-    def plot(self, *args):
-        """
-        Plots the mapping associated with the model.
-          - In one dimension, the function is plotted.
-          - In two dimensions, a contour-plot shows the function
-          - In higher dimensions, we've not implemented this yet !TODO!
-
-        Can plot only part of the data and part of the posterior functions
-        using which_data and which_functions
-
-        This is a convenience function: arguments are passed to
-        GPy.plotting.matplot_dep.models_plots.plot_mapping
-        """
-
-        if "matplotlib" in sys.modules:
-            from ..plotting.matplot_dep import models_plots
-            mapping_plots.plot_mapping(self,*args)
-        else:
-            raise NameError, "matplotlib package has not been imported."

 class Bijective_mapping(Mapping):
    """
@ -68,78 +32,10 @@ class Bijective_mapping(Mapping):
    also back from f to X. The inverse mapping is called g().
    """
    def __init__(self, input_dim, output_dim, name='bijective_mapping'):
-        super(Bijective_apping, self).__init__(name=name)
+        super(Bijective_mapping, self).__init__(name=name)

    def g(self, f):
        """Inverse mapping from output domain of the function to the inputs."""
        raise NotImplementedError

-from model import Model
-
-class Mapping_check_model(Model):
-    """
-    This is a dummy model class used as a base class for checking that the
-    gradients of a given mapping are implemented correctly. It enables
-    checkgradient() to be called independently on each mapping.
-    """
-    def __init__(self, mapping=None, dL_df=None, X=None):
-        num_samples = 20
-        if mapping==None:
-            mapping = GPy.mapping.linear(1, 1)
-        if X==None:
-            X = np.random.randn(num_samples, mapping.input_dim)
-        if dL_df==None:
-            dL_df = np.ones((num_samples, mapping.output_dim))
-
-        self.mapping=mapping
-        self.X = X
-        self.dL_df = dL_df
-        self.num_params = self.mapping.num_params
-        Model.__init__(self)
-
-
-    def _get_params(self):
-        return self.mapping._get_params()
-
-    def _get_param_names(self):
-        return self.mapping._get_param_names()
-
-    def _set_params(self, x):
-        self.mapping._set_params(x)
-
-    def log_likelihood(self):
-        return (self.dL_df*self.mapping.f(self.X)).sum()
-
-    def _log_likelihood_gradients(self):
-        raise NotImplementedError, "This needs to be implemented to use the Mapping_check_model class."
-
-class Mapping_check_df_dtheta(Mapping_check_model):
-    """This class allows gradient checks for the gradient of a mapping with respect to parameters. """
-    def __init__(self, mapping=None, dL_df=None, X=None):
-        Mapping_check_model.__init__(self,mapping=mapping,dL_df=dL_df, X=X)
-
-    def _log_likelihood_gradients(self):
-        return self.mapping.df_dtheta(self.dL_df, self.X)
-
-
-class Mapping_check_df_dX(Mapping_check_model):
-    """This class allows gradient checks for the gradient of a mapping with respect to X. """
-    def __init__(self, mapping=None, dL_df=None, X=None):
-        Mapping_check_model.__init__(self,mapping=mapping,dL_df=dL_df, X=X)
-
-        if dL_df==None:
-            dL_df = np.ones((self.X.shape[0],self.mapping.output_dim))
-        self.num_params = self.X.shape[0]*self.mapping.input_dim
-
-    def _log_likelihood_gradients(self):
-        return self.mapping.df_dX(self.dL_df, self.X).flatten()
-
-    def _get_param_names(self):
-        return ['X_'  +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
-
-    def _get_params(self):
-        return self.X.flatten()
-
-    def _set_params(self, x):
-        self.X=x.reshape(self.X.shape)

--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@ -5,12 +5,15 @@
 from .. import likelihoods
 from ..inference import optimization
 from ..util.misc import opt_wrapper
-from parameterization import Parameterized
+from .parameterization import Parameterized
 import multiprocessing as mp
 import numpy as np
 from numpy.linalg.linalg import LinAlgError
 import itertools
+import sys
+from .verbose_optimization import VerboseOptimization
 # import numdifftools as ndt
+from functools import reduce

 class Model(Parameterized):
    _fail_count = 0  # Count of failed optimization steps (see objective)
@ -24,12 +27,13 @@ class Model(Parameterized):
        from .parameterization.ties_and_remappings import Tie
        self.tie = Tie()
        self.link_parameter(self.tie, -1)
+        self.obj_grads = None
        self.add_observer(self.tie, self.tie._parameters_changed_notification, priority=-500)

    def log_likelihood(self):
-        raise NotImplementedError, "this needs to be implemented to use the model class"
+        raise NotImplementedError("this needs to be implemented to use the model class")
    def _log_likelihood_gradients(self):
-        return self.gradient
+        return self.gradient.copy()

    def optimize_restarts(self, num_restarts=10, robust=False, verbose=True, parallel=False, num_processes=None, **kwargs):
        """
@ -72,30 +76,30 @@ class Model(Parameterized):
                jobs = []
                pool = mp.Pool(processes=num_processes)
                for i in range(num_restarts):
-                    self.randomize()
+                    if i>0: self.randomize()
                    job = pool.apply_async(opt_wrapper, args=(self,), kwds=kwargs)
                    jobs.append(job)

                pool.close()  # signal that no more data coming in
                pool.join()  # wait for all the tasks to complete
            except KeyboardInterrupt:
-                print "Ctrl+c received, terminating and joining pool."
+                print("Ctrl+c received, terminating and joining pool.")
                pool.terminate()
                pool.join()

        for i in range(num_restarts):
            try:
                if not parallel:
-                    self.randomize()
+                    if i>0: self.randomize()
                    self.optimize(**kwargs)
                else:
                    self.optimization_runs.append(jobs[i].get())

                if verbose:
-                    print("Optimization restart {0}/{1}, f = {2}".format(i + 1, num_restarts, self.optimization_runs[-1].f_opt))
+                    print(("Optimization restart {0}/{1}, f = {2}".format(i + 1, num_restarts, self.optimization_runs[-1].f_opt)))
            except Exception as e:
                if robust:
-                    print("Warning - optimization restart {0}/{1} failed".format(i + 1, num_restarts))
+                    print(("Warning - optimization restart {0}/{1} failed".format(i + 1, num_restarts)))
                else:
                    raise e

@ -116,7 +120,7 @@ class Model(Parameterized):

        DEPRECATED.
        """
-        raise DeprecationWarning, 'parameters now have default constraints'
+        raise DeprecationWarning('parameters now have default constraints')

    def objective_function(self):
        """
@ -165,14 +169,14 @@ class Model(Parameterized):
        try:
            # self._set_params_transformed(x)
            self.optimizer_array = x
-            obj_grads = self._transform_gradients(self.objective_function_gradients())
+            self.obj_grads = self._transform_gradients(self.objective_function_gradients())
            self._fail_count = 0
        except (LinAlgError, ZeroDivisionError, ValueError):
            if self._fail_count >= self._allowed_failures:
                raise
            self._fail_count += 1
-            obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e100, 1e100)
-        return obj_grads
+            self.obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e100, 1e100)
+        return self.obj_grads

    def _objective(self, x):
        """
@ -200,26 +204,26 @@ class Model(Parameterized):
    def _objective_grads(self, x):
        try:
            self.optimizer_array = x
-            obj_f, obj_grads = self.objective_function(), self._transform_gradients(self.objective_function_gradients())
+            obj_f, self.obj_grads = self.objective_function(), self._transform_gradients(self.objective_function_gradients())
            self._fail_count = 0
        except (LinAlgError, ZeroDivisionError, ValueError):
            if self._fail_count >= self._allowed_failures:
                raise
            self._fail_count += 1
            obj_f = np.inf
-            obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e100, 1e100)
-        return obj_f, obj_grads
+            self.obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e10, 1e10)
+        return obj_f, self.obj_grads

-    def optimize(self, optimizer=None, start=None, **kwargs):
+    def optimize(self, optimizer=None, start=None, messages=False, max_iters=1000, ipython_notebook=True, clear_after_finish=False, **kwargs):
        """
        Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.

        kwargs are passed to the optimizer. They can be:

-        :param max_f_eval: maximum number of function evaluations
-        :type max_f_eval: int
-        :messages: whether to display during optimisation
-        :type messages: bool
+        :param max_iters: maximum number of function evaluations
+        :type max_iters: int
+        :param messages: True: Display messages during optimisation, "ipython_notebook":
+        :type messages: bool or string
        :param optimizer: which optimizer to use (defaults to self.preferred optimizer)
        :type optimizer: string

@ -233,13 +237,11 @@ class Model(Parameterized):


        """
-        if self.is_fixed:
-            print 'nothing to optimize'
-        if self.size == 0:
-            print 'nothing to optimize'
+        if self.is_fixed or self.size == 0:
+            print('nothing to optimize')

        if not self.update_model():
-            print "Updates were off, setting updates on again"
+            print("updates were off, setting updates on again")
            self.update_model(True)

        if start == None:
@ -253,9 +255,11 @@ class Model(Parameterized):
            opt.model = self
        else:
            optimizer = optimization.get_optimizer(optimizer)
-            opt = optimizer(start, model=self, **kwargs)
+            opt = optimizer(start, model=self, max_iters=max_iters, **kwargs)

-        opt.run(f_fp=self._objective_grads, f=self._objective, fp=self._grads)
+        with VerboseOptimization(self, opt, maxiters=max_iters, verbose=messages, ipython_notebook=ipython_notebook, clear_after_finish=clear_after_finish) as vo:
+            opt.run(f_fp=self._objective_grads, f=self._objective, fp=self._grads)
+            vo.finish(opt)

        self.optimization_runs.append(opt)

@ -302,7 +306,7 @@ class Model(Parameterized):
                    transformed_index = (indices - (~self._fixes_).cumsum())[transformed_index[which[0]]]

                if transformed_index.size == 0:
-                    print "No free parameters to check"
+                    print("No free parameters to check")
                    return

            # just check the global ratio
@ -337,9 +341,9 @@ class Model(Parameterized):
            cols.extend([max(float_len, len(header[i])) for i in range(1, len(header))])
            cols = np.array(cols) + 5
            header_string = ["{h:^{col}}".format(h=header[i], col=cols[i]) for i in range(len(cols))]
-            header_string = map(lambda x: '|'.join(x), [header_string])
+            header_string = list(map(lambda x: '|'.join(x), [header_string]))
            separator = '-' * len(header_string[0])
-            print '\n'.join([header_string[0], separator])
+            print('\n'.join([header_string[0], separator]))
            if target_param is None:
                param_index = range(len(x))
                transformed_index = param_index
@ -355,20 +359,25 @@ class Model(Parameterized):
                    transformed_index = param_index

                if param_index.size == 0:
-                    print "No free parameters to check"
+                    print("No free parameters to check")
                    return

            gradient = self._grads(x).copy()
            np.where(gradient == 0, 1e-312, gradient)
            ret = True
-            for nind, xind in itertools.izip(param_index, transformed_index):
+            for nind, xind in zip(param_index, transformed_index):
                xx = x.copy()
                xx[xind] += step
                f1 = self._objective(xx)
                xx[xind] -= 2.*step
                f2 = self._objective(xx)
-                df_ratio = np.abs((f1-f2)/min(f1,f2))
-                df_unstable = df_ratio<df_tolerance
+                #Avoid divide by zero, if any of the values are above 1e-15, otherwise both values are essentially
+                #the same
+                if f1 > 1e-15 or f1 < -1e-15 or f2 > 1e-15 or f2 < -1e-15:
+                    df_ratio = np.abs((f1 - f2) / min(f1, f2))
+                else:
+                    df_ratio = 1.0
+                df_unstable = df_ratio < df_tolerance
                numerical_gradient = (f1 - f2) / (2 * step)
                if np.all(gradient[xind] == 0): ratio = (f1 - f2) == gradient[xind]
                else: ratio = (f1 - f2) / (2 * step * gradient[xind])
@ -389,7 +398,7 @@ class Model(Parameterized):
                ng = '%.6f' % float(numerical_gradient)
                df = '%1.e' % float(df_ratio)
                grad_string = "{0:<{c0}}|{1:^{c1}}|{2:^{c2}}|{3:^{c3}}|{4:^{c4}}|{5:^{c5}}".format(formatted_name, r, d, g, ng, df, c0=cols[0] + 9, c1=cols[1], c2=cols[2], c3=cols[3], c4=cols[4], c5=cols[5])
-                print grad_string
+                print(grad_string)

            self.optimizer_array = x
            return ret
@ -398,11 +407,16 @@ class Model(Parameterized):
        """Representation of the model in html for notebook display."""
        model_details = [['<b>Model</b>', self.name + '<br>'],
                         ['<b>Log-likelihood</b>', '{}<br>'.format(float(self.log_likelihood()))],
-                         ["<b>Number of Parameters</b>", '{}<br>'.format(self.size)]]
+                         ["<b>Number of Parameters</b>", '{}<br>'.format(self.size)],
+                         ["<b>Number of Optimization Parameters</b>", '{}<br>'.format(self._size_transformed())],
+                         ["<b>Updates</b>", '{}<br>'.format(self._update_on)],
+                         ]
        from operator import itemgetter
        to_print = ["""<style type="text/css">
 .pd{
-    font-family:"Courier New", Courier, monospace !important;
+    font-family: "Courier New", Courier, monospace !important;
+    width: 100%;
+    padding: 3px;
 }
 </style>\n"""] + ["<p class=pd>"] + ["{}: {}".format(name, detail) for name, detail in model_details] + ["</p>"]
        to_print.append(super(Model, self)._repr_html_())
@ -411,7 +425,10 @@ class Model(Parameterized):
    def __str__(self):
        model_details = [['Name', self.name],
                         ['Log-likelihood', '{}'.format(float(self.log_likelihood()))],
-                         ["Number of Parameters", '{}'.format(self.size)]]
+                         ["Number of Parameters", '{}'.format(self.size)],
+                         ["Number of Optimization Parameters", '{}'.format(self._size_transformed())],
+                         ["Updates", '{}'.format(self._update_on)],
+                         ]
        from operator import itemgetter
        max_len = reduce(lambda a, b: max(len(b[0]), a), model_details, 0)
        to_print = [""] + ["{0:{l}} : {1}".format(name, detail, l=max_len) for name, detail in model_details] + ["Parameters:"]
--- a/GPy/core/parameterization/init.py
+++ b/GPy/core/parameterization/init.py
@ -1,5 +1,5 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from param import Param, ObsAr
-from parameterized import Parameterized
+from .param import Param, ObsAr
+from .parameterized import Parameterized
--- a/GPy/core/parameterization/index_operations.py
+++ b/GPy/core/parameterization/index_operations.py
@ -3,7 +3,9 @@

 import numpy
 from numpy.lib.function_base import vectorize
-from lists_and_dicts import IntArrayDict
+from .lists_and_dicts import IntArrayDict
+from functools import reduce
+from .transformations import Transformation

 def extract_properties_to_index(index, props):
    prop_index = dict()
@ -62,12 +64,15 @@ class ParameterIndexOperations(object):
    def __init__(self, constraints=None):
        self._properties = IntArrayDict()
        if constraints is not None:
-            for t, i in constraints.iteritems():
+            #python 3 fix
+            #for t, i in constraints.iteritems():
+            for t, i in constraints.items():
                self.add(t, i)

-    def iteritems(self):
-        return self._properties.iteritems()
-
+    #iteritems has gone in python 3
+    #def iteritems(self):
+    #    return self._properties.iteritems()
+        
    def items(self):
        return self._properties.items()

@ -75,7 +80,7 @@ class ParameterIndexOperations(object):
        return self._properties.keys()

    def iterproperties(self):
-        return self._properties.iterkeys()
+        return iter(self._properties)

    def shift_right(self, start, size):
        for ind in self.iterindices():
@ -83,7 +88,7 @@ class ParameterIndexOperations(object):
            ind[toshift] += size

    def shift_left(self, start, size):
-        for v, ind in self.items():
+        for v, ind in list(self.items()):
            todelete = (ind>=start) * (ind<start+size)
            if todelete.size != 0:
                ind = ind[~todelete]
@ -101,7 +106,11 @@ class ParameterIndexOperations(object):
        return reduce(lambda a,b: a+b.size, self.iterindices(), 0)

    def iterindices(self):
-        return self._properties.itervalues()
+        try:
+            return self._properties.itervalues()
+        except AttributeError:
+            #Changed this from itervalues to values for Py3 compatibility. It didn't break the test suite.
+            return self._properties.values()

    def indices(self):
        return self._properties.values()
@ -150,14 +159,18 @@ class ParameterIndexOperations(object):
        return numpy.array([]).astype(int)

    def update(self, parameter_index_view, offset=0):
-        for i, v in parameter_index_view.iteritems():
+        #py3 fix
+        #for i, v in parameter_index_view.iteritems():
+        for i, v in parameter_index_view.items():
            self.add(i, v+offset)

    def copy(self):
        return self.__deepcopy__(None)

    def __deepcopy__(self, memo):
-        return ParameterIndexOperations(dict(self.iteritems()))
+        #py3 fix
+        #return ParameterIndexOperations(dict(self.iteritems()))
+        return ParameterIndexOperations(dict(self.items()))

    def __getitem__(self, prop):
        return self._properties[prop]
@ -195,22 +208,26 @@ class ParameterIndexOperationsView(object):
    def _filter_index(self, ind):
        return ind[(ind >= self._offset) * (ind < (self._offset + self._size))] - self._offset

-
-    def iteritems(self):
-        for i, ind in self._param_index_ops.iteritems():
+    #iteritems has gone in python 3. It has been renamed items()
+    def items(self):
+        _items_list = list(self._param_index_ops.items())
+        for i, ind in _items_list:
            ind2 = self._filter_index(ind)
            if ind2.size > 0:
                yield i, ind2
-
-    def items(self):
-        return [[i,v] for i,v in self.iteritems()]
+    
+    #Python 3 items() is now implemented as per py2 iteritems
+    #def items(self):
+    #    return [[i,v] for i,v in self.iteritems()]

    def properties(self):
        return [i for i in self.iterproperties()]


    def iterproperties(self):
-        for i, _ in self.iteritems():
+        #py3 fix
+        #for i, _ in self.iteritems():
+        for i, _ in self.items():
            yield i


@ -230,7 +247,9 @@ class ParameterIndexOperationsView(object):


    def iterindices(self):
-        for _, ind in self.iteritems():
+        #py3 fix
+        #for _, ind in self.iteritems():
+        for _, ind in self.items():
            yield ind


@ -286,10 +305,14 @@ class ParameterIndexOperationsView(object):

    def __str__(self, *args, **kwargs):
        import pprint
-        return pprint.pformat(dict(self.iteritems()))
+        #py3 fixes
+        #return pprint.pformat(dict(self.iteritems()))
+        return pprint.pformat(dict(self.items()))

    def update(self, parameter_index_view, offset=0):
-        for i, v in parameter_index_view.iteritems():
+        #py3 fixes
+        #for i, v in parameter_index_view.iteritems():
+        for i, v in parameter_index_view.items():
            self.add(i, v+offset)


@ -297,6 +320,8 @@ class ParameterIndexOperationsView(object):
        return self.__deepcopy__(None)

    def __deepcopy__(self, memo):
-        return ParameterIndexOperations(dict(self.iteritems()))
+        #py3 fix
+        #return ParameterIndexOperations(dict(self.iteritems()))
+        return ParameterIndexOperations(dict(self.items()))
    pass

--- a/GPy/core/parameterization/lists_and_dicts.py
+++ b/GPy/core/parameterization/lists_and_dicts.py
@ -32,7 +32,7 @@ class ArrayList(list):
            if el is item:
                return index
            index += 1
-        raise ValueError, "{} is not in list".format(item)
+        raise ValueError("{} is not in list".format(item))
    pass

 class ObserverList(object):
@ -75,7 +75,7 @@ class ObserverList(object):

    def __str__(self):
        from . import ObsAr, Param
-        from parameter_core import Parameterizable
+        from .parameter_core import Parameterizable
        ret = []
        curr_p = None
        
--- a/GPy/core/parameterization/observable.py
+++ b/GPy/core/parameterization/observable.py
@ -12,8 +12,12 @@ class Observable(object):
    """
    def __init__(self, *args, **kwargs):
        super(Observable, self).__init__()
-        from lists_and_dicts import ObserverList
+        from .lists_and_dicts import ObserverList
        self.observers = ObserverList()
+        self._update_on = True
+
+    def set_updates(self, on=True):
+        self._update_on = on

    def add_observer(self, observer, callble, priority=0):
        """
@ -51,15 +55,16 @@ class Observable(object):
        :param min_priority: only notify observers with priority > min_priority
                             if min_priority is None, notify all observers in order
        """
-        if which is None:
-            which = self
-        if min_priority is None:
-            [callble(self, which=which) for _, _, callble in self.observers]
-        else:
-            for p, _, callble in self.observers:
-                if p <= min_priority:
-                    break
-                callble(self, which=which)
+        if self._update_on:
+            if which is None:
+                which = self
+            if min_priority is None:
+                [callble(self, which=which) for _, _, callble in self.observers]
+            else:
+                for p, _, callble in self.observers:
+                    if p <= min_priority:
+                        break
+                    callble(self, which=which)

    def change_priority(self, observer, callble, priority):
        self.remove_observer(observer, callble)
--- a/GPy/core/parameterization/observable_array.py
+++ b/GPy/core/parameterization/observable_array.py
@ -3,8 +3,8 @@


 import numpy as np
-from parameter_core import Pickleable
-from observable import Observable
+from .parameter_core import Pickleable
+from .observable import Observable

 class ObsAr(np.ndarray, Pickleable, Observable):
    """
@ -39,7 +39,7 @@ class ObsAr(np.ndarray, Pickleable, Observable):
        return self.view(np.ndarray)

    def copy(self):
-        from lists_and_dicts import ObserverList
+        from .lists_and_dicts import ObserverList
        memo = {}
        memo[id(self)] = self
        memo[id(self.observers)] = ObserverList()
--- a/GPy/core/parameterization/param.py
+++ b/GPy/core/parameterization/param.py
@ -4,8 +4,9 @@
 import itertools
 import numpy
 np = numpy
-from parameter_core import Parameterizable, adjust_name_for_printing, Pickleable
-from observable_array import ObsAr
+from .parameter_core import Parameterizable, adjust_name_for_printing, Pickleable
+from .observable_array import ObsAr
+from functools import reduce

 ###### printing
 __constraints_name__ = "Constraint"
@ -37,6 +38,11 @@ class Param(Parameterizable, ObsAr):
    Fixing parameters will fix them to the value they are right now. If you change
    the fixed value, it will be fixed to the new value!

+    Important Note:
+    Multilevel indexing (e.g. self[:2][1:]) is not supported and might lead to unexpected behaviour.
+    Try to index in one go, using boolean indexing or the numpy builtin
+    np.index function.
+
    See :py:class:`GPy.core.parameterized.Parameterized` for more details on constraining etc.

    """
@ -84,6 +90,7 @@ class Param(Parameterizable, ObsAr):
        self._original_ = getattr(obj, '_original_', None)
        self._name = getattr(obj, '_name', None)
        self._gradient_array_ = getattr(obj, '_gradient_array_', None)
+        self._update_on = getattr(obj, '_update_on', None)
        self.constraints = getattr(obj, 'constraints', None)
        self.priors = getattr(obj, 'priors', None)

@ -155,7 +162,7 @@ class Param(Parameterizable, ObsAr):
    #===========================================================================
    @property
    def is_fixed(self):
-        from transformations import __fixed__
+        from .transformations import __fixed__
        return self.constraints[__fixed__].size == self.size

    def _get_original(self, param):
@ -173,6 +180,7 @@ class Param(Parameterizable, ObsAr):
        import copy
        Pickleable.__setstate__(s, copy.deepcopy(self.__getstate__(), memo))
        return s
+
    def _setup_observers(self):
        """
        Setup the default observers
@ -206,10 +214,14 @@ class Param(Parameterizable, ObsAr):
        return 0
    @property
    def _constraints_str(self):
-        return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.constraints.iteritems()))]
+        #py3 fix
+        #return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.constraints.iteritems()))]
+        return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.constraints.items()))]
    @property
    def _priors_str(self):
-        return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.priors.iteritems()))]
+        #py3 fix
+        #return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.priors.iteritems()))]
+        return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.priors.items()))]
    @property
    def _ties_str(self):
        return ['']
@ -273,12 +285,12 @@ class Param(Parameterizable, ObsAr):
        header = header_format.format(x=self.hierarchy_name(), c=__constraints_name__, i=__index_name__, t=__tie_name__, p=__priors_name__)  # nice header for printing
        if not ties: ties = itertools.cycle([''])
        return "\n".join(["""<style type="text/css">
-.tg  {border-collapse:collapse;border-spacing:0;border-color:#999;}
-.tg td{font-family:Arial, sans-serif;font-size:14px;padding:2px 3px;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#999;color:#444;background-color:#F7FDFA;}
-.tg th{font-family:Arial, sans-serif;font-size:14px;font-weight:normal;padding:2px 3px;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#999;color:#fff;background-color:#26ADE4;}
-.tg .tg-left{font-family:"Courier New", Courier, monospace !important;;text-align:left}
-.tg .tg-right{font-family:"Courier New", Courier, monospace !important;;text-align:right}
-</style>"""] + ['<table class="tg">'] + [header] + ["<tr><td class=tg-left>{i}</td><td  class=tg-right>{x}</td><td class=tg-left>{c}</td><td class=tg-left>{p}</td><td class=tg-left>{t}</td></tr>".format(x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in itertools.izip(indices, vals, constr_matrix, ties, prirs)] + ["</table>"])
+.tg  {padding:2px 3px;word-break:normal;border-collapse:collapse;border-spacing:0;border-color:#DCDCDC;margin:0px auto;width:100%;}
+.tg td{font-family:"Courier New", Courier, monospace !important;font-weight:bold;color:#444;background-color:#F7FDFA;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;}
+.tg th{font-family:"Courier New", Courier, monospace !important;font-weight:normal;color:#fff;background-color:#26ADE4;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;}
+.tg .tg-left{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:left;}
+.tg .tg-right{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:right;}
+</style>"""] + ['<table class="tg">'] + [header] + ["<tr><td class=tg-left>{i}</td><td  class=tg-right>{x}</td><td class=tg-left>{c}</td><td class=tg-left>{p}</td><td class=tg-left>{t}</td></tr>".format(x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in zip(indices, vals, constr_matrix, ties, prirs)] + ["</table>"])

    def __str__(self, constr_matrix=None, indices=None, prirs=None, ties=None, lc=None, lx=None, li=None, lp=None, lt=None, only_name=False):
        filter_ = self._current_slice_
@ -299,7 +311,7 @@ class Param(Parameterizable, ObsAr):
        if only_name: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.hierarchy_name(), c=sep*lc, i=sep*li, t=sep*lt, p=sep*lp)  # nice header for printing
        else: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.hierarchy_name(), c=__constraints_name__, i=__index_name__, t=__tie_name__, p=__priors_name__)  # nice header for printing
        if not ties: ties = itertools.cycle([''])
-        return "\n".join([header] + ["  {i!s:^{3}s}  |  {x: >{1}.{2}g}  |  {c:^{0}s}  |  {p:^{5}s}  |  {t:^{4}s}  ".format(lc, lx, __precision__, li, lt, lp, x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in itertools.izip(indices, vals, constr_matrix, ties, prirs)])  # return all the constraints with right indices
+        return "\n".join([header] + ["  {i!s:^{3}s}  |  {x: >{1}.{2}g}  |  {c:^{0}s}  |  {p:^{5}s}  |  {t:^{4}s}  ".format(lc, lx, __precision__, li, lt, lp, x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in zip(indices, vals, constr_matrix, ties, prirs)])  # return all the constraints with right indices
        # except: return super(Param, self).__str__()

 class ParamConcatenation(object):
@ -312,7 +324,7 @@ class ParamConcatenation(object):
        See :py:class:`GPy.core.parameter.Param` for more details on constraining.
        """
        # self.params = params
-        from lists_and_dicts import ArrayList
+        from .lists_and_dicts import ArrayList
        self.params = ArrayList([])
        for p in params:
            for p in p.flattened_parameters:
@ -335,7 +347,9 @@ class ParamConcatenation(object):
                    level += 1
                    parent = parent._parent_
        import operator
-        self.parents = map(lambda x: x[0], sorted(parents.iteritems(), key=operator.itemgetter(1)))
+        # py3 fix: dict.iteritems() no longer exists, and map() returns a one-shot
+        # iterator on Python 3. Build a real list so self.parents can be iterated
+        # repeatedly (update_all_params loops over it on every call).
+        self.parents = [entry[0] for entry in sorted(parents.items(), key=operator.itemgetter(1))]
    #===========================================================================
    # Get/set items, enable broadcasting
    #===========================================================================
@ -360,7 +374,7 @@ class ParamConcatenation(object):
    #===========================================================================
    def update_all_params(self):
        for par in self.parents:
-            par.notify_observers()
+            par.trigger_update(trigger_parent=False)

    def constrain(self, constraint, warning=True):
        [param.constrain(constraint, trigger_parent=False) for param in self.params]
@ -428,14 +442,14 @@ class ParamConcatenation(object):
        params = self.params
        constr_matrices, ties_matrices, prior_matrices = zip(*map(f, params))
        indices = [p._indices() for p in params]
-        lc = max([p._max_len_names(cm, __constraints_name__) for p, cm in itertools.izip(params, constr_matrices)])
+        lc = max([p._max_len_names(cm, __constraints_name__) for p, cm in zip(params, constr_matrices)])
        lx = max([p._max_len_values() for p in params])
-        li = max([p._max_len_index(i) for p, i in itertools.izip(params, indices)])
-        lt = max([p._max_len_names(tm, __tie_name__) for p, tm in itertools.izip(params, ties_matrices)])
-        lp = max([p._max_len_names(pm, __constraints_name__) for p, pm in itertools.izip(params, prior_matrices)])
+        li = max([p._max_len_index(i) for p, i in zip(params, indices)])
+        lt = max([p._max_len_names(tm, __tie_name__) for p, tm in zip(params, ties_matrices)])
+        lp = max([p._max_len_names(pm, __constraints_name__) for p, pm in zip(params, prior_matrices)])
        strings = []
        start = True
-        for p, cm, i, tm, pm in itertools.izip(params,constr_matrices,indices,ties_matrices,prior_matrices):
+        for p, cm, i, tm, pm in zip(params,constr_matrices,indices,ties_matrices,prior_matrices):
            strings.append(p.__str__(constr_matrix=cm, indices=i, prirs=pm, ties=tm, lc=lc, lx=lx, li=li, lp=lp, lt=lt, only_name=(1-start)))
            start = False
        return "\n".join(strings)
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@ -1,15 +1,15 @@
 # Copyright (c) 2014, Max Zwiessele, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-
+import six # For metaclass support in Python 2 and 3 simultaneously
 import numpy; np = numpy
 import itertools
 from re import compile, _pattern_type
-from param import ParamConcatenation
-from parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing
+from .param import ParamConcatenation
+from .parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing

 import logging
-from GPy.core.parameterization.index_operations import ParameterIndexOperationsView
+from .index_operations import ParameterIndexOperationsView
 logger = logging.getLogger("parameters changed meta")

 class ParametersChangedMeta(type):
@ -27,6 +27,7 @@ class ParametersChangedMeta(type):
        self.parameters_changed()
        return self

+@six.add_metaclass(ParametersChangedMeta)
 class Parameterized(Parameterizable):
    """
    Parameterized class
@ -73,7 +74,9 @@ class Parameterized(Parameterizable):
    # Metaclass for parameters changed after init.
    # This makes sure, that parameters changed will always be called after __init__
    # **Never** call parameters_changed() yourself
-    __metaclass__ = ParametersChangedMeta
+    #This is ignored in Python 3 -- there the metaclass has to be given in the class definition.
+    #__metaclass__ = ParametersChangedMeta
+    #The six module is used to support both Python 2 and 3 simultaneously
    #===========================================================================
    def __init__(self, name=None, parameters=[], *a, **kw):
        super(Parameterized, self).__init__(name=name, *a, **kw)
@ -131,7 +134,7 @@ class Parameterized(Parameterizable):
            if param.has_parent():
                def visit(parent, self):
                    if parent is self:
-                        raise HierarchyError, "You cannot add a parameter twice into the hierarchy"
+                        raise HierarchyError("You cannot add a parameter twice into the hierarchy")
                param.traverse_parents(visit, self)
                param._parent_.unlink_parameter(param)
            # make sure the size is set
@ -173,7 +176,7 @@ class Parameterized(Parameterizable):
                self._highest_parent_._connect_fixes()

        else:
-            raise HierarchyError, """Parameter exists already, try making a copy"""
+            raise HierarchyError("""Parameter exists already, try making a copy""")


    def link_parameters(self, *parameters):
@ -189,14 +192,15 @@ class Parameterized(Parameterizable):
        """
        if not param in self.parameters:
            try:
-                raise RuntimeError, "{} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name)
+                raise RuntimeError("{} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name))
            except AttributeError:
-                raise RuntimeError, "{} does not seem to be a parameter, remove parameters directly from their respective parents".format(str(param))
+                raise RuntimeError("{} does not seem to be a parameter, remove parameters directly from their respective parents".format(str(param)))

        start = sum([p.size for p in self.parameters[:param._parent_index_]])
-        self._remove_parameter_name(param)
        self.size -= param.size
        del self.parameters[param._parent_index_]
+        self._remove_parameter_name(param)
+

        param._disconnect_parent()
        param.remove_observer(self, self._pass_through_notify_observers)
@ -215,9 +219,9 @@ class Parameterized(Parameterizable):
        self._highest_parent_._notify_parent_change()

    def add_parameter(self, *args, **kwargs):
-        raise DeprecationWarning, "add_parameter was renamed to link_parameter to avoid confusion of setting variables"
+        raise DeprecationWarning("add_parameter was renamed to link_parameter to avoid confusion of setting variables, use link_parameter instead")
    def remove_parameter(self, *args, **kwargs):
-        raise DeprecationWarning, "remove_parameter was renamed to link_parameter to avoid confusion of setting variables"
+        raise DeprecationWarning("remove_parameter was renamed to unlink_parameter to avoid confusion of setting variables, use unlink_parameter instead")

    def _connect_parameters(self, ignore_added_names=False):
        # connect parameterlist to this parameterized object
@ -237,7 +241,7 @@ class Parameterized(Parameterizable):
        self._param_slices_ = []
        for i, p in enumerate(self.parameters):
            if not p.param_array.flags['C_CONTIGUOUS']:
-                raise ValueError, "This should not happen! Please write an email to the developers with the code, which reproduces this error. All parameter arrays must be C_CONTIGUOUS"
+                raise ValueError("This should not happen! Please write an email to the developers with the code, which reproduces this error. All parameter arrays must be C_CONTIGUOUS")

            p._parent_ = self
            p._parent_index_ = i
@ -268,7 +272,7 @@ class Parameterized(Parameterizable):
        """
        if not isinstance(regexp, _pattern_type): regexp = compile(regexp)
        found_params = []
-        for n, p in itertools.izip(self.parameter_names(False, False, True), self.flattened_parameters):
+        for n, p in zip(self.parameter_names(False, False, True), self.flattened_parameters):
            if regexp.match(n) is not None:
                found_params.append(p)
        return found_params
@ -279,7 +283,7 @@ class Parameterized(Parameterizable):
        else:
            if paramlist is None:
                paramlist = self.grep_param_names(name)
-            if len(paramlist) < 1: raise AttributeError, name
+            if len(paramlist) < 1: raise AttributeError(name)
            if len(paramlist) == 1:
                if isinstance(paramlist[-1], Parameterized):
                    paramlist = paramlist[-1].flattened_parameters
@ -295,8 +299,8 @@ class Parameterized(Parameterizable):
            try:
                self.param_array[name] = value
            except:
-                raise ValueError, "Setting by slice or index only allowed with array-like"
-            self._trigger_params_changed()
+                raise ValueError("Setting by slice or index only allowed with array-like")
+            self.trigger_update()
        else:
            try: param = self.__getitem__(name, paramlist)
            except: raise
@ -312,7 +316,7 @@ class Parameterized(Parameterizable):
                    param[:] = val; return
            except AttributeError:
                pass
-        object.__setattr__(self, name, val);
+        return object.__setattr__(self, name, val);

    #===========================================================================
    # Pickling
@ -325,7 +329,7 @@ class Parameterized(Parameterizable):
            self._notify_parent_change()
            self.parameters_changed()
        except Exception as e:
-            print "WARNING: caught exception {!s}, trying to continue".format(e)
+            print("WARNING: caught exception {!s}, trying to continue".format(e))

    def copy(self, memo=None):
        if memo is None:
@ -379,7 +383,7 @@ class Parameterized(Parameterizable):
        pl = max([len(str(x)) if x else 0 for x in prirs + ["Prior"]])
        format_spec = "<tr><td class=tg-left>{{name:<{0}s}}</td><td class=tg-right>{{desc:>{1}s}}</td><td class=tg-left>{{const:^{2}s}}</td><td class=tg-left>{{pri:^{3}s}}</td><td class=tg-left>{{t:^{4}s}}</td></tr>".format(nl, sl, cl, pl, tl)
        to_print = []
-        for n, d, c, t, p in itertools.izip(names, desc, constrs, ts, prirs):
+        for n, d, c, t, p in zip(names, desc, constrs, ts, prirs):
            to_print.append(format_spec.format(name=n, desc=d, const=c, t=t, pri=p))
        sep = '-' * (nl + sl + cl + + pl + tl + 8 * 2 + 3)
        if header:
@ -393,11 +397,11 @@ class Parameterized(Parameterizable):
 </tr>""".format(name=name)
            to_print.insert(0, header)
        style = """<style type="text/css">
-.tg  {border-collapse:collapse;border-spacing:0;border-color:#999;}
-.tg td{font-family:Arial, sans-serif;font-size:14px;padding:2px 3px;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#999;color:#444;background-color:#F7FDFA;}
-.tg th{font-family:Arial, sans-serif;font-size:14px;font-weight:normal;padding:2px 3px;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#999;color:#fff;background-color:#26ADE4;}
-.tg .tg-left{font-family:"Courier New", Courier, monospace !important;;text-align:left}
-.tg .tg-right{font-family:"Courier New", Courier, monospace !important;;text-align:right}
+.tg  {font-family:"Courier New", Courier, monospace !important;padding:2px 3px;word-break:normal;border-collapse:collapse;border-spacing:0;border-color:#DCDCDC;margin:0px auto;width:100%;}
+.tg td{font-family:"Courier New", Courier, monospace !important;font-weight:bold;color:#444;background-color:#F7FDFA;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;}
+.tg th{font-family:"Courier New", Courier, monospace !important;font-weight:normal;color:#fff;background-color:#26ADE4;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;}
+.tg .tg-left{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:left;}
+.tg .tg-right{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:right;}
 </style>"""
        return style + '\n' + '<table class="tg">' + '\n'.format(sep).join(to_print) + '\n</table>'

@ -414,7 +418,7 @@ class Parameterized(Parameterizable):
        pl = max([len(str(x)) if x else 0 for x in prirs + ["Prior"]])
        format_spec = "  \033[1m{{name:<{0}s}}\033[0;0m  |  {{desc:>{1}s}}  |  {{const:^{2}s}}  |  {{pri:^{3}s}}  |  {{t:^{4}s}}".format(nl, sl, cl, pl, tl)
        to_print = []
-        for n, d, c, t, p in itertools.izip(names, desc, constrs, ts, prirs):
+        for n, d, c, t, p in zip(names, desc, constrs, ts, prirs):
            to_print.append(format_spec.format(name=n, desc=d, const=c, t=t, pri=p))
        sep = '-' * (nl + sl + cl + + pl + tl + 8 * 2 + 3)
        if header:
--- a/GPy/core/parameterization/priors.py
+++ b/GPy/core/parameterization/priors.py
@ -13,7 +13,6 @@ import weakref
 class Prior(object):
    domain = None
    _instance = None
-
    def __new__(cls, *args, **kwargs):
        if not cls._instance or cls._instance.__class__ is not cls:
                newfunc = super(Prior, cls).__new__
@ -92,7 +91,6 @@ class Gaussian(Prior):
 #         self.sigma2 = np.square(self.sigma)
 #         self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)

-
 class Uniform(Prior):
    domain = _REAL
    _instances = []
@ -131,7 +129,6 @@ class Uniform(Prior):
 #         self.lower = state[0]
 #         self.upper = state[1]

-
 class LogGaussian(Gaussian):
    """
    Implementation of the univariate *log*-Gaussian probability function, coupled with random variables.
@ -249,7 +246,6 @@ class MultivariateGaussian(Prior):
        self.inv, self.hld = pdinv(self.var)
        self.constant = -0.5 * self.input_dim * np.log(2 * np.pi) - self.hld

-
 def gamma_from_EV(E, V):
    warnings.warn("use Gamma.from_EV to create Gamma Prior", FutureWarning)
    return Gamma.from_EV(E, V)
@ -331,7 +327,6 @@ class Gamma(Prior):
        self.b = state[1]
        self.constant = -gammaln(self.a) + self.a * np.log(self.b)

-
 class InverseGamma(Gamma):
    """
    Implementation of the inverse-Gamma probability function, coupled with random variables.
@ -344,8 +339,7 @@ class InverseGamma(Gamma):
    """
    domain = _POSITIVE
    _instances = []
-
-    def __new__(cls, a=1, b=.5):  # Singleton:
+    def __new__(cls, a=1, b=.5): # Singleton:
        if cls._instances:
            cls._instances[:] = [instance for instance in cls._instances if instance()]
            for instance in cls._instances:
--- a/GPy/core/parameterization/ties_and_remappings.py
+++ b/GPy/core/parameterization/ties_and_remappings.py
@ -2,8 +2,8 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
-from parameterized import Parameterized
-from param import Param
+from .parameterized import Parameterized
+from .param import Param

 class Remapping(Parameterized):
    def mapping(self):
@ -98,7 +98,7 @@ class Tie(Parameterized):
            if np.all(self.label_buf[idx]==0):
                # None of p has been tied before.
                tie_idx = self._expandTieParam(1)
-                print tie_idx
+                print(tie_idx)
                tie_id = self.label_buf.max()+1
                self.label_buf[tie_idx] = tie_id
            else:
@ -185,18 +185,18 @@ class Tie(Parameterized):
    def _check_change(self):
        changed = False
        if self.tied_param is not None:
-            for i in xrange(self.tied_param.size):
+            for i in range(self.tied_param.size):
                b0 = self.label_buf==self.label_buf[self.buf_idx[i]]
                b = self._highest_parent_.param_array[b0]!=self.tied_param[i]
                if b.sum()==0:
-                    print 'XXX'
+                    print('XXX')
                    continue
                elif b.sum()==1:
-                    print '!!!'
+                    print('!!!')
                    val = self._highest_parent_.param_array[b0][b][0]
                    self._highest_parent_.param_array[b0] = val
                else:
-                    print '@@@'
+                    print('@@@')
                    self._highest_parent_.param_array[b0] = self.tied_param[i]
                changed = True
        return changed
@ -212,11 +212,11 @@ class Tie(Parameterized):
        if self.tied_param is not None:
            self.tied_param.gradient = 0.
            [np.put(self.tied_param.gradient, i, self._highest_parent_.gradient[self.label_buf==self.label_buf[self.buf_idx[i]]].sum()) 
-                for i in xrange(self.tied_param.size)]
+                for i in range(self.tied_param.size)]
    
    def propagate_val(self):
        if self.tied_param is not None:
-            for i in xrange(self.tied_param.size):
+            for i in range(self.tied_param.size):
                self._highest_parent_.param_array[self.label_buf==self.label_buf[self.buf_idx[i]]] = self.tied_param[i]


--- a/GPy/core/parameterization/updateable.py
+++ b/GPy/core/parameterization/updateable.py
@ -3,7 +3,7 @@ Created on 11 Nov 2014

@author: maxz
 '''
-from observable import Observable
+from .observable import Observable


 class Updateable(Observable):
@ -11,7 +11,6 @@ class Updateable(Observable):
    A model can be updated or not.
    Make sure updates can be switched on and off.
    """
-    _updates = True
    def __init__(self, *args, **kwargs):
        super(Updateable, self).__init__(*args, **kwargs)

@ -27,18 +26,18 @@ class Updateable(Observable):
            None: get the current update state
        """
        if updates is None:
-            p = getattr(self, '_highest_parent_', None)
-            if p is not None:
-                self._updates = p._updates
-            return self._updates
+            return self._update_on
        assert isinstance(updates, bool), "updates are either on (True) or off (False)"
        p = getattr(self, '_highest_parent_', None)
-        if p is not None:
-            p._updates = updates
-        self._updates = updates
+        def turn_updates(s):
+            s._update_on = updates
+        p.traverse(turn_updates)
        self.trigger_update()

    def toggle_update(self):
+        """
+        Deprecated alias of update_toggle.
+
+        Prints a deprecation notice and then delegates to update_toggle().
+        """
+        print("deprecated: toggle_update was renamed to update_toggle for easier access")
+        self.update_toggle()
+    def update_toggle(self):
+        """
+        Flip the update state of the model.
+
+        Reads the current state with update_model() and passes its negation
+        back to update_model().
+        """
        self.update_model(not self.update_model())

    def trigger_update(self, trigger_parent=True):
--- a/GPy/core/parameterization/variational.py
+++ b/GPy/core/parameterization/variational.py
@ -5,9 +5,9 @@ Created on 6 Nov 2013
 '''

 import numpy as np
-from parameterized import Parameterized
-from param import Param
-from transformations import Logexp, Logistic,__fixed__
+from .parameterized import Parameterized
+from .param import Param
+from .transformations import Logexp, Logistic,__fixed__
 from GPy.util.misc import param_to_array
 from GPy.util.caching import Cache_this

@ -16,13 +16,13 @@ class VariationalPrior(Parameterized):
        super(VariationalPrior, self).__init__(name=name, **kw)

    def KL_divergence(self, variational_posterior):
-        raise NotImplementedError, "override this for variational inference of latent space"
+        raise NotImplementedError("override this for variational inference of latent space")

    def update_gradients_KL(self, variational_posterior):
        """
        updates the gradients for mean and variance **in place**
        """
-        raise NotImplementedError, "override this for variational inference of latent space"
+        raise NotImplementedError("override this for variational inference of latent space")

 class NormalPrior(VariationalPrior):
    def KL_divergence(self, variational_posterior):
@ -36,8 +36,9 @@ class NormalPrior(VariationalPrior):
        variational_posterior.variance.gradient -= (1. - (1. / (variational_posterior.variance))) * 0.5

 class SpikeAndSlabPrior(VariationalPrior):
-    def __init__(self, pi=None, learnPi=False, variance = 1.0, name='SpikeAndSlabPrior', **kw):
-        super(SpikeAndSlabPrior, self).__init__(name=name, **kw)        
+    def __init__(self, pi=None, learnPi=False, variance = 1.0, group_spike=False, name='SpikeAndSlabPrior', **kw):
+        super(SpikeAndSlabPrior, self).__init__(name=name, **kw)
+        self.group_spike = group_spike
        self.variance = Param('variance',variance)
        self.learnPi = learnPi
        if learnPi:
@ -50,31 +51,39 @@ class SpikeAndSlabPrior(VariationalPrior):
    def KL_divergence(self, variational_posterior):
+        """
+        KL term of a spike-and-slab variational posterior against this prior.
+
+        Sums a Bernoulli part (gamma against pi) and a gamma-weighted Gaussian
+        part built from the posterior mean and variance and self.variance.
+
+        :param variational_posterior: object providing mean, variance and gamma
+        :returns: the summed KL term
+        """
        mu = variational_posterior.mean
        S = variational_posterior.variance
-        gamma,gamma1 = variational_posterior.gamma_probabilities()
-        log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
+        if self.group_spike:
+            gamma = variational_posterior.gamma.values[0]
+        else:
+            gamma = variational_posterior.gamma.values
        if len(self.pi.shape)==2:
-            idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+            # idx is used to index self.pi below, so it has to stay integral:
+            # '/' is true division on Python 3 and would produce floats.
+            idx = np.unique(variational_posterior.gamma._raveled_index()//gamma.shape[-1])
            pi = self.pi[idx]
        else:
            pi = self.pi
            
        var_mean = np.square(mu)/self.variance
        var_S = (S/self.variance - np.log(S))
-        var_gamma = (gamma*(log_gamma-np.log(pi))).sum()+(gamma1*(log_gamma1-np.log(1-pi))).sum()
+        var_gamma = (gamma*np.log(gamma/pi)).sum()+((1-gamma)*np.log((1-gamma)/(1-pi))).sum()
        return var_gamma+ (gamma* (np.log(self.variance)-1. +var_mean + var_S)).sum()/2.

    def update_gradients_KL(self, variational_posterior):
        mu = variational_posterior.mean
        S = variational_posterior.variance
-        gamma,gamma1 = variational_posterior.gamma_probabilities()
-        log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
+        if self.group_spike:
+            gamma = variational_posterior.gamma.values[0]
+        else:
+            gamma = variational_posterior.gamma.values
        if len(self.pi.shape)==2:
-            idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+            # idx is used to index self.pi below, so it has to stay integral:
+            # '/' is true division on Python 3 and would produce floats.
+            idx = np.unique(variational_posterior.gamma._raveled_index()//gamma.shape[-1])
            pi = self.pi[idx]
        else:
            pi = self.pi

-        variational_posterior.binary_prob.gradient -= (np.log((1-pi)/pi)+log_gamma-log_gamma1+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.)*gamma*gamma1
+        if self.group_spike:
+            dgamma = np.log((1-pi)/pi*gamma/(1.-gamma))/variational_posterior.num_data
+        else:
+            dgamma = np.log((1-pi)/pi*gamma/(1.-gamma))
+        variational_posterior.binary_prob.gradient -= dgamma+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
        mu.gradient -= gamma*mu/self.variance
        S.gradient -= (1./self.variance - 1./S) * gamma /2.
        if self.learnPi:
@ -141,7 +150,7 @@ class NormalPosterior(VariationalPosterior):
    holds the means and variances for a factorizing multivariate normal distribution
    '''

-    def plot(self, *args):
+    def plot(self, *args, **kwargs):
        """
        Plot latent space X in 1D:

@ -150,36 +159,47 @@ class NormalPosterior(VariationalPosterior):
        import sys
        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
        from ...plotting.matplot_dep import variational_plots
-        import matplotlib
-        return variational_plots.plot(self,*args)
+        return variational_plots.plot(self, *args, **kwargs)

+    def KL(self, other):
+        """
+        Compute the KL divergence KL(self || other) to another NormalPosterior.
+
+        Only valid if both NormalPosterior objects have the same shape: every
+        term is evaluated elementwise on the means and variances and then
+        summed.
+
+        :param other: the NormalPosterior to compare against (its mean and
+            variance are read)
+        :returns: the summed KL divergence
+        """
+        return .5*(
+            np.sum(self.variance/other.variance) 
+            + ((other.mean-self.mean)**2/other.variance).sum() 
+            - self.num_data * self.input_dim
+            + np.sum(np.log(other.variance)) - np.sum(np.log(self.variance))
+            )
+    
 class SpikeAndSlabPosterior(VariationalPosterior):
    '''
    The SpikeAndSlab distribution for variational approximations.
    '''
-    def __init__(self, means, variances, binary_prob, name='latent space'):
+    def __init__(self, means, variances, binary_prob, group_spike=False, sharedX=False, name='latent space'):
        """
        binary_prob : the probability of the distribution on the slab part.
        """
        super(SpikeAndSlabPosterior, self).__init__(means, variances, name)
-        self.gamma = Param("binary_prob",binary_prob)
-        self.link_parameter(self.gamma)
-        
-    @Cache_this(limit=5)
-    def gamma_probabilities(self):
-        prob = np.zeros_like(param_to_array(self.gamma))
-        prob[self.gamma>-710] = 1./(1.+np.exp(-self.gamma[self.gamma>-710]))
-        prob1 = -np.zeros_like(param_to_array(self.gamma))
-        prob1[self.gamma<710] = 1./(1.+np.exp(self.gamma[self.gamma<710]))
-        return prob, prob1
+        self.group_spike = group_spike
+        self.sharedX = sharedX
+        if sharedX:
+            self.mean.fix(warning=False)
+            self.variance.fix(warning=False)
+        if group_spike:
+            self.gamma_group = Param("binary_prob_group",binary_prob.mean(axis=0),Logistic(1e-10,1.-1e-10))
+            self.gamma = Param("binary_prob",binary_prob, __fixed__)
+            self.link_parameters(self.gamma_group,self.gamma)
+        else:
+            self.gamma = Param("binary_prob",binary_prob,Logistic(1e-10,1.-1e-10))
+            self.link_parameter(self.gamma)
+            
+    def propogate_val(self):
+        """
+        Copy the group-level probabilities into gamma.
+
+        Only acts when group_spike is set: the values of gamma_group are
+        assigned in place into gamma.values.
+        """
+        # NOTE(review): spelled 'propogate' here while Tie uses 'propagate_val';
+        # the name is kept because callers may rely on it.
+        if self.group_spike:
+            self.gamma.values[:] = self.gamma_group.values
    
-    @Cache_this(limit=5)
-    def gamma_log_prob(self):
-        loggamma = param_to_array(self.gamma).copy()
-        loggamma[loggamma>-40] = -np.log1p(np.exp(-loggamma[loggamma>-40]))
-        loggamma1 = -param_to_array(self.gamma).copy()
-        loggamma1[loggamma1>-40] = -np.log1p(np.exp(-loggamma1[loggamma1>-40]))
-        return loggamma,loggamma1
+    def collate_gradient(self):
+        """
+        Collect the gradient of gamma into the gradient of gamma_group.
+
+        Only acts when group_spike is set: gamma.gradient is reshaped to
+        gamma.shape and summed over axis 0.
+        """
+        if self.group_spike:
+            self.gamma_group.gradient = self.gamma.gradient.reshape(self.gamma.shape).sum(axis=0)

    def set_gradients(self, grad):
        self.mean.gradient, self.variance.gradient, self.gamma.gradient = grad
@ -198,15 +218,15 @@ class SpikeAndSlabPosterior(VariationalPosterior):
            n.parameters[dc['variance']._parent_index_] = dc['variance']
            n.parameters[dc['binary_prob']._parent_index_] = dc['binary_prob']
            n._gradient_array_ = None
-            oversize = self.size - self.mean.size - self.variance.size
-            n.size = n.mean.size + n.variance.size + oversize
+            oversize = self.size - self.mean.size - self.variance.size - self.gamma.size
+            n.size = n.mean.size + n.variance.size + n.gamma.size + oversize
            n.ndim = n.mean.ndim
            n.shape = n.mean.shape
            n.num_data = n.mean.shape[0]
            n.input_dim = n.mean.shape[1] if n.ndim != 1 else 1
            return n
        else:
-            return super(VariationalPrior, self).__getitem__(s)
+            return super(SpikeAndSlabPosterior, self).__getitem__(s)

    def plot(self, *args, **kwargs):
        """
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@ -2,18 +2,15 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
-from gp import GP
-from parameterization.param import Param
+from .gp import GP
+from .parameterization.param import Param
 from ..inference.latent_function_inference import var_dtc
 from .. import likelihoods
-from parameterization.variational import VariationalPosterior
+from .parameterization.variational import VariationalPosterior, NormalPosterior
+from ..util.linalg import mdot

 import logging
-from GPy.inference.latent_function_inference.posterior import Posterior
-from GPy.inference.optimization.stochastics import SparseGPStochastics,\
-    SparseGPMissing
-#no stochastics.py file added! from GPy.inference.optimization.stochastics import SparseGPStochastics,\
-    #SparseGPMissing
+import itertools
 logger = logging.getLogger("sparse gp")

 class SparseGP(GP):
@ -24,6 +21,10 @@ class SparseGP(GP):
    (Gaussian likelihoods) as well as non-conjugate sparse methods based on
    these.

+    This is not for missing data, as the implementation for missing data involves
+    some inefficient optimization routine decisions.
+    See the missing data SparseGP implementation in :py:class:`~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch`.
+
    :param X: inputs
    :type X: np.ndarray (num_data x input_dim)
    :param likelihood: a likelihood instance, containing the observed data
@ -39,7 +40,7 @@ class SparseGP(GP):

    """

-    def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
+    def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, inference_method=None,
                 name='sparse gp', Y_metadata=None, normalizer=False):
        #pick a sensible inference method
        if inference_method is None:
@ -47,21 +48,31 @@ class SparseGP(GP):
                inference_method = var_dtc.VarDTC(limit=1 if not self.missing_data else Y.shape[1])
            else:
                #inference_method = ??
-                raise NotImplementedError, "what to do what to do?"
-            print "defaulting to ", inference_method, "for latent function inference"
+                raise NotImplementedError("what to do what to do?")
+            print(("defaulting to ", inference_method, "for latent function inference"))

        self.Z = Param('inducing inputs', Z)
        self.num_inducing = Z.shape[0]

-        GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
+        GP.__init__(self, X, Y, kernel, likelihood, mean_function, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)

        logger.info("Adding Z as parameter")
        self.link_parameter(self.Z, index=0)
        self.posterior = None
+        self._predictive_variable = self.Z
+

    def has_uncertain_inputs(self):
        return isinstance(self.X, VariationalPosterior)

+    def set_Z(self, Z, trigger_update=True):
+        if trigger_update: self.update_model(False)
+        self.unlink_parameter(self.Z)
+        self.Z = Param('inducing inputs',Z)
+        self.link_parameter(self.Z, index=0)
+        if trigger_update: self.update_model(True)
+        if trigger_update: self._trigger_params_changed()
+
    def parameters_changed(self):
        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata)

@ -102,34 +113,74 @@ class SparseGP(GP):

    def _raw_predict(self, Xnew, full_cov=False, kern=None):
        """
-        Make a prediction for the latent function values
+        Make a prediction for the latent function values.
+
+        For certain inputs we give back a full_cov of shape NxN,
+        if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is off,
+        we take only the diagonal elements across N.
+
+        For uncertain inputs, the SparseGP bound cannot predict the full covariance matrix full_cov for now.
+        The implementation of that will follow. However, for each dimension the
+        covariance changes, so if full_cov is False (standard), we return the variance
+        for each dimension [NxD].
        """

        if kern is None: kern = self.kern

        if not isinstance(Xnew, VariationalPosterior):
-            Kx = kern.K(self.Z, Xnew)
-            mu = np.dot(Kx.T, self.posterior.woodbury_vector)
-            if full_cov:
-                Kxx = kern.K(Xnew)
-                if self.posterior.woodbury_inv.ndim == 2:
-                    var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
-                elif self.posterior.woodbury_inv.ndim == 3:
-                    var = Kxx[:,:,None] - np.tensordot(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx).T, Kx, [1,0]).swapaxes(1,2)
-                var = var
-            else:
-                Kxx = kern.Kdiag(Xnew)
-                var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
+            # Kx = kern.K(self._predictive_variable, Xnew)
+            # mu = np.dot(Kx.T, self.posterior.woodbury_vector)
+            # if full_cov:
+            #     Kxx = kern.K(Xnew)
+            #     if self.posterior.woodbury_inv.ndim == 2:
+            #         var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
+            #     elif self.posterior.woodbury_inv.ndim == 3:
+            #         var = np.empty((Kxx.shape[0],Kxx.shape[1],self.posterior.woodbury_inv.shape[2]))
+            #         for i in range(var.shape[2]):
+            #             var[:, :, i] = (Kxx - mdot(Kx.T, self.posterior.woodbury_inv[:, :, i], Kx))
+            #     var = var
+            # else:
+            #     Kxx = kern.Kdiag(Xnew)
+            #     if self.posterior.woodbury_inv.ndim == 2:
+            #         var = (Kxx - np.sum(np.dot(self.posterior.woodbury_inv.T, Kx) * Kx, 0))[:,None]
+            #     elif self.posterior.woodbury_inv.ndim == 3:
+            #         var = np.empty((Kxx.shape[0],self.posterior.woodbury_inv.shape[2]))
+            #         for i in range(var.shape[1]):
+            #             var[:, i] = (Kxx - (np.sum(np.dot(self.posterior.woodbury_inv[:, :, i].T, Kx) * Kx, 0)))
+            #     var = var
+            # #add in the mean function
+            # if self.mean_function is not None:
+            #     mu += self.mean_function.f(Xnew)
+            mu, var = super(SparseGP, self)._raw_predict(Xnew, full_cov, kern)
        else:
-            Kx = kern.psi1(self.Z, Xnew).T
-            mu = np.dot(Kx.T, self.posterior.woodbury_vector)
+            psi0_star = kern.psi0(self._predictive_variable, Xnew)
+            psi1_star = kern.psi1(self._predictive_variable, Xnew)
+            #psi2_star = kern.psi2(self.Z, Xnew) # Only possible if we get NxMxM psi2 out of the code.
+            la = self.posterior.woodbury_vector
+            mu = np.dot(psi1_star, la) # TODO: dimensions?
+
            if full_cov:
-                Kxx = kern.K(Xnew.mean)
-                if self.posterior.woodbury_inv.ndim == 2:
-                    var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
-                elif self.posterior.woodbury_inv.ndim == 3:
-                    var = Kxx[:,:,None] - np.tensordot(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx).T, Kx, [1,0]).swapaxes(1,2)
+                raise NotImplementedError("Full covariance for Sparse GP predicted with uncertain inputs not implemented yet.")
+                var = np.empty((Xnew.shape[0], la.shape[1], la.shape[1]))
+                di = np.diag_indices(la.shape[1])
            else:
-                Kxx = kern.psi0(self.Z, Xnew)
-                var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
+                var = np.empty((Xnew.shape[0], la.shape[1]))
+
+            for i in range(Xnew.shape[0]):
+                _mu, _var = Xnew.mean.values[[i]], Xnew.variance.values[[i]]
+                psi2_star = kern.psi2(self._predictive_variable, NormalPosterior(_mu, _var))
+                tmp = (psi2_star[:, :] - psi1_star[[i]].T.dot(psi1_star[[i]]))
+
+                var_ = mdot(la.T, tmp, la)
+                p0 = psi0_star[i]
+                t = np.atleast_3d(self.posterior.woodbury_inv)
+                t2 = np.trace(t.T.dot(psi2_star), axis1=1, axis2=2)
+
+                if full_cov:
+                    var_[di] += p0
+                    var_[di] += -t2
+                    var[i] = var_
+                else:
+                    var[i] = np.diag(var_)+p0-t2
+
        return mu, var
--- a/GPy/core/sparse_gp_mpi.py
+++ b/GPy/core/sparse_gp_mpi.py
@ -2,7 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
-from sparse_gp import SparseGP
+from .sparse_gp import SparseGP
 from numpy.linalg.linalg import LinAlgError
 from ..inference.latent_function_inference.var_dtc_parallel import update_gradients, VarDTC_minibatch

@ -34,7 +34,7 @@ class SparseGP_MPI(SparseGP):

    """

-    def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp mpi', Y_metadata=None, mpi_comm=None, normalizer=False):
+    def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp', Y_metadata=None, mpi_comm=None, normalizer=False):
        self._IN_OPTIMIZATION_ = False
        if mpi_comm != None:
            if inference_method is None:
@ -56,7 +56,7 @@ class SparseGP_MPI(SparseGP):
            self.N_range = (N_start, N_end)
            self.N_list = np.array(N_list)
            self.Y_local = self.Y[N_start:N_end]
-            print 'MPI RANK '+str(self.mpi_comm.rank)+' with the data range '+str(self.N_range)
+            print('MPI RANK '+str(self.mpi_comm.rank)+' with the data range '+str(self.N_range))
            mpi_comm.Bcast(self.param_array, root=0)
        self.update_model(True)

--- a/GPy/core/svgp.py
+++ b/GPy/core/svgp.py
@ -0,0 +1,105 @@
+# Copyright (c) 2014, James Hensman, Alex Matthews
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from ..util import choleskies
+from .sparse_gp import SparseGP
+from .parameterization.param import Param
+from ..inference.latent_function_inference.svgp import SVGP as svgp_inf
+
+
+class SVGP(SparseGP):
+    def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, name='SVGP', Y_metadata=None, batchsize=None, num_latent_functions=None):
+        """
+        Stochastic Variational GP.
+
+        For Gaussian Likelihoods, this implements
+
+        Gaussian Processes for Big data, Hensman, Fusi and Lawrence, UAI 2013,
+
+        But without natural gradients. We'll use the lower-triangular
+        representation of the covariance matrix to ensure
+        positive-definiteness.
+
+        For Non Gaussian Likelihoods, this implements
+
+        Hensman, Matthews and Ghahramani, Scalable Variational GP Classification, ArXiv 1411.2005
+        """
+        self.batchsize = batchsize
+        self.X_all, self.Y_all = X, Y
+        if batchsize is None:
+            X_batch, Y_batch = X, Y
+        else:
+            import climin.util
+            #Make a climin slicer to make drawing minibatches much quicker
+            self.slicer = climin.util.draw_mini_slices(self.X_all.shape[0], self.batchsize)
+            X_batch, Y_batch = self.new_batch()
+
+        #create the SVI inference method
+        inf_method = svgp_inf()
+
+        SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, mean_function=mean_function, inference_method=inf_method,
+                 name=name, Y_metadata=Y_metadata, normalizer=False)
+
+        #assume the number of latent functions is one per col of Y unless specified
+        if num_latent_functions is None:
+            num_latent_functions = Y.shape[1]
+
+        self.m = Param('q_u_mean', np.zeros((self.num_inducing, num_latent_functions)))
+        chol = choleskies.triang_to_flat(np.tile(np.eye(self.num_inducing)[None,:,:], (num_latent_functions, 1,1)))
+        self.chol = Param('q_u_chol', chol)
+        self.link_parameter(self.chol)
+        self.link_parameter(self.m)
+
+    def parameters_changed(self):
+        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.mean_function, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
+
+        #update the kernel gradients
+        self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z)
+        grad = self.kern.gradient.copy()
+        self.kern.update_gradients_full(self.grad_dict['dL_dKmn'], self.Z, self.X)
+        grad += self.kern.gradient.copy()
+        self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X)
+        self.kern.gradient += grad
+        if not self.Z.is_fixed:# only compute these expensive gradients if we need them
+            self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z) + self.kern.gradients_X(self.grad_dict['dL_dKmn'], self.Z, self.X)
+
+
+        self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
+        #update the variational parameter gradients:
+        self.m.gradient = self.grad_dict['dL_dm']
+        self.chol.gradient = self.grad_dict['dL_dchol']
+
+        if self.mean_function is not None:
+            self.mean_function.update_gradients(self.grad_dict['dL_dmfX'], self.X)
+            g = self.mean_function.gradient[:].copy()
+            self.mean_function.update_gradients(self.grad_dict['dL_dmfZ'], self.Z)
+            self.mean_function.gradient[:] += g
+            self.Z.gradient[:] += self.mean_function.gradients_X(self.grad_dict['dL_dmfZ'], self.Z)
+
+    def set_data(self, X, Y):
+        """
+        Set the data without calling parameters_changed to avoid wasted computation
+        If this is called by the stochastic_grad function this will immediately update the gradients
+        """
+        assert X.shape[1]==self.Z.shape[1]
+        self.X, self.Y = X, Y
+
+    def new_batch(self):
+        """
+        Return a new batch of X and Y by taking a chunk of data from the complete X and Y
+        """
+        i = self.slicer.next()
+        return self.X_all[i], self.Y_all[i]
+
+    def stochastic_grad(self, parameters):
+        self.set_data(*self.new_batch())
+        return self._grads(parameters)
+
+    def optimizeWithFreezingZ(self):
+        self.Z.fix()
+        self.kern.fix()
+        self.optimize('bfgs')
+        self.Z.unfix()
+        self.kern.constrain_positive()
+        self.optimize('bfgs')
--- a/GPy/core/symbolic.py
+++ b/GPy/core/symbolic.py
@ -223,7 +223,7 @@ class Symbolic_core():

    def code_gradients_cacheable(self, function, variable):
        if variable not in self.cacheable:
-            raise RuntimeError, variable + ' must be a cacheable.'
+            raise RuntimeError(variable + ' must be a cacheable.')
        lcode = 'gradients_' + variable + ' = np.zeros_like(' + variable + ')\n'
        lcode += 'self.update_cache(' + ', '.join(self.cacheable) + ')\n'
        for i, theta in enumerate(self.variables[variable]):
--- a/GPy/core/verbose_optimization.py
+++ b/GPy/core/verbose_optimization.py
@ -0,0 +1,185 @@
+# Copyright (c) 2012-2014, Max Zwiessele.
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from __future__ import print_function
+import numpy as np
+import sys
+import time
+import datetime
+
+def exponents(fnow, current_grad):
+    exps = [np.abs(np.float(fnow)), 1 if current_grad is np.nan else current_grad]
+    return np.sign(exps) * np.log10(exps).astype(int)
+
+class VerboseOptimization(object):
+    def __init__(self, model, opt, maxiters, verbose=False, current_iteration=0, ipython_notebook=True, clear_after_finish=False):
+        self.verbose = verbose
+        if self.verbose:
+            self.model = model
+            self.iteration = current_iteration
+            self.p_iter = self.iteration
+            self.maxiters = maxiters
+            self.len_maxiters = len(str(maxiters))
+            self.opt_name = opt.opt_name
+            self.model.add_observer(self, self.print_status)
+            self.status = 'running'
+            self.clear = clear_after_finish
+
+            self.update()
+
+            try:
+                from IPython.display import display
+                from IPython.html.widgets import IntProgress, HTML, Box, VBox, HBox, FlexBox
+                self.text = HTML(width='100%')
+                self.progress = IntProgress(min=0, max=maxiters)
+                #self.progresstext = Text(width='100%', disabled=True, value='0/{}'.format(maxiters))
+                self.model_show = HTML()
+                self.ipython_notebook = ipython_notebook
+            except:
+                # Not in an IPython notebook
+                self.ipython_notebook = False
+
+            if self.ipython_notebook:
+                left_col = VBox(children=[self.progress, self.text], padding=2, width='40%')
+                right_col = Box(children=[self.model_show], padding=2, width='60%')
+                self.hor_align = FlexBox(children = [left_col, right_col], width='100%', orientation='horizontal')
+
+                display(self.hor_align)
+
+                try:
+                    self.text.set_css('width', '100%')
+                    left_col.set_css({
+                             'padding': '2px',
+                             'width': "100%",
+                             })
+
+                    right_col.set_css({
+                             'padding': '2px',
+                             })
+
+                    self.hor_align.set_css({
+                             'width': "100%",
+                             })
+
+                    self.hor_align.remove_class('vbox')
+                    self.hor_align.add_class('hbox')
+
+                    left_col.add_class("box-flex1")
+                    right_col.add_class('box-flex0')
+
+                except:
+                    pass
+
+                #self.text.add_class('box-flex2')
+                #self.progress.add_class('box-flex1')
+            else:
+                self.exps = exponents(self.fnow, self.current_gradient)
+                print('Running {} Code:'.format(self.opt_name))
+                print('  {3:7s}   {0:{mi}s}   {1:11s}    {2:11s}'.format("i", "f", "|g|", "runtime", mi=self.len_maxiters))
+
+    def __enter__(self):
+        self.start = time.time()
+        self._time = self.start
+        return self
+
+    def print_out(self, seconds):
+        if seconds<60:
+            ms = (seconds%1)*100
+            self.timestring = "{s:0>2d}s{ms:0>2d}".format(s=int(seconds), ms=int(ms))
+        else:
+            m, s = divmod(seconds, 60)
+            if m>59:
+                h, m = divmod(m, 60)
+                if h>23:
+                    d, h = divmod(h, 24)
+                    self.timestring = '{d:0>2d}d{h:0>2d}h{m:0>2d}'.format(m=int(m), h=int(h), d=int(d))
+                else:
+                    self.timestring = '{h:0>2d}h{m:0>2d}m{s:0>2d}'.format(m=int(m), s=int(s), h=int(h))
+            else:
+                ms = (seconds%1)*100
+                self.timestring = '{m:0>2d}m{s:0>2d}s{ms:0>2d}'.format(m=int(m), s=int(s), ms=int(ms))
+        if self.ipython_notebook:
+            names_vals = [['optimizer', "{:s}".format(self.opt_name)],
+                          ['runtime', "{:>s}".format(self.timestring)],
+                          ['evaluation', "{:>0{l}}".format(self.iteration, l=self.len_maxiters)],
+                          ['objective', "{: > 12.3E}".format(self.fnow)],
+                          ['||gradient||', "{: >+12.3E}".format(float(self.current_gradient))],
+                          ['status', "{:s}".format(self.status)],
+                      ]
+            #message = "Lik:{:5.3E} Grad:{:5.3E} Lik:{:5.3E} Len:{!s}".format(float(m.log_likelihood()), np.einsum('i,i->', grads, grads), float(m.likelihood.variance), " ".join(["{:3.2E}".format(l) for l in m.kern.lengthscale.values]))
+            html_begin = """<style type="text/css">
+    .tg-opt  {font-family:"Courier New", Courier, monospace !important;padding:2px 3px;word-break:normal;border-collapse:collapse;border-spacing:0;border-color:#DCDCDC;margin:0px auto;width:100%;}
+    .tg-opt td{font-family:"Courier New", Courier, monospace !important;font-weight:bold;color:#444;background-color:#F7FDFA;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;}
+    .tg-opt th{font-family:"Courier New", Courier, monospace !important;font-weight:normal;color:#fff;background-color:#26ADE4;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;}
+    .tg-opt .tg-left{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:left;}
+    .tg-opt .tg-right{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:right;}
+    </style>
+    <table class="tg-opt">"""
+            html_end = "</table>"
+            html_body = ""
+            for name, val in names_vals:
+                html_body += "<tr>"
+                html_body += "<td class='tg-left'>{}</td>".format(name)
+                html_body += "<td class='tg-right'>{}</td>".format(val)
+                html_body += "</tr>"
+            self.text.value = html_begin + html_body + html_end
+            self.progress.value = (self.iteration+1)
+            #self.progresstext.value = '0/{}'.format((self.iteration+1))
+            self.model_show.value = self.model._repr_html_()
+        else:
+            n_exps = exponents(self.fnow, self.current_gradient)
+            if self.iteration - self.p_iter >= 20 * np.random.rand():
+                a = self.iteration >= self.p_iter * 2.78
+                b = np.any(n_exps < self.exps)
+                if a or b:
+                    self.p_iter = self.iteration
+                    print('')
+                if b:
+                    self.exps = n_exps
+            print('\r', end=' ')
+            print('{3:}  {0:>0{mi}g}  {1:> 12e}  {2:> 12e}'.format(self.iteration, float(self.fnow), float(self.current_gradient), "{:>8s}".format(self.timestring), mi=self.len_maxiters), end=' ') # print 'Iteration:', iteration, ' Objective:', fnow, '  Scale:', beta, '\r',
+            sys.stdout.flush()
+
+    def print_status(self, me, which=None):
+        self.update()
+
+        t = time.time()
+        seconds = t-self.start
+        #sys.stdout.write(" "*len(self.message))
+        if t-self._time > .3 or seconds < .3:
+            self.print_out(seconds)
+            self._time = t
+
+        self.iteration += 1
+
+    def update(self):
+        self.fnow = self.model.objective_function()
+        if self.model.obj_grads is not None:
+            grad = self.model.obj_grads
+            self.current_gradient = np.dot(grad, grad)
+        else:
+            self.current_gradient = np.nan
+
+    def finish(self, opt):
+        self.status = opt.status
+        if self.verbose and self.ipython_notebook:
+            if 'conv' in self.status.lower():
+                self.progress.bar_style = 'success'
+            elif self.iteration >= self.maxiters:
+                self.progress.bar_style = 'warning'
+            else:
+                self.progress.bar_style = 'danger'
+
+    def __exit__(self, type, value, traceback):
+        if self.verbose:
+            self.stop = time.time()
+            self.model.remove_observer(self)
+            self.print_out(self.stop - self.start)
+
+            if not self.ipython_notebook:
+                print()
+                print('Runtime: {}'.format("{:>9s}".format(self.timestring)))
+                print('Optimization status: {0}'.format(self.status))
+                print()
+            elif self.clear:
+                self.hor_align.close()
--- a/GPy/defaults.cfg
+++ b/GPy/defaults.cfg
@ -25,3 +25,6 @@ MKL = False
 [weave]
 #if true, try to use weave, and fall back to numpy. if false, just use numpy.
 working = True
+
+[cython]
+working = True
--- a/GPy/examples/init.py
+++ b/GPy/examples/init.py
@ -1,7 +1,7 @@
 # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-import classification
-import regression
-import dimensionality_reduction
-import non_gaussian
+from . import classification
+from . import regression
+from . import dimensionality_reduction
+from . import non_gaussian
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@ -15,7 +15,7 @@ def oil(num_inducing=50, max_iters=100, kernel=None, optimize=True, plot=True):

    """
    try:import pods
-    except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+    except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets')
    data = pods.datasets.oil()
    X = data['X']
    Xtest = data['Xtest']
@ -52,7 +52,7 @@ def toy_linear_1d_classification(seed=default_seed, optimize=True, plot=True):
    """

    try:import pods
-    except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+    except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets')
    data = pods.datasets.toy_linear_1d_classification(seed=seed)
    Y = data['Y'][:, 0:1]
    Y[Y.flatten() == -1] = 0
@ -75,7 +75,7 @@ def toy_linear_1d_classification(seed=default_seed, optimize=True, plot=True):
        m.plot_f(ax=axes[0])
        m.plot(ax=axes[1])

-    print m
+    print(m)
    return m

 def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=True):
@ -88,7 +88,7 @@ def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=
    """

    try:import pods
-    except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+    except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets')
    data = pods.datasets.toy_linear_1d_classification(seed=seed)
    Y = data['Y'][:, 0:1]
    Y[Y.flatten() == -1] = 0
@ -114,7 +114,7 @@ def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=
        m.plot_f(ax=axes[0])
        m.plot(ax=axes[1])

-    print m
+    print(m)
    return m

 def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, optimize=True, plot=True):
@ -127,7 +127,7 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti
    """

    try:import pods
-    except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+    except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets')
    data = pods.datasets.toy_linear_1d_classification(seed=seed)
    Y = data['Y'][:, 0:1]
    Y[Y.flatten() == -1] = 0
@ -147,7 +147,7 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti
        m.plot_f(ax=axes[0])
        m.plot(ax=axes[1])

-    print m
+    print(m)
    return m

 def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True):
@ -160,7 +160,7 @@ def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True):
    """

    try:import pods
-    except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+    except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets')
    data = pods.datasets.toy_linear_1d_classification(seed=seed)
    Y = data['Y'][:, 0:1]
    Y[Y.flatten() == -1] = 0
@ -177,7 +177,7 @@ def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True):
        # Parameters optimization:
        for _ in range(5):
            m.optimize(max_iters=int(max_iters/5))
-        print m
+        print(m)

    # Plot
    if plot:
@ -186,7 +186,7 @@ def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True):
        m.plot_f(ax=axes[0])
        m.plot(ax=axes[1])

-    print m
+    print(m)
    return m

 def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=None, optimize=True, plot=True):
@ -202,7 +202,7 @@ def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=
    :type kernel: a GPy kernel
    """
    try:import pods
-    except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+    except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets')
    data = pods.datasets.crescent_data(seed=seed)
    Y = data['Y']
    Y[Y.flatten()==-1] = 0
@ -217,12 +217,11 @@ def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=
    elif model_type == 'FITC':
        m = GPy.models.FITCClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing)
        m['.*len'] = 3.
-
    if optimize:
-        m.pseudo_EM()
+        m.optimize()

    if plot:
        m.plot()

-    print m
+    print(m)
    return m
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@ -215,6 +215,7 @@ def ssgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40
    return m

 def _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim=False):
+    """Simulate some data drawn from a matern covariance and a periodic exponential for use in MRD demos."""
    Q_signal = 4
    import GPy
    import numpy as np
@ -254,6 +255,7 @@ def _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim=False):
    return slist, [S1, S2, S3], Ylist

 def _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim=False):
+    """Simulate some data drawn from sine and cosine for use in demos of MRD"""
    _np.random.seed(1234)

    x = _np.linspace(0, 4 * _np.pi, N)[:, None]
@ -333,7 +335,7 @@ def bgplvm_simulation(optimize=True, verbose=1,
    m.likelihood.variance = .1

    if optimize:
-        print "Optimizing model:"
+        print("Optimizing model:")
        m.optimize('bfgs', messages=verbose, max_iters=max_iters,
                   gtol=.05)
    if plot:
@ -353,13 +355,13 @@ def ssgplvm_simulation(optimize=True, verbose=1,
    Y = Ylist[0]
    k = kern.Linear(Q, ARD=True)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
    # k = kern.RBF(Q, ARD=True, lengthscale=10.)
-    m = SSGPLVM(Y, Q, init="pca", num_inducing=num_inducing, kernel=k)
+    m = SSGPLVM(Y, Q, init="rand", num_inducing=num_inducing, kernel=k, group_spike=True)
    m.X.variance[:] = _np.random.uniform(0, .01, m.X.shape)
-    m.likelihood.variance = .1
+    m.likelihood.variance = .01

    if optimize:
-        print "Optimizing model:"
-        m.optimize('scg', messages=verbose, max_iters=max_iters,
+        print("Optimizing model:")
+        m.optimize('bfgs', messages=verbose, max_iters=max_iters,
                   gtol=.05)
    if plot:
        m.X.plot("SSGPLVM Latent Space 1D")
@ -388,7 +390,7 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1,
    m.Yreal = Y

    if optimize:
-        print "Optimizing model:"
+        print("Optimizing model:")
        m.optimize('bfgs', messages=verbose, max_iters=max_iters,
                   gtol=.05)
    if plot:
@ -402,7 +404,8 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
    from GPy.models import MRD

    D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
-    _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
+    _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim)
+    

    # Ylist = [Ylist[0]]
    k = kern.Linear(Q, ARD=True)
@ -411,7 +414,7 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
    m['.*noise'] = [Y.var() / 40. for Y in Ylist]

    if optimize:
-        print "Optimizing Model:"
+        print("Optimizing Model:")
        m.optimize(messages=verbose, max_iters=8e3)
    if plot:
        m.X.plot("MRD Latent Space 1D")
@ -439,7 +442,7 @@ def mrd_simulation_missing_data(optimize=True, verbose=True, plot=True, plot_sim
            initx="random", initz='permute', **kw)

    if optimize:
-        print "Optimizing Model:"
+        print("Optimizing Model:")
        m.optimize('bfgs', messages=verbose, max_iters=8e3, gtol=.1)
    if plot:
        m.X.plot("MRD Latent Space 1D")
@ -585,6 +588,7 @@ def robot_wireless(optimize=True, verbose=True, plot=True):
    return m

 def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
+    """Interactive visualisation of the Stick Man data from Ohio State University with the Bayesian GPLVM."""
    from GPy.models import BayesianGPLVM
    from matplotlib import pyplot as plt
    import numpy as np
@ -603,7 +607,7 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
    try:
        if optimize: m.optimize('bfgs', messages=verbose, max_iters=5e3, bfgs_factor=10)
    except KeyboardInterrupt:
-        print "Keyboard interrupt, continuing to plot and return"
+        print("Keyboard interrupt, continuing to plot and return")

    if plot:
        fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
@ -613,7 +617,8 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
        data_show = GPy.plotting.matplot_dep.visualize.stick_show(y, connect=data['connect'])
        dim_select = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean[:1, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
        fig.canvas.draw()
-        fig.canvas.show()
+        # Canvas.show doesn't work on OSX.
+        #fig.canvas.show()
        raw_input('Press enter to finish')

    return m
@ -653,7 +658,7 @@ def ssgplvm_simulation_linear():
    def sample_X(Q, pi):
        x = np.empty(Q)
        dies = np.random.rand(Q)
-        for q in xrange(Q):
+        for q in range(Q):
            if dies[q] < pi:
                x[q] = np.random.randn()
            else:
@ -663,7 +668,7 @@ def ssgplvm_simulation_linear():
    Y = np.empty((N, D))
    X = np.empty((N, Q))
    # Generate data from random sampled weight matrices
-    for n in xrange(N):
+    for n in range(N):
        X[n] = sample_X(Q, pi)
        w = np.random.randn(D, Q)
        Y[n] = np.dot(w, X[n])
--- a/GPy/examples/non_gaussian.py
+++ b/GPy/examples/non_gaussian.py
@ -37,7 +37,7 @@ def student_t_approx(optimize=True, plot=True):

    #Add student t random noise to datapoints
    deg_free = 1
-    print "Real noise: ", real_std
+    print("Real noise: ", real_std)
    initial_var_guess = 0.5
    edited_real_sd = initial_var_guess

@ -73,7 +73,7 @@ def student_t_approx(optimize=True, plot=True):
    m4['.*t_scale2'].constrain_bounded(1e-6, 10.)
    m4['.*white'].constrain_fixed(1e-5)
    m4.randomize()
-    print m4
+    print(m4)
    debug=True
    if debug:
        m4.optimize(messages=1)
@ -81,18 +81,18 @@ def student_t_approx(optimize=True, plot=True):
        pb.plot(m4.X, m4.inference_method.f_hat)
        pb.plot(m4.X, m4.Y, 'rx')
        m4.plot()
-        print m4
+        print(m4)
        return m4

    if optimize:
        optimizer='scg'
-        print "Clean Gaussian"
+        print("Clean Gaussian")
        m1.optimize(optimizer, messages=1)
-        print "Corrupt Gaussian"
+        print("Corrupt Gaussian")
        m2.optimize(optimizer, messages=1)
-        print "Clean student t"
+        print("Clean student t")
        m3.optimize(optimizer, messages=1)
-        print "Corrupt student t"
+        print("Corrupt student t")
        m4.optimize(optimizer, messages=1)

    if plot:
@ -151,7 +151,7 @@ def boston_example(optimize=True, plot=True):

    for n, (train, test) in enumerate(kf):
        X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
-        print "Fold {}".format(n)
+        print("Fold {}".format(n))

        noise = 1e-1 #np.exp(-2)
        rbf_len = 0.5
@ -163,21 +163,21 @@ def boston_example(optimize=True, plot=True):
        score_folds[0, n] = rmse(Y_test, np.mean(Y_train))

        #Gaussian GP
-        print "Gauss GP"
+        print("Gauss GP")
        mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp.copy())
        mgp.constrain_fixed('.*white', 1e-5)
        mgp['.*len'] = rbf_len
        mgp['.*noise'] = noise
-        print mgp
+        print(mgp)
        if optimize:
            mgp.optimize(optimizer=optimizer, messages=messages)
        Y_test_pred = mgp.predict(X_test)
        score_folds[1, n] = rmse(Y_test, Y_test_pred[0])
        pred_density[1, n] = np.mean(mgp.log_predictive_density(X_test, Y_test))
-        print mgp
-        print pred_density
+        print(mgp)
+        print(pred_density)

-        print "Gaussian Laplace GP"
+        print("Gaussian Laplace GP")
        N, D = Y_train.shape
        g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=noise, N=N, D=D)
        g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution)
@ -186,18 +186,18 @@ def boston_example(optimize=True, plot=True):
        mg.constrain_fixed('.*white', 1e-5)
        mg['rbf_len'] = rbf_len
        mg['noise'] = noise
-        print mg
+        print(mg)
        if optimize:
            mg.optimize(optimizer=optimizer, messages=messages)
        Y_test_pred = mg.predict(X_test)
        score_folds[2, n] = rmse(Y_test, Y_test_pred[0])
        pred_density[2, n] = np.mean(mg.log_predictive_density(X_test, Y_test))
-        print pred_density
-        print mg
+        print(pred_density)
+        print(mg)

        for stu_num, df in enumerate(degrees_freedoms):
            #Student T
-            print "Student-T GP {}df".format(df)
+            print("Student-T GP {}df".format(df))
            t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=df, sigma2=noise)
            stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
            mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
@ -205,14 +205,14 @@ def boston_example(optimize=True, plot=True):
            mstu_t.constrain_bounded('.*t_scale2', 0.0001, 1000)
            mstu_t['rbf_len'] = rbf_len
            mstu_t['.*t_scale2'] = noise
-            print mstu_t
+            print(mstu_t)
            if optimize:
                mstu_t.optimize(optimizer=optimizer, messages=messages)
            Y_test_pred = mstu_t.predict(X_test)
            score_folds[3+stu_num, n] = rmse(Y_test, Y_test_pred[0])
            pred_density[3+stu_num, n] = np.mean(mstu_t.log_predictive_density(X_test, Y_test))
-            print pred_density
-            print mstu_t
+            print(pred_density)
+            print(mstu_t)

    if plot:
        plt.figure()
@ -230,8 +230,8 @@ def boston_example(optimize=True, plot=True):
        plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
        plt.title('Stu t {}df'.format(df))

-    print "Average scores: {}".format(np.mean(score_folds, 1))
-    print "Average pred density: {}".format(np.mean(pred_density, 1))
+    print("Average scores: {}".format(np.mean(score_folds, 1)))
+    print("Average pred density: {}".format(np.mean(pred_density, 1)))

    if plot:
        #Plotting
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@ -15,7 +15,7 @@ def olympic_marathon_men(optimize=True, plot=True):
    """Run a standard Gaussian process regression on the Olympic marathon data."""
    try:import pods
    except ImportError:
-        print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+        print('pods unavailable, see https://github.com/sods/ods for example datasets')
        return
    data = pods.datasets.olympic_marathon_men()

@ -88,7 +88,7 @@ def epomeo_gpx(max_iters=200, optimize=True, plot=True):
    """
    try:import pods
    except ImportError:
-        print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+        print('pods unavailable, see https://github.com/sods/ods for example datasets')
        return
    data = pods.datasets.epomeo_gpx()
    num_data_list = []
@ -135,7 +135,7 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000

    try:import pods
    except ImportError:
-        print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+        print('pods unavailable, see https://github.com/sods/ods for example datasets')
        return
    data = pods.datasets.della_gatta_TRP63_gene_expression(data_set='della_gatta',gene_number=gene_number)
    # data['Y'] = data['Y'][0::2, :]
@ -219,7 +219,7 @@ def olympic_100m_men(optimize=True, plot=True):
    """Run a standard Gaussian process regression on the Rogers and Girolami olympics data."""
    try:import pods
    except ImportError:
-        print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+        print('pods unavailable, see https://github.com/sods/ods for example datasets')
        return
    data = pods.datasets.olympic_100m_men()

@ -240,7 +240,7 @@ def toy_rbf_1d(optimize=True, plot=True):
    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
    try:import pods
    except ImportError:
-        print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+        print('pods unavailable, see https://github.com/sods/ods for example datasets')
        return
    data = pods.datasets.toy_rbf_1d()

@ -258,7 +258,7 @@ def toy_rbf_1d_50(optimize=True, plot=True):
    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
    try:import pods
    except ImportError:
-        print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+        print('pods unavailable, see https://github.com/sods/ods for example datasets')
        return
    data = pods.datasets.toy_rbf_1d_50()

@ -377,7 +377,7 @@ def robot_wireless(max_iters=100, kernel=None, optimize=True, plot=True):
    """Predict the location of a robot given wirelss signal strength readings."""
    try:import pods
    except ImportError:
-        print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+        print('pods unavailable, see https://github.com/sods/ods for example datasets')
        return
    data = pods.datasets.robot_wireless()

@ -398,14 +398,14 @@ def robot_wireless(max_iters=100, kernel=None, optimize=True, plot=True):

    sse = ((data['Xtest'] - Xpredict)**2).sum()

-    print('Sum of squares error on test data: ' + str(sse))
+    print(('Sum of squares error on test data: ' + str(sse)))
    return m

 def silhouette(max_iters=100, optimize=True, plot=True):
    """Predict the pose of a figure given a silhouette. This is a task from Agarwal and Triggs 2004 ICML paper."""
    try:import pods
    except ImportError:
-        print 'pods unavailable, see https://github.com/sods/ods for example datasets'
+        print('pods unavailable, see https://github.com/sods/ods for example datasets')
        return
    data = pods.datasets.silhouette()

@ -416,7 +416,7 @@ def silhouette(max_iters=100, optimize=True, plot=True):
    if optimize:
        m.optimize(messages=True, max_iters=max_iters)

-    print m
+    print(m)
    return m

 def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, optimize=True, plot=True, checkgrad=False):
@ -468,7 +468,7 @@ def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100, opt
    if plot:
        m.plot()

-    print m
+    print(m)
    return m

 def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
@ -492,7 +492,7 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
    if plot:
        m.plot(ax=axes[0])
        axes[0].set_title('no input uncertainty')
-    print m
+    print(m)

    # the same Model with uncertainty
    m = GPy.models.SparseGPRegression(X, Y, kernel=GPy.kern.RBF(1), Z=Z, X_variance=S)
@ -503,5 +503,50 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
        axes[1].set_title('with input uncertainty')
        fig.canvas.draw()

-    print m
+    print(m)
    return m
+
+def simple_mean_function(max_iters=100, optimize=True, plot=True):
+    """
+    GP regression demo with a fixed, parameter-free sinusoidal mean function.
+
+    :param max_iters: value forwarded to the model's ``optimize`` call
+    :param optimize: if true, optimize the model before returning it
+    :param plot: if true, plot the model over the limits (-10, 15)
+    :returns: the constructed GP model
+    """
+    # The mean is a bare Mapping whose function is np.sin; its gradient
+    # update is replaced by a no-op since nothing is assigned for it to learn.
+    sin_mean = GPy.core.Mapping(1,1)
+    sin_mean.f = np.sin
+    sin_mean.update_gradients = lambda a,b: None
+
+    # Toy data: a sinusoid plus a faster cosine component and Gaussian noise.
+    inputs = np.linspace(0,10,50).reshape(-1,1)
+    targets = np.sin(inputs) + 0.5*np.cos(3*inputs) + 0.1*np.random.randn(*inputs.shape)
+
+    model = GPy.core.GP(inputs, targets,
+                        kernel=GPy.kern.RBF(1),
+                        likelihood=GPy.likelihoods.Gaussian(),
+                        mean_function=sin_mean)
+    if optimize:
+        model.optimize(max_iters=max_iters)
+    if plot:
+        model.plot(plot_limits=(-10,15))
+    return model
+
+def parametric_mean_function(max_iters=100, optimize=True, plot=True):
+    """
+    A linear mean function with parameters that we'll learn alongside the kernel
+
+    :param max_iters: value forwarded to the model's ``optimize`` call
+    :param optimize: if true, optimize the model before returning it
+    :param plot: if true, plot the model
+    :returns: the constructed GP model
+    """
+    X = np.linspace(0,10,50).reshape(-1,1)
+    # Wiggly signal plus a linear trend (3*X) for the linear mean to capture.
+    Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape) + 3*X
+
+    # The mean function is the linear mapping only; the sinusoidal Mapping
+    # that used to be built here was overwritten before use and is removed.
+    mf = GPy.mappings.Linear(1,1)
+
+    k = GPy.kern.RBF(1)
+    lik = GPy.likelihoods.Gaussian()
+    m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
+    if optimize:
+        m.optimize(max_iters=max_iters)
+    if plot:
+        m.plot()
+    return m
+
+
--- a/GPy/inference/init.py
+++ b/GPy/inference/init.py
@ -1,3 +1,3 @@
-import latent_function_inference
-import optimization
-import mcmc
+from . import latent_function_inference
+from . import optimization
+from . import mcmc
--- a/GPy/inference/latent_function_inference/init.py
+++ b/GPy/inference/latent_function_inference/init.py
@ -1,4 +1,4 @@
-# Copyright (c) 2012, James Hensman
+# Copyright (c) 2012-2014, Max Zwiessele, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 __doc__ = """
@ -50,25 +50,26 @@ class InferenceMethodList(LatentFunctionInference, list):
    def on_optimization_end(self):
        for inf in self:
            inf.on_optimization_end()
-    
+
    def __getstate__(self):
        state = []
        for inf in self:
            state.append(inf)
        return state
-    
+
    def __setstate__(self, state):
        for inf in state:
            self.append(inf)

-from exact_gaussian_inference import ExactGaussianInference
-from laplace import Laplace
+from .exact_gaussian_inference import ExactGaussianInference
+from .laplace import Laplace,LaplaceBlock
 from GPy.inference.latent_function_inference.var_dtc import VarDTC
-from expectation_propagation import EP
-from expectation_propagation_dtc import EPDTC
-from dtc import DTC
-from fitc import FITC
-from var_dtc_parallel import VarDTC_minibatch
+from .expectation_propagation import EP
+from .expectation_propagation_dtc import EPDTC
+from .dtc import DTC
+from .fitc import FITC
+from .var_dtc_parallel import VarDTC_minibatch
+from .var_gauss import VarGauss

 # class FullLatentFunctionData(object):
 #
@ -77,9 +78,9 @@ from var_dtc_parallel import VarDTC_minibatch
 # class EMLikeLatentFunctionInference(LatentFunctionInference):
 #     def update_approximation(self):
 #         """
-#         This function gets called when the 
+#         This function gets called when the
 #         """
-#     
+#
 #     def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
 #         """
 #         Do inference on the latent functions given a covariance function `kern`,
@ -87,7 +88,7 @@ from var_dtc_parallel import VarDTC_minibatch
 #         Additional metadata for the outputs `Y` can be given in `Y_metadata`.
 #         """
 #         raise NotImplementedError, "Abstract base class for full inference"
-# 
+#
 # class VariationalLatentFunctionInference(LatentFunctionInference):
 #     def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
 #         """
--- a/GPy/inference/latent_function_inference/dtc.py
+++ b/GPy/inference/latent_function_inference/dtc.py
@ -1,7 +1,7 @@
 # Copyright (c) 2012-2014, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from posterior import Posterior
+from .posterior import Posterior
 from ...util.linalg import jitchol, tdot, dtrtrs, dpotri, pdinv
 import numpy as np
 from . import LatentFunctionInference
@ -20,7 +20,8 @@ class DTC(LatentFunctionInference):
    def __init__(self):
        self.const_jitter = 1e-6

-    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
+    def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
+        assert mean_function is None, "inference with a mean function not implemented"
        assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."

        num_inducing, _ = Z.shape
@ -29,7 +30,7 @@ class DTC(LatentFunctionInference):
        #make sure the noise is not hetero
        beta = 1./likelihood.gaussian_variance(Y_metadata)
        if beta.size > 1:
-            raise NotImplementedError, "no hetero noise with this implementation of DTC"
+            raise NotImplementedError("no hetero noise with this implementation of DTC")

        Kmm = kern.K(Z)
        Knn = kern.Kdiag(X)
@ -88,7 +89,8 @@ class vDTC(object):
    def __init__(self):
        self.const_jitter = 1e-6

-    def inference(self, kern, X, X_variance, Z, likelihood, Y, Y_metadata):
+    def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
+        assert mean_function is None, "inference with a mean function not implemented"
        assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."

        num_inducing, _ = Z.shape
@ -97,7 +99,7 @@ class vDTC(object):
        #make sure the noise is not hetero
        beta = 1./likelihood.gaussian_variance(Y_metadata)
        if beta.size > 1:
-            raise NotImplementedError, "no hetero noise with this implementation of DTC"
+            raise NotImplementedError("no hetero noise with this implementation of DTC")

        Kmm = kern.K(Z)
        Knn = kern.Kdiag(X)
--- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py
+++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
@ -1,7 +1,7 @@
 # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from posterior import Posterior
+from .posterior import Posterior
 from ...util.linalg import pdinv, dpotrs, tdot
 from ...util import diag
 import numpy as np
@ -36,16 +36,23 @@ class ExactGaussianInference(LatentFunctionInference):
            #print "WARNING: N>D of Y, we need caching of L, such that L*L^T = Y, returning Y still!"
            return Y

-    def inference(self, kern, X, likelihood, Y, Y_metadata=None):
+    def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None):
        """
        Returns a Posterior class containing essential quantities of the posterior
        """
-        YYT_factor = self.get_YYTfactor(Y)
+
+        if mean_function is None:
+            m = 0
+        else:
+            m = mean_function.f(X)
+
+
+        YYT_factor = self.get_YYTfactor(Y-m)

        K = kern.K(X)

        Ky = K.copy()
-        diag.add(Ky, likelihood.gaussian_variance(Y_metadata))
+        diag.add(Ky, likelihood.gaussian_variance(Y_metadata)+1e-8)
        Wi, LW, LWi, W_logdet = pdinv(Ky)

        alpha, _ = dpotrs(LW, YYT_factor, lower=1)
@ -56,4 +63,18 @@ class ExactGaussianInference(LatentFunctionInference):

        dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK),Y_metadata)

-        return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}
+        return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL, 'dL_dm':alpha}
+
+    def LOO(self, kern, X, Y, likelihood, posterior, Y_metadata=None, K=None):
+        """
+        Leave-one-out log predictive quantity, one value per data point, after
+        "Bayesian leave-one-out cross-validation approximations for Gaussian latent variable models"
+        Vehtari et al. 2014.
+
+        Only `posterior` is read here; the other arguments are accepted but unused.
+
+        :param posterior: object exposing `woodbury_vector` and `woodbury_inv`
+        :returns: the negation of the negative log marginal LOO term
+        """
+        alpha = posterior.woodbury_vector
+        # diagonal of the Woodbury inverse, reshaped into a column
+        inv_diag = np.diag(posterior.woodbury_inv)[:, None]
+        # negative marginal LOO; believed to follow "Predictive Approaches for
+        # Choosing Hyperparameters in Gaussian Processes"
+        nll_loo = 0.5*np.log(2*np.pi) - 0.5*np.log(inv_diag) + 0.5*(alpha**2)/inv_diag
+        return -nll_loo
--- a/GPy/inference/latent_function_inference/expectation_propagation.py
+++ b/GPy/inference/latent_function_inference/expectation_propagation.py
@ -2,7 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from ...util.linalg import pdinv,jitchol,DSYR,tdot,dtrtrs, dpotrs
-from posterior import Posterior
+from .posterior import Posterior
 from . import LatentFunctionInference
 log_2_pi = np.log(2*np.pi)

@ -33,15 +33,19 @@ class EP(LatentFunctionInference):
        # TODO: update approximation in the end as well? Maybe even with a switch?
        pass

-    def inference(self, kern, X, likelihood, Y, Y_metadata=None, Z=None):
+    def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, Z=None):
+        assert mean_function is None, "inference with a mean function not implemented"
        num_data, output_dim = Y.shape
        assert output_dim ==1, "ep in 1D only (for now!)"

        K = kern.K(X)

        if self._ep_approximation is None:
+
+            #if we don't yet have the results of running EP, run EP and store the computed factors in self._ep_approximation
            mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata)
        else:
+            #if we've already run EP, just use the existing approximation stored in self._ep_approximation
            mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation

        Wi, LW, LWi, W_logdet = pdinv(K + np.diag(1./tau_tilde))
--- a/GPy/inference/latent_function_inference/expectation_propagation_dtc.py
+++ b/GPy/inference/latent_function_inference/expectation_propagation_dtc.py
@ -6,7 +6,7 @@ from ...util import diag
 from ...util.linalg import mdot, jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify, DSYR
 from ...core.parameterization.variational import VariationalPosterior
 from . import LatentFunctionInference
-from posterior import Posterior
+from .posterior import Posterior
 log_2_pi = np.log(2*np.pi)

 class EPDTC(LatentFunctionInference):
@ -64,7 +64,8 @@ class EPDTC(LatentFunctionInference):
        self.old_mutilde, self.old_vtilde = None, None
        self._ep_approximation = None

-    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
+    def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
+        assert mean_function is None, "inference with a mean function not implemented"
        num_data, output_dim = Y.shape
        assert output_dim ==1, "ep in 1D only (for now!)"

@ -179,7 +180,7 @@ class EPDTC(LatentFunctionInference):
        if VVT_factor.shape[1] == Y.shape[1]:
            woodbury_vector = Cpsi1Vf # == Cpsi1V
        else:
-            print 'foobar'
+            print('foobar')
            psi1V = np.dot(mu_tilde[:,None].T*beta, psi1).T
            tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
            tmp, _ = dpotrs(LB, tmp, lower=1)
@ -314,7 +315,7 @@ def _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf,
        dL_dR = None
    elif het_noise:
        if uncertain_inputs:
-            raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented"
+            raise NotImplementedError("heteroscedatic derivates with uncertain inputs not implemented")
        else:
            #from ...util.linalg import chol_inv
            #LBi = chol_inv(LB)
--- a/GPy/inference/latent_function_inference/fitc.py
+++ b/GPy/inference/latent_function_inference/fitc.py
@ -1,7 +1,7 @@
 # Copyright (c) 2012, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from posterior import Posterior
+from .posterior import Posterior
 from ...util.linalg import jitchol, tdot, dtrtrs, dpotri, pdinv
 from ...util import diag
 import numpy as np
@ -18,7 +18,8 @@ class FITC(LatentFunctionInference):
    """
    const_jitter = 1e-6

-    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
+    def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
+        assert mean_function is None, "inference with a mean function not implemented"

        num_inducing, _ = Z.shape
        num_data, output_dim = Y.shape
@ -26,7 +27,7 @@ class FITC(LatentFunctionInference):
        #make sure the noise is not hetero
        sigma_n = likelihood.gaussian_variance(Y_metadata)
        if sigma_n.size >1:
-            raise NotImplementedError, "no hetero noise with this implementation of FITC"
+            raise NotImplementedError("no hetero noise with this implementation of FITC")

        Kmm = kern.K(Z)
        Knn = kern.Kdiag(X)
--- a/GPy/inference/latent_function_inference/inferenceX.py
+++ b/GPy/inference/latent_function_inference/inferenceX.py
@ -4,6 +4,8 @@
 import numpy as np
 from ...core import Model
 from ...core.parameterization import variational
+from ...util.linalg import tdot
+from GPy.core.parameterization.variational import VariationalPosterior

 def infer_newX(model, Y_new, optimize=True, init='L2'):
    """
@ -27,12 +29,19 @@ def infer_newX(model, Y_new, optimize=True, init='L2'):

 class InferenceX(Model):
    """
-    The class for inference of new X with given new Y. (do_test_latent)
+    The model class for inference of new X with given new Y. (replacing the "do_test_latent" in Bayesian GPLVM)
+    It is a tiny inference model created from the original GP model. The kernel, likelihood (only Gaussian is supported at the moment) 
+    and posterior distribution are taken from the original model.
+    For Regression models and GPLVM, a point estimate of the latent variable X will be inferred. 
+    For Bayesian GPLVM, the variational posterior of X will be inferred. 
+    X is inferred through a gradient optimization of the inference model.

    :param model: the GPy model used in inference
    :type model: GPy.core.Model
    :param Y: the new observed data for inference
    :type Y: numpy.ndarray
+    :param init: the distance metric of Y for initializing X with the nearest neighbour.
+    :type init: 'L2', 'NCC' and 'rand'
    """
    def __init__(self, model, Y, name='inferenceX', init='L2'):
        if np.isnan(Y).any() or getattr(model, 'missing_data', False):
@ -45,20 +54,27 @@ class InferenceX(Model):
        super(InferenceX, self).__init__(name)
        self.likelihood = model.likelihood.copy()
        self.kern = model.kern.copy()
-        if model.kern.useGPU:
-            from ...models import SSGPLVM
-            if isinstance(model, SSGPLVM):
-                self.kern.GPU_SSRBF(True)
-            else:
-                self.kern.GPU(True)
+#         if model.kern.useGPU:
+#             from ...models import SSGPLVM
+#             if isinstance(model, SSGPLVM):
+#                 self.kern.GPU_SSRBF(True)
+#             else:
+#                 self.kern.GPU(True)
        from copy import deepcopy
        self.posterior = deepcopy(model.posterior)
-        if hasattr(model, 'variational_prior'):
+        from ...core.parameterization.variational import VariationalPosterior
+        if isinstance(model.X, VariationalPosterior):
            self.uncertain_input = True
-            self.variational_prior = model.variational_prior.copy()
+            from ...models.ss_gplvm import IBPPrior
+            from ...models.ss_mrd import IBPPrior_SSMRD
+            if isinstance(model.variational_prior, IBPPrior) or isinstance(model.variational_prior, IBPPrior_SSMRD):
+                from ...core.parameterization.variational import SpikeAndSlabPrior
+                self.variational_prior = SpikeAndSlabPrior(pi=0.5, learnPi=False, group_spike=False)
+            else:
+                self.variational_prior = model.variational_prior.copy()
        else:
            self.uncertain_input = False
-        if hasattr(model, 'inducing_inputs'):
+        if hasattr(model, 'Z'):
            self.sparse_gp = True
            self.Z = model.Z.copy()
        else:
@ -112,13 +128,13 @@ class InferenceX(Model):
            wv = wv[:,self.valid_dim]
            output_dim = self.valid_dim.sum()
            if self.ninan is not None:
-                self.dL_dpsi2 = beta/2.*(self.posterior.woodbury_inv[:,:,self.valid_dim] - np.einsum('md,od->mo',wv, wv)[:, :, None]).sum(-1)
+                self.dL_dpsi2 = beta/2.*(self.posterior.woodbury_inv[:,:,self.valid_dim] - tdot(wv)[:, :, None]).sum(-1)
            else:
-                self.dL_dpsi2 = beta/2.*(output_dim*self.posterior.woodbury_inv - np.einsum('md,od->mo',wv, wv))
+                self.dL_dpsi2 = beta/2.*(output_dim*self.posterior.woodbury_inv - tdot(wv))
            self.dL_dpsi1 = beta*np.dot(self.Y[:,self.valid_dim], wv.T)
            self.dL_dpsi0 = - beta/2.* np.ones(self.Y.shape[0])
        else:
-            self.dL_dpsi2 = beta*(output_dim*self.posterior.woodbury_inv - np.einsum('md,od->mo',wv, wv))/2.
+            self.dL_dpsi2 = beta*(output_dim*self.posterior.woodbury_inv - tdot(wv))/2. #np.einsum('md,od->mo',wv, wv)
            self.dL_dpsi1 = beta*np.dot(self.Y, wv.T)
            self.dL_dpsi0 = -beta/2.*output_dim* np.ones(self.Y.shape[0])

@ -147,9 +163,9 @@ class InferenceX(Model):
            from ...core.parameterization.variational import SpikeAndSlabPrior
            if isinstance(self.variational_prior, SpikeAndSlabPrior):
                # Update Log-likelihood
-                KL_div = self.variational_prior.KL_divergence(self.X, N=self.Y.shape[0])
+                KL_div = self.variational_prior.KL_divergence(self.X)
                # update for the KL divergence
-                self.variational_prior.update_gradients_KL(self.X, N=self.Y.shape[0])
+                self.variational_prior.update_gradients_KL(self.X)
            else:
                # Update Log-likelihood
                KL_div = self.variational_prior.KL_divergence(self.X)
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@ -12,13 +12,14 @@

 import numpy as np
 from ...util.linalg import mdot, jitchol, dpotrs, dtrtrs, dpotri, symmetrify, pdinv
-from posterior import Posterior
+from .posterior import Posterior
 import warnings
 def warning_on_one_line(message, category, filename, lineno, file=None, line=None):
    return ' %s:%s: %s:%s\n' % (filename, lineno, category.__name__, message)
 warnings.formatwarning = warning_on_one_line
 from scipy import optimize
 from . import LatentFunctionInference
+from scipy.integrate import quad

 class Laplace(LatentFunctionInference):

@ -39,10 +40,90 @@ class Laplace(LatentFunctionInference):
        self.first_run = True
        self._previous_Ki_fhat = None

-    def inference(self, kern, X, likelihood, Y, Y_metadata=None):
+    def LOO(self, kern, X, Y, likelihood, posterior, Y_metadata=None, K=None, f_hat=None, W=None, Ki_W_i=None):
+        """
+        Leave one out log predictive density as found in
+        "Bayesian leave-one-out cross-validation approximations for Gaussian latent variable models"
+        Vehtari et al. 2014.
+
+        :param kern: kernel, only evaluated (``kern.K(X)``) when ``K`` is not given
+        :param X: inputs handed to the kernel
+        :param Y: observed outputs
+        :param likelihood: likelihood providing ``logpdf``, ``dlogpdf_df``, ``d2logpdf_df2`` and ``log_concave``
+        :param posterior: not used by this method
+        :param Y_metadata: optional dict of metadata forwarded to the likelihood
+        :param K: optional precomputed prior covariance
+        :param f_hat: optional precomputed mode, found with ``rasm_mode`` when omitted
+        :param W: optional precomputed negative second derivative of the log likelihood at ``f_hat``
+        :param Ki_W_i: optional precomputed fourth output of ``_compute_B_statistics``
+        :returns: column vector holding the log of the numerically integrated
+            leave-one-out predictive density, one row per integrated datum
+        """
+        if K is None:
+            K = kern.K(X)
+
+        if f_hat is None:
+            #Start the mode search from zero
+            Ki_f_init = np.zeros_like(Y)
+            f_hat, _ = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)
+
+        if W is None:
+            W = -likelihood.d2logpdf_df2(f_hat, Y, Y_metadata=Y_metadata)
+
+        if Ki_W_i is None:
+            _, _, _, Ki_W_i = self._compute_B_statistics(K, W, likelihood.log_concave)
+
+        logpdf_dfhat = likelihood.dlogpdf_df(f_hat, Y, Y_metadata=Y_metadata)
+
+        #A single column of W is expanded into a diagonal matrix
+        if W.shape[1] == 1:
+            W = np.diagflat(W)
+
+        #Eq 14, and 16
+        var_site = 1./np.diag(W)[:, None]
+        mu_site = f_hat + var_site*logpdf_dfhat
+        prec_site = 1./var_site
+        #Eq 19
+        marginal_cov = Ki_W_i
+        marginal_mu = marginal_cov.dot(np.diagflat(prec_site)).dot(mu_site)
+        marginal_var = np.diag(marginal_cov)[:, None]
+        #Eq 30 with using site parameters instead of Gaussian site parameters
+        #(var_site instead of sigma^{2} )
+        posterior_cav_var = 1./(1./marginal_var - 1./var_site)
+        posterior_cav_mean = posterior_cav_var*((1./marginal_var)*marginal_mu - (1./var_site)*Y)
+
+        flat_y = Y.flatten()
+        flat_mu = posterior_cav_mean.flatten()
+        flat_var = posterior_cav_var.flatten()
+
+        if Y_metadata is not None:
+            #Need to zip individual elements of Y_metadata as well.
+            #Every entry is reshaped into a column so it can be indexed per datum.
+            Y_metadata_flat = {key: np.atleast_1d(val).reshape(-1, 1)
+                               for key, val in Y_metadata.items()}
+
+            zipped_values = []
+
+            for i in range(Y.shape[0]):
+                y_m = {}
+                for key, val in Y_metadata_flat.items():
+                    if val.shape[0] == 1:
+                        #A single shared value is passed along unchanged
+                        y_m[key] = val
+                    else:
+                        #Won't broadcast yet
+                        y_m[key] = val[i]
+                zipped_values.append((flat_y[i], flat_mu[i], flat_var[i], y_m))
+        else:
+            #Otherwise just pass along None's
+            zipped_values = zip(flat_y, flat_mu, flat_var, [None]*Y.shape[0])
+
+        def integral_generator(yi, mi, vi, yi_m):
+            #Build the integrand for one datum: likelihood times the Gaussian N(fi_star | mi, vi)
+            def f(fi_star):
+                #More stable in the log space
+                p_fi = np.exp(likelihood.logpdf(fi_star, yi, yi_m)
+                              - 0.5*np.log(2*np.pi*vi)
+                              - 0.5*np.square(mi-fi_star)/vi)
+                return p_fi
+            return f
+
+        #Eq 30
+        #quad returns (value, error estimate); only the value is kept
+        p_ystar, _ = zip(*[quad(integral_generator(y, m, v, yi_m), -np.inf, np.inf)
+                           for y, m, v, yi_m in zipped_values])
+        p_ystar = np.array(p_ystar).reshape(-1, 1)
+        return np.log(p_ystar)
+
+    def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None):
        """
        Returns a Posterior class containing essential quantities of the posterior
        """
+        assert mean_function is None, "inference with a mean function not implemented"

        # Compute K
        K = kern.K(X)
@ -50,21 +131,21 @@ class Laplace(LatentFunctionInference):
        #Find mode
        if self.bad_fhat or self.first_run:
            Ki_f_init = np.zeros_like(Y)
-            first_run = False
+            self.first_run = False
        else:
            Ki_f_init = self._previous_Ki_fhat

+        Ki_f_init = np.zeros_like(Y)# FIXME: take this out
+
        f_hat, Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)
-        self.f_hat = f_hat
-        self.Ki_fhat =  Ki_fhat
-        self.K = K.copy()
+
        #Compute hessian and other variables at mode
        log_marginal, woodbury_inv, dL_dK, dL_dthetaL = self.mode_computations(f_hat, Ki_fhat, K, Y, likelihood, kern, Y_metadata)

        self._previous_Ki_fhat = Ki_fhat.copy()
        return Posterior(woodbury_vector=Ki_fhat, woodbury_inv=woodbury_inv, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}

-    def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None):
+    def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None, *args, **kwargs):
        """
        Rasmussen's numerically stable mode finding
        For nomenclature see Rasmussen & Williams 2006
@ -89,7 +170,14 @@ class Laplace(LatentFunctionInference):

        #define the objective function (to be maximised)
        def obj(Ki_f, f):
-            return -0.5*np.dot(Ki_f.flatten(), f.flatten()) + np.sum(likelihood.logpdf(f, Y, Y_metadata=Y_metadata))
+            # Objective to maximise: -0.5*sum(Ki_f^T f) plus the summed log likelihood at f.
+            ll = -0.5*np.sum(np.dot(Ki_f.T, f)) + np.sum(likelihood.logpdf(f, Y, Y_metadata=Y_metadata))
+            # A NaN objective is reported as -inf so the caller sees the trial point
+            # as the worst possible value instead of propagating NaN.
+            if np.isnan(ll):
+                return -np.inf
+            return ll
+

        difference = np.inf
        iteration = 0
@ -104,7 +192,7 @@ class Laplace(LatentFunctionInference):
            W_f = W*f

            b = W_f + grad # R+W p46 line 6.
-            W12BiW12, _, _ = self._compute_B_statistics(K, W, likelihood.log_concave)
+            W12BiW12, _, _, _ = self._compute_B_statistics(K, W, likelihood.log_concave, *args, **kwargs)
            W12BiW12Kb = np.dot(W12BiW12, np.dot(K, b))

            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
@ -121,7 +209,9 @@ class Laplace(LatentFunctionInference):
            step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
            Ki_f_new = Ki_f + step*dKi_f
            f_new = np.dot(K, Ki_f_new)
-
+            #print "new {} vs old {}".format(obj(Ki_f_new, f_new), obj(Ki_f, f))
+            if obj(Ki_f_new, f_new) < obj(Ki_f, f):
+                raise ValueError("Shouldn't happen, brent optimization failing")
            difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
            Ki_f = Ki_f_new
            f = f_new
@ -152,14 +242,10 @@ class Laplace(LatentFunctionInference):
        if np.any(np.isnan(W)):
            raise ValueError('One or more element(s) of W is NaN')

-        K_Wi_i, L, LiW12 = self._compute_B_statistics(K, W, likelihood.log_concave)
-
-        #compute vital matrices
-        C = np.dot(LiW12, K)
-        Ki_W_i  = K - C.T.dot(C)
+        K_Wi_i, logdet_I_KW, I_KW_i, Ki_W_i = self._compute_B_statistics(K, W, likelihood.log_concave)

        #compute the log marginal
-        log_marginal = -0.5*np.dot(Ki_f.flatten(), f_hat.flatten()) + np.sum(likelihood.logpdf(f_hat, Y, Y_metadata=Y_metadata)) - np.sum(np.log(np.diag(L)))
+        log_marginal = -0.5*np.sum(np.dot(Ki_f.T, f_hat)) + np.sum(likelihood.logpdf(f_hat, Y, Y_metadata=Y_metadata)) - 0.5*logdet_I_KW

        # Compute matrices for derivatives
        dW_df = -likelihood.d3logpdf_df3(f_hat, Y, Y_metadata=Y_metadata) # -d3lik_d3fhat
@ -196,23 +282,28 @@ class Laplace(LatentFunctionInference):
            dL_dthetaL = np.zeros(num_params)
            for thetaL_i in range(num_params):
                #Explicit
-                dL_dthetaL_exp = ( np.sum(dlik_dthetaL[thetaL_i])
+                dL_dthetaL_exp = ( np.sum(dlik_dthetaL[thetaL_i,:, :])
                                # The + comes from the fact that dlik_hess_dthetaL == -dW_dthetaL
-                                + 0.5*np.sum(np.diag(Ki_W_i).flatten()*dlik_hess_dthetaL[:, thetaL_i].flatten())
+                                  + 0.5*np.sum(np.diag(Ki_W_i)*np.squeeze(dlik_hess_dthetaL[thetaL_i, :, :]))
                                )

                #Implicit
-                dfhat_dthetaL = mdot(I_KW_i, K, dlik_grad_dthetaL[:, thetaL_i])
-                #dfhat_dthetaL = mdot(Ki_W_i, dlik_grad_dthetaL[:, thetaL_i])
+                dfhat_dthetaL = mdot(I_KW_i, K, dlik_grad_dthetaL[thetaL_i, :, :])
+                #dfhat_dthetaL = mdot(Ki_W_i, dlik_grad_dthetaL[thetaL_i, :, :])
                dL_dthetaL_imp = np.dot(dL_dfhat.T, dfhat_dthetaL)
-                dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
+                dL_dthetaL[thetaL_i] = np.sum(dL_dthetaL_exp + dL_dthetaL_imp)

        else:
            dL_dthetaL = np.zeros(likelihood.size)

+        #Cache some things for speedy LOO
+        self.Ki_W_i = Ki_W_i
+        self.K = K
+        self.W = W
+        self.f_hat = f_hat
        return log_marginal, K_Wi_i, dL_dK, dL_dthetaL

-    def _compute_B_statistics(self, K, W, log_concave):
+    def _compute_B_statistics(self, K, W, log_concave, *args, **kwargs):
        """
        Rasmussen suggests the use of a numerically stable positive definite matrix B
        Which has a positive diagonal elements and can be easily inverted
@ -225,7 +316,7 @@ class Laplace(LatentFunctionInference):
        """
        if not log_concave:
            #print "Under 1e-10: {}".format(np.sum(W < 1e-6))
-            W[W<1e-6] = 1e-6
+            W = np.clip(W, 1e-6, 1e+30)
            # NOTE: when setting a parameter inside parameters_changed it will allways come to closed update circles!!!
            #W.__setitem__(W < 1e-6, 1e-6, update=False)  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
@ -247,5 +338,160 @@ class Laplace(LatentFunctionInference):
        #K_Wi_i_2 , _= dpotri(L2)
        #symmetrify(K_Wi_i_2)

-        return K_Wi_i, L, LiW12
+        #compute vital matrices
+        C = np.dot(LiW12, K)
+        Ki_W_i  = K - C.T.dot(C)

+        I_KW_i = np.eye(K.shape[0]) - np.dot(K, K_Wi_i)
+        logdet_I_KW = 2*np.sum(np.log(np.diag(L)))
+
+        return K_Wi_i, logdet_I_KW, I_KW_i, Ki_W_i
+
+class LaplaceBlock(Laplace):
+    def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None, *args, **kwargs):
+        """
+        Iteratively search for the mode f_hat, working with a full matrix W.
+
+        Each iteration evaluates W and the gradient of the log likelihood at the
+        current f, builds a full-step proposal for Ki_f, and picks the step size
+        along that direction with a Brent line search on the objective.
+
+        :param K: prior covariance matrix
+        :param Y: observed outputs
+        :param likelihood: likelihood providing ``logpdf_sum``, ``dlogpdf_df``,
+            ``d2logpdf_df2`` and ``log_concave``
+        :param Ki_f_init: starting value for Ki_f (copied, not modified)
+        :param Y_metadata: optional metadata forwarded to the likelihood
+        :param args, kwargs: forwarded to ``_compute_B_statistics``
+        :returns: tuple ``(f, Ki_f)`` from the last iteration
+
+        Side effects: updates ``self.bad_fhat``, resets ``self._previous_Ki_fhat``
+        to zeros on a bad fit, and may emit warnings.
+        """
+        Ki_f = Ki_f_init.copy()
+        f = np.dot(K, Ki_f)
+
+        #define the objective function (to be maximised)
+        def obj(Ki_f, f):
+            ll = -0.5*np.dot(Ki_f.T, f) + np.sum(likelihood.logpdf_sum(f, Y, Y_metadata=Y_metadata))
+            #A NaN objective is reported as -inf
+            if np.isnan(ll):
+                return -np.inf
+            else:
+                return ll
+
+        difference = np.inf
+        iteration = 0
+
+        I = np.eye(K.shape[0])
+        while difference > self._mode_finding_tolerance and iteration < self._mode_finding_max_iter:
+            W = -likelihood.d2logpdf_df2(f, Y, Y_metadata=Y_metadata)
+
+            #Clip the diagonal of W into [1e-6, 1e+30], in place
+            W[np.diag_indices_from(W)] = np.clip(np.diag(W), 1e-6, 1e+30)
+
+            W_f = np.dot(W, f)
+            grad = likelihood.dlogpdf_df(f, Y, Y_metadata=Y_metadata)
+
+            b = W_f + grad # R+W p46 line 6.
+            K_Wi_i, _, _, _ = self._compute_B_statistics(K, W, likelihood.log_concave, *args, **kwargs)
+
+            #Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
+            #a = (I - (K+Wi)i*K)*b
+            full_step_Ki_f = np.dot(I - np.dot(K_Wi_i, K), b)
+            dKi_f = full_step_Ki_f - Ki_f
+
+            #define an objective for the line search (minimize this one)
+            def inner_obj(step_size):
+                Ki_f_trial = Ki_f + step_size*dKi_f
+                f_trial = np.dot(K, Ki_f_trial)
+                return -obj(Ki_f_trial, f_trial)
+
+            #use scipy for the line search, then compute new values of f, Ki_f
+            step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
+
+            Ki_f_new = Ki_f + step*dKi_f
+            f_new = np.dot(K, Ki_f_new)
+
+            #Convergence measure: absolute value of the summed change in f and in Ki_f
+            difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
+            Ki_f = Ki_f_new
+            f = f_new
+            iteration += 1
+
+        #Warn of bad fits
+        if difference > self._mode_finding_tolerance:
+            #Only warn on the first bad fit in a row
+            if not self.bad_fhat:
+                warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
+            self._previous_Ki_fhat = np.zeros_like(Y)
+            self.bad_fhat = True
+        elif self.bad_fhat:
+            self.bad_fhat = False
+            warnings.warn("f_hat now fine again")
+        #NOTE(review): for a non-negative max_iter the loop above exits with
+        #iteration <= max_iter, so this branch looks unreachable -- confirm whether >= was intended
+        if iteration > self._mode_finding_max_iter:
+            warnings.warn("didn't find the best")
+
+        return f, Ki_f
+
+    def mode_computations(self, f_hat, Ki_f, K, Y, likelihood, kern, Y_metadata):
+        """
+        Compute the approximate log marginal and its gradients at the mode.
+
+        :param f_hat: mode of the latent function
+        :param Ki_f: companion vector returned by ``rasm_mode`` alongside ``f_hat``
+        :param K: prior covariance matrix
+        :param Y: observed outputs
+        :param likelihood: likelihood providing ``logpdf_sum``, ``dlogpdf_df``,
+            ``d2logpdf_df2`` and ``d3logpdf_df3``
+        :param kern: kernel; only ``size`` and ``is_fixed`` are read here
+        :param Y_metadata: metadata forwarded to the likelihood
+        :returns: tuple ``(log_marginal, K_Wi_i, dL_dK, dL_dthetaL)``
+        :raises NotImplementedError: if the likelihood has non-fixed parameters
+        """
+        #At this point get the hessian matrix (or vector as W is diagonal)
+        W = -likelihood.d2logpdf_df2(f_hat, Y, Y_metadata=Y_metadata)
+
+        #Clip the diagonal of W into [1e-6, 1e+30], in place
+        W[np.diag_indices_from(W)] = np.clip(np.diag(W), 1e-6, 1e+30)
+
+        K_Wi_i, log_B_det, I_KW_i, Ki_W_i = self._compute_B_statistics(K, W, likelihood.log_concave)
+
+        #compute the log marginal
+        #FIXME: The determinant should be output_dim*0.5 I think, gradients may now no longer check
+        log_marginal = -0.5*np.dot(f_hat.T, Ki_f) + np.sum(likelihood.logpdf_sum(f_hat, Y, Y_metadata=Y_metadata)) - 0.5*log_B_det
+
+        #Compute vital matrices for derivatives
+        dW_df = -likelihood.d3logpdf_df3(f_hat, Y, Y_metadata=Y_metadata) # -d3lik_d3fhat
+
+        #dL_dfhat = np.zeros((f_hat.shape[0]))
+        #for i in range(f_hat.shape[0]):
+            #dL_dfhat[i] = -0.5*np.trace(np.dot(Ki_W_i, dW_df[:,:,i]))
+
+        #Contract Ki_W_i with the first two axes of dW_df, leaving one entry per k
+        dL_dfhat = -0.5*np.einsum('ij,ijk->k', Ki_W_i, dW_df)
+
+        woodbury_vector = likelihood.dlogpdf_df(f_hat, Y, Y_metadata=Y_metadata)
+
+        ####################
+        #compute dL_dK#
+        ####################
+        if kern.size > 0 and not kern.is_fixed:
+            #Explicit
+            explicit_part = 0.5*(np.dot(Ki_f, Ki_f.T) - K_Wi_i)
+
+            #Implicit
+            implicit_part = woodbury_vector.dot(dL_dfhat[None,:]).dot(I_KW_i)
+            #implicit_part = Ki_f.dot(dL_dfhat[None,:]).dot(I_KW_i)
+
+            dL_dK = explicit_part + implicit_part
+        else:
+            #Nothing to optimise in the kernel, so the gradient is zero
+            dL_dK = np.zeros_like(K)
+
+        ####################
+        #compute dL_dthetaL#
+        ####################
+        if likelihood.size > 0 and not likelihood.is_fixed:
+            #Gradients with respect to likelihood parameters are not implemented here
+            raise NotImplementedError
+        else:
+            dL_dthetaL = np.zeros(likelihood.size)
+
+        #self.K_Wi_i = K_Wi_i
+        #self.Ki_W_i = Ki_W_i
+        #self.W = W
+        #self.K = K
+        #self.dL_dfhat = dL_dfhat
+        #self.explicit_part = explicit_part
+        #self.implicit_part = implicit_part
+        return log_marginal, K_Wi_i, dL_dK, dL_dthetaL
+
+    def _compute_B_statistics(self, K, W, log_concave, *args, **kwargs):
+        """
+        Rasmussen suggests the use of a numerically stable positive definite matrix B
+        Which has a positive diagonal element and can be easily inverted
+
+        Here B = I + K W is formed and inverted directly with ``np.linalg.inv``.
+
+        :param K: Prior Covariance matrix evaluated at locations X
+        :type K: NxN matrix
+        :param W: Negative hessian at a point
+        :type W: presumably a full NxN matrix (callers in this class index its diagonal) -- TODO confirm
+        :param log_concave: not used in this implementation
+        :param args, kwargs: not used in this implementation
+        :returns: (K_Wi_i, sign*logdetB, Bi, Ki_W_i) where Bi = inv(I + K W),
+            K_Wi_i = W Bi and Ki_W_i = Bi K
+        """
+        #w = GPy.util.diag.view(W)
+        #W[:] = np.where(w<1e-6, 1e-6, w)
+
+        #B = I + KW
+        B = np.eye(K.shape[0]) + np.dot(K, W)
+        #Bi, L, Li, logdetB = pdinv(B)
+        Bi = np.linalg.inv(B)
+
+        #K_Wi_i = np.eye(K.shape[0]) - mdot(W, Bi, K)
+        K_Wi_i = np.dot(W, Bi)
+
+        #self.K_Wi_i_brute = np.linalg.inv(K + np.linalg.inv(W))
+        #self.B = B
+        #self.Bi = Bi
+        Ki_W_i = np.dot(Bi, K)
+
+        #NOTE(review): slogdet returns the sign and log|det(B)| separately; their
+        #product equals log det(B) only when the sign is +1 -- confirm this is intended
+        sign, logdetB = np.linalg.slogdet(B)
+        return K_Wi_i, sign*logdetB, Bi, Ki_W_i
--- a/GPy/inference/latent_function_inference/posterior.py
+++ b/GPy/inference/latent_function_inference/posterior.py
@ -15,7 +15,7 @@ class Posterior(object):
    the function at any new point x_* by integrating over this posterior.

    """
-    def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None):
+    def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None, prior_mean=0):
        """
        woodbury_chol : a lower triangular matrix L that satisfies posterior_covariance = K - K L^{-T} L^{-1} K
        woodbury_vector : a matrix (or vector, as Nx1 matrix) M which satisfies posterior_mean = K M
@ -52,7 +52,7 @@ class Posterior(object):
                or ((mean is not None) and (cov is not None)):
            pass # we have sufficient to compute the posterior
        else:
-            raise ValueError, "insufficient information to compute the posterior"
+            raise ValueError("insufficient information to compute the posterior")

        self._K_chol = K_chol
        self._K = K
@ -67,6 +67,7 @@ class Posterior(object):
        #option 2:
        self._mean = mean
        self._covariance = cov
+        self._prior_mean = prior_mean

        #compute this lazily
        self._precision = None
@ -107,7 +108,7 @@ class Posterior(object):
        if self._precision is None:
            cov = np.atleast_3d(self.covariance)
            self._precision = np.zeros(cov.shape) # if one covariance per dimension
-            for p in xrange(cov.shape[-1]):
+            for p in range(cov.shape[-1]):
                self._precision[:,:,p] = pdinv(cov[:,:,p])[0]
        return self._precision

@ -125,7 +126,7 @@ class Posterior(object):
            if self._woodbury_inv is not None:
                winv = np.atleast_3d(self._woodbury_inv)
                self._woodbury_chol = np.zeros(winv.shape)
-                for p in xrange(winv.shape[-1]):
+                for p in range(winv.shape[-1]):
                    self._woodbury_chol[:,:,p] = pdinv(winv[:,:,p])[2]
                #Li = jitchol(self._woodbury_inv)
                #self._woodbury_chol, _ = dtrtri(Li)
@ -134,13 +135,13 @@ class Posterior(object):
                #self._woodbury_chol = jitchol(W)
            #try computing woodbury chol from cov
            elif self._covariance is not None:
-                raise NotImplementedError, "TODO: check code here"
+                raise NotImplementedError("TODO: check code here")
                B = self._K - self._covariance
                tmp, _ = dpotrs(self.K_chol, B)
                self._woodbury_inv, _ = dpotrs(self.K_chol, tmp.T)
                _, _, self._woodbury_chol, _ = pdinv(self._woodbury_inv)
            else:
-                raise ValueError, "insufficient information to compute posterior"
+                raise ValueError("insufficient information to compute posterior")
        return self._woodbury_chol

    @property
@ -158,9 +159,11 @@ class Posterior(object):
                #self._woodbury_inv, _ = dpotrs(self.woodbury_chol, np.eye(self.woodbury_chol.shape[0]), lower=1)
                symmetrify(self._woodbury_inv)
            elif self._covariance is not None:
-                B = self._K - self._covariance
-                tmp, _ = dpotrs(self.K_chol, B)
-                self._woodbury_inv, _ = dpotrs(self.K_chol, tmp.T)                
+                B = np.atleast_3d(self._K) - np.atleast_3d(self._covariance)
+                self._woodbury_inv = np.empty_like(B)
+                for i in range(B.shape[-1]):
+                    tmp, _ = dpotrs(self.K_chol, B[:,:,i])
+                    self._woodbury_inv[:,:,i], _ = dpotrs(self.K_chol, tmp.T)
        return self._woodbury_inv

    @property
@ -173,7 +176,7 @@ class Posterior(object):
        $$
        """
        if self._woodbury_vector is None:
-            self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean)
+            self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean - self._prior_mean)
        return self._woodbury_vector

    @property
--- a/GPy/inference/latent_function_inference/svgp.py
+++ b/GPy/inference/latent_function_inference/svgp.py
@ -0,0 +1,121 @@
+from . import LatentFunctionInference
+from ...util import linalg
+from ...util import choleskies
+import numpy as np
+from .posterior import Posterior
+from scipy.linalg.blas import dgemm, dsymm, dtrmm
+
+class SVGP(LatentFunctionInference):
+    """
+    Inference for a sparse GP with a Gaussian variational distribution q(u)
+    over the inducing outputs, parameterised by a mean and a Cholesky factor.
+    """
+
+    def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
+        """
+        Compute the variational bound and its gradients.
+
+        :param q_u_mean: mean of q(u), shape (num_inducing, num_outputs)
+        :param q_u_chol: flattened Cholesky factors of the covariance of q(u),
+            expanded with ``choleskies.flat_to_triang``
+        :param kern: kernel providing ``K`` and ``Kdiag``
+        :param X: inputs of the data (or batch)
+        :param Z: inducing inputs
+        :param likelihood: likelihood providing ``variational_expectations``
+        :param Y: observed outputs, two dimensional with one row per datum
+        :param mean_function: optional object whose ``f`` is evaluated at Z and X
+        :param Y_metadata: optional metadata forwarded to the likelihood
+        :param KL_scale: not used in this implementation
+        :param batch_scale: factor applied to the expected likelihood term and its gradients
+        :returns: tuple ``(Posterior, log_marginal, grad_dict)``; ``grad_dict`` holds
+            'dL_dKmm', 'dL_dKmn', 'dL_dKdiag', 'dL_dm', 'dL_dchol', 'dL_dthetaL' and,
+            when a mean function is given, 'dL_dmfZ' and 'dL_dmfX'
+        :raises ValueError: if the inverse of the q(u) covariance contains infinities
+        """
+
+        num_data, _ = Y.shape
+        num_inducing, num_outputs = q_u_mean.shape
+
+        #expand cholesky representation
+        L = choleskies.flat_to_triang(q_u_chol)
+
+
+        S = np.empty((num_outputs, num_inducing, num_inducing))
+        #fill S[i] = L[i] L[i]^T in place (third argument of np.dot is the output array)
+        [np.dot(L[i,:,:], L[i,:,:].T, S[i,:,:]) for i in range(num_outputs)]
+        #Si,_ = linalg.dpotri(np.asfortranarray(L), lower=1)
+        Si = choleskies.multiple_dpotri(L)
+        logdetS = np.array([2.*np.sum(np.log(np.abs(np.diag(L[i,:,:])))) for i in range(L.shape[0])])
+
+        if np.any(np.isinf(Si)):
+            raise ValueError("Cholesky representation unstable")
+
+        #compute mean function stuff
+        if mean_function is not None:
+            prior_mean_u = mean_function.f(Z)
+            prior_mean_f = mean_function.f(X)
+        else:
+            prior_mean_u = np.zeros((num_inducing, num_outputs))
+            prior_mean_f = np.zeros((num_data, num_outputs))
+
+        #compute kernel related stuff
+        Kmm = kern.K(Z)
+        Kmn = kern.K(Z, X)
+        Knn_diag = kern.Kdiag(X)
+        Lm = linalg.jitchol(Kmm)
+        logdetKmm = 2.*np.sum(np.log(np.diag(Lm)))
+        Kmmi, _ = linalg.dpotri(Lm)
+
+        #compute the marginal means and variances of q(f)
+        A, _ = linalg.dpotrs(Lm, Kmn)
+        mu = prior_mean_f + np.dot(A.T, q_u_mean - prior_mean_u)
+        v = np.empty((num_data, num_outputs))
+        for i in range(num_outputs):
+            tmp = dtrmm(1.0,L[i].T, A, lower=0, trans_a=0)
+            v[:,i] = np.sum(np.square(tmp),0)
+        v += (Knn_diag - np.sum(A*Kmn,0))[:,None]
+
+        #compute the KL term
+        Kmmim = np.dot(Kmmi, q_u_mean)
+        KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.sum(Kmmi[None,:,:]*S,1).sum(1) + 0.5*np.sum(q_u_mean*Kmmim,0)
+        KL = KLs.sum()
+        #gradient of the KL term (assuming zero mean function)
+        dKL_dm = Kmmim.copy()
+        dKL_dS = 0.5*(Kmmi[None,:,:] - Si)
+        dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(0)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T)
+
+        if mean_function is not None:
+            #adjust KL term for mean function
+            Kmmi_mfZ = np.dot(Kmmi, prior_mean_u)
+            KL += -np.sum(q_u_mean*Kmmi_mfZ)
+            KL += 0.5*np.sum(Kmmi_mfZ*prior_mean_u)
+
+            #adjust gradient for mean function
+            dKL_dm -= Kmmi_mfZ
+            dKL_dKmm += Kmmim.dot(Kmmi_mfZ.T)
+            dKL_dKmm -= 0.5*Kmmi_mfZ.dot(Kmmi_mfZ.T)
+
+            #compute gradients for mean_function
+            dKL_dmfZ = Kmmi_mfZ - Kmmim
+
+        #quadrature for the likelihood
+        F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v, Y_metadata=Y_metadata)
+
+        #rescale the F term if working on a batch
+        F, dF_dmu, dF_dv =  F*batch_scale, dF_dmu*batch_scale, dF_dv*batch_scale
+        if dF_dthetaL is not None:
+            dF_dthetaL =  dF_dthetaL.sum(1).sum(1)*batch_scale
+
+        #derivatives of expected likelihood, assuming zero mean function
+        Adv = A[None,:,:]*dF_dv.T[:,None,:] # As if dF_Dv is diagonal, D, M, N
+        Admu = A.dot(dF_dmu)
+        Adv = np.ascontiguousarray(Adv) # makes for faster operations later...(inc dsymm)
+        AdvA = np.dot(Adv.reshape(-1, num_data),A.T).reshape(num_outputs, num_inducing, num_inducing )
+        tmp = np.sum([np.dot(a,s) for a, s in zip(AdvA, S)],0).dot(Kmmi)
+        dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(0) - tmp - tmp.T
+        dF_dKmm = 0.5*(dF_dKmm + dF_dKmm.T) # necessary? GPy bug?
+        tmp = S.reshape(-1, num_inducing).dot(Kmmi).reshape(num_outputs, num_inducing , num_inducing )
+        tmp = 2.*(tmp - np.eye(num_inducing)[None, :,:])
+
+        dF_dKmn = Kmmim.dot(dF_dmu.T)
+        for a,b in zip(tmp, Adv):
+            dF_dKmn += np.dot(a.T, b)
+
+        dF_dm = Admu
+        dF_dS = AdvA
+
+        #adjust gradient to account for mean function
+        if mean_function is not None:
+            dF_dmfX = dF_dmu.copy()
+            dF_dmfZ = -Admu
+            dF_dKmn -= np.dot(Kmmi_mfZ, dF_dmu.T)
+            dF_dKmm += Admu.dot(Kmmi_mfZ.T)
+
+
+        #sum (gradients of) expected likelihood and KL part
+        log_marginal = F.sum() - KL
+        dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS- dKL_dS, dF_dKmm- dKL_dKmm, dF_dKmn
+
+        #gradient with respect to the Cholesky factors, flattened back to the input layout
+        dL_dchol = 2.*np.array([np.dot(a,b) for a, b in zip(dL_dS, L) ])
+        dL_dchol = choleskies.triang_to_flat(dL_dchol)
+
+        grad_dict = {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv.sum(1), 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
+        if mean_function is not None:
+            grad_dict['dL_dmfZ'] = dF_dmfZ - dKL_dmfZ
+            grad_dict['dL_dmfX'] = dF_dmfX
+        #S.T reverses the axes of S, so the output index comes last in cov
+        return Posterior(mean=q_u_mean, cov=S.T, K=Kmm, prior_mean=prior_mean_u), log_marginal, grad_dict
--- a/GPy/inference/latent_function_inference/var_dtc.py
+++ b/GPy/inference/latent_function_inference/var_dtc.py
@ -1,7 +1,7 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from posterior import Posterior
+from .posterior import Posterior
 from ...util.linalg import mdot, jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify
 from ...util import diag
 from ...core.parameterization.variational import VariationalPosterior
@ -21,7 +21,7 @@ class VarDTC(LatentFunctionInference):
    For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.

    """
-    const_jitter = 1e-6
+    const_jitter = 1e-8
    def __init__(self, limit=1):
        #self._YYTfactor_cache = caching.cache()
        from ...util.caching import Cacher
@ -64,9 +64,7 @@ class VarDTC(LatentFunctionInference):
    def get_VVTfactor(self, Y, prec):
        return Y * prec # TODO chache this, and make it effective

-
-
-    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None):
+    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None, psi0=None, psi1=None, psi2=None):

        _, output_dim = Y.shape
        uncertain_inputs = isinstance(X, VariationalPosterior)
@ -95,17 +93,28 @@ class VarDTC(LatentFunctionInference):

        # The rather complex computations of A, and the psi stats
        if uncertain_inputs:
-            psi0 = kern.psi0(Z, X)
-            psi1 = kern.psi1(Z, X)
+            if psi0 is None:
+                psi0 = kern.psi0(Z, X)
+            if psi1 is None:
+                psi1 = kern.psi1(Z, X)
            if het_noise:
-                psi2_beta = np.sum([kern.psi2(Z,X[i:i+1,:]) * beta_i for i,beta_i in enumerate(beta)],0)
+                if psi2 is not None:
+                    assert len(psi2.shape) == 3  # Need to have not summed out N
+                    #FIXME: Need testing. Assumes psi2 is laid out (N, M, M), so psi2[i] stands in for kern.psi2(Z, X[i:i+1,:])
+                    psi2_beta = np.sum([psi2[i, :, :] * beta_i for i,beta_i in enumerate(beta)],0)
+                else:
+                    psi2_beta = np.sum([kern.psi2(Z,X[i:i+1,:]) * beta_i for i,beta_i in enumerate(beta)],0)
            else:
-                psi2_beta = kern.psi2(Z,X) * beta
+                if psi2 is None:
+                    psi2 = kern.psi2(Z,X)
+                psi2_beta =  psi2 * beta
            LmInv = dtrtri(Lm)
            A = LmInv.dot(psi2_beta.dot(LmInv.T))
        else:
-            psi0 = kern.Kdiag(X)
-            psi1 = kern.K(X, Z)
+            if psi0 is None:
+                psi0 = kern.Kdiag(X)
+            if psi1 is None:
+                psi1 = kern.K(X, Z)
            if het_noise:
                tmp = psi1 * (np.sqrt(beta))
            else:
@ -170,7 +179,7 @@ class VarDTC(LatentFunctionInference):
        if VVT_factor.shape[1] == Y.shape[1]:
            woodbury_vector = Cpsi1Vf # == Cpsi1V
        else:
-            print 'foobar'
+            print('foobar')
            import ipdb; ipdb.set_trace()
            psi1V = np.dot(Y.T*beta, psi1).T
            tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
@ -213,7 +222,7 @@ def _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf,
        dL_dR = None
    elif het_noise:
        if uncertain_inputs:
-            raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented"
+            raise NotImplementedError("heteroscedatic derivates with uncertain inputs not implemented")
        else:
            #from ...util.linalg import chol_inv
            #LBi = chol_inv(LB)
--- a/GPy/inference/latent_function_inference/var_dtc_parallel.py
+++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py
@ -1,7 +1,7 @@
 # Copyright (c) 2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from posterior import Posterior
+from .posterior import Posterior
 from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri,pdinv
 from ...util import diag
 from ...core.parameterization.variational import VariationalPosterior
@ -24,7 +24,7 @@ class VarDTC_minibatch(LatentFunctionInference):
    For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.

    """
-    const_jitter = 1e-6
+    const_jitter = 1e-8
    def __init__(self, batchsize=None, limit=1, mpi_comm=None):

        self.batchsize = batchsize
@ -92,7 +92,7 @@ class VarDTC_minibatch(LatentFunctionInference):
        psi0_full = 0.
        YRY_full = 0.

-        for n_start in xrange(0,num_data,batchsize):
+        for n_start in range(0,num_data,batchsize):
            n_end = min(batchsize+n_start, num_data)
            if batchsize==num_data:
                Y_slice = Y
@ -169,19 +169,26 @@ class VarDTC_minibatch(LatentFunctionInference):

        Kmm = kern.K(Z).copy()
        diag.add(Kmm, self.const_jitter)
-        Lm = jitchol(Kmm, maxtries=100)
+        if not np.isfinite(Kmm).all():
+            print(Kmm)
+        Lm = jitchol(Kmm)
+        LmInv = dtrtri(Lm)

-        LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right')
+        LmInvPsi2LmInvT = LmInv.dot(psi2_full.dot(LmInv.T))
        Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT
-        LL = jitchol(Lambda, maxtries=100)
+        LL = jitchol(Lambda)
+        LLInv = dtrtri(LL)
        logdet_L = 2.*np.sum(np.log(np.diag(LL)))
-        b = dtrtrs(LL,dtrtrs(Lm,psi1Y_full.T)[0])[0]
+        LmLLInv = LLInv.dot(LmInv)
+        
+        b  = psi1Y_full.dot(LmLLInv.T)
        bbt = np.square(b).sum()
-        v = dtrtrs(Lm,dtrtrs(LL,b,trans=1)[0],trans=1)[0]
-
-        tmp  = -backsub_both_sides(LL, tdot(b)+output_dim*np.eye(input_dim), transpose='left')
-        dL_dpsi2R = backsub_both_sides(Lm, tmp+output_dim*np.eye(input_dim), transpose='left')/2.
-
+        v = b.dot(LmLLInv).T
+        LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)
+        
+        tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT+output_dim*np.eye(input_dim)).dot(LLInv)
+        dL_dpsi2R = LmInv.T.dot(tmp+output_dim*np.eye(input_dim)).dot(LmInv)/2.
+        
        # Cache intermediate results
        self.midRes['dL_dpsi2R'] = dL_dpsi2R
        self.midRes['v'] = v
@ -199,7 +206,7 @@ class VarDTC_minibatch(LatentFunctionInference):
        # Compute dL_dKmm
        #======================================================================

-        dL_dKmm =  dL_dpsi2R - output_dim*backsub_both_sides(Lm, LmInvPsi2LmInvT, transpose='left')/2.
+        dL_dKmm =  dL_dpsi2R - output_dim*LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2.

        #======================================================================
        # Compute the Posterior distribution of inducing points p(u|Y)
--- a/GPy/inference/latent_function_inference/var_gauss.py
+++ b/GPy/inference/latent_function_inference/var_gauss.py
@ -0,0 +1,69 @@
+# Copyright (c) 2015, James Hensman
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+import numpy as np
+from ...util.linalg import pdinv
+from .posterior import Posterior
+from . import LatentFunctionInference
+log_2_pi = np.log(2*np.pi)
+
+class VarGauss(LatentFunctionInference):
+    """
+    The Variational Gaussian Approximation revisited
+
+    @article{Opper:2009,
+        title = {The Variational Gaussian Approximation Revisited},
+        author = {Opper, Manfred and Archambeau, C{\'e}dric},
+        journal = {Neural Comput.},
+        year = {2009},
+        pages = {786--792},
+    }
+    """
+    def __init__(self, alpha, beta):
+        """
+        :param alpha: GPy.core.Param variational parameter
+        :param beta: GPy.core.Param variational parameter
+        """
+        self.alpha, self.beta = alpha, beta
+
+    def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, Z=None):
+        if mean_function is not None:
+            raise NotImplementedError
+        num_data, output_dim = Y.shape
+        assert output_dim ==1, "Only one output supported"
+
+        K = kern.K(X)
+        m = K.dot(self.alpha)
+        KB = K*self.beta[:, None]
+        BKB = KB*self.beta[None, :]
+        A = np.eye(num_data) + BKB
+        Ai, LA, _, Alogdet = pdinv(A)
+        Sigma = np.diag(self.beta**-2) - Ai/self.beta[:, None]/self.beta[None, :]  # posterior covariance: need full matrix for gradients
+        var = np.diag(Sigma).reshape(-1,1)
+
+        F, dF_dm, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, m, var, Y_metadata=Y_metadata)
+        if dF_dthetaL is not None:
+            dL_dthetaL = dF_dthetaL.sum(1).sum(1)
+        else:
+            dL_dthetaL = np.array([])
+        dF_da = np.dot(K, dF_dm)
+        SigmaB = Sigma*self.beta
+        #dF_db_ = -np.diag(Sigma.dot(np.diag(dF_dv.flatten())).dot(SigmaB))*2
+        dF_db = -2*np.sum(Sigma**2 * (dF_dv * self.beta), 0)
+        #assert np.allclose(dF_db, dF_db_)
+
+        KL = 0.5*(Alogdet + np.trace(Ai) - num_data + np.sum(m*self.alpha))
+        dKL_da = m
+        A_A2 = Ai - Ai.dot(Ai)
+        dKL_db = np.diag(np.dot(KB.T, A_A2))
+        log_marginal = F.sum() - KL
+        self.alpha.gradient = dF_da - dKL_da
+        self.beta.gradient = dF_db - dKL_db
+
+        # K-gradients
+        dKL_dK = 0.5*(self.alpha*self.alpha.T + self.beta[:, None]*self.beta[None, :]*A_A2)
+        tmp = Ai*self.beta[:, None]/self.beta[None, :]
+        dF_dK = self.alpha*dF_dm.T + np.dot(tmp*dF_dv, tmp.T)
+
+        return Posterior(mean=m, cov=Sigma ,K=K),\
+               log_marginal,\
+               {'dL_dK':dF_dK-dKL_dK, 'dL_dthetaL':dL_dthetaL}
--- a/GPy/inference/mcmc/hmc.py
+++ b/GPy/inference/mcmc/hmc.py
@ -1,4 +1,4 @@
-# ## Copyright (c) 2014, Zhenwen Dai
+# ## Copyright (c) 2014 Mu Niu, Zhenwen Dai and GPy Authors
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
@ -39,7 +39,7 @@ class HMC:
        :rtype: numpy.ndarray
        """
        params = np.empty((num_samples,self.p.size))
-        for i in xrange(num_samples):
+        for i in range(num_samples):
            self.p[:] = np.random.multivariate_normal(np.zeros(self.p.size),self.M)
            H_old = self._computeH()
            theta_old = self.model.optimizer_array.copy()
@ -59,7 +59,7 @@ class HMC:
        return params

    def _update(self, hmc_iters):
-        for i in xrange(hmc_iters):
+        for i in range(hmc_iters):
            self.p[:] += -self.stepsize/2.*self.model._transform_gradients(self.model.objective_function_gradients())
            self.model.optimizer_array = self.model.optimizer_array + self.stepsize*np.dot(self.Minv, self.p)
            self.p[:] += -self.stepsize/2.*self.model._transform_gradients(self.model.objective_function_gradients())
@ -82,7 +82,7 @@ class HMC_shortcut:

    def sample(self, m_iters=1000, hmc_iters=20):
        params = np.empty((m_iters,self.p.size))
-        for i in xrange(m_iters):
+        for i in range(m_iters):
            # sample a stepsize from the uniform distribution
            stepsize = np.exp(np.random.rand()*(self.stepsize_range[1]-self.stepsize_range[0])+self.stepsize_range[0])
            self.p[:] = np.random.multivariate_normal(np.zeros(self.p.size),self.M)
--- a/GPy/inference/mcmc/samplers.py
+++ b/GPy/inference/mcmc/samplers.py
@ -1,12 +1,19 @@
 # ## Copyright (c) 2014, Zhenwen Dai
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-
+from __future__ import print_function

 import numpy as np
-from scipy import linalg, optimize
 import sys


+try:
+    # In Python 2, cPickle is the faster C implementation of pickle. It does not exist in Python 3,
+    # where the standard pickle module uses the C implementation automatically if available.
+    import cPickle as pickle
+except ImportError:
+    import pickle
+
+
 class Metropolis_Hastings:
    def __init__(self,model,cov=None):
        """Metropolis Hastings, with tunings according to Gelman et al. """
--- a/GPy/inference/optimization/init.py
+++ b/GPy/inference/optimization/init.py
@ -1,2 +1,2 @@
-from scg import SCG
-from optimization import *
+from .scg import SCG
+from .optimization import *
--- a/GPy/inference/optimization/conjugate_gradient_descent.py
+++ b/GPy/inference/optimization/conjugate_gradient_descent.py
@ -1,7 +1,7 @@
 # Copyright (c) 2012-2014, Max Zwiessele
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from gradient_descent_update_rules import FletcherReeves, \
+from .gradient_descent_update_rules import FletcherReeves, \
    PolakRibiere
 from Queue import Empty
 from multiprocessing import Value
@ -74,7 +74,7 @@ class _Async_Optimization(Thread):
        if self.outq is not None:
            self.outq.put(self.SENTINEL)
        if self.messages:
-            print ""
+            print("")
        self.runsignal.clear()

    def run(self, *args, **kwargs):
@ -213,7 +213,7 @@ class Async_Optimize(object):
 #                     # print "^C"
 #                     self.runsignal.clear()
 #                     c.join()
-            print "WARNING: callback still running, optimisation done!"
+            print("WARNING: callback still running, optimisation done!")
        return p.result

 class CGD(Async_Optimize):
--- a/GPy/inference/optimization/optimization.py
+++ b/GPy/inference/optimization/optimization.py
@ -10,7 +10,7 @@ try:
    rasm_available = True
 except ImportError:
    rasm_available = False
-from scg import SCG
+from .scg import SCG

 class Optimizer():
    """
@ -31,12 +31,13 @@ class Optimizer():
                 ftol=None, gtol=None, xtol=None, bfgs_factor=None):
        self.opt_name = None
        self.x_init = x_init
-        self.messages = messages
+        # Turning messages off and using internal structure for print outs:
+        self.messages = False #messages
        self.f_opt = None
        self.x_opt = None
        self.funct_eval = None
        self.status = None
-        self.max_f_eval = int(max_f_eval)
+        self.max_f_eval = int(max_iters)
        self.max_iters = int(max_iters)
        self.bfgs_factor = bfgs_factor
        self.trace = None
@ -53,7 +54,7 @@ class Optimizer():
        self.time = str(end - start)

    def opt(self, f_fp=None, f=None, fp=None):
-        raise NotImplementedError, "this needs to be implemented to use the optimizer class"
+        raise NotImplementedError("this needs to be implemented to use the optimizer class")

    def plot(self):
        """
@ -124,9 +125,9 @@ class opt_lbfgsb(Optimizer):

        opt_dict = {}
        if self.xtol is not None:
-            print "WARNING: l-bfgs-b doesn't have an xtol arg, so I'm going to ignore it"
+            print("WARNING: l-bfgs-b doesn't have an xtol arg, so I'm going to ignore it")
        if self.ftol is not None:
-            print "WARNING: l-bfgs-b doesn't have an ftol arg, so I'm going to ignore it"
+            print("WARNING: l-bfgs-b doesn't have an ftol arg, so I'm going to ignore it")
        if self.gtol is not None:
            opt_dict['pgtol'] = self.gtol
        if self.bfgs_factor is not None:
@ -139,6 +140,10 @@ class opt_lbfgsb(Optimizer):
        self.funct_eval = opt_result[2]['funcalls']
        self.status = rcstrings[opt_result[2]['warnflag']]

+        #a more helpful error message is available in opt_result in the Error case
+        if opt_result[2]['warnflag']==2:
+            self.status = 'Error' + str(opt_result[2]['task'])
+
 class opt_simplex(Optimizer):
    def __init__(self, *args, **kwargs):
        Optimizer.__init__(self, *args, **kwargs)
@ -157,7 +162,7 @@ class opt_simplex(Optimizer):
        if self.ftol is not None:
            opt_dict['ftol'] = self.ftol
        if self.gtol is not None:
-            print "WARNING: simplex doesn't have an gtol arg, so I'm going to ignore it"
+            print("WARNING: simplex doesn't have an gtol arg, so I'm going to ignore it")

        opt_result = optimize.fmin(f, self.x_init, (), disp=self.messages,
                   maxfun=self.max_f_eval, full_output=True, **opt_dict)
@ -185,11 +190,11 @@ class opt_rasm(Optimizer):

        opt_dict = {}
        if self.xtol is not None:
-            print "WARNING: minimize doesn't have an xtol arg, so I'm going to ignore it"
+            print("WARNING: minimize doesn't have an xtol arg, so I'm going to ignore it")
        if self.ftol is not None:
-            print "WARNING: minimize doesn't have an ftol arg, so I'm going to ignore it"
+            print("WARNING: minimize doesn't have an ftol arg, so I'm going to ignore it")
        if self.gtol is not None:
-            print "WARNING: minimize doesn't have an gtol arg, so I'm going to ignore it"
+            print("WARNING: minimize doesn't have an gtol arg, so I'm going to ignore it")

        opt_result = rasm.minimize(self.x_init, f_fp, (), messages=self.messages,
                                   maxnumfuneval=self.max_f_eval)
--- a/GPy/inference/optimization/scg.py
+++ b/GPy/inference/optimization/scg.py
@ -21,14 +21,13 @@
 #      OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 #      POSSIBILITY OF SUCH DAMAGE.

-
+from __future__ import print_function
 import numpy as np
 import sys

-
 def print_out(len_maxiters, fnow, current_grad, beta, iteration):
-    print '\r',
-    print '{0:>0{mi}g}  {1:> 12e}  {2:< 12.6e}  {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, '  Scale:', beta, '\r',
+    print('\r', end=' ')
+    print('{0:>0{mi}g}  {1:> 12e}  {2:< 12.6e}  {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), end=' ') # print 'Iteration:', iteration, ' Objective:', fnow, '  Scale:', beta, '\r',
    sys.stdout.flush()

 def exponents(fnow, current_grad):
@ -61,6 +60,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
    function_eval = 1
    fnow = fold
    gradnew = gradf(x, *optargs) # Initial gradient.
+    function_eval += 1
    #if any(np.isnan(gradnew)):
    #    raise UnexpectedInfOrNan, "Gradient contribution resulted in a NaN value"
    current_grad = np.dot(gradnew, gradnew)
@ -79,7 +79,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,

    len_maxiters = len(str(maxiters))
    if display:
-        print ' {0:{mi}s}   {1:11s}    {2:11s}    {3:11s}'.format("I", "F", "Scale", "|g|", mi=len_maxiters)
+        print(' {0:{mi}s}   {1:11s}    {2:11s}    {3:11s}'.format("I", "F", "Scale", "|g|", mi=len_maxiters))
        exps = exponents(fnow, current_grad)
        p_iter = iteration

@ -96,6 +96,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
            sigma = sigma0 / np.sqrt(kappa)
            xplus = x + sigma * d
            gplus = gradf(xplus, *optargs)
+            function_eval += 1
            theta = np.dot(d, (gplus - gradnew)) / sigma

        # Increase effective curvature and evaluate step size alpha.
@ -111,10 +112,10 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
        fnew = f(xnew, *optargs)
        function_eval += 1

-#         if function_eval >= max_f_eval:
-#             status = "maximum number of function evaluations exceeded"
-#             break
-#             return x, flog, function_eval, status
+        if function_eval >= max_f_eval:
+            status = "maximum number of function evaluations exceeded"
+            break
+            return x, flog, function_eval, status

        Delta = 2.*(fnew - fold) / (alpha * mu)
        if Delta >= 0.:
@ -138,7 +139,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
                b = np.any(n_exps < exps)
                if a or b:
                    p_iter = iteration
-                    print ''
+                    print('')
                if b:
                    exps = n_exps

@ -156,6 +157,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
                # Update variables for new position
                gradold = gradnew
                gradnew = gradf(x, *optargs)
+                function_eval += 1
                current_grad = np.dot(gradnew, gradnew)
                fold = fnew
                # If the gradient is zero then we are done.
@ -186,6 +188,6 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,

    if display:
        print_out(len_maxiters, fnow, current_grad, beta, iteration)
-        print ""
-        print status
+        print("")
+        print(status)
    return x, flog, function_eval, status
--- a/GPy/inference/optimization/stochastics.py
+++ b/GPy/inference/optimization/stochastics.py
@ -5,6 +5,10 @@ class StochasticStorage(object):
    '''
    This is a container for holding the stochastic parameters,
    such as subset indices or step length and so on.
+
+    self.d has to be a list of lists:
+    [dimension indices, nan indices for those dimensions]
+    so that the minibatches can be used as efficiently as possible.
    '''
    def __init__(self, model):
        """
@ -28,28 +32,60 @@ class SparseGPMissing(StochasticStorage):
        """
        Here we want to loop over all dimensions everytime.
        Thus, we can just make sure the loop goes over self.d every
-        time.
+        time. Output dimensions that share the same pattern of missing
+        values are grouped into one batch, which speeds up calculations significantly.
        """
-        self.d = xrange(model.Y_normalized.shape[1])
+        import numpy as np
+        self.Y = model.Y_normalized
+        bdict = {}
+        # For N > 1000, array2string truncates its output by default, so lift the print threshold
+        opt = np.get_printoptions()
+        np.set_printoptions(threshold=np.inf)
+        for d in range(self.Y.shape[1]):
+            inan = np.isnan(self.Y)[:, d]
+            arr_str = np.array2string(inan, np.inf, 0, True, '', formatter={'bool':lambda x: '1' if x else '0'})
+            try:
+                bdict[arr_str][0].append(d)
+            except:
+                bdict[arr_str] = [[d], ~inan]
+        np.set_printoptions(**opt)
+        self.d = bdict.values()

 class SparseGPStochastics(StochasticStorage):
    """
    For the sparse gp we need to store the dimension we are in,
    and the indices corresponding to those
    """
-    def __init__(self, model, batchsize=1):
+    def __init__(self, model, batchsize=1, missing_data=True):
        self.batchsize = batchsize
        self.output_dim = model.Y.shape[1]
+        self.Y = model.Y_normalized
+        self.missing_data = missing_data
        self.reset()
        self.do_stochastics()

    def do_stochastics(self):
+        import numpy as np
        if self.batchsize == 1:
            self.current_dim = (self.current_dim+1)%self.output_dim
-            self.d = [self.current_dim]
+            self.d = [[[self.current_dim], np.isnan(self.Y[:, self.current_dim]) if self.missing_data else None]]
        else:
-            import numpy as np
            self.d = np.random.choice(self.output_dim, size=self.batchsize, replace=False)
+            bdict = {}
+            if self.missing_data:
+                opt = np.get_printoptions()
+                np.set_printoptions(threshold=np.inf)
+                for d in self.d:
+                    inan = np.isnan(self.Y[:, d])
+                    arr_str = np.array2string(inan,np.inf, 0,True, '',formatter={'bool':lambda x: '1' if x else '0'})
+                    try:
+                        bdict[arr_str][0].append(d)
+                    except:
+                        bdict[arr_str] = [[d], ~inan]
+                np.set_printoptions(**opt)
+                self.d = bdict.values()
+            else:
+                self.d = [[self.d, None]]

    def reset(self):
        self.current_dim = -1
--- a/GPy/kern/init.py
+++ b/GPy/kern/init.py
@ -1,19 +1,24 @@
-from _src.kern import Kern
-from _src.rbf import RBF
-from _src.linear import Linear, LinearFull
-from _src.static import Bias, White, Fixed
-from _src.brownian import Brownian
-from _src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine
-from _src.mlp import MLP
-from _src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52
-from _src.independent_outputs import IndependentOutputs, Hierarchical
-from _src.coregionalize import Coregionalize
-from _src.ODE_UY import ODE_UY
-from _src.ODE_UYC import ODE_UYC
-from _src.ODE_st import ODE_st
-from _src.ODE_t import ODE_t
-from _src.poly import Poly
-
-from _src.trunclinear import TruncLinear,TruncLinear_inf
-from _src.splitKern import SplitKern,DiffGenomeKern
+from ._src.kern import Kern
+from ._src.rbf import RBF
+from ._src.linear import Linear, LinearFull
+from ._src.static import Bias, White, Fixed
+from ._src.brownian import Brownian
+from ._src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine
+from ._src.mlp import MLP
+from ._src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52
+from ._src.standard_periodic import StdPeriodic
+from ._src.independent_outputs import IndependentOutputs, Hierarchical
+from ._src.coregionalize import Coregionalize
+from ._src.ODE_UY import ODE_UY
+from ._src.ODE_UYC import ODE_UYC
+from ._src.ODE_st import ODE_st
+from ._src.ODE_t import ODE_t
+from ._src.poly import Poly
+from ._src.eq_ode2 import EQ_ODE2
+from ._src.trunclinear import TruncLinear,TruncLinear_inf
+from ._src.splitKern import SplitKern,DEtime
+from ._src.splitKern import DEtime as DiffGenomeKern
+from ._src.spline import Spline
+from ._src.eq_ode2 import EQ_ODE2
+from ._src.basis_funcs import LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel

--- a/GPy/kern/_src/ODE_UY.py
+++ b/GPy/kern/_src/ODE_UY.py
@ -1,11 +1,11 @@
 # Copyright (c) 2013, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from kern import Kern
+from .kern import Kern
 from ...core.parameterization import Param
 from ...core.parameterization.transformations import Logexp
 import numpy as np
-from independent_outputs import index_to_slices
+from .independent_outputs import index_to_slices

 class ODE_UY(Kern):
    def __init__(self, input_dim, variance_U=3., variance_Y=1., lengthscale_U=1., lengthscale_Y=1., active_dims=None, name='ode_uy'):
@ -114,7 +114,7 @@ class ODE_UY(Kern):
                elif i==1:
                    Kdiag[s1]+= Vu*Vy*(k1+k2+k3)
                else:
-                    raise ValueError, "invalid input/output index"
+                    raise ValueError("invalid input/output index")
        #Kdiag[slices[0][0]]+= self.variance_U   #matern32 diag
        #Kdiag[slices[1][0]]+= self.variance_U*self.variance_Y*(k1+k2+k3)  #  diag
        return Kdiag
--- a/GPy/kern/_src/ODE_UYC.py
+++ b/GPy/kern/_src/ODE_UYC.py
@ -1,11 +1,11 @@
 # Copyright (c) 2013, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from kern import Kern
+from .kern import Kern
 from ...core.parameterization import Param
 from ...core.parameterization.transformations import Logexp
 import numpy as np
-from independent_outputs import index_to_slices
+from .independent_outputs import index_to_slices

 class ODE_UYC(Kern):
    def __init__(self, input_dim, variance_U=3., variance_Y=1., lengthscale_U=1., lengthscale_Y=1., ubias =1. ,active_dims=None, name='ode_uyc'):
@ -115,7 +115,7 @@ class ODE_UYC(Kern):
                elif i==1:
                    Kdiag[s1]+= Vu*Vy*(k1+k2+k3)
                else:
-                    raise ValueError, "invalid input/output index"
+                    raise ValueError("invalid input/output index")
        #Kdiag[slices[0][0]]+= self.variance_U   #matern32 diag
        #Kdiag[slices[1][0]]+= self.variance_U*self.variance_Y*(k1+k2+k3)  #  diag
        return Kdiag
--- a/GPy/kern/_src/ODE_st.py
+++ b/GPy/kern/_src/ODE_st.py
@ -1,10 +1,10 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-from kern import Kern
+from .kern import Kern
 from ...core.parameterization import Param
 from ...core.parameterization.transformations import Logexp
 import numpy as np
-from independent_outputs import index_to_slices
+from .independent_outputs import index_to_slices


 class ODE_st(Kern):
@ -135,7 +135,7 @@ class ODE_st(Kern):
                    Kdiag[s1]+= b**2*k1 - 2*a*c*k2 + a**2*k3 + c**2*vyt*vyx
                    #Kdiag[s1]+= Vu*Vy*(k1+k2+k3)
                else:
-                    raise ValueError, "invalid input/output index"
+                    raise ValueError("invalid input/output index")

        return Kdiag
        
--- a/GPy/kern/_src/ODE_t.py
+++ b/GPy/kern/_src/ODE_t.py
@ -1,8 +1,8 @@
-from kern import Kern
+from .kern import Kern
 from ...core.parameterization import Param
 from ...core.parameterization.transformations import Logexp
 import numpy as np
-from independent_outputs import index_to_slices
+from .independent_outputs import index_to_slices


 class ODE_t(Kern):
@ -85,7 +85,7 @@ class ODE_t(Kern):
                            Kdiag[s1]+= k1 + vyt+self.ubias
                            #Kdiag[s1]+= Vu*Vy*(k1+k2+k3)
                        else:
-                            raise ValueError, "invalid input/output index"
+                            raise ValueError("invalid input/output index")

                return Kdiag

--- a/GPy/kern/_src/add.py
+++ b/GPy/kern/_src/add.py
@ -4,7 +4,8 @@
 import numpy as np
 import itertools
 from ...util.caching import Cache_this
-from kern import CombinationKernel
+from .kern import CombinationKernel
+from functools import reduce

 class Add(CombinationKernel):
    """
@ -13,7 +14,7 @@ class Add(CombinationKernel):

    This kernel will take over the active dims of it's subkernels passed in.
    """
-    def __init__(self, subkerns, name='add'):
+    def __init__(self, subkerns, name='sum'):
        for i, kern in enumerate(subkerns[:]):
            if isinstance(kern, Add):
                del subkerns[i]
@ -70,24 +71,37 @@ class Add(CombinationKernel):
        target = np.zeros(X.shape)
        [target.__iadd__(p.gradients_X_diag(dL_dKdiag, X)) for p in self.parts]
        return target
-    
-    @Cache_this(limit=2, force_kwargs=['which_parts'])
+
+    def gradients_XX(self, dL_dK, X, X2):
+        if X2 is None:
+            target = np.zeros((X.shape[0], X.shape[0], X.shape[1]))
+        else:
+            target = np.zeros((X.shape[0], X2.shape[0], X.shape[1]))
+        [target.__iadd__(p.gradients_XX(dL_dK, X, X2)) for p in self.parts]
+        return target
+
+    def gradients_XX_diag(self, dL_dKdiag, X):
+        target = np.zeros(X.shape)
+        [target.__iadd__(p.gradients_XX_diag(dL_dKdiag, X)) for p in self.parts]
+        return target
+
+    @Cache_this(limit=1, force_kwargs=['which_parts'])
    def psi0(self, Z, variational_posterior):
        return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts))
-    
-    @Cache_this(limit=2, force_kwargs=['which_parts'])
+
+    @Cache_this(limit=1, force_kwargs=['which_parts'])
    def psi1(self, Z, variational_posterior):
        return reduce(np.add, (p.psi1(Z, variational_posterior) for p in self.parts))

-    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    @Cache_this(limit=1, force_kwargs=['which_parts'])
    def psi2(self, Z, variational_posterior):
        psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts))
        #return psi2
        # compute the "cross" terms
-        from static import White, Bias
-        from rbf import RBF
+        from .static import White, Bias
+        from .rbf import RBF
        #from rbf_inv import RBFInv
-        from linear import Linear
+        from .linear import Linear
        #ffrom fixed import Fixed

        for p1, p2 in itertools.combinations(self.parts, 2):
@ -111,11 +125,46 @@ class Add(CombinationKernel):
                psi2 += np.einsum('nm,no->mo',tmp1,tmp2)+np.einsum('nm,no->mo',tmp2,tmp1)
                #(tmp1[:, :, None] * tmp2[:, None, :]) + (tmp2[:, :, None] * tmp1[:, None, :])
            else:
-                raise NotImplementedError, "psi2 cannot be computed for this kernel"
+                raise NotImplementedError("psi2 cannot be computed for this kernel")
+        return psi2
+
+    @Cache_this(limit=1, force_kwargs=['which_parts'])
+    def psi2n(self, Z, variational_posterior):
+        psi2 = reduce(np.add, (p.psi2n(Z, variational_posterior) for p in self.parts))
+        #return psi2
+        # compute the "cross" terms
+        from .static import White, Bias
+        from .rbf import RBF
+        #from rbf_inv import RBFInv
+        from .linear import Linear
+        #ffrom fixed import Fixed
+
+        for p1, p2 in itertools.combinations(self.parts, 2):
+            # i1, i2 = p1.active_dims, p2.active_dims
+            # white doesn't combine with anything
+            if isinstance(p1, White) or isinstance(p2, White):
+                pass
+            # rbf X bias
+            #elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)):
+            elif isinstance(p1,  Bias) and isinstance(p2, (RBF, Linear)):
+                tmp = p2.psi1(Z, variational_posterior).sum(axis=0)
+                psi2 += p1.variance * (tmp[:, :, None] + tmp[:, None, :])
+            #elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
+            elif isinstance(p2, Bias) and isinstance(p1, (RBF, Linear)):
+                tmp = p1.psi1(Z, variational_posterior).sum(axis=0)
+                psi2 += p2.variance * (tmp[:, :, None] + tmp[:, None, :])
+            elif isinstance(p2, (RBF, Linear)) and isinstance(p1, (RBF, Linear)):
+                assert np.intersect1d(p1.active_dims, p2.active_dims).size == 0, "only non overlapping kernel dimensions allowed so far"
+                tmp1 = p1.psi1(Z, variational_posterior)
+                tmp2 = p2.psi1(Z, variational_posterior)
+                psi2 += np.einsum('nm,no->nmo',tmp1,tmp2)+np.einsum('nm,no->nmo',tmp2,tmp1)
+                #(tmp1[:, :, None] * tmp2[:, None, :]) + (tmp2[:, :, None] * tmp1[:, None, :])
+            else:
+                raise NotImplementedError("psi2 cannot be computed for this kernel")
        return psi2

    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        from static import White, Bias
+        from .static import White, Bias
        for p1 in self.parts:
            #compute the effective dL_dpsi1. Extra terms appear becaue of the cross terms in psi2!
            eff_dL_dpsi1 = dL_dpsi1.copy()
@ -125,13 +174,13 @@ class Add(CombinationKernel):
                if isinstance(p2, White):
                    continue
                elif isinstance(p2, Bias):
-                    eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2.
+                    eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
                else:# np.setdiff1d(p1.active_dims, ar2, assume_unique): # TODO: Careful, not correct for overlapping active_dims
-                    eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2.
+                    eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
            p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)

    def gradients_Z_expectations(self, dL_psi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        from static import White, Bias
+        from .static import White, Bias
        target = np.zeros(Z.shape)
        for p1 in self.parts:
            #compute the effective dL_dpsi1. extra terms appear becaue of the cross terms in psi2!
@ -142,14 +191,14 @@ class Add(CombinationKernel):
                if isinstance(p2, White):
                    continue
                elif isinstance(p2, Bias):
-                    eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2.
+                    eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
                else:
-                    eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2.
+                    eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
            target += p1.gradients_Z_expectations(dL_psi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
        return target

    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        from static import White, Bias
+        from .static import White, Bias
        target_grads = [np.zeros(v.shape) for v in variational_posterior.parameters]
        for p1 in self.parameters:
            #compute the effective dL_dpsi1. extra terms appear becaue of the cross terms in psi2!
@ -160,11 +209,11 @@ class Add(CombinationKernel):
                if isinstance(p2, White):
                    continue
                elif isinstance(p2, Bias):
-                    eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2.
+                    eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
                else:
-                    eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2.
+                    eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
            grads = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
-            [np.add(target_grads[i],grads[i],target_grads[i]) for i in xrange(len(grads))]
+            [np.add(target_grads[i],grads[i],target_grads[i]) for i in range(len(grads))]
        return target_grads

    def add(self, other):
@ -180,9 +229,12 @@ class Add(CombinationKernel):

    def input_sensitivity(self, summarize=True):
        if summarize:
-            return reduce(np.add, [k.input_sensitivity(summarize) for k in self.parts])
+            i_s = np.zeros((self.input_dim))
+            for k in self.parts:
+                i_s[k.active_dims] += k.input_sensitivity(summarize)
+            return i_s
        else:
            i_s = np.zeros((len(self.parts), self.input_dim))
            from operator import setitem
-            [setitem(i_s, (i, Ellipsis), k.input_sensitivity(summarize)) for i, k in enumerate(self.parts)]
+            [setitem(i_s, (i, k.active_dims), k.input_sensitivity(summarize)) for i, k in enumerate(self.parts)]
            return i_s
--- a/GPy/kern/_src/basis_funcs.py
+++ b/GPy/kern/_src/basis_funcs.py
@ -0,0 +1,183 @@
+# #Copyright (c) 2012, Max Zwiessele (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+from .kern import Kern
+from ...core.parameterization.param import Param
+from ...core.parameterization.transformations import Logexp
+import numpy as np
+from ...util.caching import Cache_this
+from ...util.linalg import tdot, mdot
+
+class BasisFuncKernel(Kern):
+    def __init__(self, input_dim, variance=1., active_dims=None, ARD=False, name='basis func kernel'):
+        """
+        Abstract superclass for kernels with explicit basis functions for use in GPy.
+
+        This class does NOT automatically add an offset to the design matrix phi!
+
+        :param input_dim: the number of input dimensions
+        :param variance: variance scaling the basis functions in K; with
+            ARD=True it is expanded to one variance per column of phi
+        :param active_dims: forwarded to the Kern constructor
+        :param ARD: if True, use a separate variance for every basis function
+        :param name: name of the kernel
+        """
+        super(BasisFuncKernel, self).__init__(input_dim, active_dims, name)
+        self.ARD = ARD
+        if self.ARD:
+            # Probe _phi with a single random input to find out how many
+            # basis functions (columns of phi) the subclass produces.
+            phi_test = self._phi(np.random.normal(0, 1, (1, self.input_dim)))
+            variance = variance * np.ones(phi_test.shape[1])
+        else:
+            variance = np.array(variance)
+        self.variance = Param('variance', variance, Logexp())
+        self.link_parameter(self.variance)
+
+    def parameters_changed(self):
+        # Quantities derived from the variance: alpha scales phi in _K,
+        # beta is the 1/variance factor used by the non-ARD gradients.
+        self.alpha = np.sqrt(self.variance)
+        self.beta = 1./self.variance
+
+    @Cache_this(limit=3, ignore_args=())
+    def phi(self, X):
+        """Cached evaluation of the design matrix, see _phi."""
+        return self._phi(X)
+
+    def _phi(self, X):
+        raise NotImplementedError('Overwrite this _phi function, which maps the input X into the higher dimensional space and returns the design matrix Phi')
+
+    def K(self, X, X2=None):
+        """Kernel matrix between X and X2 (between X and itself if X2 is None)."""
+        return self._K(X, X2)
+
+    def Kdiag(self, X, X2=None):
+        """Diagonal of K(X, X2), taken from the full kernel matrix."""
+        return np.diag(self._K(X, X2))
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        """Set self.variance.gradient given dL_dK, the gradient of the objective w.r.t. K."""
+        if self.ARD:
+            phi1 = self.phi(X)
+            if X2 is None or X is X2:
+                self.variance.gradient = np.einsum('ij,iq,jq->q', dL_dK, phi1, phi1)
+            else:
+                phi2 = self.phi(X2)
+                self.variance.gradient = np.einsum('ij,iq,jq->q', dL_dK, phi1, phi2)
+        else:
+            # K is linear in the single variance, so dK/dvariance = K / variance.
+            self.variance.gradient = np.einsum('ij,ij', dL_dK, self._K(X, X2)) * self.beta
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        """Set self.variance.gradient given dL_dKdiag, the gradient w.r.t. the diagonal of K."""
+        if self.ARD:
+            phi1 = self.phi(X)
+            self.variance.gradient = np.einsum('i,iq,iq->q', dL_dKdiag, phi1, phi1)
+        else:
+            self.variance.gradient = np.einsum('i,i', dL_dKdiag, self.Kdiag(X)) * self.beta
+
+    def concatenate_offset(self, X):
+        """Return X with a column of ones prepended."""
+        return np.c_[np.ones((X.shape[0], 1)), X]
+
+    def posterior_inf(self, X=None, posterior=None):
+        """
+        Do the posterior inference on the parameters given this kernels functions
+        and the model posterior, which has to be a GPy posterior, usually found at m.posterior, if m is a GPy model.
+        If not given we search for the highest parent to be a model, containing the posterior, and for X accordingly.
+
+        :param X: inputs at which phi is evaluated; defaults to the X of the highest parent
+        :param posterior: object providing woodbury_vector and woodbury_inv;
+            defaults to the posterior of the highest parent
+        :returns: a tuple of two arrays (presumably the posterior mean and
+            covariance of the basis function weights -- confirm)
+        :raises RuntimeError: if X or posterior is not given and the highest
+            parent does not provide it
+        """
+        if X is None:
+            try:
+                X = self._highest_parent_.X
+            # A missing attribute raises AttributeError, not NameError.
+            except AttributeError:
+                raise RuntimeError("This kernel is not part of a model and cannot be used for posterior inference")
+        if posterior is None:
+            try:
+                posterior = self._highest_parent_.posterior
+            except AttributeError:
+                raise RuntimeError("This kernel is not part of a model and cannot be used for posterior inference")
+        phi_alpha = self.phi(X) * self.variance
+        return (phi_alpha).T.dot(posterior.woodbury_vector), (np.eye(phi_alpha.shape[1])*self.variance - mdot(phi_alpha.T, posterior.woodbury_inv, phi_alpha))
+
+    @Cache_this(limit=3, ignore_args=())
+    def _K(self, X, X2):
+        # Scale the design matrix by alpha = sqrt(variance) and form the
+        # product of the scaled design matrices.
+        if X2 is None or X is X2:
+            phi = self.phi(X) * self.alpha
+            if phi.ndim != 2:
+                # A 1-d design vector is treated as a single basis function.
+                phi = phi[:, None]
+            return tdot(phi)
+        else:
+            phi1 = self.phi(X) * self.alpha
+            phi2 = self.phi(X2) * self.alpha
+            if phi1.ndim != 2:
+                phi1 = phi1[:, None]
+                phi2 = phi2[:, None]
+            return phi1.dot(phi2.T)
+
+
+class LinearSlopeBasisFuncKernel(BasisFuncKernel):
+    def __init__(self, input_dim, start, stop, variance=1., active_dims=None, ARD=False, name='linear_segment'):
+        """
+        A linear segment transformation. The segments start at start,
+        are then linear to stop and constant again. The segments are
+        centred by subtracting the midpoint (start+stop)/2, so that they
+        have exactly as much mass above as below the origin.
+
+        Start and stop can be tuples or lists of starts and stops.
+        Behaviour of start stop is as np.where(X<start) would do.
+
+        :param start: value(s) below which the input is held constant
+        :param stop: value(s) above which the input is held constant
+
+        The remaining arguments are passed on to BasisFuncKernel.
+        """
+
+        # start/stop must be set before the superclass constructor runs,
+        # because with ARD=True that constructor already evaluates _phi.
+        self.start = np.array(start)
+        self.stop = np.array(stop)
+        super(LinearSlopeBasisFuncKernel, self).__init__(input_dim, variance, active_dims, ARD, name)
+
+    @Cache_this(limit=3, ignore_args=())
+    def _phi(self, X):
+        # Clamp X from below at start and from above at stop, then subtract
+        # the midpoint of [start, stop].
+        phi = np.where(X < self.start, self.start, X)
+        phi = np.where(phi > self.stop, self.stop, phi)
+        return ((phi-(self.stop+self.start)/2.))#/(.5*(self.stop-self.start)))-1.
+
+class ChangePointBasisFuncKernel(BasisFuncKernel):
+    def __init__(self, input_dim, changepoint, variance=1., active_dims=None, ARD=False, name='changepoint'):
+        """
+        Basis function kernel with a step basis function: -1 below the
+        changepoint and +1 from the changepoint onwards.
+
+        :param changepoint: location(s) of the step, stored as a numpy array
+
+        The remaining arguments are passed on to BasisFuncKernel.
+        """
+        # Set before the superclass constructor runs, because with ARD=True
+        # that constructor already evaluates _phi.
+        self.changepoint = np.array(changepoint)
+        super(ChangePointBasisFuncKernel, self).__init__(input_dim, variance, active_dims, ARD, name)
+
+    @Cache_this(limit=3, ignore_args=())
+    def _phi(self, X):
+        # -1 where X is strictly below the changepoint, +1 elsewhere.
+        return np.where((X < self.changepoint), -1, 1)
+
+class DomainKernel(LinearSlopeBasisFuncKernel):
+    def __init__(self, input_dim, start, stop, variance=1., active_dims=None, ARD=False, name='constant_domain'):
+        """
+        Basis function kernel with an indicator basis function that is 1
+        strictly between start and stop and 0 elsewhere.
+
+        All arguments are passed on to LinearSlopeBasisFuncKernel, which
+        stores start and stop.
+        """
+        super(DomainKernel, self).__init__(input_dim, start, stop, variance, active_dims, ARD, name)
+
+    @Cache_this(limit=3, ignore_args=())
+    def _phi(self, X):
+        # Product of the two boolean masks acts as a logical and; both
+        # comparisons are strict, so the end points map to 0.
+        phi = np.where((X>self.start)*(X<self.stop), 1, 0)
+        return phi#((phi-self.start)/(self.stop-self.start))-.5
+
+class LogisticBasisFuncKernel(BasisFuncKernel):
+    def __init__(self, input_dim, centers, variance=1., slope=1., active_dims=None, ARD=False, ARD_slope=True, name='logistic'):
+        """
+        Basis function kernel with logistic basis functions
+        phi(x) = 1 / (1 + exp(-(x - center) * slope)).
+
+        :param centers: centers of the logistic functions (made at least 2-d)
+        :param slope: initial slope; a positive parameter (Logexp transform)
+        :param ARD_slope: if True, use one slope per center, otherwise a
+            single shared slope
+
+        The remaining arguments are passed on to BasisFuncKernel.
+        """
+        self.centers = np.atleast_2d(centers)
+        self.ARD_slope = ARD_slope
+        # The slope has to exist before the superclass constructor runs,
+        # because with ARD=True that constructor already evaluates _phi.
+        if self.ARD_slope:
+            self.slope = Param('slope', slope * np.ones(self.centers.size), Logexp())
+        else:
+            self.slope = Param('slope', slope, Logexp())
+        super(LogisticBasisFuncKernel, self).__init__(input_dim, variance, active_dims, ARD, name)
+        self.link_parameter(self.slope)
+
+    @Cache_this(limit=3, ignore_args=())
+    def _phi(self, X):
+        phi = 1/(1+np.exp(-((X-self.centers)*self.slope)))
+        # NaN entries are replaced by 0.
+        return np.where(np.isnan(phi), 0, phi)#((phi-self.start)/(self.stop-self.start))-.5
+
+    def parameters_changed(self):
+        BasisFuncKernel.parameters_changed(self)
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        """
+        Set the gradients of the variance (via the superclass) and of the
+        slope, given dL_dK, the gradient of the objective w.r.t. K(X, X2).
+
+        Assumes dL_dK has one row per row of X and one column per row of
+        X2 (of X if X2 is None) -- TODO confirm against the callers.
+        NaN entries of the slope gradient are replaced by 0.
+        """
+        super(LogisticBasisFuncKernel, self).update_gradients_full(dL_dK, X, X2)
+        if X2 is None or X is X2:
+            phi1 = self.phi(X)
+            if phi1.ndim != 2:
+                phi1 = phi1[:, None]
+            # Derivative of the logistic function w.r.t. the slope.
+            dphi1_dl = (phi1**2) * (np.exp(-((X-self.centers)*self.slope)) * (X-self.centers))
+            # The factor 2 merges the two product-rule terms, which are
+            # equal when dL_dK is symmetric.
+            if self.ARD_slope:
+                self.slope.gradient = self.variance * 2 * np.einsum('ij,iq,jq->q', dL_dK, phi1, dphi1_dl)
+            else:
+                self.slope.gradient = self.variance * 2 * (dL_dK * phi1.dot(dphi1_dl.T)).sum()
+        else:
+            phi1 = self.phi(X)
+            phi2 = self.phi(X2)
+            if phi1.ndim != 2:
+                phi1 = phi1[:, None]
+                phi2 = phi2[:, None]
+            dphi1_dl = (phi1**2) * (np.exp(-((X-self.centers)*self.slope)) * (X-self.centers))
+            dphi2_dl = (phi2**2) * (np.exp(-((X2-self.centers)*self.slope)) * (X2-self.centers))
+            # Product rule: dK[i,j]/dslope = variance * (phi1[i]*dphi2[j] + dphi1[i]*phi2[j]).
+            # The variance scales BOTH terms, and phi1/dphi1 (rows of X) must
+            # pair with the first axis of dL_dK, phi2/dphi2 with the second.
+            if self.ARD_slope:
+                self.slope.gradient = self.variance * (np.einsum('ij,iq,jq->q', dL_dK, phi1, dphi2_dl) + np.einsum('ij,iq,jq->q', dL_dK, dphi1_dl, phi2))
+            else:
+                self.slope.gradient = self.variance * ((dL_dK * phi1.dot(dphi2_dl.T)).sum() + (dL_dK * dphi1_dl.dot(phi2.T)).sum())
+        self.slope.gradient = np.where(np.isnan(self.slope.gradient), 0, self.slope.gradient)
--- a/GPy/kern/_src/brownian.py
+++ b/GPy/kern/_src/brownian.py
@ -1,7 +1,7 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from kern import Kern
+from .kern import Kern
 from ...core.parameterization import Param
 from ...core.parameterization.transformations import Logexp
 import numpy as np
--- a/GPy/kern/_src/coregionalize.py
+++ b/GPy/kern/_src/coregionalize.py
@ -1,12 +1,16 @@
 # Copyright (c) 2012, James Hensman and Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from kern import Kern
+from .kern import Kern
 import numpy as np
-from scipy import weave
 from ...core.parameterization import Param
 from ...core.parameterization.transformations import Logexp
-from ...util.config import config # for assesing whether to use weave
+from ...util.config import config # for assesing whether to use cython
+try:
+    from . import coregionalize_cython
+    config.set('cython', 'working', 'True')
+except ImportError:
+    config.set('cython', 'working', 'False')

 class Coregionalize(Kern):
    """
@ -57,13 +61,8 @@ class Coregionalize(Kern):
        self.B = np.dot(self.W, self.W.T) + np.diag(self.kappa)

    def K(self, X, X2=None):
-        if config.getboolean('weave', 'working'):
-            try:
-                return self._K_weave(X, X2)
-            except:
-                print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
-                config.set('weave', 'working', 'False')
-                return self._K_numpy(X, X2)
+        if config.getboolean('cython', 'working'):
+            return self._K_cython(X, X2)
        else:
            return self._K_numpy(X, X2)

@ -76,36 +75,10 @@ class Coregionalize(Kern):
            index2 = np.asarray(X2, dtype=np.int)
            return self.B[index,index2.T]

-    def _K_weave(self, X, X2=None):
-        """compute the kernel function using scipy.weave"""
-        index = np.asarray(X, dtype=np.int)
-
+    def _K_cython(self, X, X2=None):
        if X2 is None:
-            target = np.empty((X.shape[0], X.shape[0]), dtype=np.float64)
-            code="""
-            for(int i=0;i<N; i++){
-              target[i+i*N] = B[index[i]+output_dim*index[i]];
-              for(int j=0; j<i; j++){
-                  target[j+i*N] = B[index[i]+output_dim*index[j]];
-                  target[i+j*N] = target[j+i*N];
-                }
-              }
-            """
-            N, B, output_dim = index.size, self.B, self.output_dim
-            weave.inline(code, ['target', 'index', 'N', 'B', 'output_dim'])
-        else:
-            index2 = np.asarray(X2, dtype=np.int)
-            target = np.empty((X.shape[0], X2.shape[0]), dtype=np.float64)
-            code="""
-            for(int i=0;i<num_inducing; i++){
-              for(int j=0; j<N; j++){
-                  target[i+j*num_inducing] = B[output_dim*index[j]+index2[i]];
-                }
-              }
-            """
-            N, num_inducing, B, output_dim = index.size, index2.size, self.B, self.output_dim
-            weave.inline(code, ['target', 'index', 'index2', 'N', 'num_inducing', 'B', 'output_dim'])
-        return target
+            return coregionalize_cython.K_symmetric(self.B, np.asarray(X, dtype=np.int64)[:,0])
+        return coregionalize_cython.K_asymmetric(self.B, np.asarray(X, dtype=np.int64)[:,0], np.asarray(X2, dtype=np.int64)[:,0])


    def Kdiag(self, X):
@ -118,51 +91,37 @@ class Coregionalize(Kern):
        else:
            index2 = np.asarray(X2, dtype=np.int)

-        #attempt to use weave for a nasty double indexing loop: fall back to numpy
-        if config.getboolean('weave', 'working'):
-            try:
-                dL_dK_small = self._gradient_reduce_weave(dL_dK, index, index2)
-            except:
-                print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
-                config.set('weave', 'working', 'False')
-                dL_dK_small = self._gradient_reduce_weave(dL_dK, index, index2)
+        #attempt to use cython for a nasty double indexing loop: fall back to numpy
+        if config.getboolean('cython', 'working'):
+            dL_dK_small = self._gradient_reduce_cython(dL_dK, index, index2)
        else:
-            dL_dK_small = self._gradient_reduce_weave(dL_dK, index, index2)
+            dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)


-
-        dkappa = np.diag(dL_dK_small)
+        dkappa = np.diag(dL_dK_small).copy()
        dL_dK_small += dL_dK_small.T
        dW = (self.W[:, None, :]*dL_dK_small[:, :, None]).sum(0)

        self.W.gradient = dW
        self.kappa.gradient = dkappa

-    def _gradient_reduce_weave(self, dL_dK, index, index2):
-        dL_dK_small = np.zeros_like(self.B)
-        code="""
-        for(int i=0; i<num_inducing; i++){
-          for(int j=0; j<N; j++){
-            dL_dK_small[index[j] + output_dim*index2[i]] += dL_dK[i+j*num_inducing];
-          }
-        }
-        """
-        N, num_inducing, output_dim = index.size, index2.size, self.output_dim
-        weave.inline(code, ['N', 'num_inducing', 'output_dim', 'dL_dK', 'dL_dK_small', 'index', 'index2'])
-        return dL_dK_small
-
    def _gradient_reduce_numpy(self, dL_dK, index, index2):
        index, index2 = index[:,0], index2[:,0]
        dL_dK_small = np.zeros_like(self.B)
-        for i in range(k.output_dim):
+        for i in range(self.output_dim):
            tmp1 = dL_dK[index==i]
-            for j in range(k.output_dim):
+            for j in range(self.output_dim):
                dL_dK_small[j,i] = tmp1[:,index2==j].sum()
        return dL_dK_small

+    def _gradient_reduce_cython(self, dL_dK, index, index2):
+        """
+        Reduce dL_dK to the size of self.B using the compiled Cython routine.
+
+        :param dL_dK: gradient of the objective w.r.t. the kernel matrix
+        :param index: 2-d integer array; its first column holds the output
+            index for each row of dL_dK
+        :param index2: 2-d integer array; its first column holds the output
+            index for each column of dL_dK
+        """
+        # The Cython routine only accepts 1-d int64 buffers. Cast explicitly,
+        # as _K_cython does, because the platform default integer can be
+        # 32 bit; this is a no-op when the indices are already int64.
+        index = np.asarray(index[:, 0], dtype=np.int64)
+        index2 = np.asarray(index2[:, 0], dtype=np.int64)
+        return coregionalize_cython.gradient_reduce(self.B.shape[0], dL_dK, index, index2)
+
+
    def update_gradients_diag(self, dL_dKdiag, X):
        index = np.asarray(X, dtype=np.int).flatten()
-        dL_dKdiag_small = np.array([dL_dKdiag[index==i].sum() for i in xrange(self.output_dim)])
+        dL_dKdiag_small = np.array([dL_dKdiag[index==i].sum() for i in range(self.output_dim)])
        self.W.gradient = 2.*self.W*dL_dKdiag_small[:, None]
        self.kappa.gradient = dL_dKdiag_small

@ -171,4 +130,3 @@ class Coregionalize(Kern):

    def gradients_X_diag(self, dL_dKdiag, X):
        return np.zeros(X.shape)
-
--- a/GPy/kern/_src/coregionalize_cython.c
+++ b/GPy/kern/_src/coregionalize_cython.c
--- a/GPy/kern/_src/coregionalize_cython.pyx
+++ b/GPy/kern/_src/coregionalize_cython.pyx
@ -0,0 +1,38 @@
+#cython: boundscheck=False
+#cython: wraparound=False
+#cython: nonecheck=False
+import cython
+import numpy as np
+cimport numpy as np
+
+def K_symmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X):
+    """
+    Return the N x N matrix K with K[n, m] = B[X[n], X[m]], where N = X.size.
+
+    Bounds checking and negative-index wraparound are switched off for this
+    module, so every entry of X must be a valid non-negative index into
+    both axes of B.
+    """
+    cdef int N = X.size
+    cdef np.ndarray[np.double_t, ndim=2, mode='c'] K = np.empty((N, N))
+    # Every entry of K is written below, so np.empty is sufficient.
+    with nogil:
+        for n in range(N):
+            for m in range(N):
+                K[n, m] = B[X[n], X[m]]
+    return K
+
+def K_asymmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X, np.ndarray[np.int64_t, ndim=1] X2):
+    """
+    Return the N x M matrix K with K[n, m] = B[X[n], X2[m]], where
+    N = X.size and M = X2.size.
+
+    Bounds checking and negative-index wraparound are switched off for this
+    module, so X must hold valid non-negative row indices and X2 valid
+    non-negative column indices of B.
+    """
+    cdef int N = X.size
+    cdef int M = X2.size
+    cdef np.ndarray[np.double_t, ndim=2, mode='c'] K = np.empty((N, M))
+    # Every entry of K is written below, so np.empty is sufficient.
+    with nogil:
+        for n in range(N):
+            for m in range(M):
+                K[n, m] = B[X[n], X2[m]]
+    return K
+
+def gradient_reduce(int D, np.ndarray[double, ndim=2] dL_dK, np.ndarray[np.int64_t, ndim=1] index, np.ndarray[np.int64_t, ndim=1] index2):
+        """
+        Accumulate the entries of dL_dK into a D x D matrix: dL_dK[i, j] is
+        added to entry [index2[j], index[i]] of the result.
+
+        Bounds checking and negative-index wraparound are switched off for
+        this module, so all entries of index and index2 must lie in
+        0..D-1, and dL_dK must have at least index.size rows and
+        index2.size columns.
+        """
+        # Start from zeros because the result is built up by accumulation.
+        cdef np.ndarray[np.double_t, ndim=2, mode='c'] dL_dK_small = np.zeros((D, D))
+        cdef int N = index.size
+        cdef int M = index2.size
+        with nogil:
+            for i in range(N):
+                for j in range(M):
+                    dL_dK_small[index2[j],index[i]] += dL_dK[i,j];
+        return dL_dK_small
+
+
+
--- a/GPy/kern/_src/eq_ode2.py
+++ b/GPy/kern/_src/eq_ode2.py
--- a/GPy/kern/_src/independent_outputs.py
+++ b/GPy/kern/_src/independent_outputs.py
@ -2,13 +2,13 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from kern import Kern, CombinationKernel
+from .kern import Kern, CombinationKernel
 import numpy as np
 import itertools

 def index_to_slices(index):
    """
-    take a numpy array of integers (index) and return a  nested list of slices such that the slices describe the start, stop points for each integer in the index. 
+    take a numpy array of integers (index) and return a  nested list of slices such that the slices describe the start, stop points for each integer in the index.

    e.g.
    >>> index = np.asarray([0,0,0,1,1,1,2,2,2])
@ -79,10 +79,10 @@ class IndependentOutputs(CombinationKernel):

    def update_gradients_full(self,dL_dK,X,X2=None):
        slices = index_to_slices(X[:,self.index_dim])
-        if self.single_kern: 
+        if self.single_kern:
            target = np.zeros(self.kern.size)
            kerns = itertools.repeat(self.kern)
-        else: 
+        else:
            kerns = self.kern
            target = [np.zeros(kern.size) for kern, _ in zip(kerns, slices)]
        def collate_grads(kern, i, dL, X, X2):
@ -94,20 +94,24 @@ class IndependentOutputs(CombinationKernel):
        else:
            slices2 = index_to_slices(X2[:,self.index_dim])
            [[[collate_grads(kern, i, dL_dK[s,s2],X[s],X2[s2]) for s in slices_i] for s2 in slices_j] for i,(kern,slices_i,slices_j) in enumerate(zip(kerns,slices,slices2))]
-        if self.single_kern: kern.gradient = target
-        else:[kern.gradient.__setitem__(Ellipsis, target[i]) for i, [kern, _] in enumerate(zip(kerns, slices))]
+        if self.single_kern:
+            self.kern.gradient = target
+        else:
+            [kern.gradient.__setitem__(Ellipsis, target[i]) for i, [kern, _] in enumerate(zip(kerns, slices))]

    def gradients_X(self,dL_dK, X, X2=None):
        target = np.zeros(X.shape)
        kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
        if X2 is None:
            # TODO: make use of index_to_slices
+            # FIXME: Broken as X is already sliced out
+            print("Warning, gradients_X may not be working, I believe X has already been sliced out by the slicer!")
            values = np.unique(X[:,self.index_dim])
            slices = [X[:,self.index_dim]==i for i in values]
            [target.__setitem__(s, kern.gradients_X(dL_dK[s,s],X[s],None))
              for kern, s in zip(kerns, slices)]
            #slices = index_to_slices(X[:,self.index_dim])
-            #[[np.add(target[s], kern.gradients_X(dL_dK[s,s], X[s]), out=target[s]) 
+            #[[np.add(target[s], kern.gradients_X(dL_dK[s,s], X[s]), out=target[s])
            #  for s in slices_i] for kern, slices_i in zip(kerns, slices)]
            #import ipdb;ipdb.set_trace()
            #[[(np.add(target[s ], kern.gradients_X(dL_dK[s ,ss],X[s ], X[ss]), out=target[s ]),
@ -142,7 +146,7 @@ class IndependentOutputs(CombinationKernel):
            if self.single_kern: target[:] += kern.gradient
            else: target[i][:] += kern.gradient
        [[collate_grads(kern, i, dL_dKdiag[s], X[s,:]) for s in slices_i] for i, (kern, slices_i) in enumerate(zip(kerns, slices))]
-        if self.single_kern: kern.gradient = target
+        if self.single_kern: self.kern.gradient = target
        else:[kern.gradient.__setitem__(Ellipsis, target[i]) for i, [kern, _] in enumerate(zip(kerns, slices))]

 class Hierarchical(CombinationKernel):
--- a/GPy/kern/_src/kern.py
+++ b/GPy/kern/_src/kern.py
@ -4,17 +4,20 @@
 import sys
 import numpy as np
 from ...core.parameterization.parameterized import Parameterized
-from kernel_slice_operations import KernCallsViaSlicerMeta
+from .kernel_slice_operations import KernCallsViaSlicerMeta
 from ...util.caching import Cache_this
 from GPy.core.parameterization.observable_array import ObsAr
+from functools import reduce
+import six

-
-
+@six.add_metaclass(KernCallsViaSlicerMeta)
 class Kern(Parameterized):
    #===========================================================================
    # This adds input slice support. The rather ugly code for slicing can be
    # found in kernel_slice_operations
-    __metaclass__ = KernCallsViaSlicerMeta
+    # __metaclass__ is ignored in Python 3 - the metaclass needs to be passed in the class definition
+    #__metaclass__ = KernCallsViaSlicerMeta
+    #Here, we use the Python module six to support Py3 and Py2 simultaneously
    #===========================================================================
    _support_GPU=False
    def __init__(self, input_dim, active_dims, name, useGPU=False, *a, **kw):
@ -55,20 +58,9 @@ class Kern(Parameterized):

        self._sliced_X = 0
        self.useGPU = self._support_GPU and useGPU
-        self._return_psi2_n_flag = ObsAr(np.zeros(1)).astype(bool)

-    @property
-    def return_psi2_n(self):
-        """
-        Flag whether to pass back psi2 as NxMxM or MxM, by summing out N.
-        """
-        return self._return_psi2_n_flag[0]
-    @return_psi2_n.setter
-    def return_psi2_n(self, val):
-        def visit(self):
-            if isinstance(self, Kern):
-                self._return_psi2_n_flag[0]=val
-        self.traverse(visit)
+        from .psi_comp import PSICOMP_GH
+        self.psicomp = PSICOMP_GH()

    @Cache_this(limit=20)
    def _slice_X(self, X):
@ -78,6 +70,9 @@ class Kern(Parameterized):
        """
        Compute the kernel function.

+        .. math::
+            K_{ij} = k(X_i, X_j)
+
        :param X: the first set of inputs to the kernel
        :param X2: (optional) the second set of arguments to the kernel. If X2
                   is None, this is passed throgh to the 'part' object, which
@ -85,16 +80,64 @@ class Kern(Parameterized):
        """
        raise NotImplementedError
    def Kdiag(self, X):
+        """
+        The diagonal of the kernel matrix K
+
+        .. math::
+            Kdiag_{i} = k(X_i, X_i)
+        """
        raise NotImplementedError
    def psi0(self, Z, variational_posterior):
-        raise NotImplementedError
+        """
+        .. math::
+            \psi_0 = \sum_{i=0}^{n}E_{q(X)}[k(X_i, X_i)]
+        """
+        return self.psicomp.psicomputations(self, Z, variational_posterior)[0]
    def psi1(self, Z, variational_posterior):
-        raise NotImplementedError
+        """
+        .. math::
+            \psi_1^{n,m} = E_{q(X)}[k(X_n, Z_m)]
+        """
+        return self.psicomp.psicomputations(self, Z, variational_posterior)[1]
    def psi2(self, Z, variational_posterior):
-        raise NotImplementedError
+        """
+        .. math::
+            \psi_2^{m,m'} = \sum_{i=0}^{n}E_{q(X)}[ k(Z_m, X_i) k(X_i, Z_{m'})]
+        """
+        return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=False)[2]
+    def psi2n(self, Z, variational_posterior):
+        r"""
+        .. math::
+            \psi_2^{n,m,m'} = E_{q(X)}[ k(Z_m, X_n) k(X_n, Z_{m'})]
+
+        Thus, we do not sum out n, compared to psi2
+
+        Returns the third entry of the tuple computed by
+        self.psicomp.psicomputations, called with return_psi2_n=True.
+        """
+        return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=True)[2]
    def gradients_X(self, dL_dK, X, X2):
+        """
+        .. math::
+
+            \\frac{\partial L}{\partial X} = \\frac{\partial L}{\partial K}\\frac{\partial K}{\partial X}
+        """
        raise NotImplementedError
+    def gradients_X_X2(self, dL_dK, X, X2):
+        """
+        Return the pair (gradient w.r.t. X, gradient w.r.t. X2), both
+        obtained from gradients_X; the second one is computed with dL_dK
+        transposed and the roles of X and X2 swapped.
+        """
+        grad_wrt_X = self.gradients_X(dL_dK, X, X2)
+        grad_wrt_X2 = self.gradients_X(dL_dK.T, X2, X)
+        return grad_wrt_X, grad_wrt_X2
+    def gradients_XX(self, dL_dK, X, X2):
+        r"""
+        Second derivative of the kernel w.r.t. X and X2, chained with dL_dK:
+
+        .. math::
+
+            \frac{\partial^2 L}{\partial X\partial X_2} = \frac{\partial L}{\partial K}\frac{\partial^2 K}{\partial X\partial X_2}
+
+        :raises NotImplementedError: always in this base class; kernels
+            providing second derivatives override this method.
+        """
+        # Raise an exception instance: raising the tuple (class, message) is
+        # a TypeError on Python 3 and drops the message on Python 2.
+        raise NotImplementedError("This is the second derivative of K wrt X and X2, and not implemented for this kernel")
+    def gradients_XX_diag(self, dL_dKdiag, X):
+        """
+        The diagonal of the second derivative w.r.t. X and X2
+
+        :raises NotImplementedError: always in this base class; kernels
+            providing second derivatives override this method.
+        """
+        # Raise an exception instance: raising the tuple (class, message) is
+        # a TypeError on Python 3 and drops the message on Python 2.
+        raise NotImplementedError("This is the diagonal of the second derivative of K wrt X and X2, and not implemented for this kernel")
    def gradients_X_diag(self, dL_dKdiag, X):
+        """
+        The diagonal of the derivative w.r.t. X
+        """
        raise NotImplementedError

    def update_gradients_diag(self, dL_dKdiag, X):
@ -110,27 +153,35 @@ class Kern(Parameterized):
        Set the gradients of all parameters when doing inference with
        uncertain inputs, using expectations of the kernel.

-        The esential maths is
+        The essential maths is

-        dL_d{theta_i} = dL_dpsi0 * dpsi0_d{theta_i} +
-                        dL_dpsi1 * dpsi1_d{theta_i} +
-                        dL_dpsi2 * dpsi2_d{theta_i}
+        .. math::
+
+            \\frac{\partial L}{\partial \\theta_i} & = \\frac{\partial L}{\partial \psi_0}\\frac{\partial \psi_0}{\partial \\theta_i}\\
+                & \quad + \\frac{\partial L}{\partial \psi_1}\\frac{\partial \psi_1}{\partial \\theta_i}\\
+                & \quad + \\frac{\partial L}{\partial \psi_2}\\frac{\partial \psi_2}{\partial \\theta_i}
+
+        Thus, we push the different derivatives through the gradients of the psi
+        statistics. Be sure to set the gradients for all kernel
+        parameters here.
        """
-        raise NotImplementedError
+        dtheta = self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[0]
+        self.gradient[:] = dtheta

-    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
+                                psi0=None, psi1=None, psi2=None):
        """
        Returns the derivative of the objective wrt Z, using the chain rule
        through the expectation variables.
        """
-        raise NotImplementedError
+        return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[1]

    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        """
        Compute the gradients wrt the parameters of the variational
        distruibution q(X), chain-ruling via the expectations of the kernel
        """
-        raise NotImplementedError
+        return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[2:]

    def plot(self, x=None, fignum=None, ax=None, title=None, plot_limits=None, resolution=None, **mpl_kwargs):
        """
@ -169,7 +220,7 @@ class Kern(Parameterized):
    def __iadd__(self, other):
        return self.add(other)

-    def add(self, other, name='add'):
+    def add(self, other, name='sum'):
        """
        Add another kernel to this one.

@ -178,7 +229,7 @@ class Kern(Parameterized):

        """
        assert isinstance(other, Kern), "only kernels can be added to kernels..."
-        from add import Add
+        from .add import Add
        return Add([self, other], name=name)

    def __mul__(self, other):
@ -208,7 +259,7 @@ class Kern(Parameterized):

        """
        assert isinstance(other, Kern), "only kernels can be multiplied to kernels..."
-        from prod import Prod
+        from .prod import Prod
        #kernels = []
        #if isinstance(self, Prod): kernels.extend(self.parameters)
        #else: kernels.append(self)
--- a/GPy/kern/_src/kernel_slice_operations.py
+++ b/GPy/kern/_src/kernel_slice_operations.py
@ -1,7 +1,11 @@
 '''
 Created on 11 Mar 2014

-@author: maxz
+@author: @mzwiessele
+
+This module provides a meta class for the kernels. The meta class is for
+slicing the inputs (X, X2) for the kernels, before K (or any other method involving X)
+gets called. The `active_dims` of a kernel decide which dimensions the kernel works on.
 '''
 from ...core.parameterization.parameterized import ParametersChangedMeta
 import numpy as np
@ -19,20 +23,27 @@ class KernCallsViaSlicerMeta(ParametersChangedMeta):
        put_clean(dct, 'update_gradients_full', _slice_update_gradients_full)
        put_clean(dct, 'update_gradients_diag', _slice_update_gradients_diag)
        put_clean(dct, 'gradients_X', _slice_gradients_X)
+        put_clean(dct, 'gradients_X_X2', _slice_gradients_X)
+        put_clean(dct, 'gradients_XX', _slice_gradients_XX)
+        put_clean(dct, 'gradients_XX_diag', _slice_gradients_X_diag)
        put_clean(dct, 'gradients_X_diag', _slice_gradients_X_diag)

        put_clean(dct, 'psi0', _slice_psi)
        put_clean(dct, 'psi1', _slice_psi)
        put_clean(dct, 'psi2', _slice_psi)
+        put_clean(dct, 'psi2n', _slice_psi)
        put_clean(dct, 'update_gradients_expectations', _slice_update_gradients_expectations)
        put_clean(dct, 'gradients_Z_expectations', _slice_gradients_Z_expectations)
        put_clean(dct, 'gradients_qX_expectations', _slice_gradients_qX_expectations)
        return super(KernCallsViaSlicerMeta, cls).__new__(cls, name, bases, dct)

 class _Slice_wrap(object):
-    def __init__(self, k, X, X2=None):
+    def __init__(self, k, X, X2=None, ret_shape=None):
        self.k = k
-        self.shape = X.shape
+        if ret_shape is None:
+            self.shape = X.shape
+        else:
+            self.shape = ret_shape
        assert X.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X.shape={!s}".format(X.shape)
        if X2 is not None:
            assert X2.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X2.shape={!s}".format(X2.shape)
@ -54,7 +65,10 @@ class _Slice_wrap(object):
    def handle_return_array(self, return_val):
        if self.ret:
            ret = np.zeros(self.shape)
-            ret[:, self.k.active_dims] = return_val
+            if len(self.shape) == 2:
+                ret[:, self.k.active_dims] = return_val
+            elif len(self.shape) == 3:
+                ret[:, :, self.k.active_dims] = return_val
            return ret
        return return_val

@ -98,6 +112,19 @@ def _slice_gradients_X(f):
        return ret
    return wrap

+def _slice_gradients_XX(f):
+    """
+    Decorator for gradients_XX implementations: calls f with the inputs
+    prepared by _Slice_wrap and passes the result through
+    handle_return_array with a return shape of (N, M, X.shape[1]), where
+    N = X.shape[0] and M = X2.shape[0] (or N if X2 is None).
+    """
+    @wraps(f)
+    def wrap(self, dL_dK, X, X2=None):
+        num_rows = X.shape[0]
+        num_cols = num_rows if X2 is None else X2.shape[0]
+        full_shape = (num_rows, num_cols, X.shape[1])
+        with _Slice_wrap(self, X, X2, ret_shape=full_shape) as sliced:
+            raw_grads = f(self, dL_dK, sliced.X, sliced.X2)
+            ret = sliced.handle_return_array(raw_grads)
+        return ret
+    return wrap
+
 def _slice_gradients_X_diag(f):
    @wraps(f)
    def wrap(self, dL_dKdiag, X):
@ -124,7 +151,8 @@ def _slice_update_gradients_expectations(f):

 def _slice_gradients_Z_expectations(f):
    @wraps(f)
-    def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
+             psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None):
        with _Slice_wrap(self, Z, variational_posterior) as s:
            ret = s.handle_return_array(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2))
        return ret
@ -132,7 +160,8 @@ def _slice_gradients_Z_expectations(f):

 def _slice_gradients_qX_expectations(f):
    @wraps(f)
-    def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior,
+             psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None):
        with _Slice_wrap(self, variational_posterior, Z) as s:
            ret = list(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X2, s.X))
            r2 = ret[:2]
--- a/GPy/kern/_src/linear.py
+++ b/GPy/kern/_src/linear.py
@ -3,7 +3,7 @@


 import numpy as np
-from kern import Kern
+from .kern import Kern
 from ...util.linalg import tdot
 from ...core.parameterization import Param
 from ...core.parameterization.transformations import Logexp
@ -17,7 +17,7 @@ class Linear(Kern):

    .. math::

-       k(x,y) = \sum_{i=1}^input_dim \sigma^2_i x_iy_i
+       k(x,y) = \sum_{i=1}^{\\text{input_dim}} \sigma^2_i x_iy_i

    :param input_dim: the number of input dimensions
    :type input_dim: int
@ -100,6 +100,12 @@ class Linear(Kern):
            #return (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
            return np.einsum('jq,q,ij->iq', X2, self.variances, dL_dK)

+    def gradients_XX(self, dL_dK, X, X2=None):
+        """Return self.variances broadcast over an array of X's shape, doubled when X2 is None."""
+        # NOTE(review): dL_dK is not used here -- confirm that is intended.
+        ones = np.ones(X.shape)
+        return (2*ones if X2 is None else ones)*self.variances
+
    def gradients_X_diag(self, dL_dKdiag, X):
        return 2.*self.variances*dL_dKdiag[:,None]*X

@ -111,26 +117,29 @@ class Linear(Kern):
    #---------------------------------------#

    def psi0(self, Z, variational_posterior):
-        return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[0]
+        return self.psicomp.psicomputations(self, Z, variational_posterior)[0]

    def psi1(self, Z, variational_posterior):
-        return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[1]
+        return self.psicomp.psicomputations(self, Z, variational_posterior)[1]

    def psi2(self, Z, variational_posterior):
-        return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[2]
+        return self.psicomp.psicomputations(self, Z, variational_posterior)[2]
+
+    def psi2n(self, Z, variational_posterior):
+        return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=True)[2]

    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        dL_dvar = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[0]
+        dL_dvar = self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[0]
        if self.ARD:
            self.variances.gradient = dL_dvar
        else:
            self.variances.gradient = dL_dvar.sum()

    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[1]
+        return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[1]

    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[2:]
+        return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[2:]

 class LinearFull(Kern):
    def __init__(self, input_dim, rank, W=None, kappa=None, active_dims=None, name='linear_full'):
--- a/GPy/kern/_src/mlp.py
+++ b/GPy/kern/_src/mlp.py
@ -1,10 +1,12 @@
 # Copyright (c) 2013, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from kern import Kern
+from .kern import Kern
 from ...core.parameterization import Param
 from ...core.parameterization.transformations import Logexp
 import numpy as np
+from ...util.linalg import tdot
+from ...util.caching import Cache_this
 four_over_tau = 2./np.pi

 class MLP(Kern):
@ -31,7 +33,7 @@ class MLP(Kern):

    """

-    def __init__(self, input_dim, variance=1., weight_variance=1., bias_variance=100., active_dims=None, name='mlp'):
+    def __init__(self, input_dim, variance=1., weight_variance=1., bias_variance=1., active_dims=None, name='mlp'):
        super(MLP, self).__init__(input_dim, active_dims, name)
        self.variance = Param('variance', variance, Logexp())
        self.weight_variance = Param('weight_variance', weight_variance, Logexp())
@ -39,97 +41,84 @@ class MLP(Kern):
        self.link_parameters(self.variance, self.weight_variance, self.bias_variance)


+    @Cache_this(limit=20, ignore_args=())
    def K(self, X, X2=None):
-        self._K_computations(X, X2)
-        return self.variance*self._K_dvar
+        if X2 is None:
+            X_denom = np.sqrt(self._comp_prod(X)+1.)
+            X2_denom = X_denom
+            X2 = X
+        else:
+            X_denom = np.sqrt(self._comp_prod(X)+1.)
+            X2_denom = np.sqrt(self._comp_prod(X2)+1.)
+        XTX = self._comp_prod(X,X2)/X_denom[:,None]/X2_denom[None,:]
+        return self.variance*four_over_tau*np.arcsin(XTX)

+    @Cache_this(limit=20, ignore_args=())
    def Kdiag(self, X):
        """Compute the diagonal of the covariance matrix for X."""
-        self._K_diag_computations(X)
-        return self.variance*self._K_diag_dvar
+        X_prod = self._comp_prod(X)
+        return self.variance*four_over_tau*np.arcsin(X_prod/(X_prod+1.))

    def update_gradients_full(self, dL_dK, X, X2=None):
        """Derivative of the covariance with respect to the parameters."""
-        self._K_computations(X, X2)
-        self.variance.gradient = np.sum(self._K_dvar*dL_dK)
+        dvar, dw, db = self._comp_grads(dL_dK, X, X2)[:3]
+        self.variance.gradient = dvar
+        self.weight_variance.gradient = dw
+        self.bias_variance.gradient = db

-        denom3 = self._K_denom**3
-        base = four_over_tau*self.variance/np.sqrt(1-self._K_asin_arg*self._K_asin_arg)
-        base_cov_grad = base*dL_dK
-
-        if X2 is None:
-            vec = np.diag(self._K_inner_prod)
-            self.weight_variance.gradient = ((self._K_inner_prod/self._K_denom
-                           -.5*self._K_numer/denom3
-                           *(np.outer((self.weight_variance*vec+self.bias_variance+1.), vec)
-                             +np.outer(vec,(self.weight_variance*vec+self.bias_variance+1.))))*base_cov_grad).sum()
-            self.bias_variance.gradient = ((1./self._K_denom
-                           -.5*self._K_numer/denom3
-                           *((vec[None, :]+vec[:, None])*self.weight_variance
-                           +2.*self.bias_variance + 2.))*base_cov_grad).sum()
-        else:
-            vec1 = (X*X).sum(1)
-            vec2 = (X2*X2).sum(1)
-            self.weight_variance.gradient = ((self._K_inner_prod/self._K_denom
-                           -.5*self._K_numer/denom3
-                           *(np.outer((self.weight_variance*vec1+self.bias_variance+1.), vec2) + np.outer(vec1, self.weight_variance*vec2 + self.bias_variance+1.)))*base_cov_grad).sum()
-            self.bias_variance.gradient = ((1./self._K_denom
-                           -.5*self._K_numer/denom3
-                           *((vec1[:, None]+vec2[None, :])*self.weight_variance
-                             + 2*self.bias_variance + 2.))*base_cov_grad).sum()
-
-    def update_gradients_diag(self, X):
-        self._K_diag_computations(X)
-        self.variance.gradient = np.sum(self._K_diag_dvar*dL_dKdiag)
+    def update_gradients_diag(self, dL_dKdiag, X):
+        dvar, dw, db = self._comp_grads_diag(dL_dKdiag, X)[:3]
+        self.variance.gradient = dvar
+        self.weight_variance.gradient = dw
+        self.bias_variance.gradient = db
        
-        base = four_over_tau*self.variance/np.sqrt(1-self._K_diag_asin_arg*self._K_diag_asin_arg)
-        base_cov_grad = base*dL_dKdiag/np.square(self._K_diag_denom)
-        
-        self.weight_variance.gradient = (base_cov_grad*np.square(X).sum(axis=1)).sum()
-        self.bias_variance.gradient = base_cov_grad.sum()
-
    def gradients_X(self, dL_dK, X, X2):
        """Derivative of the covariance matrix with respect to X"""
-        self._K_computations(X, X2)
-        arg = self._K_asin_arg
-        numer = self._K_numer
-        denom = self._K_denom
-        denom3 = denom*denom*denom
-        if X2 is not None:
-            vec2 = (X2*X2).sum(1)*self.weight_variance+self.bias_variance + 1.
-            return four_over_tau*self.weight_variance*self.variance*((X2[None, :, :]/denom[:, :, None] - vec2[None, :, None]*X[:, None, :]*(numer/denom3)[:, :, None])*(dL_dK/np.sqrt(1-arg*arg))[:, :, None]).sum(1)
-        else:
-            vec = (X*X).sum(1)*self.weight_variance+self.bias_variance + 1.
-            return 2*four_over_tau*self.weight_variance*self.variance*((X[None, :, :]/denom[:, :, None] - vec[None, :, None]*X[:, None, :]*(numer/denom3)[:, :, None])*(dL_dK/np.sqrt(1-arg*arg))[:, :, None]).sum(1)
+        return self._comp_grads(dL_dK, X, X2)[3]
+
+    def gradients_X_X2(self, dL_dK, X, X2):
+        """Derivatives of the covariance matrix with respect to X and X2, as the pair (dX, dX2)."""
+        return self._comp_grads(dL_dK, X, X2)[3:]

    def gradients_X_diag(self, dL_dKdiag, X):
        """Gradient of diagonal of covariance with respect to X"""
-        self._K_diag_computations(X)
-        arg = self._K_diag_asin_arg
-        denom = self._K_diag_denom
-        #numer = self._K_diag_numer
-        return four_over_tau*2.*self.weight_variance*self.variance*X*(1./denom*(1. - arg)*dL_dKdiag/(np.sqrt(1-arg*arg)))[:, None]
+        return self._comp_grads_diag(dL_dKdiag, X)[3]

-
-    def _K_computations(self, X, X2):
-        """Pre-computations for the covariance matrix (used for computing the covariance and its gradients."""
+    @Cache_this(limit=50, ignore_args=())
+    def _comp_prod(self, X, X2=None):
        if X2 is None:
-            self._K_inner_prod = np.dot(X,X.T)
-            self._K_numer = self._K_inner_prod*self.weight_variance + self.bias_variance
-            vec = np.diag(self._K_numer) + 1.
-            self._K_denom = np.sqrt(np.outer(vec,vec))
+            return (np.square(X)*self.weight_variance).sum(axis=1)+self.bias_variance
        else:
-            self._K_inner_prod = np.dot(X,X2.T)
-            self._K_numer = self._K_inner_prod*self.weight_variance + self.bias_variance
-            vec1 = (X*X).sum(1)*self.weight_variance + self.bias_variance + 1.
-            vec2 = (X2*X2).sum(1)*self.weight_variance + self.bias_variance + 1.
-            self._K_denom = np.sqrt(np.outer(vec1,vec2))
-        self._K_asin_arg = self._K_numer/self._K_denom
-        self._K_dvar = four_over_tau*np.arcsin(self._K_asin_arg)
-
-    def _K_diag_computations(self, X):
-        """Pre-computations concerning the diagonal terms (used for computation of diagonal and its gradients)."""
-        self._K_diag_numer = (X*X).sum(1)*self.weight_variance + self.bias_variance
-        self._K_diag_denom = self._K_diag_numer+1.
-        self._K_diag_asin_arg = self._K_diag_numer/self._K_diag_denom
-        self._K_diag_dvar = four_over_tau*np.arcsin(self._K_diag_asin_arg)
+            return (X*self.weight_variance).dot(X2.T)+self.bias_variance
+    
+    @Cache_this(limit=20, ignore_args=(1,))
+    def _comp_grads(self, dL_dK, X, X2=None):
+        var,w,b = self.variance, self.weight_variance, self.bias_variance
+        K = self.K(X, X2)
+        dvar = (dL_dK*K).sum()/var
+        X_prod = self._comp_prod(X)
+        X2_prod = self._comp_prod(X2) if X2 is not None else X_prod
+        XTX = self._comp_prod(X,X2) if X2 is not None else self._comp_prod(X, X)
+        common = var*four_over_tau/np.sqrt((X_prod[:,None]+1.)*(X2_prod[None,:]+1.)-np.square(XTX))*dL_dK
+        dw = (common*((XTX-b)/w-XTX*(((X_prod-b)/(w*(X_prod+1.)))[:,None]+((X2_prod-b)/(w*(X2_prod+1.)))[None,:])/2.)).sum()
+        db = (common*(1.-XTX*(1./(X_prod[:,None]+1.)+1./(X2_prod[None,:]+1.))/2.)).sum()
+        if X2 is None:
+            common = common+common.T
+            dX = common.dot(X)*w-((common*XTX).sum(axis=1)/(X_prod+1.))[:,None]*X*w
+            dX2 = dX
+        else:
+            dX = common.dot(X2)*w-((common*XTX).sum(axis=1)/(X_prod+1.))[:,None]*X*w
+            dX2 = common.T.dot(X)*w-((common*XTX).sum(axis=0)/(X2_prod+1.))[:,None]*X2*w
+        return dvar, dw, db, dX, dX2
+    
+    @Cache_this(limit=20, ignore_args=(1,))
+    def _comp_grads_diag(self, dL_dKdiag, X):
+        var,w,b = self.variance, self.weight_variance, self.bias_variance
+        K = self.Kdiag(X)
+        dvar = (dL_dKdiag*K).sum()/var
+        X_prod = self._comp_prod(X)
+        common = var*four_over_tau/(np.sqrt(1-np.square(X_prod/(X_prod+1)))*np.square(X_prod+1))*dL_dKdiag
+        dw = (common*(X_prod-b)).sum()/w
+        db = common.sum()
+        dX = common[:,None]*X*w*2
+        return dvar, dw, db, dX
--- a/GPy/kern/_src/periodic.py
+++ b/GPy/kern/_src/periodic.py
@ -3,11 +3,12 @@


 import numpy as np
-from kern import Kern
+from .kern import Kern
 from ...util.linalg import mdot
 from ...util.decorators import silence_errors
 from ...core.parameterization.param import Param
 from ...core.parameterization.transformations import Logexp
+from functools import reduce

 class Periodic(Kern):
    def __init__(self, input_dim, variance, lengthscale, period, n_freq, lower, upper, active_dims, name):
@ -67,8 +68,6 @@ class Periodic(Kern):
        return np.diag(self.K(X))


-
-
 class PeriodicExponential(Periodic):
    """
    Kernel of the periodic subspace (up to a given frequency) of a exponential
--- a/GPy/kern/_src/poly.py
+++ b/GPy/kern/_src/poly.py
@ -2,7 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
-from kern import Kern
+from .kern import Kern
 from ...core.parameterization import Param
 from ...core.parameterization.transformations import Logexp
 class Poly(Kern):
--- a/GPy/kern/_src/prod.py
+++ b/GPy/kern/_src/prod.py
@ -2,9 +2,10 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
-from kern import CombinationKernel
+from .kern import CombinationKernel
 from ...util.caching import Cache_this
 import itertools
+from functools import reduce


 def numpy_invalid_op_as_exception(func):
@ -53,31 +54,32 @@ class Prod(CombinationKernel):
            which_parts = self.parts
        return reduce(np.multiply, (p.Kdiag(X) for p in which_parts))

-    @numpy_invalid_op_as_exception
    def update_gradients_full(self, dL_dK, X, X2=None):
-        k = self.K(X,X2)*dL_dK
-        try:
-            for p in self.parts:
-                p.update_gradients_full(k/p.K(X,X2),X,X2)
-        except FloatingPointError:
+        if len(self.parts)==2:
+            self.parts[0].update_gradients_full(dL_dK*self.parts[1].K(X,X2), X, X2)
+            self.parts[1].update_gradients_full(dL_dK*self.parts[0].K(X,X2), X, X2)
+        else:
            for combination in itertools.combinations(self.parts, len(self.parts) - 1):
                prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
                to_update = list(set(self.parts) - set(combination))[0]
                to_update.update_gradients_full(dL_dK * prod, X, X2)

    def update_gradients_diag(self, dL_dKdiag, X):
-        k = self.Kdiag(X)*dL_dKdiag
-        for p in self.parts:
-            p.update_gradients_diag(k/p.Kdiag(X),X)
+        if len(self.parts)==2:
+            self.parts[0].update_gradients_diag(dL_dKdiag*self.parts[1].Kdiag(X), X)
+            self.parts[1].update_gradients_diag(dL_dKdiag*self.parts[0].Kdiag(X), X)
+        else:
+            for combination in itertools.combinations(self.parts, len(self.parts) - 1):
+                prod = reduce(np.multiply, [p.Kdiag(X) for p in combination])
+                to_update = list(set(self.parts) - set(combination))[0]
+                to_update.update_gradients_diag(dL_dKdiag * prod, X)

-    @numpy_invalid_op_as_exception            
    def gradients_X(self, dL_dK, X, X2=None):
        target = np.zeros(X.shape)
-        k = self.K(X,X2)*dL_dK
-        try:
-            for p in self.parts:
-                target += p.gradients_X(k/p.K(X,X2),X,X2)
-        except FloatingPointError:
+        if len(self.parts)==2:
+            target += self.parts[0].gradients_X(dL_dK*self.parts[1].K(X, X2), X, X2)
+            target += self.parts[1].gradients_X(dL_dK*self.parts[0].K(X, X2), X, X2)
+        else:
            for combination in itertools.combinations(self.parts, len(self.parts) - 1):
                prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
                to_update = list(set(self.parts) - set(combination))[0]
@ -86,9 +88,21 @@ class Prod(CombinationKernel):

    def gradients_X_diag(self, dL_dKdiag, X):
+        """
+        Gradient of the diagonal of the product kernel with respect to X.
+
+        Product rule: each part's gradients_X_diag is called with dL_dKdiag
+        multiplied by the Kdiag of every other part, and the results are summed.
+        """
        target = np.zeros(X.shape)
-        k = self.Kdiag(X)*dL_dKdiag
-        for p in self.parts:
-            target += p.gradients_X_diag(k/p.Kdiag(X),X)
+        if len(self.parts)==2:
+            target += self.parts[0].gradients_X_diag(dL_dKdiag*self.parts[1].Kdiag(X), X)
+            target += self.parts[1].gradients_X_diag(dL_dKdiag*self.parts[0].Kdiag(X), X)
+        else:
+            # Multiply the other parts' diagonals together rather than dividing the
+            # full product by this part's Kdiag, which is 0/0 wherever Kdiag is zero.
+            for i, p in enumerate(self.parts):
+                others = [q.Kdiag(X) for j, q in enumerate(self.parts) if j != i]
+                target += p.gradients_X_diag(reduce(np.multiply, others, dL_dKdiag), X)
        return target


--- a/GPy/kern/_src/psi_comp/init.py
+++ b/GPy/kern/_src/psi_comp/init.py
@ -4,52 +4,66 @@
 from ....core.parameterization.parameter_core import Pickleable
 from GPy.util.caching import Cache_this
 from ....core.parameterization import variational
-import rbf_psi_comp
-import ssrbf_psi_comp
-import sslinear_psi_comp
-import linear_psi_comp
+from . import rbf_psi_comp
+from . import ssrbf_psi_comp
+from . import sslinear_psi_comp
+from . import linear_psi_comp

-class PSICOMP_RBF(Pickleable):
-    @Cache_this(limit=2, ignore_args=(0,))
-    def psicomputations(self, variance, lengthscale, Z, variational_posterior):
+
+class PSICOMP(Pickleable):
+    """Interface for psi-statistics backends; both computation methods raise until overridden."""
+    def psicomputations(self, kern, Z, qX, return_psi2_n=False):
+        raise NotImplementedError("Abstract method!")
+    
+    def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, qX):
+        raise NotImplementedError("Abstract method!")
+
+    def _setup_observers(self):
+        pass
+
+from .gaussherm import PSICOMP_GH
+
+class PSICOMP_RBF(PSICOMP):
+    @Cache_this(limit=5, ignore_args=(0,))
+    def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
+        variance, lengthscale = kern.variance, kern.lengthscale
        if isinstance(variational_posterior, variational.NormalPosterior):
-            return rbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior)
+            return rbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior, return_psi2_n=return_psi2_n)
        elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
            return ssrbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior)
        else:
-            raise ValueError, "unknown distriubtion received for psi-statistics"
+            raise ValueError("unknown distriubtion received for psi-statistics")

-    @Cache_this(limit=2, ignore_args=(0,1,2,3))
-    def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
+    @Cache_this(limit=5, ignore_args=(0,2,3,4))
+    def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        variance, lengthscale = kern.variance, kern.lengthscale
        if isinstance(variational_posterior, variational.NormalPosterior):
            return rbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior)
        elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
            return ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior)
        else:
-            raise ValueError, "unknown distriubtion received for psi-statistics"
+            raise ValueError("unknown distriubtion received for psi-statistics")

-    def _setup_observers(self):
-        pass
+class PSICOMP_Linear(PSICOMP):

-class PSICOMP_Linear(Pickleable):
-
-    @Cache_this(limit=2, ignore_args=(0,))
-    def psicomputations(self, variance, Z, variational_posterior):
+    @Cache_this(limit=5, ignore_args=(0,))
+    def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
+        variances = kern.variances
        if isinstance(variational_posterior, variational.NormalPosterior):
-            return linear_psi_comp.psicomputations(variance, Z, variational_posterior)
+            return linear_psi_comp.psicomputations(variances, Z, variational_posterior, return_psi2_n=return_psi2_n)
        elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
-            return sslinear_psi_comp.psicomputations(variance, Z, variational_posterior)
+            return sslinear_psi_comp.psicomputations(variances, Z, variational_posterior)
        else:
-            raise ValueError, "unknown distriubtion received for psi-statistics"
+            raise ValueError("unknown distriubtion received for psi-statistics")

-    @Cache_this(limit=2, ignore_args=(0,1,2,3))
-    def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior):
+    @Cache_this(limit=2, ignore_args=(0,2,3,4))
+    def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        variances = kern.variances
        if isinstance(variational_posterior, variational.NormalPosterior):
-            return linear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior)
+            return linear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variances, Z, variational_posterior)
        elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
-            return sslinear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior)
+            return sslinear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variances, Z, variational_posterior)
        else:
-            raise ValueError, "unknown distriubtion received for psi-statistics"
+            raise ValueError("unknown distriubtion received for psi-statistics")
+

-    def _setup_observers(self):
-        pass
--- a/GPy/kern/_src/psi_comp/gaussherm.py
+++ b/GPy/kern/_src/psi_comp/gaussherm.py
@ -0,0 +1,127 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+"""
+An approximated psi-statistics implementation based on Gauss-Hermite Quadrature
+"""
+
+import numpy as np
+
+from GPy.util.caching import Cache_this
+from ....util.linalg import tdot
+from . import PSICOMP
+
+class PSICOMP_GH(PSICOMP):
+    """
+    Psi-statistics approximated by Gauss-Hermite quadrature.
+
+    Each expectation under qX is replaced by a weighted sum of kernel
+    evaluations at `degree` points X_i = locs[i]*sqrt(S) + mu, where mu and S
+    are qX.mean and qX.variance.
+
+    :param degree: number of quadrature points
+    :param cache_K: if True, take the quadrature points from the cached comp_K
+    """
+    
+    def __init__(self, degree=5, cache_K=True):
+        self.degree = degree
+        self.cache_K = cache_K
+        self.locs, self.weights = np.polynomial.hermite.hermgauss(degree)
+        # Rescale the nodes/weights for exp(-x^2) to those of a standard normal.
+        self.locs *= np.sqrt(2.)
+        self.weights*= 1./np.sqrt(np.pi)
+        self.Xs = None
+
+    def _setup_observers(self):
+        pass
+    
+    @Cache_this(limit=10, ignore_args=(0,))
+    def comp_K(self, Z, qX):
+        """Return the quadrature points, stacked as (degree,)+qX.mean.shape."""
+        # NOTE(review): Xs carries a leading `degree` axis, so this shape test
+        # never matches and a fresh buffer is allocated on every call.
+        if self.Xs is None or self.Xs.shape != qX.mean.shape:
+            from ....core.parameterization import ObsAr
+            self.Xs = ObsAr(np.empty((self.degree,)+qX.mean.shape))
+        mu, S = qX.mean.values, qX.variance.values
+        S_sq = np.sqrt(S)
+        for i in range(self.degree):
+            self.Xs[i] = self.locs[i]*S_sq+mu
+        return self.Xs
+    
+    @Cache_this(limit=10, ignore_args=(0,))
+    def psicomputations(self, kern, Z, qX, return_psi2_n=False):
+        """
+        Approximate psi0 (N), psi1 (NxM) and psi2 for `kern`.
+
+        psi2 is MxM (accumulated with tdot) unless return_psi2_n is True, in
+        which case the per-row outer products are returned as NxMxM.
+        """
+        mu, S = qX.mean.values, qX.variance.values
+        N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
+        if self.cache_K: Xs = self.comp_K(Z, qX)
+        else: S_sq = np.sqrt(S)
+        
+        psi0 = np.zeros((N,))
+        psi1 = np.zeros((N,M))
+        psi2 = np.zeros((N,M,M)) if return_psi2_n else np.zeros((M,M))
+        for i in range(self.degree):
+            if self.cache_K:
+                X = Xs[i]
+            else:
+                X = self.locs[i]*S_sq+mu
+            psi0 += self.weights[i]* kern.Kdiag(X)
+            Kfu = kern.K(X,Z)
+            psi1 += self.weights[i]* Kfu
+            if return_psi2_n:
+                psi2 += self.weights[i]* Kfu[:,:,None]*Kfu[:,None,:]
+            else:
+                psi2 += self.weights[i]* tdot(Kfu.T)
+        return psi0, psi1, psi2
+    
+    @Cache_this(limit=10, ignore_args=(0, 2,3,4))
+    def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, qX):
+        """
+        Quadrature approximation of the gradients of the psi-statistics.
+
+        :returns: (dtheta, dZ, dmu, dS), the gradients w.r.t. the kernel
+            parameters, Z, qX.mean and qX.variance.
+        """
+        mu, S = qX.mean.values, qX.variance.values
+        if self.cache_K: Xs = self.comp_K(Z, qX)
+        S_sq = np.sqrt(S)
+        
+        # kern.gradient is used as scratch space below; remember it to restore it.
+        dtheta_old = kern.gradient.copy()
+        dtheta = np.zeros_like(kern.gradient)
+        dZ = np.zeros_like(Z.values)
+        dmu = np.zeros_like(mu)
+        dS = np.zeros_like(S)
+        for i in range(self.degree):
+            if self.cache_K:
+                X = Xs[i]
+            else:
+                X = self.locs[i]*S_sq+mu
+            dL_dpsi0_i = dL_dpsi0*self.weights[i]
+            kern.update_gradients_diag(dL_dpsi0_i, X)
+            dtheta += kern.gradient
+            dX_diag = kern.gradients_X_diag(dL_dpsi0_i, X)
+            Kfu = kern.K(X,Z)
+            # NOTE(review): 2.*Kfu.dot(dL_dpsi2) presumably relies on dL_dpsi2 being symmetric -- confirm.
+            dL_dkfu = (dL_dpsi1+ 2.*Kfu.dot(dL_dpsi2))*self.weights[i]
+            kern.update_gradients_full(dL_dkfu, X, Z)
+            dtheta += kern.gradient
+            dX_i, dZ_i = kern.gradients_X_X2(dL_dkfu, X, Z)
+            # Sum into a new array: the kernel's return value may be a cached
+            # object that must not be modified in place.
+            dX = dX_diag + dX_i
+            dZ += dZ_i
+            dmu += dX
+            # X = locs[i]*sqrt(S)+mu, hence dX/dS = locs[i]/(2*sqrt(S)).
+            dS += dX*self.locs[i]/(2.*S_sq)
+        kern.gradient[:] = dtheta_old
+        return dtheta, dZ, dmu, dS
+        
+
+
+
--- a/GPy/kern/_src/psi_comp/linear_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/linear_psi_comp.py
@ -8,7 +8,7 @@ The package for the Psi statistics computation of the linear kernel for Bayesian
 import numpy as np
 from ....util.linalg import tdot

-def psicomputations(variance, Z, variational_posterior):
+def psicomputations(variance, Z, variational_posterior, return_psi2_n=False):
    """
    Compute psi-statistics for ss-linear kernel
    """
@ -21,8 +21,12 @@ def psicomputations(variance, Z, variational_posterior):
    S = variational_posterior.variance

    psi0 = (variance*(np.square(mu)+S)).sum(axis=1)
-    psi1 = np.dot(mu,(variance*Z).T)
-    psi2 = np.dot(S.sum(axis=0)*np.square(variance)*Z,Z.T)+ tdot(psi1.T)
+    Zv = variance * Z
+    psi1 = np.dot(mu,Zv.T)
+    if return_psi2_n:
+        psi2 = psi1[:,:,None] * psi1[:,None,:] + np.dot(S[:,None,:] * Zv[None,:,:], Zv.T)
+    else:
+        psi2 = np.dot(S.sum(axis=0) * Zv, Zv.T) + tdot(psi1.T)

    return psi0, psi1, psi2

@ -40,7 +44,7 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variati
    dL_dmu += 2.*dL_dpsi0_var*mu+np.dot(dL_dpsi1,Z)*variance
    dL_dS += dL_dpsi0_var
    dL_dZ += dL_dpsi1_mu*variance
-    
+
    return dL_dvar, dL_dZ, dL_dmu, dL_dS

 def _psi2computations(dL_dpsi2, variance, Z, mu, S):
@ -56,22 +60,42 @@ def _psi2computations(dL_dpsi2, variance, Z, mu, S):
    # _psi2_dZ             MxQ
    # _psi2_dmu            NxQ
    # _psi2_dS             NxQ
-    
+
    variance2 = np.square(variance)
    common_sum = np.dot(mu,(variance*Z).T)
-    Z_expect = (np.dot(dL_dpsi2,Z)*Z).sum(axis=0)
-    dL_dpsi2T = dL_dpsi2+dL_dpsi2.T
-    common_expect = np.dot(common_sum,np.dot(dL_dpsi2T,Z))
-    Z2_expect = np.inner(common_sum,dL_dpsi2T)
-    Z1_expect = np.dot(dL_dpsi2T,Z)
-
-    dL_dvar = 2.*S.sum(axis=0)*variance*Z_expect+(common_expect*mu).sum(axis=0)
-            
-    dL_dmu = common_expect*variance
+    if len(dL_dpsi2.shape)==2:
+        Z_expect = (np.dot(dL_dpsi2,Z)*Z).sum(axis=0)
+        dL_dpsi2T = dL_dpsi2+dL_dpsi2.T
+        common_expect = np.dot(common_sum,np.dot(dL_dpsi2T,Z))
+        Z2_expect = np.inner(common_sum,dL_dpsi2T)
+        Z1_expect = np.dot(dL_dpsi2T,Z)
    
-    dL_dS = np.empty(S.shape)
-    dL_dS[:] = Z_expect*variance2
+        dL_dvar = 2.*S.sum(axis=0)*variance*Z_expect+(common_expect*mu).sum(axis=0)
    
-    dL_dZ = variance2*S.sum(axis=0)*Z1_expect+np.dot(Z2_expect.T,variance*mu)
+        dL_dmu = common_expect*variance
+    
+        dL_dS = np.empty(S.shape)
+        dL_dS[:] = Z_expect*variance2
+    
+        dL_dZ = variance2*S.sum(axis=0)*Z1_expect+np.dot(Z2_expect.T,variance*mu)
+    else:
+        N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
+        dL_dpsi2_ = dL_dpsi2.sum(axis=0)
+        Z_expect = (np.dot(dL_dpsi2.reshape(N*M,M),Z).reshape(N,M,Q)*Z[None,:,:]).sum(axis=1)
+        dL_dpsi2T = dL_dpsi2_+dL_dpsi2_.T
+        dL_dpsi2T_ = dL_dpsi2+np.swapaxes(dL_dpsi2, 1, 2)
+        common_expect = np.dot(common_sum,np.dot(dL_dpsi2T,Z))
+        common_expect_ = (common_sum[:,:,None]*np.dot(dL_dpsi2T_.reshape(N*M,M),Z).reshape(N,M,Q)).sum(axis=1)
+        Z2_expect = (common_sum[:,:,None]*dL_dpsi2T_).sum(axis=1)
+        Z1_expect = np.dot(dL_dpsi2T_.reshape(N*M,M),Z).reshape(N,M,Q)
+    
+        dL_dvar = 2.*variance*(S*Z_expect).sum(axis=0)+(common_expect_*mu).sum(axis=0)
+    
+        dL_dmu = common_expect_*variance
+    
+        dL_dS = np.empty(S.shape)
+        dL_dS[:] = variance2* Z_expect
+    
+        dL_dZ = variance2*(S[:,None,:]*Z1_expect).sum(axis=0)+np.dot(Z2_expect.T,variance*mu)

    return dL_dvar, dL_dmu, dL_dS, dL_dZ
--- a/GPy/kern/_src/psi_comp/rbf_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/rbf_psi_comp.py
@ -5,13 +5,7 @@ The module for psi-statistics for RBF kernel
 import numpy as np
 from GPy.util.caching import Cacher

-def psicomputations(variance, lengthscale, Z, variational_posterior):
-    """
-    Z - MxQ
-    mu - NxQ
-    S - NxQ
-    gamma - NxQ
-    """
+def psicomputations(variance, lengthscale, Z, variational_posterior, return_psi2_n=False):
    # here are the "statistics" for psi0, psi1 and psi2
    # Produced intermediate results:
    # _psi1                NxM
@ -21,16 +15,11 @@ def psicomputations(variance, lengthscale, Z, variational_posterior):
    psi0 = np.empty(mu.shape[0])
    psi0[:] = variance
    psi1 = _psi1computations(variance, lengthscale, Z, mu, S)
-    psi2 = _psi2computations(variance, lengthscale, Z, mu, S).sum(axis=0)
+    psi2 = _psi2computations(variance, lengthscale, Z, mu, S)
+    if not return_psi2_n: psi2 = psi2.sum(axis=0)
    return psi0, psi1, psi2

 def __psi1computations(variance, lengthscale, Z, mu, S):
-    """
-    Z - MxQ
-    mu - NxQ
-    S - NxQ
-    gamma - NxQ
-    """
    # here are the "statistics" for psi1
    # Produced intermediate results:
    # _psi1                NxM
@ -45,26 +34,19 @@ def __psi1computations(variance, lengthscale, Z, mu, S):
    return _psi1

 def __psi2computations(variance, lengthscale, Z, mu, S):
-    """
-    Z - MxQ
-    mu - NxQ
-    S - NxQ
-    gamma - NxQ
-    """
    # here are the "statistics" for psi2
    # Produced intermediate results:
    # _psi2                MxM

+    N,M,Q = mu.shape[0], Z.shape[0], mu.shape[1]
    lengthscale2 = np.square(lengthscale)

    _psi2_logdenom = np.log(2.*S/lengthscale2+1.).sum(axis=-1)/(-2.) # N
    _psi2_exp1 = (np.square(Z[:,None,:]-Z[None,:,:])/lengthscale2).sum(axis=-1)/(-4.) #MxM
    Z_hat = (Z[:,None,:]+Z[None,:,:])/2. #MxMxQ
    denom = 1./(2.*S+lengthscale2)
-    _psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+2.*np.einsum('nq,moq,nq->nmo',mu,Z_hat,denom)-np.einsum('moq,nq->nmo',np.square(Z_hat),denom)
+    _psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+(2*(mu*denom).dot(Z_hat.reshape(M*M,Q).T) - denom.dot(np.square(Z_hat).reshape(M*M,Q).T)).reshape(N,M,M)
    _psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2)
-
-
    return _psi2

 def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
@ -86,13 +68,6 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscal
    return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS

 def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S):
-    """
-    dL_dpsi1 - NxM
-    Z - MxQ
-    mu - NxQ
-    S - NxQ
-    gamma - NxQ
-    """
    # here are the "statistics" for psi1
    # Produced intermediate results: dL_dparams w.r.t. psi1
    # _dL_dvariance     1
@ -118,13 +93,6 @@ def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S):
    return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS

 def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
-    """
-    Z - MxQ
-    mu - NxQ
-    S - NxQ
-    gamma - NxQ
-    dL_dpsi2 - MxM
-    """
    # here are the "statistics" for psi2
    # Produced the derivatives w.r.t. psi2:
    # _dL_dvariance      1
@ -157,5 +125,5 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):

    return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS

-_psi1computations = Cacher(__psi1computations, limit=1)
-_psi2computations = Cacher(__psi2computations, limit=1)
+_psi1computations = Cacher(__psi1computations, limit=5)
+_psi2computations = Cacher(__psi2computations, limit=5)
--- a/GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py
+++ b/GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py
@ -7,13 +7,6 @@ from ....util.caching import Cache_this
 from . import PSICOMP_RBF
 from ....util import gpu_init

-try:
-    import pycuda.gpuarray as gpuarray
-    from pycuda.compiler import SourceModule
-    from ....util.linalg_gpu import sum_axis
-except:
-    pass    
-
 gpu_code = """
    // define THREADNUM

@ -241,7 +234,11 @@ gpu_code = """

 class PSICOMP_RBF_GPU(PSICOMP_RBF):

-    def __init__(self, threadnum=128, blocknum=15, GPU_direct=False):
+    def __init__(self, threadnum=256, blocknum=30, GPU_direct=False):
+        from pycuda.compiler import SourceModule
+        from ....util.gpu_init import initGPU
+        initGPU()
+        
        self.GPU_direct = GPU_direct
        self.gpuCache = None
        
@ -264,7 +261,8 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
        memo[id(self)] = s 
        return s
    
-    def _initGPUCache(self, N, M, Q):            
+    def _initGPUCache(self, N, M, Q):
+        import pycuda.gpuarray as gpuarray
        if self.gpuCache == None:
            self.gpuCache = {
                             'l_gpu'                :gpuarray.empty((Q,),np.float64,order='F'),
@ -320,13 +318,14 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
    def get_dimensions(self, Z, variational_posterior):
        return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1]

-    @Cache_this(limit=1, ignore_args=(0,))
-    def psicomputations(self, variance, lengthscale, Z, variational_posterior):
+    @Cache_this(limit=5, ignore_args=(0,))
+    def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
        """
        Z - MxQ
        mu - NxQ
        S - NxQ
        """
+        variance, lengthscale = kern.variance, kern.lengthscale
        N,M,Q = self.get_dimensions(Z, variational_posterior)
        self._initGPUCache(N,M,Q)
        self.sync_params(lengthscale, Z, variational_posterior.mean, variational_posterior.variance)
@ -355,8 +354,10 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
        else:
            return psi0, psi1_gpu.get(), psi2_gpu.get()

-    @Cache_this(limit=1, ignore_args=(0,1,2,3))
-    def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
+    @Cache_this(limit=5, ignore_args=(0,2,3,4))
+    def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        variance, lengthscale = kern.variance, kern.lengthscale
+        from ....util.linalg_gpu import sum_axis
        ARD = (len(lengthscale)!=1)
        
        N,M,Q = self.get_dimensions(Z, variational_posterior)
--- a/GPy/kern/_src/psi_comp/sslinear_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/sslinear_psi_comp.py
@ -9,7 +9,7 @@ from ....util.linalg import tdot

 import numpy as np

-def psicomputations(variance, Z, variational_posterior):
+def psicomputations(variance, Z, variational_posterior, return_psi2_n=False):
    """
    Compute psi-statistics for ss-linear kernel
    """
@ -37,11 +37,11 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variati

    # Compute for psi0 and psi1
    mu2S = np.square(mu)+S
-    dL_dvar += np.einsum('n,nq,nq->q',dL_dpsi0,gamma,mu2S) + np.einsum('nm,nq,mq,nq->q',dL_dpsi1,gamma,Z,mu)
-    dL_dgamma += np.einsum('n,q,nq->nq',dL_dpsi0,variance,mu2S) + np.einsum('nm,q,mq,nq->nq',dL_dpsi1,variance,Z,mu)
-    dL_dmu += np.einsum('n,nq,q,nq->nq',dL_dpsi0,gamma,2.*variance,mu) + np.einsum('nm,nq,q,mq->nq',dL_dpsi1,gamma,variance,Z)
-    dL_dS += np.einsum('n,nq,q->nq',dL_dpsi0,gamma,variance)
-    dL_dZ +=  np.einsum('nm,nq,q,nq->mq',dL_dpsi1,gamma, variance,mu)
+    dL_dvar += (dL_dpsi0[:,None]*gamma*mu2S).sum(axis=0) + (dL_dpsi1.T.dot(gamma*mu)*Z).sum(axis=0)
+    dL_dgamma += dL_dpsi0[:,None]*variance*mu2S+ dL_dpsi1.dot(Z)*mu*variance
+    dL_dmu += dL_dpsi0[:,None]*2.*variance*gamma*mu + dL_dpsi1.dot(Z)*gamma*variance
+    dL_dS += dL_dpsi0[:,None]*variance*gamma
+    dL_dZ += dL_dpsi1.T.dot(gamma*mu)*variance
    
    return dL_dvar, dL_dZ, dL_dmu, dL_dS, dL_dgamma

@ -64,29 +64,23 @@ def _psi2computations(dL_dpsi2, variance, Z, mu, S, gamma):
    gamma2 = np.square(gamma)
    variance2 = np.square(variance)
    mu2S = mu2+S # NxQ
-    gvm = np.einsum('nq,nq,q->nq',gamma,mu,variance)
-    common_sum = np.einsum('nq,mq->nm',gvm,Z)
-#     common_sum = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu) # NxM
-    Z_expect = np.einsum('mo,mq,oq->q',dL_dpsi2,Z,Z)
+    gvm = gamma*mu*variance
+    common_sum = gvm.dot(Z.T)
+    Z_expect = (np.dot(dL_dpsi2,Z)*Z).sum(axis=0)
+    Z_expect_var2 = Z_expect*variance2
    dL_dpsi2T = dL_dpsi2+dL_dpsi2.T
-    tmp = np.einsum('mo,oq->mq',dL_dpsi2T,Z)
-    common_expect = np.einsum('mq,nm->nq',tmp,common_sum)
-#     common_expect = np.einsum('mo,mq,no->nq',dL_dpsi2+dL_dpsi2.T,Z,common_sum)
-    Z2_expect = np.einsum('om,nm->no',dL_dpsi2T,common_sum)
-    Z1_expect = np.einsum('om,mq->oq',dL_dpsi2T,Z)
+    common_expect = common_sum.dot(dL_dpsi2T).dot(Z)
+    Z2_expect = common_sum.dot(dL_dpsi2T)
+    Z1_expect = dL_dpsi2T.dot(Z)
    
-    dL_dvar = np.einsum('nq,q,q->q',2.*(gamma*mu2S-gamma2*mu2),variance,Z_expect)+\
-        np.einsum('nq,nq,nq->q',common_expect,gamma,mu)
+    dL_dvar = variance*Z_expect*2.*(gamma*mu2S-gamma2*mu2).sum(axis=0)+(common_expect*gamma*mu).sum(axis=0)
        
-    dL_dgamma = np.einsum('q,q,nq->nq',Z_expect,variance2,(mu2S-2.*gamma*mu2))+\
-        np.einsum('nq,q,nq->nq',common_expect,variance,mu)
+    dL_dgamma = Z_expect_var2*(mu2S-2.*gamma*mu2)+common_expect*mu*variance
+                
+    dL_dmu = Z_expect_var2*mu*2.*(gamma-gamma2) + common_expect*gamma*variance
+
+    dL_dS = gamma*Z_expect_var2
    
-    dL_dmu = np.einsum('q,q,nq,nq->nq',Z_expect,variance2,mu,2.*(gamma-gamma2))+\
-            np.einsum('nq,nq,q->nq',common_expect,gamma,variance)
-                    
-    dL_dS = np.einsum('q,nq,q->nq',Z_expect,gamma,variance2)
-    
-#     dL_dZ = 2.*(np.einsum('om,nq,q,mq,nq->oq',dL_dpsi2,gamma,variance2,Z,(mu2S-gamma*mu2))+np.einsum('om,nq,q,nq,nm->oq',dL_dpsi2,gamma,variance,mu,common_sum))
-    dL_dZ = Z1_expect*np.einsum('nq,q,nq->q',gamma,variance2,(mu2S-gamma*mu2))+np.einsum('nq,q,nq,nm->mq',gamma,variance,mu,Z2_expect)
+    dL_dZ = (gamma*(mu2S-gamma*mu2)).sum(axis=0)*variance2*Z1_expect+ Z2_expect.T.dot(gamma*mu)*variance

    return dL_dvar, dL_dgamma, dL_dmu, dL_dS, dL_dZ
--- a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py
@ -9,7 +9,7 @@ import numpy as np

 try:
    from scipy import weave
-     
+
    def _psicomputations(variance, lengthscale, Z, variational_posterior):
        """
        Z - MxQ
@ -22,23 +22,26 @@ try:
        # _psi1                NxM
        mu = variational_posterior.mean
        S = variational_posterior.variance
-         
+        gamma = variational_posterior.binary_prob
+
        N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
        l2 = np.square(lengthscale)
        log_denom1 = np.log(S/l2+1)
        log_denom2 = np.log(2*S/l2+1)
-        log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
+        log_gamma = np.log(gamma)
+        log_gamma1 = np.log(1.-gamma)
        variance = float(variance)
        psi0 = np.empty(N)
        psi0[:] = variance
        psi1 = np.empty((N,M))
        psi2n = np.empty((N,M,M))
-         
+
        from ....util.misc import param_to_array
        S = param_to_array(S)
        mu = param_to_array(mu)
+        gamma = param_to_array(gamma)
        Z = param_to_array(Z)
-         
+
        support_code = """
        #include <math.h>
        """
@ -53,11 +56,11 @@ try:
                        double lq = l2(q);
                        double Zm1q = Z(m1,q);
                        double Zm2q = Z(m2,q);
-                         
+
                        if(m2==0) {
                            // Compute Psi_1
                            double muZ = mu(n,q)-Z(m1,q);
-                             
+
                            double psi1_exp1 = log_gamma(n,q) - (muZ*muZ/(Snq+lq) +log_denom1(n,q))/2.;
                            double psi1_exp2 = log_gamma1(n,q) -Zm1q*Zm1q/(2.*lq);
                            log_psi1 += (psi1_exp1>psi1_exp2)?psi1_exp1+log1p(exp(psi1_exp2-psi1_exp1)):psi1_exp2+log1p(exp(psi1_exp1-psi1_exp2));
@ -66,10 +69,10 @@ try:
                        double muZhat = mu(n,q) - (Zm1q+Zm2q)/2.;
                        double Z2 = Zm1q*Zm1q+ Zm2q*Zm2q;
                        double dZ = Zm1q - Zm2q;
-                         
+
                        double psi2_exp1 = dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2(n,q)/2. + log_gamma(n,q);
                        double psi2_exp2 = log_gamma1(n,q) - Z2/(2.*lq);
-                        log_psi2_n += (psi2_exp1>psi2_exp2)?psi2_exp1+log1p(exp(psi2_exp2-psi2_exp1)):psi2_exp2+log1p(exp(psi2_exp1-psi2_exp2));                    
+                        log_psi2_n += (psi2_exp1>psi2_exp2)?psi2_exp1+log1p(exp(psi2_exp2-psi2_exp1)):psi2_exp2+log1p(exp(psi2_exp1-psi2_exp2));
                    }
                    double exp_psi2_n = exp(log_psi2_n);
                    psi2n(n,m1,m2) = variance*variance*exp_psi2_n;
@ -79,29 +82,30 @@ try:
            }
        }
        """
-        weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz)
-     
+        weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz)
+
        psi2 = psi2n.sum(axis=0)
        return psi0,psi1,psi2,psi2n
-     
+
    from GPy.util.caching import Cacher
    psicomputations = Cacher(_psicomputations, limit=1)
-     
+
    def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
        ARD = (len(lengthscale)!=1)
-         
+
        _,psi1,_,psi2n = psicomputations(variance, lengthscale, Z, variational_posterior)
-     
+
        mu = variational_posterior.mean
        S = variational_posterior.variance
+        gamma = variational_posterior.binary_prob
        N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
        l2 = np.square(lengthscale)
        log_denom1 = np.log(S/l2+1)
        log_denom2 = np.log(2*S/l2+1)
-        log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
-        gamma, gamma1 = variational_posterior.gamma_probabilities()
+        log_gamma = np.log(gamma)
+        log_gamma1 = np.log(1.-gamma)
        variance = float(variance)
-     
+
        dvar = np.zeros(1)
        dmu = np.zeros((N,Q))
        dS = np.zeros((N,Q))
@ -109,12 +113,13 @@ try:
        dl = np.zeros(Q)
        dZ = np.zeros((M,Q))
        dvar += np.sum(dL_dpsi0)
-         
+
        from ....util.misc import param_to_array
        S = param_to_array(S)
        mu = param_to_array(mu)
+        gamma = param_to_array(gamma)
        Z = param_to_array(Z)
-         
+
        support_code = """
        #include <math.h>
        """
@ -130,18 +135,17 @@ try:
                        double Zm1q = Z(m1,q);
                        double Zm2q = Z(m2,q);
                        double gnq = gamma(n,q);
-                        double g1nq = gamma1(n,q);
                        double mu_nq = mu(n,q);
-                         
+
                        if(m2==0) {
-                            // Compute Psi_1                        
+                            // Compute Psi_1
                            double lpsi1 = psi1(n,m1)*dL_dpsi1(n,m1);
                            if(q==0) {dvar(0) += lpsi1/variance;}
-                             
+
                            double Zmu = Zm1q - mu_nq;
                            double denom = Snq+lq;
                            double Zmu2_denom = Zmu*Zmu/denom;
-                             
+
                            double exp1 = log_gamma(n,q)-(Zmu*Zmu/(Snq+lq)+log_denom1(n,q))/(2.);
                            double exp2 = log_gamma1(n,q)-Zm1q*Zm1q/(2.*lq);
                            double d_exp1,d_exp2;
@ -153,23 +157,23 @@ try:
                                d_exp2 = 1.;
                            }
                            double exp_sum = d_exp1+d_exp2;
-                             
+
                            dmu(n,q) += lpsi1*Zmu*d_exp1/(denom*exp_sum);
                            dS(n,q) += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum)/2.;
-                            dgamma(n,q) += lpsi1*(d_exp1*g1nq-d_exp2*gnq)/exp_sum;
+                            dgamma(n,q) += lpsi1*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
                            dl(q) += lpsi1*((Zmu2_denom+Snq/lq)/denom*d_exp1+Zm1q*Zm1q/(lq*lq)*d_exp2)/(2.*exp_sum);
                            dZ(m1,q) += lpsi1*(-Zmu/denom*d_exp1-Zm1q/lq*d_exp2)/exp_sum;
                        }
                        // Compute Psi_2
                        double lpsi2 = psi2n(n,m1,m2)*dL_dpsi2(m1,m2);
                        if(q==0) {dvar(0) += lpsi2*2/variance;}
-                         
+
                        double dZm1m2 = Zm1q - Zm2q;
                        double Z2 = Zm1q*Zm1q+Zm2q*Zm2q;
                        double muZhat =  mu_nq - (Zm1q + Zm2q)/2.;
                        double denom = 2.*Snq+lq;
                        double muZhat2_denom = muZhat*muZhat/denom;
-                         
+
                        double exp1 = dZm1m2*dZm1m2/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2(n,q)/2. + log_gamma(n,q);
                        double exp2 = log_gamma1(n,q) - Z2/(2.*lq);
                        double d_exp1,d_exp2;
@ -181,23 +185,23 @@ try:
                            d_exp2 = 1.;
                        }
                        double exp_sum = d_exp1+d_exp2;
-                         
+
                        dmu(n,q) += -2.*lpsi2*muZhat/denom*d_exp1/exp_sum;
                        dS(n,q) += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum;
-                        dgamma(n,q) += lpsi2*(d_exp1*g1nq-d_exp2*gnq)/exp_sum;
+                        dgamma(n,q) += lpsi2*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
                        dl(q) += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZm1m2*dZm1m2/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum;
-                        dZ(m1,q) += 2.*lpsi2*((muZhat/denom-dZm1m2/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum;                   
+                        dZ(m1,q) += 2.*lpsi2*((muZhat/denom-dZm1m2/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum;
                    }
                }
            }
        }
        """
-        weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','gamma1','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz)
-     
+        weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz)
+
        dl *= 2.*lengthscale
        if not ARD:
            dl = dl.sum()
-         
+
        return dvar, dl, dZ, dmu, dS, dgamma

 except:
@ -215,13 +219,13 @@ except:
        mu = variational_posterior.mean
        S = variational_posterior.variance
        gamma = variational_posterior.binary_prob
-         
+
        psi0 = np.empty(mu.shape[0])
        psi0[:] = variance
        psi1 = _psi1computations(variance, lengthscale, Z, mu, S, gamma)
        psi2 = _psi2computations(variance, lengthscale, Z, mu, S, gamma)
        return psi0, psi1, psi2
-    
+
    def _psi1computations(variance, lengthscale, Z, mu, S, gamma):
        """
        Z - MxQ
@ -232,9 +236,9 @@ except:
        # here are the "statistics" for psi1
        # Produced intermediate results:
        # _psi1                NxM
-    
+
        lengthscale2 = np.square(lengthscale)
-    
+
        # psi1
        _psi1_denom = S[:, None, :] / lengthscale2 + 1.  # Nx1xQ
        _psi1_denom_sqrt = np.sqrt(_psi1_denom) #Nx1xQ
@ -247,9 +251,9 @@ except:
        _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ
        _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM
        _psi1 = variance * np.exp(_psi1_exp_sum) # NxM
-    
+
        return _psi1
-    
+
    def _psi2computations(variance, lengthscale, Z, mu, S, gamma):
        """
        Z - MxQ
@ -260,14 +264,14 @@ except:
        # here are the "statistics" for psi2
        # Produced intermediate results:
        # _psi2                MxM
-        
+
        lengthscale2 = np.square(lengthscale)
-        
+
        _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
        _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
        _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q
        _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ
-    
+
        # psi2
        _psi2_denom = 2.*S[:, None, None, :] / lengthscale2 + 1. # Nx1x1xQ
        _psi2_denom_sqrt = np.sqrt(_psi2_denom)
@ -280,28 +284,28 @@ except:
        _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max))
        _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM
        _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM
-    
+
        return _psi2
-    
+
    def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
        ARD = (len(lengthscale)!=1)
-         
+
        dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1, dgamma_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
        dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2, dgamma_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
-     
+
        dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
-         
+
        dL_dlengscale = dl_psi1 + dl_psi2
        if not ARD:
            dL_dlengscale = dL_dlengscale.sum()
-     
+
        dL_dgamma = dgamma_psi1 + dgamma_psi2
        dL_dmu = dmu_psi1 + dmu_psi2
        dL_dS = dS_psi1 + dS_psi2
        dL_dZ = dZ_psi1 + dZ_psi2
-         
+
        return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma
-    
+
    def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, gamma):
        """
        dL_dpsi1 - NxM
@ -318,9 +322,9 @@ except:
        # _dL_dgamma        NxQ
        # _dL_dmu           NxQ
        # _dL_dS            NxQ
-        
+
        lengthscale2 = np.square(lengthscale)
-    
+
        # psi1
        _psi1_denom = S / lengthscale2 + 1.  # NxQ
        _psi1_denom_sqrt = np.sqrt(_psi1_denom) #NxQ
@ -342,9 +346,9 @@ except:
        _dL_dS = np.einsum('nm,nmq,nmq,nq,nmq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_common,(_psi1_dist_sq-1.))/2.  # NxQ
        _dL_dZ = np.einsum('nm,nmq,nmq->mq',dL_dpsi1,_psi1_q, (- _psi1_common[:,None,:] * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z))
        _dL_dlengthscale = lengthscale* np.einsum('nm,nmq,nmq->q',dL_dpsi1,_psi1_q,(_psi1_common[:,None,:]*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + (1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z))
-    
-        return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma 
-    
+
+        return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma
+
    def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, gamma):
        """
        Z - MxQ
@ -361,14 +365,14 @@ except:
        # _dL_dgamma         NxQ
        # _dL_dmu            NxQ
        # _dL_dS             NxQ
-        
+
        lengthscale2 = np.square(lengthscale)
-        
+
        _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
        _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
        _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q
        _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ
-    
+
        # psi2
        _psi2_denom = 2.*S / lengthscale2 + 1. # NxQ
        _psi2_denom_sqrt = np.sqrt(_psi2_denom)
@ -380,7 +384,7 @@ except:
        _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2)
        _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max))
        _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM
-        _psi2_q = variance*variance * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ 
+        _psi2_q = variance*variance * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ
        _psi2_exp_dist_sq = np.exp(-_psi2_Zdist_sq -_psi2_mudist_sq) # NxMxMxQ
        _psi2_exp_Z = np.exp(-0.5*_psi2_Z_sq_sum) # MxMxQ
        _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM
@ -390,5 +394,5 @@ except:
        _dL_dS = np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q, _psi2_common, (2.*_psi2_mudist_sq-1.), _psi2_exp_dist_sq)
        _dL_dZ = 2.*np.einsum('mo,nmoq,nmoq->mq',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(-_psi2_Zdist*_psi2_denom[:,None,None,:]+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z))
        _dL_dlengthscale = 2.*lengthscale* np.einsum('mo,nmoq,nmoq->q',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z))
-    
+
        return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma
--- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py
+++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py
@ -6,14 +6,7 @@ The module for psi-statistics for RBF kernel for Spike-and-Slab GPLVM
 import numpy as np
 from ....util.caching import Cache_this
 from . import PSICOMP_RBF
-from ....util import gpu_init

-try:
-    import pycuda.gpuarray as gpuarray
-    from pycuda.compiler import SourceModule
-    from ....util.linalg_gpu import sum_axis
-except:
-    pass    

 gpu_code = """
    // define THREADNUM
@ -292,6 +285,11 @@ gpu_code = """
 class PSICOMP_SSRBF_GPU(PSICOMP_RBF):

    def __init__(self, threadnum=128, blocknum=15, GPU_direct=False):
+        
+        from pycuda.compiler import SourceModule
+        from ....util.gpu_init import initGPU
+        initGPU()
+        
        self.GPU_direct = GPU_direct
        self.gpuCache = None
        
@ -314,7 +312,8 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
        memo[id(self)] = s 
        return s

-    def _initGPUCache(self, N, M, Q):            
+    def _initGPUCache(self, N, M, Q):
+        import pycuda.gpuarray as gpuarray
        if self.gpuCache == None:
            self.gpuCache = {
                             'l_gpu'                :gpuarray.empty((Q,),np.float64,order='F'),
@ -377,12 +376,13 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
        return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1]

    @Cache_this(limit=1, ignore_args=(0,))
-    def psicomputations(self, variance, lengthscale, Z, variational_posterior):
+    def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
        """
        Z - MxQ
        mu - NxQ
        S - NxQ
        """
+        variance, lengthscale = kern.variance, kern.lengthscale
        N,M,Q = self.get_dimensions(Z, variational_posterior)
        self._initGPUCache(N,M,Q)
        self.sync_params(lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
@ -409,8 +409,10 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
        else:
            return psi0, psi1_gpu.get(), psi2_gpu.get()

-    @Cache_this(limit=1, ignore_args=(0,1,2,3))
-    def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
+    @Cache_this(limit=1, ignore_args=(0,2,3,4))
+    def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        variance, lengthscale = kern.variance, kern.lengthscale
+        from ....util.linalg_gpu import sum_axis
        ARD = (len(lengthscale)!=1)
        
        N,M,Q = self.get_dimensions(Z, variational_posterior)
--- a/GPy/kern/_src/rbf.py
+++ b/GPy/kern/_src/rbf.py
@ -3,9 +3,9 @@


 import numpy as np
-from stationary import Stationary
-from psi_comp import PSICOMP_RBF
-from psi_comp.rbf_psi_gpucomp import PSICOMP_RBF_GPU
+from .stationary import Stationary
+from .psi_comp import PSICOMP_RBF
+from .psi_comp.rbf_psi_gpucomp import PSICOMP_RBF_GPU
 from ...util.config import *

 class RBF(Stationary):
@ -20,7 +20,6 @@ class RBF(Stationary):
    _support_GPU = True
    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='rbf', useGPU=False):
        super(RBF, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name, useGPU=useGPU)
-        self.psicomp = PSICOMP_RBF()
        if self.useGPU:
            self.psicomp = PSICOMP_RBF_GPU()
        else:
@ -32,10 +31,14 @@ class RBF(Stationary):
    def dK_dr(self, r):
        return -r*self.K_of_r(r)

+    def dK2_drdr(self, r):
+        return (r**2-1)*self.K_of_r(r)
+
    def __getstate__(self):
        dc = super(RBF, self).__getstate__()
        if self.useGPU:
            dc['psicomp'] = PSICOMP_RBF()
+            dc['useGPU'] = False
        return dc

    def __setstate__(self, state):
@ -50,22 +53,25 @@ class RBF(Stationary):
    #---------------------------------------#

    def psi0(self, Z, variational_posterior):
-        return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[0]
+        return self.psicomp.psicomputations(self, Z, variational_posterior)[0]

    def psi1(self, Z, variational_posterior):
-        return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[1]
+        return self.psicomp.psicomputations(self, Z, variational_posterior)[1]

    def psi2(self, Z, variational_posterior):
-        return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[2]
+        return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=False)[2]
+
+    def psi2n(self, Z, variational_posterior):
+        return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=True)[2]

    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        dL_dvar, dL_dlengscale = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[:2]
+        dL_dvar, dL_dlengscale = self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[:2]
        self.variance.gradient = dL_dvar
        self.lengthscale.gradient = dL_dlengscale

    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[2]
+        return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[2]

    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[3:]
+        return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[3:]

--- a/GPy/kern/_src/spline.py
+++ b/GPy/kern/_src/spline.py
@ -0,0 +1,75 @
+# Copyright (c) 2015, Thomas Hornung
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from .kern import Kern
+from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp
+class Spline(Kern):
+    """
+    Linear spline kernel with two parameters: ``variance`` and ``c``.
+
+    The kernel computed by ``K`` is
+
+        k(x, x') = variance**2 * (1 + (1 + c) * t1 + c/3 * (t2 - t3))
+
+    with ``t1 = (x+8)*(x'+8)/16``, ``t2 = |(x-x')/16|**3`` and
+    ``t3 = ((x+8)/16)**3 + ((x'+8)/16)**3``.
+
+    The parameter c controls the stiffness of the spline fit. A very stiff
+    spline equals linear regression.
+    See https://www.youtube.com/watch?v=50Vgw11qn0o starting at minute 1:17:28
+    Lit: Wahba, 1990
+
+    NOTE(review): the kernel is scaled by ``variance**2``, not by a power of
+    10. The offsets 8 and 16 are hard-coded; presumably inputs are expected
+    to lie in roughly [-8, 8] -- TODO confirm.
+    """
+
+    def __init__(self, input_dim, variance=1., c=1., active_dims=None, name='spline'):
+        """Create the kernel and link ``variance`` and ``c`` as parameters."""
+        super(Spline, self).__init__(input_dim, active_dims, name)
+        self.variance = Param('variance', variance, Logexp())
+        self.c = Param('c', c)
+        self.link_parameters(self.variance,self.c)
+
+    def _spline_terms(self, X, X2):
+        """Return ``(t1, t2 - t3)``, the parameter-independent parts of K."""
+        term1 = (X+8.)*(X2.T+8.)/16.
+        term2 = np.abs((X-X2.T)/16.)**3
+        term3 = ((X+8.)/16.)**3 + ((X2.T+8.)/16.)**3
+        return term1, term2 - term3
+
+    def K(self, X, X2=None):
+        """Return the covariance between X and X2 (X2 defaults to X)."""
+        if X2 is None: X2=X
+        term1, cubic = self._spline_terms(X, X2)
+        return (self.variance**2 * (1. + (1.+self.c) * term1 + self.c/3. * cubic))
+
+    def Kdiag(self, X):
+        """Return the kernel diagonal; column 0 of the elementwise result is used."""
+        shifted = X+8.
+        # np.square takes `out` as its second positional argument, so the
+        # operand is passed exactly once.
+        term1 = np.square(shifted)/16.
+        term3 = 2. * (shifted/16.)**3
+        return (self.variance**2 * (1. + (1.+self.c) * term1 - self.c/3. * term3))[:,0]
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        """Set ``variance.gradient`` and ``c.gradient`` from dL_dK."""
+        if X2 is None: X2=X
+        term1, cubic = self._spline_terms(X, X2)
+        self.variance.gradient = np.sum(dL_dK * (2*self.variance * (1. + (1.+self.c) * term1 + self.c/3. * cubic)))
+        self.c.gradient = np.sum(dL_dK * (self.variance**2* (term1 + 1./3.*cubic)))
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        """Not implemented for this kernel."""
+        raise NotImplementedError
+
+    def gradients_X(self, dL_dK, X, X2=None):
+        """Not implemented for this kernel."""
+        raise NotImplementedError
+
+    def gradients_X_diag(self, dL_dKdiag, X):
+        """Not implemented for this kernel."""
+        raise NotImplementedError
--- a/GPy/kern/_src/splitKern.py
+++ b/GPy/kern/_src/splitKern.py
@ -3,11 +3,11 @@ A new kernel
 """

 import numpy as np
-from kern import Kern,CombinationKernel
+from .kern import Kern,CombinationKernel
 from .independent_outputs import index_to_slices
 import itertools

-class DiffGenomeKern(Kern):
+class DEtime(Kern):

    def __init__(self, kernel, idx_p, Xp, index_dim=-1, name='DiffGenomeKern'):
        self.idx_p = idx_p
@ -104,7 +104,7 @@ class SplitKern(CombinationKernel):
            assert len(slices2)<=2, 'The Split kernel only support two different indices'
            target = np.zeros((X.shape[0], X2.shape[0]))
            # diagonal blocks
-            [[target.__setitem__((s,s2), self.kern.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices[i], slices2[i])] for i in xrange(min(len(slices),len(slices2)))]
+            [[target.__setitem__((s,s2), self.kern.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices[i], slices2[i])] for i in range(min(len(slices),len(slices2)))]
            if len(slices)>1:
                [target.__setitem__((s,s2), self.kern_cross.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices[1], slices2[0])]
            if len(slices2)>1:
@ -135,7 +135,7 @@ class SplitKern(CombinationKernel):
        else:
            assert dL_dK.shape==(X.shape[0],X2.shape[0])
            slices2 = index_to_slices(X2[:,self.index_dim])
-            [[collate_grads(dL_dK[s,s2],X[s],X2[s2]) for s,s2 in itertools.product(slices[i], slices2[i])] for i in xrange(min(len(slices),len(slices2)))]
+            [[collate_grads(dL_dK[s,s2],X[s],X2[s2]) for s,s2 in itertools.product(slices[i], slices2[i])] for i in range(min(len(slices),len(slices2)))]
            if len(slices)>1:
                [collate_grads(dL_dK[s,s2], X[s], X2[s2], True) for s,s2 in itertools.product(slices[1], slices2[0])]
            if len(slices2)>1:
--- a/GPy/kern/_src/standard_periodic.py
+++ b/GPy/kern/_src/standard_periodic.py
@ -0,0 +1,166 @@
+# -*- coding: utf-8 -*-
+
+# Copyright (c) 2014, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+"""
+The standard periodic kernel, as described in:
+
+[1] Gaussian Processes for Machine Learning, C. E. Rasmussen, C. K. I. Williams.
+The MIT Press, 2005.
+
+
+[2] Introduction to Gaussian processes. D. J. C. MacKay. In C. M. Bishop, editor, 
+Neural Networks and Machine Learning, pages 133-165. Springer, 1998.
+"""
+
+from .kern import Kern
+from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp
+
+import numpy as np
+
+class StdPeriodic(Kern):
+    r"""
+    Standard periodic kernel
+
+    .. math::
+
+       k(x,y) = \theta_1 \exp \left[  - \frac{1}{2} \sum_{i=1}^{input\_dim}
+       \left( \frac{\sin(\frac{\pi}{\lambda_i} (x_i - y_i) )}{l_i} \right)^2 \right]
+
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
+    :param variance: the variance :math:`\theta_1` in the formula above
+    :type variance: float
+    :param wavelength: the vector of wavelengths :math:`\lambda_i`. If None then 1.0 is assumed.
+    :type wavelength: array or list of the appropriate size (or float if there is only one wavelength parameter)
+    :param lengthscale: the vector of lengthscales :math:`l_i`. If None then 1.0 is assumed.
+    :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
+    :param ARD1: Auto Relevance Determination with respect to wavelength.
+        If equal to "False" one single wavelength parameter :math:`\lambda`
+        shared by all dimensions is assumed, otherwise there is one wavelength
+        parameter per dimension.
+    :type ARD1: Boolean
+    :param ARD2: Auto Relevance Determination with respect to lengthscale.
+        If equal to "False" one single lengthscale parameter :math:`l`
+        shared by all dimensions is assumed, otherwise there is one lengthscale
+        parameter per dimension.
+    :type ARD2: Boolean
+    :param active_dims: indices of dimensions which are used in the computation of the kernel
+    :type active_dims: array or list of the appropriate size
+    :param name: Name of the kernel for output
+    :type name: String
+    :param useGPU: whether or not to use the GPU
+    :type useGPU: Boolean
+    """
+
+    def __init__(self, input_dim, variance=1., wavelength=None, lengthscale=None, ARD1=False, ARD2=False, active_dims=None, name='std_periodic',useGPU=False):
+        super(StdPeriodic, self).__init__(input_dim, active_dims, name, useGPU=useGPU)
+        self.input_dim = input_dim
+        self.ARD1 = ARD1 # correspond to wavelengths
+        self.ARD2 = ARD2 # correspond to lengthscales
+
+        self.name = name
+
+        # Wavelengths: a single shared value without ARD1, one per dimension with ARD1.
+        if self.ARD1 == False:
+            if wavelength is not None:
+                wavelength = np.asarray(wavelength)
+                assert wavelength.size == 1, "Only one wavelength needed for non-ARD kernel"
+            else:
+                wavelength = np.ones(1)
+        else:
+            if wavelength is not None:
+                wavelength = np.asarray(wavelength)
+                assert wavelength.size == input_dim, "bad number of wavelengths"
+            else:
+                wavelength = np.ones(input_dim)
+
+        # Lengthscales: a single shared value without ARD2, one per dimension with ARD2.
+        if self.ARD2 == False:
+            if lengthscale is not None:
+                lengthscale = np.asarray(lengthscale)
+                assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
+            else:
+                lengthscale = np.ones(1)
+        else:
+            if lengthscale is not None:
+                lengthscale = np.asarray(lengthscale)
+                assert lengthscale.size == input_dim, "bad number of lengthscales"
+            else:
+                lengthscale = np.ones(input_dim)
+
+        self.variance = Param('variance', variance, Logexp())
+        assert self.variance.size==1, "Variance size must be one"
+        self.wavelengths =  Param('wavelengths', wavelength, Logexp())
+        self.lengthscales =  Param('lengthscales', lengthscale, Logexp())
+
+        self.link_parameters(self.variance,  self.wavelengths, self.lengthscales)
+
+    def parameters_changed(self):
+        """
+        Callback hook for parameter changes.
+
+        This kernel keeps no precomputations, so there is nothing to update
+        and the method does nothing.
+        """
+
+        pass
+
+
+    def K(self, X, X2=None):
+        """Compute the covariance matrix between X and X2 (X2 defaults to X)."""
+        if X2 is None:
+            X2 = X
+
+        # Pairwise, per-dimension argument of the sine: pi * (x_i - y_i) / lambda_i
+        base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths
+        exp_dist = np.exp( -0.5* np.sum( np.square(  np.sin( base ) / self.lengthscales ), axis = -1 ) )
+
+        return self.variance * exp_dist
+
+
+    def Kdiag(self, X):
+        """Compute the diagonal of the covariance matrix associated to X.
+
+        Every diagonal entry equals the variance, so an array of length
+        X.shape[0] filled with the variance is returned.
+        """
+        ret = np.empty(X.shape[0])
+        ret[:] = self.variance
+        return ret
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        """
+        Derivative of the covariance matrix with respect to the parameters.
+
+        The gradients are chained with dL_dK and stored in the ``gradient``
+        attribute of variance, wavelengths and lengthscales. With ARD1/ARD2
+        one gradient per input dimension is stored, otherwise the per-dimension
+        contributions are summed into a single value.
+        """
+        if X2 is None:
+            X2 = X
+
+        base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths
+
+        sin_base = np.sin( base )
+        exp_dist = np.exp( -0.5* np.sum( np.square(  sin_base / self.lengthscales ), axis = -1 ) )
+
+        # Per-dimension factors of dK/dwavelength and dK/dlengthscale; both
+        # still have to be multiplied by exp_dist.
+        dwl = self.variance * (1.0/np.square(self.lengthscales)) * sin_base*np.cos(base) * (base / self.wavelengths)
+
+        dl = self.variance * np.square( sin_base) / np.power( self.lengthscales, 3)
+
+        self.variance.gradient = np.sum(exp_dist * dL_dK)
+
+        if self.ARD1: # different wavelengths
+            self.wavelengths.gradient = (dwl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
+        else:  # same wavelengths
+            self.wavelengths.gradient = np.sum(dwl.sum(-1) * exp_dist * dL_dK)
+
+        if self.ARD2: # different lengthscales
+            self.lengthscales.gradient = (dl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
+        else: # same lengthscales
+            self.lengthscales.gradient = np.sum(dl.sum(-1) * exp_dist * dL_dK)
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        """
+        Derivative of the diagonal of the covariance matrix with respect to the parameters.
+
+        The diagonal only depends on the variance, so the wavelength and
+        lengthscale gradients are set to zero.
+        """
+        self.variance.gradient = np.sum(dL_dKdiag)
+        self.wavelengths.gradient = 0
+        self.lengthscales.gradient = 0
+
+#    def gradients_X(self, dL_dK, X, X2=None):
+#        """derivative of the covariance matrix with respect to X."""
+#    
+#        raise NotImplemented("Periodic kernel: dK_dX not implemented")
+#
+#    def gradients_X_diag(self, dL_dKdiag, X):
+#        
+#        raise NotImplemented("Periodic kernel: dKdiag_dX not implemented")
--- a/GPy/kern/_src/static.py
+++ b/GPy/kern/_src/static.py
@ -2,7 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from kern import Kern
+from .kern import Kern
 import numpy as np
 from ...core.parameterization import Param
 from ...core.parameterization.transformations import Logexp
@ -24,6 +24,13 @@ class Static(Kern):
    def gradients_X_diag(self, dL_dKdiag, X):
        return np.zeros(X.shape)

+    def gradients_XX(self, dL_dK, X, X2=None):
+        """
+        Second derivative of the covariance wrt X and X2, which is all zeros here.
+
+        :param dL_dK: unused
+        :param X: inputs with shape (N, Q)
+        :param X2: second inputs with shape (M, Q); X is used when X2 is None
+        :returns: float64 zeros of shape (N, M, Q)
+        """
+        # X2 defaults to None so the method can be called with X only.
+        if X2 is None:
+            X2 = X
+        return np.zeros((X.shape[0], X2.shape[0], X.shape[1]), dtype=np.float64)
+    def gradients_XX_diag(self, dL_dKdiag, X):
+        """Return zeros with the same shape as X; dL_dKdiag is not used."""
+        out_shape = X.shape
+        return np.zeros(out_shape)
+
    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        return np.zeros(Z.shape)

@ -59,8 +66,14 @@ class White(Static):
    def psi2(self, Z, variational_posterior):
        return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)

+    def psi2n(self, Z, variational_posterior):
+        """Return float64 zeros of shape (1, M, M) with M = Z.shape[0]."""
+        m = Z.shape[0]
+        return np.zeros((1, m, m), dtype=np.float64)
+
    def update_gradients_full(self, dL_dK, X, X2=None):
-        self.variance.gradient = np.trace(dL_dK)
+        if X2 is None:
+            self.variance.gradient = np.trace(dL_dK)
+        else:
+            self.variance.gradient = 0.

    def update_gradients_diag(self, dL_dKdiag, X):
        self.variance.gradient = dL_dKdiag.sum()
@ -89,6 +102,11 @@ class Bias(Static):
        ret[:] = self.variance*self.variance*variational_posterior.shape[0]
        return ret

+    def psi2n(self, Z, variational_posterior):
+        """Return a float64 array of shape (1, M, M), M = Z.shape[0], filled with variance squared."""
+        m = Z.shape[0]
+        filled = np.empty((1, m, m), dtype=np.float64)
+        filled[:] = self.variance*self.variance
+        return filled
+
    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        self.variance.gradient = dL_dpsi0.sum() + dL_dpsi1.sum() + 2.*self.variance*dL_dpsi2.sum()*variational_posterior.shape[0]

@ -106,7 +124,7 @@ class Fixed(Static):
        return self.variance * self.fixed_K

    def Kdiag(self, X):
-        return self.variance * self.fixed_K.diag()
+        return self.variance * self.fixed_K.diagonal()

    def update_gradients_full(self, dL_dK, X, X2=None):
        self.variance.gradient = np.einsum('ij,ij', dL_dK, self.fixed_K)
@ -117,6 +135,9 @@ class Fixed(Static):
    def psi2(self, Z, variational_posterior):
        return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)

+    def psi2n(self, Z, variational_posterior):
+        """Return float64 zeros of shape (1, M, M) with M = Z.shape[0]."""
+        m = Z.shape[0]
+        return np.zeros((1, m, m), dtype=np.float64)
+
    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        self.variance.gradient = dL_dpsi0.sum()

--- a/GPy/kern/_src/stationary.py
+++ b/GPy/kern/_src/stationary.py
@ -2,29 +2,39 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from kern import Kern
+from .kern import Kern
 from ...core.parameterization import Param
 from ...core.parameterization.transformations import Logexp
 from ...util.linalg import tdot
 from ... import util
 import numpy as np
-from scipy import integrate, weave
-from ...util.config import config # for assesing whether to use weave
+from scipy import integrate
+from ...util.config import config # for assesing whether to use cython
 from ...util.caching import Cache_this

+try:
+    # Optional compiled helpers for the gradient computations in this module.
+    from . import stationary_cython
+except ImportError:
+    # Record the failure in the shared config: the gradient methods check
+    # config.getboolean('cython', 'working') before calling stationary_cython.
+    print('warning in stationary: failed to import cython module: falling back to numpy')
+    config.set('cython', 'working', 'false')
+
+
 class Stationary(Kern):
    """
    Stationary kernels (covariance functions).

    Stationary covariance fucntion depend only on r, where r is defined as

-      r = \sqrt{ \sum_{q=1}^Q (x_q - x'_q)^2 }
+    .. math::
+        r(x, x') = \\sqrt{ \\sum_{q=1}^Q (x_q - x'_q)^2 }

    The covariance function k(x, x' can then be written k(r).

    In this implementation, r is scaled by the lengthscales parameter(s):

-      r = \sqrt{ \sum_{q=1}^Q \frac{(x_q - x'_q)^2}{\ell_q^2} }.
+    .. math::
+
+        r(x, x') = \\sqrt{ \\sum_{q=1}^Q \\frac{(x_q - x'_q)^2}{\ell_q^2} }.

    By default, there's only one lengthscale: seaprate lengthscales for each
    dimension can be enables by setting ARD=True.
@ -32,11 +42,12 @@ class Stationary(Kern):
    To implement a stationary covariance function using this class, one need
    only define the covariance function k(r), and it derivative.

-      ...
-      def K_of_r(self, r):
-          return foo
-      def dK_dr(self, r):
-          return bar
+    ```
+    def K_of_r(self, r):
+        return foo
+    def dK_dr(self, r):
+        return bar
+    ```

    The lengthscale(s) and variance parameters are added to the structure automatically.

@ -65,10 +76,14 @@ class Stationary(Kern):
        self.link_parameters(self.variance, self.lengthscale)

    def K_of_r(self, r):
-        raise NotImplementedError, "implement the covariance function as a fn of r to use this class"
+        raise NotImplementedError("implement the covariance function as a fn of r to use this class")

    def dK_dr(self, r):
-        raise NotImplementedError, "implement derivative of the covariance function wrt r to use this class"
+        raise NotImplementedError("implement derivative of the covariance function wrt r to use this class")
+
+    @Cache_this(limit=20, ignore_args=())
+    def dK2_drdr(self, r):
+        """
+        Second derivative of the covariance function wrt r.
+
+        This base implementation always raises NotImplementedError.
+        """
+        message = "implement second derivative of covariance wrt r to use this method"
+        raise NotImplementedError(message)

    @Cache_this(limit=5, ignore_args=())
    def K(self, X, X2=None):
@ -82,11 +97,16 @@ class Stationary(Kern):
        r = self._scaled_dist(X, X2)
        return self.K_of_r(r)

-    @Cache_this(limit=3, ignore_args=())
+    @Cache_this(limit=20, ignore_args=())
    def dK_dr_via_X(self, X, X2):
        #a convenience function, so we can cache dK_dr
        return self.dK_dr(self._scaled_dist(X, X2))

+    @Cache_this(limit=3, ignore_args=())
+    def dK2_drdr_via_X(self, X, X2):
+        """
+        Convenience wrapper, decorated with Cache_this, that evaluates
+        dK2_drdr at the scaled distance between X and X2.
+        """
+        scaled_r = self._scaled_dist(X, X2)
+        return self.dK2_drdr(scaled_r)
+
    def _unscaled_dist(self, X, X2=None):
        """
        Compute the Euclidean distance between each row of X and X2, or between
@ -107,12 +127,13 @@ class Stationary(Kern):
            r2 = np.clip(r2, 0, np.inf)
            return np.sqrt(r2)

-    @Cache_this(limit=5, ignore_args=())
+    @Cache_this(limit=20, ignore_args=())
    def _scaled_dist(self, X, X2=None):
        """
        Efficiently compute the scaled distance, r.

-        r = \sqrt( \sum_{q=1}^Q (x_q - x'q)^2/l_q^2 )
+        .. math::
+
+            r = \\sqrt{ \\sum_{q=1}^Q (x_q - x'_q)^2/l_q^2 }

        Note that if thre is only one lengthscale, l comes outside the sum. In
        this case we compute the unscaled distance first (in a separate
@ -148,28 +169,18 @@ class Stationary(Kern):
        (dL_dK), compute the gradient wrt the parameters of this kernel,
        and store in the parameters object as e.g. self.variance.gradient
        """
-        self.variance.gradient = np.einsum('ij,ij,i', self.K(X, X2), dL_dK, 1./self.variance)
+        self.variance.gradient = np.sum(self.K(X, X2)* dL_dK)/self.variance

        #now the lengthscale gradient(s)
        dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
        if self.ARD:
-            #rinv = self._inv_dis# this is rather high memory? Should we loop instead?t(X, X2)
-            #d =  X[:, None, :] - X2[None, :, :]
-            #x_xl3 = np.square(d)
-            #self.lengthscale.gradient = -((dL_dr*rinv)[:,:,None]*x_xl3).sum(0).sum(0)/self.lengthscale**3
+
            tmp = dL_dr*self._inv_dist(X, X2)
            if X2 is None: X2 = X
-
-
-            if config.getboolean('weave', 'working'):
-                try:
-                    self.lengthscale.gradient = self.weave_lengthscale_grads(tmp, X, X2)
-                except:
-                    print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
-                    config.set('weave', 'working', 'False')
-                    self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in xrange(self.input_dim)])
+            if config.getboolean('cython', 'working'):
+                self.lengthscale.gradient = self._lengthscale_grads_cython(tmp, X, X2)
            else:
-                self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in xrange(self.input_dim)])
+                self.lengthscale.gradient = self._lengthscale_grads_pure(tmp, X, X2)
        else:
            r = self._scaled_dist(X, X2)
            self.lengthscale.gradient = -np.sum(dL_dr*r)/self.lengthscale
@ -184,43 +195,80 @@ class Stationary(Kern):
        dist = self._scaled_dist(X, X2).copy()
        return 1./np.where(dist != 0., dist, np.inf)

-    def weave_lengthscale_grads(self, tmp, X, X2):
-        """Use scipy.weave to compute derivatives wrt the lengthscales"""
+    def _lengthscale_grads_pure(self, tmp, X, X2):
+        """
+        Numpy implementation of the per-dimension lengthscale gradients.
+
+        For every input dimension q the squared differences between column q
+        of X and column q of X2 are weighted by tmp and summed; the result is
+        negated and divided by lengthscale**3.
+        """
+        weighted_sums = []
+        for q in range(self.input_dim):
+            diff_q = X[:,q:q+1] - X2[:,q:q+1].T
+            weighted_sums.append(np.sum(tmp * np.square(diff_q)))
+        return -np.array(weighted_sums)/self.lengthscale**3
+
+    def _lengthscale_grads_cython(self, tmp, X, X2):
        N,M = tmp.shape
-        Q = X.shape[1]
-        if hasattr(X, 'values'):X = X.values
-        if hasattr(X2, 'values'):X2 = X2.values
+        Q = self.input_dim
+        X, X2 = np.ascontiguousarray(X), np.ascontiguousarray(X2)
        grads = np.zeros(self.input_dim)
-        code = """
-        double gradq;
-        for(int q=0; q<Q; q++){
-          gradq = 0;
-          for(int n=0; n<N; n++){
-            for(int m=0; m<M; m++){
-              gradq += tmp(n,m)*(X(n,q)-X2(m,q))*(X(n,q)-X2(m,q));
-            }
-          }
-          grads(q) = gradq;
-        }
-        """
-        weave.inline(code, ['tmp', 'X', 'X2', 'grads', 'N', 'M', 'Q'], type_converters=weave.converters.blitz, support_code="#include <math.h>")
+        stationary_cython.lengthscale_grads(N, M, Q, tmp, X, X2, grads)
        return -grads/self.lengthscale**3

    def gradients_X(self, dL_dK, X, X2=None):
        """
        Given the derivative of the objective wrt K (dL_dK), compute the derivative wrt X
        """
-        if config.getboolean('weave', 'working'):
-            try:
-                return self.gradients_X_weave(dL_dK, X, X2)
-            except:
-                print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
-                config.set('weave', 'working', 'False')
-                return self.gradients_X_(dL_dK, X, X2)
+        if config.getboolean('cython', 'working'):
+            return self._gradients_X_cython(dL_dK, X, X2)
        else:
-            return self.gradients_X_(dL_dK, X, X2)
+            return self._gradients_X_pure(dL_dK, X, X2)

-    def gradients_X_(self, dL_dK, X, X2=None):
+    def gradients_XX(self, dL_dK, X, X2=None):
+        r"""
+        Given the derivative of the objective wrt K (dL_dK), compute the second derivative of K wrt X and X2:
+
+        .. math::
+
+            \frac{\partial^2 K}{\partial X \partial X2}
+
+        :param dL_dK: derivative of the objective wrt K, multiplied elementwise into the derivative terms
+        :param X: inputs [NxQ]
+        :param X2: second inputs [MxQ]; X is used if X2 is None
+        :returns: dL2_dXdX2 of shape NxMxQ, i.e. the second derivative in X2
+            for every input dimension q.
+        """
+        # The off diagonals in Q are always zero, this should also be true for the Linear kernel...
+        # According to multivariable chain rule, we can chain the second derivative through r:
+        # d2K_dXdX2 = dK_dr*d2r_dXdX2 + d2K_drdr * dr_dX * dr_dX2:
+        invdist = self._inv_dist(X, X2)
+        invdist2 = invdist**2
+
+        dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
+        tmp1 = dL_dr * invdist
+
+        dL_drdr = self.dK2_drdr_via_X(X, X2) * dL_dK
+        tmp2 = dL_drdr * invdist2
+
+        l2 = np.ones(X.shape[1]) * self.lengthscale**2
+
+        if X2 is None:
+            X2 = X
+            tmp1 -= np.eye(X.shape[0])*self.variance
+        else:
+            # _inv_dist returns exactly 0 where the scaled distance is 0, so
+            # this is an (N, M) mask of coinciding points. Comparing X with
+            # X2.T directly only broadcasts to that shape when Q == 1.
+            tmp1[invdist2 == 0.] -= self.variance
+
+        grad = np.empty((X.shape[0], X2.shape[0], X.shape[1]), dtype=np.float64)
+        for q in range(self.input_dim):
+            tmpdist2 = (X[:,[q]]-X2[:,[q]].T) ** 2
+            grad[:, :, q] = ((tmp1*invdist2 - tmp2)*tmpdist2/l2[q] - tmp1)/l2[q]
+        return grad
+
+    def gradients_XX_diag(self, dL_dK, X):
+        """
+        Diagonal counterpart of gradients_XX.
+
+        :param dL_dK: not used by this implementation
+        :param X: inputs [NxQ]
+        :returns: array with the shape of X whose entries are
+            variance / lengthscale**2
+        """
+        scaled = np.ones(X.shape) * self.variance
+        return scaled/self.lengthscale**2
+
+    def _gradients_X_pure(self, dL_dK, X, X2=None):
        invdist = self._inv_dist(X, X2)
        dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
        tmp = invdist*dL_dr
@ -230,54 +278,25 @@ class Stationary(Kern):

        #The high-memory numpy way:
        #d =  X[:, None, :] - X2[None, :, :]
-        #ret = np.sum(tmp[:,:,None]*d,1)/self.lengthscale**2
+        #grad = np.sum(tmp[:,:,None]*d,1)/self.lengthscale**2

        #the lower memory way with a loop
-        ret = np.empty(X.shape, dtype=np.float64)
-        for q in xrange(self.input_dim):
-            np.sum(tmp*(X[:,q][:,None]-X2[:,q][None,:]), axis=1, out=ret[:,q])
-        ret /= self.lengthscale**2
+        grad = np.empty(X.shape, dtype=np.float64)
+        for q in range(self.input_dim):
+            np.sum(tmp*(X[:,q][:,None]-X2[:,q][None,:]), axis=1, out=grad[:,q])
+        return grad/self.lengthscale**2

-        return ret
-
-    def gradients_X_weave(self, dL_dK, X, X2=None):
+    def _gradients_X_cython(self, dL_dK, X, X2=None):
        invdist = self._inv_dist(X, X2)
        dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
        tmp = invdist*dL_dr
        if X2 is None:
            tmp = tmp + tmp.T
            X2 = X
-
-        code = """
-        int n,m,d;
-        double retnd;
-        #pragma omp parallel for private(n,d, retnd, m)
-        for(d=0;d<D;d++){
-          for(n=0;n<N;n++){
-            retnd = 0.0;
-            for(m=0;m<M;m++){
-              retnd += tmp(n,m)*(X(n,d)-X2(m,d));
-            }
-            ret(n,d) = retnd;
-          }
-        }
-
-        """
-        if hasattr(X, 'values'):X = X.values #remove the GPy wrapping to make passing into weave safe
-        if hasattr(X2, 'values'):X2 = X2.values
-        ret = np.zeros(X.shape)
-        N,D = X.shape
-        N,M = tmp.shape
-        from scipy import weave
-        support_code = """
-        #include <omp.h>
-        #include <stdio.h>
-        """
-        weave_options = {'headers'           : ['<omp.h>'],
-                         'extra_compile_args': ['-fopenmp -O3'], # -march=native'],
-                         'extra_link_args'   : ['-lgomp']}
-        weave.inline(code, ['ret', 'N', 'D', 'M', 'tmp', 'X', 'X2'], type_converters=weave.converters.blitz, support_code=support_code, **weave_options)
-        return ret/self.lengthscale**2
+        X, X2 = np.ascontiguousarray(X), np.ascontiguousarray(X2)
+        grad = np.zeros(X.shape)
+        stationary_cython.grad_X(X.shape[0], X.shape[1], X2.shape[0], X, X2, tmp, grad)
+        return grad/self.lengthscale**2

    def gradients_X_diag(self, dL_dKdiag, X):
        return np.zeros(X.shape)
@ -285,6 +304,9 @@ class Stationary(Kern):
    def input_sensitivity(self, summarize=True):
        return self.variance*np.ones(self.input_dim)/self.lengthscale**2

+
+
+
 class Exponential(Stationary):
    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Exponential'):
        super(Exponential, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
@ -296,13 +318,15 @@ class Exponential(Stationary):
        return -0.5*self.K_of_r(r)


+
+
 class OU(Stationary):
    """
    OU kernel:

    .. math::

-       k(r) = \\sigma^2 \exp(- r) \\ \\ \\ \\  \\text{ where  } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} }
+       k(r) = \\sigma^2 \exp(- r) \\ \\ \\ \\  \\text{ where  } r = \sqrt{\sum_{i=1}^{\\text{input_dim}} \\frac{(x_i-y_i)^2}{\ell_i^2} }

    """

@ -322,7 +346,7 @@ class Matern32(Stationary):

    .. math::

-       k(r) = \\sigma^2 (1 + \\sqrt{3} r) \exp(- \sqrt{3} r) \\ \\ \\ \\  \\text{ where  } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} }
+       k(r) = \\sigma^2 (1 + \\sqrt{3} r) \exp(- \sqrt{3} r) \\ \\ \\ \\  \\text{ where  } r = \sqrt{\sum_{i=1}^{\\text{input_dim}} \\frac{(x_i-y_i)^2}{\ell_i^2} }

    """

@ -369,7 +393,7 @@ class Matern52(Stationary):
    .. math::

       k(r) = \sigma^2 (1 + \sqrt{5} r + \\frac53 r^2) \exp(- \sqrt{5} r)
-       """
+    """
    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Mat52'):
        super(Matern52, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)

--- a/GPy/kern/_src/stationary_cython.c
+++ b/GPy/kern/_src/stationary_cython.c
--- a/GPy/kern/_src/stationary_cython.pyx
+++ b/GPy/kern/_src/stationary_cython.pyx
@ -0,0 +1,60 @@
+#cython: boundscheck=False
+#cython: nonecheck=False
+#cython: wraparound=False
+import numpy as np
+cimport numpy as np
+from cython.parallel import prange
+cimport cython
+
+ctypedef np.float64_t DTYPE_t
+ 
+cdef extern from "stationary_utils.h":
+    void _grad_X "_grad_X" (int N, int D, int M, double* X, double* X2, double* tmp, double* grad) nogil
+
+cdef extern from "stationary_utils.h":
+    void _lengthscale_grads "_lengthscale_grads" (int N, int M, int Q, double* tmp, double* X, double* X2, double* grad) nogil
+ 
+def grad_X(int N, int D, int M,
+        np.ndarray[DTYPE_t, ndim=2, mode="c"] _X,
+        np.ndarray[DTYPE_t, ndim=2, mode="c"] _X2,
+        np.ndarray[DTYPE_t, ndim=2, mode="c"] _tmp,
+        np.ndarray[DTYPE_t, ndim=2, mode="c"] _grad):
+    """
+    Thin wrapper around the C routine ``_grad_X``; the result is written into ``_grad`` in place.
+
+    The raw data pointers of the arrays are handed to C, which indexes them
+    as flat row-major buffers. The buffers are therefore declared with
+    ``mode="c"`` so that a non C-contiguous array is rejected instead of
+    being read with the wrong layout.
+    """
+    cdef double *X = <double*> _X.data
+    cdef double *X2 = <double*> _X2.data
+    cdef double *tmp = <double*> _tmp.data
+    cdef double *grad = <double*> _grad.data
+    with nogil:
+        _grad_X(N, D, M, X, X2, tmp, grad) # return nothing, work in place.
+
+@cython.cdivision(True)
+def grad_X_cython(int N, int D, int M, double[:,:] X, double[:,:] X2, double[:,:] tmp, double[:,:] grad):
+    """
+    Memoryview implementation of the gradient wrt X, written into ``grad`` in place.
+
+    For every (n, d): grad[n, d] = sum_m tmp[n, m] * (X[n, d] - X2[m, d]).
+    """
+    cdef int n,d,nd,m
+    # A single flattened loop over all (n, d) pairs, run through prange.
+    for nd in prange(N * D, nogil=True):
+        # Recover the row and column index from the flat index.
+        n = nd / D
+        d = nd % D
+        grad[n,d] = 0.0
+        for m in range(M):
+            grad[n,d] += tmp[n, m] * (X[n, d] - X2[m, d])
+
+def lengthscale_grads_in_c(int N, int M, int Q,
+        np.ndarray[DTYPE_t, ndim=2, mode="c"] _tmp,
+        np.ndarray[DTYPE_t, ndim=2, mode="c"] _X,
+        np.ndarray[DTYPE_t, ndim=2, mode="c"] _X2,
+        np.ndarray[DTYPE_t, ndim=1, mode="c"] _grad):
+    """
+    Thin wrapper around the C routine ``_lengthscale_grads``; the result is written into ``_grad`` in place.
+
+    The raw data pointers of the arrays are handed to C, which indexes them
+    as flat row-major buffers. The buffers are therefore declared with
+    ``mode="c"`` so that a non C-contiguous array is rejected instead of
+    being read with the wrong layout.
+    """
+    cdef double *tmp = <double*> _tmp.data
+    cdef double *X = <double*> _X.data
+    cdef double *X2 = <double*> _X2.data
+    cdef double *grad = <double*> _grad.data
+    with nogil:
+        _lengthscale_grads(N, M, Q, tmp, X, X2, grad) # return nothing, work in place.
+
+def lengthscale_grads(int N, int M, int Q, double[:,:] tmp, double[:,:] X, double[:,:] X2, double[:] grad):
+    """
+    Memoryview implementation of the unscaled lengthscale gradients, written into ``grad`` in place.
+
+    For every q: grad[q] = sum_{n,m} tmp[n, m] * (X[n, q] - X2[m, q])**2.
+    """
+    cdef int q, n, m
+    cdef double gradq, dist
+    with nogil:
+        for q in range(Q):
+            # Accumulate in a local and store once per dimension.
+            gradq = 0.0
+            for n in range(N):
+                for m in range(M):
+                    dist = X[n,q] - X2[m,q]
+                    gradq += tmp[n, m] * dist * dist
+            grad[q] = gradq
--- a/GPy/kern/_src/stationary_utils.c
+++ b/GPy/kern/_src/stationary_utils.c
@ -0,0 +1,54 @@
+// Gradient wrt X: grad[n*D+d] = sum_m tmp[n*M+m] * (X[n*D+d] - X2[m*D+d]).
+// All arrays are indexed as flat row-major buffers: X and grad are N x D,
+// X2 is M x D and tmp is N x M. The result is written into grad in place.
+void _grad_X(int N, int D, int M, double* X, double* X2, double* tmp, double* grad){
+double retnd;
+int n,d,nd,m;
+// One flattened loop over all (n,d) pairs; each iteration writes a distinct grad[nd].
+#pragma omp parallel for private(nd,n,d, retnd, m)
+for(nd=0;nd<(D*N);nd++){
+  n = nd/D;
+  d = nd%D;
+  retnd = 0.0;
+  for(m=0;m<M;m++){
+    retnd += tmp[n*M+m]*(X[nd]-X2[m*D+d]);
+  }
+  grad[nd] = retnd;
+}
+} //grad_X
+
+
+// Variant of _lengthscale_grads that parallelises over the (n,m) pairs.
+// It ADDS tmp[n*M+m] * (X[n*Q+q] - X2[m*Q+q])^2 onto grad[q], so the caller
+// has to zero grad beforehand. All arrays are flat row-major buffers.
+void _lengthscale_grads_unsafe(int N, int M, int Q, double* tmp, double* X, double* X2, double* grad){
+int n,m,nm,q,nQ,mQ;
+double dist;
+#pragma omp parallel for private(n,m,nm,q,nQ,mQ,dist)
+for(nm=0; nm<(N*M); nm++){
+  n = nm/M;
+  m = nm%M;
+  nQ = n*Q;
+  mQ = m*Q;
+  for(q=0; q<Q; q++){
+    dist = X[nQ+q]-X2[mQ+q];
+    // grad[q] is shared between all iterations of the parallel loop, so the
+    // update has to be atomic to avoid lost updates.
+    #pragma omp atomic
+    grad[q] += tmp[nm]*dist*dist;
+  }
+}
+} //lengthscale_grads_unsafe
+
+
+// Unscaled lengthscale gradients:
+// grad[q] = sum_{n,m} tmp[n*M+m] * (X[n*Q+q] - X2[m*Q+q])^2.
+// All arrays are indexed as flat row-major buffers: tmp is N x M, X is N x Q,
+// X2 is M x Q and grad has Q entries. The result is written into grad in place.
+void _lengthscale_grads(int N, int M, int Q, double* tmp, double* X, double* X2, double* grad){
+int n,m,q;
+double gradq, dist;
+// Parallel over q: each iteration accumulates privately and writes only grad[q].
+#pragma omp parallel for private(n,m, gradq, dist)
+for(q=0; q<Q; q++){
+  gradq = 0;
+  for(n=0; n<N; n++){
+    for(m=0; m<M; m++){
+        dist = X[n*Q+q]-X2[m*Q+q];
+        gradq += tmp[n*M+m]*dist*dist;
+    }
+  }
+  grad[q] = gradq;
+}
+} //lengthscale_grads
+
+
+
+
+
+
--- a/GPy/kern/_src/stationary_utils.h
+++ b/GPy/kern/_src/stationary_utils.h
@ -0,0 +1,5 @@
+#ifndef STATIONARY_UTILS_H
+#define STATIONARY_UTILS_H
+
+#ifndef __APPLE__
+#include <omp.h>
+#endif
+
+/* Gradient wrt X; arrays are flat row-major buffers, grad is written in place. */
+void _grad_X(int N, int D, int M, double* X, double* X2, double* tmp, double* grad);
+
+/* Unscaled lengthscale gradients; parameter names and order match the
+   definition in stationary_utils.c. */
+void _lengthscale_grads(int N, int M, int Q, double* tmp, double* X, double* X2, double* grad);
+
+#endif /* STATIONARY_UTILS_H */
--- a/GPy/kern/_src/symbolic.py
+++ b/GPy/kern/_src/symbolic.py
@ -1,7 +1,7 @@
 # Check Matthew Rocklin's blog post.
 import sympy as sym
 import numpy as np
-from kern import Kern
+from .kern import Kern
 from ...core.symbolic import Symbolic_core


@ -11,7 +11,7 @@ class Symbolic(Kern, Symbolic_core):
    def __init__(self, input_dim, k=None, output_dim=1, name='symbolic', parameters=None, active_dims=None, operators=None, func_modules=[]):

        if k is None:
-            raise ValueError, "You must provide an argument for the covariance function."
+            raise ValueError("You must provide an argument for the covariance function.")

        Kern.__init__(self, input_dim, active_dims, name=name)
        kdiag = k
--- a/GPy/kern/_src/trunclinear.py
+++ b/GPy/kern/_src/trunclinear.py
@ -3,7 +3,7 @@


 import numpy as np
-from kern import Kern
+from .kern import Kern
 from ...core.parameterization import Param
 from ...core.parameterization.transformations import Logexp
 from ...util.caching import Cache_this
@ -15,7 +15,7 @@ class TruncLinear(Kern):

    .. math::

-       k(x,y) = \sum_{i=1}^input_dim \sigma^2_i \max(0, x_iy_i - \simga_q)
+       k(x,y) = \sum_{i=1}^input_dim \sigma^2_i \max(0, x_iy_i - \sigma_q)

    :param input_dim: the number of input dimensions
    :type input_dim: int
@ -54,7 +54,7 @@ class TruncLinear(Kern):
        self.delta = Param('delta', delta)
        self.add_parameter(self.variances)
        self.add_parameter(self.delta)
-            
+
    @Cache_this(limit=2)
    def K(self, X, X2=None):
        XX = self.variances*self._product(X, X2)
@ -114,7 +114,7 @@ class TruncLinear_inf(Kern):

    .. math::

-       k(x,y) = \sum_{i=1}^input_dim \sigma^2_i \max(0, x_iy_i - \simga_q)
+       k(x,y) = \sum_{i=1}^input_dim \sigma^2_i \max(0, x_iy_i - \sigma_q)

    :param input_dim: the number of input dimensions
    :type input_dim: int
@ -148,8 +148,8 @@ class TruncLinear_inf(Kern):

        self.variances = Param('variances', variances, Logexp())
        self.add_parameter(self.variances)
-        
-    
+
+
 #     @Cache_this(limit=2)
    def K(self, X, X2=None):
        tmp = self._product(X, X2)
--- a/GPy/likelihoods/init.py
+++ b/GPy/likelihoods/init.py
@ -1,8 +1,10 @@
-from bernoulli import Bernoulli
-from exponential import Exponential
-from gaussian import Gaussian
-from gamma import Gamma
-from poisson import Poisson
-from student_t import StudentT
-from likelihood import Likelihood
-from mixed_noise import MixedNoise
+from .bernoulli import Bernoulli
+from .exponential import Exponential
+from .gaussian import Gaussian, HeteroscedasticGaussian
+from .gamma import Gamma
+from .poisson import Poisson
+from .student_t import StudentT
+from .likelihood import Likelihood
+from .mixed_noise import MixedNoise
+from .binomial import Binomial
+
--- a/GPy/likelihoods/bernoulli.py
+++ b/GPy/likelihoods/bernoulli.py
@ -3,9 +3,8 @@

 import numpy as np
 from ..util.univariate_Gaussian import std_norm_pdf, std_norm_cdf
-import link_functions
-from likelihood import Likelihood
-from scipy import stats
+from . import link_functions
+from .likelihood import Likelihood

 class Bernoulli(Likelihood):
    """
@ -77,13 +76,39 @@ class Bernoulli(Likelihood):

        return Z_hat, mu_hat, sigma2_hat

+    def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
+        """
+        Gauss-Hermite estimate of E_q(f)[ log p(y|f) ] for q(f) = N(f | m, v),
+        together with its derivatives with respect to m and v.
+
+        Only the Probit link is handled; any other link raises NotImplementedError.
+
+        :param Y: observed labels; entries equal to 1 get sign +1, every other value gets sign -1
+        :param m: means of q(f); the returned arrays take this shape
+        :param v: variances of q(f)
+        :param gh_points: optional (nodes, weights) pair; when None, self._gh_points() supplies it
+        :param Y_metadata: not read by this implementation
+        :returns: tuple (F, dF_dm, dF_dv, None); the last entry is always None here
+        """
+        if isinstance(self.gp_link, link_functions.Probit):
+
+            if gh_points is None:
+                gh_x, gh_w = self._gh_points()
+            else:
+                gh_x, gh_w = gh_points
+
+
+            # fold the 1/sqrt(pi) quadrature normalisation into the weights
+            # (plain division builds a new array, the supplied weights are not modified)
+            gh_w = gh_w / np.sqrt(np.pi)
+            shape = m.shape
+            m,v,Y = m.flatten(), v.flatten(), Y.flatten()
+            Ysign = np.where(Y==1,1,-1)
+            # one row per data point, one column per quadrature node: sqrt(2v)*node + sign*m
+            X = gh_x[None,:]*np.sqrt(2.*v[:,None]) + (m*Ysign)[:,None]
+            p = std_norm_cdf(X)
+            p = np.clip(p, 1e-9, 1.-1e-9) # for numerical stability
+            N = std_norm_pdf(X)
+            F = np.log(p).dot(gh_w)
+            # ratio pdf/cdf, i.e. the derivative of log(cdf) at X (uses the clipped cdf)
+            NoverP = N/p
+            # chain rule: dX/dm is the label sign
+            dF_dm = (NoverP*Ysign[:,None]).dot(gh_w)
+            # half of the second derivative of log(cdf) at X, averaged over the nodes
+            dF_dv = -0.5*(NoverP**2 + NoverP*X).dot(gh_w)
+            return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape), None
+        else:
+            raise NotImplementedError
+
+
    def predictive_mean(self, mu, variance, Y_metadata=None):

        if isinstance(self.gp_link, link_functions.Probit):
-            return stats.norm.cdf(mu/np.sqrt(1+variance))
+            return std_norm_cdf(mu/np.sqrt(1+variance))

        elif isinstance(self.gp_link, link_functions.Heaviside):
-            return stats.norm.cdf(mu/np.sqrt(variance))
+            return std_norm_cdf(mu/np.sqrt(variance))

        else:
            raise NotImplementedError
@ -133,7 +158,7 @@ class Bernoulli(Likelihood):
        """
        #objective = y*np.log(inv_link_f) + (1.-y)*np.log(inv_link_f)
        p = np.where(y==1, inv_link_f, 1.-inv_link_f)
-        return np.log(np.clip(p, 1e-6 ,np.inf))
+        return np.log(np.clip(p, 1e-9 ,np.inf))

    def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
        """
@ -152,7 +177,7 @@ class Bernoulli(Likelihood):
        """
        #grad = (y/inv_link_f) - (1.-y)/(1-inv_link_f)
        #grad = np.where(y, 1./inv_link_f, -1./(1-inv_link_f))
-        ff = np.clip(inv_link_f, 1e-6, 1-1e-6)
+        ff = np.clip(inv_link_f, 1e-9, 1-1e-9)
        denom = np.where(y, ff, -(1-ff))
        return 1./denom

@ -180,7 +205,7 @@ class Bernoulli(Likelihood):
        #d2logpdf_dlink2 = -y/(inv_link_f**2) - (1-y)/((1-inv_link_f)**2)
        #d2logpdf_dlink2 = np.where(y, -1./np.square(inv_link_f), -1./np.square(1.-inv_link_f))
        arg = np.where(y, inv_link_f, 1.-inv_link_f)
-        ret =  -1./np.square(np.clip(arg, 1e-3, np.inf))
+        ret =  -1./np.square(np.clip(arg, 1e-9, 1e9))
        if np.any(np.isinf(ret)):
            stop
        return ret
@ -208,6 +233,17 @@ class Bernoulli(Likelihood):
        np.seterr(**state)
        return d3logpdf_dlink3

+    def predictive_quantiles(self, mu, var, quantiles, Y_metadata=None):
+        """
+        Get the "quantiles" of the binary labels (Bernoulli draws). All the
+        quantiles must be either 0 or 1, since those are the only values the
+        draw can take!
+
+        With p = P(y=1) the cdf is P(y<=0) = 1-p, so the q-th percentile is 1
+        exactly when q/100 > 1-p, i.e. when p > 1 - q/100. Comparing p against
+        q/100 directly would make low percentiles larger than high ones.
+
+        :param mu: mean of the latent posterior, forwarded to predictive_mean
+        :param var: variance of the latent posterior, forwarded to predictive_mean
+        :param quantiles: iterable of percentiles, expressed on the 0-100 scale
+        :param Y_metadata: forwarded to predictive_mean
+        :returns: list with one int32 array of 0/1 values per requested percentile
+        """
+        p = self.predictive_mean(mu, var, Y_metadata=Y_metadata)
+        return [np.asarray(p > (1. - q/100.), dtype=np.int32) for q in quantiles]
+
+
+
    def samples(self, gp, Y_metadata=None):
        """
        Returns a set of samples of observations based on a given value of the latent variable.
--- a/GPy/likelihoods/binomial.py
+++ b/GPy/likelihoods/binomial.py
@ -0,0 +1,125 @@
+# Copyright (c) 2012-2014 The GPy authors (see AUTHORS.txt)
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from ..util.univariate_Gaussian import std_norm_pdf, std_norm_cdf
+from . import link_functions
+from .likelihood import Likelihood
+from scipy import special
+
+class Binomial(Likelihood):
+    """
+    Binomial likelihood
+
+    .. math::
+        p(y_{i}|\\lambda(f_{i})) = \\binom{N_{i}}{y_{i}}\\lambda(f_{i})^{y_{i}}(1-\\lambda(f_{i}))^{N_{i}-y_{i}}
+
+    .. Note::
+        Y holds success counts; the matching numbers of trials N are read from Y_metadata['trials'].
+        The link function should map into [0, 1], e.g. probit (default) or Heaviside
+
+    .. See also::
+        likelihood.py, for the parent class
+    """
+    def __init__(self, gp_link=None):
+        if gp_link is None:
+            gp_link = link_functions.Probit()
+
+        super(Binomial, self).__init__(gp_link, 'Binomial')
+
+    def conditional_mean(self, gp, Y_metadata):
+        """
+        Expected count given the latent value: trials times the success probability.
+
+        :param gp: latent variable
+        :param Y_metadata: must contain 'trials'
+        """
+        # go through .transf like samples() does, rather than calling the link object itself
+        return self.gp_link.transf(gp)*Y_metadata['trials']
+
+    def pdf_link(self, inv_link_f, y, Y_metadata):
+        """
+        Likelihood function given inverse link of f.
+
+        .. math::
+            p(y_{i}|\\lambda(f_{i})) = \\binom{N_{i}}{y_{i}}\\lambda(f_{i})^{y_{i}}(1-\\lambda(f_{i}))^{N_{i}-y_{i}}
+
+        :param inv_link_f: latent variables inverse link of f.
+        :type inv_link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata must contain 'trials'
+        :returns: likelihood evaluated for this point
+        :rtype: float
+        """
+        return np.exp(self.logpdf_link(inv_link_f, y, Y_metadata))
+
+    def logpdf_link(self, inv_link_f, y, Y_metadata=None):
+        """
+        Log Likelihood function given inverse link of f.
+
+        .. math::
+            \\ln p(y_{i}|\\lambda(f_{i})) = \\ln\\binom{N_{i}}{y_{i}} + y_{i}\\log\\lambda(f_{i}) + (N_{i}-y_{i})\\log (1-\\lambda(f_{i}))
+
+        :param inv_link_f: latent variables inverse link of f.
+        :type inv_link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata must contain 'trials'
+        :returns: log likelihood evaluated at points inverse link of f.
+        :rtype: float
+        """
+        N = Y_metadata['trials']
+        # log of the binomial coefficient via log-gamma
+        nchoosey = special.gammaln(N+1) - special.gammaln(y+1) - special.gammaln(N-y+1)
+
+        # xlogy/xlog1py return 0 for 0*log(0), so a saturated probability paired with
+        # a zero count contributes 0 instead of nan
+        return nchoosey + special.xlogy(y, inv_link_f) + special.xlog1py(N-y, -inv_link_f)
+
+    def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
+        """
+        Gradient of the log pdf at y, given inverse link of f w.r.t inverse link of f.
+
+        .. math::
+            \\frac{d\\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\frac{y_{i}}{\\lambda(f)} - \\frac{(N_{i}-y_{i})}{(1-\\lambda(f))}
+
+        :param inv_link_f: latent variables inverse link of f.
+        :type inv_link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata must contain 'trials'
+        :returns: gradient of log likelihood evaluated at points inverse link of f.
+        :rtype: Nx1 array
+        """
+        N = Y_metadata['trials']
+        return y/inv_link_f - (N-y)/(1-inv_link_f)
+
+    def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None):
+        """
+        Hessian at y, given inv_link_f, w.r.t inv_link_f the hessian will be 0 unless i == j
+        i.e. second derivative logpdf at y given inverse link of f_i and inverse link of f_j  w.r.t inverse link of f_i and inverse link of f_j.
+
+
+        .. math::
+            \\frac{d^{2}\\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)^{2}} = \\frac{-y_{i}}{\\lambda(f)^{2}} - \\frac{(N_{i}-y_{i})}{(1-\\lambda(f))^{2}}
+
+        :param inv_link_f: latent variables inverse link of f.
+        :type inv_link_f: Nx1 array
+        :param y: data
+        :type y: Nx1 array
+        :param Y_metadata: Y_metadata must contain 'trials'
+        :returns: Diagonal of log hessian matrix (second derivative of log likelihood evaluated at points inverse link of f.
+        :rtype: Nx1 array
+
+        .. Note::
+            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
+            (the distribution for y_i depends only on inverse link of f_i not on inverse link of f_(j!=i)
+        """
+        N = Y_metadata['trials']
+        return -y/np.square(inv_link_f) - (N-y)/np.square(1-inv_link_f)
+
+    def samples(self, gp, Y_metadata=None):
+        """
+        Returns a set of samples of observations based on a given value of the latent variable.
+
+        :param gp: latent variable
+        :param Y_metadata: must contain 'trials'
+        """
+        orig_shape = gp.shape
+        gp = gp.flatten()
+        N = Y_metadata['trials']
+        Ysim = np.random.binomial(N, self.gp_link.transf(gp))
+        return Ysim.reshape(orig_shape)
+
+    def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None):
+        """Placeholder: nothing is computed and None is returned."""
+        return None
--- a/GPy/likelihoods/exponential.py
+++ b/GPy/likelihoods/exponential.py
@ -5,8 +5,8 @@
 import numpy as np
 from scipy import stats,special
 import scipy as sp
-import link_functions
-from likelihood import Likelihood
+from . import link_functions
+from .likelihood import Likelihood

 class Exponential(Likelihood):
    """
@ -57,9 +57,8 @@ class Exponential(Likelihood):
        :rtype: float

        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        log_objective = np.log(link_f) - y*link_f
-        return np.sum(log_objective)
+        return log_objective

    def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
        """
@ -77,7 +76,6 @@ class Exponential(Likelihood):
        :rtype: Nx1 array

        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        grad = 1./link_f - y
        #grad = y/(link_f**2) - 1./link_f
        return grad
@ -103,7 +101,6 @@ class Exponential(Likelihood):
            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        hess = -1./(link_f**2)
        #hess = -2*y/(link_f**3) + 1/(link_f**2)
        return hess
@ -123,7 +120,6 @@ class Exponential(Likelihood):
        :returns: third derivative of likelihood evaluated at points f
        :rtype: Nx1 array
        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        d3lik_dlink3 = 2./(link_f**3)
        #d3lik_dlink3 = 6*y/(link_f**4) - 2./(link_f**3)
        return d3lik_dlink3
--- a/GPy/likelihoods/gamma.py
+++ b/GPy/likelihoods/gamma.py
@ -6,8 +6,8 @@ import numpy as np
 from scipy import stats,special
 import scipy as sp
 from ..core.parameterization import Param
-import link_functions
-from likelihood import Likelihood
+from . import link_functions
+from .likelihood import Likelihood

 class Gamma(Likelihood):
    """
@ -66,12 +66,11 @@ class Gamma(Likelihood):
        :rtype: float

        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        #alpha = self.gp_link.transf(gp)*self.beta
        #return (1. - alpha)*np.log(obs) + self.beta*obs - alpha * np.log(self.beta) + np.log(special.gamma(alpha))
        alpha = link_f*self.beta
        log_objective = alpha*np.log(self.beta) - np.log(special.gamma(alpha)) + (alpha - 1)*np.log(y) - self.beta*y
-        return np.sum(log_objective)
+        return log_objective

    def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
        """
@ -90,7 +89,6 @@ class Gamma(Likelihood):
        :rtype: Nx1 array

        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        grad = self.beta*np.log(self.beta*y) - special.psi(self.beta*link_f)*self.beta
        #old
        #return -self.gp_link.dtransf_df(gp)*self.beta*np.log(obs) + special.psi(self.gp_link.transf(gp)*self.beta) * self.gp_link.dtransf_df(gp)*self.beta
@ -118,7 +116,6 @@ class Gamma(Likelihood):
            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        hess = -special.polygamma(1, self.beta*link_f)*(self.beta**2)
        #old
        #return -self.gp_link.d2transf_df2(gp)*self.beta*np.log(obs) + special.polygamma(1,self.gp_link.transf(gp)*self.beta)*(self.gp_link.dtransf_df(gp)*self.beta)**2 + special.psi(self.gp_link.transf(gp)*self.beta)*self.gp_link.d2transf_df2(gp)*self.beta
@ -140,6 +137,5 @@ class Gamma(Likelihood):
        :returns: third derivative of likelihood evaluated at points f
        :rtype: Nx1 array
        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        d3lik_dlink3 = -special.polygamma(2, self.beta*link_f)*(self.beta**3)
        return d3lik_dlink3
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@ -13,8 +13,8 @@ James 11/12/13

 import numpy as np
 from scipy import stats, special
-import link_functions
-from likelihood import Likelihood
+from . import link_functions
+from .likelihood import Likelihood
 from ..core.parameterization import Param
 from ..core.parameterization.transformations import Logexp
 from scipy import stats
@ -34,7 +34,9 @@ class Gaussian(Likelihood):
        if gp_link is None:
            gp_link = link_functions.Identity()

-        assert isinstance(gp_link, link_functions.Identity), "the likelihood only implemented for the identity link"
+        if not isinstance(gp_link, link_functions.Identity):
+            print("Warning, Exact inference is not implemeted for non-identity link functions,\
+            if you are not already, ensure Laplace inference_method is used")

        super(Gaussian, self).__init__(gp_link, name=name)

@ -46,6 +48,7 @@ class Gaussian(Likelihood):

    def betaY(self,Y,Y_metadata=None):
        #TODO: ~Ricardo this does not live here
+        raise RuntimeError("Please notify the GPy developers, this should not happen")
        return Y/self.gaussian_variance(Y_metadata)

    def gaussian_variance(self, Y_metadata=None):
@ -130,11 +133,8 @@ class Gaussian(Likelihood):
        :returns: log likelihood evaluated for this point
        :rtype: float
        """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
-        N = y.shape[0]
-        ln_det_cov = N*np.log(self.variance)
-
-        return -0.5*(np.sum((y-link_f)**2/self.variance) + ln_det_cov + N*np.log(2.*np.pi))
+        ln_det_cov = np.log(self.variance)
+        return -(1.0/(2*self.variance))*((y-link_f)**2) - 0.5*ln_det_cov - 0.5*np.log(2.*np.pi)

    def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
        """
@ -151,8 +151,7 @@ class Gaussian(Likelihood):
        :returns: gradient of log likelihood evaluated at points link(f)
        :rtype: Nx1 array
        """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
-        s2_i = (1.0/self.variance)
+        s2_i = 1.0/self.variance
        grad = s2_i*y - s2_i*link_f
        return grad

@ -178,9 +177,9 @@ class Gaussian(Likelihood):
            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
        """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
        N = y.shape[0]
-        hess = -(1.0/self.variance)*np.ones((N, 1))
+        D = link_f.shape[1]
+        hess = -(1.0/self.variance)*np.ones((N, D))
        return hess

    def d3logpdf_dlink3(self, link_f, y, Y_metadata=None):
@ -198,9 +197,9 @@ class Gaussian(Likelihood):
        :returns: third derivative of log likelihood evaluated at points link(f)
        :rtype: Nx1 array
        """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
        N = y.shape[0]
-        d3logpdf_dlink3 = np.zeros((N,1))
+        D = link_f.shape[1]
+        d3logpdf_dlink3 = np.zeros((N,D))
        return d3logpdf_dlink3

    def dlogpdf_link_dvar(self, link_f, y, Y_metadata=None):
@ -218,12 +217,10 @@ class Gaussian(Likelihood):
        :returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
        :rtype: float
        """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
        e = y - link_f
        s_4 = 1.0/(self.variance**2)
-        N = y.shape[0]
-        dlik_dsigma = -0.5*N/self.variance + 0.5*s_4*np.sum(np.square(e))
-        return np.sum(dlik_dsigma) # Sure about this sum?
+        dlik_dsigma = -0.5/self.variance + 0.5*s_4*np.square(e)
+        return dlik_dsigma

    def dlogpdf_dlink_dvar(self, link_f, y, Y_metadata=None):
        """
@ -240,7 +237,6 @@ class Gaussian(Likelihood):
        :returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
        :rtype: Nx1 array
        """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
        s_4 = 1.0/(self.variance**2)
        dlik_grad_dsigma = -s_4*y + s_4*link_f
        return dlik_grad_dsigma
@ -260,23 +256,26 @@ class Gaussian(Likelihood):
        :returns: derivative of log hessian evaluated at points link(f_i) and link(f_j) w.r.t variance parameter
        :rtype: Nx1 array
        """
-        assert np.asarray(link_f).shape == np.asarray(y).shape
        s_4 = 1.0/(self.variance**2)
        N = y.shape[0]
-        d2logpdf_dlink2_dvar = np.ones((N,1))*s_4
+        D = link_f.shape[1]
+        d2logpdf_dlink2_dvar = np.ones((N, D))*s_4
        return d2logpdf_dlink2_dvar

    def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
-        dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, Y_metadata=Y_metadata)
-        return np.asarray([[dlogpdf_dvar]])
+        dlogpdf_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
+        dlogpdf_dtheta[0,:,:] = self.dlogpdf_link_dvar(f, y, Y_metadata=Y_metadata)
+        return dlogpdf_dtheta

    def dlogpdf_dlink_dtheta(self, f, y, Y_metadata=None):
-        dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, Y_metadata=Y_metadata)
-        return dlogpdf_dlink_dvar
+        dlogpdf_dlink_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
+        dlogpdf_dlink_dtheta[0, :, :]= self.dlogpdf_dlink_dvar(f, y, Y_metadata=Y_metadata)
+        return dlogpdf_dlink_dtheta

    def d2logpdf_dlink2_dtheta(self, f, y, Y_metadata=None):
-        d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, Y_metadata=Y_metadata)
-        return d2logpdf_dlink2_dvar
+        d2logpdf_dlink2_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
+        d2logpdf_dlink2_dtheta[0, :, :] = self.d2logpdf_dlink2_dvar(f, y, Y_metadata=Y_metadata)
+        return d2logpdf_dlink2_dtheta

    def _mean(self, gp):
        """
@ -309,10 +308,52 @@ class Gaussian(Likelihood):
        Ysim = np.array([np.random.normal(self.gp_link.transf(gpj), scale=np.sqrt(self.variance), size=1) for gpj in gp])
        return Ysim.reshape(orig_shape)

-    def log_predictive_density(self, y_test, mu_star, var_star):
+    def log_predictive_density(self, y_test, mu_star, var_star, Y_metadata=None):
        """
        assumes independence
        """
        v = var_star + self.variance
        return -0.5*np.log(2*np.pi) -0.5*np.log(v) - 0.5*np.square(y_test - mu_star)/v

+    def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
+        """
+        Closed-form E_q(f)[ log p(y|f) ] for q(f) = N(f | m, v) under the identity link.
+
+        With any other link the computation is delegated to the parent class.
+
+        :param Y: observations, indexed as a 2D array when shaping the parameter gradient
+        :param m: means of q(f)
+        :param v: variances of q(f)
+        :param gh_points: only forwarded to the parent implementation
+        :param Y_metadata: only forwarded to the parent implementation
+        :returns: (F, dF_dmu, dF_dv, dF_dtheta) with dF_dtheta of shape (1, Y.shape[0], Y.shape[1])
+        """
+        if isinstance(self.gp_link, link_functions.Identity):
+            noise = float(self.variance)
+            # expected squared residual under q: Y^2 + m^2 + v - 2mY
+            exp_sq_resid = np.square(Y) + np.square(m) + v - 2*m*Y
+
+            F = -0.5*np.log(2*np.pi) -0.5*np.log(noise) - 0.5*exp_sq_resid/noise
+            dF_dmu = (Y - m)/noise
+            dF_dv = np.ones_like(v)*(-0.5/noise)
+            dF_dtheta = -0.5/noise + 0.5*exp_sq_resid/(noise**2)
+            return F, dF_dmu, dF_dv, dF_dtheta.reshape(1, Y.shape[0], Y.shape[1])
+
+        return super(Gaussian, self).variational_expectations(Y=Y, m=m, v=v, gh_points=gh_points, Y_metadata=Y_metadata)
+
+class HeteroscedasticGaussian(Gaussian):
+    """
+    Gaussian likelihood whose variance parameter has one entry per element of
+    Y_metadata['output_index'] instead of a single shared value.
+
+    :param Y_metadata: must contain 'output_index'; its shape fixes the shape of the variance parameter
+    :param gp_link: link function, handed to Gaussian.__init__ unchanged
+    :param variance: initial value used for every entry of the variance parameter
+    :param name: name of the likelihood
+    """
+    def __init__(self, Y_metadata, gp_link=None, variance=1., name='het_Gauss'):
+        # Gaussian.__init__ already substitutes the Identity link for None and prints
+        # the non-identity-link warning; repeating both here printed the warning twice.
+        super(HeteroscedasticGaussian, self).__init__(gp_link, np.ones(Y_metadata['output_index'].shape)*variance, name)
+
+    def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None):
+        """Pick the entries of dL_dKdiag selected by Y_metadata['output_index']."""
+        return dL_dKdiag[Y_metadata['output_index']]
+
+    def gaussian_variance(self, Y_metadata=None):
+        """Variance entries selected by the flattened Y_metadata['output_index']."""
+        return self.variance[Y_metadata['output_index'].flatten()]
+
+    def predictive_values(self, mu, var, full_cov=False, Y_metadata=None):
+        """
+        Add the selected noise variances to the latent predictive variance.
+
+        The sum is written to a new array, so the caller's ``var`` is left untouched.
+
+        :param mu: predictive mean, returned as is
+        :param var: predictive variance (2D or 3D covariance when full_cov is set)
+        :param full_cov: when True the noise is added on the diagonal only
+        :param Y_metadata: must contain 'output_index'
+        :returns: (mu, var + noise)
+        """
+        _s = self.variance[Y_metadata['output_index'].flatten()]
+        if full_cov:
+            diag_noise = np.eye(var.shape[0])*_s
+            if var.ndim == 2:
+                var = var + diag_noise
+            elif var.ndim == 3:
+                var = var + np.atleast_3d(diag_noise)
+        else:
+            var = var + _s
+        return mu, var
+
+    def predictive_quantiles(self, mu, var, quantiles, Y_metadata=None):
+        """
+        Gaussian quantiles of the noisy prediction, one array per requested percentile.
+
+        :param quantiles: iterable of percentiles on the 0-100 scale
+        :param Y_metadata: must contain 'output_index'
+        """
+        _s = self.variance[Y_metadata['output_index'].flatten()]
+        return  [stats.norm.ppf(q/100.)*np.sqrt(var + _s) + mu for q in quantiles]
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@ -1,11 +1,11 @@
-# Copyright (c) 2012-2014 The GPy authors (see AUTHORS.txt)
+# Copyright (c) 2012-2015 The GPy authors (see AUTHORS.txt)
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
 from scipy import stats,special
 import scipy as sp
-import link_functions
-from ..util.misc import chain_1, chain_2, chain_3
+from . import link_functions
+from ..util.misc import chain_1, chain_2, chain_3, blockify_dhess_dtheta, blockify_third, blockify_hessian, safe_exp
 from scipy.integrate import quad
 import warnings
 from ..core.parameterization import Parameterized
@ -39,6 +39,15 @@ class Likelihood(Parameterized):
        assert isinstance(gp_link,link_functions.GPTransformation), "gp_link is not a valid GPTransformation."
        self.gp_link = gp_link
        self.log_concave = False
+        self.not_block_really = False
+
+    def request_num_latent_functions(self, Y):
+        """
+        Report how many latent functions this likelihood needs for the data Y.
+
+        The base behaviour is one latent function per output, i.e. the size
+        of the second axis of Y.
+        """
+        num_outputs = Y.shape[1]
+        return num_outputs

    def _gradients(self,partial):
        return np.zeros(0)
@ -69,7 +78,7 @@ class Likelihood(Parameterized):
        """
        raise NotImplementedError

-    def log_predictive_density(self, y_test, mu_star, var_star):
+    def log_predictive_density(self, y_test, mu_star, var_star, Y_metadata=None):
        """
        Calculation of the log predictive density

@ -86,17 +95,88 @@ class Likelihood(Parameterized):
        assert y_test.shape==mu_star.shape
        assert y_test.shape==var_star.shape
        assert y_test.shape[1] == 1
-        def integral_generator(y, m, v):
-            """Generate a function which can be integrated to give p(Y*|Y) = int p(Y*|f*)p(f*|Y) df*"""
-            def f(f_star):
-                return self.pdf(f_star, y)*np.exp(-(1./(2*v))*np.square(m-f_star))
+
+        flat_y_test = y_test.flatten()
+        flat_mu_star = mu_star.flatten()
+        flat_var_star = var_star.flatten()
+
+        if Y_metadata is not None:
+            #Need to zip individual elements of Y_metadata aswell
+            Y_metadata_flat = {}
+            if Y_metadata is not None:
+                for key, val in Y_metadata.items():
+                    Y_metadata_flat[key] = np.atleast_1d(val).reshape(-1,1)
+
+            zipped_values = []
+
+            for i in range(y_test.shape[0]):
+                y_m = {}
+                for key, val in Y_metadata_flat.items():
+                    if np.isscalar(val) or val.shape[0] == 1:
+                        y_m[key] = val
+                    else:
+                        #Won't broadcast yet
+                        y_m[key] = val[i]
+                zipped_values.append((flat_y_test[i], flat_mu_star[i], flat_var_star[i], y_m))
+        else:
+            #Otherwise just pass along None's
+            zipped_values = zip(flat_y_test, flat_mu_star, flat_var_star, [None]*y_test.shape[0])
+
+        def integral_generator(yi, mi, vi, yi_m):
+            """Generate a function which can be integrated
+            to give p(Y*|Y) = int p(Y*|f*)p(f*|Y) df*"""
+            def f(fi_star):
+                #exponent = np.exp(-(1./(2*vi))*np.square(mi-fi_star))
+                #from GPy.util.misc import safe_exp
+                #exponent = safe_exp(exponent)
+                #res = safe_exp(self.logpdf(fi_star, yi, yi_m))*exponent
+
+                #More stable in the log space
+                res = np.exp(self.logpdf(fi_star, yi, yi_m)
+                              - 0.5*np.log(2*np.pi*vi)
+                              - 0.5*np.square(fi_star-mi)/vi)
+                if not np.isfinite(res):
+                    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+                return res
+
            return f

-        scaled_p_ystar, accuracy = zip(*[quad(integral_generator(y, m, v), -np.inf, np.inf) for y, m, v in zip(y_test.flatten(), mu_star.flatten(), var_star.flatten())])
-        scaled_p_ystar = np.array(scaled_p_ystar).reshape(-1,1)
-        p_ystar = scaled_p_ystar/np.sqrt(2*np.pi*var_star)
+        p_ystar, _ = zip(*[quad(integral_generator(yi, mi, vi, yi_m), -np.inf, np.inf)
+                           for yi, mi, vi, yi_m in zipped_values])
+        p_ystar = np.array(p_ystar).reshape(*y_test.shape)
        return np.log(p_ystar)

+    def log_predictive_density_sampling(self, y_test, mu_star, var_star, Y_metadata=None, num_samples=1000):
+        """
+        Calculation of the log predictive density via sampling
+
+        .. math:
+            log p(y_{*}|D) = log 1/num_samples sum^{S}_{s=1} p(y_{*}|f_{*s})
+            f_{*s} ~ p(f_{*}|\\mu_{*}, \\sigma^{2}_{*})
+
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type mu_star: (Nx1) array
+        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type var_star: (Nx1) array
+        :param Y_metadata: passed through to logpdf
+        :param num_samples: num samples of p(f_{*}|mu_{*}, var_{*}) to take
+        :type num_samples: int
+        :returns: log predictive density estimate, same shape as y_test
+        """
+        assert y_test.shape==mu_star.shape
+        assert y_test.shape==var_star.shape
+        assert y_test.shape[1] == 1
+
+        # logsumexp lives in scipy.special on current SciPy; older releases
+        # only provide it in scipy.misc
+        try:
+            from scipy.special import logsumexp
+        except ImportError:
+            from scipy.misc import logsumexp
+
+        #Take samples of p(f*|y): one row per test point, one column per sample
+        fi_samples = np.random.normal(mu_star, np.sqrt(var_star), size=(mu_star.shape[0], num_samples))
+
+        #Monte Carlo average of the likelihood, accumulated in log space
+        log_p_ystar = -np.log(num_samples) + logsumexp(self.logpdf(fi_samples, y_test, Y_metadata=Y_metadata), axis=1)
+        log_p_ystar = np.array(log_p_ystar).reshape(*y_test.shape)
+        return log_p_ystar
+
+
    def _moments_match_ep(self,obs,tau,v):
        """
        Calculation of moments using quadrature
@ -131,9 +211,16 @@ class Likelihood(Parameterized):

        return z, mean, variance

-    def variational_expectations(self, Y, m, v, gh_points=None):
+    #only compute gh points if required
+    __gh_points = None
+    def _gh_points(self, T=20):
+        """
+        Return the (nodes, weights) pair of a T-point Gauss-Hermite rule.
+
+        The most recently computed pair is kept on the instance and reused
+        while T stays the same; asking for a different T recomputes it, so
+        the result always has the requested number of points.
+
+        :param T: number of quadrature points
+        :returns: tuple (nodes, weights) as given by np.polynomial.hermite.hermgauss
+        """
+        cached = self.__gh_points
+        if cached is None or len(cached[0]) != T:
+            cached = np.polynomial.hermite.hermgauss(T)
+            self.__gh_points = cached
+        return cached
+
+    def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
        """
-        Use Gauss-Hermite Quadrature to compute 
+        Use Gauss-Hermite Quadrature to compute

           E_p(f) [ log p(y|f) ]
           d/dm E_p(f) [ log p(y|f) ]
@ -145,7 +232,7 @@ class Likelihood(Parameterized):
        """

        if gh_points is None:
-            gh_x, gh_w = np.polynomial.hermite.hermgauss(12)
+            gh_x, gh_w = self._gh_points()
        else:
            gh_x, gh_w = gh_points

@ -156,30 +243,35 @@ class Likelihood(Parameterized):
        X = gh_x[None,:]*np.sqrt(2.*v[:,None]) + m[:,None]

        #evaluate the likelhood for the grid. First ax indexes the data (and mu, var) and the second indexes the grid.
-        # broadcast needs to be handled carefully. 
-        logp = self.logpdf(X,Y[:,None])
-        dlogp_dx = self.dlogpdf_df(X, Y[:,None])
-        d2logp_dx2 = self.d2logpdf_df2(X, Y[:,None])
+        # broadcast needs to be handled carefully.
+        logp = self.logpdf(X,Y[:,None], Y_metadata=Y_metadata)
+        dlogp_dx = self.dlogpdf_df(X, Y[:,None], Y_metadata=Y_metadata)
+        d2logp_dx2 = self.d2logpdf_df2(X, Y[:,None], Y_metadata=Y_metadata)

        #clipping for numerical stability
-        logp = np.clip(logp,-1e6,1e6)
-        dlogp_dx = np.clip(dlogp_dx,-1e6,1e6)
-        d2logp_dx2 = np.clip(d2logp_dx2,-1e6,1e6)
+        #logp = np.clip(logp,-1e9,1e9)
+        #dlogp_dx = np.clip(dlogp_dx,-1e9,1e9)
+        #d2logp_dx2 = np.clip(d2logp_dx2,-1e9,1e9)

        #average over the gird to get derivatives of the Gaussian's parameters
-        F = np.dot(logp, gh_w)
-        dF_dm = np.dot(dlogp_dx, gh_w)
-        dF_dv = np.dot(d2logp_dx2, gh_w)/2.
+        #division by pi comes from fact that for each quadrature we need to scale by 1/sqrt(pi)
+        F = np.dot(logp, gh_w)/np.sqrt(np.pi)
+        dF_dm = np.dot(dlogp_dx, gh_w)/np.sqrt(np.pi)
+        dF_dv = np.dot(d2logp_dx2, gh_w)/np.sqrt(np.pi)
+        dF_dv /= 2.

        if np.any(np.isnan(dF_dv)) or np.any(np.isinf(dF_dv)):
            stop
        if np.any(np.isnan(dF_dm)) or np.any(np.isinf(dF_dm)):
            stop

-        return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape)
-
-
-
+        if self.size:
+            dF_dtheta = self.dlogpdf_dtheta(X, Y[:,None], Y_metadata=Y_metadata) # Ntheta x (orig size) x N_{quad_points}
+            dF_dtheta = np.dot(dF_dtheta, gh_w)/np.sqrt(np.pi)
+            dF_dtheta = dF_dtheta.reshape(self.size, shape[0], shape[1])
+        else:
+            dF_dtheta = None # Not yet implemented
+        return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape), dF_dtheta

    def predictive_mean(self, mu, variance, Y_metadata=None):
        """
@ -190,28 +282,30 @@ class Likelihood(Parameterized):

        """
        #conditional_mean: the edpected value of y given some f, under this likelihood
+        fmin = -np.inf
+        fmax = np.inf
        def int_mean(f,m,v):
-            p = np.exp(-(0.5/v)*np.square(f - m))
+            exponent = -(0.5/v)*np.square(f - m)
+            #If exponent is under -30 then exp(exponent) will be very small, so don't exp it!)
            #If p is zero then conditional_mean will overflow
+            assert v.all() > 0
+            p = safe_exp(exponent)
+
+            #If p is zero then conditional_variance will overflow
            if p < 1e-10:
                return 0.
            else:
                return self.conditional_mean(f)*p
-        scaled_mean = [quad(int_mean, -np.inf, np.inf,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
+        scaled_mean = [quad(int_mean, fmin, fmax,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
        mean = np.array(scaled_mean)[:,None] / np.sqrt(2*np.pi*(variance))
-
        return mean

-    def _conditional_mean(self, f):
-        """Quadrature calculation of the conditional mean: E(Y_star|f)"""
-        raise NotImplementedError, "implement this function to make predictions"
-
    def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None):
        """
        Approximation to the predictive variance: V(Y_star)

        The following variance decomposition is used:
-        V(Y_star) = E( V(Y_star|f_star) ) + V( E(Y_star|f_star) )
+        V(Y_star) = E( V(Y_star|f_star)**2 ) + V( E(Y_star|f_star) )**2

        :param mu: mean of posterior
        :param sigma: standard deviation of posterior
@ -221,15 +315,22 @@ class Likelihood(Parameterized):
        #sigma2 = sigma**2
        normalizer = np.sqrt(2*np.pi*variance)

+        fmin_v = -np.inf
+        fmin_m = np.inf
+        fmin = -np.inf
+        fmax = np.inf
+
+        from ..util.misc import safe_exp
        # E( V(Y_star|f_star) )
        def int_var(f,m,v):
-            p = np.exp(-(0.5/v)*np.square(f - m))
+            exponent = -(0.5/v)*np.square(f - m)
+            p = safe_exp(exponent)
            #If p is zero then conditional_variance will overflow
            if p < 1e-10:
                return 0.
            else:
                return self.conditional_variance(f)*p
-        scaled_exp_variance = [quad(int_var, -np.inf, np.inf,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
+        scaled_exp_variance = [quad(int_var, fmin_v, fmax,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
        exp_var = np.array(scaled_exp_variance)[:,None] / normalizer

        #V( E(Y_star|f_star) ) =  E( E(Y_star|f_star)**2 ) - E( E(Y_star|f_star) )**2
@ -241,14 +342,15 @@ class Likelihood(Parameterized):

        #E( E(Y_star|f_star)**2 )
        def int_pred_mean_sq(f,m,v,predictive_mean_sq):
-            p = np.exp(-(0.5/v)*np.square(f - m))
+            exponent = -(0.5/v)*np.square(f - m)
+            p = np.exp(exponent)
            #If p is zero then conditional_mean**2 will overflow
            if p < 1e-10:
                return 0.
            else:
                return self.conditional_mean(f)**2*p

-        scaled_exp_exp2 = [quad(int_pred_mean_sq, -np.inf, np.inf,args=(mj,s2j,pm2j))[0] for mj,s2j,pm2j in zip(mu,variance,predictive_mean_sq)]
+        scaled_exp_exp2 = [quad(int_pred_mean_sq, fmin_m, fmax,args=(mj,s2j,pm2j))[0] for mj,s2j,pm2j in zip(mu,variance,predictive_mean_sq)]
        exp_exp2 = np.array(scaled_exp_exp2)[:,None] / normalizer

        var_exp = exp_exp2 - predictive_mean_sq
@ -296,8 +398,18 @@ class Likelihood(Parameterized):
        :returns: likelihood evaluated for this point
        :rtype: float
        """
-        inv_link_f = self.gp_link.transf(f)
-        return self.pdf_link(inv_link_f, y, Y_metadata=Y_metadata)
+        if isinstance(self.gp_link, link_functions.Identity):
+            return self.pdf_link(f, y, Y_metadata=Y_metadata)
+        else:
+            inv_link_f = self.gp_link.transf(f)
+            return self.pdf_link(inv_link_f, y, Y_metadata=Y_metadata)
+
+    def logpdf_sum(self, f, y, Y_metadata=None):
+        """
+        Convenience function that can be overridden for functions where this could
+        be computed more efficiently
+        """
+        return np.sum(self.logpdf(f, y, Y_metadata=Y_metadata))

    def logpdf(self, f, y, Y_metadata=None):
        """
@ -314,8 +426,11 @@ class Likelihood(Parameterized):
        :returns: log likelihood evaluated for this point
        :rtype: float
        """
-        inv_link_f = self.gp_link.transf(f)
-        return self.logpdf_link(inv_link_f, y, Y_metadata=Y_metadata)
+        if isinstance(self.gp_link, link_functions.Identity):
+            return self.logpdf_link(f, y, Y_metadata=Y_metadata)
+        else:
+            inv_link_f = self.gp_link.transf(f)
+            return self.logpdf_link(inv_link_f, y, Y_metadata=Y_metadata)

    def dlogpdf_df(self, f, y, Y_metadata=None):
        """
@ -333,11 +448,15 @@ class Likelihood(Parameterized):
        :returns: derivative of log likelihood evaluated for this point
        :rtype: 1xN array
        """
-        inv_link_f = self.gp_link.transf(f)
-        dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
-        dlink_df = self.gp_link.dtransf_df(f)
-        return chain_1(dlogpdf_dlink, dlink_df)
+        if isinstance(self.gp_link, link_functions.Identity):
+            return self.dlogpdf_dlink(f, y, Y_metadata=Y_metadata)
+        else:
+            inv_link_f = self.gp_link.transf(f)
+            dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
+            dlink_df = self.gp_link.dtransf_df(f)
+            return chain_1(dlogpdf_dlink, dlink_df)

+    @blockify_hessian
    def d2logpdf_df2(self, f, y, Y_metadata=None):
        """
        Evaluates the link function link(f) then computes the second derivative of log likelihood using it
@ -354,13 +473,18 @@ class Likelihood(Parameterized):
        :returns: second derivative of log likelihood evaluated for this point (diagonal only)
        :rtype: 1xN array
        """
-        inv_link_f = self.gp_link.transf(f)
-        d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata)
-        dlink_df = self.gp_link.dtransf_df(f)
-        dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
-        d2link_df2 = self.gp_link.d2transf_df2(f)
-        return chain_2(d2logpdf_dlink2, dlink_df, dlogpdf_dlink, d2link_df2)
+        if isinstance(self.gp_link, link_functions.Identity):
+            d2logpdf_df2 = self.d2logpdf_dlink2(f, y, Y_metadata=Y_metadata)
+        else:
+            inv_link_f = self.gp_link.transf(f)
+            d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata)
+            dlink_df = self.gp_link.dtransf_df(f)
+            dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
+            d2link_df2 = self.gp_link.d2transf_df2(f)
+            d2logpdf_df2 = chain_2(d2logpdf_dlink2, dlink_df, dlogpdf_dlink, d2link_df2)
+        return d2logpdf_df2

+    @blockify_third
    def d3logpdf_df3(self, f, y, Y_metadata=None):
        """
        Evaluates the link function link(f) then computes the third derivative of log likelihood using it
@ -377,53 +501,85 @@ class Likelihood(Parameterized):
        :returns: third derivative of log likelihood evaluated for this point
        :rtype: float
        """
-        inv_link_f = self.gp_link.transf(f)
-        d3logpdf_dlink3 = self.d3logpdf_dlink3(inv_link_f, y, Y_metadata=Y_metadata)
-        dlink_df = self.gp_link.dtransf_df(f)
-        d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata)
-        d2link_df2 = self.gp_link.d2transf_df2(f)
-        dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
-        d3link_df3 = self.gp_link.d3transf_df3(f)
-        return chain_3(d3logpdf_dlink3, dlink_df, d2logpdf_dlink2, d2link_df2, dlogpdf_dlink, d3link_df3)
+        if isinstance(self.gp_link, link_functions.Identity):
+            d3logpdf_df3 = self.d3logpdf_dlink3(f, y, Y_metadata=Y_metadata)
+        else:
+            inv_link_f = self.gp_link.transf(f)
+            d3logpdf_dlink3 = self.d3logpdf_dlink3(inv_link_f, y, Y_metadata=Y_metadata)
+            dlink_df = self.gp_link.dtransf_df(f)
+            d2logpdf_dlink2 = self.d2logpdf_dlink2(inv_link_f, y, Y_metadata=Y_metadata)
+            d2link_df2 = self.gp_link.d2transf_df2(f)
+            dlogpdf_dlink = self.dlogpdf_dlink(inv_link_f, y, Y_metadata=Y_metadata)
+            d3link_df3 = self.gp_link.d3transf_df3(f)
+            d3logpdf_df3 = chain_3(d3logpdf_dlink3, dlink_df, d2logpdf_dlink2, d2link_df2, dlogpdf_dlink, d3link_df3)
+        return d3logpdf_df3
+

    def dlogpdf_dtheta(self, f, y, Y_metadata=None):
        """
        TODO: Doc strings
        """
        if self.size > 0:
-            inv_link_f = self.gp_link.transf(f)
-            return self.dlogpdf_link_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
+            if self.not_block_really:
+                raise NotImplementedError("Need to make a decorator for this!")
+            if isinstance(self.gp_link, link_functions.Identity):
+                return self.dlogpdf_link_dtheta(f, y, Y_metadata=Y_metadata)
+            else:
+                inv_link_f = self.gp_link.transf(f)
+                return self.dlogpdf_link_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
        else:
            # There are no parameters so return an empty array for derivatives
-            return np.zeros([1, 0])
+            return np.zeros((0, f.shape[0], f.shape[1]))

    def dlogpdf_df_dtheta(self, f, y, Y_metadata=None):
        """
        TODO: Doc strings
        """
        if self.size > 0:
-            inv_link_f = self.gp_link.transf(f)
-            dlink_df = self.gp_link.dtransf_df(f)
-            dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
-            return chain_1(dlogpdf_dlink_dtheta, dlink_df)
+            if self.not_block_really:
+                raise NotImplementedError("Need to make a decorator for this!")
+            if isinstance(self.gp_link, link_functions.Identity):
+                return self.dlogpdf_dlink_dtheta(f, y, Y_metadata=Y_metadata)
+            else:
+                inv_link_f = self.gp_link.transf(f)
+                dlink_df = self.gp_link.dtransf_df(f)
+                dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
+
+                dlogpdf_df_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
+                #Chain each parameter of the likelihood separately
+                for p in range(self.size):
+                    dlogpdf_df_dtheta[p, :, :] = chain_1(dlogpdf_dlink_dtheta[p,:,:], dlink_df)
+                return dlogpdf_df_dtheta
+                #return chain_1(dlogpdf_dlink_dtheta, dlink_df)
        else:
            # There are no parameters so return an empty array for derivatives
-            return np.zeros([f.shape[0], 0])
+            return np.zeros((0, f.shape[0], f.shape[1]))

    def d2logpdf_df2_dtheta(self, f, y, Y_metadata=None):
        """
        TODO: Doc strings
        """
        if self.size > 0:
-            inv_link_f = self.gp_link.transf(f)
-            dlink_df = self.gp_link.dtransf_df(f)
-            d2link_df2 = self.gp_link.d2transf_df2(f)
-            d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
-            dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
-            return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2)
+            if self.not_block_really:
+                raise NotImplementedError("Need to make a decorator for this!")
+            if isinstance(self.gp_link, link_functions.Identity):
+                return self.d2logpdf_dlink2_dtheta(f, y, Y_metadata=Y_metadata)
+            else:
+                inv_link_f = self.gp_link.transf(f)
+                dlink_df = self.gp_link.dtransf_df(f)
+                d2link_df2 = self.gp_link.d2transf_df2(f)
+                d2logpdf_dlink2_dtheta = self.d2logpdf_dlink2_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
+                dlogpdf_dlink_dtheta = self.dlogpdf_dlink_dtheta(inv_link_f, y, Y_metadata=Y_metadata)
+
+                d2logpdf_df2_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
+                #Chain each parameter of the likelihood separately
+                for p in range(self.size):
+                    d2logpdf_df2_dtheta[p, :, :] = chain_2(d2logpdf_dlink2_dtheta[p,:,:], dlink_df, dlogpdf_dlink_dtheta[p,:,:], d2link_df2)
+                return d2logpdf_df2_dtheta
+                #return chain_2(d2logpdf_dlink2_dtheta, dlink_df, dlogpdf_dlink_dtheta, d2link_df2)
        else:
            # There are no parameters so return an empty array for derivatives
-            return np.zeros([f.shape[0], 0])
+            return np.zeros((0, f.shape[0], f.shape[1]))

    def _laplace_gradients(self, f, y, Y_metadata=None):
        dlogpdf_dtheta = self.dlogpdf_dtheta(f, y, Y_metadata=Y_metadata)
@ -432,9 +588,9 @@ class Likelihood(Parameterized):

        #Parameters are stacked vertically. Must be listed in same order as 'get_param_names'
        # ensure we have gradients for every parameter we want to optimize
-        assert len(dlogpdf_dtheta) == self.size #1 x num_param array
-        assert dlogpdf_df_dtheta.shape[1] == self.size #f x num_param matrix
-        assert d2logpdf_df2_dtheta.shape[1] == self.size #f x num_param matrix
+        assert dlogpdf_dtheta.shape[0] == self.size #num_param array x f, d
+        assert dlogpdf_df_dtheta.shape[0] == self.size #num_param x f x d x matrix or just num_param x f
+        assert d2logpdf_df2_dtheta.shape[0] == self.size #num_param x f matrix or num_param x f x d x matrix, num_param x f x f or num_param x f x f x d

        return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta

@ -447,27 +603,113 @@ class Likelihood(Parameterized):
        :param full_cov: whether to use the full covariance or just the diagonal
        :type full_cov: Boolean
        """
-
-        pred_mean = self.predictive_mean(mu, var, Y_metadata)
-        pred_var = self.predictive_variance(mu, var, pred_mean, Y_metadata)
+        try:
+            pred_mean = self.predictive_mean(mu, var, Y_metadata=Y_metadata)
+            pred_var = self.predictive_variance(mu, var, pred_mean, Y_metadata=Y_metadata)
+        except NotImplementedError:
+            print("Finding predictive mean and variance via sampling rather than quadrature")
+            Nf_samp = 300
+            Ny_samp = 1
+            s = np.random.randn(mu.shape[0], Nf_samp)*np.sqrt(var) + mu
+            ss_y = self.samples(s, Y_metadata, samples=Ny_samp)
+            pred_mean = np.mean(ss_y, axis=1)[:, None]
+            pred_var = np.var(ss_y, axis=1)[:, None]

        return pred_mean, pred_var

    def predictive_quantiles(self, mu, var, quantiles, Y_metadata=None):
        #compute the quantiles by sampling!!!
-        N_samp = 1000
-        s = np.random.randn(mu.shape[0], N_samp)*np.sqrt(var) + mu
-        #ss_f = s.flatten()
-        #ss_y = self.samples(ss_f, Y_metadata)
-        ss_y = self.samples(s, Y_metadata)
-        #ss_y = ss_y.reshape(mu.shape[0], N_samp)
+        Nf_samp = 300
+        Ny_samp = 1
+        s = np.random.randn(mu.shape[0], Nf_samp)*np.sqrt(var) + mu
+        ss_y = self.samples(s, Y_metadata, samples=Ny_samp)
+        #ss_y = ss_y.reshape(mu.shape[0], mu.shape[1], Nf_samp*Ny_samp)

-        return [np.percentile(ss_y ,q, axis=1)[:,None] for q in quantiles]
+        pred_quantiles = [np.percentile(ss_y, q, axis=1)[:,None] for q in quantiles]
+        return pred_quantiles

-    def samples(self, gp, Y_metadata=None):
+    def samples(self, gp, Y_metadata=None, samples=1):
        """
        Returns a set of samples of observations based on a given value of the latent variable.

        :param gp: latent variable
+        :param samples: number of samples to take for each f location
        """
-        raise NotImplementedError
+        raise NotImplementedError("""May be possible to use MCMC with user-tuning, see
+                                  MCMC_pdf_samples in likelihood.py and write samples function
+                                  using this, beware this is a simple implementation
+                                  of Metropolis and will not work well for all likelihoods""")
+
+    def MCMC_pdf_samples(self, fNew, num_samples=1000, starting_loc=None, stepsize=0.1, burn_in=1000, Y_metadata=None):
+        """
+        Simple implementation of Metropolis sampling algorithm
+
+        Will run a parallel chain for each input dimension (treats each f independently)
+        Thus assumes f*_1 independent of f*_2 etc.
+
+        :param num_samples: Number of samples to take
+        :param fNew: f at which to sample around
+        :param starting_loc: Starting locations of the independent chains (usually will be conditional_mean of likelihood), often link_f
+        :param stepsize: Stepsize for the normal proposal distribution (will need modifying)
+        :param burn_in: number of samples to use for burn-in (will need modifying)
+        :param Y_metadata: Y_metadata for pdf
+        """
+        print("Warning, using MCMC for sampling y*, needs to be tuned!")
+        if starting_loc is None:
+            starting_loc = fNew
+        from functools import partial
+        logpdf = partial(self.logpdf, f=fNew, Y_metadata=Y_metadata)
+        pdf = lambda y_star: np.exp(logpdf(y=y_star[:, None]))
+        #The link function of f should be a good starting point
+        #(i.e. the point before you corrupt it with the likelihood)
+        par_chains = starting_loc.shape[0]
+        chain_values = np.zeros((par_chains, num_samples))
+        chain_values[:, 0][:,None] = starting_loc
+        #Use same stepsize for all par_chains
+        stepsize = np.ones(par_chains)*stepsize
+        accepted = np.zeros((par_chains, num_samples+burn_in))
+        accept_ratio = np.zeros(num_samples+burn_in)
+        #Whilst burning in, only need to keep the previous lot
+        burnin_cache = np.zeros(par_chains)
+        burnin_cache[:] = starting_loc.flatten()
+        burning_in = True
+        for i in xrange(burn_in+num_samples):
+            next_ind = i-burn_in
+            if burning_in:
+                old_y = burnin_cache
+            else:
+                old_y = chain_values[:,next_ind-1]
+
+            old_lik = pdf(old_y)
+            #Propose new y from Gaussian proposal
+            new_y = np.random.normal(loc=old_y, scale=stepsize)
+            new_lik = pdf(new_y)
+            #Accept using Metropolis (not hastings) acceptance
+            #Always accepts if new_lik > old_lik
+            accept_probability = np.minimum(1, new_lik/old_lik)
+            u = np.random.uniform(0,1,par_chains)
+            #print "Accept prob: ", accept_probability
+            accepts = u < accept_probability
+            if burning_in:
+                burnin_cache[accepts] = new_y[accepts]
+                burnin_cache[~accepts] = old_y[~accepts]
+                if i == burn_in:
+                    burning_in = False
+                    chain_values[:,0] = burnin_cache
+            else:
+                #If it was accepted then new_y becomes the latest sample
+                chain_values[accepts, next_ind] = new_y[accepts]
+                #Otherwise use old y as the sample
+                chain_values[~accepts, next_ind] = old_y[~accepts]
+
+            accepted[~accepts, i] = 0
+            accepted[accepts, i] = 1
+            accept_ratio[i] = np.sum(accepted[:,i])/float(par_chains)
+
+            #Show progress
+            if i % int((burn_in+num_samples)*0.1) == 0:
+                print("{}% of samples taken ({})".format((i/int((burn_in+num_samples)*0.1)*10), i))
+                print("Last run accept ratio: ", accept_ratio[i])
+
+        print("Average accept ratio: ", np.mean(accept_ratio))
+        return chain_values
--- a/GPy/likelihoods/link_functions.py
+++ b/GPy/likelihoods/link_functions.py
@ -1,13 +1,11 @@
-# Copyright (c) 2012-2014 The GPy authors (see AUTHORS.txt)
+# Copyright (c) 2012-2015 The GPy authors (see AUTHORS.txt)
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
-from scipy import stats
+import scipy
+from ..util.univariate_Gaussian import std_norm_cdf, std_norm_pdf
 import scipy as sp
-from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf,inv_std_norm_cdf
-
-_exp_lim_val = np.finfo(np.float64).max
-_lim_val = np.log(_exp_lim_val)
+from ..util.misc import safe_exp, safe_square, safe_cube, safe_quad, safe_three_times

 class GPTransformation(object):
    """
@ -70,7 +68,7 @@ class Probit(GPTransformation):
    .. math::

        g(f) = \\Phi^{-1} (mu)
-    
+
    """
    def transf(self,f):
        return std_norm_cdf(f)
@ -79,13 +77,10 @@ class Probit(GPTransformation):
        return std_norm_pdf(f)

    def d2transf_df2(self,f):
-        #FIXME
        return -f * std_norm_pdf(f)

    def d3transf_df3(self,f):
-        #FIXME
-        f2 = f**2
-        return -(1/(np.sqrt(2*np.pi)))*np.exp(-0.5*(f2))*(1-f2)
+        return (safe_square(f)-1.)*std_norm_pdf(f)


 class Cloglog(GPTransformation):
@ -98,22 +93,26 @@ class Cloglog(GPTransformation):
        or

        f = \log (-\log(1-p))
-    
+
    """
    def transf(self,f):
-        return 1-np.exp(-np.exp(f))
+        ef = safe_exp(f)
+        return 1-np.exp(-ef)

    def dtransf_df(self,f):
-        return np.exp(f-np.exp(f))
+        ef = safe_exp(f)
+        return np.exp(f-ef)

    def d2transf_df2(self,f):
-        ef = np.exp(f)
+        ef = safe_exp(f)
        return -np.exp(f-ef)*(ef-1.)

    def d3transf_df3(self,f):
-        ef = np.exp(f)
-        return np.exp(f-ef)*(1.-3*ef + ef**2)
-
+        ef = safe_exp(f)
+        ef2 = safe_square(ef)
+        three_times_ef = safe_three_times(ef)
+        r_val = np.exp(f-ef)*(1.-three_times_ef + ef2)
+        return r_val

 class Log(GPTransformation):
    """
@ -123,16 +122,16 @@ class Log(GPTransformation):

    """
    def transf(self,f):
-        return np.exp(np.clip(f, -_lim_val, _lim_val))
+        return safe_exp(f)

    def dtransf_df(self,f):
-        return np.exp(np.clip(f, -_lim_val, _lim_val))
+        return safe_exp(f)

    def d2transf_df2(self,f):
-        return np.exp(np.clip(f, -_lim_val, _lim_val))
+        return safe_exp(f)

    def d3transf_df3(self,f):
-        return np.exp(np.clip(f, -_lim_val, _lim_val))
+        return safe_exp(f)

 class Log_ex_1(GPTransformation):
    """
@ -142,17 +141,20 @@ class Log_ex_1(GPTransformation):

    """
    def transf(self,f):
-        return np.log(1.+np.exp(f))
+        return scipy.special.log1p(safe_exp(f))

    def dtransf_df(self,f):
-        return np.exp(f)/(1.+np.exp(f))
+        ef = safe_exp(f)
+        return ef/(1.+ef)

    def d2transf_df2(self,f):
-        aux = np.exp(f)/(1.+np.exp(f))
+        ef = safe_exp(f)
+        aux = ef/(1.+ef)
        return aux*(1.-aux)

    def d3transf_df3(self,f):
-        aux = np.exp(f)/(1.+np.exp(f))
+        ef = safe_exp(f)
+        aux = ef/(1.+ef)
        daux_df = aux*(1.-aux)
        return daux_df - (2.*aux*daux_df)

@ -160,21 +162,24 @@ class Reciprocal(GPTransformation):
    def transf(self,f):
        return 1./f

-    def dtransf_df(self,f):
-        return -1./(f**2)
+    def dtransf_df(self, f):
+        f2 = safe_square(f)
+        return -1./f2

-    def d2transf_df2(self,f):
-        return 2./(f**3)
+    def d2transf_df2(self, f):
+        f3 = safe_cube(f)
+        return 2./f3

    def d3transf_df3(self,f):
-        return -6./(f**4)
+        f4 = safe_quad(f)
+        return -6./f4

 class Heaviside(GPTransformation):
    """

    .. math::

-        g(f) = I_{x \\in A}
+        g(f) = I_{f > 0}

    """
    def transf(self,f):
@ -182,7 +187,7 @@ class Heaviside(GPTransformation):
        return np.where(f>0, 1, 0)

    def dtransf_df(self,f):
-        raise NotImplementedError, "This function is not differentiable!"
+        raise NotImplementedError("This function is not differentiable!")

    def d2transf_df2(self,f):
-        raise NotImplementedError, "This function is not differentiable!"
+        raise NotImplementedError("This function is not differentiable!")
--- a/GPy/likelihoods/mixed_noise.py
+++ b/GPy/likelihoods/mixed_noise.py
@ -3,9 +3,9 @@

 import numpy as np
 from scipy import stats, special
-import link_functions
-from likelihood import Likelihood
-from gaussian import Gaussian
+from . import link_functions
+from .likelihood import Likelihood
+from .gaussian import Gaussian
 from ..core.parameterization import Param
 from ..core.parameterization.transformations import Logexp
 from ..core.parameterization import Parameterized
--- a/GPy/likelihoods/poisson.py
+++ b/GPy/likelihoods/poisson.py
@ -5,8 +5,8 @@ from __future__ import division
 import numpy as np
 from scipy import stats,special
 import scipy as sp
-import link_functions
-from likelihood import Likelihood
+from . import link_functions
+from .likelihood import Likelihood

 class Poisson(Likelihood):
    """
@ -64,8 +64,7 @@ class Poisson(Likelihood):
        :rtype: float

        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
-        return np.sum(-link_f + y*np.log(link_f) - special.gammaln(y+1))
+        return -link_f + y*np.log(link_f) - special.gammaln(y+1)

    def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
        """
@ -83,7 +82,6 @@ class Poisson(Likelihood):
        :rtype: Nx1 array

        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        return y/link_f - 1

    def d2logpdf_dlink2(self, link_f, y, Y_metadata=None):
@ -107,12 +105,7 @@ class Poisson(Likelihood):
            Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
            (the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
-        hess = -y/(link_f**2)
-        return hess
-        #d2_df = self.gp_link.d2transf_df2(gp)
-        #transf = self.gp_link.transf(gp)
-        #return obs * ((self.gp_link.dtransf_df(gp)/transf)**2 - d2_df/transf) + d2_df
+        return -y/(link_f**2)

    def d3logpdf_dlink3(self, link_f, y, Y_metadata=None):
        """
@ -129,7 +122,6 @@ class Poisson(Likelihood):
        :returns: third derivative of likelihood evaluated at points f
        :rtype: Nx1 array
        """
-        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        d3lik_dlink3 = 2*y/(link_f)**3
        return d3lik_dlink3

@ -145,7 +137,7 @@ class Poisson(Likelihood):
        """
        return self.gp_link.transf(gp)

-    def samples(self, gp, Y_metadata=None):
+    def samples(self, gp, Y_metadata=None, samples=1):
        """
        Returns a set of samples of observations based on a given value of the latent variable.

@ -153,5 +145,5 @@ class Poisson(Likelihood):
        """
        orig_shape = gp.shape
        gp = gp.flatten()
-        Ysim = np.random.poisson(self.gp_link.transf(gp))
-        return Ysim.reshape(orig_shape)
+        Ysim = np.random.poisson(self.gp_link.transf(gp), [samples, gp.size]).T
+        return Ysim.reshape(orig_shape+(samples,))
--- a/Show more
+++ b/Show more