Merge branch 'devel' of github.com:SheffieldML/GPy into devel

2026-06-20 15:38:08 +02:00 · 2015-08-28 16:28:25 +01:00 · 2015-08-28 16:28:25 +01:00 · 938cc49aed
commit 938cc49aed
parent 4bd99c674f 9564287cf3
83 changed files with 35983 additions and 4358 deletions
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@ -60,9 +60,11 @@ class GP(Model):
            self.normalizer.scale_by(Y)
            self.Y_normalized = ObsAr(self.normalizer.normalize(Y))
            self.Y = Y
-        else:
+        elif isinstance(Y, np.ndarray):
            self.Y = ObsAr(Y)
            self.Y_normalized = self.Y
+        else:
+            self.Y = Y

        if Y.shape[0] != self.num_data:
            #There can be cases where we want inputs than outputs, for example if we have multiple latent
@ -89,7 +91,6 @@ class GP(Model):
            assert mean_function.output_dim == self.output_dim
            self.link_parameter(mean_function)

-
        #find a sensible inference method
        logger.info("initializing inference method")
        if inference_method is None:
@ -182,7 +183,7 @@ class GP(Model):
        """
        return self._log_marginal_likelihood

-    def _raw_predict(self, _Xnew, full_cov=False, kern=None):
+    def _raw_predict(self, Xnew, full_cov=False, kern=None):
        """
        For making predictions, does not account for normalization or likelihood

@ -198,23 +199,30 @@ class GP(Model):
        if kern is None:
            kern = self.kern

-        Kx = kern.K(_Xnew, self.X).T
-        WiKx = np.dot(self.posterior.woodbury_inv, Kx)
+        Kx = kern.K(self.X, Xnew)
        mu = np.dot(Kx.T, self.posterior.woodbury_vector)
        if full_cov:
-            Kxx = kern.K(_Xnew)
-            var = Kxx - np.dot(Kx.T, WiKx)
+            Kxx = kern.K(Xnew)
+            if self.posterior.woodbury_inv.ndim == 2:
+                var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
+            elif self.posterior.woodbury_inv.ndim == 3:
+                var = np.empty((Kxx.shape[0],Kxx.shape[1],self.posterior.woodbury_inv.shape[2]))
+                for i in range(var.shape[2]):
+                    var[:, :, i] = (Kxx - mdot(Kx.T, self.posterior.woodbury_inv[:, :, i], Kx))
+            var = var
        else:
-            Kxx = kern.Kdiag(_Xnew)
-            var = Kxx - np.sum(WiKx*Kx, 0)
-            var = var.reshape(-1, 1)
+            Kxx = kern.Kdiag(Xnew)
+            if self.posterior.woodbury_inv.ndim == 2:
+                var = (Kxx - np.sum(np.dot(self.posterior.woodbury_inv.T, Kx) * Kx, 0))[:,None]
+            elif self.posterior.woodbury_inv.ndim == 3:
+                var = np.empty((Kxx.shape[0],self.posterior.woodbury_inv.shape[2]))
+                for i in range(var.shape[1]):
+                    var[:, i] = (Kxx - (np.sum(np.dot(self.posterior.woodbury_inv[:, :, i].T, Kx) * Kx, 0)))
+            var = var
+        #add in the mean function
+        if self.mean_function is not None:
+            mu += self.mean_function.f(Xnew)

-        #force mu to be a column vector
-        if len(mu.shape)==1: mu = mu[:,None]
-
-        #add the mean function in
-        if not self.mean_function is None:
-            mu += self.mean_function.f(_Xnew)
        return mu, var

    def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None):
@ -244,10 +252,10 @@ class GP(Model):
            mu, var = self.normalizer.inverse_mean(mu), self.normalizer.inverse_variance(var)

        # now push through likelihood
-        mean, var = self.likelihood.predictive_values(mu, var, full_cov, Y_metadata)
+        mean, var = self.likelihood.predictive_values(mu, var, full_cov, Y_metadata=Y_metadata)
        return mean, var

-    def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None):
+    def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None, kern=None):
        """
        Get the predictive quantiles around the prediction at X

@ -255,13 +263,15 @@ class GP(Model):
        :type X: np.ndarray (Xnew x self.input_dim)
        :param quantiles: tuple of quantiles, default is (2.5, 97.5) which is the 95% interval
        :type quantiles: tuple
+        :param kern: optional kernel to use for prediction
+        :type predict_kw: dict
        :returns: list of quantiles for each X and predictive quantiles for interval combination
        :rtype: [np.ndarray (Xnew x self.output_dim), np.ndarray (Xnew x self.output_dim)]
        """
-        m, v = self._raw_predict(X,  full_cov=False)
+        m, v = self._raw_predict(X,  full_cov=False, kern=kern)
        if self.normalizer is not None:
            m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
-        return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata)
+        return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata=Y_metadata)

    def predictive_gradients(self, Xnew):
        """
@ -331,7 +341,7 @@ class GP(Model):
        :returns: Ysim: set of simulations, a Numpy array (N x samples).
        """
        fsim = self.posterior_samples_f(X, size, full_cov=full_cov)
-        Ysim = self.likelihood.samples(fsim, Y_metadata)
+        Ysim = self.likelihood.samples(fsim, Y_metadata=Y_metadata)
        return Ysim

    def plot_f(self, plot_limits=None, which_data_rows='all',
@ -473,16 +483,16 @@ class GP(Model):
            self.inference_method.on_optimization_end()
            raise

-    def infer_newX(self, Y_new, optimize=True, ):
+    def infer_newX(self, Y_new, optimize=True):
        """
-        Infer the distribution of X for the new observed data *Y_new*.
+        Infer X for the new observed data *Y_new*.

        :param Y_new: the new observed data for inference
        :type Y_new: numpy.ndarray
        :param optimize: whether to optimize the location of new X (True by default)
        :type optimize: boolean
        :return: a tuple containing the posterior estimation of X and the model that optimize X
-        :rtype: (:class:`~GPy.core.parameterization.variational.VariationalPosterior` or numpy.ndarray, :class:`~GPy.core.model.Model`)
+        :rtype: (:class:`~GPy.core.parameterization.variational.VariationalPosterior` and numpy.ndarray, :class:`~GPy.core.model.Model`)
        """
        from ..inference.latent_function_inference.inferenceX import infer_newX
        return infer_newX(self, Y_new, optimize=optimize)
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@ -76,7 +76,7 @@ class Model(Parameterized):
                jobs = []
                pool = mp.Pool(processes=num_processes)
                for i in range(num_restarts):
-                    self.randomize()
+                    if i>0: self.randomize()
                    job = pool.apply_async(opt_wrapper, args=(self,), kwds=kwargs)
                    jobs.append(job)

@ -90,7 +90,7 @@ class Model(Parameterized):
        for i in range(num_restarts):
            try:
                if not parallel:
-                    self.randomize()
+                    if i>0: self.randomize()
                    self.optimize(**kwargs)
                else:
                    self.optimization_runs.append(jobs[i].get())
@ -257,7 +257,7 @@ class Model(Parameterized):
            optimizer = optimization.get_optimizer(optimizer)
            opt = optimizer(start, model=self, max_iters=max_iters, **kwargs)

-        with VerboseOptimization(self, opt, maxiters=max_iters, verbose=messages, ipython_notebook=ipython_notebook) as vo:
+        with VerboseOptimization(self, opt, maxiters=max_iters, verbose=messages, ipython_notebook=ipython_notebook, clear_after_finish=clear_after_finish) as vo:
            opt.run(f_fp=self._objective_grads, f=self._objective, fp=self._grads)
            vo.finish(opt)

--- a/GPy/core/parameterization/index_operations.py
+++ b/GPy/core/parameterization/index_operations.py
@ -5,7 +5,7 @@ import numpy
 from numpy.lib.function_base import vectorize
 from .lists_and_dicts import IntArrayDict
 from functools import reduce
-from transformations import Transformation
+from .transformations import Transformation

 def extract_properties_to_index(index, props):
    prop_index = dict()
@ -109,7 +109,7 @@ class ParameterIndexOperations(object):
        try:
            return self._properties.itervalues()
        except AttributeError:
-	#Changed this from itervalues to values for Py3 compatibility. It didn't break the test suite.
+            #Changed this from itervalues to values for Py3 compatibility. It didn't break the test suite.
            return self._properties.values()

    def indices(self):
--- a/GPy/core/parameterization/param.py
+++ b/GPy/core/parameterization/param.py
@ -38,6 +38,11 @@ class Param(Parameterizable, ObsAr):
    Fixing parameters will fix them to the value they are right now. If you change
    the fixed value, it will be fixed to the new value!

+    Important Note:
+    Multilevel indexing (e.g. self[:2][1:]) is not supported and might lead to unexpected behaviour.
+    Try to index in one go, using boolean indexing or the numpy builtin
+    np.index function.
+    
    See :py:class:`GPy.core.parameterized.Parameterized` for more details on constraining etc.

    """
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@ -430,23 +430,38 @@ class Indexable(Nameable, Updateable):

    def log_prior(self):
        """evaluate the prior"""
-        if self.priors.size > 0:
-            x = self.param_array
-            #py3 fix
-            #return reduce(lambda a, b: a + b, (p.lnpdf(x[ind]).sum() for p, ind in self.priors.iteritems()), 0)
-            return reduce(lambda a, b: a + b, (p.lnpdf(x[ind]).sum() for p, ind in self.priors.items()), 0)
-        return 0.
+        if self.priors.size == 0:
+            return 0.
+        x = self.param_array
+        #evaluate the prior log densities
+        log_p = reduce(lambda a, b: a + b, (p.lnpdf(x[ind]).sum() for p, ind in self.priors.items()), 0)
+
+        #account for the transformation by evaluating the log Jacobian (where things are transformed)
+        log_j = 0.
+        priored_indexes = np.hstack([i for p, i in self.priors.items()])
+        for c,j in self.constraints.items():
+            if not isinstance(c, Transformation):continue
+            for jj in j:
+                if jj in priored_indexes:
+                    log_j += c.log_jacobian(x[jj])
+        return log_p + log_j

    def _log_prior_gradients(self):
        """evaluate the gradients of the priors"""
-        if self.priors.size > 0:
-            x = self.param_array
-            ret = np.zeros(x.size)
-            #py3 fix
-            #[np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.iteritems()]
-            [np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.items()]
-            return ret
-        return 0.
+        if self.priors.size == 0:
+            return 0.
+        x = self.param_array
+        ret = np.zeros(x.size)
+        #compute derivate of prior density
+        [np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.items()]
+        #add in jacobian derivatives if transformed
+        priored_indexes = np.hstack([i for p, i in self.priors.items()])
+        for c,j in self.constraints.items():
+            if not isinstance(c, Transformation):continue
+            for jj in j:
+                if jj in priored_indexes:
+                    ret[jj] += c.log_jacobian_grad(x[jj])
+        return ret

    #===========================================================================
    # Tie parameters together
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@ -6,10 +6,10 @@ import numpy; np = numpy
 import itertools
 from re import compile, _pattern_type
 from .param import ParamConcatenation
-from parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing
+from .parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing

 import logging
-from index_operations import ParameterIndexOperationsView
+from .index_operations import ParameterIndexOperationsView
 logger = logging.getLogger("parameters changed meta")

 class ParametersChangedMeta(type):
@ -197,9 +197,10 @@ class Parameterized(Parameterizable):
                raise RuntimeError("{} does not seem to be a parameter, remove parameters directly from their respective parents".format(str(param)))

        start = sum([p.size for p in self.parameters[:param._parent_index_]])
-        self._remove_parameter_name(param)
        self.size -= param.size
        del self.parameters[param._parent_index_]
+        self._remove_parameter_name(param)
+

        param._disconnect_parent()
        param.remove_observer(self, self._pass_through_notify_observers)
--- a/GPy/core/parameterization/priors.py
+++ b/GPy/core/parameterization/priors.py
@ -522,16 +522,9 @@ class DGPLVM(Prior):

    """
    domain = _REAL
-    # _instances = []
-    # def __new__(cls, mu, sigma): # Singleton:
-    #     if cls._instances:
-    #         cls._instances[:] = [instance for instance in cls._instances if instance()]
-    #         for instance in cls._instances:
-    #             if instance().mu == mu and instance().sigma == sigma:
-    #                 return instance()
-    #     o = super(Prior, cls).__new__(cls, mu, sigma)
-    #     cls._instances.append(weakref.ref(o))
-    #     return cls._instances[-1]()
+    
+    def __new__(cls, sigma2, lbl, x_shape): 
+        return super(Prior, cls).__new__(cls, sigma2, lbl, x_shape)

    def __init__(self, sigma2, lbl, x_shape):
        self.sigma2 = sigma2
@ -758,12 +751,12 @@ class DGPLVM_Lamda(Prior, Parameterized):
        self.sigma2 = sigma2
        # self.x = x
        self.lbl = lbl
-	self.lamda = lamda
+        self.lamda = lamda
        self.classnum = lbl.shape[1]
        self.datanum = lbl.shape[0]
        self.x_shape = x_shape
        self.dim = x_shape[1]
-	self.lamda = Param('lamda', np.diag(lamda))
+        self.lamda = Param('lamda', np.diag(lamda))
        self.link_parameter(self.lamda)

    def get_class_label(self, y):
@ -789,7 +782,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
        M_i = np.zeros((self.classnum, self.dim))
        for i in cls:
            # Mean of each class
-	    class_i = cls[i]
+            class_i = cls[i]
            M_i[i] = np.mean(class_i, axis=0)
        return M_i

@ -843,7 +836,7 @@ class DGPLVM_Lamda(Prior, Parameterized):

    # Calculating beta and Bi for Sb
    def compute_sig_beta_Bi(self, data_idx, M_i, M_0, lst_idx_all):
-        import pdb
+        # import pdb
        # pdb.set_trace()
        B_i = np.zeros((self.classnum, self.dim))
        Sig_beta_B_i_all = np.zeros((self.datanum, self.dim))
@ -899,8 +892,8 @@ class DGPLVM_Lamda(Prior, Parameterized):
 	#!!!!!!!!!!!!!!!!!!!!!!!!!!!
 	#self.lamda.values[:] = self.lamda.values/self.lamda.values.sum()	

-	xprime = x.dot(np.diagflat(self.lamda))
-	x = xprime
+        xprime = x.dot(np.diagflat(self.lamda))
+        x = xprime
 	# print x
        cls = self.compute_cls(x)
        M_0 = np.mean(x, axis=0)
@ -909,15 +902,15 @@ class DGPLVM_Lamda(Prior, Parameterized):
        Sw = self.compute_Sw(cls, M_i)
        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-	Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
+        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
        return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))

    # This function calculates derivative of the log of prior function
    def lnpdf_grad(self, x):
        x = x.reshape(self.x_shape)
-	xprime = x.dot(np.diagflat(self.lamda))
-	x = xprime
+        xprime = x.dot(np.diagflat(self.lamda))
+        x = xprime
 	# print x
        cls = self.compute_cls(x)
        M_0 = np.mean(x, axis=0)
@ -933,8 +926,8 @@ class DGPLVM_Lamda(Prior, Parameterized):
        # Calculating inverse of Sb and its transpose and minus
        # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
        #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
-        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-	Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
+        #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.5))[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.9)[0]
        Sb_inv_N_trans = np.transpose(Sb_inv_N)
        Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
        Sw_trans = np.transpose(Sw)
@ -951,14 +944,14 @@ class DGPLVM_Lamda(Prior, Parameterized):
        # Because of the GPy we need to transpose our matrix so that it gets the same shape as out matrix (denominator layout!!!)
        DPxprim_Dx = DPxprim_Dx.T
    
-    	DPxprim_Dlamda = DPx_Dx.dot(x)
+        DPxprim_Dlamda = DPx_Dx.dot(x)

    	# Because of the GPy we need to transpose our matrix so that it gets the same shape as out matrix (denominator layout!!!)
-    	DPxprim_Dlamda = DPxprim_Dlamda.T
+        DPxprim_Dlamda = DPxprim_Dlamda.T

-	self.lamda.gradient = np.diag(DPxprim_Dlamda)
+        self.lamda.gradient = np.diag(DPxprim_Dlamda)
 	# print DPxprim_Dx
-    	return DPxprim_Dx
+        return DPxprim_Dx


    # def frb(self, x):
@ -1139,8 +1132,8 @@ class DGPLVM_T(Prior):
    # This function calculates log of our prior
    def lnpdf(self, x):
        x = x.reshape(self.x_shape)
-	xprim = x.dot(self.vec)
-	x = xprim
+        xprim = x.dot(self.vec)
+        x = xprim
 	# print x  
        cls = self.compute_cls(x)
        M_0 = np.mean(x, axis=0)
@ -1156,11 +1149,11 @@ class DGPLVM_T(Prior):

    # This function calculates derivative of the log of prior function
    def lnpdf_grad(self, x):
-	x = x.reshape(self.x_shape)
-	xprim = x.dot(self.vec)
-	x = xprim 
+        x = x.reshape(self.x_shape)
+        xprim = x.dot(self.vec)
+        x = xprim 
 	# print x       
-	cls = self.compute_cls(x)
+        cls = self.compute_cls(x)
        M_0 = np.mean(x, axis=0)
        M_i = self.compute_Mi(cls)
        Sb = self.compute_Sb(cls, M_i, M_0)
--- a/GPy/core/parameterization/transformations.py
+++ b/GPy/core/parameterization/transformations.py
@ -31,6 +31,16 @@ class Transformation(object):
        raise NotImplementedError
    def finv(self, model_param):
        raise NotImplementedError
+    def log_jacobian(self, model_param):
+        """
+        compute the log of the jacobian of f, evaluated at f(x)= model_param
+        """
+        raise NotImplementedError
+    def log_jacobian_grad(self, model_param):
+        """
+        compute the drivative of the log of the jacobian of f, evaluated at f(x)= model_param
+        """
+        raise NotImplementedError
    def gradfactor(self, model_param, dL_dmodel_param):
        """ df(opt_param)_dopt_param evaluated at self.f(opt_param)=model_param, times the gradient dL_dmodel_param,

@ -74,9 +84,33 @@ class Logexp(Transformation):
        if np.any(f < 0.):
            print("Warning: changing parameters to satisfy constraints")
        return np.abs(f)
+    def log_jacobian(self, model_param):
+        return np.where(model_param>_lim_val, model_param, np.log(np.exp(model_param+1e-20) - 1.)) - model_param
+    def log_jacobian_grad(self, model_param):
+        return 1./(np.exp(model_param)-1.)
    def __str__(self):
        return '+ve'

+class Exponent(Transformation):
+    domain = _POSITIVE
+    def f(self, x):
+        return np.where(x<_lim_val, np.where(x>-_lim_val, np.exp(x), np.exp(-_lim_val)), np.exp(_lim_val))
+    def finv(self, x):
+        return np.log(x)
+    def gradfactor(self, f, df):
+        return np.einsum('i,i->i', df, f)
+    def initialize(self, f):
+        if np.any(f < 0.):
+            print("Warning: changing parameters to satisfy constraints")
+        return np.abs(f)
+    def log_jacobian(self, model_param):
+        return np.log(model_param)
+    def log_jacobian_grad(self, model_param):
+        return 1./model_param
+    def __str__(self):
+        return '+ve'
+
+

 class NormalTheta(Transformation):
    "Do not use, not officially supported!"
@ -417,22 +451,6 @@ class LogexpClipped(Logexp):
    def __str__(self):
        return '+ve_c'

-class Exponent(Transformation):
-    # TODO: can't allow this to go to zero, need to set a lower bound. Similar with negative Exponent below. See old MATLAB code.
-    domain = _POSITIVE
-    def f(self, x):
-        return np.where(x<_lim_val, np.where(x>-_lim_val, np.exp(x), np.exp(-_lim_val)), np.exp(_lim_val))
-    def finv(self, x):
-        return np.log(x)
-    def gradfactor(self, f, df):
-        return np.einsum('i,i->i', df, f)
-    def initialize(self, f):
-        if np.any(f < 0.):
-            print("Warning: changing parameters to satisfy constraints")
-        return np.abs(f)
-    def __str__(self):
-        return '+ve'
-
 class NegativeExponent(Exponent):
    domain = _NEGATIVE
    def f(self, x):
--- a/GPy/core/parameterization/variational.py
+++ b/GPy/core/parameterization/variational.py
@ -36,8 +36,9 @@ class NormalPrior(VariationalPrior):
        variational_posterior.variance.gradient -= (1. - (1. / (variational_posterior.variance))) * 0.5

 class SpikeAndSlabPrior(VariationalPrior):
-    def __init__(self, pi=None, learnPi=False, variance = 1.0, name='SpikeAndSlabPrior', **kw):
-        super(SpikeAndSlabPrior, self).__init__(name=name, **kw)        
+    def __init__(self, pi=None, learnPi=False, variance = 1.0, group_spike=False, name='SpikeAndSlabPrior', **kw):
+        super(SpikeAndSlabPrior, self).__init__(name=name, **kw)
+        self.group_spike = group_spike
        self.variance = Param('variance',variance)
        self.learnPi = learnPi
        if learnPi:
@ -50,7 +51,10 @@ class SpikeAndSlabPrior(VariationalPrior):
    def KL_divergence(self, variational_posterior):
        mu = variational_posterior.mean
        S = variational_posterior.variance
-        gamma = variational_posterior.gamma.values
+        if self.group_spike:
+            gamma = variational_posterior.gamma.values[0]
+        else:
+            gamma = variational_posterior.gamma.values
        if len(self.pi.shape)==2:
            idx = np.unique(variational_posterior.gamma._raveled_index()/gamma.shape[-1])
            pi = self.pi[idx]
@ -65,14 +69,21 @@ class SpikeAndSlabPrior(VariationalPrior):
    def update_gradients_KL(self, variational_posterior):
        mu = variational_posterior.mean
        S = variational_posterior.variance
-        gamma = variational_posterior.gamma.values
+        if self.group_spike:
+            gamma = variational_posterior.gamma.values[0]
+        else:
+            gamma = variational_posterior.gamma.values
        if len(self.pi.shape)==2:
            idx = np.unique(variational_posterior.gamma._raveled_index()/gamma.shape[-1])
            pi = self.pi[idx]
        else:
            pi = self.pi

-        variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+        if self.group_spike:
+            dgamma = np.log((1-pi)/pi*gamma/(1.-gamma))/variational_posterior.num_data
+        else:
+            dgamma = np.log((1-pi)/pi*gamma/(1.-gamma))
+        variational_posterior.binary_prob.gradient -= dgamma+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
        mu.gradient -= gamma*mu/self.variance
        S.gradient -= (1./self.variance - 1./S) * gamma /2.
        if self.learnPi:
@ -150,17 +161,45 @@ class NormalPosterior(VariationalPosterior):
        from ...plotting.matplot_dep import variational_plots
        return variational_plots.plot(self, *args, **kwargs)

+    def KL(self, other):
+        """Compute the KL divergence to another NormalPosterior Object. This only holds, if the two NormalPosterior objects have the same shape, as we do computational tricks for the multivariate normal KL divergence.
+        """
+        return .5*(
+            np.sum(self.variance/other.variance) 
+            + ((other.mean-self.mean)**2/other.variance).sum() 
+            - self.num_data * self.input_dim
+            + np.sum(np.log(other.variance)) - np.sum(np.log(self.variance))
+            )
+    
 class SpikeAndSlabPosterior(VariationalPosterior):
    '''
    The SpikeAndSlab distribution for variational approximations.
    '''
-    def __init__(self, means, variances, binary_prob, name='latent space'):
+    def __init__(self, means, variances, binary_prob, group_spike=False, sharedX=False, name='latent space'):
        """
        binary_prob : the probability of the distribution on the slab part.
        """
        super(SpikeAndSlabPosterior, self).__init__(means, variances, name)
-        self.gamma = Param("binary_prob",binary_prob,Logistic(0.,1.))
-        self.link_parameter(self.gamma)
+        self.group_spike = group_spike
+        self.sharedX = sharedX
+        if sharedX:
+            self.mean.fix(warning=False)
+            self.variance.fix(warning=False)
+        if group_spike:
+            self.gamma_group = Param("binary_prob_group",binary_prob.mean(axis=0),Logistic(1e-10,1.-1e-10))
+            self.gamma = Param("binary_prob",binary_prob, __fixed__)
+            self.link_parameters(self.gamma_group,self.gamma)
+        else:
+            self.gamma = Param("binary_prob",binary_prob,Logistic(1e-10,1.-1e-10))
+            self.link_parameter(self.gamma)
+            
+    def propogate_val(self):
+        if self.group_spike:
+            self.gamma.values[:] = self.gamma_group.values
+    
+    def collate_gradient(self):
+        if self.group_spike:
+            self.gamma_group.gradient = self.gamma.gradient.reshape(self.gamma.shape).sum(axis=0)

    def set_gradients(self, grad):
        self.mean.gradient, self.variance.gradient, self.gamma.gradient = grad
@ -179,15 +218,15 @@ class SpikeAndSlabPosterior(VariationalPosterior):
            n.parameters[dc['variance']._parent_index_] = dc['variance']
            n.parameters[dc['binary_prob']._parent_index_] = dc['binary_prob']
            n._gradient_array_ = None
-            oversize = self.size - self.mean.size - self.variance.size
-            n.size = n.mean.size + n.variance.size + oversize
+            oversize = self.size - self.mean.size - self.variance.size - self.gamma.size
+            n.size = n.mean.size + n.variance.size + n.gamma.size + oversize
            n.ndim = n.mean.ndim
            n.shape = n.mean.shape
            n.num_data = n.mean.shape[0]
            n.input_dim = n.mean.shape[1] if n.ndim != 1 else 1
            return n
        else:
-            return super(VariationalPrior, self).__getitem__(s)
+            return super(SpikeAndSlabPosterior, self).__getitem__(s)

    def plot(self, *args, **kwargs):
        """
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@ -133,7 +133,7 @@ class SparseGP(GP):
                    var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
                elif self.posterior.woodbury_inv.ndim == 3:
                    var = np.empty((Kxx.shape[0],Kxx.shape[1],self.posterior.woodbury_inv.shape[2]))
-                    for i in range(var.shape[1]):
+                    for i in range(var.shape[2]):
                        var[:, :, i] = (Kxx - mdot(Kx.T, self.posterior.woodbury_inv[:, :, i], Kx))
                var = var
            else:
--- a/GPy/core/sparse_gp_mpi.py
+++ b/GPy/core/sparse_gp_mpi.py
@ -34,7 +34,7 @@ class SparseGP_MPI(SparseGP):

    """

-    def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp mpi', Y_metadata=None, mpi_comm=None, normalizer=False):
+    def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp', Y_metadata=None, mpi_comm=None, normalizer=False):
        self._IN_OPTIMIZATION_ = False
        if mpi_comm != None:
            if inference_method is None:
--- a/GPy/core/svgp.py
+++ b/GPy/core/svgp.py
@ -5,7 +5,7 @@ import numpy as np
 from ..util import choleskies
 from .sparse_gp import SparseGP
 from .parameterization.param import Param
-from ..inference.latent_function_inference import SVGP as svgp_inf
+from ..inference.latent_function_inference.svgp import SVGP as svgp_inf


 class SVGP(SparseGP):
@ -46,7 +46,7 @@ class SVGP(SparseGP):
            num_latent_functions = Y.shape[1]

        self.m = Param('q_u_mean', np.zeros((self.num_inducing, num_latent_functions)))
-        chol = choleskies.triang_to_flat(np.tile(np.eye(self.num_inducing)[:,:,None], (1,1,num_latent_functions)))
+        chol = choleskies.triang_to_flat(np.tile(np.eye(self.num_inducing)[None,:,:], (num_latent_functions, 1,1)))
        self.chol = Param('q_u_chol', chol)
        self.link_parameter(self.chol)
        self.link_parameter(self.m)
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@ -217,9 +217,8 @@ def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=
    elif model_type == 'FITC':
        m = GPy.models.FITCClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing)
        m['.*len'] = 3.
-
    if optimize:
-        m.pseudo_EM()
+        m.optimize()

    if plot:
        m.plot()
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@ -215,6 +215,7 @@ def ssgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40
    return m

 def _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim=False):
+    """Simulate some data drawn from a matern covariance and a periodic exponential for use in MRD demos."""
    Q_signal = 4
    import GPy
    import numpy as np
@ -254,6 +255,7 @@ def _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim=False):
    return slist, [S1, S2, S3], Ylist

 def _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim=False):
+    """Simulate some data drawn from sine and cosine for use in demos of MRD"""
    _np.random.seed(1234)

    x = _np.linspace(0, 4 * _np.pi, N)[:, None]
@ -353,13 +355,13 @@ def ssgplvm_simulation(optimize=True, verbose=1,
    Y = Ylist[0]
    k = kern.Linear(Q, ARD=True)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
    # k = kern.RBF(Q, ARD=True, lengthscale=10.)
-    m = SSGPLVM(Y, Q, init="pca", num_inducing=num_inducing, kernel=k)
+    m = SSGPLVM(Y, Q, init="rand", num_inducing=num_inducing, kernel=k, group_spike=True)
    m.X.variance[:] = _np.random.uniform(0, .01, m.X.shape)
-    m.likelihood.variance = .1
+    m.likelihood.variance = .01

    if optimize:
        print("Optimizing model:")
-        m.optimize('scg', messages=verbose, max_iters=max_iters,
+        m.optimize('bfgs', messages=verbose, max_iters=max_iters,
                   gtol=.05)
    if plot:
        m.X.plot("SSGPLVM Latent Space 1D")
@ -402,7 +404,8 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
    from GPy.models import MRD

    D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
-    _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
+    _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim)
+    

    # Ylist = [Ylist[0]]
    k = kern.Linear(Q, ARD=True)
@ -585,6 +588,7 @@ def robot_wireless(optimize=True, verbose=True, plot=True):
    return m

 def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
+    """Interactive visualisation of the Stick Man data from Ohio State University with the Bayesian GPLVM."""
    from GPy.models import BayesianGPLVM
    from matplotlib import pyplot as plt
    import numpy as np
@ -613,7 +617,8 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
        data_show = GPy.plotting.matplot_dep.visualize.stick_show(y, connect=data['connect'])
        dim_select = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean[:1, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
        fig.canvas.draw()
-        fig.canvas.show()
+        # Canvas.show doesn't work on OSX.
+        #fig.canvas.show()
        raw_input('Press enter to finish')

    return m
--- a/GPy/inference/latent_function_inference/init.py
+++ b/GPy/inference/latent_function_inference/init.py
@ -69,7 +69,7 @@ from .expectation_propagation_dtc import EPDTC
 from .dtc import DTC
 from .fitc import FITC
 from .var_dtc_parallel import VarDTC_minibatch
-from .svgp import SVGP
+from .var_gauss import VarGauss

 # class FullLatentFunctionData(object):
 #
--- a/GPy/inference/latent_function_inference/inferenceX.py
+++ b/GPy/inference/latent_function_inference/inferenceX.py
@ -4,6 +4,7 @@
 import numpy as np
 from ...core import Model
 from ...core.parameterization import variational
+from GPy.core.parameterization.variational import VariationalPosterior

 def infer_newX(model, Y_new, optimize=True, init='L2'):
    """
@ -27,12 +28,19 @@ def infer_newX(model, Y_new, optimize=True, init='L2'):

 class InferenceX(Model):
    """
-    The class for inference of new X with given new Y. (do_test_latent)
+    The model class for inference of new X with given new Y. (replacing the "do_test_latent" in Bayesian GPLVM)
+    It is a tiny inference model created from the original GP model. The kernel, likelihood (only Gaussian is supported at the moment) 
+    and posterior distribution are taken from the original model.
+    For Regression models and GPLVM, a point estimate of the latent variable X will be inferred. 
+    For Bayesian GPLVM, the variational posterior of X will be inferred. 
+    X is inferred through a gradient optimization of the inference model.

    :param model: the GPy model used in inference
    :type model: GPy.core.Model
    :param Y: the new observed data for inference
    :type Y: numpy.ndarray
+    :param init: the distance metric of Y for initializing X with the nearest neighbour.
+    :type init: 'L2', 'NCC' and 'rand'
    """
    def __init__(self, model, Y, name='inferenceX', init='L2'):
        if np.isnan(Y).any() or getattr(model, 'missing_data', False):
@ -45,20 +53,27 @@ class InferenceX(Model):
        super(InferenceX, self).__init__(name)
        self.likelihood = model.likelihood.copy()
        self.kern = model.kern.copy()
-        if model.kern.useGPU:
-            from ...models import SSGPLVM
-            if isinstance(model, SSGPLVM):
-                self.kern.GPU_SSRBF(True)
-            else:
-                self.kern.GPU(True)
+#         if model.kern.useGPU:
+#             from ...models import SSGPLVM
+#             if isinstance(model, SSGPLVM):
+#                 self.kern.GPU_SSRBF(True)
+#             else:
+#                 self.kern.GPU(True)
        from copy import deepcopy
        self.posterior = deepcopy(model.posterior)
-        if hasattr(model, 'variational_prior'):
+        from ...core.parameterization.variational import VariationalPosterior
+        if isinstance(model.X, VariationalPosterior):
            self.uncertain_input = True
-            self.variational_prior = model.variational_prior.copy()
+            from ...models.ss_gplvm import IBPPrior
+            from ...models.ss_mrd import IBPPrior_SSMRD
+            if isinstance(model.variational_prior, IBPPrior) or isinstance(model.variational_prior, IBPPrior_SSMRD):
+                from ...core.parameterization.variational import SpikeAndSlabPrior
+                self.variational_prior = SpikeAndSlabPrior(pi=0.5, learnPi=False, group_spike=False)
+            else:
+                self.variational_prior = model.variational_prior.copy()
        else:
            self.uncertain_input = False
-        if hasattr(model, 'inducing_inputs'):
+        if hasattr(model, 'Z'):
            self.sparse_gp = True
            self.Z = model.Z.copy()
        else:
@ -147,9 +162,9 @@ class InferenceX(Model):
            from ...core.parameterization.variational import SpikeAndSlabPrior
            if isinstance(self.variational_prior, SpikeAndSlabPrior):
                # Update Log-likelihood
-                KL_div = self.variational_prior.KL_divergence(self.X, N=self.Y.shape[0])
+                KL_div = self.variational_prior.KL_divergence(self.X)
                # update for the KL divergence
-                self.variational_prior.update_gradients_KL(self.X, N=self.Y.shape[0])
+                self.variational_prior.update_gradients_KL(self.X)
            else:
                # Update Log-likelihood
                KL_div = self.variational_prior.KL_divergence(self.X)
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@ -139,10 +139,6 @@ class Laplace(LatentFunctionInference):

        f_hat, Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)

-        self.f_hat = f_hat
-        #self.Ki_fhat =  Ki_fhat
-        #self.K = K.copy()
-
        #Compute hessian and other variables at mode
        log_marginal, woodbury_inv, dL_dK, dL_dthetaL = self.mode_computations(f_hat, Ki_fhat, K, Y, likelihood, kern, Y_metadata)

@ -175,7 +171,9 @@ class Laplace(LatentFunctionInference):
        #define the objective function (to be maximised)
        def obj(Ki_f, f):
            ll = -0.5*np.sum(np.dot(Ki_f.T, f)) + np.sum(likelihood.logpdf(f, Y, Y_metadata=Y_metadata))
+            print ll
            if np.isnan(ll):
+                import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
                return -np.inf
            else:
                return ll
@ -298,6 +296,11 @@ class Laplace(LatentFunctionInference):
        else:
            dL_dthetaL = np.zeros(likelihood.size)

+        #Cache some things for speedy LOO
+        self.Ki_W_i = Ki_W_i
+        self.K = K
+        self.W = W
+        self.f_hat = f_hat
        return log_marginal, K_Wi_i, dL_dK, dL_dthetaL

    def _compute_B_statistics(self, K, W, log_concave, *args, **kwargs):
--- a/GPy/inference/latent_function_inference/svgp.py
+++ b/GPy/inference/latent_function_inference/svgp.py
@ -3,6 +3,7 @@ from ...util import linalg
 from ...util import choleskies
 import numpy as np
 from .posterior import Posterior
+from scipy.linalg.blas import dgemm, dsymm, dtrmm

 class SVGP(LatentFunctionInference):

@ -16,16 +17,13 @@ class SVGP(LatentFunctionInference):


        S = np.empty((num_outputs, num_inducing, num_inducing))
-        [np.dot(L[:,:,i], L[:,:,i].T, S[i,:,:]) for i in range(num_outputs)]
-        S = S.swapaxes(0,2)
+        [np.dot(L[i,:,:], L[i,:,:].T, S[i,:,:]) for i in range(num_outputs)]
        #Si,_ = linalg.dpotri(np.asfortranarray(L), lower=1)
        Si = choleskies.multiple_dpotri(L)
-        logdetS = np.array([2.*np.sum(np.log(np.abs(np.diag(L[:,:,i])))) for i in range(L.shape[-1])])
+        logdetS = np.array([2.*np.sum(np.log(np.abs(np.diag(L[i,:,:])))) for i in range(L.shape[0])])

        if np.any(np.isinf(Si)):
            raise ValueError("Cholesky representation unstable")
-            #S = S + np.eye(S.shape[0])*1e-5*np.max(np.max(S))
-            #Si, Lnew, _,_ = linalg.pdinv(S)

        #compute mean function stuff
        if mean_function is not None:
@ -35,27 +33,31 @@ class SVGP(LatentFunctionInference):
            prior_mean_u = np.zeros((num_inducing, num_outputs))
            prior_mean_f = np.zeros((num_data, num_outputs))

-
        #compute kernel related stuff
        Kmm = kern.K(Z)
-        Knm = kern.K(X, Z)
+        Kmn = kern.K(Z, X)
        Knn_diag = kern.Kdiag(X)
-        Kmmi, Lm, Lmi, logdetKmm = linalg.pdinv(Kmm)
+        Lm = linalg.jitchol(Kmm)
+        logdetKmm = 2.*np.sum(np.log(np.diag(Lm)))
+        Kmmi, _ = linalg.dpotri(Lm)

        #compute the marginal means and variances of q(f)
-        A = np.dot(Knm, Kmmi)
-        mu = prior_mean_f + np.dot(A, q_u_mean - prior_mean_u)
-        #v = Knn_diag[:,None] - np.sum(A*Knm,1)[:,None] + np.sum(A[:,:,None] * np.einsum('ij,jlk->ilk', A, S),1)
-        v = Knn_diag[:,None] - np.sum(A*Knm,1)[:,None] + np.sum(A[:,:,None] * linalg.ij_jlk_to_ilk(A, S),1)
+        A, _ = linalg.dpotrs(Lm, Kmn)
+        mu = prior_mean_f + np.dot(A.T, q_u_mean - prior_mean_u)
+        v = np.empty((num_data, num_outputs))
+        for i in range(num_outputs):
+            tmp = dtrmm(1.0,L[i].T, A, lower=0, trans_a=0)
+            v[:,i] = np.sum(np.square(tmp),0)
+        v += (Knn_diag - np.sum(A*Kmn,0))[:,None]

        #compute the KL term
        Kmmim = np.dot(Kmmi, q_u_mean)
-        KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.sum(Kmmi[:,:,None]*S,0).sum(0) + 0.5*np.sum(q_u_mean*Kmmim,0)
+        KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.sum(Kmmi[None,:,:]*S,1).sum(1) + 0.5*np.sum(q_u_mean*Kmmim,0)
        KL = KLs.sum()
        #gradient of the KL term (assuming zero mean function)
        dKL_dm = Kmmim.copy()
-        dKL_dS = 0.5*(Kmmi[:,:,None] - Si)
-        dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(-1)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T)
+        dKL_dS = 0.5*(Kmmi[None,:,:] - Si)
+        dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(0)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T)

        if mean_function is not None:
            #adjust KL term for mean function
@ -80,17 +82,20 @@ class SVGP(LatentFunctionInference):
            dF_dthetaL =  dF_dthetaL.sum(1).sum(1)*batch_scale

        #derivatives of expected likelihood, assuming zero mean function
-        Adv = A.T[:,:,None]*dF_dv[None,:,:] # As if dF_Dv is diagonal
-        Admu = A.T.dot(dF_dmu)
-        AdvA = np.dstack([np.dot(A.T, Adv[:,:,i].T) for i in range(num_outputs)])
-        #tmp = np.einsum('ijk,jlk->il', AdvA, S).dot(Kmmi)
-        tmp = linalg.ijk_jlk_to_il(AdvA, S).dot(Kmmi)
-        dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(-1) - tmp - tmp.T
+        Adv = A[None,:,:]*dF_dv.T[:,None,:] # As if dF_Dv is diagonal, D, M, N
+        Admu = A.dot(dF_dmu)
+        Adv = np.ascontiguousarray(Adv) # makes for faster operations later...(inc dsymm)
+        AdvA = np.dot(Adv.reshape(-1, num_data),A.T).reshape(num_outputs, num_inducing, num_inducing )
+        tmp = np.sum([np.dot(a,s) for a, s in zip(AdvA, S)],0).dot(Kmmi)
+        dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(0) - tmp - tmp.T
        dF_dKmm = 0.5*(dF_dKmm + dF_dKmm.T) # necessary? GPy bug?
-        #tmp = 2.*(np.einsum('ij,jlk->ilk', Kmmi,S) - np.eye(num_inducing)[:,:,None])
-        tmp = 2.*(linalg.ij_jlk_to_ilk(Kmmi, S) - np.eye(num_inducing)[:,:,None])
-        #dF_dKmn = np.einsum('ijk,jlk->il', tmp, Adv) + Kmmim.dot(dF_dmu.T)
-        dF_dKmn = linalg.ijk_jlk_to_il(tmp, Adv) + Kmmim.dot(dF_dmu.T)
+        tmp = S.reshape(-1, num_inducing).dot(Kmmi).reshape(num_outputs, num_inducing , num_inducing )
+        tmp = 2.*(tmp - np.eye(num_inducing)[None, :,:])
+
+        dF_dKmn = Kmmim.dot(dF_dmu.T)
+        for a,b in zip(tmp, Adv):
+            dF_dKmn += np.dot(a.T, b)
+
        dF_dm = Admu
        dF_dS = AdvA

@ -106,11 +111,11 @@ class SVGP(LatentFunctionInference):
        log_marginal = F.sum() - KL
        dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS- dKL_dS, dF_dKmm- dKL_dKmm, dF_dKmn

-        dL_dchol = np.dstack([2.*np.dot(dL_dS[:,:,i], L[:,:,i]) for i in range(num_outputs)])
+        dL_dchol = 2.*np.array([np.dot(a,b) for a, b in zip(dL_dS, L) ])
        dL_dchol = choleskies.triang_to_flat(dL_dchol)

        grad_dict = {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv.sum(1), 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
        if mean_function is not None:
            grad_dict['dL_dmfZ'] = dF_dmfZ - dKL_dmfZ
            grad_dict['dL_dmfX'] = dF_dmfX
-        return Posterior(mean=q_u_mean, cov=S, K=Kmm, prior_mean=prior_mean_u), log_marginal, grad_dict
+        return Posterior(mean=q_u_mean, cov=S.T, K=Kmm, prior_mean=prior_mean_u), log_marginal, grad_dict
--- a/GPy/inference/latent_function_inference/var_dtc_parallel.py
+++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py
@ -172,18 +172,23 @@ class VarDTC_minibatch(LatentFunctionInference):
        if not np.isfinite(Kmm).all():
            print(Kmm)
        Lm = jitchol(Kmm)
+        LmInv = dtrtri(Lm)

-        LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right')
+        LmInvPsi2LmInvT = LmInv.dot(psi2_full.dot(LmInv.T))
        Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT
        LL = jitchol(Lambda)
+        LLInv = dtrtri(LL)
        logdet_L = 2.*np.sum(np.log(np.diag(LL)))
-        b = dtrtrs(LL,dtrtrs(Lm,psi1Y_full.T)[0])[0]
+        LmLLInv = LLInv.dot(LmInv)
+        
+        b  = psi1Y_full.dot(LmLLInv.T)
        bbt = np.square(b).sum()
-        v = dtrtrs(Lm,dtrtrs(LL,b,trans=1)[0],trans=1)[0]
-
-        tmp  = -backsub_both_sides(LL, tdot(b)+output_dim*np.eye(input_dim), transpose='left')
-        dL_dpsi2R = backsub_both_sides(Lm, tmp+output_dim*np.eye(input_dim), transpose='left')/2.
-
+        v = b.dot(LmLLInv).T
+        LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)
+        
+        tmp = -LLInv.T.dot(LLinvPsi1TYYTPsi1LLinvT+output_dim*np.eye(input_dim)).dot(LLInv)
+        dL_dpsi2R = LmInv.T.dot(tmp+output_dim*np.eye(input_dim)).dot(LmInv)/2.
+        
        # Cache intermediate results
        self.midRes['dL_dpsi2R'] = dL_dpsi2R
        self.midRes['v'] = v
@ -201,7 +206,7 @@ class VarDTC_minibatch(LatentFunctionInference):
        # Compute dL_dKmm
        #======================================================================

-        dL_dKmm =  dL_dpsi2R - output_dim*backsub_both_sides(Lm, LmInvPsi2LmInvT, transpose='left')/2.
+        dL_dKmm =  dL_dpsi2R - output_dim*LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2.

        #======================================================================
        # Compute the Posterior distribution of inducing points p(u|Y)
--- a/GPy/inference/latent_function_inference/var_gauss.py
+++ b/GPy/inference/latent_function_inference/var_gauss.py
@ -0,0 +1,66 @@
+# Copyright (c) 2015, James Hensman
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+import numpy as np
+from ...util.linalg import pdinv
+from .posterior import Posterior
+from . import LatentFunctionInference
+log_2_pi = np.log(2*np.pi)
+
+class VarGauss(LatentFunctionInference):
+    """
+    The Variational Gaussian Approximation revisited
+
+    @article{Opper:2009,
+        title = {The Variational Gaussian Approximation Revisited},
+        author = {Opper, Manfred and Archambeau, C{\'e}dric},
+        journal = {Neural Comput.},
+        year = {2009},
+        pages = {786--792},
+    }
+    """
+    def __init__(self, alpha, beta):
+        """
+        :param alpha: GPy.core.Param varational parameter
+        :param beta: GPy.core.Param varational parameter
+        """
+        self.alpha, self.beta = alpha, beta
+
+    def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, Z=None):
+        if mean_function is not None:
+            raise NotImplementedError
+        num_data, output_dim = Y.shape
+        assert output_dim ==1, "Only one output supported"
+
+        K = kern.K(X)
+        m = K.dot(self.alpha)
+        KB = K*self.beta[:, None]
+        BKB = KB*self.beta[None, :]
+        A = np.eye(num_data) + BKB
+        Ai, LA, _, Alogdet = pdinv(A)
+        Sigma = np.diag(self.beta**-2) - Ai/self.beta[:, None]/self.beta[None, :]  # posterior coavairance: need full matrix for gradients
+        var = np.diag(Sigma).reshape(-1,1)
+
+        F, dF_dm, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, m, var, Y_metadata=Y_metadata)
+        if dF_dthetaL is not None:
+            dL_dthetaL = dF_dthetaL.sum(1).sum(1)
+        else:
+            dL_dthetaL = np.array([])
+        dF_da = np.dot(K, dF_dm)
+        SigmaB = Sigma*self.beta
+        dF_db = -np.diag(Sigma.dot(np.diag(dF_dv.flatten())).dot(SigmaB))*2
+        KL = 0.5*(Alogdet + np.trace(Ai) - num_data + np.sum(m*self.alpha))
+        dKL_da = m
+        A_A2 = Ai - Ai.dot(Ai)
+        dKL_db = np.diag(np.dot(KB.T, A_A2))
+        log_marginal = F.sum() - KL
+        self.alpha.gradient = dF_da - dKL_da
+        self.beta.gradient = dF_db - dKL_db
+
+        # K-gradients
+        dKL_dK = 0.5*(self.alpha*self.alpha.T + self.beta[:, None]*self.beta[None, :]*A_A2)
+        tmp = Ai*self.beta[:, None]/self.beta[None, :]
+        dF_dK = self.alpha*dF_dm.T + np.dot(tmp*dF_dv, tmp.T)
+
+        return Posterior(mean=m, cov=Sigma ,K=K),\
+               log_marginal,\
+               {'dL_dK':dF_dK-dKL_dK, 'dL_dthetaL':dL_dthetaL}
--- a/GPy/inference/optimization/optimization.py
+++ b/GPy/inference/optimization/optimization.py
@ -142,7 +142,7 @@ class opt_lbfgsb(Optimizer):

        #a more helpful error message is available in opt_result in the Error case
        if opt_result[2]['warnflag']==2:
-            self.status = 'Error' + opt_result[2]['task']
+            self.status = 'Error' + str(opt_result[2]['task'])

 class opt_simplex(Optimizer):
    def __init__(self, *args, **kwargs):
--- a/GPy/inference/optimization/stochastics.py
+++ b/GPy/inference/optimization/stochastics.py
@ -5,6 +5,10 @@ class StochasticStorage(object):
    '''
    This is a container for holding the stochastic parameters,
    such as subset indices or step length and so on.
+    
+    self.d has to be a list of lists:
+    [dimension indices, nan indices for those dimensions]
+    so that the minibatches can be used as efficiently as possible.10
    '''
    def __init__(self, model):
        """
@ -28,9 +32,23 @@ class SparseGPMissing(StochasticStorage):
        """
        Here we want to loop over all dimensions everytime.
        Thus, we can just make sure the loop goes over self.d every
-        time.
+        time. We will try to get batches which look the same together
+        which speeds up calculations significantly.
        """
-        self.d = range(model.Y_normalized.shape[1])
+        import numpy as np
+        self.Y = model.Y_normalized
+        bdict = {}
+        for d in range(self.Y.shape[1]):
+            inan = np.isnan(self.Y[:, d])
+            arr_str = np.array2string(inan, 
+                                      np.inf, 0, 
+                                      True, '', 
+                                      formatter={'bool':lambda x: '1' if x else '0'})
+            try:
+                bdict[arr_str][0].append(d)
+            except:
+                bdict[arr_str] = [[d], ~inan]
+        self.d = bdict.values()

 class SparseGPStochastics(StochasticStorage):
    """
@ -40,16 +58,29 @@ class SparseGPStochastics(StochasticStorage):
    def __init__(self, model, batchsize=1):
        self.batchsize = batchsize
        self.output_dim = model.Y.shape[1]
+        self.Y = model.Y_normalized
        self.reset()
        self.do_stochastics()

    def do_stochastics(self):
        if self.batchsize == 1:
            self.current_dim = (self.current_dim+1)%self.output_dim
-            self.d = [self.current_dim]
+            self.d = [[[self.current_dim], np.isnan(self.Y[:, self.d])]]
        else:
            import numpy as np
            self.d = np.random.choice(self.output_dim, size=self.batchsize, replace=False)
+            bdict = {}
+            for d in self.d:
+                inan = np.isnan(self.Y[:, d])
+                arr_str = int(np.array2string(inan, 
+                                          np.inf, 0, 
+                                          True, '', 
+                                          formatter={'bool':lambda x: '1' if x else '0'}), 2)
+                try:
+                    bdict[arr_str][0].append(d)
+                except:
+                    bdict[arr_str] = [[d], ~inan]
+            self.d = bdict.values()

    def reset(self):
        self.current_dim = -1
--- a/GPy/kern/init.py
+++ b/GPy/kern/init.py
@ -6,6 +6,7 @@ from ._src.brownian import Brownian
 from ._src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine
 from ._src.mlp import MLP
 from ._src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52
+from ._src.standard_periodic import StdPeriodic
 from ._src.independent_outputs import IndependentOutputs, Hierarchical
 from ._src.coregionalize import Coregionalize
 from ._src.ODE_UY import ODE_UY
@ -17,7 +18,7 @@ from ._src.eq_ode2 import EQ_ODE2
 from ._src.trunclinear import TruncLinear,TruncLinear_inf
 from ._src.splitKern import SplitKern,DEtime
 from ._src.splitKern import DEtime as DiffGenomeKern
-
-
-from _src.basis_funcs import LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel
+from ._src.spline import Spline
+from ._src.eq_ode2 import EQ_ODE2
+from ._src.basis_funcs import LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel

--- a/GPy/kern/_src/coregionalize.py
+++ b/GPy/kern/_src/coregionalize.py
@ -6,7 +6,7 @@ import numpy as np
 from ...core.parameterization import Param
 from ...core.parameterization.transformations import Logexp
 from ...util.config import config # for assesing whether to use cython
-import coregionalize_cython
+from . import coregionalize_cython

 class Coregionalize(Kern):
    """
@ -94,7 +94,7 @@ class Coregionalize(Kern):
            dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)


-        dkappa = np.diag(dL_dK_small)
+        dkappa = np.diag(dL_dK_small).copy()
        dL_dK_small += dL_dK_small.T
        dW = (self.W[:, None, :]*dL_dK_small[:, :, None]).sum(0)

--- a/GPy/kern/_src/coregionalize_cython.c
+++ b/GPy/kern/_src/coregionalize_cython.c
--- a/GPy/kern/_src/coregionalize_cython.pyx
+++ b/GPy/kern/_src/coregionalize_cython.pyx
@ -1,33 +1,37 @@
-#cython: boundscheck=True
-#cython: wraparound=True
+#cython: boundscheck=False
+#cython: wraparound=False
+#cython: nonecheck=False
 import cython
 import numpy as np
 cimport numpy as np

 def K_symmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X):
    cdef int N = X.size
-    cdef np.ndarray[np.double_t, ndim=2] K = np.empty((N, N))
-    for n in range(N):
-        for m in range(N):
-            K[n,m] = B[X[n],X[m]]
+    cdef np.ndarray[np.double_t, ndim=2, mode='c'] K = np.empty((N, N))
+    with nogil:
+        for n in range(N):
+            for m in range(N):
+                K[n, m] = B[X[n], X[m]]
    return K

 def K_asymmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X, np.ndarray[np.int64_t, ndim=1] X2):
    cdef int N = X.size
    cdef int M = X2.size
-    cdef np.ndarray[np.double_t, ndim=2] K = np.empty((N, M))
-    for n in range(N):
-        for m in range(M):
-            K[n,m] = B[X[n],X2[m]]
+    cdef np.ndarray[np.double_t, ndim=2, mode='c'] K = np.empty((N, M))
+    with nogil:
+        for n in range(N):
+            for m in range(M):
+                K[n, m] = B[X[n], X2[m]]
    return K

 def gradient_reduce(int D, np.ndarray[double, ndim=2] dL_dK, np.ndarray[np.int64_t, ndim=1] index, np.ndarray[np.int64_t, ndim=1] index2):
-        cdef np.ndarray[np.double_t, ndim=2] dL_dK_small = np.zeros((D, D))
+        cdef np.ndarray[np.double_t, ndim=2, mode='c'] dL_dK_small = np.zeros((D, D))
        cdef int N = index.size
        cdef int M = index2.size
-        for i in range(N):
-            for j in range(M):
-                dL_dK_small[index2[j],index[i]] += dL_dK[i,j];
+        with nogil:
+            for i in range(N):
+                for j in range(M):
+                    dL_dK_small[index2[j],index[i]] += dL_dK[i,j];
        return dL_dK_small


--- a/GPy/kern/_src/independent_outputs.py
+++ b/GPy/kern/_src/independent_outputs.py
@ -105,7 +105,7 @@ class IndependentOutputs(CombinationKernel):
        if X2 is None:
            # TODO: make use of index_to_slices
            # FIXME: Broken as X is already sliced out
-            print "Warning, gradients_X may not be working, I believe X has already been sliced out by the slicer!"
+            print("Warning, gradients_X may not be working, I believe X has already been sliced out by the slicer!")
            values = np.unique(X[:,self.index_dim])
            slices = [X[:,self.index_dim]==i for i in values]
            [target.__setitem__(s, kern.gradients_X(dL_dK[s,s],X[s],None))
--- a/GPy/kern/_src/kern.py
+++ b/GPy/kern/_src/kern.py
@ -59,6 +59,9 @@ class Kern(Parameterized):
        self._sliced_X = 0
        self.useGPU = self._support_GPU and useGPU
        self._return_psi2_n_flag = ObsAr(np.zeros(1)).astype(bool)
+        
+        from .psi_comp import PSICOMP_GH
+        self.psicomp = PSICOMP_GH()

    @property
    def return_psi2_n(self):
@ -90,11 +93,11 @@ class Kern(Parameterized):
    def Kdiag(self, X):
        raise NotImplementedError
    def psi0(self, Z, variational_posterior):
-        raise NotImplementedError
+        return self.psicomp.psicomputations(self, Z, variational_posterior)[0]
    def psi1(self, Z, variational_posterior):
-        raise NotImplementedError
+        return self.psicomp.psicomputations(self, Z, variational_posterior)[1]
    def psi2(self, Z, variational_posterior):
-        raise NotImplementedError
+        return self.psicomp.psicomputations(self, Z, variational_posterior)[2]
    def gradients_X(self, dL_dK, X, X2):
        raise NotImplementedError
    def gradients_X_diag(self, dL_dKdiag, X):
@ -119,21 +122,22 @@ class Kern(Parameterized):
                        dL_dpsi1 * dpsi1_d{theta_i} +
                        dL_dpsi2 * dpsi2_d{theta_i}
        """
-        raise NotImplementedError
+        dtheta = self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[0]
+        self.gradient[:] = dtheta

    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        """
        Returns the derivative of the objective wrt Z, using the chain rule
        through the expectation variables.
        """
-        raise NotImplementedError
+        return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[1]

    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        """
        Compute the gradients wrt the parameters of the variational
        distruibution q(X), chain-ruling via the expectations of the kernel
        """
-        raise NotImplementedError
+        return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[2:]

    def plot(self, x=None, fignum=None, ax=None, title=None, plot_limits=None, resolution=None, **mpl_kwargs):
        """
--- a/GPy/kern/_src/mlp.py
+++ b/GPy/kern/_src/mlp.py
@ -78,7 +78,7 @@ class MLP(Kern):
                           *((vec1[:, None]+vec2[None, :])*self.weight_variance
                             + 2*self.bias_variance + 2.))*base_cov_grad).sum()

-    def update_gradients_diag(self, X):
+    def update_gradients_diag(self, dL_dKdiag, X):
        self._K_diag_computations(X)
        self.variance.gradient = np.sum(self._K_diag_dvar*dL_dKdiag)
        
--- a/GPy/kern/_src/psi_comp/init.py
+++ b/GPy/kern/_src/psi_comp/init.py
@ -8,9 +8,10 @@ from . import rbf_psi_comp
 from . import ssrbf_psi_comp
 from . import sslinear_psi_comp
 from . import linear_psi_comp
+from .gaussherm import PSICOMP_GH

 class PSICOMP_RBF(Pickleable):
-    @Cache_this(limit=2, ignore_args=(0,))
+    @Cache_this(limit=10, ignore_args=(0,))
    def psicomputations(self, variance, lengthscale, Z, variational_posterior):
        if isinstance(variational_posterior, variational.NormalPosterior):
            return rbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior)
@ -19,7 +20,7 @@ class PSICOMP_RBF(Pickleable):
        else:
            raise ValueError("unknown distriubtion received for psi-statistics")

-    @Cache_this(limit=2, ignore_args=(0,1,2,3))
+    @Cache_this(limit=10, ignore_args=(0,1,2,3))
    def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
        if isinstance(variational_posterior, variational.NormalPosterior):
            return rbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior)
@ -33,7 +34,7 @@ class PSICOMP_RBF(Pickleable):

 class PSICOMP_Linear(Pickleable):

-    @Cache_this(limit=2, ignore_args=(0,))
+    @Cache_this(limit=10, ignore_args=(0,))
    def psicomputations(self, variance, Z, variational_posterior):
        if isinstance(variational_posterior, variational.NormalPosterior):
            return linear_psi_comp.psicomputations(variance, Z, variational_posterior)
@ -42,7 +43,7 @@ class PSICOMP_Linear(Pickleable):
        else:
            raise ValueError("unknown distriubtion received for psi-statistics")

-    @Cache_this(limit=2, ignore_args=(0,1,2,3))
+    @Cache_this(limit=10, ignore_args=(0,1,2,3))
    def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior):
        if isinstance(variational_posterior, variational.NormalPosterior):
            return linear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior)
--- a/GPy/kern/_src/psi_comp/gaussherm.py
+++ b/GPy/kern/_src/psi_comp/gaussherm.py
@ -0,0 +1,92 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+"""
+An approximated psi-statistics implementation based on Gauss-Hermite Quadrature
+"""
+
+import numpy as np
+
+from GPy.util.caching import Cache_this
+from ....util.linalg import tdot
+from ....core.parameterization.parameter_core import Pickleable
+
+class PSICOMP_GH(Pickleable):
+    
+    def __init__(self, degree=5, cache_K=True):
+        self.degree = degree
+        self.cache_K = cache_K
+        self.locs, self.weights = np.polynomial.hermite.hermgauss(degree)
+        self.locs *= np.sqrt(2.)
+        self.weights*= 1./np.sqrt(np.pi)
+        self.Xs = None
+
+    def _setup_observers(self):
+        pass
+    
+    @Cache_this(limit=10, ignore_args=(0,))
+    def comp_K(self, Z, qX):
+        if self.Xs is None or self.Xs.shape != qX.mean.shape:
+            from ....core.parameterization import ObsAr
+            self.Xs = ObsAr(np.empty((self.degree,)+qX.mean.shape))
+        mu, S = qX.mean.values, qX.variance.values
+        S_sq = np.sqrt(S)
+        for i in xrange(self.degree):
+            self.Xs[i] = self.locs[i]*S_sq+mu
+        return self.Xs
+    
+    @Cache_this(limit=10, ignore_args=(0,))
+    def psicomputations(self, kern, Z, qX):
+        mu, S = qX.mean.values, qX.variance.values
+        N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
+        if self.cache_K: Xs = self.comp_K(Z, qX)
+        else: S_sq = np.sqrt(S)
+        
+        psi0 = np.zeros((N,))
+        psi1 = np.zeros((N,M))
+        psi2 = np.zeros((M,M))
+        for i in xrange(self.degree):
+            if self.cache_K:
+                X = Xs[i]
+            else:
+                X = self.locs[i]*S_sq+mu
+            psi0 += self.weights[i]* kern.Kdiag(X)
+            Kfu = kern.K(X,Z)
+            psi1 += self.weights[i]* Kfu
+            psi2 += self.weights[i]* tdot(Kfu.T)
+        return psi0, psi1, psi2
+    
+    @Cache_this(limit=10, ignore_args=(0, 2,3,4))
+    def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, qX):
+        mu, S = qX.mean.values, qX.variance.values
+        if self.cache_K: Xs = self.comp_K(Z, qX)
+        S_sq = np.sqrt(S)
+        
+        dtheta_old = kern.gradient.copy()
+        dtheta = np.zeros_like(kern.gradient)
+        dZ = np.zeros_like(Z.values)
+        dmu = np.zeros_like(mu)
+        dS = np.zeros_like(S)
+        for i in xrange(self.degree):
+            if self.cache_K:
+                X = Xs[i]
+            else:
+                X = self.locs[i]*S_sq+mu
+            dL_dpsi0_i = dL_dpsi0*self.weights[i]
+            kern.update_gradients_diag(dL_dpsi0_i, X)
+            dtheta += kern.gradient
+            dX = kern.gradients_X_diag(dL_dpsi0_i, X)
+            Kfu = kern.K(X,Z)
+            dL_dkfu = (dL_dpsi1+ 2.*Kfu.dot(dL_dpsi2))*self.weights[i]
+            kern.update_gradients_full(dL_dkfu, X, Z)
+            dtheta += kern.gradient
+            dX += kern.gradients_X(dL_dkfu, X, Z)
+            dZ += kern.gradients_X(dL_dkfu.T, Z, X)
+            dmu += dX
+            dS += dX*self.locs[i]/(2.*S_sq)
+        kern.gradient[:] = dtheta_old
+        return dtheta, dZ, dmu, dS
+        
+
+
+
--- a/GPy/kern/_src/psi_comp/rbf_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/rbf_psi_comp.py
@ -55,16 +55,15 @@ def __psi2computations(variance, lengthscale, Z, mu, S):
    # Produced intermediate results:
    # _psi2                MxM

+    N,M,Q = mu.shape[0], Z.shape[0], mu.shape[1]
    lengthscale2 = np.square(lengthscale)

    _psi2_logdenom = np.log(2.*S/lengthscale2+1.).sum(axis=-1)/(-2.) # N
    _psi2_exp1 = (np.square(Z[:,None,:]-Z[None,:,:])/lengthscale2).sum(axis=-1)/(-4.) #MxM
    Z_hat = (Z[:,None,:]+Z[None,:,:])/2. #MxMxQ
    denom = 1./(2.*S+lengthscale2)
-    _psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+2.*np.einsum('nq,moq,nq->nmo',mu,Z_hat,denom)-np.einsum('moq,nq->nmo',np.square(Z_hat),denom)
+    _psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+(2*(mu*denom).dot(Z_hat.reshape(M*M,Q).T) - denom.dot(np.square(Z_hat).reshape(M*M,Q).T)).reshape(N,M,M)
    _psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2)
-
-
    return _psi2

 def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
@ -157,5 +156,5 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):

    return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS

-_psi1computations = Cacher(__psi1computations, limit=1)
-_psi2computations = Cacher(__psi2computations, limit=1)
+_psi1computations = Cacher(__psi1computations, limit=5)
+_psi2computations = Cacher(__psi2computations, limit=5)
--- a/GPy/kern/_src/rbf.py
+++ b/GPy/kern/_src/rbf.py
@ -20,7 +20,6 @@ class RBF(Stationary):
    _support_GPU = True
    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='rbf', useGPU=False):
        super(RBF, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name, useGPU=useGPU)
-        self.psicomp = PSICOMP_RBF()
        if self.useGPU:
            self.psicomp = PSICOMP_RBF_GPU()
        else:
@ -36,6 +35,7 @@ class RBF(Stationary):
        dc = super(RBF, self).__getstate__()
        if self.useGPU:
            dc['psicomp'] = PSICOMP_RBF()
+            dc['useGPU'] = False
        return dc

    def __setstate__(self, state):
--- a/GPy/kern/_src/spline.py
+++ b/GPy/kern/_src/spline.py
@ -0,0 +1,52 @@
+# Copyright (c) 2015, Thomas Hornung
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from .kern import Kern
+from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp
+class Spline(Kern):
+    """
+    Linear spline kernel. You need to specify 2 parameters: the variance and c.
+    The variance is defined in powers of 10. Thus specifying -2 means 10^-2.
+    The parameter c allows to define the stiffness of the spline fit. A very stiff
+    spline equals linear regression.
+    See https://www.youtube.com/watch?v=50Vgw11qn0o starting at minute 1:17:28
+    Lit: Wahba, 1990
+    """
+
+    def __init__(self, input_dim, variance=1., c=1., active_dims=None, name='spline'):
+        super(Spline, self).__init__(input_dim, active_dims, name)
+        self.variance = Param('variance', variance, Logexp())
+        self.c = Param('c', c)
+        self.link_parameters(self.variance,self.c)
+
+
+    def K(self, X, X2=None):
+        if X2 is None: X2=X
+        term1 = (X+8.)*(X2.T+8.)/16.
+        term2 = abs((X-X2.T)/16.)**3
+        term3 = ((X+8.)/16.)**3 + ((X2.T+8.)/16.)**3
+        return (self.variance**2 * (1. + (1.+self.c) * term1 + self.c/3. * (term2 - term3)))
+
+    def Kdiag(self, X):
+        term1 = np.square(X+8.,X+8.)/16.
+        term3 = 2. * ((X+8.)/16.)**3
+        return (self.variance**2 * (1. + (1.+self.c) * term1 - self.c/3. * term3))[:,0]
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        if X2 is None: X2=X
+        term1 = (X+8.)*(X2.T+8.)/16.
+        term2 = abs((X-X2.T)/16.)**3
+        term3 = ((X+8.)/16.)**3 + ((X2.T+8.)/16.)**3
+        self.variance.gradient = np.sum(dL_dK * (2*self.variance * (1. + (1.+self.c) * term1 + self.c/3. * ( term2 - term3))))
+        self.c.gradient = np.sum(dL_dK * (self.variance**2* (term1 + 1./3.*(term2 - term3))))
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        raise NotImplementedError
+
+    def gradients_X(self, dL_dK, X, X2=None):
+        raise NotImplementedError
+
+    def gradients_X_diag(self, dL_dKdiag, X):
+        raise NotImplementedError
--- a/GPy/kern/_src/standard_periodic.py
+++ b/GPy/kern/_src/standard_periodic.py
@ -0,0 +1,166 @@
+# -*- coding: utf-8 -*-
+
+# Copyright (c) 2014, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+"""
+The standard periodic kernel which mentioned in:
+
+[1] Gaussian Processes for Machine Learning, C. E. Rasmussen, C. K. I. Williams.
+The MIT Press, 2005.
+
+
+[2] Introduction to Gaussian processes. D. J. C. MacKay. In C. M. Bishop, editor, 
+Neural Networks and Machine Learning, pages 133-165. Springer, 1998.
+"""
+
+from .kern import Kern
+from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp
+
+import numpy as np
+
+class StdPeriodic(Kern):
+    """
+    Standart periodic kernel
+
+    .. math::
+
+       k(x,y) = \theta_1 \exp \left[  - \frac{1}{2} {}\sum_{i=1}^{input\_dim}  
+       \left( \frac{\sin(\frac{\pi}{\lambda_i} (x_i - y_i) )}{l_i} \right)^2 \right] }
+
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
+    :param variance: the variance :math:`\theta_1` in the formula above
+    :type variance: float
+    :param wavelength: the vector of wavelengths :math:`\lambda_i`. If None then 1.0 is assumed.
+    :type wavelength: array or list of the appropriate size (or float if there is only one wavelength parameter)
+    :param lengthscale: the vector of lengthscale :math:`\l_i`. If None then 1.0 is assumed.
+    :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
+    :param ARD1: Auto Relevance Determination with respect to wavelength. 
+        If equal to "False" one single wavelength parameter :math:`\lambda_i` for 
+        each dimension is assumed, otherwise there is one lengthscale 
+        parameter per dimension.
+    :type ARD1: Boolean
+    :param ARD2: Auto Relevance Determination with respect to lengthscale. 
+        If equal to "False" one single wavelength parameter :math:`l_i` for 
+        each dimension is assumed, otherwise there is one lengthscale 
+        parameter per dimension.
+    :type ARD2: Boolean
+    :param active_dims: indices of dimensions which are used in the computation of the kernel
+    :type wavelength: array or list of the appropriate size
+    :param name: Name of the kernel for output
+    :type String
+    :param useGPU: whether of not use GPU
+    :type Boolean
+    """
+    
+    def __init__(self, input_dim, variance=1., wavelength=None, lengthscale=None, ARD1=False, ARD2=False, active_dims=None, name='std_periodic',useGPU=False):
+        super(StdPeriodic, self).__init__(input_dim, active_dims, name, useGPU=useGPU)
+        self.input_dim = input_dim
+        self.ARD1 = ARD1 # correspond to wavelengths        
+        self.ARD2 = ARD2 # correspond to lengthscales
+        
+        self.name = name
+        
+        if self.ARD1 == False:
+            if wavelength is not None:
+                wavelength = np.asarray(wavelength)
+                assert wavelength.size == 1, "Only one wavelength needed for non-ARD kernel"
+            else:
+                wavelength = np.ones(1)
+        else:
+            if wavelength is not None:
+                wavelength = np.asarray(wavelength)
+                assert wavelength.size == input_dim, "bad number of wavelengths"
+            else:
+                wavelength = np.ones(input_dim)
+        
+        if self.ARD2 == False:
+            if lengthscale is not None:
+                lengthscale = np.asarray(lengthscale)
+                assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
+            else:
+                lengthscale = np.ones(1)
+        else:
+            if lengthscale is not None:
+                lengthscale = np.asarray(lengthscale)
+                assert lengthscale.size == input_dim, "bad number of lengthscales"
+            else:
+                lengthscale = np.ones(input_dim)
+        
+        self.variance = Param('variance', variance, Logexp())
+        assert self.variance.size==1, "Variance size must be one"
+        self.wavelengths =  Param('wavelengths', wavelength, Logexp())
+        self.lengthscales =  Param('lengthscales', lengthscale, Logexp())
+        
+        self.link_parameters(self.variance,  self.wavelengths, self.lengthscales)
+
+    def parameters_changed(self):
+        """
+        This functions deals as a callback for each optimization iteration. 
+        If one optimization step was successfull and the parameters
+        this callback function will be called to be able to update any 
+        precomputations for the kernel.
+        """
+        
+        pass
+        
+        
+    def K(self, X, X2=None):
+        """Compute the covariance matrix between X and X2."""
+        if X2 is None: 
+            X2 = X
+            
+        base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths
+        exp_dist = np.exp( -0.5* np.sum( np.square(  np.sin( base ) / self.lengthscales ), axis = -1 ) ) 
+            
+        return self.variance * exp_dist
+
+
+    def Kdiag(self, X):
+        """Compute the diagonal of the covariance matrix associated to X."""
+        ret = np.empty(X.shape[0])
+        ret[:] = self.variance
+        return ret
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        """derivative of the covariance matrix with respect to the parameters."""
+        if X2 is None: 
+            X2 = X
+        
+        base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths
+        
+        sin_base = np.sin( base )         
+        exp_dist = np.exp( -0.5* np.sum( np.square(  sin_base / self.lengthscales ), axis = -1 ) ) 
+        
+        dwl = self.variance * (1.0/np.square(self.lengthscales)) * sin_base*np.cos(base) * (base / self.wavelengths)
+        
+        dl = self.variance * np.square( sin_base) / np.power( self.lengthscales, 3) 
+        
+        self.variance.gradient = np.sum(exp_dist * dL_dK)    
+        #target[0] += np.sum( exp_dist * dL_dK)        
+        
+        if self.ARD1: # different wavelengths
+            self.wavelengths.gradient = (dwl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
+        else:  # same wavelengths
+            self.wavelengths.gradient = np.sum(dwl.sum(-1) * exp_dist * dL_dK)
+            
+        if self.ARD2: # different lengthscales
+            self.lengthscales.gradient = (dl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
+        else: # same lengthscales
+            self.lengthscales.gradient = np.sum(dl.sum(-1) * exp_dist * dL_dK)
+        
+    def update_gradients_diag(self, dL_dKdiag, X):
+        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
+        self.variance.gradient = np.sum(dL_dKdiag)
+        self.wavelengths.gradient = 0
+        self.lengthscales.gradient = 0
+
+#    def gradients_X(self, dL_dK, X, X2=None):
+#        """derivative of the covariance matrix with respect to X."""
+#    
+#        raise NotImplemented("Periodic kernel: dK_dX not implemented")
+#
+#    def gradients_X_diag(self, dL_dKdiag, X):
+#        
+#        raise NotImplemented("Periodic kernel: dKdiag_dX not implemented")
--- a/GPy/kern/_src/stationary.py
+++ b/GPy/kern/_src/stationary.py
@ -13,7 +13,7 @@ from ...util.config import config # for assesing whether to use cython
 from ...util.caching import Cache_this

 try:
-    import stationary_cython
+    from . import stationary_cython
 except ImportError:
    print('warning in sationary: failed to import cython module: falling back to numpy')
    config.set('cython', 'working', 'false')
@ -77,7 +77,7 @@ class Stationary(Kern):
    def dK_dr(self, r):
        raise NotImplementedError("implement derivative of the covariance function wrt r to use this class")

-    @Cache_this(limit=5, ignore_args=())
+    @Cache_this(limit=20, ignore_args=())
    def K(self, X, X2=None):
        """
        Kernel function applied on inputs X and X2.
@ -89,7 +89,7 @@ class Stationary(Kern):
        r = self._scaled_dist(X, X2)
        return self.K_of_r(r)

-    @Cache_this(limit=3, ignore_args=())
+    @Cache_this(limit=20, ignore_args=())
    def dK_dr_via_X(self, X, X2):
        #a convenience function, so we can cache dK_dr
        return self.dK_dr(self._scaled_dist(X, X2))
@ -114,7 +114,7 @@ class Stationary(Kern):
            r2 = np.clip(r2, 0, np.inf)
            return np.sqrt(r2)

-    @Cache_this(limit=5, ignore_args=())
+    @Cache_this(limit=20, ignore_args=())
    def _scaled_dist(self, X, X2=None):
        """
        Efficiently compute the scaled distance, r.
--- a/GPy/kern/_src/stationary_cython.c
+++ b/GPy/kern/_src/stationary_cython.c
--- a/GPy/kern/_src/stationary_cython.pyx
+++ b/GPy/kern/_src/stationary_cython.pyx
@ -1,15 +1,18 @@
 #cython: boundscheck=False
+#cython: nonecheck=False
 #cython: wraparound=False
 import numpy as np
 cimport numpy as np
+from cython.parallel import prange
+cimport cython

 ctypedef np.float64_t DTYPE_t
 
 cdef extern from "stationary_utils.h":
-    void _grad_X "_grad_X" (int N, int D, int M, double* X, double* X2, double* tmp, double* grad)
+    void _grad_X "_grad_X" (int N, int D, int M, double* X, double* X2, double* tmp, double* grad) nogil

 cdef extern from "stationary_utils.h":
-    void _lengthscale_grads "_lengthscale_grads" (int N, int M, int Q, double* tmp, double* X, double* X2, double* grad)
+    void _lengthscale_grads "_lengthscale_grads" (int N, int M, int Q, double* tmp, double* X, double* X2, double* grad) nogil
 
 def grad_X(int N, int D, int M,
        np.ndarray[DTYPE_t, ndim=2] _X,
@ -20,9 +23,20 @@ def grad_X(int N, int D, int M,
    cdef double *X2 = <double*> _X2.data
    cdef double *tmp = <double*> _tmp.data
    cdef double *grad = <double*> _grad.data
-    _grad_X(N, D, M, X, X2, tmp, grad) # return nothing, work in place.
+    with nogil:
+        _grad_X(N, D, M, X, X2, tmp, grad) # return nothing, work in place.

-def lengthscale_grads(int N, int M, int Q,
+@cython.cdivision(True)
+def grad_X_cython(int N, int D, int M, double[:,:] X, double[:,:] X2, double[:,:] tmp, double[:,:] grad):
+    cdef int n,d,nd,m
+    for nd in prange(N * D, nogil=True):
+        n = nd / D
+        d = nd % D
+        grad[n,d] = 0.0
+        for m in range(M):
+            grad[n,d] += tmp[n, m] * (X[n, d] - X2[m, d])
+
+def lengthscale_grads_in_c(int N, int M, int Q,
        np.ndarray[DTYPE_t, ndim=2] _tmp,
        np.ndarray[DTYPE_t, ndim=2] _X,
        np.ndarray[DTYPE_t, ndim=2] _X2,
@ -31,6 +45,16 @@ def lengthscale_grads(int N, int M, int Q,
    cdef double *X = <double*> _X.data
    cdef double *X2 = <double*> _X2.data
    cdef double *grad = <double*> _grad.data
-    _lengthscale_grads(N, M, Q, tmp, X, X2, grad) # return nothing, work in place.
-
+    with nogil:
+        _lengthscale_grads(N, M, Q, tmp, X, X2, grad) # return nothing, work in place.

+def lengthscale_grads(int N, int M, int Q, double[:,:] tmp, double[:,:] X, double[:,:] X2, double[:] grad):
+    cdef int q, n, m
+    cdef double gradq, dist
+    with nogil:
+        for q in range(Q):
+            grad[q] = 0.0
+            for n in range(N):
+                for m in range(M):
+                    dist = X[n,q] - X2[m,q]
+                    grad[q] += tmp[n, m] * dist * dist
--- a/GPy/kern/_src/stationary_utils.c
+++ b/GPy/kern/_src/stationary_utils.c
@ -1,19 +1,36 @@
 void _grad_X(int N, int D, int M, double* X, double* X2, double* tmp, double* grad){
-int n,m,d;
 double retnd;
-//#pragma omp parallel for private(n,d, retnd, m)
-for(d=0;d<D;d++){
-  for(n=0;n<N;n++){
-    retnd = 0.0;
-    for(m=0;m<M;m++){
-      retnd += tmp[n*M+m]*(X[n*D+d]-X2[m*D+d]);
-    }
-    grad[n*D+d] = retnd;
+int n,d,nd,m;
+#pragma omp parallel for private(nd,n,d, retnd, m)
+for(nd=0;nd<(D*N);nd++){
+  n = nd/D;
+  d = nd%D;
+  retnd = 0.0;
+  for(m=0;m<M;m++){
+    retnd += tmp[n*M+m]*(X[nd]-X2[m*D+d]);
  }
+  grad[nd] = retnd;
 }
 } //grad_X


+void _lengthscale_grads_unsafe(int N, int M, int Q, double* tmp, double* X, double* X2, double* grad){
+int n,m,nm,q,nQ,mQ;
+double dist;
+#pragma omp parallel for private(n,m,nm,q,nQ,mQ,dist)
+for(nm=0; nm<(N*M); nm++){
+  n = nm/M;
+  m = nm%M; 
+  nQ = n*Q;
+  mQ = m*Q;
+  for(q=0; q<Q; q++){
+    dist = X[nQ+q]-X2[mQ+q];
+    grad[q] += tmp[nm]*dist*dist;
+  }
+}
+} //lengthscale_grads
+
+
 void _lengthscale_grads(int N, int M, int Q, double* tmp, double* X, double* X2, double* grad){
 int n,m,q;
 double gradq, dist;
@ -33,3 +50,5 @@ for(q=0; q<Q; q++){



+
+
--- a/GPy/likelihoods/bernoulli.py
+++ b/GPy/likelihoods/bernoulli.py
@ -85,6 +85,7 @@ class Bernoulli(Likelihood):
                gh_x, gh_w = gh_points


+            gh_w = gh_w / np.sqrt(np.pi)
            shape = m.shape
            m,v,Y = m.flatten(), v.flatten(), Y.flatten()
            Ysign = np.where(Y==1,1,-1)
@ -232,6 +233,17 @@ class Bernoulli(Likelihood):
        np.seterr(**state)
        return d3logpdf_dlink3

+    def predictive_quantiles(self, mu, var, quantiles, Y_metadata=None):
+        """
+        Get the "quantiles" of the binary labels (Bernoulli draws). all the
+        quantiles must be either 0 or 1, since those are the only values the
+        draw can take!
+        """
+        p = self.predictive_mean(mu, var)
+        return [np.asarray(p>(q/100.), dtype=np.int32) for q in quantiles]
+
+
+
    def samples(self, gp, Y_metadata=None):
        """
        Returns a set of samples of observations based on a given value of the latent variable.
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@ -316,6 +316,9 @@ class Gaussian(Likelihood):
        return -0.5*np.log(2*np.pi) -0.5*np.log(v) - 0.5*np.square(y_test - mu_star)/v

    def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
+        if not isinstance(self.gp_link, link_functions.Identity):
+            return super(Gaussian, self).variational_expectations(Y=Y, m=m, v=v, gh_points=gh_points, Y_metadata=Y_metadata)
+
        lik_var = float(self.variance)
        F = -0.5*np.log(2*np.pi) -0.5*np.log(lik_var) - 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/lik_var
        dF_dmu = (Y - m)/lik_var
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@ -143,7 +143,7 @@ class Likelihood(Parameterized):

        p_ystar, _ = zip(*[quad(integral_generator(yi, mi, vi, yi_m), -np.inf, np.inf)
                           for yi, mi, vi, yi_m in zipped_values])
-        p_ystar = np.array(p_ystar).reshape(-1, 1)
+        p_ystar = np.array(p_ystar).reshape(*y_test.shape)
        return np.log(p_ystar)

    def log_predictive_density_sampling(self, y_test, mu_star, var_star, Y_metadata=None, num_samples=1000):
@ -173,6 +173,7 @@ class Likelihood(Parameterized):

        from scipy.misc import logsumexp
        log_p_ystar = -np.log(num_samples) + logsumexp(self.logpdf(fi_samples, y_test, Y_metadata=Y_metadata), axis=1)
+        log_p_ystar = np.array(log_p_ystar).reshape(*y_test.shape)
        return log_p_ystar


@ -265,8 +266,8 @@ class Likelihood(Parameterized):
            stop

        if self.size:
-            dF_dtheta = self.dlogpdf_dtheta(X, Y[:,None]) # Ntheta x (orig size) x N_{quad_points}
-            dF_dtheta = np.dot(dF_dtheta, gh_w)
+            dF_dtheta = self.dlogpdf_dtheta(X, Y[:,None], Y_metadata=Y_metadata) # Ntheta x (orig size) x N_{quad_points}
+            dF_dtheta = np.dot(dF_dtheta, gh_w)/np.sqrt(np.pi)
            dF_dtheta = dF_dtheta.reshape(self.size, shape[0], shape[1])
        else:
            dF_dtheta = None # Not yet implemented
@ -297,13 +298,8 @@ class Likelihood(Parameterized):
                return self.conditional_mean(f)*p
        scaled_mean = [quad(int_mean, fmin, fmax,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
        mean = np.array(scaled_mean)[:,None] / np.sqrt(2*np.pi*(variance))
-
        return mean

-    def _conditional_mean(self, f):
-        """Quadrature calculation of the conditional mean: E(Y_star|f)"""
-        raise NotImplementedError("implement this function to make predictions")
-
    def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None):
        """
        Approximation to the predictive variance: V(Y_star)
@ -607,23 +603,30 @@ class Likelihood(Parameterized):
        :param full_cov: whether to use the full covariance or just the diagonal
        :type full_cov: Boolean
        """
-
-        pred_mean = self.predictive_mean(mu, var, Y_metadata)
-        pred_var = self.predictive_variance(mu, var, pred_mean, Y_metadata)
+        try:
+            pred_mean = self.predictive_mean(mu, var, Y_metadata=Y_metadata)
+            pred_var = self.predictive_variance(mu, var, pred_mean, Y_metadata=Y_metadata)
+        except NotImplementedError:
+            print("Finding predictive mean and variance via sampling rather than quadrature")
+            Nf_samp = 300
+            Ny_samp = 1
+            s = np.random.randn(mu.shape[0], Nf_samp)*np.sqrt(var) + mu
+            ss_y = self.samples(s, Y_metadata, samples=Ny_samp)
+            pred_mean = np.mean(ss_y, axis=1)[:, None]
+            pred_var = np.var(ss_y, axis=1)[:, None]

        return pred_mean, pred_var

    def predictive_quantiles(self, mu, var, quantiles, Y_metadata=None):
        #compute the quantiles by sampling!!!
-        N_samp = 500
-        s = np.random.randn(mu.shape[0], N_samp)*np.sqrt(var) + mu
-        #ss_f = s.flatten()
-        #ss_y = self.samples(ss_f, Y_metadata)
-        #ss_y = self.samples(s, Y_metadata, samples=100)
-        ss_y = self.samples(s, Y_metadata)
-        #ss_y = ss_y.reshape(mu.shape[0], N_samp)
+        Nf_samp = 300
+        Ny_samp = 1
+        s = np.random.randn(mu.shape[0], Nf_samp)*np.sqrt(var) + mu
+        ss_y = self.samples(s, Y_metadata, samples=Ny_samp)
+        #ss_y = ss_y.reshape(mu.shape[0], mu.shape[1], Nf_samp*Ny_samp)

-        return [np.percentile(ss_y ,q, axis=1)[:,None] for q in quantiles]
+        pred_quantiles = [np.percentile(ss_y, q, axis=1)[:,None] for q in quantiles]
+        return pred_quantiles

    def samples(self, gp, Y_metadata=None, samples=1):
        """
--- a/GPy/likelihoods/poisson.py
+++ b/GPy/likelihoods/poisson.py
@ -137,7 +137,7 @@ class Poisson(Likelihood):
        """
        return self.gp_link.transf(gp)

-    def samples(self, gp, Y_metadata=None):
+    def samples(self, gp, Y_metadata=None, samples=1):
        """
        Returns a set of samples of observations based on a given value of the latent variable.

@ -145,5 +145,5 @@ class Poisson(Likelihood):
        """
        orig_shape = gp.shape
        gp = gp.flatten()
-        Ysim = np.random.poisson(self.gp_link.transf(gp))
-        return Ysim.reshape(orig_shape)
+        Ysim = np.random.poisson(self.gp_link.transf(gp), [samples, gp.size]).T
+        return Ysim.reshape(orig_shape+(samples,))
--- a/GPy/models/bayesian_gplvm_minibatch.py
+++ b/GPy/models/bayesian_gplvm_minibatch.py
@ -64,9 +64,6 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
            self.logger.debug("creating inference_method var_dtc")
            inference_method = VarDTC(limit=1 if not missing_data else Y.shape[1])

-        if kernel.useGPU and isinstance(inference_method, VarDTC_GPU):
-            kernel.psicomp.GPU_direct = True
-
        super(BayesianGPLVMMiniBatch,self).__init__(X, Y, Z, kernel, likelihood=likelihood,
                                           name=name, inference_method=inference_method,
                                           normalizer=normalizer,
--- a/GPy/models/bcgplvm.py
+++ b/GPy/models/bcgplvm.py
@ -1,11 +1,11 @@
-# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2015 James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


 import numpy as np
 from ..core import GP
-from ..models import GPLVM
-from ..mappings import *
+from . import GPLVM
+from .. import mappings


 class BCGPLVM(GPLVM):
@ -16,33 +16,31 @@ class BCGPLVM(GPLVM):
    :type Y: np.ndarray
    :param input_dim: latent dimensionality
    :type input_dim: int
-    :param init: initialisation method for the latent space
-    :type init: 'PCA'|'random'
    :param mapping: mapping for back constraint
    :type mapping: GPy.core.Mapping object

    """
-    def __init__(self, Y, input_dim, init='PCA', X=None, kernel=None, normalize_Y=False, mapping=None):
-        
+    def __init__(self, Y, input_dim, kernel=None, mapping=None):
+
+
        if mapping is None:
-            mapping = Kernel(X=Y, output_dim=input_dim)
+            mapping = mappings.MLP(input_dim=Y.shape[1],
+                                   output_dim=input_dim,
+                                   hidden_dim=10)
+        else:
+            assert mapping.input_dim==Y.shape[1], "mapping input dim does not work for Y dimension"
+            assert mapping.output_dim==input_dim, "mapping output dim does not work for self.input_dim"
+        GPLVM.__init__(self, Y, input_dim, X=mapping.f(Y), kernel=kernel, name="bcgplvm")
+        self.unlink_parameter(self.X)
        self.mapping = mapping
-        GPLVM.__init__(self, Y, input_dim, init, X, kernel, normalize_Y)
-        self.X = self.mapping.f(self.likelihood.Y)
+        self.link_parameter(self.mapping)

-    def _get_param_names(self):
-        return self.mapping._get_param_names() + GP._get_param_names(self)
+        self.X = self.mapping.f(self.Y)

-    def _get_params(self):
-        return np.hstack((self.mapping._get_params(), GP._get_params(self)))
+    def parameters_changed(self):
+        self.X = self.mapping.f(self.Y)
+        GP.parameters_changed(self)
+        Xgradient = self.kern.gradients_X(self.grad_dict['dL_dK'], self.X, None)
+        self.mapping.update_gradients(Xgradient, self.Y)

-    def _set_params(self, x):
-        self.mapping._set_params(x[:self.mapping.num_params])
-        self.X = self.mapping.f(self.likelihood.Y)
-        GP._set_params(self, x[self.mapping.num_params:])
-
-    def _log_likelihood_gradients(self):
-        dL_df = self.kern.gradients_X(self.dL_dK, self.X)
-        dL_dtheta = self.mapping.df_dtheta(dL_df, self.likelihood.Y)
-        return np.hstack((dL_dtheta.flatten(), GP._log_likelihood_gradients(self)))

--- a/GPy/models/dpgplvm.py
+++ b/GPy/models/dpgplvm.py
@ -3,7 +3,7 @@

 import numpy as np
 from .. import kern
-from bayesian_gplvm import BayesianGPLVM
+from .bayesian_gplvm import BayesianGPLVM
 from ..core.parameterization.variational import NormalPosterior, NormalPrior

 class DPBayesianGPLVM(BayesianGPLVM):
@ -15,5 +15,5 @@ class DPBayesianGPLVM(BayesianGPLVM):
                 name='bayesian gplvm', mpi_comm=None, normalizer=None,
                 missing_data=False, stochastic=False, batchsize=1):
        super(DPBayesianGPLVM,self).__init__(Y=Y, input_dim=input_dim, X=X, X_variance=X_variance, init=init, 					num_inducing=num_inducing, Z=Z, kernel=kernel, inference_method=inference_method, likelihood=likelihood, mpi_comm=mpi_comm, normalizer=normalizer, missing_data=missing_data, stochastic=stochastic, batchsize=batchsize, name='dp bayesian gplvm')
-	self.X.mean.set_prior(X_prior)
+        self.X.mean.set_prior(X_prior)
        self.link_parameter(X_prior)
--- a/GPy/models/gp_regression.py
+++ b/GPy/models/gp_regression.py
@ -16,6 +16,7 @@ class GPRegression(GP):
    :param Y: observed values
    :param kernel: a GPy kernel, defaults to rbf
    :param Norm normalizer: [False]
+    :param noise_var: the noise variance for Gaussian likelhood, defaults to 1.

        Normalize Y with the norm given.
        If normalizer is False, no normalization will be done
@ -25,12 +26,12 @@ class GPRegression(GP):

    """

-    def __init__(self, X, Y, kernel=None, Y_metadata=None, normalizer=None):
+    def __init__(self, X, Y, kernel=None, Y_metadata=None, normalizer=None, noise_var=1.):

        if kernel is None:
            kernel = kern.RBF(X.shape[1])
-
-        likelihood = likelihoods.Gaussian()
+	
+        likelihood = likelihoods.Gaussian(variance=noise_var)

        super(GPRegression, self).__init__(X, Y, kernel, likelihood, name='GP regression', Y_metadata=Y_metadata, normalizer=normalizer)

--- a/GPy/models/gp_var_gauss.py
+++ b/GPy/models/gp_var_gauss.py
@ -2,20 +2,18 @@
 # Distributed under the terms of the GNU General public License, see LICENSE.txt

 import numpy as np
-from scipy import stats
-from scipy.special import erf
-from ..core.model import Model
+from ..core import GP
 from ..core.parameterization import ObsAr
 from .. import kern
 from ..core.parameterization.param import Param
-from ..util.linalg import pdinv
+from ..inference.latent_function_inference import VarGauss

 log_2_pi = np.log(2*np.pi)


-class GPVariationalGaussianApproximation(Model):
+class GPVariationalGaussianApproximation(GP):
    """
-    The Variational Gaussian Approximation revisited implementation for regression
+    The Variational Gaussian Approximation revisited

    @article{Opper:2009,
        title = {The Variational Gaussian Approximation Revisited},
@ -25,84 +23,15 @@ class GPVariationalGaussianApproximation(Model):
        pages = {786--792},
    }
    """
-    def __init__(self, X, Y, kernel=None):
-        Model.__init__(self,'Variational GP classification')
-        # accept the construction arguments
-        self.X = ObsAr(X)
-        if kernel is None:
-            kernel = kern.RBF(X.shape[1]) + kern.White(X.shape[1], 0.01)
-        self.kern = kernel
-        self.link_parameter(self.kern)
-        self.num_data, self.input_dim = self.X.shape
+    def __init__(self, X, Y, kernel, likelihood, Y_metadata=None):
+
+        num_data = Y.shape[0]
+        self.alpha = Param('alpha', np.zeros((num_data,1))) # only one latent fn for now.
+        self.beta = Param('beta', np.ones(num_data))
+
+        inf = VarGauss(self.alpha, self.beta)
+        super(GPVariationalGaussianApproximation, self).__init__(X, Y, kernel, likelihood, name='VarGP', inference_method=inf)

-        self.alpha = Param('alpha', np.zeros(self.num_data))
-        self.beta = Param('beta', np.ones(self.num_data))
        self.link_parameter(self.alpha)
        self.link_parameter(self.beta)

-        self.gh_x, self.gh_w = np.polynomial.hermite.hermgauss(20)
-        self.Ysign = np.where(Y==1, 1, -1).flatten()
-
-    def log_likelihood(self):
-        """
-        Marginal log likelihood evaluation
-        """
-        return self._log_lik
-
-    def likelihood_quadrature(self, m, v):
-        """
-        Perform Gauss-Hermite quadrature over the log of the likelihood, with a fixed weight
-        """
-        # assume probit for now.
-        X = self.gh_x[None, :]*np.sqrt(2.*v[:, None]) + (m*self.Ysign)[:, None]
-        p = stats.norm.cdf(X)
-        N = stats.norm.pdf(X)
-        F = np.log(p).dot(self.gh_w)
-        NoverP = N/p
-        dF_dm = (NoverP*self.Ysign[:,None]).dot(self.gh_w)
-        dF_dv = -0.5*(NoverP**2 + NoverP*X).dot(self.gh_w)
-        return F, dF_dm, dF_dv
-
-    def parameters_changed(self):
-        K = self.kern.K(self.X)
-        m = K.dot(self.alpha)
-        KB = K*self.beta[:, None]
-        BKB = KB*self.beta[None, :]
-        A = np.eye(self.num_data) + BKB
-        Ai, LA, _, Alogdet = pdinv(A)
-        Sigma = np.diag(self.beta**-2) - Ai/self.beta[:, None]/self.beta[None, :]  # posterior coavairance: need full matrix for gradients
-        var = np.diag(Sigma)
-
-        F, dF_dm, dF_dv = self.likelihood_quadrature(m, var)
-        dF_da = np.dot(K, dF_dm)
-        SigmaB = Sigma*self.beta
-        dF_db = -np.diag(Sigma.dot(np.diag(dF_dv)).dot(SigmaB))*2
-        KL = 0.5*(Alogdet + np.trace(Ai) - self.num_data + m.dot(self.alpha))
-        dKL_da = m
-        A_A2 = Ai - Ai.dot(Ai)
-        dKL_db = np.diag(np.dot(KB.T, A_A2))
-        self._log_lik = F.sum() - KL
-        self.alpha.gradient = dF_da - dKL_da
-        self.beta.gradient = dF_db - dKL_db
-
-        # K-gradients
-        dKL_dK = 0.5*(self.alpha[None, :]*self.alpha[:, None] + self.beta[:, None]*self.beta[None, :]*A_A2)
-        tmp = Ai*self.beta[:, None]/self.beta[None, :]
-        dF_dK = self.alpha[:, None]*dF_dm[None, :] + np.dot(tmp*dF_dv, tmp.T)
-        self.kern.update_gradients_full(dF_dK - dKL_dK, self.X)
-
-    def predict(self, Xnew):
-        """
-        Predict the function(s) at the new point(s) Xnew.
-
-        :param Xnew: The points at which to make a prediction
-        :type Xnew: np.ndarray, Nnew x self.input_dim
-        """
-        Wi, _, _, _ = pdinv(self.kern.K(self.X) + np.diag(self.beta**-2))
-        Kux = self.kern.K(self.X, Xnew)
-        mu = np.dot(Kux.T, self.alpha)
-        WiKux = np.dot(Wi, Kux)
-        Kxx = self.kern.Kdiag(Xnew)
-        var = Kxx - np.sum(WiKux*Kux, 0)
-
-        return 0.5*(1+erf(mu/np.sqrt(2.*(var+1))))
--- a/GPy/models/gplvm.py
+++ b/GPy/models/gplvm.py
@ -58,12 +58,15 @@ class GPLVM(GP):
        return target

    def plot(self):
-        assert self.likelihood.Y.shape[1] == 2
-        pb.scatter(self.likelihood.Y[:, 0], self.likelihood.Y[:, 1], 40, self.X[:, 0].copy(), linewidth=0, cmap=pb.cm.jet)  # @UndefinedVariable
+        assert self.Y.shape[1] == 2, "too high dimensional to plot. Try plot_latent"
+        from matplotlib import pyplot as plt
+        plt.scatter(self.Y[:, 0],
+                    self.Y[:, 1],
+                    40, self.X[:, 0].copy(),
+                    linewidth=0, cmap=plt.cm.jet)
        Xnew = np.linspace(self.X.min(), self.X.max(), 200)[:, None]
        mu, _ = self.predict(Xnew)
-        import pylab as pb
-        pb.plot(mu[:, 0], mu[:, 1], 'k', linewidth=1.5)
+        plt.plot(mu[:, 0], mu[:, 1], 'k', linewidth=1.5)

    def plot_latent(self, labels=None, which_indices=None,
                resolution=50, ax=None, marker='o', s=40,
--- a/GPy/models/gradient_checker.py
+++ b/GPy/models/gradient_checker.py
@ -228,14 +228,14 @@ class HessianChecker(GradientChecker):

        if verbose:
            if block_indices:
-                print "\nBlock {}".format(block_indices)
+                print("\nBlock {}".format(block_indices))
            else:
-                print "\nAll blocks"
+                print("\nAll blocks")

            header = ['Checked', 'Max-Ratio', 'Min-Ratio', 'Min-Difference', 'Max-Difference']
            header_string = map(lambda x: ' | '.join(header), [header])
            separator = '-' * len(header_string[0])
-            print '\n'.join([header_string[0], separator])
+            print('\n'.join([header_string[0], separator]))
            min_r = '%.6f' % float(numpy.min(ratio))
            max_r = '%.6f' % float(numpy.max(ratio))
            max_d = '%.6f' % float(numpy.max(difference))
@ -248,7 +248,7 @@ class HessianChecker(GradientChecker):
                checked = "\033[91m  False \033[0m"

            grad_string = "{} | {}  | {} |    {}    |   {} ".format(checked, cols[0], cols[1], cols[2], cols[3])
-            print grad_string
+            print(grad_string)

            if plot:
                import pylab as pb
@ -348,7 +348,7 @@ class SkewChecker(HessianChecker):
            numeric_hess_partial = nd.Jacobian(self._df, vectorized=True)
            numeric_hess = numeric_hess_partial(x)

-            print "Done making numerical hessian"
+            print("Done making numerical hessian")
            if analytic_hess.dtype is np.dtype('object'):
                #Blockify numeric_hess aswell
                blocksizes, pagesizes = get_block_shapes_3d(analytic_hess)
@ -365,7 +365,7 @@ class SkewChecker(HessianChecker):
                #Unless super_plot is set, just plot the first one
                p = True if (plot and block_ind == numeric_hess.shape[2]-1) or super_plot else False
                if verbose:
-                    print "Checking derivative of hessian wrt parameter number {}".format(block_ind)
+                    print("Checking derivative of hessian wrt parameter number {}".format(block_ind))
                check_passed[block_ind] = self.checkgrad_block(analytic_hess[:,:,block_ind], numeric_hess[:,:,block_ind], verbose=verbose, step=step, tolerance=tolerance, block_indices=block_indices, plot=p)

            current_index += current_size
--- a/GPy/models/sparse_gp_minibatch.py
+++ b/GPy/models/sparse_gp_minibatch.py
@ -63,33 +63,18 @@ class SparseGPMiniBatch(SparseGP):

        if stochastic and missing_data:
            self.missing_data = True
-            self.ninan = ~np.isnan(Y)
            self.stochastics = SparseGPStochastics(self, batchsize)
        elif stochastic and not missing_data:
            self.missing_data = False
            self.stochastics = SparseGPStochastics(self, batchsize)
        elif missing_data:
            self.missing_data = True
-            self.ninan = ~np.isnan(Y)
            self.stochastics = SparseGPMissing(self)
        else:
            self.stochastics = False

        logger.info("Adding Z as parameter")
        self.link_parameter(self.Z, index=0)
-        if self.missing_data:
-            self.Ylist = []
-            overall = self.Y_normalized.shape[1]
-            m_f = lambda i: "Precomputing Y for missing data: {: >7.2%}".format(float(i+1)/overall)
-            message = m_f(-1)
-            print(message, end=' ')
-            for d in range(overall):
-                self.Ylist.append(self.Y_normalized[self.ninan[:, d], d][:, None])
-                print(' '*(len(message)+1) + '\r', end=' ')
-                message = m_f(d)
-                print(message, end=' ')
-            print('')
-
        self.posterior = None

    def has_uncertain_inputs(self):
@ -245,8 +230,7 @@ class SparseGPMiniBatch(SparseGP):
            message = m_f(-1)
            print(message, end=' ')

-        for d in self.stochastics.d:
-            ninan = self.ninan[:, d]
+        for d, ninan in self.stochastics.d:

            if not self.stochastics:
                print(' '*(len(message)) + '\r', end=' ')
@ -257,7 +241,7 @@ class SparseGPMiniBatch(SparseGP):
                grad_dict, current_values, value_indices = self._inner_parameters_changed(
                                self.kern, self.X[ninan],
                                self.Z, self.likelihood,
-                                self.Ylist[d], self.Y_metadata,
+                                self.Y_normalized[ninan][:, d], self.Y_metadata,
                                Lm, dL_dKmm,
                                subset_indices=dict(outputs=d, samples=ninan))

@ -266,8 +250,8 @@ class SparseGPMiniBatch(SparseGP):

            Lm = posterior.K_chol
            dL_dKmm = grad_dict['dL_dKmm']
-            woodbury_inv[:, :, d] = posterior.woodbury_inv
-            woodbury_vector[:, d:d+1] = posterior.woodbury_vector
+            woodbury_inv[:, :, d] = posterior.woodbury_inv[:,:,None]
+            woodbury_vector[:, d] = posterior.woodbury_vector
            self._log_marginal_likelihood += log_marginal_likelihood
        if not self.stochastics:
            print('')
--- a/GPy/models/ss_gplvm.py
+++ b/GPy/models/ss_gplvm.py
@ -5,11 +5,95 @@ import numpy as np

 from ..core.sparse_gp_mpi import SparseGP_MPI
 from .. import kern
+from ..core.parameterization import Param
 from ..likelihoods import Gaussian
-from ..core.parameterization.variational import SpikeAndSlabPrior, SpikeAndSlabPosterior
+from ..core.parameterization.variational import SpikeAndSlabPrior, SpikeAndSlabPosterior,VariationalPrior
 from ..inference.latent_function_inference.var_dtc_parallel import update_gradients, VarDTC_minibatch
 from ..kern._src.psi_comp.ssrbf_psi_gpucomp import PSICOMP_SSRBF_GPU

+class IBPPosterior(SpikeAndSlabPosterior):
+    '''
+    The SpikeAndSlab distribution for variational approximations.
+    '''
+    def __init__(self, means, variances, binary_prob, tau=None,  sharedX=False, name='latent space'):
+        """
+        binary_prob : the probability of the distribution on the slab part.
+        """
+        from ..core.parameterization.transformations import Logexp
+        super(IBPPosterior, self).__init__(means, variances, binary_prob, group_spike=True, name=name)
+        self.sharedX = sharedX
+        if sharedX:
+            self.mean.fix(warning=False)
+            self.variance.fix(warning=False)
+        self.tau = Param("tau_", np.ones((self.gamma_group.shape[0],2)), Logexp())
+        self.link_parameter(self.tau)
+
+    def set_gradients(self, grad):
+        self.mean.gradient, self.variance.gradient, self.gamma.gradient, self.tau.gradient = grad
+
+    def __getitem__(self, s):
+        if isinstance(s, (int, slice, tuple, list, np.ndarray)):
+            import copy
+            n = self.__new__(self.__class__, self.name)
+            dc = self.__dict__.copy()
+            dc['mean'] = self.mean[s]
+            dc['variance'] = self.variance[s]
+            dc['binary_prob'] = self.binary_prob[s]
+            dc['tau'] = self.tau
+            dc['parameters'] = copy.copy(self.parameters)
+            n.__dict__.update(dc)
+            n.parameters[dc['mean']._parent_index_] = dc['mean']
+            n.parameters[dc['variance']._parent_index_] = dc['variance']
+            n.parameters[dc['binary_prob']._parent_index_] = dc['binary_prob']
+            n.parameters[dc['tau']._parent_index_] = dc['tau']
+            n._gradient_array_ = None
+            oversize = self.size - self.mean.size - self.variance.size - self.gamma.size - self.tau.size
+            n.size = n.mean.size + n.variance.size + n.gamma.size+ n.tau.size + oversize
+            n.ndim = n.mean.ndim
+            n.shape = n.mean.shape
+            n.num_data = n.mean.shape[0]
+            n.input_dim = n.mean.shape[1] if n.ndim != 1 else 1
+            return n
+        else:
+            return super(IBPPosterior, self).__getitem__(s)
+
+class IBPPrior(VariationalPrior):
+    def __init__(self, input_dim, alpha =2., name='IBPPrior', **kw):
+        super(IBPPrior, self).__init__(name=name, **kw)
+        from ..core.parameterization.transformations import Logexp, __fixed__  
+        self.input_dim = input_dim
+        self.variance = 1.
+        self.alpha = Param('alpha', alpha, __fixed__)
+        self.link_parameter(self.alpha)
+
+    def KL_divergence(self, variational_posterior):
+        mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma_group.values, variational_posterior.tau.values
+            
+        var_mean = np.square(mu)/self.variance
+        var_S = (S/self.variance - np.log(S))
+        part1 = (gamma* (np.log(self.variance)-1. +var_mean + var_S)).sum()/2.
+        
+        ad = self.alpha/self.input_dim
+        from scipy.special import betaln,digamma
+        part2 = (gamma*np.log(gamma)).sum() + ((1.-gamma)*np.log(1.-gamma)).sum() + betaln(ad,1.)*self.input_dim \
+                -betaln(tau[:,0], tau[:,1]).sum() + ((tau[:,0]-gamma-ad)*digamma(tau[:,0])).sum() + \
+                ((tau[:,1]+gamma-2.)*digamma(tau[:,1])).sum() + ((2.+ad-tau[:,0]-tau[:,1])*digamma(tau.sum(axis=1))).sum()
+        
+        return part1+part2
+
+    def update_gradients_KL(self, variational_posterior):
+        mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma_group.values, variational_posterior.tau.values
+
+        variational_posterior.mean.gradient -= gamma*mu/self.variance
+        variational_posterior.variance.gradient -= (1./self.variance - 1./S) * gamma /2.
+        from scipy.special import digamma,polygamma
+        dgamma = (np.log(gamma/(1.-gamma))+ digamma(tau[:,1])-digamma(tau[:,0]))/variational_posterior.num_data
+        variational_posterior.binary_prob.gradient -= dgamma+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+        ad = self.alpha/self.input_dim
+        common = (ad+2-tau[:,0]-tau[:,1])*polygamma(1,tau.sum(axis=1))
+        variational_posterior.tau.gradient[:,0] = -((tau[:,0]-gamma-ad)*polygamma(1,tau[:,0])+common)
+        variational_posterior.tau.gradient[:,1] = -((tau[:,1]+gamma-2)*polygamma(1,tau[:,1])+common)
+
 class SSGPLVM(SparseGP_MPI):
    """
    Spike-and-Slab Gaussian Process Latent Variable Model
@ -23,9 +107,11 @@ class SSGPLVM(SparseGP_MPI):

    """
    def __init__(self, Y, input_dim, X=None, X_variance=None, Gamma=None, init='PCA', num_inducing=10,
-                 Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike_and_Slab GPLVM', group_spike=False, mpi_comm=None, pi=None, learnPi=True,normalizer=False, **kwargs):
+                 Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike_and_Slab GPLVM', group_spike=False, IBP=False, alpha=2., tau=None, mpi_comm=None, pi=None, learnPi=False,normalizer=False, sharedX=False, variational_prior=None,**kwargs):

        self.group_spike = group_spike
+        self.init = init
+        self.sharedX = sharedX
        
        if X == None:
            from ..util.initialization import initialize_latent
@ -33,8 +119,6 @@ class SSGPLVM(SparseGP_MPI):
        else:
            fracs = np.ones(input_dim)

-        self.init = init
-
        if X_variance is None: # The variance of the variational approximation (S)
            X_variance = np.random.uniform(0,.1,X.shape)
            
@ -64,17 +148,16 @@ class SSGPLVM(SparseGP_MPI):
        if pi is None:
            pi = np.empty((input_dim))
            pi[:] = 0.5
-        self.variational_prior = SpikeAndSlabPrior(pi=pi,learnPi=learnPi) # the prior probability of the latent binary variable b
-        
-        X = SpikeAndSlabPosterior(X, X_variance, gamma)
-                
+            
+        if IBP:
+            self.variational_prior = IBPPrior(input_dim=input_dim, alpha=alpha) if variational_prior is None else variational_prior
+            X = IBPPosterior(X, X_variance, gamma, tau=tau,sharedX=sharedX)
+        else:
+            self.variational_prior = SpikeAndSlabPrior(pi=pi,learnPi=learnPi, group_spike=group_spike)  if variational_prior is None else variational_prior
+            X = SpikeAndSlabPosterior(X, X_variance, gamma, group_spike=group_spike,sharedX=sharedX)
+
        super(SSGPLVM,self).__init__(X, Y, Z, kernel, likelihood, variational_prior=self.variational_prior, inference_method=inference_method, name=name, mpi_comm=mpi_comm, normalizer=normalizer, **kwargs)
-#         self.X.unfix()
-#         self.X.variance.constrain_positive()
        self.link_parameter(self.X, index=0)
-                
-        if self.group_spike:
-            [self.X.gamma[:,i].tie('tieGamma'+str(i)) for i in range(self.X.gamma.shape[1])] # Tie columns together
        
    def set_X_gradients(self, X, X_grad):
        """Set the gradients of the posterior distribution of X in its specific form."""
@ -84,9 +167,15 @@ class SSGPLVM(SparseGP_MPI):
        """Get the gradients of the posterior distribution of X in its specific form."""
        return X.mean.gradient, X.variance.gradient, X.binary_prob.gradient

+    def _propogate_X_val(self):
+        pass
+
    def parameters_changed(self):
+        self.X.propogate_val()
+        if self.sharedX: self._highest_parent_._propogate_X_val()
        super(SSGPLVM,self).parameters_changed()
        if isinstance(self.inference_method, VarDTC_minibatch):
+            self.X.collate_gradient()
            return
        
        self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
@ -95,6 +184,7 @@ class SSGPLVM(SparseGP_MPI):

        # update for the KL divergence
        self.variational_prior.update_gradients_KL(self.X)
+        self.X.collate_gradient()

    def input_sensitivity(self):
        if self.kern.ARD:
--- a/GPy/models/ss_mrd.py
+++ b/GPy/models/ss_mrd.py
@ -2,33 +2,256 @@
 The Maniforld Relevance Determination model with the spike-and-slab prior
 """

+import numpy as np
 from ..core import Model
 from .ss_gplvm import SSGPLVM
+from ..core.parameterization.variational import SpikeAndSlabPrior,NormalPosterior,VariationalPrior
+from ..util.misc import param_to_array
+from ..kern import RBF
+from ..core import Param
+from numpy.linalg.linalg import LinAlgError

 class SSMRD(Model):
    
-    def __init__(self, Ylist, input_dim, X=None, X_variance=None,
-                 initx = 'PCA', initz = 'permute',
-                 num_inducing=10, Z=None, kernel=None,
-                 inference_method=None, likelihoods=None, name='ss_mrd', Ynames=None):
+    def __init__(self, Ylist, input_dim, X=None, X_variance=None, Gammas=None, initx = 'PCA_concat', initz = 'permute', 
+                 num_inducing=10, Zs=None, kernels=None, inference_methods=None, likelihoods=None, group_spike=True,
+                 pi=0.5, name='ss_mrd', Ynames=None, mpi_comm=None, IBP=False, alpha=2., taus=None, ):
        super(SSMRD, self).__init__(name)
+        self.mpi_comm = mpi_comm
+        self._PROPAGATE_ = False
        
-        self.updates = False
-        self.models = [SSGPLVM(y, input_dim, X=X, X_variance=X_variance, num_inducing=num_inducing,Z=Z,init=initx,
-                               kernel=kernel.copy() if kernel else None,inference_method=inference_method,likelihood=likelihoods,
-                               name='model_'+str(i)) for i,y in enumerate(Ylist)]
-        self.add_parameters(*(self.models))
+        # initialize X for individual models
+        X, X_variance, Gammas, fracs = self._init_X(Ylist, input_dim, X, X_variance, Gammas, initx)
+        self.X = NormalPosterior(means=X, variances=X_variance)
        
-        [[[self.models[m].X.mean[i,j:j+1].tie('mean_'+str(i)+'_'+str(j)) for m in range(len(self.models))] for j in range(self.models[0].X.mean.shape[1])] 
-         for i in range(self.models[0].X.mean.shape[0])]
-        [[[self.models[m].X.variance[i,j:j+1].tie('var_'+str(i)+'_'+str(j)) for m in range(len(self.models))] for j in range(self.models[0].X.variance.shape[1])] 
-         for i in range(self.models[0].X.variance.shape[0])]
+        if kernels is None:
+            kernels = [RBF(input_dim, lengthscale=1./fracs, ARD=True) for i in xrange(len(Ylist))]
+        if Zs is None:
+            Zs = [None]* len(Ylist)
+        if likelihoods is None:
+            likelihoods = [None]* len(Ylist)
+        if inference_methods is None:
+            inference_methods = [None]* len(Ylist)
        
-        self.updates = True
+        if IBP:
+            self.var_priors = [IBPPrior_SSMRD(len(Ylist),input_dim,alpha=alpha) for i in xrange(len(Ylist))]
+        else:
+            self.var_priors = [SpikeAndSlabPrior_SSMRD(nModels=len(Ylist),pi=pi,learnPi=False, group_spike=group_spike) for i in xrange(len(Ylist))]
+        self.models = [SSGPLVM(y, input_dim, X=X.copy(), X_variance=X_variance.copy(), Gamma=Gammas[i], num_inducing=num_inducing,Z=Zs[i], learnPi=False, group_spike=group_spike,
+                               kernel=kernels[i],inference_method=inference_methods[i],likelihood=likelihoods[i], variational_prior=self.var_priors[i], IBP=IBP, tau=None if taus is None else taus[i],
+                               name='model_'+str(i), mpi_comm=mpi_comm, sharedX=True) for i,y in enumerate(Ylist)]
+        self.link_parameters(*(self.models+[self.X]))
+        
+    def _propogate_X_val(self):
+        if self._PROPAGATE_: return
+        for m in self.models:
+            m.X.mean.values[:] = self.X.mean.values
+            m.X.variance.values[:] = self.X.variance.values
+        varp_list = [m.X for m in self.models]
+        [vp._update_inernal(varp_list) for vp in self.var_priors]
+        self._PROPAGATE_=True
+    
+    def _collate_X_gradient(self):
+        self._PROPAGATE_ = False
+        self.X.mean.gradient[:] = 0
+        self.X.variance.gradient[:] = 0
+        for m in self.models:
+            self.X.mean.gradient += m.X.mean.gradient
+            self.X.variance.gradient += m.X.variance.gradient
        
    def parameters_changed(self):
-        super(SSMRD, self).parameters_changed()        
+        super(SSMRD, self).parameters_changed()
+        [m.parameters_changed() for m in self.models]
        self._log_marginal_likelihood = sum([m._log_marginal_likelihood for m in self.models])
+        self._collate_X_gradient()

    def log_likelihood(self):
        return self._log_marginal_likelihood
+    
+    def _init_X(self, Ylist, input_dim, X=None, X_variance=None, Gammas=None, initx='PCA_concat'):
+        
+        # Divide latent dimensions
+        idx = np.empty((input_dim,),dtype=np.int)
+        residue = (input_dim)%(len(Ylist))
+        for i in xrange(len(Ylist)):
+            if i < residue:
+                size = input_dim/len(Ylist)+1
+                idx[i*size:(i+1)*size] = i
+            else:
+                size = input_dim/len(Ylist)
+                idx[i*size+residue:(i+1)*size+residue] = i
+        
+        if X is None:
+            if initx == 'PCA_concat':
+                X = np.empty((Ylist[0].shape[0],input_dim))
+                fracs = np.empty((input_dim,))
+                from ..util.initialization import initialize_latent
+                for i in xrange(len(Ylist)):
+                    Y = Ylist[i]
+                    dim = (idx==i).sum()
+                    if dim>0:
+                        x, fr = initialize_latent('PCA', dim, Y)
+                        X[:,idx==i] = x
+                        fracs[idx==i] = fr
+            elif initx=='PCA_joint':
+                y = np.hstack(Ylist)
+                from ..util.initialization import initialize_latent
+                X, fracs = initialize_latent('PCA', input_dim, y)
+            else:
+                X = np.random.randn(Ylist[0].shape[0], input_dim)
+                fracs = np.ones(input_dim)
+        else:
+            fracs = np.ones(input_dim)
+            
+    
+        if X_variance is None: # The variance of the variational approximation (S)
+            X_variance = np.random.uniform(0,.1,X.shape)
+            
+        if Gammas is None:
+            Gammas = []
+            for x in X:
+                gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation
+                gamma[:] = 0.5 + 0.1 * np.random.randn(X.shape[0], input_dim)
+                gamma[gamma>1.-1e-9] = 1.-1e-9
+                gamma[gamma<1e-9] = 1e-9
+                Gammas.append(gamma)
+        return X, X_variance, Gammas, fracs
+
+    @Model.optimizer_array.setter
+    def optimizer_array(self, p):
+        if self.mpi_comm != None:
+            if self._IN_OPTIMIZATION_ and self.mpi_comm.rank==0:
+                self.mpi_comm.Bcast(np.int32(1),root=0)
+            self.mpi_comm.Bcast(p, root=0)        
+        Model.optimizer_array.fset(self,p)
+        
+    def optimize(self, optimizer=None, start=None, **kwargs):
+        self._IN_OPTIMIZATION_ = True
+        if self.mpi_comm==None:
+            super(SSMRD, self).optimize(optimizer,start,**kwargs)
+        elif self.mpi_comm.rank==0:
+            super(SSMRD, self).optimize(optimizer,start,**kwargs)
+            self.mpi_comm.Bcast(np.int32(-1),root=0)
+        elif self.mpi_comm.rank>0:
+            x = self.optimizer_array.copy()
+            flag = np.empty(1,dtype=np.int32)
+            while True:
+                self.mpi_comm.Bcast(flag,root=0)
+                if flag==1:
+                    try:
+                        self.optimizer_array = x
+                        self._fail_count = 0
+                    except (LinAlgError, ZeroDivisionError, ValueError):
+                        if self._fail_count >= self._allowed_failures:
+                            raise
+                        self._fail_count += 1
+                elif flag==-1:
+                    break
+                else:
+                    self._IN_OPTIMIZATION_ = False
+                    raise Exception("Unrecognizable flag for synchronization!")
+        self._IN_OPTIMIZATION_ = False
+        
+
+class SpikeAndSlabPrior_SSMRD(SpikeAndSlabPrior):
+    def __init__(self, nModels, pi=0.5, learnPi=False, group_spike=True, variance = 1.0, name='SSMRDPrior', **kw):
+        self.nModels = nModels
+        self._b_prob_all = 0.5
+        super(SpikeAndSlabPrior_SSMRD, self).__init__(pi=pi,learnPi=learnPi,group_spike=group_spike,variance=variance, name=name, **kw)
+    
+    def _update_inernal(self, varp_list):
+        """Make an update of the internal status by gathering the variational posteriors for all the individual models."""
+        # The probability for the binary variable for the same latent dimension of any of the models is on.
+        if self.group_spike:
+            self._b_prob_all = 1.-param_to_array(varp_list[0].gamma_group)
+            [np.multiply(self._b_prob_all, 1.-vp.gamma_group, self._b_prob_all) for vp in varp_list[1:]]
+        else:
+            self._b_prob_all = 1.-param_to_array(varp_list[0].binary_prob)
+            [np.multiply(self._b_prob_all, 1.-vp.binary_prob, self._b_prob_all) for vp in varp_list[1:]]            
+
+    def KL_divergence(self, variational_posterior):
+        mu = variational_posterior.mean
+        S = variational_posterior.variance
+        if self.group_spike:
+            gamma = variational_posterior.binary_prob[0]
+        else:
+            gamma = variational_posterior.binary_prob
+        if len(self.pi.shape)==2:
+            idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+            pi = self.pi[idx]
+        else:
+            pi = self.pi
+
+        var_mean = np.square(mu)/self.variance
+        var_S = (S/self.variance - np.log(S))
+        var_gamma = (gamma*np.log(gamma/pi)).sum()+((1-gamma)*np.log((1-gamma)/(1-pi))).sum()
+        return var_gamma +((1.-self._b_prob_all)*(np.log(self.variance)-1. +var_mean + var_S)).sum()/(2.*self.nModels)
+
+    def update_gradients_KL(self, variational_posterior):
+        mu = variational_posterior.mean
+        S = variational_posterior.variance
+        N = variational_posterior.num_data
+        if self.group_spike:
+            gamma = variational_posterior.binary_prob.values[0]
+        else:
+            gamma = variational_posterior.binary_prob.values
+        if len(self.pi.shape)==2:
+            idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+            pi = self.pi[idx]
+        else:
+            pi = self.pi
+
+        if self.group_spike:
+            tmp = self._b_prob_all/(1.-gamma)
+            variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))/N +tmp*((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+        else:
+            variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+        mu.gradient -= (1.-self._b_prob_all)*mu/(self.variance*self.nModels)
+        S.gradient -= (1./self.variance - 1./S) * (1.-self._b_prob_all) /(2.*self.nModels)
+        if self.learnPi:
+            raise 'Not Supported!'
+
+class IBPPrior_SSMRD(VariationalPrior):
+    def __init__(self, nModels, input_dim, alpha =2., tau=None, name='IBPPrior', **kw):
+        super(IBPPrior_SSMRD, self).__init__(name=name, **kw)
+        from ..core.parameterization.transformations import Logexp, __fixed__  
+        self.nModels = nModels
+        self._b_prob_all = 0.5
+        self.input_dim = input_dim
+        self.variance = 1.
+        self.alpha = Param('alpha', alpha, __fixed__)
+        self.link_parameter(self.alpha)
+        
+    def _update_inernal(self, varp_list):
+        """Make an update of the internal status by gathering the variational posteriors for all the individual models."""
+        # The probability for the binary variable for the same latent dimension of any of the models is on.
+        self._b_prob_all = 1.-param_to_array(varp_list[0].gamma_group)
+        [np.multiply(self._b_prob_all, 1.-vp.gamma_group, self._b_prob_all) for vp in varp_list[1:]]
+
+    def KL_divergence(self, variational_posterior):
+        mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma_group.values, variational_posterior.tau.values
+            
+        var_mean = np.square(mu)/self.variance
+        var_S = (S/self.variance - np.log(S))
+        part1 = ((1.-self._b_prob_all)* (np.log(self.variance)-1. +var_mean + var_S)).sum()/(2.*self.nModels)
+        
+        ad = self.alpha/self.input_dim
+        from scipy.special import betaln,digamma
+        part2 = (gamma*np.log(gamma)).sum() + ((1.-gamma)*np.log(1.-gamma)).sum() + (betaln(ad,1.)*self.input_dim -betaln(tau[:,0], tau[:,1]).sum())/self.nModels \
+                 + (( (tau[:,0]-ad)/self.nModels -gamma)*digamma(tau[:,0])).sum() + \
+                (((tau[:,1]-1.)/self.nModels+gamma-1.)*digamma(tau[:,1])).sum() + (((1.+ad-tau[:,0]-tau[:,1])/self.nModels+1.)*digamma(tau.sum(axis=1))).sum()
+        return part1+part2
+
+    def update_gradients_KL(self, variational_posterior):
+        mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma_group.values, variational_posterior.tau.values
+
+        variational_posterior.mean.gradient -= (1.-self._b_prob_all)*mu/(self.variance*self.nModels)
+        variational_posterior.variance.gradient -= (1./self.variance - 1./S) * (1.-self._b_prob_all) /(2.*self.nModels)
+        from scipy.special import digamma,polygamma
+        tmp = self._b_prob_all/(1.-gamma)
+        dgamma = (np.log(gamma/(1.-gamma))+ digamma(tau[:,1])-digamma(tau[:,0]))/variational_posterior.num_data
+        variational_posterior.binary_prob.gradient -= dgamma+tmp*((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+        ad = self.alpha/self.input_dim
+        common = ((1.+ad-tau[:,0]-tau[:,1])/self.nModels+1.)*polygamma(1,tau.sum(axis=1))
+        variational_posterior.tau.gradient[:,0] = -(((tau[:,0]-ad)/self.nModels -gamma)*polygamma(1,tau[:,0])+common)
+        variational_posterior.tau.gradient[:,1] = -(((tau[:,1]-1.)/self.nModels+gamma-1.)*polygamma(1,tau[:,1])+common)
--- a/GPy/plotting/matplot_dep/base_plots.py
+++ b/GPy/plotting/matplot_dep/base_plots.py
@ -3,7 +3,7 @@


 try:
-    import Tango
+    #import Tango
    import pylab as pb
 except:
    pass
@ -17,11 +17,11 @@ def ax_default(fignum, ax):
        fig = ax.figure
    return fig, ax

-def meanplot(x, mu, color=Tango.colorsHex['darkBlue'], ax=None, fignum=None, linewidth=2,**kw):
+def meanplot(x, mu, color='#3300FF', ax=None, fignum=None, linewidth=2,**kw):
    _, axes = ax_default(fignum, ax)
    return axes.plot(x,mu,color=color,linewidth=linewidth,**kw)

-def gpplot(x, mu, lower, upper, edgecol=Tango.colorsHex['darkBlue'], fillcol=Tango.colorsHex['lightBlue'], ax=None, fignum=None, **kwargs):
+def gpplot(x, mu, lower, upper, edgecol='#3300FF', fillcol='#33CCFF', ax=None, fignum=None, **kwargs):
    _, axes = ax_default(fignum, ax)

    mu = mu.flatten()
--- a/GPy/plotting/matplot_dep/models_plots.py
+++ b/GPy/plotting/matplot_dep/models_plots.py
@ -1,16 +1,14 @@
 # Copyright (c) 2012-2015, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-try:
-    import Tango
-    import pylab as pb
-except:
-    pass
 import numpy as np
+from . import Tango
 from base_plots import gpplot, x_frame1D, x_frame2D
 from ...models.gp_coregionalized_regression import GPCoregionalizedRegression
 from ...models.sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression
 from scipy import sparse
+from ...core.parameterization.variational import VariationalPosterior
+from matplotlib import pyplot as plt

 def plot_fit(model, plot_limits=None, which_data_rows='all',
        which_data_ycols='all', fixed_inputs=[],
@ -63,7 +61,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
    #if len(which_data_ycols)==0:
        #raise ValueError('No data selected for plotting')
    if ax is None:
-        fig = pb.figure(num=fignum)
+        fig = plt.figure(num=fignum)
        ax = fig.add_subplot(111)

    if hasattr(model, 'has_uncertain_inputs') and model.has_uncertain_inputs():
@ -78,7 +76,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',

    if predict_kw is None:
        predict_kw = {}
-    
+
    #work out what the inputs are for plotting (1D or 2D)
    fixed_dims = np.array([i for i,v in fixed_inputs])
    free_dims = np.setdiff1d(np.arange(model.input_dim),fixed_dims)
@ -106,11 +104,14 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
                upper = m + 2*np.sqrt(v)
        else:
            if isinstance(model,GPCoregionalizedRegression) or isinstance(model,SparseGPCoregionalizedRegression):
-                meta = {'output_index': Xgrid[:,-1:].astype(np.int)}
-            else:
-                meta = None
-            m, v = model.predict(Xgrid, full_cov=False, Y_metadata=meta, **predict_kw)
-            lower, upper = model.predict_quantiles(Xgrid, Y_metadata=meta)
+                extra_data = Xgrid[:,-1:].astype(np.int)
+                if Y_metadata is None:
+                    Y_metadata = {'output_index': extra_data}
+                else:
+                    Y_metadata['output_index'] = extra_data
+            m, v = model.predict(Xgrid, full_cov=False, Y_metadata=Y_metadata, **predict_kw)
+            fmu, fv = model._raw_predict(Xgrid, full_cov=False, **predict_kw)
+            lower, upper = model.likelihood.predictive_quantiles(fmu, fv, (2.5, 97.5), Y_metadata=Y_metadata)


        for d in which_data_ycols:
@ -119,9 +120,11 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',

        #optionally plot some samples
        if samples: #NOTE not tested with fixed_inputs
-            Ysim = model.posterior_samples(Xgrid, samples)
+            Ysim = model.posterior_samples(Xgrid, samples, Y_metadata=Y_metadata)
+            print Ysim.shape
+            print Xnew.shape
            for yi in Ysim.T:
-                plots['posterior_samples'] = ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
+                plots['posterior_samples'] = ax.plot(Xnew, yi[:,None], '#3300FF', linewidth=0.25)
                #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.

        if samples_f: #NOTE not tested with fixed_inputs
@ -184,14 +187,16 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
            m, _ = model._raw_predict(Xgrid, **predict_kw)
        else:
            if isinstance(model,GPCoregionalizedRegression) or isinstance(model,SparseGPCoregionalizedRegression):
-                meta = {'output_index': Xgrid[:,-1:].astype(np.int)}
-            else:
-                meta = None
-            m, v = model.predict(Xgrid, full_cov=False, Y_metadata=meta, **predict_kw)
+                extra_data = Xgrid[:,-1:].astype(np.int)
+                if Y_metadata is None:
+                    Y_metadata = {'output_index': extra_data}
+                else:
+                    Y_metadata['output_index'] = extra_data
+            m, v = model.predict(Xgrid, full_cov=False, Y_metadata=Y_metadata, **predict_kw)
        for d in which_data_ycols:
            m_d = m[:,d].reshape(resolution, resolution).T
-            plots['contour'] = ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
-            if not plot_raw: plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
+            plots['contour'] = ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=plt.cm.jet)
+            if not plot_raw: plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=plt.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)

        #set the limits of the plot to some sensible values
        ax.set_xlim(xmin[0], xmax[0])
@ -219,7 +224,7 @@ def plot_fit_f(model, *args, **kwargs):
    kwargs['plot_raw'] = True
    plot_fit(model,*args, **kwargs)

-def fixed_inputs(model, non_fixed_inputs, fix_routine='median', as_list=True):
+def fixed_inputs(model, non_fixed_inputs, fix_routine='median', as_list=True, X_all=False):
    """
    Convenience function for returning back fixed_inputs where the other inputs
    are fixed using fix_routine
@ -235,8 +240,13 @@ def fixed_inputs(model, non_fixed_inputs, fix_routine='median', as_list=True):
    f_inputs = []
    if hasattr(model, 'has_uncertain_inputs') and model.has_uncertain_inputs():
        X = model.X.mean.values.copy()
-    else:
+    elif isinstance(model.X, VariationalPosterior):
        X = model.X.values.copy()
+    else:
+        if X_all:
+            X = model.X_all.copy()
+        else:
+            X = model.X.copy()
    for i in range(X.shape[1]):
        if i not in non_fixed_inputs:
            if fix_routine == 'mean':
--- a/GPy/plotting/matplot_dep/variational_plots.py
+++ b/GPy/plotting/matplot_dep/variational_plots.py
@ -22,7 +22,7 @@ def plot(parameterized, fignum=None, ax=None, colors=None, figsize=(12, 6)):
    lines = []
    fills = []
    bg_lines = []
-    means, variances = parameterized.mean, parameterized.variance
+    means, variances = parameterized.mean.values, parameterized.variance.values
    x = np.arange(means.shape[0])
    for i in range(means.shape[1]):
        if ax is None:
@ -43,7 +43,7 @@ def plot(parameterized, fignum=None, ax=None, colors=None, figsize=(12, 6)):
        if i < means.shape[1] - 1:
            a.set_xticklabels('')
    pb.draw()
-    fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
+    a.figure.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
    return dict(lines=lines, fills=fills, bg_lines=bg_lines)

 def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None, side_by_side=True):
--- a/GPy/plotting/matplot_dep/visualize.py
+++ b/GPy/plotting/matplot_dep/visualize.py
@ -408,12 +408,13 @@ class mocap_data_show_vpython(vpython_show):
 class mocap_data_show(matplotlib_show):
    """Base class for visualizing motion capture data."""

-    def __init__(self, vals, axes=None, connect=None):
+    def __init__(self, vals, axes=None, connect=None, color='b'):
        if axes==None:
            fig = plt.figure()
            axes = fig.add_subplot(111, projection='3d', aspect='equal')
        matplotlib_show.__init__(self, vals, axes)

+        self.color = color
        self.connect = connect
        self.process_values()
        self.initialize_axes()
@ -423,7 +424,7 @@ class mocap_data_show(matplotlib_show):
        self.axes.figure.canvas.draw()

    def draw_vertices(self):
-        self.points_handle = self.axes.scatter(self.vals[:, 0], self.vals[:, 1], self.vals[:, 2])
+        self.points_handle = self.axes.scatter(self.vals[:, 0], self.vals[:, 1], self.vals[:, 2], color=self.color)

    def draw_edges(self):
        self.line_handle = []
@ -442,7 +443,7 @@ class mocap_data_show(matplotlib_show):
                z.append(self.vals[i, 2])
                z.append(self.vals[j, 2])
                z.append(np.NaN)
-            self.line_handle = self.axes.plot(np.array(x), np.array(y), np.array(z), 'b-')
+            self.line_handle = self.axes.plot(np.array(x), np.array(y), np.array(z), '-', color=self.color)

    def modify(self, vals):
        self.vals = vals.copy()
@ -450,7 +451,7 @@ class mocap_data_show(matplotlib_show):
        self.initialize_axes_modify()
        self.draw_vertices()
        self.initialize_axes()
-        self.finalize_axes_modify()
+        #self.finalize_axes_modify()
        self.draw_edges()
        self.axes.figure.canvas.draw()

@ -469,12 +470,20 @@ class mocap_data_show(matplotlib_show):
        self.line_handle[0].remove()

    def finalize_axes(self):
-        self.axes.set_xlim(self.x_lim)
-        self.axes.set_ylim(self.y_lim)
-        self.axes.set_zlim(self.z_lim)
-        self.axes.auto_scale_xyz([-1., 1.], [-1., 1.], [-1., 1.])
+#         self.axes.set_xlim(self.x_lim)
+#         self.axes.set_ylim(self.y_lim)
+#         self.axes.set_zlim(self.z_lim)
+#         self.axes.auto_scale_xyz([-1., 1.], [-1., 1.], [-1., 1.])

-#        self.axes.set_aspect('equal')
+        extents = np.array([getattr(self.axes, 'get_{}lim'.format(dim))() for dim in 'xyz'])
+        sz = extents[:,1] - extents[:,0]
+        centers = np.mean(extents, axis=1)
+        maxsize = max(abs(sz))
+        r = maxsize/2
+        for ctr, dim in zip(centers, 'xyz'):
+            getattr(self.axes, 'set_{}lim'.format(dim))(ctr - r, ctr + r)
+        
+#         self.axes.set_aspect('equal')
 #         self.axes.autoscale(enable=False)

    def finalize_axes_modify(self):
@ -494,7 +503,7 @@ class stick_show(mocap_data_show):

 class skeleton_show(mocap_data_show):
    """data_show class for visualizing motion capture data encoded as a skeleton with angles."""
-    def __init__(self, vals, skel, axes=None, padding=0):
+    def __init__(self, vals, skel, axes=None, padding=0, color='b'):
        """data_show class for visualizing motion capture data encoded as a skeleton with angles.
        :param vals: set of modeled angles to use for printing in the axis when it's first created.
        :type vals: np.array
@ -506,7 +515,7 @@ class skeleton_show(mocap_data_show):
        self.skel = skel
        self.padding = padding
        connect = skel.connection_matrix()
-        mocap_data_show.__init__(self, vals, axes=axes, connect=connect)
+        mocap_data_show.__init__(self, vals, axes=axes, connect=connect, color=color)
    def process_values(self):
        """Takes a set of angles and converts them to the x,y,z coordinates in the internal prepresentation of the class, ready for plotting.

--- a/GPy/testing/cython_tests.py
+++ b/GPy/testing/cython_tests.py
@ -9,8 +9,8 @@ These tests make sure that the opure python and cython codes work the same

 class CythonTestChols(np.testing.TestCase):
    def setUp(self):
-        self.flat = np.random.randn(45, 5)
-        self.triang = np.dstack([np.eye(20)[:,:,None] for i in range(3)])
+        self.flat = np.random.randn(45,5)
+        self.triang = np.array([np.eye(20) for i in range(3)])
    def test_flat_to_triang(self):
        L1 = choleskies._flat_to_triang_pure(self.flat)
        L2 = choleskies._flat_to_triang_cython(self.flat)
@ -51,11 +51,16 @@ class test_stationary(np.testing.TestCase):

 class test_choleskies_backprop(np.testing.TestCase):
    def setUp(self):
-        self.dL, self.L = np.random.randn(2, 100, 100)
+        a =np.random.randn(10,12)
+        A = a.dot(a.T)
+        self.L = GPy.util.linalg.jitchol(A)
+        self.dL = np.random.randn(10,10)
    def test(self):
        r1 = GPy.util.choleskies._backprop_gradient_pure(self.dL, self.L)
        r2 = GPy.util.choleskies.choleskies_cython.backprop_gradient(self.dL, self.L)
+        r3 = GPy.util.choleskies.choleskies_cython.backprop_gradient_par_c(self.dL, self.L)
        np.testing.assert_allclose(r1, r2)
+        np.testing.assert_allclose(r1, r3)



--- a/GPy/testing/inference_tests.py
+++ b/GPy/testing/inference_tests.py
@ -8,11 +8,12 @@ The test cases for various inference algorithms
 import unittest, itertools
 import numpy as np
 import GPy
-
+#np.seterr(invalid='raise')

 class InferenceXTestCase(unittest.TestCase):

    def genData(self):
+        np.random.seed(1)
        D1,D2,N = 12,12,50

        x = np.linspace(0, 4 * np.pi, N)[:, None]
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@ -312,7 +312,12 @@ class KernelGradientTestsContinuous(unittest.TestCase):
        k = GPy.kern.LinearFull(self.D, self.D-1)
        k.randomize()
        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
-
+        
+    def test_standard_periodic(self):
+        k = GPy.kern.StdPeriodic(self.D, self.D-1)
+        k.randomize()
+        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+        
 class KernelTestsMiscellaneous(unittest.TestCase):
    def setUp(self):
        N, D = 100, 10
--- a/GPy/testing/likelihood_tests.py
+++ b/GPy/testing/likelihood_tests.py
@ -9,8 +9,7 @@ import inspect
 from GPy.likelihoods import link_functions
 from GPy.core.parameterization import Param
 from functools import partial
-#np.random.seed(300)
-#np.random.seed(4)
+fixed_seed = 7

 #np.seterr(divide='raise')
 def dparam_partial(inst_func, *args):
@ -105,6 +104,7 @@ class TestNoiseModels(object):
    Generic model checker
    """
    def setUp(self):
+        np.random.seed(fixed_seed)
        self.N = 15
        self.D = 3
        self.X = np.random.rand(self.N, self.D)*10
@ -218,7 +218,8 @@ class TestNoiseModels(object):
                    "constraints": [(".*variance", self.constrain_positive)]
                },
                "laplace": True,
-                "ep": False # FIXME: Should be True when we have it working again
+                "ep": False, # FIXME: Should be True when we have it working again
+                "variational_expectations": True,
            },
            "Gaussian_log": {
                "model": GPy.likelihoods.Gaussian(gp_link=link_functions.Log(), variance=self.var),
@ -227,7 +228,8 @@ class TestNoiseModels(object):
                    "vals": [self.var],
                    "constraints": [(".*variance", self.constrain_positive)]
                },
-                "laplace": True
+                "laplace": True,
+                "variational_expectations": True
            },
            #"Gaussian_probit": {
            #"model": GPy.likelihoods.gaussian(gp_link=link_functions.Probit(), variance=self.var, D=self.D, N=self.N),
@ -252,7 +254,8 @@ class TestNoiseModels(object):
                "link_f_constraints": [partial(self.constrain_bounded, lower=0, upper=1)],
                "laplace": True,
                "Y": self.binary_Y,
-                "ep": False # FIXME: Should be True when we have it working again
+                "ep": False, # FIXME: Should be True when we have it working again
+                "variational_expectations": True
            },
            "Exponential_default": {
                "model": GPy.likelihoods.Exponential(),
@ -347,6 +350,10 @@ class TestNoiseModels(object):
                ep = attributes["ep"]
            else:
                ep = False
+            if "variational_expectations" in attributes:
+                var_exp = attributes["variational_expectations"]
+            else:
+                var_exp = False

            #if len(param_vals) > 1:
                #raise NotImplementedError("Cannot support multiple params in likelihood yet!")
@ -377,6 +384,11 @@ class TestNoiseModels(object):
            if ep:
                #ep likelihood gradcheck
                yield self.t_ep_fit_rbf_white, model, self.X, Y, f, Y_metadata, self.step, param_vals, param_names, param_constraints
+            if var_exp:
+                #Need to specify mu and var!
+                yield self.t_varexp, model, Y, Y_metadata
+                yield self.t_dexp_dmu, model, Y, Y_metadata
+                yield self.t_dexp_dvar, model, Y, Y_metadata


        self.tearDown()
@ -603,6 +615,87 @@ class TestNoiseModels(object):
        print(m)
        assert m.checkgrad(verbose=1, step=step)

+    ################
+    # variational expectations #
+    ################
+    @with_setup(setUp, tearDown)
+    def t_varexp(self, model, Y, Y_metadata):
+        #Test that the analytic implementation (if it exists) matches the generic gauss
+        #hermite implementation
+        print("\n{}".format(inspect.stack()[0][3]))
+        #Make mu and var (marginal means and variances of q(f)) draws from a GP
+        k = GPy.kern.RBF(1).K(np.linspace(0,1,Y.shape[0])[:, None])
+        L = GPy.util.linalg.jitchol(k)
+        mu = L.dot(np.random.randn(*Y.shape))
+        #Variance must be positive
+        var = np.abs(L.dot(np.random.randn(*Y.shape))) + 0.01
+
+        expectation = model.variational_expectations(Y=Y, m=mu, v=var, gh_points=None, Y_metadata=Y_metadata)[0]
+
+        #Implementation of gauss hermite integration
+        shape = mu.shape
+        gh_x, gh_w= np.polynomial.hermite.hermgauss(50)
+        m,v,Y = mu.flatten(), var.flatten(), Y.flatten()
+        #make a grid of points
+        X = gh_x[None,:]*np.sqrt(2.*v[:,None]) + m[:,None]
+        #evaluate the likelhood for the grid. First ax indexes the data (and mu, var) and the second indexes the grid.
+        # broadcast needs to be handled carefully.
+        logp = model.logpdf(X, Y[:,None], Y_metadata=Y_metadata)
+        #average over the gird to get derivatives of the Gaussian's parameters
+        #division by pi comes from fact that for each quadrature we need to scale by 1/sqrt(pi)
+        expectation_gh  = np.dot(logp, gh_w)/np.sqrt(np.pi)
+        expectation_gh = expectation_gh.reshape(*shape)
+
+        np.testing.assert_almost_equal(expectation, expectation_gh, decimal=5)
+
+    @with_setup(setUp, tearDown)
+    def t_dexp_dmu(self, model, Y, Y_metadata):
+        print("\n{}".format(inspect.stack()[0][3]))
+        #Make mu and var (marginal means and variances of q(f)) draws from a GP
+        k = GPy.kern.RBF(1).K(np.linspace(0,1,Y.shape[0])[:, None])
+        L = GPy.util.linalg.jitchol(k)
+        mu = L.dot(np.random.randn(*Y.shape))
+        #Variance must be positive
+        var = np.abs(L.dot(np.random.randn(*Y.shape))) + 0.01
+        expectation = functools.partial(model.variational_expectations, Y=Y, v=var, gh_points=None, Y_metadata=Y_metadata)
+
+        #Function to get the nth returned value
+        def F(mu):
+            return expectation(m=mu)[0]
+        def dmu(mu):
+            return expectation(m=mu)[1]
+
+        grad = GradientChecker(F, dmu, mu.copy(), 'm')
+
+        grad.randomize()
+        print(grad)
+        print(model)
+        assert grad.checkgrad(verbose=1)
+
+    @with_setup(setUp, tearDown)
+    def t_dexp_dvar(self, model, Y, Y_metadata):
+        print("\n{}".format(inspect.stack()[0][3]))
+        #Make mu and var (marginal means and variances of q(f)) draws from a GP
+        k = GPy.kern.RBF(1).K(np.linspace(0,1,Y.shape[0])[:, None])
+        L = GPy.util.linalg.jitchol(k)
+        mu = L.dot(np.random.randn(*Y.shape))
+        #Variance must be positive
+        var = np.abs(L.dot(np.random.randn(*Y.shape))) + 0.01
+        expectation = functools.partial(model.variational_expectations, Y=Y, m=mu, gh_points=None, Y_metadata=Y_metadata)
+
+        #Function to get the nth returned value
+        def F(var):
+            return expectation(v=var)[0]
+        def dvar(var):
+            return expectation(v=var)[2]
+
+        grad = GradientChecker(F, dvar, var.copy(), 'v')
+
+        self.constrain_positive('v', grad)
+        #grad.randomize()
+        print(grad)
+        print(model)
+        assert grad.checkgrad(verbose=1)

 class LaplaceTests(unittest.TestCase):
    """
@ -610,6 +703,7 @@ class LaplaceTests(unittest.TestCase):
    """

    def setUp(self):
+        np.random.seed(fixed_seed)
        self.N = 15
        self.D = 1
        self.X = np.random.rand(self.N, self.D)*10
--- a/GPy/testing/link_function_tests.py
+++ b/GPy/testing/link_function_tests.py
@ -49,7 +49,7 @@ class LinkFunctionTests(np.testing.TestCase):
        self.assertTrue(grad3.checkgrad(verbose=True))

        if test_lim:
-            print "Testing limits"
+            print("Testing limits")
            #Remove some otherwise we are too close to the limit for gradcheck to work effectively
            lim_of_inf = lim_of_inf - 1e-4
            grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=lim_of_inf)
@ -79,8 +79,7 @@ class LinkFunctionTests(np.testing.TestCase):
        assert np.isinf(np.exp(np.log(self.f_upper_lim)))
        #Check the clipping works
        np.testing.assert_almost_equal(link.transf(self.f_lower_lim), 0, decimal=5)
-        #Need to look at most significant figures here rather than the decimals
-        np.testing.assert_approx_equal(link.transf(self.f_upper_lim), _lim_val, significant=5)
+        self.assertTrue(np.isfinite(link.transf(self.f_upper_lim)))
        self.check_overflow(link, lim_of_inf)

        #Check that it would otherwise fail
--- a/GPy/testing/misc_tests.py
+++ b/GPy/testing/misc_tests.py
@ -1,6 +1,7 @@
 import numpy as np
 import scipy as sp
 import GPy
+import warnings

 class MiscTests(np.testing.TestCase):
    """
@ -11,8 +12,15 @@ class MiscTests(np.testing.TestCase):
        self._lim_val_exp = np.log(self._lim_val)

    def test_safe_exp_upper(self):
-        assert np.exp(self._lim_val_exp + 1) == np.inf
-        assert GPy.util.misc.safe_exp(self._lim_val_exp + 1) < np.inf
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter('always')  # always print
+            assert np.isfinite(np.exp(self._lim_val_exp))
+            assert np.isinf(np.exp(self._lim_val_exp + 1))
+            assert np.isfinite(GPy.util.misc.safe_exp(self._lim_val_exp + 1))
+
+            print w
+            print len(w)
+            assert len(w)==1 # should have one overflow warning

    def test_safe_exp_lower(self):
        assert GPy.util.misc.safe_exp(1e-10) < np.inf
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@ -352,8 +352,8 @@ class GradientTests(np.testing.TestCase):
        self.check_model(rbf, model_type='SparseGPRegression', dimension=2)

    def test_SparseGPRegression_rbf_linear_white_kern_1D(self):
-        ''' Testing the sparse GP regression with rbf kernel on 2d data '''
-        rbflin = GPy.kern.RBF(1) + GPy.kern.Linear(1)
+        ''' Testing the sparse GP regression with rbf kernel on 1d data '''
+        rbflin = GPy.kern.RBF(1) + GPy.kern.Linear(1) + GPy.kern.White(1, 1e-5)
        self.check_model(rbflin, model_type='SparseGPRegression', dimension=1)

    def test_SparseGPRegression_rbf_linear_white_kern_2D(self):
@ -361,14 +361,12 @@ class GradientTests(np.testing.TestCase):
        rbflin = GPy.kern.RBF(2) + GPy.kern.Linear(2)
        self.check_model(rbflin, model_type='SparseGPRegression', dimension=2)

-    # @unittest.expectedFailure
    def test_SparseGPRegression_rbf_linear_white_kern_2D_uncertain_inputs(self):
        ''' Testing the sparse GP regression with rbf, linear kernel on 2d data with uncertain inputs'''
        rbflin = GPy.kern.RBF(2) + GPy.kern.Linear(2)
        raise unittest.SkipTest("This is not implemented yet!")
        self.check_model(rbflin, model_type='SparseGPRegression', dimension=2, uncertain_inputs=1)

-    # @unittest.expectedFailure
    def test_SparseGPRegression_rbf_linear_white_kern_1D_uncertain_inputs(self):
        ''' Testing the sparse GP regression with rbf, linear kernel on 1d data with uncertain inputs'''
        rbflin = GPy.kern.RBF(1) + GPy.kern.Linear(1)
@ -385,6 +383,16 @@ class GradientTests(np.testing.TestCase):
        m = GPy.models.GPLVM(Y, input_dim, kernel=k)
        self.assertTrue(m.checkgrad())

+    def test_BCGPLVM_rbf_bias_white_kern_2D(self):
+        """ Testing GPLVM with rbf + bias kernel """
+        N, input_dim, D = 50, 1, 2
+        X = np.random.rand(N, input_dim)
+        k = GPy.kern.RBF(input_dim, 0.5, 0.9 * np.ones((1,))) + GPy.kern.Bias(input_dim, 0.1) + GPy.kern.White(input_dim, 0.05)
+        K = k.K(X)
+        Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
+        m = GPy.models.BCGPLVM(Y, input_dim, kernel=k)
+        self.assertTrue(m.checkgrad())
+
    def test_GPLVM_rbf_linear_white_kern_2D(self):
        """ Testing GPLVM with rbf + bias kernel """
        N, input_dim, D = 50, 1, 2
@ -410,23 +418,8 @@ class GradientTests(np.testing.TestCase):
        Z = np.linspace(0, 15, 4)[:, None]
        kernel = GPy.kern.RBF(1)
        m = GPy.models.SparseGPClassification(X, Y, kernel=kernel, Z=Z)
-        # distribution = GPy.likelihoods.likelihood_functions.Bernoulli()
-        # likelihood = GPy.likelihoods.EP(Y, distribution)
-        # m = GPy.core.SparseGP(X, likelihood, kernel, Z)
-        # m.ensure_default_constraints()
        self.assertTrue(m.checkgrad())

-    @unittest.expectedFailure
-    def test_generalized_FITC(self):
-        N = 20
-        X = np.hstack([np.random.rand(N / 2) + 1, np.random.rand(N / 2) - 1])[:, None]
-        k = GPy.kern.RBF(1) + GPy.kern.White(1)
-        Y = np.hstack([np.ones(N / 2), np.zeros(N / 2)])[:, None]
-        m = GPy.models.FITCClassification(X, Y, kernel=k)
-        m.update_likelihood_approximation()
-        self.assertTrue(m.checkgrad())
-
-    @unittest.expectedFailure
    def test_multioutput_regression_1D(self):
        X1 = np.random.rand(50, 1) * 8
        X2 = np.random.rand(30, 1) * 5
@ -436,12 +429,11 @@ class GradientTests(np.testing.TestCase):
        Y = np.vstack((Y1, Y2))

        k1 = GPy.kern.RBF(1)
-        m = GPy.models.GPMultioutputRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel_list=[k1])
-        import ipdb;ipdb.set_trace()
-        m.constrain_fixed('.*rbf_var', 1.)
+        m = GPy.models.GPCoregionalizedRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel=k1)
+        #import ipdb;ipdb.set_trace()
+        #m.constrain_fixed('.*rbf_var', 1.)
        self.assertTrue(m.checkgrad())

-    @unittest.expectedFailure
    def test_multioutput_sparse_regression_1D(self):
        X1 = np.random.rand(500, 1) * 8
        X2 = np.random.rand(300, 1) * 5
@ -451,8 +443,7 @@ class GradientTests(np.testing.TestCase):
        Y = np.vstack((Y1, Y2))

        k1 = GPy.kern.RBF(1)
-        m = GPy.models.SparseGPMultioutputRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel_list=[k1])
-        m.constrain_fixed('.*rbf_var', 1.)
+        m = GPy.models.SparseGPCoregionalizedRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel=k1)
        self.assertTrue(m.checkgrad())

    def test_gp_heteroscedastic_regression(self):
@ -481,6 +472,7 @@ class GradientTests(np.testing.TestCase):
        self.assertTrue(m.checkgrad())

    def test_gp_kronecker_gaussian(self):
+        np.random.seed(0)
        N1, N2 = 30, 20
        X1 = np.random.randn(N1, 1)
        X2 = np.random.randn(N2, 1)
@ -501,16 +493,16 @@ class GradientTests(np.testing.TestCase):

        m.randomize()
        mm[:] = m[:]
-        assert np.allclose(m.log_likelihood(), mm.log_likelihood())
-        assert np.allclose(m.gradient, mm.gradient)
+        self.assertTrue(np.allclose(m.log_likelihood(), mm.log_likelihood()))
+        self.assertTrue(np.allclose(m.gradient, mm.gradient))
        X1test = np.random.randn(100, 1)
        X2test = np.random.randn(100, 1)
        mean1, var1 = m.predict(X1test, X2test)
        yy, xx = np.meshgrid(X2test, X1test)
        Xgrid = np.vstack((xx.flatten(order='F'), yy.flatten(order='F'))).T
        mean2, var2 = mm.predict(Xgrid)
-        assert np.allclose(mean1, mean2)
-        assert np.allclose(var1, var2)
+        self.assertTrue( np.allclose(mean1, mean2) )
+        self.assertTrue( np.allclose(var1, var2) )

    def test_gp_VGPC(self):
        num_obs = 25
@ -518,7 +510,8 @@ class GradientTests(np.testing.TestCase):
        X = X[:, None]
        Y = 25. + np.sin(X / 20.) * 2. + np.random.rand(num_obs)[:, None]
        kern = GPy.kern.Bias(1) + GPy.kern.RBF(1)
-        m = GPy.models.GPVariationalGaussianApproximation(X, Y, kern)
+        lik = GPy.likelihoods.Gaussian()
+        m = GPy.models.GPVariationalGaussianApproximation(X, Y, kernel=kern, likelihood=lik)
        self.assertTrue(m.checkgrad())


--- a/GPy/util/block_matrices.py
+++ b/GPy/util/block_matrices.py
@ -100,10 +100,10 @@ def block_dot(A, B, diagonal=False):
        Dshape = D.shape
        if diagonal and (len(Cshape) == 1 or len(Dshape) == 1\
                or C.shape[0] != C.shape[1] or D.shape[0] != D.shape[1]):
-            print "Broadcasting, C: {} D:{}".format(C.shape, D.shape)
+            print("Broadcasting, C: {} D:{}".format(C.shape, D.shape))
            return C*D
        else:
-            print "Dotting, C: {} C:{}".format(C.shape, D.shape)
+            print("Dotting, C: {} C:{}".format(C.shape, D.shape))
            return np.dot(C,D)
    dot = np.vectorize(f, otypes = [np.object])
    return dot(A,B)
--- a/GPy/util/choleskies.py
+++ b/GPy/util/choleskies.py
@ -5,7 +5,7 @@ import numpy as np
 from . import linalg
 from .config import config

-import choleskies_cython
+from . import choleskies_cython

 def safe_root(N):
    i = np.sqrt(N)
@ -17,12 +17,12 @@ def safe_root(N):
 def _flat_to_triang_pure(flat_mat):
    N, D = flat_mat.shape
    M = (-1 + safe_root(8*N+1))//2
-    ret = np.zeros((M, M, D))
-    count = 0
-    for m in range(M):
-        for mm in range(m+1):
-            for d in range(D):
-              ret.flat[d + m*D*M + mm*D] = flat_mat.flat[count];
+    ret = np.zeros((D, M, M))
+    for d in range(D):
+        count = 0
+        for m in range(M):
+            for mm in range(m+1):
+              ret[d,m, mm] = flat_mat[count, d];
              count = count+1
    return ret

@ -33,15 +33,15 @@ def _flat_to_triang_cython(flat_mat):


 def _triang_to_flat_pure(L):
-    M, _, D = L.shape
+    D, _, M = L.shape

    N = M*(M+1)//2
    flat = np.empty((N, D))
-    count = 0;
-    for m in range(M):
-        for mm in range(m+1):
-            for d in range(D):
-                flat.flat[count] = L.flat[d + m*D*M + mm*D];
+    for d in range(D):
+        count = 0;
+        for m in range(M):
+            for mm in range(m+1):
+                flat[count,d] = L[d, m, mm]
                count = count +1
    return flat

@ -59,12 +59,12 @@ def _backprop_gradient_pure(dL, L):
    """
    dL_dK = np.tril(dL).copy()
    N = L.shape[0]
-    for k in xrange(N - 1, -1, -1):
-        for j in xrange(k + 1, N):
-            for i in xrange(j, N):
+    for k in range(N - 1, -1, -1):
+        for j in range(k + 1, N):
+            for i in range(j, N):
                dL_dK[i, k] -= dL_dK[i, j] * L[j, k]
                dL_dK[j, k] -= dL_dK[i, j] * L[i, k]
-        for j in xrange(k + 1, N):
+        for j in range(k + 1, N):
            dL_dK[j, k] /= L[k, k]
            dL_dK[k, k] -= L[j, k] * dL_dK[j, k]
        dL_dK[k, k] /= (2 * L[k, k])
@ -74,7 +74,7 @@ def triang_to_cov(L):
    return np.dstack([np.dot(L[:,:,i], L[:,:,i].T) for i in range(L.shape[-1])])

 def multiple_dpotri(Ls):
-    return np.dstack([linalg.dpotri(np.asfortranarray(Ls[:,:,i]), lower=1)[0] for i in range(Ls.shape[-1])])
+    return np.array([linalg.dpotri(np.asfortranarray(Ls[i]), lower=1)[0] for i in range(Ls.shape[0])])

 def indexes_to_fix_for_low_rank(rank, size):
    """
@ -100,7 +100,7 @@ def indexes_to_fix_for_low_rank(rank, size):
 if config.getboolean('cython', 'working'):
    triang_to_flat = _triang_to_flat_cython
    flat_to_triang = _flat_to_triang_cython
-    backprop_gradient = choleskies_cython.backprop_gradient
+    backprop_gradient = choleskies_cython.backprop_gradient_par_c
 else:
    backprop_gradient = _backprop_gradient_pure
    triang_to_flat =  _triang_to_flat_pure
--- a/GPy/util/choleskies_cython.c
+++ b/GPy/util/choleskies_cython.c
--- a/GPy/util/choleskies_cython.pyx
+++ b/GPy/util/choleskies_cython.pyx
@ -5,55 +5,110 @@
 # Copyright James Hensman and Alan Saul 2015

 import numpy as np
+from cython.parallel import prange, parallel
 cimport numpy as np
+cimport scipy.linalg.cython_blas as cblas

-def flat_to_triang(np.ndarray[double, ndim=2] flat, int M):
-    """take a matrix N x D and return a M X M x D array where
+def flat_to_triang(double[:, :] flat, int M):
+    """take a matrix N x D and return a D X M x M array where

    N = M(M+1)/2

    the lower triangluar portion of the d'th slice of the result is filled by the d'th column of flat.
    """
-    cdef int N = flat.shape[0]
    cdef int D = flat.shape[1]
+    cdef int N = flat.shape[0]
    cdef int count = 0
-    cdef np.ndarray[double, ndim=3] ret = np.zeros((M, M, D))
+    cdef double[:, :, ::1] ret = np.zeros((D, M, M))
    cdef int d, m, mm
-    for d in range(D):
-        count = 0
-        for m in range(M):
-            for mm in range(m+1):
-                ret[m, mm, d] = flat[count,d]
-                count += 1
+    with nogil:
+        for d in range(D):
+            count = 0
+            for m in range(M):
+                for mm in range(m+1):
+                    ret[d, m, mm] = flat[count,d]
+                    count += 1
    return ret

-def triang_to_flat(np.ndarray[double, ndim=3] L):
-    cdef int M = L.shape[0]
-    cdef int D = L.shape[2]
+def triang_to_flat(double[:, :, :] L):
+    cdef int D = L.shape[0]
+    cdef int M = L.shape[1]
    cdef int N = M*(M+1)/2
    cdef int count = 0
-    cdef np.ndarray[double, ndim=2] flat = np.empty((N, D))
+    cdef double[:, ::1] flat = np.empty((N, D))
    cdef int d, m, mm
-    for d in range(D):
-        count = 0
-        for m in range(M):
-            for mm in range(m+1):
-                flat[count,d] = L[m, mm, d]
-                count += 1
+    with nogil:
+        for d in range(D):
+            count = 0
+            for m in range(M):
+                for mm in range(m+1):
+                    flat[count,d] = L[d, m, mm]
+                    count += 1
    return flat

-
-def backprop_gradient(np.ndarray[double, ndim=2] dL, np.ndarray[double, ndim=2] L):
-    cdef np.ndarray[double, ndim=2] dL_dK = np.tril(dL).copy()
+def backprop_gradient(double[:, :] dL, double[:, :] L):
+    cdef double[:, ::1] dL_dK = np.tril(dL)
    cdef int N = L.shape[0]
    cdef int k, j, i
-    for k in range(N - 1, -1, -1):
-        for j in range(k + 1, N):
-            for i in range(j, N):
-                dL_dK[i, k] -= dL_dK[i, j] * L[j, k]
-                dL_dK[j, k] -= dL_dK[i, j] * L[i, k]
-        for j in range(k + 1, N):
-            dL_dK[j, k] /= L[k, k]
-            dL_dK[k, k] -= L[j, k] * dL_dK[j, k]
-        dL_dK[k, k] /= (2. * L[k, k])
+    with nogil:
+        for k in range(N - 1, -1, -1):
+            for j in range(k + 1, N):
+                for i in range(j, N):
+                    dL_dK[i, k] -= dL_dK[i, j] * L[j, k]
+                    dL_dK[j, k] -= dL_dK[i, j] * L[i, k]
+            for j in range(k + 1, N):
+                dL_dK[j, k] /= L[k, k]
+                dL_dK[k, k] -= L[j, k] * dL_dK[j, k]
+            dL_dK[k, k] /= (2. * L[k, k])
    return dL_dK
+
+def backprop_gradient_par(double[:,:] dL, double[:,:] L):
+    cdef double[:,::1] dL_dK = np.tril(dL)
+    cdef int N = L.shape[0]
+    cdef int k, j, i
+    with nogil:
+        for k in range(N - 1, -1, -1):
+            with parallel():
+                for i in prange(k + 1, N):
+                    for j in range(k+1, i+1):
+                        dL_dK[i, k] -= dL_dK[i, j] * L[j, k]
+                    for j in range(i, N):
+                        dL_dK[i, k] -= dL_dK[j, i] * L[j, k]
+            for j in range(k + 1, N):
+                dL_dK[j, k] /= L[k, k]
+                dL_dK[k, k] -= L[j, k] * dL_dK[j, k]
+            dL_dK[k, k] /= (2. * L[k, k])
+    return dL_dK
+
+cdef void chol_backprop(int N, double[:, ::1] dL, double[:, ::1] L) nogil:
+    cdef int i, k, n
+
+    # DSYMV required constant arguments
+    cdef double alpha=-1, beta=1
+    cdef int incx=N
+
+    # DSCAL required arguments
+    cdef double scale
+
+    dL[N - 1, N - 1] /= (2. * L[N - 1, N - 1])
+    for k in range(N-2, -1, -1):
+        n = N-k-1
+        cblas.dsymv(uplo='u', n=&n, alpha=&alpha, a=&dL[k + 1, k + 1], lda=&N, x=&L[k + 1, k], incx=&incx,
+                    beta=&beta, y=&dL[k + 1, k], incy=&N)
+
+        for i in xrange(0, N - k - 1):
+            dL[k + 1 + i, k] -= dL[k + i+ 1, k + i + 1] * L[k + 1 + i, k]
+
+        scale = 1.0 / L[k, k]
+        cblas.dscal(&n, &scale , &dL[k + 1, k], &N)
+#
+        dL[k, k] -= cblas.ddot(&n, &dL[k + 1, k], &N, &L[k+1, k], &incx)
+        dL[k, k] /= (2.0 * L[k, k])
+
+def backprop_gradient_par_c(double[:, :] dL, double[:, :] L):
+    cdef double[:, ::1] dL_dK = np.tril(dL) # makes a copy, c-contig
+    cdef double[:, ::1] L_cont = np.ascontiguousarray(L)
+    cdef int N = L.shape[0]
+    with nogil:
+        chol_backprop(N, dL_dK, L_cont)
+    return np.asarray(dL_dK)
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@ -15,7 +15,7 @@ import warnings
 import os
 from .config import config
 import logging
-import linalg_cython
+from . import linalg_cython


 _scipyversion = np.float64((scipy.__version__).split('.')[:2])
--- a/GPy/util/linalg_cython.c
+++ b/GPy/util/linalg_cython.c
--- a/GPy/util/linalg_cython.pyx
+++ b/GPy/util/linalg_cython.pyx
@ -1,3 +1,4 @@
+from libc.math cimport sqrt
 cimport numpy as np
 from cpython cimport bool
 import cython
@ -19,16 +20,18 @@ def symmetrify(np.ndarray[double, ndim=2] A, bool upper):
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
+@cython.cdivision(True)
 def cholupdate(np.ndarray[double, ndim=1] x, np.ndarray[double, ndim=2] L, int N):
-    cdef double r
-    cdef double c
-    cdef double s
-    for j in xrange(N):
-        r = np.sqrt(L[j,j]*L[j,j] + x[j]*x[j])
-        c = r / L[j,j]
-        s = x[j] / L[j,j]
-        L[j,j] = r
-        for i in xrange(j):
-            L[i,j] = (L[i,j] + s*x[i])/c
-            x[i] = c*x[i] - s*L[i,j];
-        r = np.sqrt(L[j,j])
+    cdef double r, c, s
+    cdef int j, i
+
+    with nogil:
+        for j in xrange(N):
+            r = sqrt(L[j, j] * L[j, j] + x[j] * x[j])
+            c = r / L[j, j]
+            s = x[j] / L[j, j]
+            L[j, j] = r
+            for i in xrange(j):
+                L[i, j] = (L[i, j] + s * x[i]) / c
+                x[i] = c * x[i] - s * L[i, j]
+            r = sqrt(L[j, j])
--- a/GPy/util/univariate_Gaussian.py
+++ b/GPy/util/univariate_Gaussian.py
@ -8,6 +8,7 @@ from scipy.special import ndtr as std_norm_cdf
 #define a standard normal pdf
 _sqrt_2pi = np.sqrt(2*np.pi)
 def std_norm_pdf(x):
+    x = np.clip(x,-1e150,1e150)
    return np.exp(-np.square(x)/2)/_sqrt_2pi

 def inv_std_norm_cdf(x):