merge the changes

2026-05-15 06:52:39 +02:00 · 2015-07-27 17:42:01 +01:00 · 2015-07-27 17:42:01 +01:00 · fc0e8f3e7e
commit fc0e8f3e7e
parent e785668417 54e9a4f6ff
66 changed files with 55761 additions and 702 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -18,7 +18,8 @@ before_install:

 install:
  - conda install --yes python=$TRAVIS_PYTHON_VERSION atlas numpy=1.7 scipy=0.12 matplotlib nose sphinx pip nose
-  - pip install . 
+  #- pip install . 
+  - python setup.py build_ext --inplace
  #--use-mirrors
  #
 # command to run tests, e.g. python setup.py test
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@ -89,7 +89,6 @@ class GP(Model):
            assert mean_function.output_dim == self.output_dim
            self.link_parameter(mean_function)

-
        #find a sensible inference method
        logger.info("initializing inference method")
        if inference_method is None:
@ -208,6 +207,7 @@ class GP(Model):
            Kxx = kern.Kdiag(_Xnew)
            var = Kxx - np.sum(WiKx*Kx, 0)
            var = var.reshape(-1, 1)
+            var[var<0.] = 0.

        #force mu to be a column vector
        if len(mu.shape)==1: mu = mu[:,None]
@ -229,13 +229,14 @@ class GP(Model):
        :param Y_metadata: metadata about the predicting point to pass to the likelihood
        :param kern: The kernel to use for prediction (defaults to the model
                     kern). this is useful for examining e.g. subprocesses.
-        :returns: (mean, var, lower_upper):
+        :returns: (mean, var):
            mean: posterior mean, a Numpy array, Nnew x self.input_dim
            var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
-            lower_upper: lower and upper boundaries of the 95% confidence intervals, Numpy arrays,  Nnew x self.input_dim

           If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew.
           This is to allow for different normalizations of the output dimensions.
+
+        Note: If you want the predictive quantiles (e.g. 95% confidence interval) use :py:func:"~GPy.core.gp.GP.predict_quantiles".
        """
        #predict the latent function values
        mu, var = self._raw_predict(Xnew, full_cov=full_cov, kern=kern)
@ -243,7 +244,7 @@ class GP(Model):
            mu, var = self.normalizer.inverse_mean(mu), self.normalizer.inverse_variance(var)

        # now push through likelihood
-        mean, var = self.likelihood.predictive_values(mu, var, full_cov, Y_metadata)
+        mean, var = self.likelihood.predictive_values(mu, var, full_cov, Y_metadata=Y_metadata)
        return mean, var

    def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None):
@ -255,12 +256,12 @@ class GP(Model):
        :param quantiles: tuple of quantiles, default is (2.5, 97.5) which is the 95% interval
        :type quantiles: tuple
        :returns: list of quantiles for each X and predictive quantiles for interval combination
-        :rtype: [np.ndarray (Xnew x self.input_dim), np.ndarray (Xnew x self.input_dim)]
+        :rtype: [np.ndarray (Xnew x self.output_dim), np.ndarray (Xnew x self.output_dim)]
        """
        m, v = self._raw_predict(X,  full_cov=False)
        if self.normalizer is not None:
            m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
-        return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata)
+        return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata=Y_metadata)

    def predictive_gradients(self, Xnew):
        """
@ -330,7 +331,7 @@ class GP(Model):
        :returns: Ysim: set of simulations, a Numpy array (N x samples).
        """
        fsim = self.posterior_samples_f(X, size, full_cov=full_cov)
-        Ysim = self.likelihood.samples(fsim, Y_metadata)
+        Ysim = self.likelihood.samples(fsim, Y_metadata=Y_metadata)
        return Ysim

    def plot_f(self, plot_limits=None, which_data_rows='all',
@ -395,7 +396,7 @@ class GP(Model):
        which_data_ycols='all', fixed_inputs=[],
        levels=20, samples=0, fignum=None, ax=None, resolution=None,
        plot_raw=False,
-        linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx'):
+        linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx', predict_kw=None):
        """
        Plot the posterior of the GP.
          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
@ -444,7 +445,7 @@ class GP(Model):
                                     which_data_ycols, fixed_inputs,
                                     levels, samples, fignum, ax, resolution,
                                     plot_raw=plot_raw, Y_metadata=Y_metadata,
-                                     data_symbol=data_symbol, **kw)
+                                     data_symbol=data_symbol, predict_kw=predict_kw, **kw)

    def input_sensitivity(self, summarize=True):
        """
@ -472,16 +473,51 @@ class GP(Model):
            self.inference_method.on_optimization_end()
            raise

-    def infer_newX(self, Y_new, optimize=True, ):
+    def infer_newX(self, Y_new, optimize=True):
        """
-        Infer the distribution of X for the new observed data *Y_new*.
+        Infer X for the new observed data *Y_new*.

        :param Y_new: the new observed data for inference
        :type Y_new: numpy.ndarray
        :param optimize: whether to optimize the location of new X (True by default)
        :type optimize: boolean
        :return: a tuple containing the posterior estimation of X and the model that optimize X
-        :rtype: (:class:`~GPy.core.parameterization.variational.VariationalPosterior` or numpy.ndarray, :class:`~GPy.core.model.Model`)
+        :rtype: (:class:`~GPy.core.parameterization.variational.VariationalPosterior` and numpy.ndarray, :class:`~GPy.core.model.Model`)
        """
        from ..inference.latent_function_inference.inferenceX import infer_newX
        return infer_newX(self, Y_new, optimize=optimize)
+
+    def log_predictive_density(self, x_test, y_test, Y_metadata=None):
+        """
+        Calculation of the log predictive density
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param x_test: test locations (x_{*})
+        :type x_test: (Nx1) array
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param Y_metadata: metadata associated with the test points
+        """
+        mu_star, var_star = self._raw_predict(x_test)
+        return self.likelihood.log_predictive_density(y_test, mu_star, var_star, Y_metadata=Y_metadata)
+
+    def log_predictive_density_sampling(self, x_test, y_test, Y_metadata=None, num_samples=1000):
+        """
+        Calculation of the log predictive density by sampling
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param x_test: test locations (x_{*})
+        :type x_test: (Nx1) array
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param Y_metadata: metadata associated with the test points
+        :param num_samples: number of samples to use in monte carlo integration
+        :type num_samples: int
+        """
+        mu_star, var_star = self._raw_predict(x_test)
+        return self.likelihood.log_predictive_density_sampling(y_test, mu_star, var_star, Y_metadata=Y_metadata, num_samples=num_samples)
+
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@ -76,7 +76,7 @@ class Model(Parameterized):
                jobs = []
                pool = mp.Pool(processes=num_processes)
                for i in range(num_restarts):
-                    self.randomize()
+                    if i>0: self.randomize()
                    job = pool.apply_async(opt_wrapper, args=(self,), kwds=kwargs)
                    jobs.append(job)

@ -90,7 +90,7 @@ class Model(Parameterized):
        for i in range(num_restarts):
            try:
                if not parallel:
-                    self.randomize()
+                    if i>0: self.randomize()
                    self.optimize(**kwargs)
                else:
                    self.optimization_runs.append(jobs[i].get())
--- a/GPy/core/parameterization/index_operations.py
+++ b/GPy/core/parameterization/index_operations.py
@ -5,6 +5,7 @@ import numpy
 from numpy.lib.function_base import vectorize
 from .lists_and_dicts import IntArrayDict
 from functools import reduce
+from .transformations import Transformation

 def extract_properties_to_index(index, props):
    prop_index = dict()
--- a/GPy/core/parameterization/param.py
+++ b/GPy/core/parameterization/param.py
@ -38,6 +38,11 @@ class Param(Parameterizable, ObsAr):
    Fixing parameters will fix them to the value they are right now. If you change
    the fixed value, it will be fixed to the new value!

+    Important Note:
+    Multilevel indexing (e.g. self[:2][1:]) is not supported and might lead to unexpected behaviour.
+    Try to index in one go, using boolean indexing or the numpy builtin
+    np.index function.
+    
    See :py:class:`GPy.core.parameterized.Parameterized` for more details on constraining etc.

    """
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@ -430,23 +430,38 @@ class Indexable(Nameable, Updateable):

    def log_prior(self):
        """evaluate the prior"""
-        if self.priors.size > 0:
-            x = self.param_array
-            #py3 fix
-            #return reduce(lambda a, b: a + b, (p.lnpdf(x[ind]).sum() for p, ind in self.priors.iteritems()), 0)
-            return reduce(lambda a, b: a + b, (p.lnpdf(x[ind]).sum() for p, ind in self.priors.items()), 0)
-        return 0.
+        if self.priors.size == 0:
+            return 0.
+        x = self.param_array
+        #evaluate the prior log densities
+        log_p = reduce(lambda a, b: a + b, (p.lnpdf(x[ind]).sum() for p, ind in self.priors.items()), 0)
+
+        #account for the transformation by evaluating the log Jacobian (where things are transformed)
+        log_j = 0.
+        priored_indexes = np.hstack([i for p, i in self.priors.items()])
+        for c,j in self.constraints.items():
+            if not isinstance(c, Transformation):continue
+            for jj in j:
+                if jj in priored_indexes:
+                    log_j += c.log_jacobian(x[jj])
+        return log_p + log_j

    def _log_prior_gradients(self):
        """evaluate the gradients of the priors"""
-        if self.priors.size > 0:
-            x = self.param_array
-            ret = np.zeros(x.size)
-            #py3 fix
-            #[np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.iteritems()]
-            [np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.items()]
-            return ret
-        return 0.
+        if self.priors.size == 0:
+            return 0.
+        x = self.param_array
+        ret = np.zeros(x.size)
+        #compute derivate of prior density
+        [np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.items()]
+        #add in jacobian derivatives if transformed
+        priored_indexes = np.hstack([i for p, i in self.priors.items()])
+        for c,j in self.constraints.items():
+            if not isinstance(c, Transformation):continue
+            for jj in j:
+                if jj in priored_indexes:
+                    ret[jj] += c.log_jacobian_grad(x[jj])
+        return ret

    #===========================================================================
    # Tie parameters together
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@ -9,7 +9,7 @@ from .param import ParamConcatenation
 from .parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing

 import logging
-from GPy.core.parameterization.index_operations import ParameterIndexOperationsView
+from .index_operations import ParameterIndexOperationsView
 logger = logging.getLogger("parameters changed meta")

 class ParametersChangedMeta(type):
--- a/GPy/core/parameterization/priors.py
+++ b/GPy/core/parameterization/priors.py
@ -522,16 +522,9 @@ class DGPLVM(Prior):

    """
    domain = _REAL
-    # _instances = []
-    # def __new__(cls, mu, sigma): # Singleton:
-    #     if cls._instances:
-    #         cls._instances[:] = [instance for instance in cls._instances if instance()]
-    #         for instance in cls._instances:
-    #             if instance().mu == mu and instance().sigma == sigma:
-    #                 return instance()
-    #     o = super(Prior, cls).__new__(cls, mu, sigma)
-    #     cls._instances.append(weakref.ref(o))
-    #     return cls._instances[-1]()
+    
+    def __new__(cls, sigma2, lbl, x_shape): 
+        return super(Prior, cls).__new__(cls, sigma2, lbl, x_shape)

    def __init__(self, sigma2, lbl, x_shape):
        self.sigma2 = sigma2
@ -730,7 +723,7 @@ class DGPLVM(Prior):

 # ******************************************

-from parameterized import Parameterized
+from .. import Parameterized
 from .. import Param
 class DGPLVM_Lamda(Prior, Parameterized):
    """
@ -758,12 +751,12 @@ class DGPLVM_Lamda(Prior, Parameterized):
        self.sigma2 = sigma2
        # self.x = x
        self.lbl = lbl
-	self.lamda = lamda
+        self.lamda = lamda
        self.classnum = lbl.shape[1]
        self.datanum = lbl.shape[0]
        self.x_shape = x_shape
        self.dim = x_shape[1]
-	self.lamda = Param('lamda', np.diag(lamda))
+        self.lamda = Param('lamda', np.diag(lamda))
        self.link_parameter(self.lamda)

    def get_class_label(self, y):
@ -789,7 +782,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
        M_i = np.zeros((self.classnum, self.dim))
        for i in cls:
            # Mean of each class
-	    class_i = cls[i]
+            class_i = cls[i]
            M_i[i] = np.mean(class_i, axis=0)
        return M_i

@ -899,8 +892,8 @@ class DGPLVM_Lamda(Prior, Parameterized):
 	#!!!!!!!!!!!!!!!!!!!!!!!!!!!
 	#self.lamda.values[:] = self.lamda.values/self.lamda.values.sum()	

-	xprime = x.dot(np.diagflat(self.lamda))
-	x = xprime
+        xprime = x.dot(np.diagflat(self.lamda))
+        x = xprime
 	# print x
        cls = self.compute_cls(x)
        M_0 = np.mean(x, axis=0)
@ -916,8 +909,8 @@ class DGPLVM_Lamda(Prior, Parameterized):
    # This function calculates derivative of the log of prior function
    def lnpdf_grad(self, x):
        x = x.reshape(self.x_shape)
-	xprime = x.dot(np.diagflat(self.lamda))
-	x = xprime
+        xprime = x.dot(np.diagflat(self.lamda))
+        x = xprime
 	# print x
        cls = self.compute_cls(x)
        M_0 = np.mean(x, axis=0)
@ -951,14 +944,14 @@ class DGPLVM_Lamda(Prior, Parameterized):
        # Because of the GPy we need to transpose our matrix so that it gets the same shape as out matrix (denominator layout!!!)
        DPxprim_Dx = DPxprim_Dx.T
    
-    	DPxprim_Dlamda = DPx_Dx.dot(x)
+        DPxprim_Dlamda = DPx_Dx.dot(x)

    	# Because of the GPy we need to transpose our matrix so that it gets the same shape as out matrix (denominator layout!!!)
-    	DPxprim_Dlamda = DPxprim_Dlamda.T
+        DPxprim_Dlamda = DPxprim_Dlamda.T

-	self.lamda.gradient = np.diag(DPxprim_Dlamda)
+        self.lamda.gradient = np.diag(DPxprim_Dlamda)
 	# print DPxprim_Dx
-    	return DPxprim_Dx
+        return DPxprim_Dx


    # def frb(self, x):
@ -1139,8 +1132,8 @@ class DGPLVM_T(Prior):
    # This function calculates log of our prior
    def lnpdf(self, x):
        x = x.reshape(self.x_shape)
-	xprim = x.dot(self.vec)
-	x = xprim
+        xprim = x.dot(self.vec)
+        x = xprim
 	# print x  
        cls = self.compute_cls(x)
        M_0 = np.mean(x, axis=0)
@ -1156,11 +1149,11 @@ class DGPLVM_T(Prior):

    # This function calculates derivative of the log of prior function
    def lnpdf_grad(self, x):
-	x = x.reshape(self.x_shape)
-	xprim = x.dot(self.vec)
-	x = xprim 
+        x = x.reshape(self.x_shape)
+        xprim = x.dot(self.vec)
+        x = xprim 
 	# print x       
-	cls = self.compute_cls(x)
+        cls = self.compute_cls(x)
        M_0 = np.mean(x, axis=0)
        M_i = self.compute_Mi(cls)
        Sb = self.compute_Sb(cls, M_i, M_0)
--- a/GPy/core/parameterization/transformations.py
+++ b/GPy/core/parameterization/transformations.py
@ -31,6 +31,16 @@ class Transformation(object):
        raise NotImplementedError
    def finv(self, model_param):
        raise NotImplementedError
+    def log_jacobian(self, model_param):
+        """
+        compute the log of the jacobian of f, evaluated at f(x)= model_param
+        """
+        raise NotImplementedError
+    def log_jacobian_grad(self, model_param):
+        """
+        compute the drivative of the log of the jacobian of f, evaluated at f(x)= model_param
+        """
+        raise NotImplementedError
    def gradfactor(self, model_param, dL_dmodel_param):
        """ df(opt_param)_dopt_param evaluated at self.f(opt_param)=model_param, times the gradient dL_dmodel_param,

@ -74,9 +84,33 @@ class Logexp(Transformation):
        if np.any(f < 0.):
            print("Warning: changing parameters to satisfy constraints")
        return np.abs(f)
+    def log_jacobian(self, model_param):
+        return np.where(model_param>_lim_val, model_param, np.log(np.exp(model_param+1e-20) - 1.)) - model_param
+    def log_jacobian_grad(self, model_param):
+        return 1./(np.exp(model_param)-1.)
    def __str__(self):
        return '+ve'

+class Exponent(Transformation):
+    domain = _POSITIVE
+    def f(self, x):
+        return np.where(x<_lim_val, np.where(x>-_lim_val, np.exp(x), np.exp(-_lim_val)), np.exp(_lim_val))
+    def finv(self, x):
+        return np.log(x)
+    def gradfactor(self, f, df):
+        return np.einsum('i,i->i', df, f)
+    def initialize(self, f):
+        if np.any(f < 0.):
+            print("Warning: changing parameters to satisfy constraints")
+        return np.abs(f)
+    def log_jacobian(self, model_param):
+        return np.log(model_param)
+    def log_jacobian_grad(self, model_param):
+        return 1./model_param
+    def __str__(self):
+        return '+ve'
+
+

 class NormalTheta(Transformation):
    "Do not use, not officially supported!"
@ -417,22 +451,6 @@ class LogexpClipped(Logexp):
    def __str__(self):
        return '+ve_c'

-class Exponent(Transformation):
-    # TODO: can't allow this to go to zero, need to set a lower bound. Similar with negative Exponent below. See old MATLAB code.
-    domain = _POSITIVE
-    def f(self, x):
-        return np.where(x<_lim_val, np.where(x>-_lim_val, np.exp(x), np.exp(-_lim_val)), np.exp(_lim_val))
-    def finv(self, x):
-        return np.log(x)
-    def gradfactor(self, f, df):
-        return np.einsum('i,i->i', df, f)
-    def initialize(self, f):
-        if np.any(f < 0.):
-            print("Warning: changing parameters to satisfy constraints")
-        return np.abs(f)
-    def __str__(self):
-        return '+ve'
-
 class NegativeExponent(Exponent):
    domain = _NEGATIVE
    def f(self, x):
--- a/GPy/core/parameterization/variational.py
+++ b/GPy/core/parameterization/variational.py
@ -36,8 +36,9 @@ class NormalPrior(VariationalPrior):
        variational_posterior.variance.gradient -= (1. - (1. / (variational_posterior.variance))) * 0.5

 class SpikeAndSlabPrior(VariationalPrior):
-    def __init__(self, pi=None, learnPi=False, variance = 1.0, name='SpikeAndSlabPrior', **kw):
-        super(SpikeAndSlabPrior, self).__init__(name=name, **kw)        
+    def __init__(self, pi=None, learnPi=False, variance = 1.0, group_spike=False, name='SpikeAndSlabPrior', **kw):
+        super(SpikeAndSlabPrior, self).__init__(name=name, **kw)
+        self.group_spike = group_spike
        self.variance = Param('variance',variance)
        self.learnPi = learnPi
        if learnPi:
@ -50,7 +51,10 @@ class SpikeAndSlabPrior(VariationalPrior):
    def KL_divergence(self, variational_posterior):
        mu = variational_posterior.mean
        S = variational_posterior.variance
-        gamma = variational_posterior.gamma.values
+        if self.group_spike:
+            gamma = variational_posterior.gamma.values[0]
+        else:
+            gamma = variational_posterior.gamma.values
        if len(self.pi.shape)==2:
            idx = np.unique(variational_posterior.gamma._raveled_index()/gamma.shape[-1])
            pi = self.pi[idx]
@ -65,14 +69,21 @@ class SpikeAndSlabPrior(VariationalPrior):
    def update_gradients_KL(self, variational_posterior):
        mu = variational_posterior.mean
        S = variational_posterior.variance
-        gamma = variational_posterior.gamma.values
+        if self.group_spike:
+            gamma = variational_posterior.gamma.values[0]
+        else:
+            gamma = variational_posterior.gamma.values
        if len(self.pi.shape)==2:
            idx = np.unique(variational_posterior.gamma._raveled_index()/gamma.shape[-1])
            pi = self.pi[idx]
        else:
            pi = self.pi

-        variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+        if self.group_spike:
+            dgamma = np.log((1-pi)/pi*gamma/(1.-gamma))/variational_posterior.num_data
+        else:
+            dgamma = np.log((1-pi)/pi*gamma/(1.-gamma))
+        variational_posterior.binary_prob.gradient -= dgamma+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
        mu.gradient -= gamma*mu/self.variance
        S.gradient -= (1./self.variance - 1./S) * gamma /2.
        if self.learnPi:
@ -154,13 +165,31 @@ class SpikeAndSlabPosterior(VariationalPosterior):
    '''
    The SpikeAndSlab distribution for variational approximations.
    '''
-    def __init__(self, means, variances, binary_prob, name='latent space'):
+    def __init__(self, means, variances, binary_prob, group_spike=False, sharedX=False, name='latent space'):
        """
        binary_prob : the probability of the distribution on the slab part.
        """
        super(SpikeAndSlabPosterior, self).__init__(means, variances, name)
-        self.gamma = Param("binary_prob",binary_prob,Logistic(0.,1.))
-        self.link_parameter(self.gamma)
+        self.group_spike = group_spike
+        self.sharedX = sharedX
+        if sharedX:
+            self.mean.fix(warning=False)
+            self.variance.fix(warning=False)
+        if group_spike:
+            self.gamma_group = Param("binary_prob_group",binary_prob.mean(axis=0),Logistic(1e-10,1.-1e-10))
+            self.gamma = Param("binary_prob",binary_prob, __fixed__)
+            self.link_parameters(self.gamma_group,self.gamma)
+        else:
+            self.gamma = Param("binary_prob",binary_prob,Logistic(1e-10,1.-1e-10))
+            self.link_parameter(self.gamma)
+            
+    def propogate_val(self):
+        if self.group_spike:
+            self.gamma.values[:] = self.gamma_group.values
+    
+    def collate_gradient(self):
+        if self.group_spike:
+            self.gamma_group.gradient = self.gamma.gradient.reshape(self.gamma.shape).sum(axis=0)

    def set_gradients(self, grad):
        self.mean.gradient, self.variance.gradient, self.gamma.gradient = grad
@ -179,15 +208,15 @@ class SpikeAndSlabPosterior(VariationalPosterior):
            n.parameters[dc['variance']._parent_index_] = dc['variance']
            n.parameters[dc['binary_prob']._parent_index_] = dc['binary_prob']
            n._gradient_array_ = None
-            oversize = self.size - self.mean.size - self.variance.size
-            n.size = n.mean.size + n.variance.size + oversize
+            oversize = self.size - self.mean.size - self.variance.size - self.gamma.size
+            n.size = n.mean.size + n.variance.size + n.gamma.size + oversize
            n.ndim = n.mean.ndim
            n.shape = n.mean.shape
            n.num_data = n.mean.shape[0]
            n.input_dim = n.mean.shape[1] if n.ndim != 1 else 1
            return n
        else:
-            return super(VariationalPrior, self).__getitem__(s)
+            return super(SpikeAndSlabPosterior, self).__getitem__(s)

    def plot(self, *args, **kwargs):
        """
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@ -132,14 +132,14 @@ class SparseGP(GP):
                if self.posterior.woodbury_inv.ndim == 2:
                    var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
                elif self.posterior.woodbury_inv.ndim == 3:
-                    var = Kxx[:,:,None] - np.tensordot(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx).T, Kx, [1,0]).swapaxes(1,2)
+                    var = np.empty((Kxx.shape[0],Kxx.shape[1],self.posterior.woodbury_inv.shape[2]))
+                    for i in range(var.shape[2]):
+                        var[:, :, i] = (Kxx - mdot(Kx.T, self.posterior.woodbury_inv[:, :, i], Kx))
                var = var
            else:
                Kxx = kern.Kdiag(Xnew)
                if self.posterior.woodbury_inv.ndim == 2:
-		    var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T 
-		    #For plot_latent, the below code doesn't work!
-                    #var = Kxx - np.sum(np.dot(self.posterior.woodbury_inv.T, Kx) * Kx, 0)
+                    var = (Kxx - np.sum(np.dot(self.posterior.woodbury_inv.T, Kx) * Kx, 0))[:,None]
                elif self.posterior.woodbury_inv.ndim == 3:
                    var = np.empty((Kxx.shape[0],self.posterior.woodbury_inv.shape[2]))
                    for i in range(var.shape[1]):
@ -149,9 +149,9 @@ class SparseGP(GP):
            if self.mean_function is not None:
                mu += self.mean_function.f(Xnew)
        else:
-            psi0_star = self.kern.psi0(self.Z, Xnew)
-            psi1_star = self.kern.psi1(self.Z, Xnew)
-            #psi2_star = self.kern.psi2(self.Z, Xnew) # Only possible if we get NxMxM psi2 out of the code.
+            psi0_star = kern.psi0(self.Z, Xnew)
+            psi1_star = kern.psi1(self.Z, Xnew)
+            #psi2_star = kern.psi2(self.Z, Xnew) # Only possible if we get NxMxM psi2 out of the code.
            la = self.posterior.woodbury_vector
            mu = np.dot(psi1_star, la) # TODO: dimensions?
            
@ -163,7 +163,7 @@ class SparseGP(GP):
                
            for i in range(Xnew.shape[0]):
                _mu, _var = Xnew.mean.values[[i]], Xnew.variance.values[[i]]
-                psi2_star = self.kern.psi2(self.Z, NormalPosterior(_mu, _var))
+                psi2_star = kern.psi2(self.Z, NormalPosterior(_mu, _var))
                tmp = (psi2_star[:, :] - psi1_star[[i]].T.dot(psi1_star[[i]]))

                var_ = mdot(la.T, tmp, la)
--- a/GPy/core/svgp.py
+++ b/GPy/core/svgp.py
@ -9,7 +9,7 @@ from ..inference.latent_function_inference import SVGP as svgp_inf


 class SVGP(SparseGP):
-    def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, name='SVGP', Y_metadata=None, batchsize=None):
+    def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, name='SVGP', Y_metadata=None, batchsize=None, num_latent_functions=None):
        """
        Stochastic Variational GP.

@ -41,8 +41,12 @@ class SVGP(SparseGP):
        SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, mean_function=mean_function, inference_method=inf_method,
                 name=name, Y_metadata=Y_metadata, normalizer=False)

-        self.m = Param('q_u_mean', np.zeros((self.num_inducing, Y.shape[1])))
-        chol = choleskies.triang_to_flat(np.tile(np.eye(self.num_inducing)[:,:,None], (1,1,Y.shape[1])))
+        #assume the number of latent functions is one per col of Y unless specified
+        if num_latent_functions is None:
+            num_latent_functions = Y.shape[1]
+
+        self.m = Param('q_u_mean', np.zeros((self.num_inducing, num_latent_functions)))
+        chol = choleskies.triang_to_flat(np.tile(np.eye(self.num_inducing)[None,:,:], (num_latent_functions, 1,1)))
        self.chol = Param('q_u_chol', chol)
        self.link_parameter(self.chol)
        self.link_parameter(self.m)
--- a/GPy/core/verbose_optimization.py
+++ b/GPy/core/verbose_optimization.py
@ -5,9 +5,10 @@ from __future__ import print_function
 import numpy as np
 import sys
 import time
+import datetime

 def exponents(fnow, current_grad):
-    exps = [np.abs(np.float(fnow)), current_grad]
+    exps = [np.abs(np.float(fnow)), 1 if current_grad is np.nan else current_grad]
    return np.sign(exps) * np.log10(exps).astype(int)

 class VerboseOptimization(object):
@ -23,6 +24,7 @@ class VerboseOptimization(object):
            self.model.add_observer(self, self.print_status)
            self.status = 'running'
            self.clear = clear_after_finish
+            self.deltat = .2

            self.update()

@ -44,25 +46,25 @@ class VerboseOptimization(object):
                self.hor_align = FlexBox(children = [left_col, right_col], width='100%', orientation='horizontal')

                display(self.hor_align)
-                                
+
                try:
                    self.text.set_css('width', '100%')
                    left_col.set_css({
                             'padding': '2px',
                             'width': "100%",
                             })
-    
+
                    right_col.set_css({
                             'padding': '2px',
                             })
-    
+
                    self.hor_align.set_css({
                             'width': "100%",
                             })

                    self.hor_align.remove_class('vbox')
                    self.hor_align.add_class('hbox')
-    
+
                    left_col.add_class("box-flex1")
                    right_col.add_class('box-flex0')

@ -74,16 +76,31 @@ class VerboseOptimization(object):
            else:
                self.exps = exponents(self.fnow, self.current_gradient)
                print('Running {} Code:'.format(self.opt_name))
-                print(' {3:7s}   {0:{mi}s}   {1:11s}    {2:11s}'.format("i", "f", "|g|", "secs", mi=self.len_maxiters))
+                print('  {3:7s}   {0:{mi}s}   {1:11s}    {2:11s}'.format("i", "f", "|g|", "runtime", mi=self.len_maxiters))

    def __enter__(self):
        self.start = time.time()
        return self

-    def print_out(self):
+    def print_out(self, seconds):
+        if seconds<60:
+            ms = (seconds%1)*100
+            self.timestring = "{s:0>2d}s{ms:0>2d}".format(s=int(seconds), ms=int(ms))
+        else:
+            m, s = divmod(seconds, 60)
+            if m>59:
+                h, m = divmod(m, 60)
+                if h>23:
+                    d, h = divmod(h, 24)
+                    self.timestring = '{d:0>2d}d{h:0>2d}h{m:0>2d}'.format(m=int(m), h=int(h), d=int(d))
+                else:
+                    self.timestring = '{h:0>2d}h{m:0>2d}m{s:0>2d}'.format(m=int(m), s=int(s), h=int(h))
+            else:
+                ms = (seconds%1)*100
+                self.timestring = '{m:0>2d}m{s:0>2d}s{ms:0>2d}'.format(m=int(m), s=int(s), ms=int(ms))
        if self.ipython_notebook:
            names_vals = [['optimizer', "{:s}".format(self.opt_name)],
-                          ['runtime [s]', "{:> g}".format(time.time()-self.start)],
+                          ['runtime', "{:>s}".format(self.timestring)],
                          ['evaluation', "{:>0{l}}".format(self.iteration, l=self.len_maxiters)],
                          ['objective', "{: > 12.3E}".format(self.fnow)],
                          ['||gradient||', "{: >+12.3E}".format(float(self.current_gradient))],
@ -120,14 +137,18 @@ class VerboseOptimization(object):
                if b:
                    self.exps = n_exps
            print('\r', end=' ')
-            print('{3:> 7.2g}  {0:>0{mi}g}  {1:> 12e}  {2:> 12e}'.format(self.iteration, float(self.fnow), float(self.current_gradient), time.time()-self.start, mi=self.len_maxiters), end=' ') # print 'Iteration:', iteration, ' Objective:', fnow, '  Scale:', beta, '\r',
+            print('{3:}  {0:>0{mi}g}  {1:> 12e}  {2:> 12e}'.format(self.iteration, float(self.fnow), float(self.current_gradient), "{:>8s}".format(self.timestring), mi=self.len_maxiters), end=' ') # print 'Iteration:', iteration, ' Objective:', fnow, '  Scale:', beta, '\r',
            sys.stdout.flush()

    def print_status(self, me, which=None):
        self.update()

+        seconds = time.time()-self.start
        #sys.stdout.write(" "*len(self.message))
-        self.print_out()
+        self.deltat += seconds
+        if self.deltat > .2:
+            self.print_out(seconds)
+            self.deltat = 0

        self.iteration += 1

@ -141,17 +162,24 @@ class VerboseOptimization(object):

    def finish(self, opt):
        self.status = opt.status
+        if self.verbose and self.ipython_notebook:
+            if 'conv' in self.status.lower():
+                self.progress.bar_style = 'success'
+            elif self.iteration >= self.maxiters:
+                self.progress.bar_style = 'warning'
+            else:
+                self.progress.bar_style = 'danger'

    def __exit__(self, type, value, traceback):
        if self.verbose:
            self.stop = time.time()
            self.model.remove_observer(self)
-            self.print_out()
+            self.print_out(self.stop - self.start)

            if not self.ipython_notebook:
                print()
-                print('Optimization finished in {0:.5g} Seconds'.format(self.stop-self.start))
-                print('Optimization status: {0}'.format(self.status))             
+                print('Runtime: {}'.format("{:>9s}".format(self.timestring)))
+                print('Optimization status: {0}'.format(self.status))
                print()
            elif self.clear:
                self.hor_align.close()
--- a/GPy/defaults.cfg
+++ b/GPy/defaults.cfg
@ -25,3 +25,6 @@ MKL = False
 [weave]
 #if true, try to use weave, and fall back to numpy. if false, just use numpy.
 working = True
+
+[cython]
+working = True
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@ -217,9 +217,8 @@ def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=
    elif model_type == 'FITC':
        m = GPy.models.FITCClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing)
        m['.*len'] = 3.
-
    if optimize:
-        m.pseudo_EM()
+        m.optimize()

    if plot:
        m.plot()
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@ -215,6 +215,7 @@ def ssgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40
    return m

 def _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim=False):
+    """Simulate some data drawn from a matern covariance and a periodic exponential for use in MRD demos."""
    Q_signal = 4
    import GPy
    import numpy as np
@ -254,6 +255,7 @@ def _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim=False):
    return slist, [S1, S2, S3], Ylist

 def _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim=False):
+    """Simulate some data drawn from sine and cosine for use in demos of MRD"""
    _np.random.seed(1234)

    x = _np.linspace(0, 4 * _np.pi, N)[:, None]
@ -353,13 +355,13 @@ def ssgplvm_simulation(optimize=True, verbose=1,
    Y = Ylist[0]
    k = kern.Linear(Q, ARD=True)  # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
    # k = kern.RBF(Q, ARD=True, lengthscale=10.)
-    m = SSGPLVM(Y, Q, init="pca", num_inducing=num_inducing, kernel=k)
+    m = SSGPLVM(Y, Q, init="rand", num_inducing=num_inducing, kernel=k, group_spike=True)
    m.X.variance[:] = _np.random.uniform(0, .01, m.X.shape)
-    m.likelihood.variance = .1
+    m.likelihood.variance = .01

    if optimize:
        print("Optimizing model:")
-        m.optimize('scg', messages=verbose, max_iters=max_iters,
+        m.optimize('bfgs', messages=verbose, max_iters=max_iters,
                   gtol=.05)
    if plot:
        m.X.plot("SSGPLVM Latent Space 1D")
@ -402,7 +404,8 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
    from GPy.models import MRD

    D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
-    _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
+    _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim)
+    

    # Ylist = [Ylist[0]]
    k = kern.Linear(Q, ARD=True)
@ -585,6 +588,7 @@ def robot_wireless(optimize=True, verbose=True, plot=True):
    return m

 def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
+    """Interactive visualisation of the Stick Man data from Ohio State University with the Bayesian GPLVM."""
    from GPy.models import BayesianGPLVM
    from matplotlib import pyplot as plt
    import numpy as np
@ -613,7 +617,8 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
        data_show = GPy.plotting.matplot_dep.visualize.stick_show(y, connect=data['connect'])
        dim_select = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean[:1, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
        fig.canvas.draw()
-        fig.canvas.show()
+        # Canvas.show doesn't work on OSX.
+        #fig.canvas.show()
        raw_input('Press enter to finish')

    return m
--- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py
+++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
@ -52,7 +52,7 @@ class ExactGaussianInference(LatentFunctionInference):
        K = kern.K(X)

        Ky = K.copy()
-        diag.add(Ky, likelihood.gaussian_variance(Y_metadata))
+        diag.add(Ky, likelihood.gaussian_variance(Y_metadata)+1e-8)
        Wi, LW, LWi, W_logdet = pdinv(Ky)

        alpha, _ = dpotrs(LW, YYT_factor, lower=1)
--- a/GPy/inference/latent_function_inference/inferenceX.py
+++ b/GPy/inference/latent_function_inference/inferenceX.py
@ -27,12 +27,19 @@ def infer_newX(model, Y_new, optimize=True, init='L2'):

 class InferenceX(Model):
    """
-    The class for inference of new X with given new Y. (do_test_latent)
+    The model class for inference of new X with given new Y. (replacing the "do_test_latent" in Bayesian GPLVM)
+    It is a tiny inference model created from the original GP model. The kernel, likelihood (only Gaussian is supported at the moment) 
+    and posterior distribution are taken from the original model.
+    For Regression models and GPLVM, a point estimate of the latent variable X will be inferred. 
+    For Bayesian GPLVM, the variational posterior of X will be inferred. 
+    X is inferred through a gradient optimization of the inference model.

    :param model: the GPy model used in inference
    :type model: GPy.core.Model
    :param Y: the new observed data for inference
    :type Y: numpy.ndarray
+    :param init: the distance metric of Y for initializing X with the nearest neighbour.
+    :type init: 'L2', 'NCC' and 'rand'
    """
    def __init__(self, model, Y, name='inferenceX', init='L2'):
        if np.isnan(Y).any() or getattr(model, 'missing_data', False):
@ -45,17 +52,23 @@ class InferenceX(Model):
        super(InferenceX, self).__init__(name)
        self.likelihood = model.likelihood.copy()
        self.kern = model.kern.copy()
-        if model.kern.useGPU:
-            from ...models import SSGPLVM
-            if isinstance(model, SSGPLVM):
-                self.kern.GPU_SSRBF(True)
-            else:
-                self.kern.GPU(True)
+#         if model.kern.useGPU:
+#             from ...models import SSGPLVM
+#             if isinstance(model, SSGPLVM):
+#                 self.kern.GPU_SSRBF(True)
+#             else:
+#                 self.kern.GPU(True)
        from copy import deepcopy
        self.posterior = deepcopy(model.posterior)
        if hasattr(model, 'variational_prior'):
            self.uncertain_input = True
-            self.variational_prior = model.variational_prior.copy()
+            from ...models.ss_gplvm import IBPPrior
+            from ...models.ss_mrd import IBPPrior_SSMRD
+            if isinstance(model.variational_prior, IBPPrior) or isinstance(model.variational_prior, IBPPrior_SSMRD):
+                from ...core.parameterization.variational import SpikeAndSlabPrior
+                self.variational_prior = SpikeAndSlabPrior(pi=05,learnPi=False, group_spike=False)
+            else:
+                self.variational_prior = model.variational_prior.copy()
        else:
            self.uncertain_input = False
        if hasattr(model, 'inducing_inputs'):
@ -147,9 +160,9 @@ class InferenceX(Model):
            from ...core.parameterization.variational import SpikeAndSlabPrior
            if isinstance(self.variational_prior, SpikeAndSlabPrior):
                # Update Log-likelihood
-                KL_div = self.variational_prior.KL_divergence(self.X, N=self.Y.shape[0])
+                KL_div = self.variational_prior.KL_divergence(self.X)
                # update for the KL divergence
-                self.variational_prior.update_gradients_KL(self.X, N=self.Y.shape[0])
+                self.variational_prior.update_gradients_KL(self.X)
            else:
                # Update Log-likelihood
                KL_div = self.variational_prior.KL_divergence(self.X)
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@ -139,10 +139,6 @@ class Laplace(LatentFunctionInference):

        f_hat, Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)

-        self.f_hat = f_hat
-        #self.Ki_fhat =  Ki_fhat
-        #self.K = K.copy()
-
        #Compute hessian and other variables at mode
        log_marginal, woodbury_inv, dL_dK, dL_dthetaL = self.mode_computations(f_hat, Ki_fhat, K, Y, likelihood, kern, Y_metadata)

@ -298,6 +294,11 @@ class Laplace(LatentFunctionInference):
        else:
            dL_dthetaL = np.zeros(likelihood.size)

+        #Cache some things for speedy LOO
+        self.Ki_W_i = Ki_W_i
+        self.K = K
+        self.W = W
+        self.f_hat = f_hat
        return log_marginal, K_Wi_i, dL_dK, dL_dthetaL

    def _compute_B_statistics(self, K, W, log_concave, *args, **kwargs):
--- a/GPy/inference/latent_function_inference/svgp.py
+++ b/GPy/inference/latent_function_inference/svgp.py
@ -3,25 +3,27 @@ from ...util import linalg
 from ...util import choleskies
 import numpy as np
 from .posterior import Posterior
+from scipy.linalg.blas import dgemm, dsymm, dtrmm

 class SVGP(LatentFunctionInference):

    def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):

-        num_inducing = Z.shape[0]
-        num_data, num_outputs = Y.shape
+        num_data, _ = Y.shape
+        num_inducing, num_outputs = q_u_mean.shape

        #expand cholesky representation
        L = choleskies.flat_to_triang(q_u_chol)
-        S = np.einsum('ijk,ljk->ilk', L, L) #L.dot(L.T)
+
+
+        S = np.empty((num_outputs, num_inducing, num_inducing))
+        [np.dot(L[i,:,:], L[i,:,:].T, S[i,:,:]) for i in range(num_outputs)]
        #Si,_ = linalg.dpotri(np.asfortranarray(L), lower=1)
        Si = choleskies.multiple_dpotri(L)
-        logdetS = np.array([2.*np.sum(np.log(np.abs(np.diag(L[:,:,i])))) for i in range(L.shape[-1])])
+        logdetS = np.array([2.*np.sum(np.log(np.abs(np.diag(L[i,:,:])))) for i in range(L.shape[0])])

        if np.any(np.isinf(Si)):
            raise ValueError("Cholesky representation unstable")
-            #S = S + np.eye(S.shape[0])*1e-5*np.max(np.max(S))
-            #Si, Lnew, _,_ = linalg.pdinv(S)

        #compute mean function stuff
        if mean_function is not None:
@ -31,26 +33,31 @@ class SVGP(LatentFunctionInference):
            prior_mean_u = np.zeros((num_inducing, num_outputs))
            prior_mean_f = np.zeros((num_data, num_outputs))

-
        #compute kernel related stuff
        Kmm = kern.K(Z)
-        Knm = kern.K(X, Z)
+        Kmn = kern.K(Z, X)
        Knn_diag = kern.Kdiag(X)
-        Kmmi, Lm, Lmi, logdetKmm = linalg.pdinv(Kmm)
+        Lm = linalg.jitchol(Kmm)
+        logdetKmm = 2.*np.sum(np.log(np.diag(Lm)))
+        Kmmi, _ = linalg.dpotri(Lm)

        #compute the marginal means and variances of q(f)
-        A = np.dot(Knm, Kmmi)
-        mu = prior_mean_f + np.dot(A, q_u_mean - prior_mean_u)
-        v = Knn_diag[:,None] - np.sum(A*Knm,1)[:,None] + np.sum(A[:,:,None] * np.einsum('ij,jkl->ikl', A, S),1)
+        A, _ = linalg.dpotrs(Lm, Kmn)
+        mu = prior_mean_f + np.dot(A.T, q_u_mean - prior_mean_u)
+        v = np.empty((num_data, num_outputs))
+        for i in range(num_outputs):
+            tmp = dtrmm(1.0,L[i].T, A, lower=0, trans_a=0)
+            v[:,i] = np.sum(np.square(tmp),0)
+        v += (Knn_diag - np.sum(A*Kmn,0))[:,None]

        #compute the KL term
        Kmmim = np.dot(Kmmi, q_u_mean)
-        KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.einsum('ij,ijk->k', Kmmi, S) + 0.5*np.sum(q_u_mean*Kmmim,0)
+        KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.sum(Kmmi[None,:,:]*S,1).sum(1) + 0.5*np.sum(q_u_mean*Kmmim,0)
        KL = KLs.sum()
        #gradient of the KL term (assuming zero mean function)
        dKL_dm = Kmmim.copy()
-        dKL_dS = 0.5*(Kmmi[:,:,None] - Si)
-        dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(-1)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T)
+        dKL_dS = 0.5*(Kmmi[None,:,:] - Si)
+        dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(0)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T)

        if mean_function is not None:
            #adjust KL term for mean function
@ -75,14 +82,20 @@ class SVGP(LatentFunctionInference):
            dF_dthetaL =  dF_dthetaL.sum(1).sum(1)*batch_scale

        #derivatives of expected likelihood, assuming zero mean function
-        Adv = A.T[:,:,None]*dF_dv[None,:,:] # As if dF_Dv is diagonal
-        Admu = A.T.dot(dF_dmu)
-        AdvA = np.dstack([np.dot(A.T, Adv[:,:,i].T) for i in range(num_outputs)])
-        tmp = np.einsum('ijk,jlk->il', AdvA, S).dot(Kmmi)
-        dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(-1) - tmp - tmp.T
+        Adv = A[None,:,:]*dF_dv.T[:,None,:] # As if dF_Dv is diagonal, D, M, N
+        Admu = A.dot(dF_dmu)
+        Adv = np.ascontiguousarray(Adv) # makes for faster operations later...(inc dsymm)
+        AdvA = np.dot(Adv.reshape(-1, num_data),A.T).reshape(num_outputs, num_inducing, num_inducing )
+        tmp = np.sum([np.dot(a,s) for a, s in zip(AdvA, S)],0).dot(Kmmi)
+        dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(0) - tmp - tmp.T
        dF_dKmm = 0.5*(dF_dKmm + dF_dKmm.T) # necessary? GPy bug?
-        tmp = 2.*(np.einsum('ij,jlk->ilk', Kmmi,S) - np.eye(num_inducing)[:,:,None])
-        dF_dKmn = np.einsum('ijk,jlk->il', tmp, Adv) + Kmmim.dot(dF_dmu.T)
+        tmp = S.reshape(-1, num_inducing).dot(Kmmi).reshape(num_outputs, num_inducing , num_inducing )
+        tmp = 2.*(tmp - np.eye(num_inducing)[None, :,:])
+
+        dF_dKmn = Kmmim.dot(dF_dmu.T)
+        for a,b in zip(tmp, Adv):
+            dF_dKmn += np.dot(a.T, b)
+
        dF_dm = Admu
        dF_dS = AdvA

@ -98,11 +111,11 @@ class SVGP(LatentFunctionInference):
        log_marginal = F.sum() - KL
        dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS- dKL_dS, dF_dKmm- dKL_dKmm, dF_dKmn

-        dL_dchol = np.dstack([2.*np.dot(dL_dS[:,:,i], L[:,:,i]) for i in range(num_outputs)])
+        dL_dchol = 2.*np.array([np.dot(a,b) for a, b in zip(dL_dS, L) ])
        dL_dchol = choleskies.triang_to_flat(dL_dchol)

        grad_dict = {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv.sum(1), 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
        if mean_function is not None:
            grad_dict['dL_dmfZ'] = dF_dmfZ - dKL_dmfZ
            grad_dict['dL_dmfX'] = dF_dmfX
-        return Posterior(mean=q_u_mean, cov=S, K=Kmm, prior_mean=prior_mean_u), log_marginal, grad_dict
+        return Posterior(mean=q_u_mean, cov=S.T, K=Kmm, prior_mean=prior_mean_u), log_marginal, grad_dict
--- a/GPy/inference/optimization/optimization.py
+++ b/GPy/inference/optimization/optimization.py
@ -142,7 +142,7 @@ class opt_lbfgsb(Optimizer):

        #a more helpful error message is available in opt_result in the Error case
        if opt_result[2]['warnflag']==2:
-            self.status = 'Error' + opt_result[2]['task']
+            self.status = 'Error' + str(opt_result[2]['task'])

 class opt_simplex(Optimizer):
    def __init__(self, *args, **kwargs):
--- a/GPy/kern/init.py
+++ b/GPy/kern/init.py
@ -19,5 +19,5 @@ from ._src.splitKern import SplitKern,DEtime
 from ._src.splitKern import DEtime as DiffGenomeKern


-from _src.basis_funcs import LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel
+from ._src.basis_funcs import LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel

--- a/GPy/kern/_src/basis_funcs.py
+++ b/GPy/kern/_src/basis_funcs.py
@ -121,7 +121,7 @@ class LinearSlopeBasisFuncKernel(BasisFuncKernel):
    
 class ChangePointBasisFuncKernel(BasisFuncKernel):
    def __init__(self, input_dim, changepoint, variance=1., active_dims=None, ARD=False, name='changepoint'):
-        self.changepoint = changepoint
+        self.changepoint = np.array(changepoint)
        super(ChangePointBasisFuncKernel, self).__init__(input_dim, variance, active_dims, ARD, name)
    
    @Cache_this(limit=3, ignore_args=())
@ -136,3 +136,48 @@ class DomainKernel(LinearSlopeBasisFuncKernel):
    def _phi(self, X):
        phi = np.where((X>self.start)*(X<self.stop), 1, 0)
        return phi#((phi-self.start)/(self.stop-self.start))-.5
+
+class LogisticBasisFuncKernel(BasisFuncKernel):
+    def __init__(self, input_dim, centers, variance=1., slope=1., active_dims=None, ARD=False, ARD_slope=True, name='logistic'):
+        self.centers = np.atleast_2d(centers)
+        self.ARD_slope = ARD_slope
+        if self.ARD_slope:
+            self.slope = Param('slope', slope * np.ones(self.centers.size), Logexp())
+        else:
+            self.slope = Param('slope', slope, Logexp())
+        super(LogisticBasisFuncKernel, self).__init__(input_dim, variance, active_dims, ARD, name)
+        self.link_parameter(self.slope)
+    
+    @Cache_this(limit=3, ignore_args=())
+    def _phi(self, X):
+        import scipy as sp
+        phi = 1/(1+np.exp(-((X-self.centers)*self.slope)))
+        return np.where(np.isnan(phi), 0, phi)#((phi-self.start)/(self.stop-self.start))-.5
+
+    def parameters_changed(self):
+        BasisFuncKernel.parameters_changed(self)
+    
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        super(LogisticBasisFuncKernel, self).update_gradients_full(dL_dK, X, X2)
+        if X2 is None or X is X2:
+            phi1 = self.phi(X)
+            if phi1.ndim != 2:
+                phi1 = phi1[:, None]
+            dphi1_dl = (phi1**2) * (np.exp(-((X-self.centers)*self.slope)) * (X-self.centers))
+            if self.ARD_slope:
+                self.slope.gradient = self.variance * 2 * np.einsum('ij,iq,jq->q', dL_dK, phi1, dphi1_dl)
+            else:
+                self.slope.gradient = self.variance * 2 * (dL_dK * phi1.dot(dphi1_dl.T)).sum()
+        else:
+            phi1 = self.phi(X)
+            phi2 = self.phi(X2)
+            if phi1.ndim != 2:
+                phi1 = phi1[:, None]
+                phi2 = phi2[:, None]
+            dphi1_dl = (phi1**2) * (np.exp(-((X-self.centers)*self.slope)) * (X-self.centers))
+            dphi2_dl = (phi2**2) * (np.exp(-((X2-self.centers)*self.slope)) * (X2-self.centers))
+            if self.ARD_slope:
+                self.slope.gradient = (self.variance * np.einsum('ij,iq,jq->q', dL_dK, phi1, dphi2_dl) + np.einsum('ij,iq,jq->q', dL_dK, phi2, dphi1_dl))
+            else:
+                self.slope.gradient = self.variance * (dL_dK * phi1.dot(dphi2_dl.T)).sum() + (dL_dK * phi2.dot(dphi1_dl.T)).sum()
+        self.slope.gradient = np.where(np.isnan(self.slope.gradient), 0, self.slope.gradient)
--- a/GPy/kern/_src/coregionalize.py
+++ b/GPy/kern/_src/coregionalize.py
@ -5,12 +5,8 @@ from .kern import Kern
 import numpy as np
 from ...core.parameterization import Param
 from ...core.parameterization.transformations import Logexp
-from ...util.config import config # for assesing whether to use weave
-
-try:
-    from scipy import weave
-except ImportError:
-    config.set('weave', 'working', 'False')
+from ...util.config import config # for assesing whether to use cython
+from . import coregionalize_cython

 class Coregionalize(Kern):
    """
@ -61,13 +57,8 @@ class Coregionalize(Kern):
        self.B = np.dot(self.W, self.W.T) + np.diag(self.kappa)

    def K(self, X, X2=None):
-        if config.getboolean('weave', 'working'):
-            try:
-                return self._K_weave(X, X2)
-            except:
-                print("\n Weave compilation failed. Falling back to (slower) numpy implementation\n")
-                config.set('weave', 'working', 'False')
-                return self._K_numpy(X, X2)
+        if config.getboolean('cython', 'working'):
+            return self._K_cython(X, X2)
        else:
            return self._K_numpy(X, X2)

@ -80,36 +71,10 @@ class Coregionalize(Kern):
            index2 = np.asarray(X2, dtype=np.int)
            return self.B[index,index2.T]

-    def _K_weave(self, X, X2=None):
-        """compute the kernel function using scipy.weave"""
-        index = np.asarray(X, dtype=np.int)
-
+    def _K_cython(self, X, X2=None):
        if X2 is None:
-            target = np.empty((X.shape[0], X.shape[0]), dtype=np.float64)
-            code="""
-            for(int i=0;i<N; i++){
-              target[i+i*N] = B[index[i]+output_dim*index[i]];
-              for(int j=0; j<i; j++){
-                  target[j+i*N] = B[index[i]+output_dim*index[j]];
-                  target[i+j*N] = target[j+i*N];
-                }
-              }
-            """
-            N, B, output_dim = index.size, self.B, self.output_dim
-            weave.inline(code, ['target', 'index', 'N', 'B', 'output_dim'])
-        else:
-            index2 = np.asarray(X2, dtype=np.int)
-            target = np.empty((X.shape[0], X2.shape[0]), dtype=np.float64)
-            code="""
-            for(int i=0;i<num_inducing; i++){
-              for(int j=0; j<N; j++){
-                  target[i+j*num_inducing] = B[output_dim*index[j]+index2[i]];
-                }
-              }
-            """
-            N, num_inducing, B, output_dim = index.size, index2.size, self.B, self.output_dim
-            weave.inline(code, ['target', 'index', 'index2', 'N', 'num_inducing', 'B', 'output_dim'])
-        return target
+            return coregionalize_cython.K_symmetric(self.B, np.asarray(X, dtype=np.int64)[:,0])
+        return coregionalize_cython.K_asymmetric(self.B, np.asarray(X, dtype=np.int64)[:,0], np.asarray(X2, dtype=np.int64)[:,0])


    def Kdiag(self, X):
@ -122,19 +87,13 @@ class Coregionalize(Kern):
        else:
            index2 = np.asarray(X2, dtype=np.int)

-        #attempt to use weave for a nasty double indexing loop: fall back to numpy
-        if config.getboolean('weave', 'working'):
-            try:
-                dL_dK_small = self._gradient_reduce_weave(dL_dK, index, index2)
-            except:
-                print("\n Weave compilation failed. Falling back to (slower) numpy implementation\n")
-                config.set('weave', 'working', 'False')
-                dL_dK_small = self._gradient_reduce_weave(dL_dK, index, index2)
+        #attempt to use cython for a nasty double indexing loop: fall back to numpy
+        if config.getboolean('cython', 'working'):
+            dL_dK_small = self._gradient_reduce_cython(dL_dK, index, index2)
        else:
            dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)


-
        dkappa = np.diag(dL_dK_small)
        dL_dK_small += dL_dK_small.T
        dW = (self.W[:, None, :]*dL_dK_small[:, :, None]).sum(0)
@ -142,19 +101,6 @@ class Coregionalize(Kern):
        self.W.gradient = dW
        self.kappa.gradient = dkappa

-    def _gradient_reduce_weave(self, dL_dK, index, index2):
-        dL_dK_small = np.zeros_like(self.B)
-        code="""
-        for(int i=0; i<num_inducing; i++){
-          for(int j=0; j<N; j++){
-            dL_dK_small[index[j] + output_dim*index2[i]] += dL_dK[i+j*num_inducing];
-          }
-        }
-        """
-        N, num_inducing, output_dim = index.size, index2.size, self.output_dim
-        weave.inline(code, ['N', 'num_inducing', 'output_dim', 'dL_dK', 'dL_dK_small', 'index', 'index2'])
-        return dL_dK_small
-
    def _gradient_reduce_numpy(self, dL_dK, index, index2):
        index, index2 = index[:,0], index2[:,0]
        dL_dK_small = np.zeros_like(self.B)
@ -164,6 +110,11 @@ class Coregionalize(Kern):
                dL_dK_small[j,i] = tmp1[:,index2==j].sum()
        return dL_dK_small

+    def _gradient_reduce_cython(self, dL_dK, index, index2):
+        index, index2 = index[:,0], index2[:,0]
+        return coregionalize_cython.gradient_reduce(self.B.shape[0], dL_dK, index, index2)
+
+
    def update_gradients_diag(self, dL_dKdiag, X):
        index = np.asarray(X, dtype=np.int).flatten()
        dL_dKdiag_small = np.array([dL_dKdiag[index==i].sum() for i in range(self.output_dim)])
--- a/GPy/kern/_src/coregionalize_cython.c
+++ b/GPy/kern/_src/coregionalize_cython.c
--- a/GPy/kern/_src/coregionalize_cython.pyx
+++ b/GPy/kern/_src/coregionalize_cython.pyx
@ -0,0 +1,34 @@
+#cython: boundscheck=True
+#cython: wraparound=True
+import cython
+import numpy as np
+cimport numpy as np
+
+def K_symmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X):
+    cdef int N = X.size
+    cdef np.ndarray[np.double_t, ndim=2] K = np.empty((N, N))
+    for n in range(N):
+        for m in range(N):
+            K[n,m] = B[X[n],X[m]]
+    return K
+
+def K_asymmetric(np.ndarray[double, ndim=2] B, np.ndarray[np.int64_t, ndim=1] X, np.ndarray[np.int64_t, ndim=1] X2):
+    cdef int N = X.size
+    cdef int M = X2.size
+    cdef np.ndarray[np.double_t, ndim=2] K = np.empty((N, M))
+    for n in range(N):
+        for m in range(M):
+            K[n,m] = B[X[n],X2[m]]
+    return K
+
+def gradient_reduce(int D, np.ndarray[double, ndim=2] dL_dK, np.ndarray[np.int64_t, ndim=1] index, np.ndarray[np.int64_t, ndim=1] index2):
+        cdef np.ndarray[np.double_t, ndim=2] dL_dK_small = np.zeros((D, D))
+        cdef int N = index.size
+        cdef int M = index2.size
+        for i in range(N):
+            for j in range(M):
+                dL_dK_small[index2[j],index[i]] += dL_dK[i,j];
+        return dL_dK_small
+
+
+
--- a/GPy/kern/_src/independent_outputs.py
+++ b/GPy/kern/_src/independent_outputs.py
@ -105,7 +105,7 @@ class IndependentOutputs(CombinationKernel):
        if X2 is None:
            # TODO: make use of index_to_slices
            # FIXME: Broken as X is already sliced out
-            print "Warning, gradients_X may not be working, I believe X has already been sliced out by the slicer!"
+            print("Warning, gradients_X may not be working, I believe X has already been sliced out by the slicer!")
            values = np.unique(X[:,self.index_dim])
            slices = [X[:,self.index_dim]==i for i in values]
            [target.__setitem__(s, kern.gradients_X(dL_dK[s,s],X[s],None))
--- a/GPy/kern/_src/mlp.py
+++ b/GPy/kern/_src/mlp.py
@ -78,7 +78,7 @@ class MLP(Kern):
                           *((vec1[:, None]+vec2[None, :])*self.weight_variance
                             + 2*self.bias_variance + 2.))*base_cov_grad).sum()

-    def update_gradients_diag(self, X):
+    def update_gradients_diag(self, dL_dKdiag, X):
        self._K_diag_computations(X)
        self.variance.gradient = np.sum(self._K_diag_dvar*dL_dKdiag)
        
--- a/GPy/kern/_src/rbf.py
+++ b/GPy/kern/_src/rbf.py
@ -20,7 +20,6 @@ class RBF(Stationary):
    _support_GPU = True
    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='rbf', useGPU=False):
        super(RBF, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name, useGPU=useGPU)
-        self.psicomp = PSICOMP_RBF()
        if self.useGPU:
            self.psicomp = PSICOMP_RBF_GPU()
        else:
@ -36,6 +35,7 @@ class RBF(Stationary):
        dc = super(RBF, self).__getstate__()
        if self.useGPU:
            dc['psicomp'] = PSICOMP_RBF()
+            dc['useGPU'] = False
        return dc

    def __setstate__(self, state):
--- a/GPy/kern/_src/stationary.py
+++ b/GPy/kern/_src/stationary.py
@ -9,13 +9,15 @@ from ...util.linalg import tdot
 from ... import util
 import numpy as np
 from scipy import integrate
-from ...util.config import config # for assesing whether to use weave
+from ...util.config import config # for assesing whether to use cython
 from ...util.caching import Cache_this

 try:
-    from scipy import weave
+    from . import stationary_cython
 except ImportError:
-    config.set('weave', 'working', 'False')
+    print('warning in sationary: failed to import cython module: falling back to numpy')
+    config.set('cython', 'working', 'false')
+

 class Stationary(Kern):
    """
@ -153,28 +155,18 @@ class Stationary(Kern):
        (dL_dK), compute the gradient wrt the parameters of this kernel,
        and store in the parameters object as e.g. self.variance.gradient
        """
-        self.variance.gradient = np.einsum('ij,ij,i', self.K(X, X2), dL_dK, 1./self.variance)
+        self.variance.gradient = np.sum(self.K(X, X2)* dL_dK)/self.variance

        #now the lengthscale gradient(s)
        dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
        if self.ARD:
-            #rinv = self._inv_dis# this is rather high memory? Should we loop instead?t(X, X2)
-            #d =  X[:, None, :] - X2[None, :, :]
-            #x_xl3 = np.square(d)
-            #self.lengthscale.gradient = -((dL_dr*rinv)[:,:,None]*x_xl3).sum(0).sum(0)/self.lengthscale**3
+
            tmp = dL_dr*self._inv_dist(X, X2)
            if X2 is None: X2 = X
-
-
-            if config.getboolean('weave', 'working'):
-                try:
-                    self.lengthscale.gradient = self.weave_lengthscale_grads(tmp, X, X2)
-                except:
-                    print("\n Weave compilation failed. Falling back to (slower) numpy implementation\n")
-                    config.set('weave', 'working', 'False')
-                    self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in range(self.input_dim)])
+            if config.getboolean('cython', 'working'):
+                self.lengthscale.gradient = self._lengthscale_grads_cython(tmp, X, X2)
            else:
-                self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in range(self.input_dim)])
+                self.lengthscale.gradient = self._lengthscale_grads_pure(tmp, X, X2)
        else:
            r = self._scaled_dist(X, X2)
            self.lengthscale.gradient = -np.sum(dL_dr*r)/self.lengthscale
@ -189,43 +181,27 @@ class Stationary(Kern):
        dist = self._scaled_dist(X, X2).copy()
        return 1./np.where(dist != 0., dist, np.inf)

-    def weave_lengthscale_grads(self, tmp, X, X2):
-        """Use scipy.weave to compute derivatives wrt the lengthscales"""
+    def _lengthscale_grads_pure(self, tmp, X, X2):
+        return -np.array([np.sum(tmp * np.square(X[:,q:q+1] - X2[:,q:q+1].T)) for q in range(self.input_dim)])/self.lengthscale**3
+
+    def _lengthscale_grads_cython(self, tmp, X, X2):
        N,M = tmp.shape
-        Q = X.shape[1]
-        if hasattr(X, 'values'):X = X.values
-        if hasattr(X2, 'values'):X2 = X2.values
+        Q = self.input_dim
+        X, X2 = np.ascontiguousarray(X), np.ascontiguousarray(X2)
        grads = np.zeros(self.input_dim)
-        code = """
-        double gradq;
-        for(int q=0; q<Q; q++){
-          gradq = 0;
-          for(int n=0; n<N; n++){
-            for(int m=0; m<M; m++){
-              gradq += tmp(n,m)*(X(n,q)-X2(m,q))*(X(n,q)-X2(m,q));
-            }
-          }
-          grads(q) = gradq;
-        }
-        """
-        weave.inline(code, ['tmp', 'X', 'X2', 'grads', 'N', 'M', 'Q'], type_converters=weave.converters.blitz, support_code="#include <math.h>")
+        stationary_cython.lengthscale_grads(N, M, Q, tmp, X, X2, grads)
        return -grads/self.lengthscale**3

    def gradients_X(self, dL_dK, X, X2=None):
        """
        Given the derivative of the objective wrt K (dL_dK), compute the derivative wrt X
        """
-        if config.getboolean('weave', 'working'):
-            try:
-                return self.gradients_X_weave(dL_dK, X, X2)
-            except:
-                print("\n Weave compilation failed. Falling back to (slower) numpy implementation\n")
-                config.set('weave', 'working', 'False')
-                return self.gradients_X_(dL_dK, X, X2)
+        if config.getboolean('cython', 'working'):
+            return self._gradients_X_cython(dL_dK, X, X2)
        else:
-            return self.gradients_X_(dL_dK, X, X2)
+            return self._gradients_X_pure(dL_dK, X, X2)

-    def gradients_X_(self, dL_dK, X, X2=None):
+    def _gradients_X_pure(self, dL_dK, X, X2=None):
        invdist = self._inv_dist(X, X2)
        dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
        tmp = invdist*dL_dr
@ -235,54 +211,25 @@ class Stationary(Kern):

        #The high-memory numpy way:
        #d =  X[:, None, :] - X2[None, :, :]
-        #ret = np.sum(tmp[:,:,None]*d,1)/self.lengthscale**2
+        #grad = np.sum(tmp[:,:,None]*d,1)/self.lengthscale**2

        #the lower memory way with a loop
-        ret = np.empty(X.shape, dtype=np.float64)
+        grad = np.empty(X.shape, dtype=np.float64)
        for q in range(self.input_dim):
-            np.sum(tmp*(X[:,q][:,None]-X2[:,q][None,:]), axis=1, out=ret[:,q])
-        ret /= self.lengthscale**2
+            np.sum(tmp*(X[:,q][:,None]-X2[:,q][None,:]), axis=1, out=grad[:,q])
+        return grad/self.lengthscale**2

-        return ret
-
-    def gradients_X_weave(self, dL_dK, X, X2=None):
+    def _gradients_X_cython(self, dL_dK, X, X2=None):
        invdist = self._inv_dist(X, X2)
        dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
        tmp = invdist*dL_dr
        if X2 is None:
            tmp = tmp + tmp.T
            X2 = X
-
-        code = """
-        int n,m,d;
-        double retnd;
-        #pragma omp parallel for private(n,d, retnd, m)
-        for(d=0;d<D;d++){
-          for(n=0;n<N;n++){
-            retnd = 0.0;
-            for(m=0;m<M;m++){
-              retnd += tmp(n,m)*(X(n,d)-X2(m,d));
-            }
-            ret(n,d) = retnd;
-          }
-        }
-
-        """
-        if hasattr(X, 'values'):X = X.values #remove the GPy wrapping to make passing into weave safe
-        if hasattr(X2, 'values'):X2 = X2.values
-        ret = np.zeros(X.shape)
-        N,D = X.shape
-        N,M = tmp.shape
-        from scipy import weave
-        support_code = """
-        #include <omp.h>
-        #include <stdio.h>
-        """
-        weave_options = {'headers'           : ['<omp.h>'],
-                         'extra_compile_args': ['-fopenmp -O3'], # -march=native'],
-                         'extra_link_args'   : ['-lgomp']}
-        weave.inline(code, ['ret', 'N', 'D', 'M', 'tmp', 'X', 'X2'], type_converters=weave.converters.blitz, support_code=support_code, **weave_options)
-        return ret/self.lengthscale**2
+        X, X2 = np.ascontiguousarray(X), np.ascontiguousarray(X2)
+        grad = np.zeros(X.shape)
+        stationary_cython.grad_X(X.shape[0], X.shape[1], X2.shape[0], X, X2, tmp, grad)
+        return grad/self.lengthscale**2

    def gradients_X_diag(self, dL_dKdiag, X):
        return np.zeros(X.shape)
@ -290,6 +237,9 @@ class Stationary(Kern):
    def input_sensitivity(self, summarize=True):
        return self.variance*np.ones(self.input_dim)/self.lengthscale**2

+
+
+
 class Exponential(Stationary):
    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Exponential'):
        super(Exponential, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
--- a/GPy/kern/_src/stationary_cython.c
+++ b/GPy/kern/_src/stationary_cython.c
--- a/GPy/kern/_src/stationary_cython.pyx
+++ b/GPy/kern/_src/stationary_cython.pyx
@ -0,0 +1,59 @@
+#cython: boundscheck=False
+#cython: nonecheck=False
+#cython: wraparound=False
+import numpy as np
+cimport numpy as np
+from cython.parallel import prange
+
+ctypedef np.float64_t DTYPE_t
+ 
+cdef extern from "stationary_utils.h":
+    void _grad_X "_grad_X" (int N, int D, int M, double* X, double* X2, double* tmp, double* grad)
+
+cdef extern from "stationary_utils.h":
+    void _lengthscale_grads "_lengthscale_grads" (int N, int M, int Q, double* tmp, double* X, double* X2, double* grad)
+ 
+def grad_X(int N, int D, int M,
+        np.ndarray[DTYPE_t, ndim=2] _X,
+        np.ndarray[DTYPE_t, ndim=2] _X2,
+        np.ndarray[DTYPE_t, ndim=2] _tmp,
+        np.ndarray[DTYPE_t, ndim=2] _grad):
+    cdef double *X = <double*> _X.data
+    cdef double *X2 = <double*> _X2.data
+    cdef double *tmp = <double*> _tmp.data
+    cdef double *grad = <double*> _grad.data
+    _grad_X(N, D, M, X, X2, tmp, grad) # return nothing, work in place.
+
+def grad_X_cython(int N, int D, int M, double[:,:] X, double[:,:] X2, double[:,:] tmp, double[:,:] grad):
+    cdef int n,d,nd,m
+    for nd in prange(N*D, nogil=True):
+        n = nd/D
+        d = nd%D
+        grad[n,d] = 0.0
+        for m in range(M):
+            grad[n,d] += tmp[n,m]*(X[n,d]-X2[m,d])
+
+
+
+def lengthscale_grads_in_c(int N, int M, int Q,
+        np.ndarray[DTYPE_t, ndim=2] _tmp,
+        np.ndarray[DTYPE_t, ndim=2] _X,
+        np.ndarray[DTYPE_t, ndim=2] _X2,
+        np.ndarray[DTYPE_t, ndim=1] _grad):
+    cdef double *tmp = <double*> _tmp.data
+    cdef double *X = <double*> _X.data
+    cdef double *X2 = <double*> _X2.data
+    cdef double *grad = <double*> _grad.data
+    _lengthscale_grads(N, M, Q, tmp, X, X2, grad) # return nothing, work in place.
+
+def lengthscale_grads(int N, int M, int Q, double[:,:] tmp, double[:,:] X, double[:,:] X2, double[:] grad):
+    cdef int q, n, m
+    cdef double gradq, dist
+    for q in range(Q):
+        grad[q] = 0.0
+        for n in range(N):
+            for m in range(M):
+                dist = X[n,q] - X2[m,q]
+                grad[q] += tmp[n,m]*dist*dist
+
+
--- a/GPy/kern/_src/stationary_utils.c
+++ b/GPy/kern/_src/stationary_utils.c
@ -0,0 +1,54 @@
+void _grad_X(int N, int D, int M, double* X, double* X2, double* tmp, double* grad){
+double retnd;
+int n,d,nd,m;
+#pragma omp parallel for private(nd,n,d, retnd, m)
+for(nd=0;nd<(D*N);nd++){
+  n = nd/D;
+  d = nd%D;
+  retnd = 0.0;
+  for(m=0;m<M;m++){
+    retnd += tmp[n*M+m]*(X[nd]-X2[m*D+d]);
+  }
+  grad[nd] = retnd;
+}
+} //grad_X
+
+
+void _lengthscale_grads_unsafe(int N, int M, int Q, double* tmp, double* X, double* X2, double* grad){
+int n,m,nm,q,nQ,mQ;
+double dist;
+#pragma omp parallel for private(n,m,nm,q,nQ,mQ,dist)
+for(nm=0; nm<(N*M); nm++){
+  n = nm/M;
+  m = nm%M; 
+  nQ = n*Q;
+  mQ = m*Q;
+  for(q=0; q<Q; q++){
+    dist = X[nQ+q]-X2[mQ+q];
+    grad[q] += tmp[nm]*dist*dist;
+  }
+}
+} //lengthscale_grads
+
+
+void _lengthscale_grads(int N, int M, int Q, double* tmp, double* X, double* X2, double* grad){
+int n,m,q;
+double gradq, dist;
+#pragma omp parallel for private(n,m, gradq, dist)
+for(q=0; q<Q; q++){
+  gradq = 0;
+  for(n=0; n<N; n++){
+    for(m=0; m<M; m++){
+        dist = X[n*Q+q]-X2[m*Q+q];
+        gradq += tmp[n*M+m]*dist*dist;
+    }
+  }
+  grad[q] = gradq;
+}
+} //lengthscale_grads
+
+
+
+
+
+
--- a/GPy/kern/_src/stationary_utils.h
+++ b/GPy/kern/_src/stationary_utils.h
@ -0,0 +1,3 @@
+#include <omp.h>
+void _grad_X(int N, int D, int M, double*X, double* X2, double* tmp, double* grad);
+void _lengthscale_grads(int N, int D, int M, double* X, double* X2, double* tmp, double* grad);
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@ -41,6 +41,14 @@ class Likelihood(Parameterized):
        self.log_concave = False
        self.not_block_really = False

+    def request_num_latent_functions(self, Y):
+        """
+        The likelihood should infer how many latent functions are needed for the likelihood
+
+        Default is the number of outputs
+        """
+        return Y.shape[1]
+
    def _gradients(self,partial):
        return np.zeros(0)

@ -118,22 +126,57 @@ class Likelihood(Parameterized):
            """Generate a function which can be integrated
            to give p(Y*|Y) = int p(Y*|f*)p(f*|Y) df*"""
            def f(fi_star):
-                #exponent = np.exp(-(1./(2*v))*np.square(m-f_star))
+                #exponent = np.exp(-(1./(2*vi))*np.square(mi-fi_star))
                #from GPy.util.misc import safe_exp
                #exponent = safe_exp(exponent)
-                #return self.pdf(f_star, y, y_m)*exponent
+                #res = safe_exp(self.logpdf(fi_star, yi, yi_m))*exponent

                #More stable in the log space
-                return np.exp(self.logpdf(fi_star, yi, yi_m)
+                res = np.exp(self.logpdf(fi_star, yi, yi_m)
                              - 0.5*np.log(2*np.pi*vi)
-                              - 0.5*np.square(mi-fi_star)/vi)
+                              - 0.5*np.square(fi_star-mi)/vi)
+                if not np.isfinite(res):
+                    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+                return res
+
            return f

        p_ystar, _ = zip(*[quad(integral_generator(yi, mi, vi, yi_m), -np.inf, np.inf)
                           for yi, mi, vi, yi_m in zipped_values])
-        p_ystar = np.array(p_ystar).reshape(-1, 1)
+        p_ystar = np.array(p_ystar).reshape(*y_test.shape)
        return np.log(p_ystar)

+    def log_predictive_density_sampling(self, y_test, mu_star, var_star, Y_metadata=None, num_samples=1000):
+        """
+        Calculation of the log predictive density via sampling
+
+        .. math:
+            log p(y_{*}|D) = log 1/num_samples prod^{S}_{s=1} p(y_{*}|f_{*s})
+            f_{*s} ~ p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type mu_star: (Nx1) array
+        :param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
+        :type var_star: (Nx1) array
+        :param num_samples: num samples of p(f_{*}|mu_{*}, var_{*}) to take
+        :type num_samples: int
+        """
+        assert y_test.shape==mu_star.shape
+        assert y_test.shape==var_star.shape
+        assert y_test.shape[1] == 1
+
+        #Take samples of p(f*|y)
+        #fi_samples = np.random.randn(num_samples)*np.sqrt(var_star) + mu_star
+        fi_samples = np.random.normal(mu_star, np.sqrt(var_star), size=(mu_star.shape[0], num_samples))
+
+        from scipy.misc import logsumexp
+        log_p_ystar = -np.log(num_samples) + logsumexp(self.logpdf(fi_samples, y_test, Y_metadata=Y_metadata), axis=1)
+        log_p_ystar = np.array(log_p_ystar).reshape(*y_test.shape)
+        return log_p_ystar
+
+
    def _moments_match_ep(self,obs,tau,v):
        """
        Calculation of moments using quadrature
@ -170,9 +213,9 @@ class Likelihood(Parameterized):

    #only compute gh points if required
    __gh_points = None
-    def _gh_points(self):
+    def _gh_points(self, T=20):
        if self.__gh_points is None:
-            self.__gh_points = np.polynomial.hermite.hermgauss(20)
+            self.__gh_points = np.polynomial.hermite.hermgauss(T)
        return self.__gh_points

    def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
@ -211,9 +254,11 @@ class Likelihood(Parameterized):
        #d2logp_dx2 = np.clip(d2logp_dx2,-1e9,1e9)

        #average over the gird to get derivatives of the Gaussian's parameters
-        F = np.dot(logp, gh_w)
-        dF_dm = np.dot(dlogp_dx, gh_w)
-        dF_dv = np.dot(d2logp_dx2, gh_w)/2.
+        #division by pi comes from fact that for each quadrature we need to scale by 1/sqrt(pi)
+        F = np.dot(logp, gh_w)/np.sqrt(np.pi)
+        dF_dm = np.dot(dlogp_dx, gh_w)/np.sqrt(np.pi)
+        dF_dv = np.dot(d2logp_dx2, gh_w)/np.sqrt(np.pi)
+        dF_dv /= 2.

        if np.any(np.isnan(dF_dv)) or np.any(np.isinf(dF_dv)):
            stop
@ -221,8 +266,8 @@ class Likelihood(Parameterized):
            stop

        if self.size:
-            dF_dtheta = self.dlogpdf_dtheta(X, Y[:,None]) # Ntheta x (orig size) x N_{quad_points}
-            dF_dtheta = np.dot(dF_dtheta, gh_w)
+            dF_dtheta = self.dlogpdf_dtheta(X, Y[:,None], Y_metadata=Y_metadata) # Ntheta x (orig size) x N_{quad_points}
+            dF_dtheta = np.dot(dF_dtheta, gh_w)/np.sqrt(np.pi)
            dF_dtheta = dF_dtheta.reshape(self.size, shape[0], shape[1])
        else:
            dF_dtheta = None # Not yet implemented
@ -253,13 +298,8 @@ class Likelihood(Parameterized):
                return self.conditional_mean(f)*p
        scaled_mean = [quad(int_mean, fmin, fmax,args=(mj,s2j))[0] for mj,s2j in zip(mu,variance)]
        mean = np.array(scaled_mean)[:,None] / np.sqrt(2*np.pi*(variance))
-
        return mean

-    def _conditional_mean(self, f):
-        """Quadrature calculation of the conditional mean: E(Y_star|f)"""
-        raise NotImplementedError("implement this function to make predictions")
-
    def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None):
        """
        Approximation to the predictive variance: V(Y_star)
@ -563,23 +603,30 @@ class Likelihood(Parameterized):
        :param full_cov: whether to use the full covariance or just the diagonal
        :type full_cov: Boolean
        """
-
-        pred_mean = self.predictive_mean(mu, var, Y_metadata)
-        pred_var = self.predictive_variance(mu, var, pred_mean, Y_metadata)
+        try:
+            pred_mean = self.predictive_mean(mu, var, Y_metadata=Y_metadata)
+            pred_var = self.predictive_variance(mu, var, pred_mean, Y_metadata=Y_metadata)
+        except NotImplementedError:
+            print "Finding predictive mean and variance via sampling rather than quadrature"
+            Nf_samp = 300
+            Ny_samp = 1
+            s = np.random.randn(mu.shape[0], Nf_samp)*np.sqrt(var) + mu
+            ss_y = self.samples(s, Y_metadata, samples=Ny_samp)
+            pred_mean = np.mean(ss_y, axis=1)[:, None]
+            pred_var = np.var(ss_y, axis=1)[:, None]

        return pred_mean, pred_var

    def predictive_quantiles(self, mu, var, quantiles, Y_metadata=None):
        #compute the quantiles by sampling!!!
-        N_samp = 500
-        s = np.random.randn(mu.shape[0], N_samp)*np.sqrt(var) + mu
-        #ss_f = s.flatten()
-        #ss_y = self.samples(ss_f, Y_metadata)
-        #ss_y = self.samples(s, Y_metadata, samples=100)
-        ss_y = self.samples(s, Y_metadata)
-        #ss_y = ss_y.reshape(mu.shape[0], N_samp)
+        Nf_samp = 300
+        Ny_samp = 1
+        s = np.random.randn(mu.shape[0], Nf_samp)*np.sqrt(var) + mu
+        ss_y = self.samples(s, Y_metadata, samples=Ny_samp)
+        #ss_y = ss_y.reshape(mu.shape[0], mu.shape[1], Nf_samp*Ny_samp)

-        return [np.percentile(ss_y ,q, axis=1)[:,None] for q in quantiles]
+        pred_quantiles = [np.percentile(ss_y, q, axis=1)[:,None] for q in quantiles]
+        return pred_quantiles

    def samples(self, gp, Y_metadata=None, samples=1):
        """
--- a/GPy/likelihoods/student_t.py
+++ b/GPy/likelihoods/student_t.py
@ -10,6 +10,7 @@ from scipy.special import gammaln, gamma
 from .likelihood import Likelihood
 from ..core.parameterization import Param
 from ..core.parameterization.transformations import Logexp
+from scipy.special import psi as digamma

 class StudentT(Likelihood):
    """
@ -28,16 +29,13 @@ class StudentT(Likelihood):
        super(StudentT, self).__init__(gp_link, name='Student_T')
        # sigma2 is not a noise parameter, it is a squared scale.
        self.sigma2 = Param('t_scale2', float(sigma2), Logexp())
-        self.v = Param('deg_free', float(deg_free))
+        self.v = Param('deg_free', float(deg_free), Logexp())
        self.link_parameter(self.sigma2)
        self.link_parameter(self.v)
-        self.v.constrain_fixed()
+        #self.v.constrain_fixed()

        self.log_concave = False

-    #def parameters_changed(self):
-        #self.variance = (self.v / float(self.v - 2)) * self.sigma2
-
    def update_gradients(self, grads):
        """
        Pull out the gradients, be careful as the order must match the order
@ -224,20 +222,46 @@ class StudentT(Likelihood):
                           )
        return d2logpdf_dlink2_dvar

+    def dlogpdf_link_dv(self, inv_link_f, y, Y_metadata=None):
+        e = y - inv_link_f
+        e2 = np.square(e)
+        df = float(self.v[:])
+        s2 = float(self.sigma2[:])
+        dlogpdf_dv =  0.5*digamma(0.5*(df+1)) - 0.5*digamma(0.5*df) - 1.0/(2*df)
+        dlogpdf_dv += 0.5*(df+1)*e2/(df*(e2 + s2*df))
+        dlogpdf_dv -= 0.5*np.log1p(e2/(s2*df))
+        return dlogpdf_dv
+
+    def dlogpdf_dlink_dv(self, inv_link_f, y, Y_metadata=None):
+        e = y - inv_link_f
+        e2 = np.square(e)
+        df = float(self.v[:])
+        s2 = float(self.sigma2[:])
+        dlogpdf_df_dv = e*(e2 - self.sigma2)/(e2 + s2*df)**2
+        return dlogpdf_df_dv
+
+    def d2logpdf_dlink2_dv(self, inv_link_f, y, Y_metadata=None):
+        e = y - inv_link_f
+        e2 = np.square(e)
+        df = float(self.v[:])
+        s2 = float(self.sigma2[:])
+        e2_s2v = e**2 + s2*df
+        d2logpdf_df2_dv = (-s2*(df+1) + e2 - s2*df)/e2_s2v**2 - 2*s2*(df+1)*(e2 - s2*df)/e2_s2v**3
+        return d2logpdf_df2_dv
+
    def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
        dlogpdf_dvar = self.dlogpdf_link_dvar(f, y, Y_metadata=Y_metadata)
-        dlogpdf_dv = np.zeros_like(dlogpdf_dvar) #FIXME: Not done yet
+        dlogpdf_dv = self.dlogpdf_link_dv(f, y, Y_metadata=Y_metadata)
        return np.array((dlogpdf_dvar, dlogpdf_dv))

    def dlogpdf_dlink_dtheta(self, f, y, Y_metadata=None):
        dlogpdf_dlink_dvar = self.dlogpdf_dlink_dvar(f, y, Y_metadata=Y_metadata)
-        dlogpdf_dlink_dv = np.zeros_like(dlogpdf_dlink_dvar) #FIXME: Not done yet
+        dlogpdf_dlink_dv = self.dlogpdf_dlink_dv(f, y, Y_metadata=Y_metadata)
        return np.array((dlogpdf_dlink_dvar, dlogpdf_dlink_dv))

    def d2logpdf_dlink2_dtheta(self, f, y, Y_metadata=None):
        d2logpdf_dlink2_dvar = self.d2logpdf_dlink2_dvar(f, y, Y_metadata=Y_metadata)
-        d2logpdf_dlink2_dv = np.zeros_like(d2logpdf_dlink2_dvar) #FIXME: Not done yet
-
+        d2logpdf_dlink2_dv = self.d2logpdf_dlink2_dv(f, y, Y_metadata=Y_metadata)
        return np.array((d2logpdf_dlink2_dvar, d2logpdf_dlink2_dv))

    def predictive_mean(self, mu, sigma, Y_metadata=None):
--- a/GPy/mappings/init.py
+++ b/GPy/mappings/init.py
@ -6,4 +6,5 @@ from .linear import Linear
 from .mlp import MLP
 from .additive import Additive
 from .compound import Compound
+from .constant import Constant

--- a/GPy/mappings/constant.py
+++ b/GPy/mappings/constant.py
@ -0,0 +1,40 @@
+# Copyright (c) 2015, James Hensman, Alan Saul
+import numpy as np
+from ..core.mapping import Mapping
+from ..core.parameterization import Param
+
+class Constant(Mapping):
+    """
+    A Linear mapping.
+
+    .. math::
+
+       F(\mathbf{x}) = c
+
+
+    :param input_dim: dimension of input.
+    :type input_dim: int
+    :param output_dim: dimension of output.
+    :type output_dim: int
+    :param: value the value of this constant mapping
+
+    """
+
+    def __init__(self, input_dim, output_dim, value=0., name='constmap'):
+        Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
+        value = np.atleast_1d(value)
+        if not len(value.shape) ==1:
+            raise ValueError("bad constant values: pass a float or flat vectoor")
+        elif value.size==1:
+            value = np.ones(self.output_dim)*value
+        self.C = Param('C', value)
+        self.link_parameter(self.C)
+
+    def f(self, X):
+        return np.tile(self.C.values[None,:], (X.shape[0], 1))
+
+    def update_gradients(self, dL_dF, X):
+        self.C.gradient = dL_dF.sum(0)
+
+    def gradients_X(self, dL_dF, X):
+        return np.zeros_like(X)
--- a/GPy/models/bayesian_gplvm_minibatch.py
+++ b/GPy/models/bayesian_gplvm_minibatch.py
@ -64,9 +64,6 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
            self.logger.debug("creating inference_method var_dtc")
            inference_method = VarDTC(limit=1 if not missing_data else Y.shape[1])

-        if kernel.useGPU and isinstance(inference_method, VarDTC_GPU):
-            kernel.psicomp.GPU_direct = True
-
        super(BayesianGPLVMMiniBatch,self).__init__(X, Y, Z, kernel, likelihood=likelihood,
                                           name=name, inference_method=inference_method,
                                           normalizer=normalizer,
--- a/GPy/models/dpgplvm.py
+++ b/GPy/models/dpgplvm.py
@ -3,7 +3,7 @@

 import numpy as np
 from .. import kern
-from bayesian_gplvm import BayesianGPLVM
+from .bayesian_gplvm import BayesianGPLVM
 from ..core.parameterization.variational import NormalPosterior, NormalPrior

 class DPBayesianGPLVM(BayesianGPLVM):
@ -15,5 +15,5 @@ class DPBayesianGPLVM(BayesianGPLVM):
                 name='bayesian gplvm', mpi_comm=None, normalizer=None,
                 missing_data=False, stochastic=False, batchsize=1):
        super(DPBayesianGPLVM,self).__init__(Y=Y, input_dim=input_dim, X=X, X_variance=X_variance, init=init, 					num_inducing=num_inducing, Z=Z, kernel=kernel, inference_method=inference_method, likelihood=likelihood, mpi_comm=mpi_comm, normalizer=normalizer, missing_data=missing_data, stochastic=stochastic, batchsize=batchsize, name='dp bayesian gplvm')
-	self.X.mean.set_prior(X_prior)
+        self.X.mean.set_prior(X_prior)
        self.link_parameter(X_prior)
--- a/GPy/models/gp_regression.py
+++ b/GPy/models/gp_regression.py
@ -16,6 +16,7 @@ class GPRegression(GP):
    :param Y: observed values
    :param kernel: a GPy kernel, defaults to rbf
    :param Norm normalizer: [False]
+    :param noise_var: the noise variance for Gaussian likelhood, defaults to 1.

        Normalize Y with the norm given.
        If normalizer is False, no normalization will be done
@ -25,12 +26,12 @@ class GPRegression(GP):

    """

-    def __init__(self, X, Y, kernel=None, Y_metadata=None, normalizer=None):
+    def __init__(self, X, Y, kernel=None, Y_metadata=None, normalizer=None, noise_var=1.):

        if kernel is None:
            kernel = kern.RBF(X.shape[1])
-
-        likelihood = likelihoods.Gaussian()
+	
+        likelihood = likelihoods.Gaussian(variance=noise_var)

        super(GPRegression, self).__init__(X, Y, kernel, likelihood, name='GP regression', Y_metadata=Y_metadata, normalizer=normalizer)

--- a/GPy/models/gp_var_gauss.py
+++ b/GPy/models/gp_var_gauss.py
@ -9,13 +9,14 @@ from ..core.parameterization import ObsAr
 from .. import kern
 from ..core.parameterization.param import Param
 from ..util.linalg import pdinv
+from ..likelihoods import Gaussian

 log_2_pi = np.log(2*np.pi)


 class GPVariationalGaussianApproximation(Model):
    """
-    The Variational Gaussian Approximation revisited implementation for regression
+    The Variational Gaussian Approximation revisited

    @article{Opper:2009,
        title = {The Variational Gaussian Approximation Revisited},
@ -25,44 +26,29 @@ class GPVariationalGaussianApproximation(Model):
        pages = {786--792},
    }
    """
-    def __init__(self, X, Y, kernel=None):
-        Model.__init__(self,'Variational GP classification')
+    def __init__(self, X, Y, kernel, likelihood=None, Y_metadata=None):
+        Model.__init__(self,'Variational GP')
+        if likelihood is None:
+            likelihood = Gaussian()
        # accept the construction arguments
        self.X = ObsAr(X)
-        if kernel is None:
-            kernel = kern.RBF(X.shape[1]) + kern.White(X.shape[1], 0.01)
-        self.kern = kernel
-        self.link_parameter(self.kern)
+        self.Y = Y
        self.num_data, self.input_dim = self.X.shape
+        self.Y_metadata = Y_metadata

-        self.alpha = Param('alpha', np.zeros(self.num_data))
+        self.kern = kernel
+        self.likelihood = likelihood
+        self.link_parameter(self.kern)
+        self.link_parameter(self.likelihood)
+
+        self.alpha = Param('alpha', np.zeros((self.num_data,1))) # only one latent fn for now.
        self.beta = Param('beta', np.ones(self.num_data))
        self.link_parameter(self.alpha)
        self.link_parameter(self.beta)

-        self.gh_x, self.gh_w = np.polynomial.hermite.hermgauss(20)
-        self.Ysign = np.where(Y==1, 1, -1).flatten()
-
    def log_likelihood(self):
-        """
-        Marginal log likelihood evaluation
-        """
        return self._log_lik

-    def likelihood_quadrature(self, m, v):
-        """
-        Perform Gauss-Hermite quadrature over the log of the likelihood, with a fixed weight
-        """
-        # assume probit for now.
-        X = self.gh_x[None, :]*np.sqrt(2.*v[:, None]) + (m*self.Ysign)[:, None]
-        p = stats.norm.cdf(X)
-        N = stats.norm.pdf(X)
-        F = np.log(p).dot(self.gh_w)
-        NoverP = N/p
-        dF_dm = (NoverP*self.Ysign[:,None]).dot(self.gh_w)
-        dF_dv = -0.5*(NoverP**2 + NoverP*X).dot(self.gh_w)
-        return F, dF_dm, dF_dv
-
    def parameters_changed(self):
        K = self.kern.K(self.X)
        m = K.dot(self.alpha)
@ -71,13 +57,14 @@ class GPVariationalGaussianApproximation(Model):
        A = np.eye(self.num_data) + BKB
        Ai, LA, _, Alogdet = pdinv(A)
        Sigma = np.diag(self.beta**-2) - Ai/self.beta[:, None]/self.beta[None, :]  # posterior coavairance: need full matrix for gradients
-        var = np.diag(Sigma)
+        var = np.diag(Sigma).reshape(-1,1)

-        F, dF_dm, dF_dv = self.likelihood_quadrature(m, var)
+        F, dF_dm, dF_dv, dF_dthetaL = self.likelihood.variational_expectations(self.Y, m, var, Y_metadata=self.Y_metadata)
+        self.likelihood.gradient = dF_dthetaL.sum(1).sum(1)
        dF_da = np.dot(K, dF_dm)
        SigmaB = Sigma*self.beta
-        dF_db = -np.diag(Sigma.dot(np.diag(dF_dv)).dot(SigmaB))*2
-        KL = 0.5*(Alogdet + np.trace(Ai) - self.num_data + m.dot(self.alpha))
+        dF_db = -np.diag(Sigma.dot(np.diag(dF_dv.flatten())).dot(SigmaB))*2
+        KL = 0.5*(Alogdet + np.trace(Ai) - self.num_data + np.sum(m*self.alpha))
        dKL_da = m
        A_A2 = Ai - Ai.dot(Ai)
        dKL_db = np.diag(np.dot(KB.T, A_A2))
@ -86,12 +73,12 @@ class GPVariationalGaussianApproximation(Model):
        self.beta.gradient = dF_db - dKL_db

        # K-gradients
-        dKL_dK = 0.5*(self.alpha[None, :]*self.alpha[:, None] + self.beta[:, None]*self.beta[None, :]*A_A2)
+        dKL_dK = 0.5*(self.alpha*self.alpha.T + self.beta[:, None]*self.beta[None, :]*A_A2)
        tmp = Ai*self.beta[:, None]/self.beta[None, :]
-        dF_dK = self.alpha[:, None]*dF_dm[None, :] + np.dot(tmp*dF_dv, tmp.T)
+        dF_dK = self.alpha*dF_dm.T + np.dot(tmp*dF_dv, tmp.T)
        self.kern.update_gradients_full(dF_dK - dKL_dK, self.X)

-    def predict(self, Xnew):
+    def _raw_predict(self, Xnew):
        """
        Predict the function(s) at the new point(s) Xnew.

@ -105,4 +92,4 @@ class GPVariationalGaussianApproximation(Model):
        Kxx = self.kern.Kdiag(Xnew)
        var = Kxx - np.sum(WiKux*Kux, 0)

-        return 0.5*(1+erf(mu/np.sqrt(2.*(var+1))))
+        return mu, var.reshape(-1,1)
--- a/GPy/models/gradient_checker.py
+++ b/GPy/models/gradient_checker.py
@ -228,14 +228,14 @@ class HessianChecker(GradientChecker):

        if verbose:
            if block_indices:
-                print "\nBlock {}".format(block_indices)
+                print("\nBlock {}".format(block_indices))
            else:
-                print "\nAll blocks"
+                print("\nAll blocks")

            header = ['Checked', 'Max-Ratio', 'Min-Ratio', 'Min-Difference', 'Max-Difference']
            header_string = map(lambda x: ' | '.join(header), [header])
            separator = '-' * len(header_string[0])
-            print '\n'.join([header_string[0], separator])
+            print('\n'.join([header_string[0], separator]))
            min_r = '%.6f' % float(numpy.min(ratio))
            max_r = '%.6f' % float(numpy.max(ratio))
            max_d = '%.6f' % float(numpy.max(difference))
@ -248,7 +248,7 @@ class HessianChecker(GradientChecker):
                checked = "\033[91m  False \033[0m"

            grad_string = "{} | {}  | {} |    {}    |   {} ".format(checked, cols[0], cols[1], cols[2], cols[3])
-            print grad_string
+            print(grad_string)

            if plot:
                import pylab as pb
@ -348,7 +348,7 @@ class SkewChecker(HessianChecker):
            numeric_hess_partial = nd.Jacobian(self._df, vectorized=True)
            numeric_hess = numeric_hess_partial(x)

-            print "Done making numerical hessian"
+            print("Done making numerical hessian")
            if analytic_hess.dtype is np.dtype('object'):
                #Blockify numeric_hess aswell
                blocksizes, pagesizes = get_block_shapes_3d(analytic_hess)
@ -365,7 +365,7 @@ class SkewChecker(HessianChecker):
                #Unless super_plot is set, just plot the first one
                p = True if (plot and block_ind == numeric_hess.shape[2]-1) or super_plot else False
                if verbose:
-                    print "Checking derivative of hessian wrt parameter number {}".format(block_ind)
+                    print("Checking derivative of hessian wrt parameter number {}".format(block_ind))
                check_passed[block_ind] = self.checkgrad_block(analytic_hess[:,:,block_ind], numeric_hess[:,:,block_ind], verbose=verbose, step=step, tolerance=tolerance, block_indices=block_indices, plot=p)

            current_index += current_size
--- a/GPy/models/ss_gplvm.py
+++ b/GPy/models/ss_gplvm.py
@ -5,11 +5,95 @@ import numpy as np

 from ..core.sparse_gp_mpi import SparseGP_MPI
 from .. import kern
+from ..core.parameterization import Param
 from ..likelihoods import Gaussian
-from ..core.parameterization.variational import SpikeAndSlabPrior, SpikeAndSlabPosterior
+from ..core.parameterization.variational import SpikeAndSlabPrior, SpikeAndSlabPosterior,VariationalPrior
 from ..inference.latent_function_inference.var_dtc_parallel import update_gradients, VarDTC_minibatch
 from ..kern._src.psi_comp.ssrbf_psi_gpucomp import PSICOMP_SSRBF_GPU

+class IBPPosterior(SpikeAndSlabPosterior):
+    '''
+    The SpikeAndSlab distribution for variational approximations.
+    '''
+    def __init__(self, means, variances, binary_prob, tau=None,  sharedX=False, name='latent space'):
+        """
+        binary_prob : the probability of the distribution on the slab part.
+        """
+        from ..core.parameterization.transformations import Logexp
+        super(IBPPosterior, self).__init__(means, variances, binary_prob, group_spike=True, name=name)
+        self.sharedX = sharedX
+        if sharedX:
+            self.mean.fix(warning=False)
+            self.variance.fix(warning=False)
+        self.tau = Param("tau_", np.ones((self.gamma_group.shape[0],2)), Logexp())
+        self.link_parameter(self.tau)
+
+    def set_gradients(self, grad):
+        self.mean.gradient, self.variance.gradient, self.gamma.gradient, self.tau.gradient = grad
+
+    def __getitem__(self, s):
+        if isinstance(s, (int, slice, tuple, list, np.ndarray)):
+            import copy
+            n = self.__new__(self.__class__, self.name)
+            dc = self.__dict__.copy()
+            dc['mean'] = self.mean[s]
+            dc['variance'] = self.variance[s]
+            dc['binary_prob'] = self.binary_prob[s]
+            dc['tau'] = self.tau
+            dc['parameters'] = copy.copy(self.parameters)
+            n.__dict__.update(dc)
+            n.parameters[dc['mean']._parent_index_] = dc['mean']
+            n.parameters[dc['variance']._parent_index_] = dc['variance']
+            n.parameters[dc['binary_prob']._parent_index_] = dc['binary_prob']
+            n.parameters[dc['tau']._parent_index_] = dc['tau']
+            n._gradient_array_ = None
+            oversize = self.size - self.mean.size - self.variance.size - self.gamma.size - self.tau.size
+            n.size = n.mean.size + n.variance.size + n.gamma.size+ n.tau.size + oversize
+            n.ndim = n.mean.ndim
+            n.shape = n.mean.shape
+            n.num_data = n.mean.shape[0]
+            n.input_dim = n.mean.shape[1] if n.ndim != 1 else 1
+            return n
+        else:
+            return super(IBPPosterior, self).__getitem__(s)
+
+class IBPPrior(VariationalPrior):
+    def __init__(self, input_dim, alpha =2., name='IBPPrior', **kw):
+        super(IBPPrior, self).__init__(name=name, **kw)
+        from ..core.parameterization.transformations import Logexp, __fixed__  
+        self.input_dim = input_dim
+        self.variance = 1.
+        self.alpha = Param('alpha', alpha, __fixed__)
+        self.link_parameter(self.alpha)
+
+    def KL_divergence(self, variational_posterior):
+        mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma_group.values, variational_posterior.tau.values
+            
+        var_mean = np.square(mu)/self.variance
+        var_S = (S/self.variance - np.log(S))
+        part1 = (gamma* (np.log(self.variance)-1. +var_mean + var_S)).sum()/2.
+        
+        ad = self.alpha/self.input_dim
+        from scipy.special import betaln,digamma
+        part2 = (gamma*np.log(gamma)).sum() + ((1.-gamma)*np.log(1.-gamma)).sum() + betaln(ad,1.)*self.input_dim \
+                -betaln(tau[:,0], tau[:,1]).sum() + ((tau[:,0]-gamma-ad)*digamma(tau[:,0])).sum() + \
+                ((tau[:,1]+gamma-2.)*digamma(tau[:,1])).sum() + ((2.+ad-tau[:,0]-tau[:,1])*digamma(tau.sum(axis=1))).sum()
+        
+        return part1+part2
+
+    def update_gradients_KL(self, variational_posterior):
+        mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma_group.values, variational_posterior.tau.values
+
+        variational_posterior.mean.gradient -= gamma*mu/self.variance
+        variational_posterior.variance.gradient -= (1./self.variance - 1./S) * gamma /2.
+        from scipy.special import digamma,polygamma
+        dgamma = (np.log(gamma/(1.-gamma))+ digamma(tau[:,1])-digamma(tau[:,0]))/variational_posterior.num_data
+        variational_posterior.binary_prob.gradient -= dgamma+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+        ad = self.alpha/self.input_dim
+        common = (ad+2-tau[:,0]-tau[:,1])*polygamma(1,tau.sum(axis=1))
+        variational_posterior.tau.gradient[:,0] = -((tau[:,0]-gamma-ad)*polygamma(1,tau[:,0])+common)
+        variational_posterior.tau.gradient[:,1] = -((tau[:,1]+gamma-2)*polygamma(1,tau[:,1])+common)
+
 class SSGPLVM(SparseGP_MPI):
    """
    Spike-and-Slab Gaussian Process Latent Variable Model
@ -23,9 +107,11 @@ class SSGPLVM(SparseGP_MPI):

    """
    def __init__(self, Y, input_dim, X=None, X_variance=None, Gamma=None, init='PCA', num_inducing=10,
-                 Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike_and_Slab GPLVM', group_spike=False, mpi_comm=None, pi=None, learnPi=True,normalizer=False, **kwargs):
+                 Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike_and_Slab GPLVM', group_spike=False, IBP=False, alpha=2., tau=None, mpi_comm=None, pi=None, learnPi=False,normalizer=False, sharedX=False, variational_prior=None,**kwargs):

        self.group_spike = group_spike
+        self.init = init
+        self.sharedX = sharedX
        
        if X == None:
            from ..util.initialization import initialize_latent
@ -33,8 +119,6 @@ class SSGPLVM(SparseGP_MPI):
        else:
            fracs = np.ones(input_dim)

-        self.init = init
-
        if X_variance is None: # The variance of the variational approximation (S)
            X_variance = np.random.uniform(0,.1,X.shape)
            
@ -64,17 +148,16 @@ class SSGPLVM(SparseGP_MPI):
        if pi is None:
            pi = np.empty((input_dim))
            pi[:] = 0.5
-        self.variational_prior = SpikeAndSlabPrior(pi=pi,learnPi=learnPi) # the prior probability of the latent binary variable b
-        
-        X = SpikeAndSlabPosterior(X, X_variance, gamma)
-                
+            
+        if IBP:
+            self.variational_prior = IBPPrior(input_dim=input_dim, alpha=alpha) if variational_prior is None else variational_prior
+            X = IBPPosterior(X, X_variance, gamma, tau=tau,sharedX=sharedX)
+        else:
+            self.variational_prior = SpikeAndSlabPrior(pi=pi,learnPi=learnPi, group_spike=group_spike)  if variational_prior is None else variational_prior
+            X = SpikeAndSlabPosterior(X, X_variance, gamma, group_spike=group_spike,sharedX=sharedX)
+
        super(SSGPLVM,self).__init__(X, Y, Z, kernel, likelihood, variational_prior=self.variational_prior, inference_method=inference_method, name=name, mpi_comm=mpi_comm, normalizer=normalizer, **kwargs)
-#         self.X.unfix()
-#         self.X.variance.constrain_positive()
        self.link_parameter(self.X, index=0)
-                
-        if self.group_spike:
-            [self.X.gamma[:,i].tie('tieGamma'+str(i)) for i in range(self.X.gamma.shape[1])] # Tie columns together
        
    def set_X_gradients(self, X, X_grad):
        """Set the gradients of the posterior distribution of X in its specific form."""
@ -84,9 +167,15 @@ class SSGPLVM(SparseGP_MPI):
        """Get the gradients of the posterior distribution of X in its specific form."""
        return X.mean.gradient, X.variance.gradient, X.binary_prob.gradient

+    def _propogate_X_val(self):
+        pass
+
    def parameters_changed(self):
+        self.X.propogate_val()
+        if self.sharedX: self._highest_parent_._propogate_X_val()
        super(SSGPLVM,self).parameters_changed()
        if isinstance(self.inference_method, VarDTC_minibatch):
+            self.X.collate_gradient()
            return
        
        self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
@ -95,6 +184,7 @@ class SSGPLVM(SparseGP_MPI):

        # update for the KL divergence
        self.variational_prior.update_gradients_KL(self.X)
+        self.X.collate_gradient()

    def input_sensitivity(self):
        if self.kern.ARD:
--- a/GPy/models/ss_mrd.py
+++ b/GPy/models/ss_mrd.py
@ -2,33 +2,256 @@
 The Maniforld Relevance Determination model with the spike-and-slab prior
 """

+import numpy as np
 from ..core import Model
 from .ss_gplvm import SSGPLVM
+from ..core.parameterization.variational import SpikeAndSlabPrior,NormalPosterior,VariationalPrior
+from ..util.misc import param_to_array
+from ..kern import RBF
+from ..core import Param
+from numpy.linalg.linalg import LinAlgError

 class SSMRD(Model):
    
-    def __init__(self, Ylist, input_dim, X=None, X_variance=None,
-                 initx = 'PCA', initz = 'permute',
-                 num_inducing=10, Z=None, kernel=None,
-                 inference_method=None, likelihoods=None, name='ss_mrd', Ynames=None):
+    def __init__(self, Ylist, input_dim, X=None, X_variance=None, Gammas=None, initx = 'PCA_concat', initz = 'permute', 
+                 num_inducing=10, Zs=None, kernels=None, inference_methods=None, likelihoods=None, group_spike=True,
+                 pi=0.5, name='ss_mrd', Ynames=None, mpi_comm=None, IBP=False, alpha=2., taus=None, ):
        super(SSMRD, self).__init__(name)
+        self.mpi_comm = mpi_comm
+        self._PROPAGATE_ = False
        
-        self.updates = False
-        self.models = [SSGPLVM(y, input_dim, X=X, X_variance=X_variance, num_inducing=num_inducing,Z=Z,init=initx,
-                               kernel=kernel.copy() if kernel else None,inference_method=inference_method,likelihood=likelihoods,
-                               name='model_'+str(i)) for i,y in enumerate(Ylist)]
-        self.add_parameters(*(self.models))
+        # initialize X for individual models
+        X, X_variance, Gammas, fracs = self._init_X(Ylist, input_dim, X, X_variance, Gammas, initx)
+        self.X = NormalPosterior(means=X, variances=X_variance)
        
-        [[[self.models[m].X.mean[i,j:j+1].tie('mean_'+str(i)+'_'+str(j)) for m in range(len(self.models))] for j in range(self.models[0].X.mean.shape[1])] 
-         for i in range(self.models[0].X.mean.shape[0])]
-        [[[self.models[m].X.variance[i,j:j+1].tie('var_'+str(i)+'_'+str(j)) for m in range(len(self.models))] for j in range(self.models[0].X.variance.shape[1])] 
-         for i in range(self.models[0].X.variance.shape[0])]
+        if kernels is None:
+            kernels = [RBF(input_dim, lengthscale=1./fracs, ARD=True) for i in xrange(len(Ylist))]
+        if Zs is None:
+            Zs = [None]* len(Ylist)
+        if likelihoods is None:
+            likelihoods = [None]* len(Ylist)
+        if inference_methods is None:
+            inference_methods = [None]* len(Ylist)
        
-        self.updates = True
+        if IBP:
+            self.var_priors = [IBPPrior_SSMRD(len(Ylist),input_dim,alpha=alpha) for i in xrange(len(Ylist))]
+        else:
+            self.var_priors = [SpikeAndSlabPrior_SSMRD(nModels=len(Ylist),pi=pi,learnPi=False, group_spike=group_spike) for i in xrange(len(Ylist))]
+        self.models = [SSGPLVM(y, input_dim, X=X.copy(), X_variance=X_variance.copy(), Gamma=Gammas[i], num_inducing=num_inducing,Z=Zs[i], learnPi=False, group_spike=group_spike,
+                               kernel=kernels[i],inference_method=inference_methods[i],likelihood=likelihoods[i], variational_prior=self.var_priors[i], IBP=IBP, tau=None if taus is None else taus[i],
+                               name='model_'+str(i), mpi_comm=mpi_comm, sharedX=True) for i,y in enumerate(Ylist)]
+        self.link_parameters(*(self.models+[self.X]))
+        
+    def _propogate_X_val(self):
+        if self._PROPAGATE_: return
+        for m in self.models:
+            m.X.mean.values[:] = self.X.mean.values
+            m.X.variance.values[:] = self.X.variance.values
+        varp_list = [m.X for m in self.models]
+        [vp._update_inernal(varp_list) for vp in self.var_priors]
+        self._PROPAGATE_=True
+    
+    def _collate_X_gradient(self):
+        self._PROPAGATE_ = False
+        self.X.mean.gradient[:] = 0
+        self.X.variance.gradient[:] = 0
+        for m in self.models:
+            self.X.mean.gradient += m.X.mean.gradient
+            self.X.variance.gradient += m.X.variance.gradient
        
    def parameters_changed(self):
-        super(SSMRD, self).parameters_changed()        
+        super(SSMRD, self).parameters_changed()
+        [m.parameters_changed() for m in self.models]
        self._log_marginal_likelihood = sum([m._log_marginal_likelihood for m in self.models])
+        self._collate_X_gradient()

    def log_likelihood(self):
        return self._log_marginal_likelihood
+    
+    def _init_X(self, Ylist, input_dim, X=None, X_variance=None, Gammas=None, initx='PCA_concat'):
+        
+        # Divide latent dimensions
+        idx = np.empty((input_dim,),dtype=np.int)
+        residue = (input_dim)%(len(Ylist))
+        for i in xrange(len(Ylist)):
+            if i < residue:
+                size = input_dim/len(Ylist)+1
+                idx[i*size:(i+1)*size] = i
+            else:
+                size = input_dim/len(Ylist)
+                idx[i*size+residue:(i+1)*size+residue] = i
+        
+        if X is None:
+            if initx == 'PCA_concat':
+                X = np.empty((Ylist[0].shape[0],input_dim))
+                fracs = np.empty((input_dim,))
+                from ..util.initialization import initialize_latent
+                for i in xrange(len(Ylist)):
+                    Y = Ylist[i]
+                    dim = (idx==i).sum()
+                    if dim>0:
+                        x, fr = initialize_latent('PCA', dim, Y)
+                        X[:,idx==i] = x
+                        fracs[idx==i] = fr
+            elif initx=='PCA_joint':
+                y = np.hstack(Ylist)
+                from ..util.initialization import initialize_latent
+                X, fracs = initialize_latent('PCA', input_dim, y)
+            else:
+                X = np.random.randn(Ylist[0].shape[0], input_dim)
+                fracs = np.ones(input_dim)
+        else:
+            fracs = np.ones(input_dim)
+            
+    
+        if X_variance is None: # The variance of the variational approximation (S)
+            X_variance = np.random.uniform(0,.1,X.shape)
+            
+        if Gammas is None:
+            Gammas = []
+            for x in X:
+                gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation
+                gamma[:] = 0.5 + 0.1 * np.random.randn(X.shape[0], input_dim)
+                gamma[gamma>1.-1e-9] = 1.-1e-9
+                gamma[gamma<1e-9] = 1e-9
+                Gammas.append(gamma)
+        return X, X_variance, Gammas, fracs
+
+    @Model.optimizer_array.setter
+    def optimizer_array(self, p):
+        if self.mpi_comm != None:
+            if self._IN_OPTIMIZATION_ and self.mpi_comm.rank==0:
+                self.mpi_comm.Bcast(np.int32(1),root=0)
+            self.mpi_comm.Bcast(p, root=0)        
+        Model.optimizer_array.fset(self,p)
+        
+    def optimize(self, optimizer=None, start=None, **kwargs):
+        self._IN_OPTIMIZATION_ = True
+        if self.mpi_comm==None:
+            super(SSMRD, self).optimize(optimizer,start,**kwargs)
+        elif self.mpi_comm.rank==0:
+            super(SSMRD, self).optimize(optimizer,start,**kwargs)
+            self.mpi_comm.Bcast(np.int32(-1),root=0)
+        elif self.mpi_comm.rank>0:
+            x = self.optimizer_array.copy()
+            flag = np.empty(1,dtype=np.int32)
+            while True:
+                self.mpi_comm.Bcast(flag,root=0)
+                if flag==1:
+                    try:
+                        self.optimizer_array = x
+                        self._fail_count = 0
+                    except (LinAlgError, ZeroDivisionError, ValueError):
+                        if self._fail_count >= self._allowed_failures:
+                            raise
+                        self._fail_count += 1
+                elif flag==-1:
+                    break
+                else:
+                    self._IN_OPTIMIZATION_ = False
+                    raise Exception("Unrecognizable flag for synchronization!")
+        self._IN_OPTIMIZATION_ = False
+        
+
+class SpikeAndSlabPrior_SSMRD(SpikeAndSlabPrior):
+    def __init__(self, nModels, pi=0.5, learnPi=False, group_spike=True, variance = 1.0, name='SSMRDPrior', **kw):
+        self.nModels = nModels
+        self._b_prob_all = 0.5
+        super(SpikeAndSlabPrior_SSMRD, self).__init__(pi=pi,learnPi=learnPi,group_spike=group_spike,variance=variance, name=name, **kw)
+    
+    def _update_inernal(self, varp_list):
+        """Make an update of the internal status by gathering the variational posteriors for all the individual models."""
+        # The probability for the binary variable for the same latent dimension of any of the models is on.
+        if self.group_spike:
+            self._b_prob_all = 1.-param_to_array(varp_list[0].gamma_group)
+            [np.multiply(self._b_prob_all, 1.-vp.gamma_group, self._b_prob_all) for vp in varp_list[1:]]
+        else:
+            self._b_prob_all = 1.-param_to_array(varp_list[0].binary_prob)
+            [np.multiply(self._b_prob_all, 1.-vp.binary_prob, self._b_prob_all) for vp in varp_list[1:]]            
+
+    def KL_divergence(self, variational_posterior):
+        mu = variational_posterior.mean
+        S = variational_posterior.variance
+        if self.group_spike:
+            gamma = variational_posterior.binary_prob[0]
+        else:
+            gamma = variational_posterior.binary_prob
+        if len(self.pi.shape)==2:
+            idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+            pi = self.pi[idx]
+        else:
+            pi = self.pi
+
+        var_mean = np.square(mu)/self.variance
+        var_S = (S/self.variance - np.log(S))
+        var_gamma = (gamma*np.log(gamma/pi)).sum()+((1-gamma)*np.log((1-gamma)/(1-pi))).sum()
+        return var_gamma +((1.-self._b_prob_all)*(np.log(self.variance)-1. +var_mean + var_S)).sum()/(2.*self.nModels)
+
+    def update_gradients_KL(self, variational_posterior):
+        mu = variational_posterior.mean
+        S = variational_posterior.variance
+        N = variational_posterior.num_data
+        if self.group_spike:
+            gamma = variational_posterior.binary_prob.values[0]
+        else:
+            gamma = variational_posterior.binary_prob.values
+        if len(self.pi.shape)==2:
+            idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+            pi = self.pi[idx]
+        else:
+            pi = self.pi
+
+        if self.group_spike:
+            tmp = self._b_prob_all/(1.-gamma)
+            variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))/N +tmp*((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+        else:
+            variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+        mu.gradient -= (1.-self._b_prob_all)*mu/(self.variance*self.nModels)
+        S.gradient -= (1./self.variance - 1./S) * (1.-self._b_prob_all) /(2.*self.nModels)
+        if self.learnPi:
+            raise 'Not Supported!'
+
+class IBPPrior_SSMRD(VariationalPrior):
+    def __init__(self, nModels, input_dim, alpha =2., tau=None, name='IBPPrior', **kw):
+        super(IBPPrior_SSMRD, self).__init__(name=name, **kw)
+        from ..core.parameterization.transformations import Logexp, __fixed__  
+        self.nModels = nModels
+        self._b_prob_all = 0.5
+        self.input_dim = input_dim
+        self.variance = 1.
+        self.alpha = Param('alpha', alpha, __fixed__)
+        self.link_parameter(self.alpha)
+        
+    def _update_inernal(self, varp_list):
+        """Make an update of the internal status by gathering the variational posteriors for all the individual models."""
+        # The probability for the binary variable for the same latent dimension of any of the models is on.
+        self._b_prob_all = 1.-param_to_array(varp_list[0].gamma_group)
+        [np.multiply(self._b_prob_all, 1.-vp.gamma_group, self._b_prob_all) for vp in varp_list[1:]]
+
+    def KL_divergence(self, variational_posterior):
+        mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma_group.values, variational_posterior.tau.values
+            
+        var_mean = np.square(mu)/self.variance
+        var_S = (S/self.variance - np.log(S))
+        part1 = ((1.-self._b_prob_all)* (np.log(self.variance)-1. +var_mean + var_S)).sum()/(2.*self.nModels)
+        
+        ad = self.alpha/self.input_dim
+        from scipy.special import betaln,digamma
+        part2 = (gamma*np.log(gamma)).sum() + ((1.-gamma)*np.log(1.-gamma)).sum() + (betaln(ad,1.)*self.input_dim -betaln(tau[:,0], tau[:,1]).sum())/self.nModels \
+                 + (( (tau[:,0]-ad)/self.nModels -gamma)*digamma(tau[:,0])).sum() + \
+                (((tau[:,1]-1.)/self.nModels+gamma-1.)*digamma(tau[:,1])).sum() + (((1.+ad-tau[:,0]-tau[:,1])/self.nModels+1.)*digamma(tau.sum(axis=1))).sum()
+        return part1+part2
+
+    def update_gradients_KL(self, variational_posterior):
+        mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma_group.values, variational_posterior.tau.values
+
+        variational_posterior.mean.gradient -= (1.-self._b_prob_all)*mu/(self.variance*self.nModels)
+        variational_posterior.variance.gradient -= (1./self.variance - 1./S) * (1.-self._b_prob_all) /(2.*self.nModels)
+        from scipy.special import digamma,polygamma
+        tmp = self._b_prob_all/(1.-gamma)
+        dgamma = (np.log(gamma/(1.-gamma))+ digamma(tau[:,1])-digamma(tau[:,0]))/variational_posterior.num_data
+        variational_posterior.binary_prob.gradient -= dgamma+tmp*((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+        ad = self.alpha/self.input_dim
+        common = ((1.+ad-tau[:,0]-tau[:,1])/self.nModels+1.)*polygamma(1,tau.sum(axis=1))
+        variational_posterior.tau.gradient[:,0] = -(((tau[:,0]-ad)/self.nModels -gamma)*polygamma(1,tau[:,0])+common)
+        variational_posterior.tau.gradient[:,1] = -(((tau[:,1]-1.)/self.nModels+gamma-1.)*polygamma(1,tau[:,1])+common)
--- a/GPy/plotting/matplot_dep/models_plots.py
+++ b/GPy/plotting/matplot_dep/models_plots.py
@ -11,13 +11,14 @@ from base_plots import gpplot, x_frame1D, x_frame2D
 from ...models.gp_coregionalized_regression import GPCoregionalizedRegression
 from ...models.sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression
 from scipy import sparse
+from ...core.parameterization.variational import VariationalPosterior

 def plot_fit(model, plot_limits=None, which_data_rows='all',
        which_data_ycols='all', fixed_inputs=[],
        levels=20, samples=0, fignum=None, ax=None, resolution=None,
        plot_raw=False,
        linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue'], Y_metadata=None, data_symbol='kx',
-        apply_link=False, samples_f=0, plot_uncertain_inputs=True):
+        apply_link=False, samples_f=0, plot_uncertain_inputs=True, predict_kw=None):
    """
    Plot the posterior of the GP.
      - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
@ -76,6 +77,9 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',

    if hasattr(model, 'Z'): Z = model.Z

+    if predict_kw is None:
+        predict_kw = {}
+
    #work out what the inputs are for plotting (1D or 2D)
    fixed_dims = np.array([i for i,v in fixed_inputs])
    free_dims = np.setdiff1d(np.arange(model.input_dim),fixed_dims)
@ -92,7 +96,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',

        #make a prediction on the frame and plot it
        if plot_raw:
-            m, v = model._raw_predict(Xgrid)
+            m, v = model._raw_predict(Xgrid, **predict_kw)
            if apply_link:
                lower = model.likelihood.gp_link.transf(m - 2*np.sqrt(v))
                upper = model.likelihood.gp_link.transf(m + 2*np.sqrt(v))
@ -103,11 +107,13 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
                upper = m + 2*np.sqrt(v)
        else:
            if isinstance(model,GPCoregionalizedRegression) or isinstance(model,SparseGPCoregionalizedRegression):
-                meta = {'output_index': Xgrid[:,-1:].astype(np.int)}
-            else:
-                meta = None
-            m, v = model.predict(Xgrid, full_cov=False, Y_metadata=meta)
-            lower, upper = model.predict_quantiles(Xgrid, Y_metadata=meta)
+                extra_data = Xgrid[:,-1:].astype(np.int)
+                if Y_metadata is None:
+                    Y_metadata = {'output_index': extra_data}
+                else:
+                    Y_metadata['output_index'] = extra_data
+            m, v = model.predict(Xgrid, full_cov=False, Y_metadata=Y_metadata, **predict_kw)
+            lower, upper = model.predict_quantiles(Xgrid, Y_metadata=Y_metadata)


        for d in which_data_ycols:
@ -116,7 +122,9 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',

        #optionally plot some samples
        if samples: #NOTE not tested with fixed_inputs
-            Ysim = model.posterior_samples(Xgrid, samples)
+            Ysim = model.posterior_samples(Xgrid, samples, Y_metadata=Y_metadata)
+            print Ysim.shape
+            print Xnew.shape
            for yi in Ysim.T:
                plots['posterior_samples'] = ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
                #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
@ -178,13 +186,15 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',

        #predict on the frame and plot
        if plot_raw:
-            m, _ = model._raw_predict(Xgrid)
+            m, _ = model._raw_predict(Xgrid, **predict_kw)
        else:
            if isinstance(model,GPCoregionalizedRegression) or isinstance(model,SparseGPCoregionalizedRegression):
-                meta = {'output_index': Xgrid[:,-1:].astype(np.int)}
-            else:
-                meta = None
-            m, v = model.predict(Xgrid, full_cov=False, Y_metadata=meta)
+                extra_data = Xgrid[:,-1:].astype(np.int)
+                if Y_metadata is None:
+                    Y_metadata = {'output_index': extra_data}
+                else:
+                    Y_metadata['output_index'] = extra_data
+            m, v = model.predict(Xgrid, full_cov=False, Y_metadata=Y_metadata, **predict_kw)
        for d in which_data_ycols:
            m_d = m[:,d].reshape(resolution, resolution).T
            plots['contour'] = ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
@ -216,7 +226,7 @@ def plot_fit_f(model, *args, **kwargs):
    kwargs['plot_raw'] = True
    plot_fit(model,*args, **kwargs)

-def fixed_inputs(model, non_fixed_inputs, fix_routine='median'):
+def fixed_inputs(model, non_fixed_inputs, fix_routine='median', as_list=True, X_all=False):
    """
    Convenience function for returning back fixed_inputs where the other inputs
    are fixed using fix_routine
@ -226,18 +236,30 @@ def fixed_inputs(model, non_fixed_inputs, fix_routine='median'):
    :type non_fixed_inputs: list
    :param fix_routine: fixing routine to use, 'mean', 'median', 'zero'
    :type fix_routine: string
+    :param as_list: if true, will return a list of tuples with (dimension, fixed_val) otherwise it will create the corresponding X matrix
+    :type as_list: boolean
    """
    f_inputs = []
    if hasattr(model, 'has_uncertain_inputs') and model.has_uncertain_inputs():
        X = model.X.mean.values.copy()
-    else:
+    elif isinstance(model.X, VariationalPosterior):
        X = model.X.values.copy()
+    else:
+        if X_all:
+            X = model.X_all.copy()
+        else:
+            X = model.X.copy()
    for i in range(X.shape[1]):
        if i not in non_fixed_inputs:
            if fix_routine == 'mean':
                f_inputs.append( (i, np.mean(X[:,i])) )
            if fix_routine == 'median':
                f_inputs.append( (i, np.median(X[:,i])) )
-            elif fix_routine == 'zero':
+            else: # set to zero zero
                f_inputs.append( (i, 0) )
-    return f_inputs
+            if not as_list:
+                X[:,i] = f_inputs[-1][1]
+    if as_list:
+        return f_inputs
+    else:
+        return X
--- a/GPy/plotting/matplot_dep/visualize.py
+++ b/GPy/plotting/matplot_dep/visualize.py
@ -408,12 +408,13 @@ class mocap_data_show_vpython(vpython_show):
 class mocap_data_show(matplotlib_show):
    """Base class for visualizing motion capture data."""

-    def __init__(self, vals, axes=None, connect=None):
+    def __init__(self, vals, axes=None, connect=None, color='b'):
        if axes==None:
            fig = plt.figure()
            axes = fig.add_subplot(111, projection='3d', aspect='equal')
        matplotlib_show.__init__(self, vals, axes)

+        self.color = color
        self.connect = connect
        self.process_values()
        self.initialize_axes()
@ -423,7 +424,7 @@ class mocap_data_show(matplotlib_show):
        self.axes.figure.canvas.draw()

    def draw_vertices(self):
-        self.points_handle = self.axes.scatter(self.vals[:, 0], self.vals[:, 1], self.vals[:, 2])
+        self.points_handle = self.axes.scatter(self.vals[:, 0], self.vals[:, 1], self.vals[:, 2], color=self.color)

    def draw_edges(self):
        self.line_handle = []
@ -442,7 +443,7 @@ class mocap_data_show(matplotlib_show):
                z.append(self.vals[i, 2])
                z.append(self.vals[j, 2])
                z.append(np.NaN)
-            self.line_handle = self.axes.plot(np.array(x), np.array(y), np.array(z), 'b-')
+            self.line_handle = self.axes.plot(np.array(x), np.array(y), np.array(z), '-', color=self.color)

    def modify(self, vals):
        self.vals = vals.copy()
@ -450,7 +451,7 @@ class mocap_data_show(matplotlib_show):
        self.initialize_axes_modify()
        self.draw_vertices()
        self.initialize_axes()
-        self.finalize_axes_modify()
+        #self.finalize_axes_modify()
        self.draw_edges()
        self.axes.figure.canvas.draw()

@ -469,12 +470,20 @@ class mocap_data_show(matplotlib_show):
        self.line_handle[0].remove()

    def finalize_axes(self):
-        self.axes.set_xlim(self.x_lim)
-        self.axes.set_ylim(self.y_lim)
-        self.axes.set_zlim(self.z_lim)
-        self.axes.auto_scale_xyz([-1., 1.], [-1., 1.], [-1., 1.])
+#         self.axes.set_xlim(self.x_lim)
+#         self.axes.set_ylim(self.y_lim)
+#         self.axes.set_zlim(self.z_lim)
+#         self.axes.auto_scale_xyz([-1., 1.], [-1., 1.], [-1., 1.])

-#        self.axes.set_aspect('equal')
+        extents = np.array([getattr(self.axes, 'get_{}lim'.format(dim))() for dim in 'xyz'])
+        sz = extents[:,1] - extents[:,0]
+        centers = np.mean(extents, axis=1)
+        maxsize = max(abs(sz))
+        r = maxsize/2
+        for ctr, dim in zip(centers, 'xyz'):
+            getattr(self.axes, 'set_{}lim'.format(dim))(ctr - r, ctr + r)
+        
+#         self.axes.set_aspect('equal')
 #         self.axes.autoscale(enable=False)

    def finalize_axes_modify(self):
@ -494,7 +503,7 @@ class stick_show(mocap_data_show):

 class skeleton_show(mocap_data_show):
    """data_show class for visualizing motion capture data encoded as a skeleton with angles."""
-    def __init__(self, vals, skel, axes=None, padding=0):
+    def __init__(self, vals, skel, axes=None, padding=0, color='b'):
        """data_show class for visualizing motion capture data encoded as a skeleton with angles.
        :param vals: set of modeled angles to use for printing in the axis when it's first created.
        :type vals: np.array
@ -506,7 +515,7 @@ class skeleton_show(mocap_data_show):
        self.skel = skel
        self.padding = padding
        connect = skel.connection_matrix()
-        mocap_data_show.__init__(self, vals, axes=axes, connect=connect)
+        mocap_data_show.__init__(self, vals, axes=axes, connect=connect, color=color)
    def process_values(self):
        """Takes a set of angles and converts them to the x,y,z coordinates in the internal prepresentation of the class, ready for plotting.

--- a/GPy/testing/cython_tests.py
+++ b/GPy/testing/cython_tests.py
@ -0,0 +1,65 @@
+import numpy as np
+import scipy as sp
+from GPy.util import choleskies
+import GPy
+
+"""
+These tests make sure that the opure python and cython codes work the same
+"""
+
+class CythonTestChols(np.testing.TestCase):
+    def setUp(self):
+        self.flat = np.random.randn(45,5)
+        self.triang = np.array([np.eye(20) for i in range(3)])
+    def test_flat_to_triang(self):
+        L1 = choleskies._flat_to_triang_pure(self.flat)
+        L2 = choleskies._flat_to_triang_cython(self.flat)
+        np.testing.assert_allclose(L1, L2)
+    def test_triang_to_flat(self):
+        A1 = choleskies._triang_to_flat_pure(self.triang)
+        A2 = choleskies._triang_to_flat_cython(self.triang)
+        np.testing.assert_allclose(A1, A2)
+
+class test_stationary(np.testing.TestCase):
+    def setUp(self):
+        self.k = GPy.kern.RBF(10)
+        self.X = np.random.randn(300,10)
+        self.Z = np.random.randn(20,10)
+        self.dKxx = np.random.randn(300,300)
+        self.dKzz = np.random.randn(20,20)
+        self.dKxz = np.random.randn(300,20)
+
+    def test_square_gradX(self):
+        g1 = self.k._gradients_X_cython(self.dKxx, self.X)
+        g2 = self.k._gradients_X_pure(self.dKxx, self.X)
+        np.testing.assert_allclose(g1, g2)
+
+    def test_rect_gradx(self):
+        g1 = self.k._gradients_X_cython(self.dKxz, self.X, self.Z)
+        g2 = self.k._gradients_X_pure(self.dKxz, self.X, self.Z)
+        np.testing.assert_allclose(g1, g2)
+
+    def test_square_lengthscales(self):
+        g1 = self.k._lengthscale_grads_pure(self.dKxx, self.X, self.X)
+        g2 = self.k._lengthscale_grads_cython(self.dKxx, self.X, self.X)
+        np.testing.assert_allclose(g1, g2)
+
+    def test_rect_lengthscales(self):
+        g1 = self.k._lengthscale_grads_pure(self.dKxz, self.X, self.Z)
+        g2 = self.k._lengthscale_grads_cython(self.dKxz, self.X, self.Z)
+        np.testing.assert_allclose(g1, g2)
+
+class test_choleskies_backprop(np.testing.TestCase):
+    def setUp(self):
+        self.dL, self.L = np.random.randn(2, 100, 100)
+    def test(self):
+        r1 = GPy.util.choleskies._backprop_gradient_pure(self.dL, self.L)
+        r2 = GPy.util.choleskies.choleskies_cython.backprop_gradient(self.dL, self.L)
+        np.testing.assert_allclose(r1, r2)
+
+
+
+
+
+
+
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@ -366,9 +366,9 @@ class KernelTestsNonContinuous(unittest.TestCase):
        X2 = self.X2[self.X2[:,-1]!=2]
        self.assertTrue(check_kernel_gradient_functions(kern, X=X, X2=X2, verbose=verbose, fixed_X_dims=-1))

-class Coregionalize_weave_test(unittest.TestCase):
+class Coregionalize_cython_test(unittest.TestCase):
    """
-    Make sure that the coregionalize kernel work with and without weave enabled
+    Make sure that the coregionalize kernel work with and without cython enabled
    """
    def setUp(self):
        self.k = GPy.kern.Coregionalize(1, output_dim=12)
@ -378,36 +378,42 @@ class Coregionalize_weave_test(unittest.TestCase):

    def test_sym(self):
        dL_dK = np.random.randn(self.N1, self.N1)
-        GPy.util.config.config.set('weave', 'working', 'True')
-        K_weave = self.k.K(self.X)
+        GPy.util.config.config.set('cython', 'working', 'True')
+        K_cython = self.k.K(self.X)
        self.k.update_gradients_full(dL_dK, self.X)
-        grads_weave = self.k.gradient.copy()
+        grads_cython = self.k.gradient.copy()

-        GPy.util.config.config.set('weave', 'working', 'False')
+        GPy.util.config.config.set('cython', 'working', 'False')
        K_numpy = self.k.K(self.X)
        self.k.update_gradients_full(dL_dK, self.X)
        grads_numpy = self.k.gradient.copy()

-        self.assertTrue(np.allclose(K_numpy, K_weave))
-        self.assertTrue(np.allclose(grads_numpy, grads_weave))
+        self.assertTrue(np.allclose(K_numpy, K_cython))
+        self.assertTrue(np.allclose(grads_numpy, grads_cython))
+
+        #reset the cython state for any other tests
+        GPy.util.config.config.set('cython', 'working', 'true')

    def test_nonsym(self):
        dL_dK = np.random.randn(self.N1, self.N2)
-        GPy.util.config.config.set('weave', 'working', 'True')
-        K_weave = self.k.K(self.X, self.X2)
+        GPy.util.config.config.set('cython', 'working', 'True')
+        K_cython = self.k.K(self.X, self.X2)
+        self.k.gradient = 0.
        self.k.update_gradients_full(dL_dK, self.X, self.X2)
-        grads_weave = self.k.gradient.copy()
+        grads_cython = self.k.gradient.copy()

-        GPy.util.config.config.set('weave', 'working', 'False')
+        GPy.util.config.config.set('cython', 'working', 'False')
        K_numpy = self.k.K(self.X, self.X2)
+        self.k.gradient = 0.
        self.k.update_gradients_full(dL_dK, self.X, self.X2)
        grads_numpy = self.k.gradient.copy()

-        self.assertTrue(np.allclose(K_numpy, K_weave))
-        self.assertTrue(np.allclose(grads_numpy, grads_weave))
+        self.assertTrue(np.allclose(K_numpy, K_cython))
+        self.assertTrue(np.allclose(grads_numpy, grads_cython))
+
+        #reset the cython state for any other tests
+        GPy.util.config.config.set('cython', 'working', 'true')

-    #reset the weave state for any other tests
-    GPy.util.config.config.set('weave', 'working', 'False')


 class KernelTestsProductWithZeroValues(unittest.TestCase):
--- a/GPy/testing/likelihood_tests.py
+++ b/GPy/testing/likelihood_tests.py
@ -93,6 +93,9 @@ def dparam_checkgrad(func, dfunc, params, params_names, args, constraints=None,
            if not grad.checkgrad(verbose=True):
                gradchecking = False

+            if not grad.checkgrad(verbose=True):
+                gradchecking = False
+
    return gradchecking


@ -116,6 +119,7 @@ class TestNoiseModels(object):
        self.integer_Y = np.where(tmp > 0, tmp, 0)

        self.var = 0.2
+        self.deg_free = 4.0

        #Make a bigger step as lower bound can be quite curved
        self.step = 1e-4
@ -135,7 +139,7 @@ class TestNoiseModels(object):
                }
        """
        self.noise_models = {"Student_t_default": {
-            "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
+            "model": GPy.likelihoods.StudentT(deg_free=self.deg_free, sigma2=self.var),
            "grad_params": {
                "names": [".*t_scale2"],
                "vals": [self.var],
@ -143,8 +147,17 @@ class TestNoiseModels(object):
            },
            "laplace": True
            },
+            #"Student_t_deg_free": {
+                #"model": GPy.likelihoods.StudentT(deg_free=self.deg_free, sigma2=self.var),
+                #"grad_params": {
+                    #"names": [".*deg_free"],
+                    #"vals": [self.deg_free],
+                    #"constraints": [(".*t_scale2", self.constrain_fixed), (".*deg_free", self.constrain_positive)]
+                #},
+                #"laplace": True
+            #},
            "Student_t_1_var": {
-                "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
+                "model": GPy.likelihoods.StudentT(deg_free=self.deg_free, sigma2=self.var),
                "grad_params": {
                    "names": [".*t_scale2"],
                    "vals": [1.0],
@ -162,7 +175,7 @@ class TestNoiseModels(object):
                "laplace": True
            },
            "Student_t_small_var": {
-                "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
+                "model": GPy.likelihoods.StudentT(deg_free=self.deg_free, sigma2=self.var),
                "grad_params": {
                    "names": [".*t_scale2"],
                    "vals": [0.001],
@ -171,7 +184,7 @@ class TestNoiseModels(object):
                "laplace": True
            },
            "Student_t_large_var": {
-                "model": GPy.likelihoods.StudentT(deg_free=5, sigma2=self.var),
+                "model": GPy.likelihoods.StudentT(deg_free=self.deg_free, sigma2=self.var),
                "grad_params": {
                    "names": [".*t_scale2"],
                    "vals": [10.0],
--- a/GPy/testing/linalg_test.py
+++ b/GPy/testing/linalg_test.py
@ -1,6 +1,7 @@
 import numpy as np
 import scipy as sp
-from ..util.linalg import jitchol
+from GPy.util.linalg import jitchol
+import GPy

 class LinalgTests(np.testing.TestCase):
    def setUp(self):
@ -35,3 +36,25 @@ class LinalgTests(np.testing.TestCase):
            return False
        except sp.linalg.LinAlgError:
            return True
+
+    def test_einsum_ijk_jlk_to_il(self):
+        A = np.random.randn(50, 150, 5)
+        B = np.random.randn(150, 100, 5)
+        pure = np.einsum('ijk,jlk->il', A, B)
+        quick = GPy.util.linalg.ijk_jlk_to_il(A, B)
+        np.testing.assert_allclose(pure, quick)
+
+    def test_einsum_ij_jlk_to_ilk(self):
+        A = np.random.randn(15, 150, 5)
+        B = np.random.randn(150, 50, 5)
+        pure = np.einsum('ijk,jlk->il', A, B)
+        quick = GPy.util.linalg.ijk_jlk_to_il(A,B)
+        np.testing.assert_allclose(pure, quick)
+
+    def test_einsum_ijk_ljk_to_ilk(self):
+        A = np.random.randn(150, 20, 5)
+        B = np.random.randn(150, 20, 5)
+        #B = A.copy()
+        pure = np.einsum('ijk,ljk->ilk', A, B)
+        quick = GPy.util.linalg.ijk_ljk_to_ilk(A,B)
+        np.testing.assert_allclose(pure, quick)
--- a/GPy/testing/link_function_tests.py
+++ b/GPy/testing/link_function_tests.py
@ -49,7 +49,7 @@ class LinkFunctionTests(np.testing.TestCase):
        self.assertTrue(grad3.checkgrad(verbose=True))

        if test_lim:
-            print "Testing limits"
+            print("Testing limits")
            #Remove some otherwise we are too close to the limit for gradcheck to work effectively
            lim_of_inf = lim_of_inf - 1e-4
            grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=lim_of_inf)
--- a/GPy/util/block_matrices.py
+++ b/GPy/util/block_matrices.py
@ -100,10 +100,10 @@ def block_dot(A, B, diagonal=False):
        Dshape = D.shape
        if diagonal and (len(Cshape) == 1 or len(Dshape) == 1\
                or C.shape[0] != C.shape[1] or D.shape[0] != D.shape[1]):
-            print "Broadcasting, C: {} D:{}".format(C.shape, D.shape)
+            print("Broadcasting, C: {} D:{}".format(C.shape, D.shape))
            return C*D
        else:
-            print "Dotting, C: {} C:{}".format(C.shape, D.shape)
+            print("Dotting, C: {} C:{}".format(C.shape, D.shape))
            return np.dot(C,D)
    dot = np.vectorize(f, otypes = [np.object])
    return dot(A,B)
--- a/GPy/util/choleskies.py
+++ b/GPy/util/choleskies.py
@ -1,14 +1,11 @@
-# Copyright James Hensman and Max Zwiessele 2014
+# Copyright James Hensman and Max Zwiessele 2014, 2015
 # Licensed under the GNU GPL version 3.0

 import numpy as np
 from . import linalg
 from .config import config

-try:
-    from scipy import weave
-except ImportError:
-    config.set('weave', 'working', 'False')
+from . import choleskies_cython

 def safe_root(N):
    i = np.sqrt(N)
@ -17,125 +14,72 @@ def safe_root(N):
        raise ValueError("N is not square!")
    return j

-def _flat_to_triang_weave(flat):
-    """take a matrix N x D and return a M X M x D array where
-
-    N = M(M+1)/2
-
-    the lower triangluar portion of the d'th slice of the result is filled by the d'th column of flat.
-    This is the weave implementation
-    """
-    N, D = flat.shape
-    M = (-1 + safe_root(8*N+1))/2
-    ret = np.zeros((M, M, D))
-    flat = np.ascontiguousarray(flat)
-
-    code = """
-    int count = 0;
-    for(int m=0; m<M; m++)
-    {
-      for(int mm=0; mm<=m; mm++)
-      {
-        for(int d=0; d<D; d++)
-        {
-          ret[d + m*D*M + mm*D] = flat[count];
-          count++;
-        }
-      }
-    }
-    """
-    weave.inline(code, ['flat', 'ret', 'D', 'M'])
-    return ret
-
 def _flat_to_triang_pure(flat_mat):
    N, D = flat_mat.shape
    M = (-1 + safe_root(8*N+1))//2
-    ret = np.zeros((M, M, D))
-    count = 0
-    for m in range(M):
-        for mm in range(m+1):
-            for d in range(D):
-              ret.flat[d + m*D*M + mm*D] = flat_mat.flat[count];
+    ret = np.zeros((D, M, M))
+    for d in range(D):
+        count = 0
+        for m in range(M):
+            for mm in range(m+1):
+              ret[d,m, mm] = flat_mat[count, d];
              count = count+1
    return ret

-if config.getboolean('weave', 'working'):
-	flat_to_triang =  _flat_to_triang_weave
-else:
-        flat_to_triang =  _flat_to_triang_pure
+def _flat_to_triang_cython(flat_mat):
+    N, D = flat_mat.shape
+    M = (-1 + safe_root(8*N+1))//2
+    return choleskies_cython.flat_to_triang(flat_mat, M)

-def _triang_to_flat_weave(L):
-    M, _, D = L.shape
-
-    L = np.ascontiguousarray(L) # should do nothing if L was created by flat_to_triang
-
-    N = M*(M+1)/2
-    flat = np.empty((N, D))
-    code = """
-    int count = 0;
-    for(int m=0; m<M; m++)
-    {
-      for(int mm=0; mm<=m; mm++)
-      {
-        for(int d=0; d<D; d++)
-        {
-          flat[count] = L[d + m*D*M + mm*D];
-          count++;
-        }
-      }
-    }
-    """
-    weave.inline(code, ['flat', 'L', 'D', 'M'])
-    return flat

 def _triang_to_flat_pure(L):
-    M, _, D = L.shape
+    D, _, M = L.shape

    N = M*(M+1)//2
    flat = np.empty((N, D))
-    count = 0;
-    for m in range(M):
-        for mm in range(m+1):
-            for d in range(D):
-                flat.flat[count] = L.flat[d + m*D*M + mm*D];
+    for d in range(D):
+        count = 0;
+        for m in range(M):
+            for mm in range(m+1):
+                flat[count,d] = L[d, m, mm]
                count = count +1
    return flat

-if config.getboolean('weave', 'working'):
-    triang_to_flat =  _triang_to_flat_weave
-else:
-    triang_to_flat =  _triang_to_flat_pure
+def _triang_to_flat_cython(L):
+    return choleskies_cython.triang_to_flat(L)
+
+def _backprop_gradient_pure(dL, L):
+    """
+    Given the derivative of an objective fn with respect to the cholesky L,
+    compute the derivate with respect to the original matrix K, defined as
+
+        K = LL^T
+
+    where L was obtained by Cholesky decomposition
+    """
+    dL_dK = np.tril(dL).copy()
+    N = L.shape[0]
+    for k in range(N - 1, -1, -1):
+        for j in range(k + 1, N):
+            for i in range(j, N):
+                dL_dK[i, k] -= dL_dK[i, j] * L[j, k]
+                dL_dK[j, k] -= dL_dK[i, j] * L[i, k]
+        for j in range(k + 1, N):
+            dL_dK[j, k] /= L[k, k]
+            dL_dK[k, k] -= L[j, k] * dL_dK[j, k]
+        dL_dK[k, k] /= (2 * L[k, k])
+    return dL_dK

 def triang_to_cov(L):
    return np.dstack([np.dot(L[:,:,i], L[:,:,i].T) for i in range(L.shape[-1])])

-def multiple_dpotri_old(Ls):
-    M, _, D = Ls.shape
-    Kis = np.rollaxis(Ls, -1).copy()
-    [dpotri(Kis[i,:,:], overwrite_c=1, lower=1) for i in range(D)]
-    code = """
-    for(int d=0; d<D; d++)
-    {
-      for(int m=0; m<M; m++)
-      {
-        for(int mm=0; mm<m; mm++)
-        {
-          Kis[d*M*M + mm*M + m ] = Kis[d*M*M + m*M + mm];
-        }
-      }
-    }
-
-    """
-    weave.inline(code, ['Kis', 'D', 'M'])
-    Kis = np.rollaxis(Kis, 0, 3) #wtf rollaxis?
-    return Kis
-
 def multiple_dpotri(Ls):
-    return np.dstack([linalg.dpotri(np.asfortranarray(Ls[:,:,i]), lower=1)[0] for i in range(Ls.shape[-1])])
+    return np.array([linalg.dpotri(np.asfortranarray(Ls[i]), lower=1)[0] for i in range(Ls.shape[0])])

 def indexes_to_fix_for_low_rank(rank, size):
    """
-    work out which indexes of the flatteneed array should be fixed if we want the cholesky to represent a low rank matrix
+    Work out which indexes of the flatteneed array should be fixed if we want
+    the cholesky to represent a low rank matrix
    """
    #first we'll work out what to keep, and the do the set difference.

@ -153,15 +97,11 @@ def indexes_to_fix_for_low_rank(rank, size):
    return np.setdiff1d(np.arange((size**2+size)/2), keep)


-
-#class cholchecker(GPy.core.Model):
-    #def __init__(self, L, name='cholchecker'):
-        #super(cholchecker, self).__init__(name)
-        #self.L = GPy.core.Param('L',L)
-        #self.link_parameter(self.L)
-    #def parameters_changed(self):
-        #LL = flat_to_triang(self.L)
-        #Ki = multiple_dpotri(LL)
-        #self.L.gradient = 2*np.einsum('ijk,jlk->ilk', Ki, LL)
-        #self._loglik = np.sum([np.sum(np.log(np.abs(np.diag()))) for i in range(self.L.shape[-1])])
-#
+if config.getboolean('cython', 'working'):
+    triang_to_flat = _triang_to_flat_cython
+    flat_to_triang = _flat_to_triang_cython
+    backprop_gradient = choleskies_cython.backprop_gradient_par_c
+else:
+    backprop_gradient = _backprop_gradient_pure
+    triang_to_flat =  _triang_to_flat_pure
+    flat_to_triang = _flat_to_triang_pure
--- a/GPy/util/choleskies_cython.c
+++ b/GPy/util/choleskies_cython.c
--- a/GPy/util/choleskies_cython.pyx
+++ b/GPy/util/choleskies_cython.pyx
@ -0,0 +1,107 @@
+#cython: wraparaound=False
+#cython: boundscheck=False
+#cython: nonecheck=False
+
+# Copyright James Hensman and Alan Saul 2015
+
+import numpy as np
+from cython.parallel import prange, parallel
+cimport numpy as np
+
+def flat_to_triang(np.ndarray[double, ndim=2] flat, int M):
+    """take a matrix N x D and return a D X M x M array where
+
+    N = M(M+1)/2
+
+    the lower triangluar portion of the d'th slice of the result is filled by the d'th column of flat.
+    """
+    cdef int D = flat.shape[1]
+    cdef int N = flat.shape[0]
+    cdef int count = 0
+    cdef np.ndarray[double, ndim=3] ret = np.zeros((D, M, M))
+    cdef int d, m, mm
+    for d in range(D):
+        count = 0
+        for m in range(M):
+            for mm in range(m+1):
+                ret[d, m, mm] = flat[count,d]
+                count += 1
+    return ret
+
+def triang_to_flat(np.ndarray[double, ndim=3] L):
+    cdef int D = L.shape[0]
+    cdef int M = L.shape[1]
+    cdef int N = M*(M+1)/2
+    cdef int count = 0
+    cdef np.ndarray[double, ndim=2] flat = np.empty((N, D))
+    cdef int d, m, mm
+    for d in range(D):
+        count = 0
+        for m in range(M):
+            for mm in range(m+1):
+                flat[count,d] = L[d, m, mm]
+                count += 1
+    return flat
+
+
+def backprop_gradient(np.ndarray[double, ndim=2] dL, np.ndarray[double, ndim=2] L):
+    cdef np.ndarray[double, ndim=2] dL_dK = np.tril(dL).copy()
+    cdef int N = L.shape[0]
+    cdef int k, j, i
+    for k in range(N - 1, -1, -1):
+        for j in range(k + 1, N):
+            for i in range(j, N):
+                dL_dK[i, k] -= dL_dK[i, j] * L[j, k]
+                dL_dK[j, k] -= dL_dK[i, j] * L[i, k]
+        for j in range(k + 1, N):
+            dL_dK[j, k] /= L[k, k]
+            dL_dK[k, k] -= L[j, k] * dL_dK[j, k]
+        dL_dK[k, k] /= (2. * L[k, k])
+    return dL_dK
+
+def backprop_gradient_par(double[:,:] dL, double[:,:] L):
+    cdef double[:,:] dL_dK = np.tril(dL).copy()
+    cdef int N = L.shape[0]
+    cdef int k, j, i
+    for k in range(N - 1, -1, -1):
+        with nogil, parallel():
+            for i in prange(k + 1, N):
+                for j in range(k+1, i+1):
+                    dL_dK[i, k] -= dL_dK[i, j] * L[j, k]
+                for j in range(i, N):
+                    dL_dK[i, k] -= dL_dK[j, i] * L[j, k]
+        for j in range(k + 1, N):
+            dL_dK[j, k] /= L[k, k]
+            dL_dK[k, k] -= L[j, k] * dL_dK[j, k]
+        dL_dK[k, k] /= (2. * L[k, k])
+    return dL_dK
+
+#here's a pure C version...
+cdef extern from "cholesky_backprop.h" nogil:
+    void chol_backprop(int N, double* dL, double* L)
+
+def backprop_gradient_par_c(np.ndarray[double, ndim=2] dL, np.ndarray[double, ndim=2] L):
+    cdef np.ndarray[double, ndim=2] dL_dK = np.tril(dL) # makes a copy, c-contig
+    cdef int N = L.shape[0]
+    with nogil:
+        chol_backprop(N, <double*> dL_dK.data, <double*> L.data)
+    return dL_dK
+
+cdef extern from "cholesky_backprop.h" nogil:
+    void old_chol_backprop(int N, double* dL, double* L)
+
+def backprop_gradient_par_c_old(np.ndarray[double, ndim=2] dL, np.ndarray[double, ndim=2] L):
+    cdef np.ndarray[double, ndim=2] dL_dK = np.tril(dL) # makes a copy, c-contig
+    cdef int N = L.shape[0]
+    with nogil:
+        old_chol_backprop(N, <double*> dL_dK.data, <double*> L.data)
+    return dL_dK
+
+
+
+
+
+
+
+
+
--- a/GPy/util/cholesky_backprop.c
+++ b/GPy/util/cholesky_backprop.c
@ -0,0 +1,51 @@
+#include <cblas.h>
+void chol_backprop(int N, double* dL, double* L){
+    //at the input to this fn, dL is df_dL. after this fn is complet, dL is df_dK
+    int i,k;
+
+    dL[N*N - 1] /= (2. * L[N*N - 1]);
+    for(k=N-2;k>(-1);k--){
+        cblas_dsymv(CblasRowMajor, CblasLower,
+	            N-k-1, -1,
+		    &dL[(N*(k+1) + k+1)],N,
+		    &L[k*N+k+1],1,
+		    1, &dL[N*(k+1)+k], N);
+        for(i=0;i<(N-k-1); i++){
+		dL[N*(k+1+i)+k] -= dL[N*(k+1)+k+i*(N+1)+1] * L[k*N+k+1+i];
+	}
+
+	cblas_dscal(N-k-1, 1.0/L[k*N+k], &dL[(k+1)*N+k], N);
+        dL[k*N + k] -= cblas_ddot(N-k-1, &dL[(k+1)*N+k], N, &L[k*N+k+1], 1);
+        dL[k*N + k] /= (2.0 * L[k*N + k]);
+    }
+}
+
+double mydot(int n, double* a,  int stride_a, double* b, int stride_b){
+    double ret = 0;	
+    for(int i=0; i<n; i++){
+	ret += a[i*stride_a]*b[i*stride_b];
+    }
+    return ret;
+}
+void old_chol_backprop(int N, double* dL, double* U){
+    //at the input to this fn, dL is df_dL. after this fn is complet, dL is df_dK
+    int iN, kN,i,j,k;
+    dL[N*N-1] /= (2. * U[N*N-1]);
+    for(k=N-2;k>(-1);k--){
+	kN = k*N;
+        #pragma omp parallel for private(i,iN)
+        for(i=k+1; i<N; i++){
+	    iN = i*N;
+	    dL[iN+k] -= mydot(i-k, &dL[iN+k+1], 1, &U[kN+k+1], 1);
+	    dL[iN+k] -= mydot(N-i, &dL[iN+i], N, &U[kN+i],  1);
+
+	}
+        for(i=(k + 1); i<N; i++){
+	    iN = i*N;
+            dL[iN + k] /= U[kN + k];
+            dL[kN + k] -= U[kN + i] * dL[iN + k];
+	}
+        dL[kN + k] /= (2. * U[kN + k]);
+    }
+}
+
--- a/GPy/util/cholesky_backprop.h
+++ b/GPy/util/cholesky_backprop.h
@ -0,0 +1,5 @@
+#include <cblas.h>
+
+void dsymv(int N, double*A, double*b, double*y);
+double mydot(int n, double* a,  int stride_a, double* b, int stride_b);
+void chol_backprop(int N, double* dL, double* L);
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@ -15,11 +15,7 @@ import warnings
 import os
 from .config import config
 import logging
-
-try:
-    from scipy import weave
-except ImportError:
-    config.set('weave', 'working', 'False')
+from . import linalg_cython


 _scipyversion = np.float64((scipy.__version__).split('.')[:2])
@ -422,114 +418,33 @@ def DSYR(*args, **kwargs):

 def symmetrify(A, upper=False):
    """
-    Take the square matrix A and make it symmetrical by copting elements from the lower half to the upper
+    Take the square matrix A and make it symmetrical by copting elements from
+    the lower half to the upper

    works IN PLACE.

-    note: tries to use weave, falls back to a slower numpy version
+    note: tries to use cython, falls back to a slower numpy version
    """
-    if config.getboolean('weave', 'working'):
-        try:
-            symmetrify_weave(A, upper)
-        except:
-            print("\n Weave compilation failed. Falling back to (slower) numpy implementation\n")
-            config.set('weave', 'working', 'False')
-            symmetrify_numpy(A, upper)
+    if config.getboolean('cython', 'working'):
+        _symmetrify_cython(A, upper)
    else:
-        symmetrify_numpy(A, upper)
+        _symmetrify_numpy(A, upper)


-def symmetrify_weave(A, upper=False):
-    """
-    Take the square matrix A and make it symmetrical by copting elements from the lower half to the upper
+def _symmetrify_cython(A, upper=False):
+    return linalg_cython.symmetrify(A, upper)

-    works IN PLACE.
-
-
-    """
-    N, M = A.shape
-    assert N == M
-
-    c_contig_code = """
-    int iN;
-    for (int i=1; i<N; i++){
-      iN = i*N;
-      for (int j=0; j<i; j++){
-        A[i+j*N] = A[iN+j];
-      }
-    }
-    """
-    f_contig_code = """
-    int iN;
-    for (int i=1; i<N; i++){
-      iN = i*N;
-      for (int j=0; j<i; j++){
-        A[iN+j] = A[i+j*N];
-      }
-    }
-    """
-
-    N = int(N) # for safe type casting
-    if A.flags['C_CONTIGUOUS'] and upper:
-        weave.inline(f_contig_code, ['A', 'N'], extra_compile_args=['-O3'])
-    elif A.flags['C_CONTIGUOUS'] and not upper:
-        weave.inline(c_contig_code, ['A', 'N'], extra_compile_args=['-O3'])
-    elif A.flags['F_CONTIGUOUS'] and upper:
-        weave.inline(c_contig_code, ['A', 'N'], extra_compile_args=['-O3'])
-    elif A.flags['F_CONTIGUOUS'] and not upper:
-        weave.inline(f_contig_code, ['A', 'N'], extra_compile_args=['-O3'])
-    else:
-        if upper:
-            tmp = np.tril(A.T)
-        else:
-            tmp = np.tril(A)
-        A[:] = 0.0
-        A += tmp
-        A += np.tril(tmp, -1).T
-
-
-def symmetrify_numpy(A, upper=False):
-    """
-    Force a matrix to be symmetric
-    """
+def _symmetrify_numpy(A, upper=False):
    triu = np.triu_indices_from(A,k=1)
    if upper:
        A.T[triu] = A[triu]
    else:
        A[triu] = A.T[triu]

-#This function appears to be unused. It's use of weave makes it problematic
-#Commenting out for now
-#def cholupdate(L, x):
-#    """
-#    update the LOWER cholesky factor of a pd matrix IN PLACE
-#
-#    if L is the lower chol. of K, then this function computes L\_
-#    where L\_ is the lower chol of K + x*x^T
-#    """
-#    support_code = """
-#    #include <math.h>
-#    """
-#    code = """
-#    double r,c,s;
-#    int j,i;
-#    for(j=0; j<N; j++){
-#      r = sqrt(L(j,j)*L(j,j) + x(j)*x(j));
-#      c = r / L(j,j);
-#      s = x(j) / L(j,j);
-#      L(j,j) = r;
-#      for (i=j+1; i<N; i++){
-#        L(i,j) = (L(i,j) + s*x(i))/c;
-#        x(i) = c*x(i) - s*L(i,j);
-#      }
-#    }
-#    """
-#    x = x.copy()
-#    N = x.size
-#    weave.inline(code, support_code=support_code, arg_names=['N', 'L', 'x'], type_converters=weave.converters.blitz)
-
 def backsub_both_sides(L, X, transpose='left'):
-    """ Return L^-T * X * L^-1, assumuing X is symmetrical and L is lower cholesky"""
+    """
+    Return L^-T * X * L^-1, assumuing X is symmetrical and L is lower cholesky
+    """
    if transpose == 'left':
        tmp, _ = dtrtrs(L, X, lower=1, trans=1)
        return dtrtrs(L, tmp.T, lower=1, trans=1)[0].T
@ -537,3 +452,27 @@ def backsub_both_sides(L, X, transpose='left'):
        tmp, _ = dtrtrs(L, X, lower=1, trans=0)
        return dtrtrs(L, tmp.T, lower=1, trans=0)[0].T

+def ij_jlk_to_ilk(A, B):
+    """
+    Faster version of einsum 'ij,jlk->ilk'
+    """
+    return A.dot(B.reshape(B.shape[0], -1)).reshape(A.shape[0], B.shape[1], B.shape[2])
+
+def ijk_jlk_to_il(A, B):
+    """
+    Faster version of einsum einsum('ijk,jlk->il', A,B)
+    """
+    res = np.zeros((A.shape[0], B.shape[1]))
+    [np.add(np.dot(A[:,:,k], B[:,:,k]), res, out=res) for k in range(B.shape[-1])]
+    return res
+
+def ijk_ljk_to_ilk(A, B):
+    """
+    Faster version of einsum np.einsum('ijk,ljk->ilk', A, B)
+
+    I.e A.dot(B.T) for every dimension
+    """
+    res = np.zeros((A.shape[-1], A.shape[0], B.shape[0]))
+    [np.dot(A[:,:,i], B[:,:,i].T, out=res[i,:,:]) for i in range(A.shape[-1])]
+    res = res.swapaxes(0, 2).swapaxes(0,1)
+    return res
--- a/GPy/util/linalg_cython.c
+++ b/GPy/util/linalg_cython.c
--- a/GPy/util/linalg_cython.pyx
+++ b/GPy/util/linalg_cython.pyx
@ -0,0 +1,34 @@
+cimport numpy as np
+from cpython cimport bool
+import cython
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.nonecheck(False)
+def symmetrify(np.ndarray[double, ndim=2] A, bool upper):
+    cdef int N = A.shape[0]
+    if not upper:
+        for i in xrange(N):
+            for j in xrange(i):
+                A[j, i] = A[i, j]
+    else:
+        for j in xrange(N):
+            for i in xrange(j):
+                A[j, i] = A[i, j]
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.nonecheck(False)
+def cholupdate(np.ndarray[double, ndim=1] x, np.ndarray[double, ndim=2] L, int N):
+    cdef double r
+    cdef double c
+    cdef double s
+    for j in xrange(N):
+        r = np.sqrt(L[j,j]*L[j,j] + x[j]*x[j])
+        c = r / L[j,j]
+        s = x[j] / L[j,j]
+        L[j,j] = r
+        for i in xrange(j):
+            L[i,j] = (L[i,j] + s*x[i])/c
+            x[i] = c*x[i] - s*L[i,j];
+        r = np.sqrt(L[j,j])
--- a/GPy/util/misc.py
+++ b/GPy/util/misc.py
@ -42,9 +42,6 @@ def chain_1(df_dg, dg_dx):
    """
    if np.all(dg_dx==1.):
        return df_dg
-    if len(df_dg) > 1 and len(df_dg.shape)>1 and df_dg.shape[-1] > 1:
-        #import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
-        raise NotImplementedError('Not implemented for matricies yet')
    return df_dg * dg_dx

 def chain_2(d2f_dg2, dg_dx, df_dg, d2g_dx2):
@ -56,8 +53,6 @@ def chain_2(d2f_dg2, dg_dx, df_dg, d2g_dx2):
    """
    if np.all(dg_dx==1.) and np.all(d2g_dx2 == 0):
        return d2f_dg2
-    if  len(d2f_dg2) > 1 and len(d2f_dg2.shape)>1 and d2f_dg2.shape[-1] > 1:
-        raise NotImplementedError('Not implemented for matricies yet')
    dg_dx_2 = np.clip(dg_dx, -np.inf, _lim_val_square)**2
    #dg_dx_2 = dg_dx**2
    return d2f_dg2*(dg_dx_2) + df_dg*d2g_dx2
@ -71,11 +66,7 @@ def chain_3(d3f_dg3, dg_dx, d2f_dg2, d2g_dx2, df_dg, d3g_dx3):
    """
    if np.all(dg_dx==1.) and np.all(d2g_dx2==0) and np.all(d3g_dx3==0):
        return d3f_dg3
-    if (  (len(d2f_dg2) > 1 and d2f_dg2.shape[-1] > 1)
-           or (len(d3f_dg3) > 1 and d3f_dg3.shape[-1] > 1)):
-        raise NotImplementedError('Not implemented for matricies yet')
    dg_dx_3 = np.clip(dg_dx, -np.inf, _lim_val_cube)**3
-    #dg_dx_3 = dg_dx**3
    return d3f_dg3*(dg_dx_3) + 3*d2f_dg2*dg_dx*d2g_dx2 + df_dg*d3g_dx3

 def opt_wrapper(m, **kwargs):
@ -133,10 +124,11 @@ def kmm_init(X, m = 10):
 ### make a parameter to its corresponding array:
 def param_to_array(*param):
    """
-Convert an arbitrary number of parameters to :class:ndarray class objects. This is for
-converting parameter objects to numpy arrays, when using scipy.weave.inline routine.
-In scipy.weave.blitz there is no automatic array detection (even when the array inherits
-from :class:ndarray)"""
+    Convert an arbitrary number of parameters to :class:ndarray class objects.
+    This is for converting parameter objects to numpy arrays, when using
+    scipy.weave.inline routine.  In scipy.weave.blitz there is no automatic
+    array detection (even when the array inherits from :class:ndarray)
+    """
    import warnings
    warnings.warn("Please use param.values, as this function will be deprecated in the next release.", DeprecationWarning)
    assert len(param) > 0, "At least one parameter needed"
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@ -2,7 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
-from GPy.core.parameterization import Parameterized, Param
+from ..core.parameterization import Parameterized, Param
 from ..core.parameterization.transformations import Logexp

 class WarpingFunction(Parameterized):
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -6,3 +6,6 @@ include *.cfg
 recursive-include doc *.cfg
 include *.json
 recursive-include doc *.json
+recursive-include GPy *.c
+recursive-include GPy *.so
+recursive-include GPy *.pyx
--- a/README.md
+++ b/README.md
@ -13,7 +13,6 @@ Continuous integration status: ![CI status](https://travis-ci.org/SheffieldML/GP
 ### Python 3 Compatibility
 Work is underway to make GPy run on Python 3.

-* Python 2.x compatibility is currently broken in this fork
 * All tests in the testsuite now run on Python3. 

 To see this for yourself, in Ubuntu 14.04, you can do
@ -21,12 +20,17 @@ To see this for yourself, in Ubuntu 14.04, you can do
    git clone https://github.com/mikecroucher/GPy.git
    cd GPy
    git checkout devel
+    python3 setup.py build_ext --inplace
    nosetests3 GPy/testing

 nosetests3 is Ubuntu's way of reffering to the Python 3 version of nosetests. You install it with 

    sudo apt-get install python3-nose

+The command `python3 setup.py build_ext --inplace` builds the Cython extensions. IF it doesn't work, you may need to install this:
+
+    sudo apt-get install python3-dev
+
 * Test coverage is less than 100% so it is expected that there is still more work to be done. We need more tests and examples to try out.
 * All weave functions not covered by the test suite are *simply commented out*. Can add equivalents later as test functions become available
 * A set of benchmarks would be useful! 
--- a/setup.py
+++ b/setup.py
@ -2,7 +2,8 @@
 # -*- coding: utf-8 -*-

 import os
-from setuptools import setup
+from setuptools import setup, Extension
+import numpy as np

 # Version number
 version = '0.6.1'
@ -10,6 +11,28 @@ version = '0.6.1'
 def read(fname):
    return open(os.path.join(os.path.dirname(__file__), fname)).read()

+#compile_flags = ["-march=native", '-fopenmp', '-O3', ]
+compile_flags = [ '-fopenmp', '-O3', ]
+
+ext_mods = [Extension(name='GPy.kern._src.stationary_cython',
+                      sources=['GPy/kern/_src/stationary_cython.c','GPy/kern/_src/stationary_utils.c'],
+                      include_dirs=[np.get_include()],
+                      extra_compile_args=compile_flags,
+                      extra_link_args = ['-lgomp']),
+            Extension(name='GPy.util.choleskies_cython',
+                      sources=['GPy/util/choleskies_cython.c', 'GPy/util/cholesky_backprop.c'],
+                      include_dirs=[np.get_include()],
+                      extra_link_args = ['-lgomp', '-lblas'],
+                      extra_compile_args=compile_flags+['-std=c99']),
+            Extension(name='GPy.util.linalg_cython',
+                      sources=['GPy/util/linalg_cython.c'],
+                      include_dirs=[np.get_include()],
+                      extra_compile_args=compile_flags),
+            Extension(name='GPy.kern._src.coregionalize_cython',
+                      sources=['GPy/kern/_src/coregionalize_cython.c'],
+                      include_dirs=[np.get_include()],
+                      extra_compile_args=compile_flags)]
+
 setup(name = 'GPy',
      version = version,
      author = read('AUTHORS.txt'),
@ -18,6 +41,7 @@ setup(name = 'GPy',
      license = "BSD 3-clause",
      keywords = "machine-learning gaussian-processes kernels",
      url = "http://sheffieldml.github.com/GPy/",
+      ext_modules = ext_mods,
      packages = ["GPy.models",
                  "GPy.inference.optimization",
                  "GPy.inference.mcmc",