Merge branch 'params' of github.com:SheffieldML/GPy into params

Conflicts:
	GPy/core/gp.py
	GPy/plotting/matplot_dep/models_plots.py
commit 4ce6d2d72c
Author: Ricardo
Date:   2014-03-12 12:57:04 +00:00
34 changed files with 800 additions and 710 deletions


@@ -48,7 +48,7 @@ class GP(Model):
         self.Y_metadata = None
         assert isinstance(kernel, kern.Kern)
-        assert self.input_dim == kernel.input_dim
+        #assert self.input_dim == kernel.input_dim
         self.kern = kernel
         assert isinstance(likelihood, likelihoods.Likelihood)
@@ -69,6 +69,8 @@ class GP(Model):
     def parameters_changed(self):
         self.posterior, self._log_marginal_likelihood, grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y, **self.Y_metadata)
         self.likelihood.update_gradients(np.diag(grad_dict['dL_dK']), **self.Y_metadata)
+        #self.posterior, self._log_marginal_likelihood, grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y, Y_metadata=self.Y_metadata)
+        #self.likelihood.update_gradients(np.diag(grad_dict['dL_dK']))
         self.kern.update_gradients_full(grad_dict['dL_dK'], self.X)
     def log_likelihood(self):
@@ -186,7 +188,7 @@ class GP(Model):
         """
         assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
         from ..plotting.matplot_dep import models_plots
-        models_plots.plot_fit_f(self,*args,**kwargs)
+        return models_plots.plot_fit_f(self,*args,**kwargs)
     def plot(self, *args, **kwargs):
         """
@@ -207,7 +209,7 @@ class GP(Model):
         """
         assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
         from ..plotting.matplot_dep import models_plots
-        models_plots.plot_fit(self,*args,**kwargs)
+        return models_plots.plot_fit(self,*args,**kwargs)
     def _getstate(self):
         """


@@ -253,7 +253,7 @@ class Model(Parameterized):
             sgd.run()
             self.optimization_runs.append(sgd)
-    def _checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3):
+    def _checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3, _debug=False):
         """
         Check the gradient of the ,odel by comparing to a numerical
         estimate. If the verbose flag is passed, invividual
@@ -299,8 +299,11 @@ class Model(Parameterized):
             dx = dx[transformed_index]
             gradient = gradient[transformed_index]
-            global_ratio = (f1 - f2) / (2 * np.dot(dx, np.where(gradient == 0, 1e-32, gradient)))
-            return (np.abs(1. - global_ratio) < tolerance)
+            denominator = (2 * np.dot(dx, gradient))
+            global_ratio = (f1 - f2) / np.where(denominator==0., 1e-32, denominator)
+            gloabl_diff = (f1 - f2) - denominator
+            return (np.abs(1. - global_ratio) < tolerance) or (np.abs(gloabl_diff) == 0)
         else:
             # check the gradient of each parameter individually, and do some pretty printing
             try:
@@ -336,7 +339,7 @@ class Model(Parameterized):
                 print "No free parameters to check"
                 return
-            gradient = self.objective_function_gradients(x)
+            gradient = self.objective_function_gradients(x).copy()
             np.where(gradient == 0, 1e-312, gradient)
             ret = True
             for nind, xind in itertools.izip(param_index, transformed_index):
@@ -346,7 +349,15 @@ class Model(Parameterized):
                 xx[xind] -= 2.*step
                 f2 = self.objective_function(xx)
                 numerical_gradient = (f1 - f2) / (2 * step)
-                ratio = (f1 - f2) / (2 * step * gradient[xind])
+                if _debug:
+                    for p in self.kern.flattened_parameters:
+                        p._parent_._debug=True
+                    self.gradient[xind] = numerical_gradient
+                    self._set_params_transformed(x)
+                    for p in self.kern.flattened_parameters:
+                        p._parent_._debug=False
+                if np.all(gradient[xind]==0): ratio = (f1-f2) == gradient[xind]
+                else: ratio = (f1 - f2) / (2 * step * gradient[xind])
                 difference = np.abs((f1 - f2) / 2 / step - gradient[xind])
                 if (np.abs(1. - ratio) < tolerance) or np.abs(difference) < tolerance:
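Aside: the hunk above makes the element-wise check robust when the analytic gradient is exactly zero. For orientation, here is a minimal, self-contained sketch of the same central-difference check; it uses plain NumPy with illustrative names (checkgrad, f, grad), not GPy's API.

    import numpy as np

    def checkgrad(f, grad, x, step=1e-6, tolerance=1e-3):
        """Compare an analytic gradient against a central-difference estimate."""
        analytic = grad(x)
        numerical = np.empty_like(analytic)
        for i in range(x.size):
            xp, xm = x.copy(), x.copy()
            xp[i] += step
            xm[i] -= step
            numerical[i] = (f(xp) - f(xm)) / (2 * step)
        # guard the ratio against an exactly-zero analytic gradient
        ratio = numerical / np.where(analytic == 0, 1e-32, analytic)
        return np.all((np.abs(1. - ratio) < tolerance) | (np.abs(numerical - analytic) < tolerance))

    # e.g. checkgrad(lambda x: np.sum(x**2), lambda x: 2*x, np.random.randn(5))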


@@ -14,16 +14,14 @@ class ObservableArray(np.ndarray, Observable):
     takes exactly one argument, which is this array itself.
     """
     __array_priority__ = -1 # Never give back ObservableArray
-    def __new__(cls, input_array):
+    def __new__(cls, input_array, *a, **kw):
         if not isinstance(input_array, ObservableArray):
-            obj = np.atleast_1d(input_array).view(cls)
+            obj = np.atleast_1d(np.require(input_array, dtype=np.float64, requirements=['W', 'C'])).view(cls)
         else: obj = input_array
         cls.__name__ = "ObservableArray\n "
+        super(ObservableArray, obj).__init__(*a, **kw)
         return obj
-    def __init__(self, *a, **kw):
-        super(ObservableArray, self).__init__(*a, **kw)
     def __array_finalize__(self, obj):
         # see InfoArray.__array_finalize__ for comments
         if obj is None: return
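Aside: `np.require` in the hunk above is what guarantees the backing buffer is a writable, C-contiguous float64 array before it is viewed as an ObservableArray. A tiny illustration in plain NumPy (variable names are illustrative only):

    import numpy as np

    x = np.arange(6).reshape(3, 2).T[::-1]      # an integer view that is neither C-contiguous nor float
    buf = np.require(x, dtype=np.float64, requirements=['W', 'C'])
    assert buf.dtype == np.float64
    assert buf.flags['C_CONTIGUOUS'] and buf.flags['WRITEABLE']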


@@ -3,7 +3,7 @@
 import itertools
 import numpy
-from parameter_core import OptimizationHandlable, Gradcheckable, adjust_name_for_printing
+from parameter_core import OptimizationHandlable, adjust_name_for_printing
 from array_core import ObservableArray
 ###### printing
@@ -43,13 +43,12 @@ class Param(OptimizationHandlable, ObservableArray):
     _fixes_ = None
     _parameters_ = []
     def __new__(cls, name, input_array, default_constraint=None):
-        obj = numpy.atleast_1d(super(Param, cls).__new__(cls, input_array=input_array))
+        obj = numpy.atleast_1d(super(Param, cls).__new__(cls, input_array=input_array, name=name, default_constraint=default_constraint))
         cls.__name__ = "Param"
         obj._current_slice_ = (slice(obj.shape[0]),)
         obj._realshape_ = obj.shape
         obj._realsize_ = obj.size
         obj._realndim_ = obj.ndim
-        obj._updated_ = False
         from lists_and_dicts import SetDict
         obj._tied_to_me_ = SetDict()
         obj._tied_to_ = []
@@ -86,7 +85,6 @@ class Param(OptimizationHandlable, ObservableArray):
         self._realshape_ = getattr(obj, '_realshape_', None)
         self._realsize_ = getattr(obj, '_realsize_', None)
         self._realndim_ = getattr(obj, '_realndim_', None)
-        self._updated_ = getattr(obj, '_updated_', None)
         self._original_ = getattr(obj, '_original_', None)
         self._name = getattr(obj, 'name', None)
         self._gradient_array_ = getattr(obj, '_gradient_array_', None)
@@ -121,14 +119,12 @@ class Param(OptimizationHandlable, ObservableArray):
                  self._realndim_,
                  self._tied_to_me_,
                  self._tied_to_,
-                 self._updated_,
                 )
                )
     def __setstate__(self, state):
         super(Param, self).__setstate__(state[0])
         state = list(state[1])
-        self._updated_ = state.pop()
         self._tied_to_ = state.pop()
         self._tied_to_me_ = state.pop()
         self._realndim_ = state.pop()
@@ -393,7 +389,7 @@ class ParamConcatenation(object):
         if update:
             self.update_all_params()
     def _vals(self):
-        return numpy.hstack([p._get_params() for p in self.params])
+        return numpy.hstack([p._param_array_ for p in self.params])
     #===========================================================================
     # parameter operations:
     #===========================================================================
@@ -450,8 +446,8 @@ class ParamConcatenation(object):
     def untie(self, *ties):
         [param.untie(*ties) for param in self.params]
-    def checkgrad(self, verbose=0, step=1e-6, tolerance=1e-3):
-        return self.params[0]._highest_parent_._checkgrad(self, verbose, step, tolerance)
+    def checkgrad(self, verbose=0, step=1e-6, tolerance=1e-3, _debug=False):
+        return self.params[0]._highest_parent_._checkgrad(self, verbose, step, tolerance, _debug=_debug)
     #checkgrad.__doc__ = Gradcheckable.checkgrad.__doc__
     __lt__ = lambda self, val: self._vals() < val


@@ -15,9 +15,8 @@ Observable Pattern for patameterization
 from transformations import Transformation, Logexp, NegativeLogexp, Logistic, __fixed__, FIXED, UNFIXED
 import numpy as np
-import itertools
-__updated__ = '2013-12-16'
+__updated__ = '2014-03-11'
 class HierarchyError(Exception):
     """
@@ -42,9 +41,8 @@ class Observable(object):
     """
     _updated = True
     def __init__(self, *args, **kwargs):
+        super(Observable, self).__init__(*args, **kwargs)
         self._observer_callables_ = []
-    def __del__(self, *args, **kwargs):
-        del self._observer_callables_
     def add_observer(self, observer, callble, priority=0):
         self._insert_sorted(priority, observer, callble)
@@ -161,6 +159,8 @@ class Parentable(object):
     """
     _parent_ = None
     _parent_index_ = None
+    def __init__(self, *args, **kwargs):
+        super(Parentable, self).__init__(*args, **kwargs)
     def has_parent(self):
         """
@@ -205,7 +205,8 @@ class Gradcheckable(Parentable):
     """
     def __init__(self, *a, **kw):
         super(Gradcheckable, self).__init__(*a, **kw)
-    def checkgrad(self, verbose=0, step=1e-6, tolerance=1e-3):
+    def checkgrad(self, verbose=0, step=1e-6, tolerance=1e-3, _debug=False):
         """
         Check the gradient of this parameter with respect to the highest parent's
         objective function.
@@ -219,14 +220,15 @@ class Gradcheckable(Parentable):
         :param flaot tolerance: the tolerance for the gradient ratio or difference.
         """
         if self.has_parent():
-            return self._highest_parent_._checkgrad(self, verbose=verbose, step=step, tolerance=tolerance)
-        return self._checkgrad(self[''], verbose=verbose, step=step, tolerance=tolerance)
-    def _checkgrad(self, param):
+            return self._highest_parent_._checkgrad(self, verbose=verbose, step=step, tolerance=tolerance, _debug=_debug)
+        return self._checkgrad(self[''], verbose=verbose, step=step, tolerance=tolerance, _debug=_debug)
+    def _checkgrad(self, param, verbose=0, step=1e-6, tolerance=1e-3, _debug=False):
         """
         Perform the checkgrad on the model.
         TODO: this can be done more efficiently, when doing it inside here
         """
-        raise NotImplementedError, "Need log likelihood to check gradient against"
+        raise HierarchyError, "This parameter is not in a model with a likelihood, and, therefore, cannot be gradient checked!"
 class Nameable(Gradcheckable):
@@ -272,6 +274,9 @@ class Indexable(object):
     Enable enraveled indexes and offsets for this object.
     The raveled index of an object is the index for its parameters in a flattened int array.
     """
+    def __init__(self, *a, **kw):
+        super(Indexable, self).__init__(*a, **kw)
     def _raveled_index(self):
         """
         Flattened array of ints, specifying the index of this object.
@@ -534,8 +539,11 @@ class OptimizationHandlable(Constrainable, Observable):
     """
     This enables optimization handles on an Object as done in GPy 0.4.
-    transformed: make sure the transformations and constraints etc are handled
+    `..._transformed`: make sure the transformations and constraints etc are handled
     """
+    def __init__(self, name, default_constraint=None, *a, **kw):
+        super(OptimizationHandlable, self).__init__(name, default_constraint=default_constraint, *a, **kw)
     def transform(self):
         [np.put(self._param_array_, ind, c.finv(self._param_array_[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__]
@@ -625,6 +633,24 @@ class OptimizationHandlable(Constrainable, Observable):
         [np.put(x, ind, p.rvs(ind.size)) for p, ind in self.priors.iteritems() if not p is None]
         self._set_params_transformed(x) # makes sure all of the tied parameters get the same init (since there's only one prior object...)
+    #===========================================================================
+    # For shared memory arrays. This does nothing in Param, but sets the memory
+    # for all parameterized objects
+    #===========================================================================
+    def _propagate_param_grad(self, parray, garray):
+        pi_old_size = 0
+        for pi in self._parameters_:
+            pislice = slice(pi_old_size, pi_old_size+pi.size)
+            self._param_array_[pislice] = pi._param_array_.ravel()#, requirements=['C', 'W']).flat
+            self._gradient_array_[pislice] = pi._gradient_array_.ravel()#, requirements=['C', 'W']).flat
+            pi._param_array_.data = parray[pislice].data
+            pi._gradient_array_.data = garray[pislice].data
+            pi._propagate_param_grad(parray[pislice], garray[pislice])
+            pi_old_size += pi.size
 class Parameterizable(OptimizationHandlable):
     def __init__(self, *args, **kwargs):
         super(Parameterizable, self).__init__(*args, **kwargs)
@@ -811,22 +837,21 @@ class Parameterizable(OptimizationHandlable):
             p._parent_index_ = i
             pslice = slice(old_size, old_size+p.size)
-            pi_old_size = old_size
-            for pi in p.flattened_parameters:
-                pislice = slice(pi_old_size, pi_old_size+pi.size)
-                self._param_array_[pislice] = pi._param_array_.flat
-                self._gradient_array_[pislice] = pi._gradient_array_.flat
-                pi._param_array_.data = self._param_array_[pislice].data
-                pi._gradient_array_.data = self._gradient_array_[pislice].data
-                pi_old_size += pi.size
-            if not p._param_array_.flags['C_CONTIGUOUS']:
-                import ipdb;ipdb.set_trace()
+            # first connect all children
+            p._propagate_param_grad(self._param_array_[pslice], self._gradient_array_[pslice])
+            # then connect children to self
+            self._param_array_[pslice] = p._param_array_.ravel()#, requirements=['C', 'W']).ravel(order='C')
+            self._gradient_array_[pslice] = p._gradient_array_.ravel()#, requirements=['C', 'W']).ravel(order='C')
             p._param_array_.data = self._param_array_[pslice].data
             p._gradient_array_.data = self._gradient_array_[pslice].data
             self._param_slices_.append(pslice)
             self._add_parameter_name(p, ignore_added_names=ignore_added_names)
             old_size += p.size
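Aside: the `_propagate_param_grad` change above makes every child parameter array a view into one flat buffer owned by the top-level object. The underlying idea in plain NumPy (illustrative names, not GPy's API):

    import numpy as np

    parent = np.zeros(5)                         # one flat buffer owned by the top-level object
    child_a, child_b = parent[:2], parent[2:]    # per-parameter views into consecutive slices
    child_a[:] = 1.0
    child_b[:] = [2.0, 3.0, 4.0]
    print(parent)                                # [1. 1. 2. 3. 4.] -- writes through the views show up in the parent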


@@ -65,8 +65,8 @@ class Parameterized(Parameterizable, Pickleable):
     # **Never** call parameters_changed() yourself
     __metaclass__ = ParametersChangedMeta
     #===========================================================================
-    def __init__(self, name=None, *a, **kw):
-        super(Parameterized, self).__init__(name=name, parent=None, parent_index=None, *a, **kw)
+    def __init__(self, name=None, parameters=[], *a, **kw):
+        super(Parameterized, self).__init__(name=name, *a, **kw)
         self._in_init_ = True
         self._parameters_ = ArrayList()
         self.size = sum(p.size for p in self._parameters_)
@@ -76,6 +76,7 @@ class Parameterized(Parameterizable, Pickleable):
         self._param_slices_ = []
         self._connect_parameters()
         del self._in_init_
+        self.add_parameters(*parameters)
     def build_pydot(self, G=None):
         import pydot # @UnresolvedImport
@@ -205,20 +206,24 @@ class Parameterized(Parameterizable, Pickleable):
         return found_params
     def __getitem__(self, name, paramlist=None):
-        if paramlist is None:
-            paramlist = self.grep_param_names(name)
-        if len(paramlist) < 1: raise AttributeError, name
-        if len(paramlist) == 1:
-            if isinstance(paramlist[-1], Parameterized):
-                paramlist = paramlist[-1].flattened_parameters
-                if len(paramlist) != 1:
-                    return ParamConcatenation(paramlist)
-            return paramlist[-1]
-        return ParamConcatenation(paramlist)
+        if isinstance(name, (int, slice, tuple, np.ndarray)):
+            return self._param_array_[name]
+        else:
+            if paramlist is None:
+                paramlist = self.grep_param_names(name)
+            if len(paramlist) < 1: raise AttributeError, name
+            if len(paramlist) == 1:
+                if isinstance(paramlist[-1], Parameterized):
+                    paramlist = paramlist[-1].flattened_parameters
+                    if len(paramlist) != 1:
+                        return ParamConcatenation(paramlist)
+                return paramlist[-1]
+            return ParamConcatenation(paramlist)
     def __setitem__(self, name, value, paramlist=None):
         if isinstance(name, (slice, tuple, np.ndarray)):
             self._param_array_[name] = value
+            self.notify_observers()
         else:
             try: param = self.__getitem__(name, paramlist)
             except AttributeError as a: raise a


@@ -63,12 +63,13 @@ class SpikeAndSlabPrior(VariationalPrior):
 class VariationalPosterior(Parameterized):
-    def __init__(self, means=None, variances=None, name=None, **kw):
-        super(VariationalPosterior, self).__init__(name=name, **kw)
+    def __init__(self, means=None, variances=None, name=None, *a, **kw):
+        super(VariationalPosterior, self).__init__(name=name, *a, **kw)
         self.mean = Param("mean", means)
+        self.variance = Param("variance", variances, Logexp())
         self.ndim = self.mean.ndim
         self.shape = self.mean.shape
-        self.variance = Param("variance", variances, Logexp())
+        self.num_data, self.input_dim = self.mean.shape
         self.add_parameters(self.mean, self.variance)
         self.num_data, self.input_dim = self.mean.shape
         if self.has_uncertain_inputs():
@@ -77,6 +78,24 @@ class VariationalPosterior(Parameterized):
     def has_uncertain_inputs(self):
         return not self.variance is None
+    def __getitem__(self, s):
+        if isinstance(s, (int, slice, tuple, list, np.ndarray)):
+            import copy
+            n = self.__new__(self.__class__, self.name)
+            dc = self.__dict__.copy()
+            dc['mean'] = self.mean[s]
+            dc['variance'] = self.variance[s]
+            dc['_parameters_'] = copy.copy(self._parameters_)
+            n.__dict__.update(dc)
+            n._parameters_[dc['mean']._parent_index_] = dc['mean']
+            n._parameters_[dc['variance']._parent_index_] = dc['variance']
+            n.ndim = n.mean.ndim
+            n.shape = n.mean.shape
+            n.num_data = n.mean.shape[0]
+            n.input_dim = n.mean.shape[1] if n.ndim != 1 else 1
+            return n
+        else:
+            return super(VariationalPrior, self).__getitem__(s)
 class NormalPosterior(VariationalPosterior):
     '''
@@ -117,4 +136,4 @@ class SpikeAndSlabPosterior(VariationalPosterior):
         import sys
         assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
         from ...plotting.matplot_dep import variational_plots
-        return variational_plots.plot(self,*args)
+        return variational_plots.plot_SpikeSlab(self,*args)


@@ -64,8 +64,8 @@ class SparseGP(GP):
             self.kern.gradient += target
             #gradients wrt Z
-            self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z)
-            self.Z.gradient += self.kern.gradients_Z_expectations(
+            self.Z.gradient[:,self.kern.active_dims] = self.kern.gradients_X(dL_dKmm, self.Z)
+            self.Z.gradient[:,self.kern.active_dims] += self.kern.gradients_Z_expectations(
                                self.grad_dict['dL_dpsi1'], self.grad_dict['dL_dpsi2'], Z=self.Z, variational_posterior=self.X)
         else:
             #gradients wrt kernel
@@ -77,8 +77,8 @@ class SparseGP(GP):
             self.kern.gradient += target
             #gradients wrt Z
-            self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
-            self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X)
+            self.Z.gradient[:,self.kern.active_dims] = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
+            self.Z.gradient[:,self.kern.active_dims] += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X)
     def _raw_predict(self, Xnew, full_cov=False):
         """


@@ -324,14 +324,14 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
     D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
     _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
-    likelihood_list = [Gaussian(x, normalize=True) for x in Ylist]
-    k = kern.Linear(Q, ARD=True) + kern.Bias(Q, _np.exp(-2)) + kern.White(Q, _np.exp(-2))
-    m = MRD(likelihood_list, input_dim=Q, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw)
-    m.ensure_default_constraints()
-    for i, bgplvm in enumerate(m.bgplvms):
-        m['{}_noise'.format(i)] = bgplvm.likelihood.Y.var() / 500.
+    #Ylist = [Ylist[0]]
+    k = [kern.Linear(Q, ARD=True) + kern.White(Q, 1e-4) for _ in range(len(Ylist))]
+    m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing, kernel=k, initx="", initz='permute', **kw)
+    m['.*noise'] = [Y.var()/500. for Y in Ylist]
+    #for i, Y in enumerate(Ylist):
+    # m['.*Y_{}.*Gaussian.*noise'.format(i)] = Y.var(1) / 500.
     if optimize:
         print "Optimizing Model:"
@@ -515,3 +515,28 @@ def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose
         lvm_visualizer.close()
     return m
+
+def ssgplvm_simulation_linear():
+    import numpy as np
+    import GPy
+    N, D, Q = 1000, 20, 5
+    pi = 0.2
+
+    def sample_X(Q, pi):
+        x = np.empty(Q)
+        dies = np.random.rand(Q)
+        for q in xrange(Q):
+            if dies[q]<pi:
+                x[q] = np.random.randn()
+            else:
+                x[q] = 0.
+        return x
+
+    Y = np.empty((N,D))
+    X = np.empty((N,Q))
+    # Generate data from random sampled weight matrices
+    for n in xrange(N):
+        X[n] = sample_X(Q,pi)
+        w = np.random.randn(D,Q)
+        Y[n] = np.dot(w,X[n])


@@ -284,7 +284,7 @@ def toy_poisson_rbf_1d_laplace(optimize=True, plot=True):
     kern = GPy.kern.RBF(1)
     poisson_lik = GPy.likelihoods.Poisson()
-    laplace_inf = GPy.inference.latent_function_inference.LaplaceInference()
+    laplace_inf = GPy.inference.latent_function_inference.Laplace()
     # create simple GP Model
     m = GPy.core.GP(X, Y, kernel=kern, likelihood=poisson_lik, inference_method=laplace_inf)
@@ -468,7 +468,7 @@ def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100, opt
 def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
     """Run a 1D example of a sparse GP regression with uncertain inputs."""
-    fig, axes = pb.subplots(1, 2, figsize=(12, 5))
+    fig, axes = pb.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)
     # sample inputs and outputs
     S = np.ones((20, 1))


@@ -3,6 +3,7 @@
 from posterior import Posterior
 from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify
+from ...util import diag
 from ...core.parameterization.variational import VariationalPosterior
 import numpy as np
 from ...util.misc import param_to_array
@@ -77,8 +78,9 @@ class VarDTC(object):
         num_inducing = Z.shape[0]
         num_data = Y.shape[0]
         # kernel computations, using BGPLVM notation
-        Kmm = kern.K(Z)
+        Kmm = kern.K(Z).copy()
+        diag.add(Kmm, self.const_jitter)
         Lm = jitchol(Kmm)
         # The rather complex computations of A
@@ -168,7 +170,6 @@ class VarDTC(object):
         Bi, _ = dpotri(LB, lower=1)
         symmetrify(Bi)
         Bi = -dpotri(LB, lower=1)[0]
-        from ...util import diag
         diag.add(Bi, 1)
         woodbury_inv = backsub_both_sides(Lm, Bi)
@@ -237,7 +238,8 @@ class VarDTCMissingData(object):
         dL_dKmm = 0
         log_marginal = 0
-        Kmm = kern.K(Z)
+        Kmm = kern.K(Z).copy()
+        diag.add(Kmm, self.const_jitter)
         #factor Kmm
         Lm = jitchol(Kmm)
         if uncertain_inputs: LmInv = dtrtri(Lm)
@@ -323,7 +325,6 @@ class VarDTCMissingData(object):
         Bi, _ = dpotri(LB, lower=1)
         symmetrify(Bi)
         Bi = -dpotri(LB, lower=1)[0]
-        from ...util import diag
         diag.add(Bi, 1)
         woodbury_inv_all[:, :, ind] = backsub_both_sides(Lm, Bi)[:,:,None]
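Aside: adding `const_jitter` to the diagonal of Kmm before factorising is the usual safeguard for nearly-singular kernel matrices. A hedged sketch of that pattern in plain NumPy (illustrative names, not GPy's `jitchol`):

    import numpy as np

    def chol_with_jitter(K, jitter=1e-8):
        """Cholesky of a positive semi-definite matrix, with a small diagonal boost for stability."""
        Kj = K.copy()
        Kj[np.diag_indices_from(Kj)] += jitter * np.mean(np.diag(K))
        return np.linalg.cholesky(Kj)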


@@ -1,49 +1,53 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-import sys
 import numpy as np
 import itertools
-from linear import Linear
 from ...core.parameterization import Parameterized
-from ...core.parameterization.param import Param
-from kern import Kern
+from ...util.caching import Cache_this
+from kern import CombinationKernel

-class Add(Kern):
-    def __init__(self, subkerns, tensor):
-        assert all([isinstance(k, Kern) for k in subkerns])
-        if tensor:
-            input_dim = sum([k.input_dim for k in subkerns])
-            self.input_slices = []
-            n = 0
-            for k in subkerns:
-                self.input_slices.append(slice(n, n+k.input_dim))
-                n += k.input_dim
-        else:
-            assert all([k.input_dim == subkerns[0].input_dim for k in subkerns])
-            input_dim = subkerns[0].input_dim
-            self.input_slices = [slice(None) for k in subkerns]
-        super(Add, self).__init__(input_dim, 'add')
-        self.add_parameters(*subkerns)
+class Add(CombinationKernel):
+    """
+    Add given list of kernels together.
+    propagates gradients thorugh.
+    """
+    def __init__(self, subkerns, name='add'):
+        super(Add, self).__init__(subkerns, name)

-    def K(self, X, X2=None):
+    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    def K(self, X, X2=None, which_parts=None):
         """
-        Compute the kernel function.
-        :param X: the first set of inputs to the kernel
-        :param X2: (optional) the second set of arguments to the kernel. If X2
-                   is None, this is passed throgh to the 'part' object, which
-                   handLes this as X2 == X.
+        Add all kernels together.
+        If a list of parts (of this kernel!) `which_parts` is given, only
+        the parts of the list are taken to compute the covariance.
         """
         assert X.shape[1] == self.input_dim
-        if X2 is None:
-            return sum([p.K(X[:, i_s], None) for p, i_s in zip(self._parameters_, self.input_slices)])
-        else:
-            return sum([p.K(X[:, i_s], X2[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)])
+        if which_parts is None:
+            which_parts = self.parts
+        elif not isinstance(which_parts, (list, tuple)):
+            # if only one part is given
+            which_parts = [which_parts]
+        return reduce(np.add, (p.K(X, X2) for p in which_parts))

-    def update_gradients_full(self, dL_dK, X):
-        [p.update_gradients_full(dL_dK, X[:,i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
+    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    def Kdiag(self, X, which_parts=None):
+        assert X.shape[1] == self.input_dim
+        if which_parts is None:
+            which_parts = self.parts
+        elif not isinstance(which_parts, (list, tuple)):
+            # if only one part is given
+            which_parts = [which_parts]
+        return reduce(np.add, (p.Kdiag(X) for p in which_parts))

-    def update_gradients_diag(self, dL_dKdiag, X):
-        [p.update_gradients_diag(dL_dKdiag, X[:,i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        [p.update_gradients_full(dL_dK, X, X2) for p in self.parts]
+
+    def update_gradients_diag(self, dL_dK, X):
+        [p.update_gradients_diag(dL_dK, X) for p in self.parts]

     def gradients_X(self, dL_dK, X, X2=None):
         """Compute the gradient of the objective function with respect to X.
@@ -55,92 +59,72 @@ class Add(Kern):
         :param X2: Observed data inputs (optional, defaults to X)
         :type X2: np.ndarray (num_inducing x input_dim)"""
-        target = np.zeros_like(X)
-        if X2 is None:
-            [np.add(target[:,i_s], p.gradients_X(dL_dK, X[:, i_s], None), target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
-        else:
-            [np.add(target[:,i_s], p.gradients_X(dL_dK, X[:, i_s], X2[:,i_s]), target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
+        target = np.zeros(X.shape)
+        [target.__setitem__([Ellipsis, p.active_dims], target[:, p.active_dims]+p.gradients_X(dL_dK, X, X2)) for p in self.parts]
         return target

-    def Kdiag(self, X):
-        assert X.shape[1] == self.input_dim
-        return sum([p.Kdiag(X[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)])
-
     def psi0(self, Z, variational_posterior):
-        return np.sum([p.psi0(Z[:, i_s], mu[:, i_s], S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)],0)
+        return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts))

     def psi1(self, Z, variational_posterior):
-        return np.sum([p.psi1(Z[:, i_s], mu[:, i_s], S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)], 0)
+        return reduce(np.add, (p.psi1(Z, variational_posterior) for p in self.parts))

     def psi2(self, Z, variational_posterior):
-        psi2 = np.sum([p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)], 0)
+        psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts))
+        #return psi2
         # compute the "cross" terms
-        from white import White
+        from static import White, Bias
         from rbf import RBF
         #from rbf_inv import RBFInv
-        from bias import Bias
         from linear import Linear
         #ffrom fixed import Fixed

-        for (p1, i1), (p2, i2) in itertools.combinations(itertools.izip(self._parameters_, self.input_slices), 2):
+        for p1, p2 in itertools.combinations(self.parts, 2):
+            # i1, i2 = p1.active_dims, p2.active_dims
             # white doesn;t combine with anything
             if isinstance(p1, White) or isinstance(p2, White):
                 pass
             # rbf X bias
             #elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)):
             elif isinstance(p1, Bias) and isinstance(p2, (RBF, Linear)):
-                tmp = p2.psi1(Z[:,i2], mu[:,i2], S[:,i2])
+                tmp = p2.psi1(Z, variational_posterior)
                 psi2 += p1.variance * (tmp[:, :, None] + tmp[:, None, :])
             #elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
             elif isinstance(p2, Bias) and isinstance(p1, (RBF, Linear)):
-                tmp = p1.psi1(Z[:,i1], mu[:,i1], S[:,i1])
+                tmp = p1.psi1(Z, variational_posterior)
                 psi2 += p2.variance * (tmp[:, :, None] + tmp[:, None, :])
+            elif isinstance(p2, (RBF, Linear)) and isinstance(p1, (RBF, Linear)):
+                assert np.intersect1d(p1.active_dims, p2.active_dims).size == 0, "only non overlapping kernel dimensions allowed so far"
+                tmp1 = p1.psi1(Z, variational_posterior)
+                tmp2 = p2.psi1(Z, variational_posterior)
+                psi2 += (tmp1[:, :, None] * tmp2[:, None, :]) + (tmp2[:, :, None] * tmp1[:, None, :])
             else:
                 raise NotImplementedError, "psi2 cannot be computed for this kernel"
         return psi2

     def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        from white import White
-        from rbf import RBF
-        #from rbf_inv import RBFInv
-        #from bias import Bias
-        from linear import Linear
-        #ffrom fixed import Fixed
-        for p1, is1 in zip(self._parameters_, self.input_slices):
+        from static import White, Bias
+        for p1 in self.parts:
             #compute the effective dL_dpsi1. Extra terms appear becaue of the cross terms in psi2!
             eff_dL_dpsi1 = dL_dpsi1.copy()
-            for p2, is2 in zip(self._parameters_, self.input_slices):
+            for p2 in self.parts:
                 if p2 is p1:
                     continue
                 if isinstance(p2, White):
                     continue
                 elif isinstance(p2, Bias):
                     eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
-                else:
-                    eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z[:,is2], mu[:,is2], S[:,is2]) * 2.
-            p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], S[:,is1], Z[:,is1])
+                else:# np.setdiff1d(p1.active_dims, ar2, assume_unique): # TODO: Careful, not correct for overlapping active_dims
+                    eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
+            p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)

     def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        from white import White
-        from rbf import RBF
-        #from rbf_inv import rbfinv
-        from bias import Bias
-        from linear import Linear
-        #ffrom fixed import fixed
+        from static import White, Bias
         target = np.zeros(Z.shape)
-        for p1, is1 in zip(self._parameters_, self.input_slices):
+        for p1 in self.parts:
             #compute the effective dL_dpsi1. extra terms appear becaue of the cross terms in psi2!
             eff_dL_dpsi1 = dL_dpsi1.copy()
-            for p2, is2 in zip(self._parameters_, self.input_slices):
+            for p2 in self.parts:
                 if p2 is p1:
                     continue
                 if isinstance(p2, White):
@@ -148,63 +132,39 @@ class Add(Kern):
                 elif isinstance(p2, Bias):
                     eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
                 else:
-                    eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z[:,is2], mu[:,is2], S[:,is2]) * 2.
-            target += p1.gradients_z_variational(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], S[:,is1], Z[:,is1])
+                    eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
+            target[:, p1.active_dims] += p1.gradients_Z_expectations(eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
         return target

     def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        from white import white
-        from rbf import rbf
-        #from rbf_inv import rbfinv
-        #from bias import bias
-        from linear import linear
-        #ffrom fixed import fixed
-        target_mu = np.zeros(mu.shape)
-        target_S = np.zeros(S.shape)
-        for p1, is1 in zip(self._parameters_, self.input_slices):
+        from static import White, Bias
+        target_mu = np.zeros(variational_posterior.shape)
+        target_S = np.zeros(variational_posterior.shape)
+        for p1 in self._parameters_:
             #compute the effective dL_dpsi1. extra terms appear becaue of the cross terms in psi2!
             eff_dL_dpsi1 = dL_dpsi1.copy()
-            for p2, is2 in zip(self._parameters_, self.input_slices):
+            for p2 in self._parameters_:
                 if p2 is p1:
                     continue
-                if isinstance(p2, white):
+                if isinstance(p2, White):
                     continue
-                elif isinstance(p2, bias):
+                elif isinstance(p2, Bias):
                     eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
                 else:
-                    eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(z[:,is2], mu[:,is2], s[:,is2]) * 2.
-            a, b = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], s[:,is1], z[:,is1])
-            target_mu += a
-            target_S += b
+                    eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
+            a, b = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
+            target_mu[:, p1.active_dims] += a
+            target_S[:, p1.active_dims] += b
         return target_mu, target_S

-    def input_sensitivity(self):
-        in_sen = np.zeros((self.num_params, self.input_dim))
-        for i, [p, i_s] in enumerate(zip(self._parameters_, self.input_slices)):
-            in_sen[i, i_s] = p.input_sensitivity()
-        return in_sen
-
     def _getstate(self):
         """
         Get the current state of the class,
         here just all the indices, rest can get recomputed
         """
-        return Parameterized._getstate(self) + [#self._parameters_,
-                self.input_dim,
-                self.input_slices,
-                self._param_slices_
-                ]
+        return super(Add, self)._getstate()

     def _setstate(self, state):
-        self._param_slices_ = state.pop()
-        self.input_slices = state.pop()
-        self.input_dim = state.pop()
-        Parameterized._setstate(self, state)
+        super(Add, self)._setstate(state)
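Aside: the move from `input_slices` to `active_dims`/`CombinationKernel` above amounts to each part seeing only its own columns of X and the per-part covariances being summed. A standalone sketch of that idea (plain NumPy, illustrative function names, not GPy's API):

    import numpy as np

    def rbf(X, X2, lengthscale=1.0):
        d2 = ((X[:, None, :] - X2[None, :, :]) ** 2).sum(-1)
        return np.exp(-0.5 * d2 / lengthscale ** 2)

    def add_kernel_K(X, X2, parts):
        """parts: list of (kernel_function, active_dims); each part only sees its own columns."""
        return sum(k(X[:, dims], X2[:, dims]) for k, dims in parts)

    X = np.random.randn(4, 3)
    K = add_kernel_K(X, X, [(rbf, [0, 1]), (rbf, [2])])   # an 'Add' kernel over dims {0,1} and {2}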


@@ -3,12 +3,19 @@
 import sys
 import numpy as np
-import itertools
-from ...core.parameterization import Parameterized
-from ...core.parameterization.param import Param
+from ...core.parameterization.parameterized import Parameterized
+from kernel_slice_operations import KernCallsViaSlicerMeta
+from ...util.caching import Cache_this

 class Kern(Parameterized):
+    #===========================================================================
+    # This adds input slice support. The rather ugly code for slicing can be
+    # found in kernel_slice_operations
+    __metaclass__ = KernCallsViaSlicerMeta
+    #===========================================================================
+    _debug=False
     def __init__(self, input_dim, name, *a, **kw):
         """
         The base class for a kernel: a positive definite function
@@ -20,11 +27,29 @@ class Kern(Parameterized):
         Do not instantiate.
         """
         super(Kern, self).__init__(name=name, *a, **kw)
-        self.input_dim = input_dim
+        if isinstance(input_dim, int):
+            self.active_dims = np.r_[0:input_dim]
+            self.input_dim = input_dim
+        else:
+            self.active_dims = np.r_[input_dim]
+            self.input_dim = len(self.active_dims)
+        self._sliced_X = 0
+
+    @Cache_this(limit=10)#, ignore_args = (0,))
+    def _slice_X(self, X):
+        return X[:, self.active_dims]

     def K(self, X, X2):
+        """
+        Compute the kernel function.
+
+        :param X: the first set of inputs to the kernel
+        :param X2: (optional) the second set of arguments to the kernel. If X2
+                   is None, this is passed throgh to the 'part' object, which
+                   handLes this as X2 == X.
+        """
         raise NotImplementedError
-    def Kdiag(self, Xa):
+    def Kdiag(self, X):
         raise NotImplementedError
     def psi0(self, Z, variational_posterior):
         raise NotImplementedError
@@ -34,13 +59,19 @@ class Kern(Parameterized):
         raise NotImplementedError
     def gradients_X(self, dL_dK, X, X2):
         raise NotImplementedError
-    def gradients_X_diag(self, dL_dK, X):
+    def gradients_X_diag(self, dL_dKdiag, X):
+        raise NotImplementedError
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        """ update the gradients of all parameters when using only the diagonal elements of the covariance matrix"""
         raise NotImplementedError
     def update_gradients_full(self, dL_dK, X, X2):
         """Set the gradients of all parameters when doing full (N) inference."""
         raise NotImplementedError
-    def update_gradients_diag(self, dL_dKdiag, X):
-        """Set the gradients for all parameters for the derivative of the diagonal of the covariance w.r.t the kernel parameters."""
-        raise NotImplementedError
     def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         """
         Set the gradients of all parameters when doing inference with
@@ -89,23 +120,16 @@ class Kern(Parameterized):
         """
         Returns the sensitivity for each dimension of this kernel.
         """
-        return self.kern.input_sensitivity()
+        return np.zeros(self.input_dim)
     def __add__(self, other):
         """ Overloading of the '+' operator. for more control, see self.add """
         return self.add(other)
-    def add(self, other, tensor=False):
+    def add(self, other, name='add'):
         """
         Add another kernel to this one.
-        If Tensor is False, both kernels are defined on the same _space_. then
-        the created kernel will have the same number of inputs as self and
-        other (which must be the same).
-        If Tensor is True, then the dimensions are stacked 'horizontally', so
-        that the resulting kernel has self.input_dim + other.input_dim
         :param other: the other kernel to be added
         :type other: GPy.kern
@@ -113,23 +137,23 @@ class Kern(Parameterized):
         assert isinstance(other, Kern), "only kernels can be added to kernels..."
         from add import Add
         kernels = []
-        if not tensor and isinstance(self, Add): kernels.extend(self._parameters_)
+        if isinstance(self, Add): kernels.extend(self._parameters_)
         else: kernels.append(self)
-        if not tensor and isinstance(other, Add): kernels.extend(other._parameters_)
+        if isinstance(other, Add): kernels.extend(other._parameters_)
         else: kernels.append(other)
-        return Add(kernels, tensor)
+        return Add(kernels, name=name)
     def __mul__(self, other):
         """ Here we overload the '*' operator. See self.prod for more information"""
         return self.prod(other)
-    def __pow__(self, other):
-        """
-        Shortcut for tensor `prod`.
-        """
-        return self.prod(other, tensor=True)
-    def prod(self, other, tensor=False, name=None):
+    #def __pow__(self, other):
+    #    """
+    #    Shortcut for tensor `prod`.
+    #    """
+    #    return self.prod(other, tensor=True)
+    def prod(self, other, name=None):
         """
         Multiply two kernels (either on the same space, or on the tensor
         product of the input space).
@@ -142,4 +166,42 @@ class Kern(Parameterized):
         """
         assert isinstance(other, Kern), "only kernels can be added to kernels..."
         from prod import Prod
-        return Prod(self, other, tensor, name)
+        kernels = []
+        if isinstance(self, Prod): kernels.extend(self._parameters_)
+        else: kernels.append(self)
+        if isinstance(other, Prod): kernels.extend(other._parameters_)
+        else: kernels.append(other)
+        return Prod(self, other, name)
+
+    def _getstate(self):
+        """
+        Get the current state of the class,
+        here just all the indices, rest can get recomputed
+        """
+        return super(Kern, self)._getstate() + [
+                self.active_dims,
+                self.input_dim,
+                self._sliced_X]
+
+    def _setstate(self, state):
+        self._sliced_X = state.pop()
+        self.input_dim = state.pop()
+        self.active_dims = state.pop()
+        super(Kern, self)._setstate(state)
+
+class CombinationKernel(Kern):
+    def __init__(self, kernels, name):
+        assert all([isinstance(k, Kern) for k in kernels])
+        input_dim = reduce(np.union1d, (x.active_dims for x in kernels))
+        super(CombinationKernel, self).__init__(input_dim, name)
+        self.add_parameters(*kernels)
+
+    @property
+    def parts(self):
+        return self._parameters_
+
+    def input_sensitivity(self):
+        in_sen = np.zeros((self.num_params, self.input_dim))
+        for i, p in enumerate(self.parts):
+            in_sen[i, p.active_dims] = p.input_sensitivity()
+        return in_sen


@@ -6,10 +6,12 @@ import numpy as np
 from scipy import weave
 from kern import Kern
 from ...util.linalg import tdot
-from ...util.misc import fast_array_equal, param_to_array
+from ...util.misc import param_to_array
 from ...core.parameterization import Param
 from ...core.parameterization.transformations import Logexp
 from ...util.caching import Cache_this
+from ...core.parameterization import variational
+from psi_comp import linear_psi_comp

 class Linear(Kern):
     """
@@ -104,49 +106,112 @@ class Linear(Kern):
     #---------------------------------------#
     def psi0(self, Z, variational_posterior):
-        return np.sum(self.variances * self._mu2S(variational_posterior), 1)
+        if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
+            gamma = variational_posterior.binary_prob
+            mu = variational_posterior.mean
+            S = variational_posterior.variance
+            return np.einsum('q,nq,nq->n',self.variances,gamma,np.square(mu)+S)
+            # return (self.variances*gamma*(np.square(mu)+S)).sum(axis=1)
+        else:
+            return np.sum(self.variances * self._mu2S(variational_posterior), 1)

     def psi1(self, Z, variational_posterior):
-        return self.K(variational_posterior.mean, Z) #the variance, it does nothing
+        if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
+            gamma = variational_posterior.binary_prob
+            mu = variational_posterior.mean
+            return np.einsum('nq,q,mq,nq->nm',gamma,self.variances,Z,mu)
+            # return (self.variances*gamma*mu).sum(axis=1)
+        else:
+            return self.K(variational_posterior.mean, Z) #the variance, it does nothing

     @Cache_this(limit=1)
     def psi2(self, Z, variational_posterior):
-        ZA = Z * self.variances
-        ZAinner = self._ZAinner(variational_posterior, Z)
-        return np.dot(ZAinner, ZA.T)
+        if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
+            gamma = variational_posterior.binary_prob
+            mu = variational_posterior.mean
+            S = variational_posterior.variance
+            mu2 = np.square(mu)
+            variances2 = np.square(self.variances)
+            tmp = np.einsum('nq,q,mq,nq->nm',gamma,self.variances,Z,mu)
+            return np.einsum('nq,q,mq,oq,nq->nmo',gamma,variances2,Z,Z,mu2+S)+\
+                np.einsum('nm,no->nmo',tmp,tmp) - np.einsum('nq,q,mq,oq,nq->nmo',np.square(gamma),variances2,Z,Z,mu2)
+        else:
+            ZA = Z * self.variances
+            ZAinner = self._ZAinner(variational_posterior, Z)
+            return np.dot(ZAinner, ZA.T)

     def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        #psi1
-        self.update_gradients_full(dL_dpsi1, variational_posterior.mean, Z)
-        # psi0:
-        tmp = dL_dpsi0[:, None] * self._mu2S(variational_posterior)
-        if self.ARD: self.variances.gradient += tmp.sum(0)
-        else: self.variances.gradient += tmp.sum()
-        #psi2
-        if self.ARD:
-            tmp = dL_dpsi2[:, :, :, None] * (self._ZAinner(variational_posterior, Z)[:, :, None, :] * Z[None, None, :, :])
-            self.variances.gradient += 2.*tmp.sum(0).sum(0).sum(0)
-        else:
-            self.variances.gradient += 2.*np.sum(dL_dpsi2 * self.psi2(Z, variational_posterior))/self.variances
+        if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
+            gamma = variational_posterior.binary_prob
+            mu = variational_posterior.mean
+            S = variational_posterior.variance
+            mu2S = np.square(mu)+S
+            _dpsi2_dvariance, _, _, _, _ = linear_psi_comp._psi2computations(self.variances, Z, mu, S, gamma)
+            grad = np.einsum('n,nq,nq->q',dL_dpsi0,gamma,mu2S) + np.einsum('nm,nq,mq,nq->q',dL_dpsi1,gamma,Z,mu) +\
+                np.einsum('nmo,nmoq->q',dL_dpsi2,_dpsi2_dvariance)
+            if self.ARD:
+                self.variances.gradient = grad
+            else:
+                self.variances.gradient = grad.sum()
+        else:
+            #psi1
+            self.update_gradients_full(dL_dpsi1, variational_posterior.mean, Z)
+            # psi0:
+            tmp = dL_dpsi0[:, None] * self._mu2S(variational_posterior)
+            if self.ARD: self.variances.gradient += tmp.sum(0)
+            else: self.variances.gradient += tmp.sum()
+            #psi2
+            if self.ARD:
+                tmp = dL_dpsi2[:, :, :, None] * (self._ZAinner(variational_posterior, Z)[:, :, None, :] * Z[None, None, :, :])
+                self.variances.gradient += 2.*tmp.sum(0).sum(0).sum(0)
+            else:
+                self.variances.gradient += 2.*np.sum(dL_dpsi2 * self.psi2(Z, variational_posterior))/self.variances

     def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        #psi1
-        grad = self.gradients_X(dL_dpsi1.T, Z, variational_posterior.mean)
-        #psi2
-        self._weave_dpsi2_dZ(dL_dpsi2, Z, variational_posterior, grad)
-        return grad
+        if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
+            gamma = variational_posterior.binary_prob
+            mu = variational_posterior.mean
+            S = variational_posterior.variance
+            _, _, _, _, _dpsi2_dZ = linear_psi_comp._psi2computations(self.variances, Z, mu, S, gamma)
+            grad = np.einsum('nm,nq,q,nq->mq',dL_dpsi1,gamma, self.variances,mu) +\
+                np.einsum('nmo,noq->mq',dL_dpsi2,_dpsi2_dZ)
+            return grad
+        else:
+            #psi1
+            grad = self.gradients_X(dL_dpsi1.T, Z, variational_posterior.mean)
+            #psi2
+            self._weave_dpsi2_dZ(dL_dpsi2, Z, variational_posterior, grad)
+            return grad

     def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
-        grad_mu, grad_S = np.zeros(variational_posterior.mean.shape), np.zeros(variational_posterior.mean.shape)
-        # psi0
-        grad_mu += dL_dpsi0[:, None] * (2.0 * variational_posterior.mean * self.variances)
-        grad_S += dL_dpsi0[:, None] * self.variances
-        # psi1
-        grad_mu += (dL_dpsi1[:, :, None] * (Z * self.variances)).sum(1)
-        # psi2
-        self._weave_dpsi2_dmuS(dL_dpsi2, Z, variational_posterior, grad_mu, grad_S)
-        return grad_mu, grad_S
+        if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
+            gamma = variational_posterior.binary_prob
+            mu = variational_posterior.mean
+            S = variational_posterior.variance
+            mu2S = np.square(mu)+S
+            _, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _ = linear_psi_comp._psi2computations(self.variances, Z, mu, S, gamma)
+
+            grad_gamma = np.einsum('n,q,nq->nq',dL_dpsi0,self.variances,mu2S) + np.einsum('nm,q,mq,nq->nq',dL_dpsi1,self.variances,Z,mu) +\
+                np.einsum('nmo,nmoq->nq',dL_dpsi2,_dpsi2_dgamma)
+            grad_mu = np.einsum('n,nq,q,nq->nq',dL_dpsi0,gamma,2.*self.variances,mu) + np.einsum('nm,nq,q,mq->nq',dL_dpsi1,gamma,self.variances,Z) +\
+                np.einsum('nmo,nmoq->nq',dL_dpsi2,_dpsi2_dmu)
+            grad_S = np.einsum('n,nq,q->nq',dL_dpsi0,gamma,self.variances) + np.einsum('nmo,nmoq->nq',dL_dpsi2,_dpsi2_dS)
+            return grad_mu, grad_S, grad_gamma
+        else:
+            grad_mu, grad_S = np.zeros(variational_posterior.mean.shape), np.zeros(variational_posterior.mean.shape)
+            # psi0
+            grad_mu += dL_dpsi0[:, None] * (2.0 * variational_posterior.mean * self.variances)
+            grad_S += dL_dpsi0[:, None] * self.variances
+            # psi1
+            grad_mu += (dL_dpsi1[:, :, None] * (Z * self.variances)).sum(1)
+            # psi2
+            self._weave_dpsi2_dmuS(dL_dpsi2, Z, variational_posterior, grad_mu, grad_S)
+            return grad_mu, grad_S

     #--------------------------------------------------#
     #            Helpers for psi statistics             #
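Aside: the einsum expressions above compress sums such as psi1[n,m] = sum_q gamma[n,q] * variance[q] * Z[m,q] * mu[n,q]. A quick self-check of that correspondence in plain NumPy (random shapes, illustrative names following the psi_comp comments):

    import numpy as np

    N, M, Q = 3, 4, 5
    gamma, mu = np.random.rand(N, Q), np.random.randn(N, Q)
    Z, variance = np.random.randn(M, Q), np.random.rand(Q)

    psi1 = np.einsum('nq,q,mq,nq->nm', gamma, variance, Z, mu)
    psi1_explicit = ((gamma * variance * mu)[:, None, :] * Z[None, :, :]).sum(-1)
    assert np.allclose(psi1, psi1_explicit)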


@ -1,10 +1,12 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
from kern import Kern
import numpy as np import numpy as np
from kern import CombinationKernel
from ...util.caching import Cache_this
import itertools
class Prod(Kern): class Prod(CombinationKernel):
""" """
Computes the product of 2 kernels Computes the product of 2 kernels
@ -15,34 +17,31 @@ class Prod(Kern):
:rtype: kernel object :rtype: kernel object
""" """
def __init__(self, k1, k2, tensor=False,name=None): def __init__(self, kernels, name='prod'):
if tensor: super(Prod, self).__init__(kernels, name)
name = k1.name + '_xx_' + k2.name if name is None else name
super(Prod, self).__init__(k1.input_dim + k2.input_dim, name)
self.slice1 = slice(0,k1.input_dim)
self.slice2 = slice(k1.input_dim,k1.input_dim+k2.input_dim)
else:
assert k1.input_dim == k2.input_dim, "Error: The input spaces of the kernels to multiply don't have the same dimension."
name = k1.name + '_x_' + k2.name if name is None else name
super(Prod, self).__init__(k1.input_dim, name)
self.slice1 = slice(0, self.input_dim)
self.slice2 = slice(0, self.input_dim)
self.k1 = k1
self.k2 = k2
self.add_parameters(self.k1, self.k2)
def K(self, X, X2=None): @Cache_this(limit=2, force_kwargs=['which_parts'])
if X2 is None: def K(self, X, X2=None, which_parts=None):
return self.k1.K(X[:,self.slice1], None) * self.k2.K(X[:,self.slice2], None) assert X.shape[1] == self.input_dim
else: if which_parts is None:
return self.k1.K(X[:,self.slice1], X2[:,self.slice1]) * self.k2.K(X[:,self.slice2], X2[:,self.slice2]) which_parts = self.parts
elif not isinstance(which_parts, (list, tuple)):
# if only one part is given
which_parts = [which_parts]
return reduce(np.multiply, (p.K(X, X2) for p in which_parts))
def Kdiag(self, X): @Cache_this(limit=2, force_kwargs=['which_parts'])
return self.k1.Kdiag(X[:,self.slice1]) * self.k2.Kdiag(X[:,self.slice2]) def Kdiag(self, X, which_parts=None):
assert X.shape[1] == self.input_dim
if which_parts is None:
which_parts = self.parts
return reduce(np.multiply, (p.Kdiag(X) for p in which_parts))
def update_gradients_full(self, dL_dK, X): def update_gradients_full(self, dL_dK, X):
self.k1.update_gradients_full(dL_dK*self.k2.K(X[:,self.slice2]), X[:,self.slice1]) for k1,k2 in itertools.combinations(self.parts, 2):
self.k2.update_gradients_full(dL_dK*self.k1.K(X[:,self.slice1]), X[:,self.slice2]) k1._sliced_X = k1._sliced_X2 = k2._sliced_X = k2._sliced_X2 = True
k1.update_gradients_full(dL_dK*k2.K(X, X), X)
k2.update_gradients_full(dL_dK*k1.K(X, X), X)
def gradients_X(self, dL_dK, X, X2=None): def gradients_X(self, dL_dK, X, X2=None):
target = np.zeros(X.shape) target = np.zeros(X.shape)
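The rewritten Prod.K builds the covariance with reduce(np.multiply, ...) over the selected parts, i.e. the product kernel's Gram matrix is the element-wise product of the parts' Gram matrices. A small sketch of that identity, with plain arrays standing in for each part's K(X, X2):

    import numpy as np

    # toy Gram matrices standing in for p.K(X, X2) of two parts
    parts_K = [np.ones((4, 4)) * 2.0, np.arange(16.).reshape(4, 4) + 1.]
    K_prod = reduce(np.multiply, parts_K)  # built-in reduce, as in the Python 2 code above
    assert np.allclose(K_prod, parts_K[0] * parts_K[1])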

View file

@ -0,0 +1,51 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
The package for the Psi statistics computation of the linear kernel for SSGPLVM
"""
import numpy as np
from GPy.util.caching import Cache_this
#@Cache_this(limit=1)
def _psi2computations(variance, Z, mu, S, gamma):
"""
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
"""
# here are the "statistics" for psi1 and psi2
# Produced intermediate results:
# _psi2 NxMxM
# _psi2_dvariance NxMxMxQ
# _psi2_dZ NxMxQ
# _psi2_dgamma NxMxMxQ
# _psi2_dmu NxMxMxQ
# _psi2_dS NxMxMxQ
mu2 = np.square(mu)
gamma2 = np.square(gamma)
variance2 = np.square(variance)
mu2S = mu2+S # NxQ
common_sum = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu) # NxM
_dpsi2_dvariance = np.einsum('nq,q,mq,oq->nmoq',2.*(gamma*mu2S-gamma2*mu2),variance,Z,Z)+\
np.einsum('nq,mq,nq,no->nmoq',gamma,Z,mu,common_sum)+\
np.einsum('nq,oq,nq,nm->nmoq',gamma,Z,mu,common_sum)
_dpsi2_dgamma = np.einsum('q,mq,oq,nq->nmoq',variance2,Z,Z,(mu2S-2.*gamma*mu2))+\
np.einsum('q,mq,nq,no->nmoq',variance,Z,mu,common_sum)+\
np.einsum('q,oq,nq,nm->nmoq',variance,Z,mu,common_sum)
_dpsi2_dmu = np.einsum('q,mq,oq,nq,nq->nmoq',variance2,Z,Z,mu,2.*(gamma-gamma2))+\
np.einsum('nq,q,mq,no->nmoq',gamma,variance,Z,common_sum)+\
np.einsum('nq,q,oq,nm->nmoq',gamma,variance,Z,common_sum)
_dpsi2_dS = np.einsum('nq,q,mq,oq->nmoq',gamma,variance2,Z,Z)
_dpsi2_dZ = 2.*(np.einsum('nq,q,mq,nq->nmq',gamma,variance2,Z,mu2S)+np.einsum('nq,q,nq,nm->nmq',gamma,variance,mu,common_sum)
-np.einsum('nq,q,mq,nq->nmq',gamma2,variance2,Z,mu2))
return _dpsi2_dvariance, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _dpsi2_dZ
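A quick shape check for the derivative tensors returned above, matching the comment block at the top of the function (N data points, M inducing points, Q latent dimensions). This sketch assumes _psi2computations as defined above is in scope:

    import numpy as np

    N, M, Q = 5, 3, 2
    variance = np.random.rand(Q)
    Z = np.random.randn(M, Q)
    mu = np.random.randn(N, Q)
    S = np.random.rand(N, Q)
    gamma = np.random.rand(N, Q)

    dvar, dgamma, dmu, dS, dZ = _psi2computations(variance, Z, mu, S, gamma)
    assert dvar.shape == dgamma.shape == dmu.shape == dS.shape == (N, M, M, Q)
    assert dZ.shape == (N, M, Q)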

View file

@ -8,7 +8,7 @@ from ...util.misc import param_to_array
from stationary import Stationary from stationary import Stationary
from GPy.util.caching import Cache_this from GPy.util.caching import Cache_this
from ...core.parameterization import variational from ...core.parameterization import variational
from rbf_psi_comp import ssrbf_psi_comp from psi_comp import ssrbf_psi_comp
class RBF(Stationary): class RBF(Stationary):
""" """
@ -19,7 +19,6 @@ class RBF(Stationary):
k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg) k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg)
""" """
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, name='rbf'): def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, name='rbf'):
super(RBF, self).__init__(input_dim, variance, lengthscale, ARD, name) super(RBF, self).__init__(input_dim, variance, lengthscale, ARD, name)
self.weave_options = {} self.weave_options = {}
@ -67,7 +66,6 @@ class RBF(Stationary):
else: else:
self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).sum() self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).sum()
#from psi2 #from psi2
self.variance.gradient += (dL_dpsi2 * _dpsi2_dvariance).sum() self.variance.gradient += (dL_dpsi2 * _dpsi2_dvariance).sum()
if self.ARD: if self.ARD:
@ -76,11 +74,14 @@ class RBF(Stationary):
self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).sum() self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).sum()
elif isinstance(variational_posterior, variational.NormalPosterior): elif isinstance(variational_posterior, variational.NormalPosterior):
l2 = self.lengthscale**2
l2 = self.lengthscale **2 if l2.size != self.input_dim:
l2 = l2*np.ones(self.input_dim)
#contributions from psi0: #contributions from psi0:
self.variance.gradient = np.sum(dL_dpsi0) self.variance.gradient = np.sum(dL_dpsi0)
if self._debug:
num_grad = self.lengthscale.gradient.copy()
self.lengthscale.gradient = 0. self.lengthscale.gradient = 0.
#from psi1 #from psi1
@ -92,16 +93,16 @@ class RBF(Stationary):
else: else:
self.lengthscale.gradient += dpsi1_dlength.sum() self.lengthscale.gradient += dpsi1_dlength.sum()
self.variance.gradient += np.sum(dL_dpsi1 * psi1) / self.variance self.variance.gradient += np.sum(dL_dpsi1 * psi1) / self.variance
#from psi2 #from psi2
S = variational_posterior.variance S = variational_posterior.variance
_, Zdist_sq, _, mudist_sq, psi2 = self._psi2computations(Z, variational_posterior) _, Zdist_sq, _, mudist_sq, psi2 = self._psi2computations(Z, variational_posterior)
if not self.ARD: if not self.ARD:
self.lengthscale.gradient += self._weave_psi2_lengthscale_grads(dL_dpsi2, psi2, Zdist_sq, S, mudist_sq, l2).sum() self.lengthscale.gradient += self._weave_psi2_lengthscale_grads(dL_dpsi2, psi2, Zdist_sq, S, mudist_sq, l2).sum()
else: else:
self.lengthscale.gradient += self._weave_psi2_lengthscale_grads(dL_dpsi2, psi2, Zdist_sq, S, mudist_sq, l2) self.lengthscale.gradient += self._weave_psi2_lengthscale_grads(dL_dpsi2, psi2, Zdist_sq, S, mudist_sq, l2)
if self._debug:
import ipdb;ipdb.set_trace()
self.variance.gradient += 2.*np.sum(dL_dpsi2 * psi2)/self.variance self.variance.gradient += 2.*np.sum(dL_dpsi2 * psi2)/self.variance
else: else:
@ -122,7 +123,6 @@ class RBF(Stationary):
return grad return grad
elif isinstance(variational_posterior, variational.NormalPosterior): elif isinstance(variational_posterior, variational.NormalPosterior):
l2 = self.lengthscale **2 l2 = self.lengthscale **2
#psi1 #psi1
@ -153,6 +153,7 @@ class RBF(Stationary):
grad_mu = (dL_dpsi1[:, :, None] * _dpsi1_dmu).sum(axis=1) grad_mu = (dL_dpsi1[:, :, None] * _dpsi1_dmu).sum(axis=1)
grad_S = (dL_dpsi1[:, :, None] * _dpsi1_dS).sum(axis=1) grad_S = (dL_dpsi1[:, :, None] * _dpsi1_dS).sum(axis=1)
grad_gamma = (dL_dpsi1[:,:,None] * _dpsi1_dgamma).sum(axis=1) grad_gamma = (dL_dpsi1[:,:,None] * _dpsi1_dgamma).sum(axis=1)
#psi2 #psi2
grad_mu += (dL_dpsi2[:, :, :, None] * _dpsi2_dmu).reshape(ndata,-1,self.input_dim).sum(axis=1) grad_mu += (dL_dpsi2[:, :, :, None] * _dpsi2_dmu).reshape(ndata,-1,self.input_dim).sum(axis=1)
grad_S += (dL_dpsi2[:, :, :, None] * _dpsi2_dS).reshape(ndata,-1,self.input_dim).sum(axis=1) grad_S += (dL_dpsi2[:, :, :, None] * _dpsi2_dS).reshape(ndata,-1,self.input_dim).sum(axis=1)
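One change in this hunk tiles the squared lengthscale up to input_dim when ARD is off, because the psi-statistic gradients index it per dimension. A tiny sketch of that broadcast:

    import numpy as np

    input_dim = 3
    lengthscale = np.array([0.7])  # non-ARD: a single shared lengthscale
    l2 = lengthscale ** 2
    if l2.size != input_dim:
        l2 = l2 * np.ones(input_dim)
    assert l2.shape == (input_dim,)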

View file

@ -7,7 +7,7 @@ import numpy as np
from ...util.linalg import tdot from ...util.linalg import tdot
from ...util.config import * from ...util.config import *
from stationary import Stationary from stationary import Stationary
from rbf_psi_comp import ssrbf_psi_comp from psi_comp import ssrbf_psi_comp
class SSRBF(Stationary): class SSRBF(Stationary):
""" """

View file

@ -55,7 +55,7 @@ class White(Static):
def psi2(self, Z, variational_posterior): def psi2(self, Z, variational_posterior):
return np.zeros((variational_posterior.shape[0], Z.shape[0], Z.shape[0]), dtype=np.float64) return np.zeros((variational_posterior.shape[0], Z.shape[0], Z.shape[0]), dtype=np.float64)
def update_gradients_full(self, dL_dK, X): def update_gradients_full(self, dL_dK, X, X2=None):
self.variance.gradient = np.trace(dL_dK) self.variance.gradient = np.trace(dL_dK)
def update_gradients_diag(self, dL_dKdiag, X): def update_gradients_diag(self, dL_dKdiag, X):
@ -79,13 +79,41 @@ class Bias(Static):
self.variance.gradient = dL_dK.sum() self.variance.gradient = dL_dK.sum()
def update_gradients_diag(self, dL_dKdiag, X): def update_gradients_diag(self, dL_dKdiag, X):
self.variance.gradient = dL_dK.sum() self.variance.gradient = dL_dKdiag.sum()
def psi2(self, Z, variational_posterior): def psi2(self, Z, variational_posterior):
ret = np.empty((mu.shape[0], Z.shape[0], Z.shape[0]), dtype=np.float64) ret = np.empty((variational_posterior.shape[0], Z.shape[0], Z.shape[0]), dtype=np.float64)
ret[:] = self.variance**2 ret[:] = self.variance**2
return ret return ret
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
self.variance.gradient = dL_dpsi0.sum() + dL_dpsi1.sum() + 2.*self.variance*dL_dpsi2.sum() self.variance.gradient = dL_dpsi0.sum() + dL_dpsi1.sum() + 2.*self.variance*dL_dpsi2.sum()
class Fixed(Static):
def __init__(self, input_dim, covariance_matrix, variance=1., name='fixed'):
"""
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
:param covariance_matrix: the precomputed covariance matrix, scaled by the variance
:type covariance_matrix: np.ndarray
"""
super(Fixed, self).__init__(input_dim, variance, name)
self.fixed_K = covariance_matrix
def K(self, X, X2=None):
return self.variance * self.fixed_K
def Kdiag(self, X):
return self.variance * np.diag(self.fixed_K)
def update_gradients_full(self, dL_dK, X, X2=None):
self.variance.gradient = np.einsum('ij,ij', dL_dK, self.fixed_K)
def update_gradients_diag(self, dL_dKdiag, X):
self.variance.gradient = np.einsum('i,ii', dL_dKdiag, self.fixed_K)
def psi2(self, Z, variational_posterior):
return np.zeros((variational_posterior.shape[0], Z.shape[0], Z.shape[0]), dtype=np.float64)
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
self.variance.gradient = dL_dpsi0.sum()
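A hypothetical usage sketch for the new Fixed kernel: a precomputed covariance matrix scaled by a single variance parameter. The constructor follows the definition above; the GPy.kern.Fixed export path is an assumption and may require importing the class from the static module directly.

    import numpy as np
    import GPy

    N = 20
    K_pre = np.eye(N) + 0.1 * np.ones((N, N))       # any precomputed positive-definite matrix
    k = GPy.kern.Fixed(1, covariance_matrix=K_pre)  # assumed export path
    X = np.arange(N)[:, None]                       # inputs are placeholders for a fixed kernel
    assert np.allclose(k.K(X, X), k.variance * K_pre)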

View file

@ -85,12 +85,14 @@ class Stationary(Kern):
Compute the Euclidean distance between each row of X and X2, or between Compute the Euclidean distance between each row of X and X2, or between
each pair of rows of X if X2 is None. each pair of rows of X if X2 is None.
""" """
#X, = self._slice_X(X)
if X2 is None: if X2 is None:
Xsq = np.sum(np.square(X),1) Xsq = np.sum(np.square(X),1)
r2 = -2.*tdot(X) + (Xsq[:,None] + Xsq[None,:]) r2 = -2.*tdot(X) + (Xsq[:,None] + Xsq[None,:])
util.diag.view(r2)[:,]= 0. # force diagonal to be zero: sometimes numerically a little negative util.diag.view(r2)[:,]= 0. # force diagonal to be zero: sometimes numerically a little negative
return np.sqrt(r2) return np.sqrt(r2)
else: else:
#X2, = self._slice_X(X2)
X1sq = np.sum(np.square(X),1) X1sq = np.sum(np.square(X),1)
X2sq = np.sum(np.square(X2),1) X2sq = np.sum(np.square(X2),1)
return np.sqrt(-2.*np.dot(X, X2.T) + (X1sq[:,None] + X2sq[None,:])) return np.sqrt(-2.*np.dot(X, X2.T) + (X1sq[:,None] + X2sq[None,:]))
@ -124,7 +126,6 @@ class Stationary(Kern):
self.lengthscale.gradient = 0. self.lengthscale.gradient = 0.
def update_gradients_full(self, dL_dK, X, X2=None): def update_gradients_full(self, dL_dK, X, X2=None):
self.variance.gradient = np.einsum('ij,ij,i', self.K(X, X2), dL_dK, 1./self.variance) self.variance.gradient = np.einsum('ij,ij,i', self.K(X, X2), dL_dK, 1./self.variance)
#now the lengthscale gradient(s) #now the lengthscale gradient(s)
@ -136,7 +137,7 @@ class Stationary(Kern):
#self.lengthscale.gradient = -((dL_dr*rinv)[:,:,None]*x_xl3).sum(0).sum(0)/self.lengthscale**3 #self.lengthscale.gradient = -((dL_dr*rinv)[:,:,None]*x_xl3).sum(0).sum(0)/self.lengthscale**3
tmp = dL_dr*self._inv_dist(X, X2) tmp = dL_dr*self._inv_dist(X, X2)
if X2 is None: X2 = X if X2 is None: X2 = X
self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in xrange(self.input_dim)]) self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(self._slice_X(X)[:,q:q+1] - self._slice_X(X2)[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in xrange(self.input_dim)])
else: else:
r = self._scaled_dist(X, X2) r = self._scaled_dist(X, X2)
self.lengthscale.gradient = -np.sum(dL_dr*r)/self.lengthscale self.lengthscale.gradient = -np.sum(dL_dr*r)/self.lengthscale
@ -176,7 +177,6 @@ class Stationary(Kern):
ret = np.empty(X.shape, dtype=np.float64) ret = np.empty(X.shape, dtype=np.float64)
[np.einsum('ij,ij->i', tmp, X[:,q][:,None]-X2[:,q][None,:], out=ret[:,q]) for q in xrange(self.input_dim)] [np.einsum('ij,ij->i', tmp, X[:,q][:,None]-X2[:,q][None,:], out=ret[:,q]) for q in xrange(self.input_dim)]
ret /= self.lengthscale**2 ret /= self.lengthscale**2
return ret return ret
def gradients_X_diag(self, dL_dKdiag, X): def gradients_X_diag(self, dL_dKdiag, X):
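The distance computation near the top of this file relies on the expansion ||x_i - x_j||^2 = ||x_i||^2 + ||x_j||^2 - 2 x_i . x_j, with the diagonal clamped to zero as the comment notes. A standalone sketch of that identity:

    import numpy as np

    X = np.random.randn(6, 3)
    Xsq = np.sum(np.square(X), 1)
    r2 = -2. * X.dot(X.T) + (Xsq[:, None] + Xsq[None, :])
    np.fill_diagonal(r2, 0.)  # guard against tiny negative values from cancellation
    r2_explicit = np.square(X[:, None, :] - X[None, :, :]).sum(-1)
    assert np.allclose(r2, r2_explicit, atol=1e-10)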

View file

@ -5,7 +5,6 @@ try:
from sympy.utilities.lambdify import lambdify from sympy.utilities.lambdify import lambdify
except ImportError: except ImportError:
sympy_available=False sympy_available=False
exit()
import numpy as np import numpy as np
from kern import Kern from kern import Kern

View file

@ -358,7 +358,7 @@ class Likelihood(Parameterized):
return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta return dlogpdf_dtheta, dlogpdf_df_dtheta, d2logpdf_df2_dtheta
def predictive_values(self, mu, var, full_cov=False, sampling=False, num_samples=10000): def predictive_values(self, mu, var, full_cov=False, sampling=True, num_samples=10000):
""" """
Compute mean, variance and confidence interval (percentiles 5 and 95) of the prediction. Compute mean, variance and confidence interval (percentiles 5 and 95) of the prediction.

View file

@ -5,15 +5,15 @@ import numpy as np
import itertools import itertools
import pylab import pylab
from ..core import Model, SparseGP from ..core import Model
from ..util.linalg import PCA from ..util.linalg import PCA
from ..kern import Kern from ..kern import Kern
from bayesian_gplvm import BayesianGPLVM
from ..core.parameterization.variational import NormalPosterior, NormalPrior from ..core.parameterization.variational import NormalPosterior, NormalPrior
from ..inference.latent_function_inference.var_dtc import VarDTCMissingData from ..core.parameterization import Param, Parameterized
from ..likelihoods.gaussian import Gaussian from ..inference.latent_function_inference.var_dtc import VarDTCMissingData, VarDTC
from ..likelihoods import Gaussian
class MRD2(Model): class MRD(Model):
""" """
Apply MRD to all given datasets Y in Ylist. Apply MRD to all given datasets Y in Ylist.
@ -43,61 +43,110 @@ class MRD2(Model):
:param :class:`~GPy.inference.latent_function_inference` inference_method: the inference method to use :param :class:`~GPy.inference.latent_function_inference` inference_method: the inference method to use
:param :class:`~GPy.likelihoods.likelihood.Likelihood` likelihood: the likelihood to use :param :class:`~GPy.likelihoods.likelihood.Likelihood` likelihood: the likelihood to use
:param str name: the name of this model :param str name: the name of this model
:param [str] Ynames: the names for the datasets given; must have the same length as Ylist, or be None
""" """
def __init__(self, Ylist, input_dim, X=None, X_variance=None, def __init__(self, Ylist, input_dim, X=None, X_variance=None,
initx = 'PCA', initz = 'permute', initx = 'PCA', initz = 'permute',
num_inducing=10, Z=None, kernel=None, num_inducing=10, Z=None, kernel=None,
inference_method=None, likelihood=None, name='mrd'): inference_method=None, likelihood=None, name='mrd', Ynames=None):
super(MRD2, self).__init__(name) super(MRD, self).__init__(name)
# sort out the kernels # sort out the kernels
if kernel is None: if kernel is None:
from ..kern import RBF from ..kern import RBF
self.kern = [RBF(input_dim, ARD=1, name='Y_{}'.format(i)) for i in range(len(Ylist))] self.kern = [RBF(input_dim, ARD=1, name='rbf'.format(i)) for i in range(len(Ylist))]
elif isinstance(kernel, Kern): elif isinstance(kernel, Kern):
self.kern = [kernel.copy(name='Y_{}'.format(i)) for i in range(len(Ylist))] self.kern = [kernel.copy(name='{}'.format(kernel.name, i)) for i in range(len(Ylist))]
else: else:
assert len(kernel) == len(Ylist), "need one kernel per output" assert len(kernel) == len(Ylist), "need one kernel per output"
assert all([isinstance(k, Kern) for k in kernel]), "invalid kernel object detected!" assert all([isinstance(k, Kern) for k in kernel]), "invalid kernel object detected!"
self.kern = kernel
self.input_dim = input_dim self.input_dim = input_dim
self.num_inducing = num_inducing self.num_inducing = num_inducing
self.Ylist = Ylist
self._in_init_ = True self._in_init_ = True
X = self._init_X(initx, Ylist) X = self._init_X(initx, Ylist)
self.Z = self._init_Z(initz, X) self.Z = Param('inducing inputs', self._init_Z(initz, X))
self.num_inducing = self.Z.shape[0] # ensure M==N if M>N self.num_inducing = self.Z.shape[0] # ensure M==N if M>N
if X_variance is None: if X_variance is None:
X_variance = np.random.uniform(0,.2,X.shape) X_variance = np.random.uniform(0, .2, X.shape)
self.variational_prior = NormalPrior() self.variational_prior = NormalPrior()
self.X = NormalPosterior(X, X_variance) self.X = NormalPosterior(X, X_variance)
if likelihood is None: if likelihood is None:
likelihood = Gaussian() self.likelihood = [Gaussian(name='Gaussian_noise'.format(i)) for i in range(len(Ylist))]
else: self.likelihood = likelihood
if inference_method is None: if inference_method is None:
if any(np.any(np.isnan(y)) for y in Ylist): self.inference_method= []
self.inference_method = VarDTCMissingData(limit=len(Ylist)) for y in Ylist:
if np.any(np.isnan(y)):
self.inference_method.append(VarDTCMissingData(limit=1))
else:
self.inference_method.append(VarDTC(limit=1))
else:
self.inference_method = inference_method
self.inference_method.set_limit(len(Ylist))
self.Ylist = Ylist self.add_parameters(self.X, self.Z)
if Ynames is None:
Ynames = ['Y{}'.format(i) for i in range(len(Ylist))]
for i, n, k, l in itertools.izip(itertools.count(), Ynames, self.kern, self.likelihood):
p = Parameterized(name=n)
p.add_parameter(k)
p.add_parameter(l)
setattr(self, 'Y{}'.format(i), p)
self.add_parameter(p)
self._in_init_ = False
def parameters_changed(self): def parameters_changed(self):
for y in self.Ylist: self._log_marginal_likelihood = 0
pass self.posteriors = []
self.Z.gradient = 0.
self.X.mean.gradient = 0.
self.X.variance.gradient = 0.
def _init_X(self, init='PCA', likelihood_list=None): for y, k, l, i in itertools.izip(self.Ylist, self.kern, self.likelihood, self.inference_method):
if likelihood_list is None: posterior, lml, grad_dict = i.inference(k, self.X, self.Z, l, y)
likelihood_list = self.likelihood_list
Ylist = [] self.posteriors.append(posterior)
for likelihood_or_Y in likelihood_list: self._log_marginal_likelihood += lml
if type(likelihood_or_Y) is np.ndarray:
Ylist.append(likelihood_or_Y) # likelihood gradients
else: l.update_gradients(grad_dict.pop('partial_for_likelihood'))
Ylist.append(likelihood_or_Y.Y)
del likelihood_list #gradients wrt kernel
dL_dKmm = grad_dict.pop('dL_dKmm')
k.update_gradients_full(dL_dKmm, self.Z, None)
target = k.gradient.copy()
k.update_gradients_expectations(variational_posterior=self.X, Z=self.Z, **grad_dict)
k.gradient += target
#gradients wrt Z
self.Z.gradient += k.gradients_X(dL_dKmm, self.Z)
self.Z.gradient += k.gradients_Z_expectations(
grad_dict['dL_dpsi1'], grad_dict['dL_dpsi2'], Z=self.Z, variational_posterior=self.X)
dL_dmean, dL_dS = k.gradients_qX_expectations(variational_posterior=self.X, Z=self.Z, **grad_dict)
self.X.mean.gradient += dL_dmean
self.X.variance.gradient += dL_dS
# update for the KL divergence
self.variational_prior.update_gradients_KL(self.X)
self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
def log_likelihood(self):
return self._log_marginal_likelihood
def _init_X(self, init='PCA', Ylist=None):
if Ylist is None:
Ylist = self.Ylist
if init in "PCA_concat": if init in "PCA_concat":
X = PCA(np.hstack(Ylist), self.input_dim)[0] X = PCA(np.hstack(Ylist), self.input_dim)[0]
elif init in "PCA_single": elif init in "PCA_single":
@ -106,7 +155,6 @@ class MRD2(Model):
X[:, qs] = PCA(Y, len(qs))[0] X[:, qs] = PCA(Y, len(qs))[0]
else: # init == 'random': else: # init == 'random':
X = np.random.randn(Ylist[0].shape[0], self.input_dim) X = np.random.randn(Ylist[0].shape[0], self.input_dim)
self.X = X
return X return X
def _init_Z(self, init="permute", X=None): def _init_Z(self, init="permute", X=None):
@ -116,259 +164,8 @@ class MRD2(Model):
Z = np.random.permutation(X.copy())[:self.num_inducing] Z = np.random.permutation(X.copy())[:self.num_inducing]
elif init in "random": elif init in "random":
Z = np.random.randn(self.num_inducing, self.input_dim) * X.var() Z = np.random.randn(self.num_inducing, self.input_dim) * X.var()
self.Z = Z
return Z return Z
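A hypothetical end-to-end sketch of the reworked MRD class above: two views of the same N points share one latent space. Registering the class as GPy.models.MRD and the optimize call are assumptions about how it is exposed, not part of this diff.

    import numpy as np
    import GPy

    N, Q = 50, 4
    X_true = np.random.randn(N, Q)
    Ylist = [X_true.dot(np.random.randn(Q, 8)),  # view 1 with 8 output dimensions
             X_true.dot(np.random.randn(Q, 6))]  # view 2 with 6 output dimensions
    m = GPy.models.MRD(Ylist, input_dim=Q, num_inducing=10)  # assumed registration under GPy.models
    m.optimize(messages=False, max_iters=10)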
class MRD(Model):
"""
Do MRD on given Datasets in Ylist.
All Ys in likelihood_list are in [N x Dn], where Dn can be different per Yn,
N must be shared across datasets though.
:param likelihood_list: list of observed datasets (:py:class:`~GPy.likelihoods.gaussian.Gaussian` if not supplied directly)
:type likelihood_list: [:py:class:`~GPy.likelihoods.likelihood.likelihood` | :py:class:`ndarray`]
:param names: names for different gplvm models
:type names: [str]
:param input_dim: latent dimensionality
:type input_dim: int
:param initx: initialisation method for the latent space :
* 'concat' - PCA on concatenation of all datasets
* 'single' - Concatenation of PCA on datasets, respectively
* 'random' - Random draw from a normal
:type initx: ['concat'|'single'|'random']
:param initz: initialisation method for inducing inputs
:type initz: 'permute'|'random'
:param X: Initial latent space
:param X_variance: Initial latent space variance
:param Z: initial inducing inputs
:param num_inducing: number of inducing inputs to use
:param kernels: list of kernels or kernel shared for all BGPLVMS
:type kernels: [GPy.kern.kern] | GPy.kern.kern | None (default)
"""
def __init__(self, likelihood_or_Y_list, input_dim, num_inducing=10, names=None,
kernels=None, initx='PCA',
initz='permute', _debug=False, **kw):
if names is None:
self.names = ["{}".format(i) for i in range(len(likelihood_or_Y_list))]
# sort out the kernels
if kernels is None:
kernels = [None] * len(likelihood_or_Y_list)
elif isinstance(kernels, Kern):
kernels = [kernels.copy() for i in range(len(likelihood_or_Y_list))]
else:
assert len(kernels) == len(likelihood_or_Y_list), "need one kernel per output"
assert all([isinstance(k, Kern) for k in kernels]), "invalid kernel object detected!"
assert not ('kernel' in kw), "pass kernels through `kernels` argument"
self.input_dim = input_dim
self._debug = _debug
self.num_inducing = num_inducing
self._in_init_ = True
X = self._init_X(initx, likelihood_or_Y_list)
Z = self._init_Z(initz, X)
self.num_inducing = Z.shape[0] # ensure M==N if M>N
self.bgplvms = [BayesianGPLVM(l, input_dim=input_dim, kernel=k, X=X, Z=Z, num_inducing=self.num_inducing, **kw) for l, k in zip(likelihood_or_Y_list, kernels)]
del self._in_init_
self.gref = self.bgplvms[0]
nparams = np.array([0] + [SparseGP._get_params(g).size - g.Z.size for g in self.bgplvms])
self.nparams = nparams.cumsum()
self.num_data = self.gref.num_data
self.NQ = self.num_data * self.input_dim
self.MQ = self.num_inducing * self.input_dim
Model.__init__(self)
self.ensure_default_constraints()
def _getstate(self):
return Model._getstate(self) + [self.names,
self.bgplvms,
self.gref,
self.nparams,
self.input_dim,
self.num_inducing,
self.num_data,
self.NQ,
self.MQ]
def _setstate(self, state):
self.MQ = state.pop()
self.NQ = state.pop()
self.num_data = state.pop()
self.num_inducing = state.pop()
self.input_dim = state.pop()
self.nparams = state.pop()
self.gref = state.pop()
self.bgplvms = state.pop()
self.names = state.pop()
Model._setstate(self, state)
@property
def X(self):
return self.gref.X
@X.setter
def X(self, X):
try:
self.propagate_param(X=X)
except AttributeError:
if not self._in_init_:
raise AttributeError("bgplvm list not initialized")
@property
def Z(self):
return self.gref.Z
@Z.setter
def Z(self, Z):
try:
self.propagate_param(Z=Z)
except AttributeError:
if not self._in_init_:
raise AttributeError("bgplvm list not initialized")
@property
def X_variance(self):
return self.gref.X_variance
@X_variance.setter
def X_variance(self, X_var):
try:
self.propagate_param(X_variance=X_var)
except AttributeError:
if not self._in_init_:
raise AttributeError("bgplvm list not initialized")
@property
def likelihood_list(self):
return [g.likelihood.Y for g in self.bgplvms]
@likelihood_list.setter
def likelihood_list(self, likelihood_list):
for g, Y in itertools.izip(self.bgplvms, likelihood_list):
g.likelihood.Y = Y
@property
def auto_scale_factor(self):
"""
set auto_scale_factor for all gplvms
:param b: auto_scale_factor
:type b:
"""
return self.gref.auto_scale_factor
@auto_scale_factor.setter
def auto_scale_factor(self, b):
self.propagate_param(auto_scale_factor=b)
def propagate_param(self, **kwargs):
for key, val in kwargs.iteritems():
for g in self.bgplvms:
g.__setattr__(key, val)
def randomize(self, initx='concat', initz='permute', *args, **kw):
super(MRD, self).randomize(*args, **kw)
self._init_X(initx, self.likelihood_list)
self._init_Z(initz, self.X)
#def _get_latent_param_names(self):
def _get_param_names(self):
n1 = self.gref._get_param_names()
n1var = n1[:self.NQ * 2 + self.MQ]
# return n1var
#
#def _get_kernel_names(self):
map_names = lambda ns, name: map(lambda x: "{1}_{0}".format(*x),
itertools.izip(ns,
itertools.repeat(name)))
return list(itertools.chain(n1var, *(map_names(\
SparseGP._get_param_names(g)[self.MQ:], n) \
for g, n in zip(self.bgplvms, self.names))))
# kernel_names = (map_names(SparseGP._get_param_names(g)[self.MQ:], n) for g, n in zip(self.bgplvms, self.names))
# return kernel_names
#def _get_param_names(self):
# X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
# S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
# n1var = self._get_latent_param_names()
# kernel_names = self._get_kernel_names()
# return list(itertools.chain(n1var, *kernel_names))
#def _get_print_names(self):
# return list(itertools.chain(*self._get_kernel_names()))
def _get_params(self):
"""
return parameter list containing private and shared parameters as follows:
=================================================================
| mu | S | Z || theta1 | theta2 | .. | thetaN |
=================================================================
"""
X = self.gref.X.ravel()
X_var = self.gref.X_variance.ravel()
Z = self.gref.Z.ravel()
thetas = [SparseGP._get_params(g)[g.Z.size:] for g in self.bgplvms]
params = np.hstack([X, X_var, Z, np.hstack(thetas)])
return params
# def _set_var_params(self, g, X, X_var, Z):
# g.X = X.reshape(self.num_data, self.input_dim)
# g.X_variance = X_var.reshape(self.num_data, self.input_dim)
# g.Z = Z.reshape(self.num_inducing, self.input_dim)
#
# def _set_kern_params(self, g, p):
# g.kern._set_params(p[:g.kern.num_params])
# g.likelihood._set_params(p[g.kern.num_params:])
def _set_params(self, x):
start = 0; end = self.NQ
X = x[start:end]
start = end; end += start
X_var = x[start:end]
start = end; end += self.MQ
Z = x[start:end]
thetas = x[end:]
# set params for all:
for g, s, e in itertools.izip(self.bgplvms, self.nparams, self.nparams[1:]):
g._set_params(np.hstack([X, X_var, Z, thetas[s:e]]))
# self._set_var_params(g, X, X_var, Z)
# self._set_kern_params(g, thetas[s:e].copy())
# g._compute_kernel_matrices()
# if self.auto_scale_factor:
# g.scale_factor = np.sqrt(g.psi2.sum(0).mean() * g.likelihood.precision)
# # self.scale_factor = np.sqrt(self.psi2.sum(0).mean() * self.likelihood.precision)
# g._computations()
def update_likelihood_approximation(self): # TODO: object oriented vs script base
for bgplvm in self.bgplvms:
bgplvm.update_likelihood_approximation()
def log_likelihood(self):
ll = -self.gref.KL_divergence()
for g in self.bgplvms:
ll += SparseGP.log_likelihood(g)
return ll
def _log_likelihood_gradients(self):
dLdmu, dLdS = reduce(lambda a, b: [a[0] + b[0], a[1] + b[1]], (g.dL_dmuS() for g in self.bgplvms))
dKLmu, dKLdS = self.gref.dKL_dmuS()
dLdmu -= dKLmu
dLdS -= dKLdS
dLdmuS = np.hstack((dLdmu.flatten(), dLdS.flatten())).flatten()
dldzt1 = reduce(lambda a, b: a + b, (SparseGP._log_likelihood_gradients(g)[:self.MQ] for g in self.bgplvms))
return np.hstack((dLdmuS,
dldzt1,
np.hstack([np.hstack([g.dL_dtheta(),
g.likelihood._gradients(\
partial=g.partial_for_likelihood)]) \
for g in self.bgplvms])))
def _handle_plotting(self, fignum, axes, plotf, sharex=False, sharey=False): def _handle_plotting(self, fignum, axes, plotf, sharex=False, sharey=False):
if axes is None: if axes is None:
fig = pylab.figure(num=fignum) fig = pylab.figure(num=fignum)

View file

@ -68,7 +68,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
#work out what the inputs are for plotting (1D or 2D) #work out what the inputs are for plotting (1D or 2D)
fixed_dims = np.array([i for i,v in fixed_inputs]) fixed_dims = np.array([i for i,v in fixed_inputs])
free_dims = np.setdiff1d(np.arange(model.input_dim),fixed_dims) free_dims = np.setdiff1d(np.arange(model.input_dim),fixed_dims)
plots = {}
#one dimensional plotting #one dimensional plotting
if len(free_dims) == 1: if len(free_dims) == 1:
@ -96,20 +96,20 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
m, v, lower, upper = model.predict(Xgrid,full_cov=False) m, v, lower, upper = model.predict(Xgrid,full_cov=False)
Y = Y Y = Y
for d in which_data_ycols: for d in which_data_ycols:
gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], ax=ax, edgecol=linecol, fillcol=fillcol) plots['gpplot'] = gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], ax=ax, edgecol=linecol, fillcol=fillcol)
ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], 'kx', mew=1.5) plots['dataplot'] = ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], 'kx', mew=1.5)
#optionally plot some samples #optionally plot some samples
if samples: #NOTE not tested with fixed_inputs if samples: #NOTE not tested with fixed_inputs
Ysim = model.posterior_samples(Xgrid, samples) Ysim = model.posterior_samples(Xgrid, samples)
for yi in Ysim.T: for yi in Ysim.T:
ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25) plots['posterior_samples'] = ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
#ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs. #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
#add error bars for uncertain (if input uncertainty is being modelled) #add error bars for uncertain (if input uncertainty is being modelled)
if hasattr(model,"has_uncertain_inputs") and model.has_uncertain_inputs(): if hasattr(model,"has_uncertain_inputs") and model.has_uncertain_inputs():
ax.errorbar(X[which_data_rows, free_dims].flatten(), Y[which_data_rows, which_data_ycols].flatten(), plots['xerrorbar'] = ax.errorbar(X[which_data_rows, free_dims].flatten(), Y[which_data_rows, which_data_ycols].flatten(),
xerr=2 * np.sqrt(X_variance[which_data_rows, free_dims].flatten()), xerr=2 * np.sqrt(X_variance[which_data_rows, free_dims].flatten()),
ecolor='k', fmt=None, elinewidth=.5, alpha=.5) ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
@ -125,7 +125,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
#Zu = model.Z[:,free_dims] * model._Xscale[:,free_dims] + model._Xoffset[:,free_dims] #Zu = model.Z[:,free_dims] * model._Xscale[:,free_dims] + model._Xoffset[:,free_dims]
Zu = Z[:,free_dims] Zu = Z[:,free_dims]
z_height = ax.get_ylim()[0] z_height = ax.get_ylim()[0]
ax.plot(Zu, np.zeros_like(Zu) + z_height, 'r|', mew=1.5, markersize=12) plots['inducing_inputs'] = ax.plot(Zu, np.zeros_like(Zu) + z_height, 'r|', mew=1.5, markersize=12)
@ -150,8 +150,8 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
Y = Y Y = Y
for d in which_data_ycols: for d in which_data_ycols:
m_d = m[:,d].reshape(resolution, resolution).T m_d = m[:,d].reshape(resolution, resolution).T
ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) plots['contour'] = ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
#set the limits of the plot to some sensible values #set the limits of the plot to some sensible values
ax.set_xlim(xmin[0], xmax[0]) ax.set_xlim(xmin[0], xmax[0])
@ -164,11 +164,11 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
if hasattr(model,"Z"): if hasattr(model,"Z"):
#Zu = model.Z[:,free_dims] * model._Xscale[:,free_dims] + model._Xoffset[:,free_dims] #Zu = model.Z[:,free_dims] * model._Xscale[:,free_dims] + model._Xoffset[:,free_dims]
Zu = Z[:,free_dims] Zu = Z[:,free_dims]
ax.plot(Zu[:,free_dims[0]], Zu[:,free_dims[1]], 'wo') plots['inducing_inputs'] = ax.plot(Zu[:,free_dims[0]], Zu[:,free_dims[1]], 'wo')
else: else:
raise NotImplementedError, "Cannot define a frame with more than two input dimensions" raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
return plots
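Since plot_fit now returns the dictionary of matplotlib handles it created, callers can restyle individual elements after plotting. A hedged sketch for the one-dimensional case; the GPRegression model class is assumed here, and the dictionary keys follow the assignments above:

    import numpy as np
    import GPy

    X = np.random.rand(30, 1)
    Y = np.sin(6 * X) + 0.1 * np.random.randn(30, 1)
    m = GPy.models.GPRegression(X, Y)       # assumed model class
    plots = m.plot()                        # routed through plot_fit above, returns the handle dict
    plots['dataplot'][0].set_markersize(8)  # ax.plot returns a list of Line2D handles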
def plot_fit_f(model, *args, **kwargs): def plot_fit_f(model, *args, **kwargs):
""" """

View file

@ -44,3 +44,48 @@ def plot(parameterized, fignum=None, ax=None, colors=None):
pb.draw() pb.draw()
fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95)) fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
return fig return fig
def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None):
"""
Plot latent space X in 1D:
- if fig is given, create input_dim subplots in fig and plot in these
- if ax is given plot input_dim 1D latent space plots of X into each `axis`
- if neither fig nor ax is given create a figure with fignum and plot in there
colors:
colors of different latent space dimensions input_dim
"""
if ax is None:
fig = pb.figure(num=fignum, figsize=(8, min(12, (2 * parameterized.mean.shape[1]))))
if colors is None:
colors = pb.gca()._get_lines.color_cycle
pb.clf()
else:
colors = iter(colors)
plots = []
means, variances, gamma = param_to_array(parameterized.mean, parameterized.variance, parameterized.binary_prob)
x = np.arange(means.shape[0])
for i in range(means.shape[1]):
# mean and variance plot
a = fig.add_subplot(means.shape[1]*2, 1, 2*i + 1)
a.plot(means, c='k', alpha=.3)
plots.extend(a.plot(x, means.T[i], c=colors.next(), label=r"$\mathbf{{X_{{{}}}}}$".format(i)))
a.fill_between(x,
means.T[i] - 2 * np.sqrt(variances.T[i]),
means.T[i] + 2 * np.sqrt(variances.T[i]),
facecolor=plots[-1].get_color(),
alpha=.3)
a.legend(borderaxespad=0.)
a.set_xlim(x.min(), x.max())
if i < means.shape[1] - 1:
a.set_xticklabels('')
# binary prob plot
a = fig.add_subplot(means.shape[1]*2, 1, 2*i + 2)
a.bar(x,gamma[:,i],bottom=0.,linewidth=0,align='center')
a.set_xlim(x.min(), x.max())
a.set_ylim([0.,1.])
pb.draw()
fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
return fig

View file

@ -6,7 +6,9 @@ import numpy as np
import GPy import GPy
import sys import sys
verbose = True verbose = 0
class Kern_check_model(GPy.core.Model): class Kern_check_model(GPy.core.Model):
""" """
@ -91,7 +93,7 @@ class Kern_check_dKdiag_dX(Kern_check_dK_dX):
def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False): def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verbose=False):
""" """
This function runs on kernels to check the correctness of their This function runs on kernels to check the correctness of their
implementation. It checks that the covariance function is positive definite implementation. It checks that the covariance function is positive definite
@ -210,7 +212,7 @@ def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False):
class KernelTestsContinuous(unittest.TestCase): class KernelGradientTestsContinuous(unittest.TestCase):
def setUp(self): def setUp(self):
self.X = np.random.randn(100,2) self.X = np.random.randn(100,2)
self.X2 = np.random.randn(110,2) self.X2 = np.random.randn(110,2)
@ -220,16 +222,34 @@ class KernelTestsContinuous(unittest.TestCase):
def test_Matern32(self): def test_Matern32(self):
k = GPy.kern.Matern32(2) k = GPy.kern.Matern32(2)
self.assertTrue(kern_test(k, X=self.X, X2=self.X2, verbose=verbose)) self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
def test_Matern52(self): def test_Matern52(self):
k = GPy.kern.Matern52(2) k = GPy.kern.Matern52(2)
self.assertTrue(kern_test(k, X=self.X, X2=self.X2, verbose=verbose)) self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
#TODO: turn off gradient checking wrt X for indexed kernels like coregionalize #TODO: turn off gradient checking wrt X for indexed kernels like coregionalize
class KernelTestsMiscellaneous(unittest.TestCase):
def setUp(self):
N, D = 100, 10
self.X = np.linspace(-np.pi, +np.pi, N)[:,None] * np.ones(D)
self.rbf = GPy.kern.RBF(range(2))
self.linear = GPy.kern.Linear((3,5,6))
self.matern = GPy.kern.Matern32(np.array([2,4,7]))
self.sumkern = self.rbf + self.linear
self.sumkern += self.matern
self.sumkern.randomize()
def test_active_dims(self):
self.assertListEqual(self.sumkern.active_dims.tolist(), range(8))
def test_which_parts(self):
self.assertTrue(np.allclose(self.sumkern.K(self.X, which_parts=[self.linear, self.matern]), self.linear.K(self.X)+self.matern.K(self.X)))
self.assertTrue(np.allclose(self.sumkern.K(self.X, which_parts=[self.linear, self.rbf]), self.linear.K(self.X)+self.rbf.K(self.X)))
self.assertTrue(np.allclose(self.sumkern.K(self.X, which_parts=self.sumkern.parts[0]), self.rbf.K(self.X)))
if __name__ == "__main__": if __name__ == "__main__":
print "Running unit tests, please be (very) patient..." print "Running unit tests, please be (very) patient..."

View file

@ -664,8 +664,7 @@ class LaplaceTests(unittest.TestCase):
print m1 print m1
print m2 print m2
m2.parameters_changed() m2[:] = m1[:]
#m2._set_params(m1._get_params())
#Predict for training points to get posterior mean and variance #Predict for training points to get posterior mean and variance
post_mean, post_var, _, _ = m1.predict(X) post_mean, post_var, _, _ = m1.predict(X)
@ -701,8 +700,9 @@ class LaplaceTests(unittest.TestCase):
np.testing.assert_almost_equal(m1.log_likelihood(), m2.log_likelihood(), decimal=2) np.testing.assert_almost_equal(m1.log_likelihood(), m2.log_likelihood(), decimal=2)
#Check marginals are the same with random #Check marginals are the same with random
m1.randomize() m1.randomize()
#m2._set_params(m1._get_params()) #import ipdb;ipdb.set_trace()
m2.parameters_changed() m2[:] = m1[:]
np.testing.assert_almost_equal(m1.log_likelihood(), m2.log_likelihood(), decimal=2) np.testing.assert_almost_equal(m1.log_likelihood(), m2.log_likelihood(), decimal=2)
#Check they are checkgradding #Check they are checkgradding

View file

@ -12,6 +12,7 @@ import numpy
from GPy.kern import RBF from GPy.kern import RBF
from GPy.kern import Linear from GPy.kern import Linear
from copy import deepcopy from copy import deepcopy
from GPy.core.parameterization.variational import NormalPosterior
__test__ = lambda: 'deep' in sys.argv __test__ = lambda: 'deep' in sys.argv
# np.random.seed(0) # np.random.seed(0)
@ -28,53 +29,21 @@ def ard(p):
class Test(unittest.TestCase): class Test(unittest.TestCase):
input_dim = 9 input_dim = 9
num_inducing = 13 num_inducing = 13
N = 300 N = 1000
Nsamples = 1e6 Nsamples = 1e6
def setUp(self): def setUp(self):
i_s_dim_list = [2,4,3]
indices = numpy.cumsum(i_s_dim_list).tolist()
input_slices = [slice(a,b) for a,b in zip([None]+indices, indices)]
#input_slices[2] = deepcopy(input_slices[1])
input_slice_kern = GPy.kern.kern(9,
[
RBF(i_s_dim_list[0], np.random.rand(), np.random.rand(i_s_dim_list[0]), ARD=True),
RBF(i_s_dim_list[1], np.random.rand(), np.random.rand(i_s_dim_list[1]), ARD=True),
Linear(i_s_dim_list[2], np.random.rand(i_s_dim_list[2]), ARD=True)
],
input_slices = input_slices
)
self.kerns = ( self.kerns = (
# input_slice_kern, #GPy.kern.RBF([0,1,2], ARD=True)+GPy.kern.Bias(self.input_dim)+GPy.kern.White(self.input_dim),
# (GPy.kern.rbf(self.input_dim, ARD=True) + #GPy.kern.RBF(self.input_dim)+GPy.kern.Bias(self.input_dim)+GPy.kern.White(self.input_dim),
# GPy.kern.linear(self.input_dim, ARD=True) + #GPy.kern.Linear(self.input_dim) + GPy.kern.Bias(self.input_dim) + GPy.kern.White(self.input_dim),
# GPy.kern.bias(self.input_dim) + #GPy.kern.Linear(self.input_dim, ARD=True) + GPy.kern.Bias(self.input_dim) + GPy.kern.White(self.input_dim),
# GPy.kern.white(self.input_dim)), GPy.kern.Linear([1,3,6,7], ARD=True) + GPy.kern.RBF([0,5,8], ARD=True) + GPy.kern.White(self.input_dim),
(#GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True)
GPy.kern.Linear(self.input_dim, np.random.rand(self.input_dim), ARD=True)
+GPy.kern.RBF(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True)
# +GPy.kern.bias(self.input_dim)
# +GPy.kern.white(self.input_dim)),
),
# (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
# GPy.kern.bias(self.input_dim, np.random.rand())),
# (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True)
# +GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True)
# #+GPy.kern.bias(self.input_dim, np.random.rand())
# #+GPy.kern.white(self.input_dim, np.random.rand())),
# ),
# GPy.kern.white(self.input_dim, np.random.rand())),
# GPy.kern.rbf(self.input_dim), GPy.kern.rbf(self.input_dim, ARD=True),
# GPy.kern.linear(self.input_dim, ARD=False), GPy.kern.linear(self.input_dim, ARD=True),
# GPy.kern.linear(self.input_dim) + GPy.kern.bias(self.input_dim),
# GPy.kern.rbf(self.input_dim) + GPy.kern.bias(self.input_dim),
# GPy.kern.linear(self.input_dim) + GPy.kern.bias(self.input_dim) + GPy.kern.white(self.input_dim),
# GPy.kern.rbf(self.input_dim) + GPy.kern.bias(self.input_dim) + GPy.kern.white(self.input_dim),
# GPy.kern.bias(self.input_dim), GPy.kern.white(self.input_dim),
) )
self.q_x_mean = np.random.randn(self.input_dim) self.q_x_mean = np.random.randn(self.input_dim)[None]
self.q_x_variance = np.exp(np.random.randn(self.input_dim)) self.q_x_variance = np.exp(.5*np.random.randn(self.input_dim))[None]
self.q_x_samples = np.random.randn(self.Nsamples, self.input_dim) * np.sqrt(self.q_x_variance) + self.q_x_mean self.q_x_samples = np.random.randn(self.Nsamples, self.input_dim) * np.sqrt(self.q_x_variance) + self.q_x_mean
self.q_x = NormalPosterior(self.q_x_mean, self.q_x_variance)
self.Z = np.random.randn(self.num_inducing, self.input_dim) self.Z = np.random.randn(self.num_inducing, self.input_dim)
self.q_x_mean.shape = (1, self.input_dim) self.q_x_mean.shape = (1, self.input_dim)
self.q_x_variance.shape = (1, self.input_dim) self.q_x_variance.shape = (1, self.input_dim)
@ -114,8 +83,9 @@ class Test(unittest.TestCase):
def test_psi2(self): def test_psi2(self):
for kern in self.kerns: for kern in self.kerns:
kern.randomize()
Nsamples = int(np.floor(self.Nsamples/self.N)) Nsamples = int(np.floor(self.Nsamples/self.N))
psi2 = kern.psi2(self.Z, self.q_x_mean, self.q_x_variance) psi2 = kern.psi2(self.Z, self.q_x)
K_ = np.zeros((self.num_inducing, self.num_inducing)) K_ = np.zeros((self.num_inducing, self.num_inducing))
diffs = [] diffs = []
for i, q_x_sample_stripe in enumerate(np.array_split(self.q_x_samples, self.Nsamples / Nsamples)): for i, q_x_sample_stripe in enumerate(np.array_split(self.q_x_samples, self.Nsamples / Nsamples)):
@ -130,8 +100,8 @@ class Test(unittest.TestCase):
pylab.figure(msg) pylab.figure(msg)
pylab.plot(diffs, marker='x', mew=.2) pylab.plot(diffs, marker='x', mew=.2)
# print msg, np.allclose(psi2.squeeze(), K_, rtol=1e-1, atol=.1) # print msg, np.allclose(psi2.squeeze(), K_, rtol=1e-1, atol=.1)
self.assertTrue(np.allclose(psi2.squeeze(), K_), self.assertTrue(np.allclose(psi2.squeeze(), K_,
#rtol=1e-1, atol=.1), atol=.1, rtol=1),
msg=msg + ": not matching") msg=msg + ": not matching")
# sys.stdout.write(".") # sys.stdout.write(".")
except: except:
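The test above compares the analytic psi2 statistics against a Monte Carlo estimate of E_q(x)[k(x, z_m) k(x, z_m')]. A standalone sketch of that estimator for a plain linear kernel k(x, z) = x.z (a hypothetical stand-in, not the kernels the test uses):

    import numpy as np

    Q, M, S = 3, 4, 200000
    Z = np.random.randn(M, Q)
    mu = np.random.randn(Q)
    var_x = np.exp(np.random.randn(Q))

    samples = np.random.randn(S, Q) * np.sqrt(var_x) + mu
    KX = samples.dot(Z.T)       # S x M, row s holds k(x_s, z_m) for all m
    psi2_mc = KX.T.dot(KX) / S  # Monte Carlo estimate of psi2

    # analytic psi2 for this kernel: Z (mu mu^T + diag(var_x)) Z^T
    psi2_exact = Z.dot(np.outer(mu, mu) + np.diag(var_x)).dot(Z.T)
    assert np.allclose(psi2_mc, psi2_exact, rtol=1e-1, atol=1e-1)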

View file

@ -11,6 +11,7 @@ import itertools
from GPy.core import Model from GPy.core import Model
from GPy.core.parameterization.param import Param from GPy.core.parameterization.param import Param
from GPy.core.parameterization.transformations import Logexp from GPy.core.parameterization.transformations import Logexp
from GPy.core.parameterization.variational import NormalPosterior
class PsiStatModel(Model): class PsiStatModel(Model):
def __init__(self, which, X, X_variance, Z, num_inducing, kernel): def __init__(self, which, X, X_variance, Z, num_inducing, kernel):
@ -18,23 +19,24 @@ class PsiStatModel(Model):
self.which = which self.which = which
self.X = Param("X", X) self.X = Param("X", X)
self.X_variance = Param('X_variance', X_variance, Logexp()) self.X_variance = Param('X_variance', X_variance, Logexp())
self.q = NormalPosterior(self.X, self.X_variance)
self.Z = Param("Z", Z) self.Z = Param("Z", Z)
self.N, self.input_dim = X.shape self.N, self.input_dim = X.shape
self.num_inducing, input_dim = Z.shape self.num_inducing, input_dim = Z.shape
assert self.input_dim == input_dim, "shape mismatch: Z:{!s} X:{!s}".format(Z.shape, X.shape) assert self.input_dim == input_dim, "shape mismatch: Z:{!s} X:{!s}".format(Z.shape, X.shape)
self.kern = kernel self.kern = kernel
self.psi_ = self.kern.__getattribute__(self.which)(self.Z, self.X, self.X_variance) self.psi_ = self.kern.__getattribute__(self.which)(self.Z, self.q)
self.add_parameters(self.X, self.X_variance, self.Z, self.kern) self.add_parameters(self.q, self.Z, self.kern)
def log_likelihood(self): def log_likelihood(self):
return self.kern.__getattribute__(self.which)(self.Z, self.X, self.X_variance).sum() return self.kern.__getattribute__(self.which)(self.Z, self.X, self.X_variance).sum()
def parameters_changed(self): def parameters_changed(self):
psimu, psiS = self.kern.__getattribute__("d" + self.which + "_dmuS")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance) psimu, psiS = self.kern.__getattribute__("d" + self.which + "_dmuS")(numpy.ones_like(self.psi_), self.Z, self.q)
self.X.gradient = psimu self.X.gradient = psimu
self.X_variance.gradient = psiS self.X_variance.gradient = psiS
#psimu, psiS = numpy.ones(self.N * self.input_dim), numpy.ones(self.N * self.input_dim) #psimu, psiS = numpy.ones(self.N * self.input_dim), numpy.ones(self.N * self.input_dim)
try: psiZ = self.kern.__getattribute__("d" + self.which + "_dZ")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance) try: psiZ = self.kern.__getattribute__("d" + self.which + "_dZ")(numpy.ones_like(self.psi_), self.Z, self.q)
except AttributeError: psiZ = numpy.zeros_like(self.Z) except AttributeError: psiZ = numpy.zeros_like(self.Z)
self.Z.gradient = psiZ self.Z.gradient = psiZ
#psiZ = numpy.ones(self.num_inducing * self.input_dim) #psiZ = numpy.ones(self.num_inducing * self.input_dim)
@ -176,6 +178,6 @@ if __name__ == "__main__":
+GPy.kern.White(input_dim) +GPy.kern.White(input_dim)
) )
) )
m2.ensure_default_constraints() #m2.ensure_default_constraints()
else: else:
unittest.main() unittest.main()

View file

@ -9,24 +9,27 @@ class Cacher(object):
""" """
def __init__(self, operation, limit=5, ignore_args=()): def __init__(self, operation, limit=5, ignore_args=(), force_kwargs=()):
self.limit = int(limit) self.limit = int(limit)
self.ignore_args = ignore_args self.ignore_args = ignore_args
self.force_kwargs = force_kwargs
self.operation=operation self.operation=operation
self.cached_inputs = [] self.cached_inputs = []
self.cached_outputs = [] self.cached_outputs = []
self.inputs_changed = [] self.inputs_changed = []
def __call__(self, *args): def __call__(self, *args, **kw):
""" """
A wrapper function for self.operation, A wrapper function for self.operation,
""" """
#ensure that specified arguments are ignored #ensure that specified arguments are ignored
items = sorted(kw.items(), key=lambda x: x[0])
oa_all = args + tuple(a for _,a in items)
if len(self.ignore_args) != 0: if len(self.ignore_args) != 0:
oa = [a for i,a in enumerate(args) if i not in self.ignore_args] oa = [a for i,a in itertools.chain(enumerate(args), items) if i not in self.ignore_args and i not in self.force_kwargs]
else: else:
oa = args oa = oa_all
# this makes sure we only add an observer once, and that None can be in args # this makes sure we only add an observer once, and that None can be in args
observable_args = [] observable_args = []
@ -37,8 +40,13 @@ class Cacher(object):
#make sure that all the found argument really are observable: #make sure that all the found argument really are observable:
#otherswise don't cache anything, pass args straight though #otherswise don't cache anything, pass args straight though
if not all([isinstance(arg, Observable) for arg in observable_args]): if not all([isinstance(arg, Observable) for arg in observable_args]):
return self.operation(*args) return self.operation(*args, **kw)
if len(self.force_kwargs) != 0:
# check if there are force args, which force reloading
for k in self.force_kwargs:
if k in kw and kw[k] is not None:
return self.operation(*args, **kw)
# TODO: WARNING !!! Cache OFFSWITCH !!! WARNING # TODO: WARNING !!! Cache OFFSWITCH !!! WARNING
# return self.operation(*args) # return self.operation(*args)
@ -48,7 +56,7 @@ class Cacher(object):
i = state.index(True) i = state.index(True)
if self.inputs_changed[i]: if self.inputs_changed[i]:
#(elements of) the args have changed since we last computed: update #(elements of) the args have changed since we last computed: update
self.cached_outputs[i] = self.operation(*args) self.cached_outputs[i] = self.operation(*args, **kw)
self.inputs_changed[i] = False self.inputs_changed[i] = False
return self.cached_outputs[i] return self.cached_outputs[i]
else: else:
@ -62,11 +70,11 @@ class Cacher(object):
self.cached_outputs.pop(0) self.cached_outputs.pop(0)
#compute #compute
self.cached_inputs.append(args) self.cached_inputs.append(oa_all)
self.cached_outputs.append(self.operation(*args)) self.cached_outputs.append(self.operation(*args, **kw))
self.inputs_changed.append(False) self.inputs_changed.append(False)
[a.add_observer(self, self.on_cache_changed) for a in observable_args] [a.add_observer(self, self.on_cache_changed) for a in observable_args]
return self.cached_outputs[-1]#Max says return. return self.cached_outputs[-1]#return
def on_cache_changed(self, arg): def on_cache_changed(self, arg):
""" """
@ -90,15 +98,16 @@ class Cache_this(object):
""" """
A decorator which can be applied to bound methods in order to cache them A decorator which can be applied to bound methods in order to cache them
""" """
def __init__(self, limit=5, ignore_args=()): def __init__(self, limit=5, ignore_args=(), force_kwargs=()):
self.limit = limit self.limit = limit
self.ignore_args = ignore_args self.ignore_args = ignore_args
self.force_args = force_kwargs
self.c = None self.c = None
def __call__(self, f): def __call__(self, f):
def f_wrap(*args): def f_wrap(*args, **kw):
if self.c is None: if self.c is None:
self.c = Cacher(f, self.limit, ignore_args=self.ignore_args) self.c = Cacher(f, self.limit, ignore_args=self.ignore_args, force_kwargs=self.force_args)
return self.c(*args) return self.c(*args, **kw)
f_wrap._cacher = self f_wrap._cacher = self
f_wrap.__doc__ = "**cached**\n\n" + (f.__doc__ or "") f_wrap.__doc__ = "**cached**" + (f.__doc__ or "")
return f_wrap return f_wrap
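A usage sketch for the extended decorator, mirroring the Prod.K decoration earlier in this commit: force_kwargs names keyword arguments that, when passed as non-None, bypass the cache and force a recomputation. Note that results are only cached at all when the (non-ignored) arguments are Observable objects; plain arrays fall straight through to the wrapped call.

    from GPy.util.caching import Cache_this

    class MyKern(object):
        @Cache_this(limit=2, force_kwargs=['which_parts'])
        def K(self, X, X2=None, which_parts=None):
            # expensive covariance computation: cached per (X, X2) when they are Observable,
            # recomputed whenever which_parts is explicitly given
            return 0.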