Merge branch 'params' of github.com:SheffieldML/GPy into params

Alan Saul 2014-02-24 11:35:45 +00:00
commit 632a702532
78 changed files with 2892 additions and 3760 deletions

View file

@@ -2,7 +2,9 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 from model import *
-from parameterization.parameterized import *
+from parameterization.parameterized import adjust_name_for_printing, Parameterizable
+from parameterization.param import Param, ParamConcatenation
 from gp import GP
 from sparse_gp import SparseGP
 from svigp import SVIGP

View file

@@ -30,7 +30,10 @@ class GP(Model):
         super(GP, self).__init__(name)
         assert X.ndim == 2
-        self.X = ObservableArray(X)
+        if isinstance(X, ObservableArray):
+            self.X = X
+        else: self.X = ObservableArray(X)
         self.num_data, self.input_dim = self.X.shape
         assert Y.ndim == 2
@@ -43,7 +46,8 @@ class GP(Model):
         else:
             self.Y_metadata = None
-        assert isinstance(kernel, kern.kern)
+        assert isinstance(kernel, kern.Kern)
+        assert self.input_dim == kernel.input_dim
         self.kern = kernel
         assert isinstance(likelihood, likelihoods.Likelihood)
@@ -70,7 +74,7 @@ class GP(Model):
     def log_likelihood(self):
         return self._log_marginal_likelihood
-    def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
+    def _raw_predict(self, _Xnew, full_cov=False):
         """
         Internal helper function for making predictions, does not account
         for normalization or likelihood
@@ -80,29 +84,27 @@ class GP(Model):
        diagonal of the covariance is returned.
        """
-        Kx = self.kern.K(_Xnew, self.X, which_parts=which_parts).T
+        Kx = self.kern.K(_Xnew, self.X).T
         #LiKx, _ = dtrtrs(self.posterior.woodbury_chol, np.asfortranarray(Kx), lower=1)
         WiKx = np.dot(self.posterior.woodbury_inv, Kx)
         mu = np.dot(Kx.T, self.posterior.woodbury_vector)
         if full_cov:
-            Kxx = self.kern.K(_Xnew, which_parts=which_parts)
+            Kxx = self.kern.K(_Xnew)
             #var = Kxx - tdot(LiKx.T)
             var = np.dot(Kx.T, WiKx)
         else:
-            Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
+            Kxx = self.kern.Kdiag(_Xnew)
             #var = Kxx - np.sum(LiKx*LiKx, 0)
             var = Kxx - np.sum(WiKx*Kx, 0)
             var = var.reshape(-1, 1)
         return mu, var
-    def predict(self, Xnew, which_parts='all', full_cov=False, **likelihood_args):
+    def predict(self, Xnew, full_cov=False, **likelihood_args):
         """
         Predict the function(s) at the new point(s) Xnew.
         :param Xnew: The points at which to make a prediction
         :type Xnew: np.ndarray, Nnew x self.input_dim
-        :param which_parts: specifies which outputs kernel(s) to use in prediction
-        :type which_parts: ('all', list of bools)
         :param full_cov: whether to return the full covariance matrix, or just
            the diagonal
         :type full_cov: bool
@@ -118,13 +120,13 @@ class GP(Model):
         """
         #predict the latent function values
-        mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts)
+        mu, var = self._raw_predict(Xnew, full_cov=full_cov)
         # now push through likelihood
         mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, **likelihood_args)
         return mean, var, _025pm, _975pm
-    def posterior_samples_f(self,X,size=10,which_parts='all',full_cov=True):
+    def posterior_samples_f(self,X,size=10, full_cov=True):
         """
         Samples the posterior GP at the points X.
@@ -132,13 +134,11 @@ class GP(Model):
         :type X: np.ndarray, Nnew x self.input_dim.
         :param size: the number of a posteriori samples.
         :type size: int.
-        :param which_parts: which of the kernel functions to use (additively).
-        :type which_parts: 'all', or list of bools.
         :param full_cov: whether to return the full covariance matrix, or just the diagonal.
         :type full_cov: bool.
         :returns: Ysim: set of simulations, a Numpy array (N x samples).
         """
-        m, v = self._raw_predict(X, which_parts=which_parts, full_cov=full_cov)
+        m, v = self._raw_predict(X, full_cov=full_cov)
         v = v.reshape(m.size,-1) if len(v.shape)==3 else v
         if not full_cov:
             Ysim = np.random.multivariate_normal(m.flatten(), np.diag(v.flatten()), size).T
@@ -147,7 +147,7 @@ class GP(Model):
         return Ysim
-    def posterior_samples(self,X,size=10,which_parts='all',full_cov=True,noise_model=None):
+    def posterior_samples(self,X,size=10, full_cov=True,noise_model=None):
         """
         Samples the posterior GP at the points X.
@@ -155,15 +155,13 @@ class GP(Model):
         :type X: np.ndarray, Nnew x self.input_dim.
         :param size: the number of a posteriori samples.
         :type size: int.
-        :param which_parts: which of the kernel functions to use (additively).
-        :type which_parts: 'all', or list of bools.
         :param full_cov: whether to return the full covariance matrix, or just the diagonal.
         :type full_cov: bool.
         :param noise_model: for mixed noise likelihood, the noise model to use in the samples.
         :type noise_model: integer.
         :returns: Ysim: set of simulations, a Numpy array (N x samples).
         """
-        Ysim = self.posterior_samples_f(X, size, which_parts=which_parts, full_cov=full_cov)
+        Ysim = self.posterior_samples_f(X, size, full_cov=full_cov)
         if isinstance(self.likelihood, Gaussian):
             noise_std = np.sqrt(self.likelihood._get_params())
             Ysim += np.random.normal(0,noise_std,Ysim.shape)
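Note: `_raw_predict` above applies the usual GP posterior identities using the cached Woodbury quantities, mu = Kx^T * woodbury_vector and var = Kxx - Kx^T * woodbury_inv * Kx. The following standalone numpy sketch reproduces those equations outside GPy; the RBF helper, the explicit matrix inverse and all names are illustrative stand-ins, not GPy API.

import numpy as np

def rbf(A, B, lengthscale=1.0, variance=1.0):
    # simple RBF kernel, a stand-in for kern.K in the code above
    sq = np.sum(A**2, 1)[:, None] + np.sum(B**2, 1)[None, :] - 2 * A.dot(B.T)
    return variance * np.exp(-0.5 * sq / lengthscale**2)

def raw_predict_sketch(X, Y, Xnew, noise_var=0.1):
    """Posterior mean/variance from Woodbury quantities: for a Gaussian likelihood,
    woodbury_inv = (K + noise*I)^-1 and woodbury_vector = (K + noise*I)^-1 Y."""
    K = rbf(X, X) + noise_var * np.eye(X.shape[0])
    woodbury_inv = np.linalg.inv(K)          # explicit inverse for clarity only
    woodbury_vector = woodbury_inv.dot(Y)
    Kx = rbf(X, Xnew)                        # plays the role of kern.K(_Xnew, self.X).T
    mu = Kx.T.dot(woodbury_vector)           # posterior mean
    WiKx = woodbury_inv.dot(Kx)
    var = (np.diag(rbf(Xnew, Xnew)) - np.sum(WiKx * Kx, 0)).reshape(-1, 1)
    return mu, var

X = np.random.randn(20, 1); Y = np.sin(X)
mu, var = raw_predict_sketch(X, Y, np.linspace(-2, 2, 5)[:, None])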

View file

@@ -4,12 +4,8 @@
 from .. import likelihoods
 from ..inference import optimization
-from ..util.linalg import jitchol
 from ..util.misc import opt_wrapper
 from parameterization import Parameterized
-from parameterization.parameterized import UNFIXED
-from parameterization.domains import _POSITIVE, _REAL
-from parameterization.index_operations import ParameterIndexOperations
 import multiprocessing as mp
 import numpy as np
 from numpy.linalg.linalg import LinAlgError
@@ -240,7 +236,7 @@ class Model(Parameterized):
         constrained positive.
         """
         raise DeprecationWarning, 'parameters now have default constraints'
-        positive_strings = ['variance', 'lengthscale', 'precision', 'kappa', 'sensitivity']
+        #positive_strings = ['variance', 'lengthscale', 'precision', 'kappa', 'sensitivity']
 #         param_names = self._get_param_names()
 #         for s in positive_strings:
@@ -489,20 +485,17 @@ class Model(Parameterized):
         if not hasattr(self, 'kern'):
             raise ValueError, "this model has no kernel"
-        k = [p for p in self.kern._parameters_ if hasattr(p, "ARD") and p.ARD]
-        if (not len(k) == 1):
-            raise ValueError, "cannot determine sensitivity for this kernel"
-        k = k[0]
-        from ..kern.parts.rbf import RBF
-        from ..kern.parts.rbf_inv import RBFInv
-        from ..kern.parts.linear import Linear
+        k = self.kern#[p for p in self.kern._parameters_ if hasattr(p, "ARD") and p.ARD]
+        from ..kern import RBF, Linear#, RBFInv
         if isinstance(k, RBF):
             return 1. / k.lengthscale
-        elif isinstance(k, RBFInv):
-            return k.inv_lengthscale
+        #elif isinstance(k, RBFInv):
+        #    return k.inv_lengthscale
         elif isinstance(k, Linear):
             return k.variances
-        else:
-            raise ValueError, "cannot determine sensitivity for this kernel"
     def pseudo_EM(self, stop_crit=.1, **kwargs):
         """

View file

@@ -28,14 +28,20 @@ class ObservableArray(np.ndarray, Observable):
     """
     __array_priority__ = -1 # Never give back ObservableArray
     def __new__(cls, input_array):
-        obj = np.atleast_1d(input_array).view(cls)
+        if not isinstance(input_array, ObservableArray):
+            obj = np.atleast_1d(input_array).view(cls)
+        else: obj = input_array
         cls.__name__ = "ObservableArray\n "
-        obj._observers_ = {}
         return obj
+    def __init__(self, *a, **kw):
+        super(ObservableArray, self).__init__(*a, **kw)
     def __array_finalize__(self, obj):
         # see InfoArray.__array_finalize__ for comments
         if obj is None: return
-        self._observers_ = getattr(obj, '_observers_', None)
+        self._observer_callables_ = getattr(obj, '_observer_callables_', None)
     def __array_wrap__(self, out_arr, context=None):
         return out_arr.view(np.ndarray)
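Note: both `ObservableArray` here and the reworked `Observable` base later in this commit replace the single `_observers_` dict with an `_observer_callables_` mapping holding a list of callbacks per observer. A minimal standalone sketch of that pattern (not the GPy classes; removal uses `list.remove` for clarity):

from collections import defaultdict

class ObservableSketch(object):
    """Minimal observer registry mirroring the add/remove/notify pattern above."""
    def __init__(self):
        self._observer_callables_ = defaultdict(list)
    def add_observer(self, observer, callble):
        self._observer_callables_[observer].append(callble)
    def remove_observer(self, observer, callble):
        self._observer_callables_[observer].remove(callble)
    def _notify_observers(self):
        for callables in self._observer_callables_.values():
            for callble in callables:
                callble(self)   # every registered callback sees the changed object

obs = ObservableSketch()
obs.add_observer("model", lambda who: None)  # callback fired on every change
obs._notify_observers()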

View file

@@ -83,12 +83,22 @@ class ParameterIndexOperations(object):
     def iterproperties(self):
         return self._properties.iterkeys()
-    def shift(self, start, size):
+    def shift_right(self, start, size):
         for ind in self.iterindices():
             toshift = ind>=start
+            if toshift.size > 0:
                 ind[toshift] += size
+    def shift_left(self, start, size):
+        for v, ind in self.items():
+            todelete = (ind>=start) * (ind<start+size)
+            if todelete.size != 0:
+                ind = ind[~todelete]
+            toshift = ind>=start
+            if toshift.size != 0:
+                ind[toshift] -= size
+            if ind.size != 0: self._properties[v] = ind
+            else: del self._properties[v]
     def clear(self):
         self._properties.clear()
@@ -183,7 +193,7 @@ class ParameterIndexOperationsView(object):
             yield i
-    def shift(self, start, size):
+    def shift_right(self, start, size):
         raise NotImplementedError, 'Shifting only supported in original ParamIndexOperations'
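Note: splitting `shift` into `shift_right` and `shift_left` mirrors what happens to the raveled parameter indices when a block of `size` entries is inserted at, or removed from, position `start`. A small numpy sketch of the same index arithmetic (function names are illustrative, not the GPy API):

import numpy as np

def shift_right(ind, start, size):
    # indices at or after `start` move up by `size` (a parameter block was inserted)
    ind = ind.copy()
    ind[ind >= start] += size
    return ind

def shift_left(ind, start, size):
    # indices inside [start, start+size) disappear, the rest move down by `size`
    ind = ind[~((ind >= start) & (ind < start + size))].copy()
    ind[ind >= start] -= size
    return ind

ind = np.array([0, 3, 4, 7])
print(shift_right(ind, 3, 2))  # [0 5 6 9]
print(shift_left(ind, 3, 2))   # [0 5]: indices 3 and 4 removed, 7 becomes 5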

View file

@@ -3,7 +3,7 @@
 import itertools
 import numpy
-from parameter_core import Constrainable, Gradcheckable, Indexable, Parameterizable, adjust_name_for_printing
+from parameter_core import Constrainable, Gradcheckable, Indexable, Parentable, adjust_name_for_printing
 from array_core import ObservableArray, ParamList
 ###### printing
@@ -15,7 +15,7 @@ __precision__ = numpy.get_printoptions()['precision'] # numpy printing precision
 __print_threshold__ = 5
 ######
-class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameterizable):
+class Param(Constrainable, ObservableArray, Gradcheckable, Indexable):
     """
     Parameter object for GPy models.
@@ -54,11 +54,11 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameteri
         obj._tied_to_me_ = SetDict()
         obj._tied_to_ = []
         obj._original_ = True
-        obj.gradient = None
+        obj._gradient_ = None
         return obj
-    def __init__(self, name, input_array, default_constraint=None):
-        super(Param, self).__init__(name=name, default_constraint=default_constraint)
+    def __init__(self, name, input_array, default_constraint=None, *a, **kw):
+        super(Param, self).__init__(name=name, default_constraint=default_constraint, *a, **kw)
     def __array_finalize__(self, obj):
         # see InfoArray.__array_finalize__ for comments
@@ -76,10 +76,20 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameteri
         self._updated_ = getattr(obj, '_updated_', None)
         self._original_ = getattr(obj, '_original_', None)
         self._name = getattr(obj, 'name', None)
-        self.gradient = getattr(obj, 'gradient', None)
+        self._gradient_ = getattr(obj, '_gradient_', None)
         self.constraints = getattr(obj, 'constraints', None)
         self.priors = getattr(obj, 'priors', None)
+    @property
+    def gradient(self):
+        if self._gradient_ is None:
+            self._gradient_ = numpy.zeros(self._realshape_)
+        return self._gradient_[self._current_slice_]
+    @gradient.setter
+    def gradient(self, val):
+        self.gradient[:] = val
     #===========================================================================
     # Pickling operations
     #===========================================================================
@@ -115,6 +125,13 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameteri
         self._direct_parent_ = state.pop()
         self.name = state.pop()
+    def copy(self, *args):
+        constr = self.constraints.copy()
+        priors = self.priors.copy()
+        p = Param(self.name, self.view(numpy.ndarray).copy(), self._default_constraint_)
+        p.constraints = constr
+        p.priors = priors
+        return p
     #===========================================================================
     # get/set parameters
     #===========================================================================
@@ -127,7 +144,10 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameteri
         return self.flat
     def _collect_gradient(self, target):
-        target[:] = self.gradient.flat
+        target += self.gradient.flat
+    def _set_gradient(self, g):
+        self.gradient = g.reshape(self._realshape_)
     #===========================================================================
     # Array operations -> done
@@ -214,7 +234,9 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameteri
     def _description_str(self):
         if self.size <= 1: return ["%f" % self]
         else: return [str(self.shape)]
-    def parameter_names(self, add_name=False):
+    def parameter_names(self, add_self=False, adjust_for_printing=False):
+        if adjust_for_printing:
+            return [adjust_name_for_printing(self.name)]
         return [self.name]
     @property
     def flattened_parameters(self):
@@ -231,14 +253,9 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameteri
     @property
     def _ties_str(self):
         return [t._short() for t in self._tied_to_] or ['']
-    @property
-    def name_hirarchical(self):
-        if self.has_parent():
-            return self._direct_parent_.hirarchy_name() + adjust_name_for_printing(self.name)
-        return adjust_name_for_printing(self.name)
     def __repr__(self, *args, **kwargs):
         name = "\033[1m{x:s}\033[0;0m:\n".format(
-            x=self.name_hirarchical)
+            x=self.hirarchy_name())
         return name + super(Param, self).__repr__(*args, **kwargs)
     def _ties_for(self, rav_index):
         # size = sum(p.size for p in self._tied_to_)
@@ -272,12 +289,12 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameteri
         gen = map(lambda x: " ".join(map(str, x)), gen)
         return reduce(lambda a, b:max(a, len(b)), gen, len(header))
     def _max_len_values(self):
-        return reduce(lambda a, b:max(a, len("{x:=.{0}g}".format(__precision__, x=b))), self.flat, len(self.name_hirarchical))
+        return reduce(lambda a, b:max(a, len("{x:=.{0}g}".format(__precision__, x=b))), self.flat, len(self.hirarchy_name()))
     def _max_len_index(self, ind):
         return reduce(lambda a, b:max(a, len(str(b))), ind, len(__index_name__))
     def _short(self):
         # short string to print
-        name = self._direct_parent_.hirarchy_name() + adjust_name_for_printing(self.name)
+        name = self.hirarchy_name()
         if self._realsize_ < 2:
             return name
         ind = self._indices()
@@ -300,8 +317,8 @@ class Param(ObservableArray, Constrainable, Gradcheckable, Indexable, Parameteri
         if lp is None: lp = self._max_len_names(prirs, __tie_name__)
         sep = '-'
         header_format = " {i:{5}^{2}s} | \033[1m{x:{5}^{1}s}\033[0;0m | {c:{5}^{0}s} | {p:{5}^{4}s} | {t:{5}^{3}s}"
-        if only_name: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.name_hirarchical, c=sep*lc, i=sep*li, t=sep*lt, p=sep*lp) # nice header for printing
-        else: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.name_hirarchical, c=__constraints_name__, i=__index_name__, t=__tie_name__, p=__priors_name__) # nice header for printing
+        if only_name: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.hirarchy_name(), c=sep*lc, i=sep*li, t=sep*lt, p=sep*lp) # nice header for printing
+        else: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.hirarchy_name(), c=__constraints_name__, i=__index_name__, t=__tie_name__, p=__priors_name__) # nice header for printing
         if not ties: ties = itertools.cycle([''])
         return "\n".join([header] + [" {i!s:^{3}s} | {x: >{1}.{2}g} | {c:^{0}s} | {p:^{5}s} | {t:^{4}s} ".format(lc, lx, __precision__, li, lt, lp, x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in itertools.izip(indices, vals, constr_matrix, ties, prirs)]) # return all the constraints with right indices
         # except: return super(Param, self).__str__()
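Note: the new `gradient` property above allocates the backing `_gradient_` buffer lazily and the setter assigns in place, apparently so that views of a parameter share one gradient buffer. A standalone sketch of that lazy, in-place pattern (plain numpy, illustrative class, not the GPy Param):

import numpy as np

class LazyGradient(object):
    """Sketch of a lazily allocated, in-place updated gradient buffer."""
    def __init__(self, shape):
        self._realshape_ = shape
        self._gradient_ = None
    @property
    def gradient(self):
        if self._gradient_ is None:
            self._gradient_ = np.zeros(self._realshape_)  # created on first access
        return self._gradient_
    @gradient.setter
    def gradient(self, val):
        self.gradient[:] = val  # broadcasted in-place write keeps existing views valid

p = LazyGradient((2, 3))
g = p.gradient          # buffer created here
p.gradient = 1.0        # fills the same buffer; g now also holds ones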

View file

@@ -11,14 +11,19 @@ def adjust_name_for_printing(name):
     return ''
 class Observable(object):
-    _observers_ = {}
+    def __init__(self, *args, **kwargs):
+        from collections import defaultdict
+        self._observer_callables_ = defaultdict(list)
     def add_observer(self, observer, callble):
-        self._observers_[observer] = callble
-    def remove_observer(self, observer):
-        del self._observers_[observer]
+        self._observer_callables_[observer].append(callble)
+        #callble(self)
+    def remove_observer(self, observer, callble):
+        del self._observer_callables_[observer][callble]
     def _notify_observers(self):
-        [callble(self) for callble in self._observers_.itervalues()]
+        [[callble(self) for callble in callables]
+         for callables in self._observer_callables_.itervalues()]
 class Pickleable(object):
     def _getstate(self):
@@ -47,10 +52,8 @@ class Pickleable(object):
 #===============================================================================
 class Parentable(object):
-    def __init__(self, direct_parent=None, parent_index=None):
-        super(Parentable,self).__init__()
-        self._direct_parent_ = direct_parent
-        self._parent_index_ = parent_index
+    _direct_parent_ = None
+    _parent_index_ = None
     def has_parent(self):
         return self._direct_parent_ is not None
@@ -68,10 +71,13 @@
             return self
         return self._direct_parent_._highest_parent_
+    def _notify_parameters_changed(self):
+        if self.has_parent():
+            self._direct_parent_._notify_parameters_changed()
 class Nameable(Parentable):
-    _name = None
-    def __init__(self, name, direct_parent=None, parent_index=None):
-        super(Nameable,self).__init__(direct_parent, parent_index)
+    def __init__(self, name, *a, **kw):
+        super(Nameable, self).__init__(*a, **kw)
         self._name = name or self.__class__.__name__
     @property
@@ -80,58 +86,21 @@ class Nameable(Parentable):
     @name.setter
     def name(self, name):
         from_name = self.name
+        assert isinstance(name, str)
         self._name = name
         if self.has_parent():
             self._direct_parent_._name_changed(self, from_name)
-
-class Parameterizable(Parentable):
-    def __init__(self, *args, **kwargs):
-        super(Parameterizable, self).__init__(*args, **kwargs)
-        from GPy.core.parameterization.array_core import ParamList
-        _parameters_ = ParamList()
-    def parameter_names(self, add_name=False):
-        if add_name:
-            return [adjust_name_for_printing(self.name) + "." + xi for x in self._parameters_ for xi in x.parameter_names(add_name=True)]
-        return [xi for x in self._parameters_ for xi in x.parameter_names(add_name=True)]
-    def _collect_gradient(self, target):
-        import itertools
-        [p._collect_gradient(target[s]) for p, s in itertools.izip(self._parameters_, self._param_slices_)]
-    def _get_params(self):
-        import numpy as np
-        # don't overwrite this anymore!
-        if not self.size:
-            return np.empty(shape=(0,), dtype=np.float64)
-        return np.hstack([x._get_params() for x in self._parameters_ if x.size > 0])
-    def _set_params(self, params, update=True):
-        # don't overwrite this anymore!
-        import itertools
-        [p._set_params(params[s], update=update) for p, s in itertools.izip(self._parameters_, self._param_slices_)]
-        self.parameters_changed()
-    def parameters_changed(self):
-        """
-        This method gets called when parameters have changed.
-        Another way of listening to param changes is to
-        add self as a listener to the param, such that
-        updates get passed through. See :py:function:``GPy.core.param.Observable.add_observer``
-        """
-        pass
-    def _notify_parameters_changed(self):
-        self.parameters_changed()
-        if self.has_parent():
-            self._direct_parent_._notify_parameters_changed()
+    def hirarchy_name(self, adjust_for_printing=True):
+        if adjust_for_printing: adjust = lambda x: adjust_name_for_printing(x)
+        else: adjust = lambda x: x
+        if self.has_parent():
+            return self._direct_parent_.hirarchy_name() + "." + adjust(self.name)
+        return adjust(self.name)
 class Gradcheckable(Parentable):
-    #===========================================================================
-    # Gradchecking
-    #===========================================================================
+    def __init__(self, *a, **kw):
+        super(Gradcheckable, self).__init__(*a, **kw)
     def checkgrad(self, verbose=0, step=1e-6, tolerance=1e-3):
         if self.has_parent():
             return self._highest_parent_._checkgrad(self, verbose=verbose, step=step, tolerance=tolerance)
@@ -139,6 +108,7 @@ class Gradcheckable(Parentable):
     def _checkgrad(self, param):
         raise NotImplementedError, "Need log likelihood to check gradient against"
 class Indexable(object):
     def _raveled_index(self):
         raise NotImplementedError, "Need to be able to get the raveled Index"
@@ -157,9 +127,10 @@ class Indexable(object):
         """
         raise NotImplementedError, "shouldnt happen, raveld index transformation required from non parameterization object?"
-class Constrainable(Nameable, Indexable, Parameterizable):
-    def __init__(self, name, default_constraint=None):
-        super(Constrainable,self).__init__(name)
+class Constrainable(Nameable, Indexable):
+    def __init__(self, name, default_constraint=None, *a, **kw):
+        super(Constrainable, self).__init__(name=name, *a, **kw)
         self._default_constraint_ = default_constraint
         from index_operations import ParameterIndexOperations
         self.constraints = ParameterIndexOperations()
@@ -167,6 +138,16 @@ class Constrainable(Nameable, Indexable, Parameterizable):
         if self._default_constraint_ is not None:
             self.constrain(self._default_constraint_)
+    def _disconnect_parent(self, constr=None):
+        if constr is None:
+            constr = self.constraints.copy()
+        self.constraints.clear()
+        self.constraints = constr
+        self._direct_parent_ = None
+        self._parent_index_ = None
+        self._connect_fixes()
+        self._notify_parent_change()
     #===========================================================================
     # Fixing Parameters:
     #===========================================================================
@@ -344,5 +325,108 @@ class Constrainable(Nameable, Indexable, Parameterizable):
         return removed
+class Parameterizable(Constrainable):
+    def __init__(self, *args, **kwargs):
+        super(Parameterizable, self).__init__(*args, **kwargs)
+        from GPy.core.parameterization.array_core import ParamList
+        _parameters_ = ParamList()
+        self._added_names_ = set()
+    def parameter_names(self, add_self=False, adjust_for_printing=False, recursive=True):
+        if adjust_for_printing: adjust = lambda x: adjust_name_for_printing(x)
+        else: adjust = lambda x: x
+        if recursive: names = [xi for x in self._parameters_ for xi in x.parameter_names(add_self=True, adjust_for_printing=adjust_for_printing)]
+        else: names = [adjust(x.name) for x in self._parameters_]
+        if add_self: names = map(lambda x: adjust(self.name) + "." + x, names)
+        return names
+    def _add_parameter_name(self, param):
+        pname = adjust_name_for_printing(param.name)
+        # and makes sure to not delete programmatically added parameters
+        if pname in self.__dict__:
+            if not (param is self.__dict__[pname]):
+                if pname in self._added_names_:
+                    del self.__dict__[pname]
+                    self._add_parameter_name(param)
+        else:
+            self.__dict__[pname] = param
+            self._added_names_.add(pname)
+    def _remove_parameter_name(self, param=None, pname=None):
+        assert param is None or pname is None, "can only delete either param by name, or the name of a param"
+        pname = adjust_name_for_printing(pname) or adjust_name_for_printing(param.name)
+        if pname in self._added_names_:
+            del self.__dict__[pname]
+            self._added_names_.remove(pname)
+        self._connect_parameters()
+    def _name_changed(self, param, old_name):
+        self._remove_parameter_name(None, old_name)
+        self._add_parameter_name(param)
+    def _collect_gradient(self, target):
+        import itertools
+        [p._collect_gradient(target[s]) for p, s in itertools.izip(self._parameters_, self._param_slices_)]
+    def _set_gradient(self, g):
+        import itertools
+        [p._set_gradient(g[s]) for p, s in itertools.izip(self._parameters_, self._param_slices_)]
+    def _get_params(self):
+        import numpy as np
+        # don't overwrite this anymore!
+        if not self.size:
+            return np.empty(shape=(0,), dtype=np.float64)
+        return np.hstack([x._get_params() for x in self._parameters_ if x.size > 0])
+    def _set_params(self, params, update=True):
+        # don't overwrite this anymore!
+        import itertools
+        [p._set_params(params[s], update=update) for p, s in itertools.izip(self._parameters_, self._param_slices_)]
+        self.parameters_changed()
+    def copy(self):
+        """Returns a (deep) copy of the current model"""
+        import copy
+        from .index_operations import ParameterIndexOperations, ParameterIndexOperationsView
+        from .array_core import ParamList
+        dc = dict()
+        for k, v in self.__dict__.iteritems():
+            if k not in ['_direct_parent_', '_parameters_', '_parent_index_'] + self.parameter_names():
+                if isinstance(v, (Constrainable, ParameterIndexOperations, ParameterIndexOperationsView)):
+                    dc[k] = v.copy()
+                else:
+                    dc[k] = copy.deepcopy(v)
+            if k == '_parameters_':
+                params = [p.copy() for p in v]
+        dc['_direct_parent_'] = None
+        dc['_parent_index_'] = None
+        dc['_parameters_'] = ParamList()
+        dc['constraints'].clear()
+        dc['priors'].clear()
+        dc['size'] = 0
+        s = self.__new__(self.__class__)
+        s.__dict__ = dc
+        for p in params:
+            s.add_parameter(p)
+        return s
+    def _notify_parameters_changed(self):
+        self.parameters_changed()
+        if self.has_parent():
+            self._direct_parent_._notify_parameters_changed()
+    def parameters_changed(self):
+        """
+        This method gets called when parameters have changed.
+        Another way of listening to param changes is to
+        add self as a listener to the param, such that
+        updates get passed through. See :py:function:``GPy.core.param.Observable.add_observer``
+        """
+        pass
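Note: `hirarchy_name` now lives on `Nameable` and builds the dotted path of a node by recursing over `_direct_parent_`, while `Parameterizable.parameter_names` performs the matching downward recursion over `_parameters_`. A toy sketch of the upward recursion (illustrative class and names, not GPy):

class Node(object):
    """Toy node with the parent-pointer naming recursion used above."""
    def __init__(self, name, parent=None):
        self.name, self._direct_parent_ = name, parent
    def has_parent(self):
        return self._direct_parent_ is not None
    def hirarchy_name(self):
        if self.has_parent():
            return self._direct_parent_.hirarchy_name() + "." + self.name
        return self.name

root = Node("gp_regression")
kern = Node("rbf", parent=root)
leaf = Node("lengthscale", parent=kern)
print(leaf.hirarchy_name())  # gp_regression.rbf.lengthscale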

View file

@@ -3,16 +3,15 @@
 import numpy; np = numpy
-import copy
 import cPickle
 import itertools
 from re import compile, _pattern_type
-from param import ParamConcatenation, Param
-from parameter_core import Constrainable, Pickleable, Observable, adjust_name_for_printing, Gradcheckable
-from transformations import __fixed__, FIXED, UNFIXED
+from param import ParamConcatenation
+from parameter_core import Constrainable, Pickleable, Parentable, Observable, Parameterizable, adjust_name_for_printing, Gradcheckable
+from transformations import __fixed__
 from array_core import ParamList
-class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
+class Parameterized(Parameterizable, Pickleable, Observable, Gradcheckable):
     """
     Parameterized class
@@ -54,8 +53,8 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
     If you want to operate on all parameters use m[''] to wildcard select all paramters
     and concatenate them. Printing m[''] will result in printing of all parameters in detail.
     """
-    def __init__(self, name=None):
-        super(Parameterized, self).__init__(name=name)
+    def __init__(self, name=None, *a, **kw):
+        super(Parameterized, self).__init__(name=name, parent=None, parent_index=None, *a, **kw)
         self._in_init_ = True
         self._parameters_ = ParamList()
         self.size = sum(p.size for p in self._parameters_)
@@ -63,7 +62,6 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
         self._fixes_ = None
         self._param_slices_ = []
         self._connect_parameters()
-        self._added_names_ = set()
         del self._in_init_
     def add_parameter(self, param, index=None):
@@ -89,8 +87,8 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
             self._parameters_.append(param)
         else:
             start = sum(p.size for p in self._parameters_[:index])
-            self.constraints.shift(start, param.size)
-            self.priors.shift(start, param.size)
+            self.constraints.shift_right(start, param.size)
+            self.priors.shift_right(start, param.size)
             self.constraints.update(param.constraints, start)
             self.priors.update(param.priors, start)
             self._parameters_.insert(index, param)
@@ -115,21 +113,18 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
         """
         if not param in self._parameters_:
             raise RuntimeError, "Parameter {} does not belong to this object, remove parameters directly from their respective parents".format(param._short())
-        del self._parameters_[param._parent_index_]
+        start = sum([p.size for p in self._parameters_[:param._parent_index_]])
+        self._remove_parameter_name(param)
         self.size -= param.size
-        constr = param.constraints.copy()
-        param.constraints.clear()
-        param.constraints = constr
-        param._direct_parent_ = None
-        param._parent_index_ = None
-        param._connect_fixes()
-        param._notify_parent_change()
-        pname = adjust_name_for_printing(param.name)
-        if pname in self._added_names_:
-            del self.__dict__[pname]
-        self._connect_parameters()
-        #self._notify_parent_change()
+        del self._parameters_[param._parent_index_]
+        param._disconnect_parent()
+        self.constraints.shift_left(start, param.size)
         self._connect_fixes()
+        self._connect_parameters()
+        self._notify_parent_change()
     def _connect_parameters(self):
         # connect parameterlist to this parameterized object
@@ -145,19 +140,9 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
         for i, p in enumerate(self._parameters_):
             p._direct_parent_ = self
             p._parent_index_ = i
-            not_unique = []
             sizes.append(p.size + sizes[-1])
             self._param_slices_.append(slice(sizes[-2], sizes[-1]))
-            pname = adjust_name_for_printing(p.name)
-            # and makes sure to not delete programmatically added parameters
-            if pname in self.__dict__:
-                if isinstance(self.__dict__[pname], (Parameterized, Param)):
-                    if not p is self.__dict__[pname]:
-                        not_unique.append(pname)
-                        del self.__dict__[pname]
-            elif not (pname in not_unique):
-                self.__dict__[pname] = p
-                self._added_names_.add(pname)
+            self._add_parameter_name(p)
     #===========================================================================
     # Pickling operations
@@ -174,19 +159,7 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
             cPickle.dump(self, f, protocol)
         else:
             cPickle.dump(self, f, protocol)
-    def copy(self):
-        """Returns a (deep) copy of the current model """
-        # dc = dict()
-        # for k, v in self.__dict__.iteritems():
-        #     if k not in ['_highest_parent_', '_direct_parent_']:
-        #         dc[k] = copy.deepcopy(v)
-        # dc = copy.deepcopy(self.__dict__)
-        # dc['_highest_parent_'] = None
-        # dc['_direct_parent_'] = None
-        # s = self.__class__.new()
-        # s.__dict__ = dc
-        return copy.deepcopy(self)
     def __getstate__(self):
         if self._has_get_set_state():
             return self._getstate()
@@ -243,7 +216,7 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
     # Optimization handles:
     #===========================================================================
     def _get_param_names(self):
-        n = numpy.array([p.name_hirarchical + '[' + str(i) + ']' for p in self.flattened_parameters for i in p._indices()])
+        n = numpy.array([p.hirarchy_name() + '[' + str(i) + ']' for p in self.flattened_parameters for i in p._indices()])
         return n
     def _get_param_names_transformed(self):
         n = self._get_param_names()
@@ -265,14 +238,6 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
         if self._has_fixes(): tmp = self._get_params(); tmp[self._fixes_] = p; p = tmp; del tmp
         [numpy.put(p, ind, c.f(p[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__]
         return p
-    def _name_changed(self, param, old_name):
-        if hasattr(self, old_name) and old_name in self._added_names_:
-            delattr(self, old_name)
-            self._added_names_.remove(old_name)
-        pname = adjust_name_for_printing(param.name)
-        if pname not in self.__dict__:
-            self._added_names_.add(pname)
-            self.__dict__[pname] = param
     #===========================================================================
     # Indexable Handling
     #===========================================================================
@@ -335,10 +300,6 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
         # you can retrieve the original param through this method, by passing
         # the copy here
         return self._parameters_[param._parent_index_]
-    def hirarchy_name(self):
-        if self.has_parent():
-            return self._direct_parent_.hirarchy_name() + adjust_name_for_printing(self.name) + "."
-        return ''
     #===========================================================================
     # Get/set parameters:
     #===========================================================================
@@ -348,13 +309,11 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
         """
         if not isinstance(regexp, _pattern_type): regexp = compile(regexp)
         found_params = []
-        for p in self._parameters_:
-            if regexp.match(p.name) is not None:
+        for n, p in itertools.izip(self.parameter_names(False, False, True), self.flattened_parameters):
+            if regexp.match(n) is not None:
                 found_params.append(p)
-            if isinstance(p, Parameterized):
-                found_params.extend(p.grep_param_names(regexp))
         return found_params
-        return [param for param in self._parameters_ if regexp.match(param.name) is not None]
     def __getitem__(self, name, paramlist=None):
         if paramlist is None:
             paramlist = self.grep_param_names(name)
@@ -366,36 +325,22 @@ class Parameterized(Constrainable, Pickleable, Observable, Gradcheckable):
                 return ParamConcatenation(paramlist)
             return paramlist[-1]
         return ParamConcatenation(paramlist)
     def __setitem__(self, name, value, paramlist=None):
         try: param = self.__getitem__(name, paramlist)
         except AttributeError as a: raise a
         param[:] = value
-#     def __getattr__(self, name):
-#         return self.__getitem__(name)
-#     def __getattribute__(self, name):
-#         #try:
-#             return object.__getattribute__(self, name)
-#         except AttributeError:
-#             _, a, tb = sys.exc_info()
-#             try:
-#                 return self.__getitem__(name)
-#             except AttributeError:
-#                 raise AttributeError, a.message, tb
     def __setattr__(self, name, val):
         # override the default behaviour, if setting a param, so broadcasting can by used
-        if hasattr(self, "_parameters_"):
-            paramlist = self.grep_param_names(name)
-            if len(paramlist) == 1: self.__setitem__(name, val, paramlist); return
+        if hasattr(self, '_parameters_'):
+            pnames = self.parameter_names(False, adjust_for_printing=True, recursive=False)
+            if name in pnames: self._parameters_[pnames.index(name)][:] = val; return
         object.__setattr__(self, name, val);
     #===========================================================================
     # Printing:
     #===========================================================================
     def _short(self):
-        # short string to print
-        if self.has_parent():
-            return self._direct_parent_.hirarchy_name() + adjust_name_for_printing(self.name)
-        else:
-            return adjust_name_for_printing(self.name)
+        return self.hirarchy_name()
     @property
     def flattened_parameters(self):
         return [xi for x in self._parameters_ for xi in x.flattened_parameters]
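Note: `grep_param_names` now matches the regular expression against the flat, recursively built parameter names instead of walking the children itself, and `__setattr__` only broadcasts into direct child parameters found by name. A hedged sketch of the flat-name lookup (stand-in data, not the GPy API):

import re

def grep_param_names_sketch(names, params, pattern):
    """Return the parameters whose flat hierarchical name matches `pattern`.
    `names` and `params` are assumed to be aligned, as produced by
    parameter_names() and flattened_parameters in the code above."""
    regexp = re.compile(pattern)
    return [p for n, p in zip(names, params) if regexp.match(n) is not None]

names = ["rbf.variance", "rbf.lengthscale", "noise.variance"]
params = [1.0, 2.0, 0.1]  # stand-ins for Param objects
print(grep_param_names_sketch(names, params, ".*variance"))  # [1.0, 0.1]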

View file

@@ -3,21 +3,77 @@ Created on 6 Nov 2013
 @author: maxz
 '''
+import numpy as np
 from parameterized import Parameterized
 from param import Param
 from transformations import Logexp
-class Normal(Parameterized):
+class VariationalPrior(object):
+    def KL_divergence(self, variational_posterior):
+        raise NotImplementedError, "override this for variational inference of latent space"
+    def update_gradients_KL(self, variational_posterior):
+        """
+        updates the gradients for mean and variance **in place**
+        """
+        raise NotImplementedError, "override this for variational inference of latent space"
+class NormalPrior(VariationalPrior):
+    def KL_divergence(self, variational_posterior):
+        var_mean = np.square(variational_posterior.mean).sum()
+        var_S = (variational_posterior.variance - np.log(variational_posterior.variance)).sum()
+        return 0.5 * (var_mean + var_S) - 0.5 * variational_posterior.input_dim * variational_posterior.num_data
+    def update_gradients_KL(self, variational_posterior):
+        # dL:
+        variational_posterior.mean.gradient -= variational_posterior.mean
+        variational_posterior.variance.gradient -= (1. - (1. / (variational_posterior.variance))) * 0.5
+class VariationalPosterior(Parameterized):
+    def __init__(self, means=None, variances=None, name=None, **kw):
+        super(VariationalPosterior, self).__init__(name=name, **kw)
+        self.mean = Param("mean", means)
+        self.variance = Param("variance", variances, Logexp())
+        self.add_parameters(self.mean, self.variance)
+        self.num_data, self.input_dim = self.mean.shape
+        if self.has_uncertain_inputs():
+            assert self.variance.shape == self.mean.shape, "need one variance per sample and dimenion"
+    def has_uncertain_inputs(self):
+        return not self.variance is None
+class NormalPosterior(VariationalPosterior):
     '''
-    Normal distribution for variational approximations.
+    NormalPosterior distribution for variational approximations.
     holds the means and variances for a factorizing multivariate normal distribution
     '''
-    def __init__(self, means, variances, name='latent space'):
-        Parameterized.__init__(self, name=name)
-        self.mean = Param("mean", means)
-        self.variance = Param('variance', variances, Logexp())
-        self.add_parameters(self.mean, self.variance)
+    def plot(self, *args):
+        """
+        Plot latent space X in 1D:
+        See GPy.plotting.matplot_dep.variational_plots
+        """
+        import sys
+        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
+        from ...plotting.matplot_dep import variational_plots
+        return variational_plots.plot(self,*args)
+class SpikeAndSlabPosterior(VariationalPosterior):
+    '''
+    The SpikeAndSlab distribution for variational approximations.
+    '''
+    def __init__(self, means, variances, binary_prob, name='latent space'):
+        """
+        binary_prob : the probability of the distribution on the slab part.
+        """
+        super(SpikeAndSlabPosterior, self).__init__(means, variances, name)
+        self.gamma = Param("binary_prob",binary_prob,)
+        self.add_parameter(self.gamma)
     def plot(self, *args):
         """

View file

@@ -5,8 +5,9 @@ import numpy as np
 from ..util.linalg import mdot
 from gp import GP
 from parameterization.param import Param
-from GPy.inference.latent_function_inference import var_dtc
+from ..inference.latent_function_inference import var_dtc
 from .. import likelihoods
+from parameterization.variational import NormalPosterior
 class SparseGP(GP):
     """
@@ -45,45 +46,44 @@ class SparseGP(GP):
         self.Z = Param('inducing inputs', Z)
         self.num_inducing = Z.shape[0]
-        if not (X_variance is None):
-            assert X_variance.shape == X.shape
-        self.X_variance = X_variance
-        GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name)
+        self.q = NormalPosterior(X, X_variance)
+        GP.__init__(self, self.q.mean, Y, kernel, likelihood, inference_method=inference_method, name=name)
         self.add_parameter(self.Z, index=0)
         self.parameters_changed()
-    def _update_gradients_Z(self, add=False):
-        #The derivative of the bound wrt the inducing inputs Z ( unless they're all fixed)
-        if not self.Z.is_fixed:
-            if add: self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
-            else: self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
-        if self.X_variance is None:
-            self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X)
-        else:
-            self.Z.gradient += self.kern.dpsi1_dZ(self.grad_dict['dL_dpsi1'], self.Z, self.X, self.X_variance)
-            self.Z.gradient += self.kern.dpsi2_dZ(self.grad_dict['dL_dpsi2'], self.Z, self.X, self.X_variance)
+    def has_uncertain_inputs(self):
+        return self.q.has_uncertain_inputs()
     def parameters_changed(self):
+        if self.has_uncertain_inputs():
+            self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference_latent(self.kern, self.q, self.Z, self.likelihood, self.Y)
+        else:
             self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y)
-        self._update_gradients_Z(add=False)
+        self.likelihood.update_gradients(self.grad_dict.pop('partial_for_likelihood'))
+        if self.has_uncertain_inputs():
+            self.kern.update_gradients_variational(posterior_variational=self.q, Z=self.Z, **self.grad_dict)
+            self.Z.gradient = self.kern.gradients_Z_variational(posterior_variational=self.q, Z=self.Z, **self.grad_dict)
+        else:
+            self.kern.update_gradients_sparse(X=self.X, Z=self.Z, **self.grad_dict)
+            self.Z.gradient = self.kern.gradients_Z_sparse(X=self.X, Z=self.Z, **self.grad_dict)
-    def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
+    def _raw_predict(self, Xnew, X_variance_new=None, full_cov=False):
         """
         Make a prediction for the latent function values
         """
         if X_variance_new is None:
-            Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
+            Kx = self.kern.K(self.Z, Xnew)
             mu = np.dot(Kx.T, self.posterior.woodbury_vector)
             if full_cov:
-                Kxx = self.kern.K(Xnew, which_parts=which_parts)
-                var = Kxx - mdot(Kx.T, self.posterior.woodbury_inv, Kx) # NOTE this won't work for plotting
+                Kxx = self.kern.K(Xnew)
+                #var = Kxx - mdot(Kx.T, self.posterior.woodbury_inv, Kx)
+                var = Kxx - np.tensordot(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx).T, Kx, [1,0]).swapaxes(1,2)
             else:
-                Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts)
-                var = Kxx - np.sum(Kx * np.dot(self.posterior.woodbury_inv, Kx), 0)
+                Kxx = self.kern.Kdiag(Xnew)
+                var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
         else:
-            # assert which_parts=='all', "swithching out parts of variational kernels is not implemented"
-            Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts
+            Kx = self.kern.psi1(self.Z, Xnew, X_variance_new)
             mu = np.dot(Kx, self.Cpsi1V)
             if full_cov:
                 raise NotImplementedError, "TODO"
@@ -91,7 +91,7 @@ class SparseGP(GP):
             Kxx = self.kern.psi0(self.Z, Xnew, X_variance_new)
             psi2 = self.kern.psi2(self.Z, Xnew, X_variance_new)
             var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
-        return mu, var[:,None]
+        return mu, var
     def _getstate(self):
@@ -101,12 +101,10 @@ class SparseGP(GP):
         """
         return GP._getstate(self) + [self.Z,
                 self.num_inducing,
-                self.has_uncertain_inputs,
                 self.X_variance]
     def _setstate(self, state):
         self.X_variance = state.pop()
-        self.has_uncertain_inputs = state.pop()
         self.num_inducing = state.pop()
         self.Z = state.pop()
         GP._setstate(self, state)
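Note: the rewritten diagonal branch of `SparseGP._raw_predict` broadcasts over a possible third (per-output) dimension of `woodbury_inv` via `np.atleast_3d(...)`; the one-liner is just Kdiag(Xnew) minus the per-output quadratic form diag(Kx^T Wi Kx). A numpy sketch checking the broadcast against an explicit loop (shapes and names are illustrative):

import numpy as np

M, Nnew, P = 4, 3, 2                 # inducing points, test points, outputs
Kx = np.random.randn(M, Nnew)        # plays the role of K(Z, Xnew)
Kxx = np.random.rand(Nnew)           # plays the role of Kdiag(Xnew)
Wi = np.random.randn(M, M, P)        # one Woodbury inverse per output

# broadcast form used above: (P, M, Nnew) * (1, M, Nnew), summed over the M axis
var_broadcast = (Kxx - np.sum(np.dot(np.atleast_3d(Wi).T, Kx) * Kx[None, :, :], 1)).T

# explicit per-output loop computing diag(Kx.T Wi[:,:,p] Kx)
var_loop = np.column_stack([Kxx - np.einsum('in,ij,jn->n', Kx, Wi[:, :, p], Kx)
                            for p in range(P)])

print(np.allclose(var_broadcast, var_loop))  # True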

View file

@ -1,9 +1,9 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as _np import numpy as _np
default_seed = _np.random.seed(123344) #default_seed = _np.random.seed(123344)
def bgplvm_test_model(seed=default_seed, optimize=False, verbose=1, plot=False, output_dim=200, nan=False): def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan=False):
""" """
model for testing purposes. Samples from a GP with rbf kernel and learns model for testing purposes. Samples from a GP with rbf kernel and learns
the samples with a new kernel. Normally not for optimization, just model cheking the samples with a new kernel. Normally not for optimization, just model cheking
@ -21,19 +21,20 @@ def bgplvm_test_model(seed=default_seed, optimize=False, verbose=1, plot=False,
# generate GPLVM-like data # generate GPLVM-like data
X = _np.random.rand(num_inputs, input_dim) X = _np.random.rand(num_inputs, input_dim)
lengthscales = _np.random.rand(input_dim) #lengthscales = _np.random.rand(input_dim)
k = (GPy.kern.rbf(input_dim, .5, lengthscales, ARD=True) #k = (GPy.kern.RBF(input_dim, .5, lengthscales, ARD=True)
#+ GPy.kern.white(input_dim, 0.01) ##+ GPy.kern.white(input_dim, 0.01)
) #)
k = GPy.kern.Linear(input_dim, ARD=1)# + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
K = k.K(X) K = k.K(X)
Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, (output_dim,)).T Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, (output_dim,)).T
# k = GPy.kern.rbf_inv(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim) # k = GPy.kern.RBF_inv(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim)
k = GPy.kern.linear(input_dim)# + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001) #k = GPy.kern.linear(input_dim)# + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
# k = GPy.kern.rbf(input_dim, ARD = False) + GPy.kern.white(input_dim, 0.00001) # k = GPy.kern.RBF(input_dim, ARD = False) + GPy.kern.white(input_dim, 0.00001)
# k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.rbf(input_dim, .3, _np.ones(input_dim) * .2, ARD=True) # k = GPy.kern.RBF(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.RBF(input_dim, .3, _np.ones(input_dim) * .2, ARD=True)
# k = GPy.kern.rbf(input_dim, .5, 2., ARD=0) + GPy.kern.rbf(input_dim, .3, .2, ARD=0) # k = GPy.kern.RBF(input_dim, .5, 2., ARD=0) + GPy.kern.RBF(input_dim, .3, .2, ARD=0)
# k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.linear(input_dim, _np.ones(input_dim) * .2, ARD=True) # k = GPy.kern.RBF(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.linear(input_dim, _np.ones(input_dim) * .2, ARD=True)
p = .3 p = .3
@ -41,14 +42,14 @@ def bgplvm_test_model(seed=default_seed, optimize=False, verbose=1, plot=False,
if nan: if nan:
m.inference_method = GPy.inference.latent_function_inference.var_dtc.VarDTCMissingData() m.inference_method = GPy.inference.latent_function_inference.var_dtc.VarDTCMissingData()
m.Y[_np.random.binomial(1,p,size=(Y.shape))] = _np.nan m.Y[_np.random.binomial(1,p,size=(Y.shape)).astype(bool)] = _np.nan
m.parameters_changed() m.parameters_changed()
#=========================================================================== #===========================================================================
# randomly obstruct data with percentage p # randomly obstruct data with percentage p
#=========================================================================== #===========================================================================
#m2 = GPy.models.BayesianGPLVMWithMissingData(Y_obstruct, input_dim, kernel=k, num_inducing=num_inducing) #m2 = GPy.models.BayesianGPLVMWithMissingData(Y_obstruct, input_dim, kernel=k, num_inducing=num_inducing)
m.lengthscales = lengthscales #m.lengthscales = lengthscales
if plot: if plot:
import matplotlib.pyplot as pb import matplotlib.pyplot as pb
@ -73,7 +74,7 @@ def gplvm_oil_100(optimize=True, verbose=1, plot=True):
data = GPy.util.datasets.oil_100() data = GPy.util.datasets.oil_100()
Y = data['X'] Y = data['X']
# create simple GP model # create simple GP model
kernel = GPy.kern.rbf(6, ARD=True) + GPy.kern.bias(6) kernel = GPy.kern.RBF(6, ARD=True) + GPy.kern.Bias(6)
m = GPy.models.GPLVM(Y, 6, kernel=kernel) m = GPy.models.GPLVM(Y, 6, kernel=kernel)
m.data_labels = data['Y'].argmax(axis=1) m.data_labels = data['Y'].argmax(axis=1)
if optimize: m.optimize('scg', messages=verbose) if optimize: m.optimize('scg', messages=verbose)
@ -88,7 +89,7 @@ def sparse_gplvm_oil(optimize=True, verbose=0, plot=True, N=100, Q=6, num_induci
Y = Y - Y.mean(0) Y = Y - Y.mean(0)
Y /= Y.std(0) Y /= Y.std(0)
# Create the model # Create the model
kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q) kernel = GPy.kern.RBF(Q, ARD=True) + GPy.kern.Bias(Q)
m = GPy.models.SparseGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing) m = GPy.models.SparseGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing)
m.data_labels = data['Y'][:N].argmax(axis=1) m.data_labels = data['Y'][:N].argmax(axis=1)
@ -138,7 +139,7 @@ def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=15, Q=4
(1 - var))) + .001 (1 - var))) + .001
Z = _np.random.permutation(X)[:num_inducing] Z = _np.random.permutation(X)[:num_inducing]
kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2)) kernel = GPy.kern.RBF(Q, ARD=True) + GPy.kern.Bias(Q, _np.exp(-2)) + GPy.kern.White(Q, _np.exp(-2))
m = BayesianGPLVM(Y, Q, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel) m = BayesianGPLVM(Y, Q, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel)
m.data_colors = c m.data_colors = c
@ -158,46 +159,51 @@ def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=15, Q=4
def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, max_iters=1000, **k): def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, max_iters=1000, **k):
import GPy import GPy
from GPy.likelihoods import Gaussian
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
_np.random.seed(0) _np.random.seed(0)
data = GPy.util.datasets.oil() data = GPy.util.datasets.oil()
kernel = GPy.kern.rbf_inv(Q, 1., [.1] * Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) kernel = GPy.kern.RBF(Q, 1., [.1] * Q, ARD=True)# + GPy.kern.Bias(Q, _np.exp(-2))
Y = data['X'][:N] Y = data['X'][:N]
Yn = Gaussian(Y, normalize=True) m = GPy.models.BayesianGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing, **k)
m = GPy.models.BayesianGPLVM(Yn, Q, kernel=kernel, num_inducing=num_inducing, **k)
m.data_labels = data['Y'][:N].argmax(axis=1) m.data_labels = data['Y'][:N].argmax(axis=1)
m['noise'] = Yn.Y.var() / 100. m['.*noise.var'] = Y.var() / 100.
if optimize: if optimize:
m.optimize('scg', messages=verbose, max_iters=max_iters, gtol=.05) m.optimize('scg', messages=verbose, max_iters=max_iters, gtol=.05)
if plot: if plot:
y = m.likelihood.Y[0, :] y = m.Y[0, :]
fig, (latent_axes, sense_axes) = plt.subplots(1, 2) fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
m.plot_latent(ax=latent_axes) m.plot_latent(ax=latent_axes)
data_show = GPy.util.visualize.vector_show(y) data_show = GPy.plotting.matplot_dep.visualize.vector_show(y)
lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :], # @UnusedVariable lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X[0, :], # @UnusedVariable
m, data_show, latent_axes=latent_axes, sense_axes=sense_axes) m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
raw_input('Press enter to finish') raw_input('Press enter to finish')
plt.close(fig) plt.close(fig)
return m return m
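Note the initialisation `m['.*noise.var'] = Y.var() / 100.` above: in the params branch, parameters are addressed by a regular expression on their printed names, and starting the Gaussian noise at roughly 1% of the data variance is a common heuristic so the optimiser does not explain everything as noise from the start. A rough sketch of the pattern (the default kernel and the exact parameter name are assumptions, not taken from this diff):

import numpy as np
import GPy

Y = np.random.randn(50, 12)
m = GPy.models.BayesianGPLVM(Y, 5, num_inducing=10)

# address parameters by regex on their names (params-branch convention)
m['.*noise.var'] = Y.var() / 100.   # start the noise at ~1% of the data variance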
def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False): def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
_np.random.seed(1234)
x = _np.linspace(0, 4 * _np.pi, N)[:, None] x = _np.linspace(0, 4 * _np.pi, N)[:, None]
s1 = _np.vectorize(lambda x: _np.sin(x)) s1 = _np.vectorize(lambda x: -_np.sin(_np.exp(x)))
s2 = _np.vectorize(lambda x: _np.cos(x)) s2 = _np.vectorize(lambda x: _np.cos(x)**2)
s3 = _np.vectorize(lambda x:-_np.exp(-_np.cos(2 * x))) s3 = _np.vectorize(lambda x:-_np.exp(-_np.cos(2 * x)))
sS = _np.vectorize(lambda x: _np.sin(2 * x)) sS = _np.vectorize(lambda x: x*_np.sin(x))
s1 = s1(x) s1 = s1(x)
s2 = s2(x) s2 = s2(x)
s3 = s3(x) s3 = s3(x)
sS = sS(x) sS = sS(x)
S1 = _np.hstack([s1, sS]) s1 -= s1.mean(); s1 /= s1.std(0)
s2 -= s2.mean(); s2 /= s2.std(0)
s3 -= s3.mean(); s3 /= s3.std(0)
sS -= sS.mean(); sS /= sS.std(0)
S1 = _np.hstack([s1, s2, sS])
S2 = _np.hstack([s2, s3, sS]) S2 = _np.hstack([s2, s3, sS])
S3 = _np.hstack([s3, sS]) S3 = _np.hstack([s3, sS])
@ -268,7 +274,7 @@ def bgplvm_simulation(optimize=True, verbose=1,
D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 3, 10 D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 3, 10
_, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
Y = Ylist[0] Y = Ylist[0]
k = kern.linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k) m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k)
if optimize: if optimize:
@ -288,16 +294,18 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1,
from GPy.models import BayesianGPLVM from GPy.models import BayesianGPLVM
from GPy.inference.latent_function_inference.var_dtc import VarDTCMissingData from GPy.inference.latent_function_inference.var_dtc import VarDTCMissingData
D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 3, 10 D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 5, 9
_, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
Y = Ylist[0] Y = Ylist[0]
k = kern.linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
inan = _np.random.binomial(1, .3, size=Y.shape) inan = _np.random.binomial(1, .6, size=Y.shape).astype(bool)
m = BayesianGPLVM(Y, Q, init="random", num_inducing=num_inducing, kernel=k) m = BayesianGPLVM(Y.copy(), Q, init="random", num_inducing=num_inducing, kernel=k)
m.inference_method = VarDTCMissingData() m.inference_method = VarDTCMissingData()
m.Y[inan] = _np.nan m.Y[inan] = _np.nan
m.q.variance *= .1
m.parameters_changed() m.parameters_changed()
m.Yreal = Y
if optimize: if optimize:
print "Optimizing model:" print "Optimizing model:"
@ -318,7 +326,7 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
_, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
likelihood_list = [Gaussian(x, normalize=True) for x in Ylist] likelihood_list = [Gaussian(x, normalize=True) for x in Ylist]
k = kern.linear(Q, ARD=True) + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2)) k = kern.Linear(Q, ARD=True) + kern.Bias(Q, _np.exp(-2)) + kern.White(Q, _np.exp(-2))
m = MRD(likelihood_list, input_dim=Q, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw) m = MRD(likelihood_list, input_dim=Q, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw)
m.ensure_default_constraints() m.ensure_default_constraints()
@ -345,15 +353,15 @@ def brendan_faces(optimize=True, verbose=True, plot=True):
m = GPy.models.GPLVM(Yn, Q) m = GPy.models.GPLVM(Yn, Q)
# optimize # optimize
m.constrain('rbf|noise|white', GPy.core.transformations.logexp_clipped()) m.constrain('rbf|noise|white', GPy.transformations.LogexpClipped())
if optimize: m.optimize('scg', messages=verbose, max_iters=1000) if optimize: m.optimize('scg', messages=verbose, max_iters=1000)
if plot: if plot:
ax = m.plot_latent(which_indices=(0, 1)) ax = m.plot_latent(which_indices=(0, 1))
y = m.likelihood.Y[0, :] y = m.likelihood.Y[0, :]
data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, order='F', invert=False, scale=False) data_show = GPy.plotting.matplot_dep.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, order='F', invert=False, scale=False)
GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
raw_input('Press enter to finish') raw_input('Press enter to finish')
return m return m
@ -372,8 +380,8 @@ def olivetti_faces(optimize=True, verbose=True, plot=True):
if plot: if plot:
ax = m.plot_latent(which_indices=(0, 1)) ax = m.plot_latent(which_indices=(0, 1))
y = m.likelihood.Y[0, :] y = m.likelihood.Y[0, :]
data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False) data_show = GPy.plotting.matplot_dep.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False)
GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
raw_input('Press enter to finish') raw_input('Press enter to finish')
return m return m
@ -388,8 +396,8 @@ def stick_play(range=None, frame_rate=15, optimize=False, verbose=True, plot=Tru
Y = data['Y'][range[0]:range[1], :].copy() Y = data['Y'][range[0]:range[1], :].copy()
if plot: if plot:
y = Y[0, :] y = Y[0, :]
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
GPy.util.visualize.data_play(Y, data_show, frame_rate) GPy.plotting.matplot_dep.visualize.data_play(Y, data_show, frame_rate)
return Y return Y
def stick(kernel=None, optimize=True, verbose=True, plot=True): def stick(kernel=None, optimize=True, verbose=True, plot=True):
@ -400,12 +408,12 @@ def stick(kernel=None, optimize=True, verbose=True, plot=True):
# optimize # optimize
m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel) m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel)
if optimize: m.optimize(messages=verbose, max_f_eval=10000) if optimize: m.optimize(messages=verbose, max_f_eval=10000)
if plot and GPy.util.visualize.visual_available: if plot and GPy.plotting.matplot_dep.visualize.visual_available:
plt.clf plt.clf
ax = m.plot_latent() ax = m.plot_latent()
y = m.likelihood.Y[0, :] y = m.likelihood.Y[0, :]
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
raw_input('Press enter to finish') raw_input('Press enter to finish')
return m return m
@ -419,12 +427,12 @@ def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True):
mapping = GPy.mappings.Linear(data['Y'].shape[1], 2) mapping = GPy.mappings.Linear(data['Y'].shape[1], 2)
m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping) m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
if optimize: m.optimize(messages=verbose, max_f_eval=10000) if optimize: m.optimize(messages=verbose, max_f_eval=10000)
if plot and GPy.util.visualize.visual_available: if plot and GPy.plotting.matplot_dep.visualize.visual_available:
plt.clf plt.clf
ax = m.plot_latent() ax = m.plot_latent()
y = m.likelihood.Y[0, :] y = m.likelihood.Y[0, :]
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
raw_input('Press enter to finish') raw_input('Press enter to finish')
return m return m
@ -435,16 +443,16 @@ def bcgplvm_stick(kernel=None, optimize=True, verbose=True, plot=True):
data = GPy.util.datasets.osu_run1() data = GPy.util.datasets.osu_run1()
# optimize # optimize
back_kernel=GPy.kern.rbf(data['Y'].shape[1], lengthscale=5.) back_kernel=GPy.kern.RBF(data['Y'].shape[1], lengthscale=5.)
mapping = GPy.mappings.Kernel(X=data['Y'], output_dim=2, kernel=back_kernel) mapping = GPy.mappings.Kernel(X=data['Y'], output_dim=2, kernel=back_kernel)
m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping) m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
if optimize: m.optimize(messages=verbose, max_f_eval=10000) if optimize: m.optimize(messages=verbose, max_f_eval=10000)
if plot and GPy.util.visualize.visual_available: if plot and GPy.plotting.matplot_dep.visualize.visual_available:
plt.clf plt.clf
ax = m.plot_latent() ax = m.plot_latent()
y = m.likelihood.Y[0, :] y = m.likelihood.Y[0, :]
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
raw_input('Press enter to finish') raw_input('Press enter to finish')
return m return m
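The two BC-GPLVM examples above differ only in the back-constraint mapping: a linear map of the data versus a kernel map, so that the latent coordinates are forced to be a smooth function of the observed data. A stand-alone numpy sketch of the kernel back-constraint idea (the RBF gram matrix and the weights A are illustrative, not this diff's API):

import numpy as np

def rbf_gram(Y, lengthscale=5.):
    # squared distances between observed data points
    d2 = np.sum((Y[:, None, :] - Y[None, :, :])**2, axis=-1)
    return np.exp(-0.5 * d2 / lengthscale**2)

N, D, Q = 40, 10, 2
Y = np.random.randn(N, D)
A = np.random.randn(N, Q) * 0.1        # back-constraint weights (learned in practice)

# latent coordinates constrained to be a smooth function of Y
X = rbf_gram(Y).dot(A)                 # shape (N, Q)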
@ -470,7 +478,7 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
data = GPy.util.datasets.osu_run1() data = GPy.util.datasets.osu_run1()
Q = 6 Q = 6
kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2)) kernel = GPy.kern.RBF(Q, ARD=True) + GPy.kern.Bias(Q, _np.exp(-2)) + GPy.kern.White(Q, _np.exp(-2))
m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel) m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel)
# optimize # optimize
m.ensure_default_constraints() m.ensure_default_constraints()
@ -481,8 +489,8 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
plt.sca(latent_axes) plt.sca(latent_axes)
m.plot_latent() m.plot_latent()
y = m.likelihood.Y[0, :].copy() y = m.likelihood.Y[0, :].copy()
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
GPy.util.visualize.lvm_dimselect(m.X[0, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes) GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X[0, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
raw_input('Press enter to finish') raw_input('Press enter to finish')
return m return m
@ -501,8 +509,8 @@ def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose
if plot: if plot:
ax = m.plot_latent() ax = m.plot_latent()
y = m.likelihood.Y[0, :] y = m.likelihood.Y[0, :]
data_show = GPy.util.visualize.skeleton_show(y[None, :], data['skel']) data_show = GPy.plotting.matplot_dep.visualize.skeleton_show(y[None, :], data['skel'])
lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
raw_input('Press enter to finish') raw_input('Press enter to finish')
lvm_visualizer.close() lvm_visualizer.close()
View file
@ -41,7 +41,7 @@ def coregionalization_toy2(optimize=True, plot=True):
Y = np.vstack((Y1, Y2)) Y = np.vstack((Y1, Y2))
#build the kernel #build the kernel
k1 = GPy.kern.rbf(1) + GPy.kern.bias(1) k1 = GPy.kern.RBF(1) + GPy.kern.bias(1)
k2 = GPy.kern.coregionalize(2,1) k2 = GPy.kern.coregionalize(2,1)
k = k1**k2 k = k1**k2
m = GPy.models.GPRegression(X, Y, kernel=k) m = GPy.models.GPRegression(X, Y, kernel=k)
@ -68,7 +68,7 @@ def coregionalization_toy2(optimize=True, plot=True):
# Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05 # Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
# Y = np.vstack((Y1, Y2)) # Y = np.vstack((Y1, Y2))
# #
# k1 = GPy.kern.rbf(1) # k1 = GPy.kern.RBF(1)
# m = GPy.models.GPMultioutputRegression(X_list=[X1,X2],Y_list=[Y1,Y2],kernel_list=[k1]) # m = GPy.models.GPMultioutputRegression(X_list=[X1,X2],Y_list=[Y1,Y2],kernel_list=[k1])
# m.constrain_fixed('.*rbf_var', 1.) # m.constrain_fixed('.*rbf_var', 1.)
# m.optimize(max_iters=100) # m.optimize(max_iters=100)
@ -127,7 +127,7 @@ def epomeo_gpx(max_iters=200, optimize=True, plot=True):
Z = np.hstack((np.linspace(t[:,0].min(), t[:, 0].max(), num_inducing)[:, None], Z = np.hstack((np.linspace(t[:,0].min(), t[:, 0].max(), num_inducing)[:, None],
np.random.randint(0, 4, num_inducing)[:, None])) np.random.randint(0, 4, num_inducing)[:, None]))
k1 = GPy.kern.rbf(1) k1 = GPy.kern.RBF(1)
k2 = GPy.kern.coregionalize(output_dim=5, rank=5) k2 = GPy.kern.coregionalize(output_dim=5, rank=5)
k = k1**k2 k = k1**k2
@ -156,7 +156,7 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
data['Y'] = data['Y'] - np.mean(data['Y']) data['Y'] = data['Y'] - np.mean(data['Y'])
lls = GPy.examples.regression._contour_data(data, length_scales, log_SNRs, GPy.kern.rbf) lls = GPy.examples.regression._contour_data(data, length_scales, log_SNRs, GPy.kern.RBF)
if plot: if plot:
pb.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=pb.cm.jet) pb.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=pb.cm.jet)
ax = pb.gca() ax = pb.gca()
@ -172,8 +172,8 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
optim_point_y = np.empty(2) optim_point_y = np.empty(2)
np.random.seed(seed=seed) np.random.seed(seed=seed)
for i in range(0, model_restarts): for i in range(0, model_restarts):
# kern = GPy.kern.rbf(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.)) # kern = GPy.kern.RBF(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.))
kern = GPy.kern.rbf(1, variance=np.random.uniform(1e-3, 1), lengthscale=np.random.uniform(5, 50)) kern = GPy.kern.RBF(1, variance=np.random.uniform(1e-3, 1), lengthscale=np.random.uniform(5, 50))
m = GPy.models.GPRegression(data['X'], data['Y'], kernel=kern) m = GPy.models.GPRegression(data['X'], data['Y'], kernel=kern)
m['noise_variance'] = np.random.uniform(1e-3, 1) m['noise_variance'] = np.random.uniform(1e-3, 1)
@ -196,7 +196,7 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
ax.set_ylim(ylim) ax.set_ylim(ylim)
return m # (models, lls) return m # (models, lls)
def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf): def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.RBF):
""" """
Evaluate the GP objective function for a given data set for a range of Evaluate the GP objective function for a given data set for a range of
signal to noise ratios and a range of lengthscales. signal to noise ratios and a range of lengthscales.
@ -278,10 +278,10 @@ def toy_poisson_rbf_1d_laplace(optimize=True, plot=True):
optimizer='scg' optimizer='scg'
x_len = 30 x_len = 30
X = np.linspace(0, 10, x_len)[:, None] X = np.linspace(0, 10, x_len)[:, None]
f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.rbf(1).K(X)) f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.RBF(1).K(X))
Y = np.array([np.random.poisson(np.exp(f)) for f in f_true])[:,None] Y = np.array([np.random.poisson(np.exp(f)) for f in f_true])[:,None]
kern = GPy.kern.rbf(1) kern = GPy.kern.RBF(1)
poisson_lik = GPy.likelihoods.Poisson() poisson_lik = GPy.likelihoods.Poisson()
laplace_inf = GPy.inference.latent_function_inference.LaplaceInference() laplace_inf = GPy.inference.latent_function_inference.LaplaceInference()
@ -319,10 +319,10 @@ def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize
if kernel_type == 'linear': if kernel_type == 'linear':
kernel = GPy.kern.linear(X.shape[1], ARD=1) kernel = GPy.kern.linear(X.shape[1], ARD=1)
elif kernel_type == 'rbf_inv': elif kernel_type == 'rbf_inv':
kernel = GPy.kern.rbf_inv(X.shape[1], ARD=1) kernel = GPy.kern.RBF_inv(X.shape[1], ARD=1)
else: else:
kernel = GPy.kern.rbf(X.shape[1], ARD=1) kernel = GPy.kern.RBF(X.shape[1], ARD=1)
kernel += GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1]) kernel += GPy.kern.White(X.shape[1]) + GPy.kern.bias(X.shape[1])
m = GPy.models.GPRegression(X, Y, kernel) m = GPy.models.GPRegression(X, Y, kernel)
# len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25 # len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25
# m.set_prior('.*lengthscale',len_prior) # m.set_prior('.*lengthscale',len_prior)
@ -358,9 +358,9 @@ def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4, o
if kernel_type == 'linear': if kernel_type == 'linear':
kernel = GPy.kern.linear(X.shape[1], ARD=1) kernel = GPy.kern.linear(X.shape[1], ARD=1)
elif kernel_type == 'rbf_inv': elif kernel_type == 'rbf_inv':
kernel = GPy.kern.rbf_inv(X.shape[1], ARD=1) kernel = GPy.kern.RBF_inv(X.shape[1], ARD=1)
else: else:
kernel = GPy.kern.rbf(X.shape[1], ARD=1) kernel = GPy.kern.RBF(X.shape[1], ARD=1)
#kernel += GPy.kern.bias(X.shape[1]) #kernel += GPy.kern.bias(X.shape[1])
X_variance = np.ones(X.shape) * 0.5 X_variance = np.ones(X.shape) * 0.5
m = GPy.models.SparseGPRegression(X, Y, kernel, X_variance=X_variance) m = GPy.models.SparseGPRegression(X, Y, kernel, X_variance=X_variance)
@ -421,7 +421,7 @@ def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, opti
X = np.random.uniform(-3., 3., (num_samples, 1)) X = np.random.uniform(-3., 3., (num_samples, 1))
Y = np.sin(X) + np.random.randn(num_samples, 1) * 0.05 Y = np.sin(X) + np.random.randn(num_samples, 1) * 0.05
# construct kernel # construct kernel
rbf = GPy.kern.rbf(1) rbf = GPy.kern.RBF(1)
# create simple GP Model # create simple GP Model
m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing) m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing)
m.checkgrad(verbose=1) m.checkgrad(verbose=1)
@ -444,7 +444,7 @@ def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100, opt
Y[inan] = np.nan Y[inan] = np.nan
# construct kernel # construct kernel
rbf = GPy.kern.rbf(2) rbf = GPy.kern.RBF(2)
# create simple GP Model # create simple GP Model
m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing) m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing)
@ -476,9 +476,9 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
# likelihood = GPy.likelihoods.Gaussian(Y) # likelihood = GPy.likelihoods.Gaussian(Y)
Z = np.random.uniform(-3., 3., (7, 1)) Z = np.random.uniform(-3., 3., (7, 1))
k = GPy.kern.rbf(1) k = GPy.kern.RBF(1)
# create simple GP Model - no input uncertainty on this one # create simple GP Model - no input uncertainty on this one
m = GPy.models.SparseGPRegression(X, Y, kernel=GPy.kern.rbf(1), Z=Z) m = GPy.models.SparseGPRegression(X, Y, kernel=GPy.kern.RBF(1), Z=Z)
if optimize: if optimize:
m.optimize('scg', messages=1, max_iters=max_iters) m.optimize('scg', messages=1, max_iters=max_iters)
@ -489,7 +489,7 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
print m print m
# the same Model with uncertainty # the same Model with uncertainty
m = GPy.models.SparseGPRegression(X, Y, kernel=GPy.kern.rbf(1), Z=Z, X_variance=S) m = GPy.models.SparseGPRegression(X, Y, kernel=GPy.kern.RBF(1), Z=Z, X_variance=S)
if optimize: if optimize:
m.optimize('scg', messages=1, max_iters=max_iters) m.optimize('scg', messages=1, max_iters=max_iters)
if plot: if plot:
View file
@ -16,7 +16,9 @@ If the likelihood object is something other than Gaussian, then exact inference
is not tractable. We then resort to a Laplace approximation (laplace.py) or is not tractable. We then resort to a Laplace approximation (laplace.py) or
expectation propagation (ep.py). expectation propagation (ep.py).
The inference methods return a "Posterior" instance, which is a simple The inference methods return a
:class:`~GPy.inference.latent_function_inference.posterior.Posterior`
instance, which is a simple
structure which contains a summary of the posterior. The model classes can then structure which contains a summary of the posterior. The model classes can then
use this posterior object for making predictions, optimizing hyper-parameters, use this posterior object for making predictions, optimizing hyper-parameters,
etc. etc.
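For the Gaussian case, the "summary of the posterior" amounts to the usual Woodbury quantities plus the log marginal likelihood. A minimal plain-numpy sketch of what an exact-inference object computes for a single output column (illustrative only, not this module's actual implementation):

import numpy as np

def exact_gaussian_inference(K, y, noise_var):
    """Sketch of exact inference for a Gaussian likelihood: returns the
    Woodbury quantities summarising the posterior and the log marginal
    likelihood (y is a 1-D vector here)."""
    N = K.shape[0]
    Ky = K + noise_var * np.eye(N)
    L = np.linalg.cholesky(Ky)
    alpha = np.linalg.solve(L.T, np.linalg.solve(L, y))           # woodbury vector
    Ky_inv = np.linalg.solve(L.T, np.linalg.solve(L, np.eye(N)))  # woodbury inverse
    log_marginal = (-0.5 * y.dot(alpha)
                    - np.log(np.diag(L)).sum()
                    - 0.5 * N * np.log(2 * np.pi))
    return alpha, Ky_inv, log_marginal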
@ -29,3 +31,15 @@ expectation_propagation = 'foo' # TODO
from GPy.inference.latent_function_inference.var_dtc import VarDTC from GPy.inference.latent_function_inference.var_dtc import VarDTC
from dtc import DTC from dtc import DTC
from fitc import FITC from fitc import FITC
# class FullLatentFunctionData(object):
#
#
# class LatentFunctionInference(object):
# def inference(self, kern, X, likelihood, Y, Y_metadata=None):
# """
# Do inference on the latent functions given a covariance function `kern`,
# inputs and outputs `X` and `Y`, and a likelihood `likelihood`.
# Additional metadata for the outputs `Y` can be given in `Y_metadata`.
# """
# raise NotImplementedError, "Abstract base class for full inference"
View file
@ -32,7 +32,7 @@ class DTC(object):
#make sure the noise is not hetero #make sure the noise is not hetero
beta = 1./np.squeeze(likelihood.variance) beta = 1./np.squeeze(likelihood.variance)
if beta.size > 1: if beta.size > 1:
raise NotImplementedError, "no hetero noise with this implementatino of DTC" raise NotImplementedError, "no hetero noise with this implementation of DTC"
Kmm = kern.K(Z) Kmm = kern.K(Z)
Knn = kern.Kdiag(X) Knn = kern.Kdiag(X)
@ -89,4 +89,85 @@ class DTC(object):
return post, log_marginal, grad_dict return post, log_marginal, grad_dict
class vDTC(object):
def __init__(self):
self.const_jitter = 1e-6
def inference(self, kern, X, X_variance, Z, likelihood, Y):
assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."
#TODO: MAX! fix this!
from ...util.misc import param_to_array
Y = param_to_array(Y)
num_inducing, _ = Z.shape
num_data, output_dim = Y.shape
#make sure the noise is not hetero
beta = 1./np.squeeze(likelihood.variance)
if beta.size > 1:
raise NotImplementedError, "no hetero noise with this implementation of DTC"
Kmm = kern.K(Z)
Knn = kern.Kdiag(X)
Knm = kern.K(X, Z)
U = Knm
Uy = np.dot(U.T,Y)
#factor Kmm
Kmmi, L, Li, _ = pdinv(Kmm)
# Compute A
LiUTbeta = np.dot(Li, U.T)*np.sqrt(beta)
A_ = tdot(LiUTbeta)
trace_term = -0.5*(np.sum(Knn)*beta - np.trace(A_))
A = A_ + np.eye(num_inducing)
# factor A
LA = jitchol(A)
# back substitute to get b, P, v
tmp, _ = dtrtrs(L, Uy, lower=1)
b, _ = dtrtrs(LA, tmp*beta, lower=1)
tmp, _ = dtrtrs(LA, b, lower=1, trans=1)
v, _ = dtrtrs(L, tmp, lower=1, trans=1)
tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
P = tdot(tmp.T)
#compute log marginal
log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \
-np.sum(np.log(np.diag(LA)))*output_dim + \
0.5*num_data*output_dim*np.log(beta) + \
-0.5*beta*np.sum(np.square(Y)) + \
0.5*np.sum(np.square(b)) + \
trace_term
# Compute dL_dKmm
vvT_P = tdot(v.reshape(-1,1)) + P
LAL = Li.T.dot(A).dot(Li)
dL_dK = Kmmi - 0.5*(vvT_P + LAL)
# Compute dL_dU
vY = np.dot(v.reshape(-1,1),Y.T)
#dL_dU = vY - np.dot(vvT_P, U.T)
dL_dU = vY - np.dot(vvT_P - Kmmi, U.T)
dL_dU *= beta
#compute dL_dR
Uv = np.dot(U, v)
dL_dR = 0.5*(np.sum(U*np.dot(U,P), 1) - 1./beta + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1) + np.sum(np.square(Uv), 1) )*beta**2
dL_dR -=beta*trace_term/num_data
grad_dict = {'dL_dKmm': dL_dK, 'dL_dKdiag':np.zeros_like(Knn) + -0.5*beta, 'dL_dKnm':dL_dU.T}
#update gradients
kern.update_gradients_sparse(X=X, Z=Z, **grad_dict)
likelihood.update_gradients(dL_dR)
#construct a posterior object
post = Posterior(woodbury_inv=Kmmi-P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=L)
return post, log_marginal, grad_dict
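The `trace_term` above appears to be the variational correction -beta/2 * tr(Kff - Qff), where Qff = Kfu Kuu^-1 Kuf is the Nystroem approximation built from the inducing points; it penalises inducing-point placements that summarise the data badly. A small numpy sketch of that quantity (the RBF kernel here is illustrative):

import numpy as np

def rbf(A, B, lengthscale=1.0, variance=1.0):
    d2 = np.sum(A**2, 1)[:, None] + np.sum(B**2, 1)[None, :] - 2 * A.dot(B.T)
    return variance * np.exp(-0.5 * d2 / lengthscale**2)

X = np.random.uniform(-3, 3, (100, 1))    # data inputs
Z = np.random.uniform(-3, 3, (10, 1))     # inducing inputs

Knn_diag = np.full(X.shape[0], 1.0)       # RBF diagonal is just the variance
Kmm = rbf(Z, Z) + 1e-6 * np.eye(len(Z))   # jitter, as in the code above
Knm = rbf(X, Z)

# tr(Kff - Qff): how much signal the inducing points fail to capture
Qnn_diag = np.sum(Knm.dot(np.linalg.inv(Kmm)) * Knm, axis=1)
trace_gap = Knn_diag.sum() - Qnn_diag.sum()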
View file
@ -3,390 +3,91 @@ from scipy import stats
from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs
from likelihood import likelihood from likelihood import likelihood
class EP(likelihood): class EP(object):
def __init__(self,data,noise_model): def __init__(self, epsilon=1e-6, eta=1., delta=1.):
"""
Expectation Propagation
:param data: data to model
:type data: numpy array
:param noise_model: noise distribution
:type noise_model: A GPy noise model
"""
self.noise_model = noise_model
self.data = data
self.num_data, self.output_dim = self.data.shape
self.is_heteroscedastic = True
self.num_params = 0
#Initial values - Likelihood approximation parameters:
#p(y|f) = t(f|tau_tilde,v_tilde)
self.tau_tilde = np.zeros(self.num_data)
self.v_tilde = np.zeros(self.num_data)
#initial values for the GP variables
self.Y = np.zeros((self.num_data,1))
self.covariance_matrix = np.eye(self.num_data)
self.precision = np.ones(self.num_data)[:,None]
self.Z = 0
self.YYT = None
self.V = self.precision * self.Y
self.VVT_factor = self.V
self.trYYT = 0.
super(EP, self).__init__()
def restart(self):
self.tau_tilde = np.zeros(self.num_data)
self.v_tilde = np.zeros(self.num_data)
self.Y = np.zeros((self.num_data,1))
self.covariance_matrix = np.eye(self.num_data)
self.precision = np.ones(self.num_data)[:,None]
self.Z = 0
self.YYT = None
self.V = self.precision * self.Y
self.VVT_factor = self.V
self.trYYT = 0.
def predictive_values(self,mu,var,full_cov,**noise_args):
if full_cov:
raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
return self.noise_model.predictive_values(mu,var,**noise_args)
def log_predictive_density(self, y_test, mu_star, var_star):
"""
Calculation of the log predictive density
.. math:
p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
:param y_test: test observations (y_{*})
:type y_test: (Nx1) array
:param mu_star: predictive mean of gaussian p(f_{*}|mu_{*}, var_{*})
:type mu_star: (Nx1) array
:param var_star: predictive variance of gaussian p(f_{*}|mu_{*}, var_{*})
:type var_star: (Nx1) array
"""
return self.noise_model.log_predictive_density(y_test, mu_star, var_star)
def _get_params(self):
#return np.zeros(0)
return self.noise_model._get_params()
def _get_param_names(self):
#return []
return self.noise_model._get_param_names()
def _set_params(self,p):
#pass # TODO: the EP likelihood might want to take some parameters...
self.noise_model._set_params(p)
def _gradients(self,partial):
#return np.zeros(0) # TODO: the EP likelihood might want to take some parameters...
return self.noise_model._gradients(partial)
def _compute_GP_variables(self):
#Variables to be called from GP
mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model
sigma_sum = 1./self.tau_ + 1./self.tau_tilde
mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2
self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep
self.Z += 0.5*self.num_data*np.log(2*np.pi)
self.Y = mu_tilde[:,None]
self.YYT = np.dot(self.Y,self.Y.T)
self.covariance_matrix = np.diag(1./self.tau_tilde)
self.precision = self.tau_tilde[:,None]
self.V = self.precision * self.Y
self.VVT_factor = self.V
self.trYYT = np.trace(self.YYT)
def fit_full(self, K, epsilon=1e-3,power_ep=[1.,1.]):
""" """
The expectation-propagation algorithm. The expectation-propagation algorithm.
For nomenclature see Rasmussen & Williams 2006. For nomenclature see Rasmussen & Williams 2006.
:param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float) :param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
:type epsilon: float :type epsilon: float
:param power_ep: Power EP parameters :param eta: Power EP thing TODO: Ricardo: what, exactly?
:type power_ep: list of floats :type eta: float64
:param delta: Power EP thing TODO: Ricardo: what, exactly?
:type delta: float64
""" """
self.epsilon = epsilon self.epsilon, self.eta, self.delta = epsilon, eta, delta
self.eta, self.delta = power_ep self.reset()
def reset(self):
self.old_mutilde, self.old_vtilde = None, None
def inference(self, kern, X, likelihood, Y, Y_metadata=None):
K = kern.K(X)
mu, Sigma, mu_tilde, tau_tilde = self.expectation_propagation(K, Y, Y_metadata, likelihood)
def expectation_propagation(self, K, Y, Y_metadata, likelihood):
num_data, data_dim = Y.shape
assert data_dim == 1, "This EP method only works for 1D outputs"
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma) #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
mu = np.zeros(self.num_data) mu = np.zeros(num_data)
Sigma = K.copy() Sigma = K.copy()
"""
Initial values - Cavity distribution parameters:
q_(f|mu_,sigma2_) = Product{q_i(f|mu_i,sigma2_i)}
sigma_ = 1./tau_
mu_ = v_/tau_
"""
self.tau_ = np.empty(self.num_data,dtype=float)
self.v_ = np.empty(self.num_data,dtype=float)
#Initial values - Marginal moments #Initial values - Marginal moments
z = np.empty(self.num_data,dtype=float) Z_hat = np.empty(num_data,dtype=np.float64)
self.Z_hat = np.empty(self.num_data,dtype=float) mu_hat = np.empty(num_data,dtype=np.float64)
phi = np.empty(self.num_data,dtype=float) sigma2_hat = np.empty(num_data,dtype=np.float64)
mu_hat = np.empty(self.num_data,dtype=float)
sigma2_hat = np.empty(self.num_data,dtype=float) #initial values - Gaussian factors
if self.old_mutilde is None:
tau_tilde, mu_tilde, v_tilde = np.zeros((3, num_data))
else:
assert self.old_mutilde.size == num_data, "data size mismatch: did you change the data? try resetting!"
mu_tilde, v_tilde = self.old_mutilde, self.old_vtilde
tau_tilde = v_tilde/mu_tilde
#Approximation #Approximation
epsilon_np1 = self.epsilon + 1. epsilon_np1 = self.epsilon + 1.
epsilon_np2 = self.epsilon + 1. epsilon_np2 = self.epsilon + 1.
self.iterations = 0 iterations = 0; tau_tilde_old, v_tilde_old = tau_tilde.copy(), v_tilde.copy()  # init convergence monitors
self.np1 = [self.tau_tilde.copy()] while (epsilon_np1 > self.epsilon) or (epsilon_np2 > self.epsilon):
self.np2 = [self.v_tilde.copy()] update_order = np.random.permutation(num_data)
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
update_order = np.random.permutation(self.num_data)
for i in update_order: for i in update_order:
#Cavity distribution parameters #Cavity distribution parameters
self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i] tau_cav = 1./Sigma[i,i] - self.eta*tau_tilde[i]
self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i] v_cav = mu[i]/Sigma[i,i] - self.eta*v_tilde[i]
#Marginal moments #Marginal moments
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i]) Z_hat[i], mu_hat[i], sigma2_hat[i] = likelihood.moments_match(Y[i], tau_cav, v_cav, Y_metadata=(None if Y_metadata is None else Y_metadata[i]))
#Site parameters update #Site parameters update
Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i]) delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i]) delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
self.tau_tilde[i] += Delta_tau tau_tilde[i] += delta_tau
self.v_tilde[i] += Delta_v v_tilde[i] += delta_v
#Posterior distribution parameters update #Posterior distribution parameters update
DSYR(Sigma,Sigma[:,i].copy(), -float(Delta_tau/(1.+ Delta_tau*Sigma[i,i]))) DSYR(Sigma, Sigma[:,i].copy(), -delta_tau/(1.+ delta_tau*Sigma[i,i]))
mu = np.dot(Sigma,self.v_tilde) mu = np.dot(Sigma, v_tilde)
self.iterations += 1 iterations += 1
#Sigma recomptutation with Cholesky decompositon
Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*K #(re) compute Sigma and mu using full Cholesky decomposition
B = np.eye(self.num_data) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K tau_tilde_root = np.sqrt(tau_tilde)
Sroot_tilde_K = tau_tilde_root[:,None] * K
B = np.eye(num_data) + Sroot_tilde_K * tau_tilde_root[None,:]
L = jitchol(B) L = jitchol(B)
V,info = dtrtrs(L,Sroot_tilde_K,lower=1) V, _ = dtrtrs(L, Sroot_tilde_K, lower=1)
Sigma = K - np.dot(V.T,V) Sigma = K - np.dot(V.T,V)
mu = np.dot(Sigma,self.v_tilde) mu = np.dot(Sigma,v_tilde)
epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.num_data
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.num_data
self.np1.append(self.tau_tilde.copy())
self.np2.append(self.v_tilde.copy())
return self._compute_GP_variables() #monitor convergence
epsilon_np1 = np.mean(np.square(tau_tilde-tau_tilde_old))
epsilon_np2 = np.mean(np.square(v_tilde-v_tilde_old))
tau_tilde_old = tau_tilde.copy()
v_tilde_old = v_tilde.copy()
def fit_DTC(self, Kmm, Kmn, epsilon=1e-3,power_ep=[1.,1.]): return mu, Sigma, mu_tilde, tau_tilde
"""
The expectation-propagation algorithm with sparse pseudo-input.
For nomenclature see ... 2013.
:param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
:type epsilon: float
:param power_ep: Power EP parameters
:type power_ep: list of floats
"""
self.epsilon = epsilon
self.eta, self.delta = power_ep
num_inducing = Kmm.shape[0]
#TODO: this doesn't work with uncertain inputs!
"""
Prior approximation parameters:
q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0)
Sigma0 = Qnn = Knm*Kmmi*Kmn
"""
KmnKnm = np.dot(Kmn,Kmn.T)
Lm = jitchol(Kmm)
Lmi = chol_inv(Lm)
Kmmi = np.dot(Lmi.T,Lmi)
KmmiKmn = np.dot(Kmmi,Kmn)
Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
LLT0 = Kmm.copy()
#Kmmi, Lm, Lmi, Kmm_logdet = pdinv(Kmm)
#KmnKnm = np.dot(Kmn, Kmn.T)
#KmmiKmn = np.dot(Kmmi,Kmn)
#Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
#LLT0 = Kmm.copy()
"""
Posterior approximation: q(f|y) = N(f| mu, Sigma)
Sigma = Diag + P*R.T*R*P.T + K
mu = w + P*Gamma
"""
mu = np.zeros(self.num_data)
LLT = Kmm.copy()
Sigma_diag = Qnn_diag.copy()
"""
Initial values - Cavity distribution parameters:
q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)}
sigma_ = 1./tau_
mu_ = v_/tau_
"""
self.tau_ = np.empty(self.num_data,dtype=float)
self.v_ = np.empty(self.num_data,dtype=float)
#Initial values - Marginal moments
z = np.empty(self.num_data,dtype=float)
self.Z_hat = np.empty(self.num_data,dtype=float)
phi = np.empty(self.num_data,dtype=float)
mu_hat = np.empty(self.num_data,dtype=float)
sigma2_hat = np.empty(self.num_data,dtype=float)
#Approximation
epsilon_np1 = 1
epsilon_np2 = 1
self.iterations = 0
np1 = [self.tau_tilde.copy()]
np2 = [self.v_tilde.copy()]
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
update_order = np.random.permutation(self.num_data)
for i in update_order:
#Cavity distribution parameters
self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
#Marginal moments
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
#Site parameters update
Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
self.tau_tilde[i] += Delta_tau
self.v_tilde[i] += Delta_v
#Posterior distribution parameters update
DSYR(LLT,Kmn[:,i].copy(),Delta_tau) #LLT = LLT + np.outer(Kmn[:,i],Kmn[:,i])*Delta_tau
L = jitchol(LLT)
#cholUpdate(L,Kmn[:,i]*np.sqrt(Delta_tau))
V,info = dtrtrs(L,Kmn,lower=1)
Sigma_diag = np.sum(V*V,-2)
si = np.sum(V.T*V[:,i],-1)
mu += (Delta_v-Delta_tau*mu[i])*si
self.iterations += 1
#Sigma recomputation with Cholesky decompositon
LLT = LLT0 + np.dot(Kmn*self.tau_tilde[None,:],Kmn.T)
L = jitchol(LLT)
V,info = dtrtrs(L,Kmn,lower=1)
V2,info = dtrtrs(L.T,V,lower=0)
Sigma_diag = np.sum(V*V,-2)
Knmv_tilde = np.dot(Kmn,self.v_tilde)
mu = np.dot(V2.T,Knmv_tilde)
epsilon_np1 = sum((self.tau_tilde-np1[-1])**2)/self.num_data
epsilon_np2 = sum((self.v_tilde-np2[-1])**2)/self.num_data
np1.append(self.tau_tilde.copy())
np2.append(self.v_tilde.copy())
self._compute_GP_variables()
def fit_FITC(self, Kmm, Kmn, Knn_diag, epsilon=1e-3,power_ep=[1.,1.]):
"""
The expectation-propagation algorithm with sparse pseudo-input.
For nomenclature see Naish-Guzman and Holden, 2008.
:param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
:type epsilon: float
:param power_ep: Power EP parameters
:type power_ep: list of floats
"""
self.epsilon = epsilon
self.eta, self.delta = power_ep
num_inducing = Kmm.shape[0]
"""
Prior approximation parameters:
q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0)
Sigma0 = diag(Knn-Qnn) + Qnn, Qnn = Knm*Kmmi*Kmn
"""
Lm = jitchol(Kmm)
Lmi = chol_inv(Lm)
Kmmi = np.dot(Lmi.T,Lmi)
P0 = Kmn.T
KmnKnm = np.dot(P0.T, P0)
KmmiKmn = np.dot(Kmmi,P0.T)
Qnn_diag = np.sum(P0.T*KmmiKmn,-2)
Diag0 = Knn_diag - Qnn_diag
R0 = jitchol(Kmmi).T
"""
Posterior approximation: q(f|y) = N(f| mu, Sigma)
Sigma = Diag + P*R.T*R*P.T + K
mu = w + P*Gamma
"""
self.w = np.zeros(self.num_data)
self.Gamma = np.zeros(num_inducing)
mu = np.zeros(self.num_data)
P = P0.copy()
R = R0.copy()
Diag = Diag0.copy()
Sigma_diag = Knn_diag
RPT0 = np.dot(R0,P0.T)
"""
Initial values - Cavity distribution parameters:
q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)}
sigma_ = 1./tau_
mu_ = v_/tau_
"""
self.tau_ = np.empty(self.num_data,dtype=float)
self.v_ = np.empty(self.num_data,dtype=float)
#Initial values - Marginal moments
z = np.empty(self.num_data,dtype=float)
self.Z_hat = np.empty(self.num_data,dtype=float)
phi = np.empty(self.num_data,dtype=float)
mu_hat = np.empty(self.num_data,dtype=float)
sigma2_hat = np.empty(self.num_data,dtype=float)
#Approximation
epsilon_np1 = 1
epsilon_np2 = 1
self.iterations = 0
self.np1 = [self.tau_tilde.copy()]
self.np2 = [self.v_tilde.copy()]
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
update_order = np.random.permutation(self.num_data)
for i in update_order:
#Cavity distribution parameters
self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
#Marginal moments
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.noise_model.moments_match(self.data[i],self.tau_[i],self.v_[i])
#Site parameters update
Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
self.tau_tilde[i] += Delta_tau
self.v_tilde[i] += Delta_v
#Posterior distribution parameters update
dtd1 = Delta_tau*Diag[i] + 1.
dii = Diag[i]
Diag[i] = dii - (Delta_tau * dii**2.)/dtd1
pi_ = P[i,:].reshape(1,num_inducing)
P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_
Rp_i = np.dot(R,pi_.T)
RTR = np.dot(R.T,np.dot(np.eye(num_inducing) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R))
R = jitchol(RTR).T
self.w[i] += (Delta_v - Delta_tau*self.w[i])*dii/dtd1
self.Gamma += (Delta_v - Delta_tau*mu[i])*np.dot(RTR,P[i,:].T)
RPT = np.dot(R,P.T)
Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
mu = self.w + np.dot(P,self.Gamma)
self.iterations += 1
#Sigma recomptutation with Cholesky decompositon
Iplus_Dprod_i = 1./(1.+ Diag0 * self.tau_tilde)
Diag = Diag0 * Iplus_Dprod_i
P = Iplus_Dprod_i[:,None] * P0
safe_diag = np.where(Diag0 < self.tau_tilde, self.tau_tilde/(1.+Diag0*self.tau_tilde), (1. - Iplus_Dprod_i)/Diag0)
L = jitchol(np.eye(num_inducing) + np.dot(RPT0,safe_diag[:,None]*RPT0.T))
R,info = dtrtrs(L,R0,lower=1)
RPT = np.dot(R,P.T)
Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
self.w = Diag * self.v_tilde
self.Gamma = np.dot(R.T, np.dot(RPT,self.v_tilde))
mu = self.w + np.dot(P,self.Gamma)
epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.num_data
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.num_data
self.np1.append(self.tau_tilde.copy())
self.np2.append(self.v_tilde.copy())
return self._compute_GP_variables()
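The core of the EP loop above is the site update: form the cavity by removing a fraction of one site from the current marginal, moment-match the tilted distribution (via `likelihood.moments_match` in the new code), and fold the change back into the site and the joint posterior. A self-contained sketch of one such update for a Gaussian likelihood, where the moments are available in closed form (illustrative only, not the class's API):

import numpy as np

def ep_site_update(i, mu, Sigma, tau_tilde, v_tilde, y, noise_var, eta=1.0, delta=1.0):
    """One power-EP site update for a Gaussian likelihood."""
    # cavity: remove a fraction eta of site i from the current marginal
    tau_cav = 1. / Sigma[i, i] - eta * tau_tilde[i]
    v_cav = mu[i] / Sigma[i, i] - eta * v_tilde[i]
    m_cav, s_cav = v_cav / tau_cav, 1. / tau_cav

    # moment-match the tilted distribution N(y|f, noise_var) * N(f|m_cav, s_cav)
    s_hat = 1. / (1. / s_cav + 1. / noise_var)
    m_hat = s_hat * (m_cav / s_cav + y / noise_var)

    # fold the change back into the natural parameters of site i
    delta_tau = delta / eta * (1. / s_hat - 1. / Sigma[i, i])
    delta_v = delta / eta * (m_hat / s_hat - mu[i] / Sigma[i, i])
    tau_tilde[i] += delta_tau
    v_tilde[i] += delta_v

    # rank-one update of the joint posterior (Sigma, mu)
    si = Sigma[:, i].copy()
    Sigma -= (delta_tau / (1. + delta_tau * Sigma[i, i])) * np.outer(si, si)
    mu[:] = Sigma.dot(v_tilde)
    return tau_tilde, v_tilde, mu, Sigma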
View file
@ -2,7 +2,7 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np import numpy as np
from ...util.linalg import pdinv, dpotrs, tdot, dtrtrs, dpotri, symmetrify, jitchol, dtrtri from ...util.linalg import pdinv, dpotrs, dpotri, symmetrify, jitchol
class Posterior(object): class Posterior(object):
""" """
@ -81,13 +81,17 @@ class Posterior(object):
def covariance(self): def covariance(self):
if self._covariance is None: if self._covariance is None:
#LiK, _ = dtrtrs(self.woodbury_chol, self._K, lower=1) #LiK, _ = dtrtrs(self.woodbury_chol, self._K, lower=1)
self._covariance = self._K - self._K.dot(self.woodbury_inv).dot(self._K) self._covariance = np.tensordot(np.dot(np.atleast_3d(self.woodbury_inv).T, self._K), self._K, [1,0]).T
return self._covariance #self._covariance = self._K - self._K.dot(self.woodbury_inv).dot(self._K)
return self._covariance.squeeze()
@property @property
def precision(self): def precision(self):
if self._precision is None: if self._precision is None:
self._precision, _, _, _ = pdinv(self.covariance) cov = np.atleast_3d(self.covariance)
self._precision = np.zeros(cov.shape) # if one covariance per dimension
for p in xrange(cov.shape[-1]):
self._precision[:,:,p] = pdinv(cov[:,:,p])[0]
return self._precision return self._precision
@property @property
@ -95,7 +99,10 @@ class Posterior(object):
if self._woodbury_chol is None: if self._woodbury_chol is None:
#compute woodbury chol from #compute woodbury chol from
if self._woodbury_inv is not None: if self._woodbury_inv is not None:
_, _, self._woodbury_chol, _ = pdinv(self._woodbury_inv) winv = np.atleast_3d(self._woodbury_inv)
self._woodbury_chol = np.zeros(winv.shape)
for p in xrange(winv.shape[-1]):
self._woodbury_chol[:,:,p] = pdinv(winv[:,:,p])[2]
#Li = jitchol(self._woodbury_inv) #Li = jitchol(self._woodbury_inv)
#self._woodbury_chol, _ = dtrtri(Li) #self._woodbury_chol, _ = dtrtri(Li)
#W, _, _, _, = pdinv(self._woodbury_inv) #W, _, _, _, = pdinv(self._woodbury_inv)
@ -129,7 +136,7 @@ class Posterior(object):
@property @property
def K_chol(self): def K_chol(self):
if self._K_chol is None: if self._K_chol is None:
self._K_chol = dportf(self._K) self._K_chol = jitchol(self._K)
return self._K_chol return self._K_chol
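The params-branch Posterior allows one Woodbury inverse per output dimension (a 3-D array with the output index last), which is why `precision` and `woodbury_chol` above loop over the trailing axis. The single-output formula K - K*W*K shown in the commented-out covariance line generalises per output dimension; a small numpy sketch of that convention (shapes are hypothetical):

import numpy as np

N, D = 6, 3                      # data points, output dimensions
K = np.eye(N)                    # stand-in prior covariance
W = np.dstack([np.eye(N) * (d + 1) for d in range(D)])   # (N, N, D): one Woodbury inverse per output

# posterior covariance per output dimension: K - K W_d K
cov = np.empty((N, N, D))
for d in range(D):
    cov[:, :, d] = K - K.dot(W[:, :, d]).dot(K)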
View file
@ -43,9 +43,20 @@ class VarDTC(object):
return Y * prec # TODO: cache this, and make it efficient return Y * prec # TODO: cache this, and make it efficient
def inference(self, kern, X, X_variance, Z, likelihood, Y): def inference(self, kern, X, X_variance, Z, likelihood, Y):
"""Inference for normal sparseGP"""
uncertain_inputs = False
psi0, psi1, psi2 = _compute_psi(kern, X, X_variance, Z, uncertain_inputs)
return self._inference(kern, psi0, psi1, psi2, Z, likelihood, Y, uncertain_inputs)
def inference_latent(self, kern, posterior_variational, Z, likelihood, Y):
"""Inference for GPLVM with uncertain inputs"""
uncertain_inputs = True
psi0, psi1, psi2 = _compute_psi_latent(kern, posterior_variational, Z)
return self._inference(kern, psi0, psi1, psi2, Z, likelihood, Y, uncertain_inputs)
def _inference(self, kern, psi0, psi1, psi2, Z, likelihood, Y, uncertain_inputs):
#see whether we're using variational uncertain inputs #see whether we're using variational uncertain inputs
uncertain_inputs = not (X_variance is None)
_, output_dim = Y.shape _, output_dim = Y.shape
@ -60,20 +71,87 @@ class VarDTC(object):
trYYT = self.get_trYYT(Y) trYYT = self.get_trYYT(Y)
# do the inference: # do the inference:
dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Cpsi1Vf, \ het_noise = beta.size > 1
psi1, Lm, LB, log_marginal, Kmm, partial_for_likelihood = _do_inference_on( num_inducing = Z.shape[0]
kern, X, X_variance, Z, likelihood, num_data = Y.shape[0]
uncertain_inputs, output_dim, # kernel computations, using BGPLVM notation
beta, VVT_factor, trYYT) Kmm = kern.K(Z)
likelihood.update_gradients(partial_for_likelihood) Lm = jitchol(Kmm)
# The rather complex computations of A
if uncertain_inputs:
if het_noise:
psi2_beta = psi2 * (beta.flatten().reshape(num_data, 1, 1)).sum(0)
else:
psi2_beta = psi2.sum(0) * beta
#if 0:
# evals, evecs = linalg.eigh(psi2_beta)
# clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
# if not np.array_equal(evals, clipped_evals):
# pass # print evals
# tmp = evecs * np.sqrt(clipped_evals)
# tmp = tmp.T
# no backsubstitution because of bound explosion on tr(A) if not...
LmInv = dtrtri(Lm)
A = LmInv.dot(psi2_beta.dot(LmInv.T))
else:
if het_noise:
tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
else:
tmp = psi1 * (np.sqrt(beta))
tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
A = tdot(tmp) #print A.sum()
# factor B
B = np.eye(num_inducing) + A
LB = jitchol(B)
psi1Vf = np.dot(psi1.T, VVT_factor)
# back substitute C into psi1Vf
tmp, _ = dtrtrs(Lm, psi1Vf, lower=1, trans=0)
_LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
# data fit and derivative of L w.r.t. Kmm
delit = tdot(_LBi_Lmi_psi1Vf)
data_fit = np.trace(delit)
DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
delit = -0.5 * DBi_plus_BiPBi
delit += -0.5 * B * output_dim
delit += output_dim * np.eye(num_inducing)
# Compute dL_dKmm
dL_dKmm = backsub_both_sides(Lm, delit)
# derivatives of L w.r.t. psi
dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm,
VVT_factor, Cpsi1Vf, DBi_plus_BiPBi,
psi1, het_noise, uncertain_inputs)
# log marginal likelihood
log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise,
psi0, A, LB, trYYT, data_fit)
#put the gradients in the right places
partial_for_likelihood = _compute_partial_for_likelihood(likelihood,
het_noise, uncertain_inputs, LB,
_LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A,
psi0, psi1, beta,
data_fit, num_data, output_dim, trYYT)
#likelihood.update_gradients(partial_for_likelihood)
if uncertain_inputs: if uncertain_inputs:
grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, 'dL_dpsi2':dL_dpsi2} grad_dict = {'dL_dKmm': dL_dKmm,
kern.update_gradients_variational(mu=X, S=X_variance, Z=Z, **grad_dict) 'dL_dpsi0':dL_dpsi0,
'dL_dpsi1':dL_dpsi1,
'dL_dpsi2':dL_dpsi2,
'partial_for_likelihood':partial_for_likelihood}
else: else:
grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag':dL_dpsi0, 'dL_dKnm':dL_dpsi1} grad_dict = {'dL_dKmm': dL_dKmm,
kern.update_gradients_sparse(X=X, Z=Z, **grad_dict) 'dL_dKdiag':dL_dpsi0,
'dL_dKnm':dL_dpsi1,
'partial_for_likelihood':partial_for_likelihood}
#get sufficient things for posterior prediction #get sufficient things for posterior prediction
#TODO: do we really want to do this in the loop? #TODO: do we really want to do this in the loop?
@ -125,21 +203,33 @@ class VarDTCMissingData(object):
return [Y], [(Y**2).sum()] return [Y], [(Y**2).sum()]
def inference(self, kern, X, X_variance, Z, likelihood, Y): def inference(self, kern, X, X_variance, Z, likelihood, Y):
"""Inference for normal sparseGP"""
uncertain_inputs = False
psi0, psi1, psi2 = _compute_psi(kern, X, X_variance, Z, uncertain_inputs)
return self._inference(kern, psi0, psi1, psi2, Z, likelihood, Y, uncertain_inputs)
def inference_latent(self, kern, posterior_variational, Z, likelihood, Y):
"""Inference for GPLVM with uncertain inputs"""
uncertain_inputs = True
psi0, psi1, psi2 = _compute_psi_latent(kern, posterior_variational, Z)
return self._inference(kern, psi0, psi1, psi2, Z, likelihood, Y, uncertain_inputs)
def _inference(self, kern, psi0_all, psi1_all, psi2_all, Z, likelihood, Y, uncertain_inputs):
Ys, traces = self._Y(Y) Ys, traces = self._Y(Y)
beta_all = 1./likelihood.variance beta_all = 1./likelihood.variance
uncertain_inputs = not (X_variance is None)
het_noise = beta_all.size != 1 het_noise = beta_all.size != 1
import itertools import itertools
num_inducing = Z.shape[0] num_inducing = Z.shape[0]
dL_dpsi0_all = np.zeros(X.shape[0]) dL_dpsi0_all = np.zeros(Y.shape[0])
dL_dpsi1_all = np.zeros((X.shape[0], num_inducing)) dL_dpsi1_all = np.zeros((Y.shape[0], num_inducing))
if uncertain_inputs: if uncertain_inputs:
dL_dpsi2_all = np.zeros((X.shape[0], num_inducing, num_inducing)) dL_dpsi2_all = np.zeros((Y.shape[0], num_inducing, num_inducing))
partial_for_likelihood = 0 partial_for_likelihood = 0
LB_all = Cpsi1Vf_all = 0 woodbury_vector = np.zeros((num_inducing, Y.shape[1]))
woodbury_inv_all = np.zeros((num_inducing, num_inducing, Y.shape[1]))
dL_dKmm = 0 dL_dKmm = 0
log_marginal = 0 log_marginal = 0
@ -148,11 +238,10 @@ class VarDTCMissingData(object):
Lm = jitchol(Kmm) Lm = jitchol(Kmm)
if uncertain_inputs: LmInv = dtrtri(Lm) if uncertain_inputs: LmInv = dtrtri(Lm)
# kernel computations, using BGPLVM notation
psi0_all, psi1_all, psi2_all = _compute_psi(kern, X, X_variance, Z, uncertain_inputs)
VVT_factor_all = np.empty(Y.shape) VVT_factor_all = np.empty(Y.shape)
full_VVT_factor = VVT_factor_all.shape[1] == Y.shape[1] full_VVT_factor = VVT_factor_all.shape[1] == Y.shape[1]
if not full_VVT_factor:
psi1V = np.dot(Y.T*beta_all, psi1_all).T
for y, trYYT, [v, ind] in itertools.izip(Ys, traces, self._subarray_indices): for y, trYYT, [v, ind] in itertools.izip(Ys, traces, self._subarray_indices):
if het_noise: beta = beta_all[ind] if het_noise: beta = beta_all[ind]
@ -183,10 +272,10 @@ class VarDTCMissingData(object):
LB = jitchol(B) LB = jitchol(B)
psi1Vf = psi1.T.dot(VVT_factor) psi1Vf = psi1.T.dot(VVT_factor)
_LBi_Lmi_psi1Vf, Cpsi1Vf = _compute_psi1Vf(Lm, LB, psi1Vf) tmp, _ = dtrtrs(Lm, psi1Vf, lower=1, trans=0)
_LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
if full_VVT_factor: Cpsi1Vf_all += Cpsi1Vf tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
LB_all += LB Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
# data fit and derivative of L w.r.t. Kmm # data fit and derivative of L w.r.t. Kmm
delit = tdot(_LBi_Lmi_psi1Vf) delit = tdot(_LBi_Lmi_psi1Vf)
@ -219,92 +308,67 @@ class VarDTCMissingData(object):
psi0, psi1, beta, psi0, psi1, beta,
data_fit, num_data, output_dim, trYYT) data_fit, num_data, output_dim, trYYT)
# gradients: if full_VVT_factor: woodbury_vector[:, ind] = Cpsi1Vf
likelihood.update_gradients(partial_for_likelihood)
if uncertain_inputs:
grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0_all, 'dL_dpsi1':dL_dpsi1_all, 'dL_dpsi2':dL_dpsi2_all}
kern.update_gradients_variational(mu=X, S=X_variance, Z=Z, **grad_dict)
else: else:
grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dKdiag':dL_dpsi0_all, 'dL_dKnm':dL_dpsi1_all} print 'foobar'
kern.update_gradients_sparse(X=X, Z=Z, **grad_dict) tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
tmp, _ = dpotrs(LB, tmp, lower=1)
woodbury_vector[:, ind] = dtrtrs(Lm, tmp, lower=1, trans=1)[0]
#import ipdb;ipdb.set_trace()
Bi, _ = dpotri(LB, lower=1)
symmetrify(Bi)
Bi = -dpotri(LB, lower=1)[0]
from ...util import diag
diag.add(Bi, 1)
woodbury_inv_all[:, :, ind] = backsub_both_sides(Lm, Bi)[:,:,None]
# gradients:
if uncertain_inputs:
grad_dict = {'dL_dKmm': dL_dKmm,
'dL_dpsi0':dL_dpsi0_all,
'dL_dpsi1':dL_dpsi1_all,
'dL_dpsi2':dL_dpsi2_all,
'partial_for_likelihood':partial_for_likelihood}
else:
grad_dict = {'dL_dKmm': dL_dKmm,
'dL_dKdiag':dL_dpsi0_all,
'dL_dKnm':dL_dpsi1_all,
'partial_for_likelihood':partial_for_likelihood}
#get sufficient things for posterior prediction #get sufficient things for posterior prediction
#TODO: do we really want to do this in the loop? #TODO: do we really want to do this in the loop?
if full_VVT_factor: #if not full_VVT_factor:
woodbury_vector = Cpsi1Vf_all # == Cpsi1V # print 'foobar'
else: # psi1V = np.dot(Y.T*beta_all, psi1_all).T
print 'foobar' # tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
psi1V = np.dot(Y.T*beta_all, psi1_all).T # tmp, _ = dpotrs(LB_all, tmp, lower=1)
tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0) # woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
tmp, _ = dpotrs(LB_all, tmp, lower=1) #import ipdb;ipdb.set_trace()
woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1) #Bi, _ = dpotri(LB_all, lower=1)
#symmetrify(Bi)
#Bi = -dpotri(LB_all, lower=1)[0]
#from ...util import diag
#diag.add(Bi, 1)
Bi, _ = dpotri(LB_all, lower=1) #woodbury_inv = backsub_both_sides(Lm, Bi)
symmetrify(Bi)
Bi = -dpotri(LB_all, lower=1)[0]
from ...util import diag
diag.add(Bi, 1)
woodbury_inv = backsub_both_sides(Lm, Bi) post = Posterior(woodbury_inv=woodbury_inv_all, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
return post, log_marginal, grad_dict return post, log_marginal, grad_dict
def _compute_A(num_data, uncertain_inputs, beta, het_noise, psi1, psi2, Lm): def _compute_psi(kern, X, X_variance, Z):
# The rather complex computations of A
if uncertain_inputs:
if het_noise:
psi2_beta = psi2 * (beta.flatten().reshape(num_data, 1, 1)).sum(0)
else:
psi2_beta = psi2.sum(0) * beta
#if 0:
# evals, evecs = linalg.eigh(psi2_beta)
# clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
# if not np.array_equal(evals, clipped_evals):
# pass # print evals
# tmp = evecs * np.sqrt(clipped_evals)
# tmp = tmp.T
# no backsubstitution because of bound explosion on tr(A) if not...
LmInv = dtrtri(Lm)
A = LmInv.dot(psi2_beta.dot(LmInv.T))
else:
if het_noise:
tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
else:
tmp = psi1 * (np.sqrt(beta))
tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
A = tdot(tmp) #print A.sum()
return A
def _compute_psi(kern, X, X_variance, Z, uncertain_inputs):
if uncertain_inputs:
psi0 = kern.psi0(Z, X, X_variance)
psi1 = kern.psi1(Z, X, X_variance)
psi2 = kern.psi2(Z, X, X_variance)
else:
psi0 = kern.Kdiag(X)
psi1 = kern.K(X, Z)
psi2 = None
return psi0, psi1, psi2
def _compute_Kmm(kern, X, X_variance, Z, uncertain_inputs):
Kmm = kern.K(Z)
psi0, psi1, psi2 = _compute_psi(kern, X, X_variance, Z, uncertain_inputs)
return Kmm, psi0, psi1, psi2
def _compute_psi_latent(kern, posterior_variational, Z):
psi0 = kern.psi0(Z, posterior_variational)
psi1 = kern.psi1(Z, posterior_variational)
psi2 = kern.psi2(Z, posterior_variational)
return psi0, psi1, psi2
def _compute_dL_dKmm(num_inducing, output_dim, Lm, B, LB, _LBi_Lmi_psi1Vf):
# Compute dL_dKmm
delit = tdot(_LBi_Lmi_psi1Vf)
data_fit = np.trace(delit)
DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
delit = -0.5 * DBi_plus_BiPBi
delit += -0.5 * B * output_dim
delit += output_dim * np.eye(num_inducing)
dL_dKmm = backsub_both_sides(Lm, delit)
return DBi_plus_BiPBi, data_fit, dL_dKmm
def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, psi1, het_noise, uncertain_inputs):
dL_dpsi0 = -0.5 * output_dim * (beta * np.ones([num_data, 1])).flatten()
@ -329,15 +393,6 @@ def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, C
return dL_dpsi0, dL_dpsi1, dL_dpsi2
def _compute_psi1Vf(Lm, LB, psi1Vf):
# back substitute C into psi1Vf
tmp, _ = dtrtrs(Lm, psi1Vf, lower=1, trans=0)
_LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
return _LBi_Lmi_psi1Vf, Cpsi1Vf
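# Equivalently, Cpsi1Vf = (Lm B Lm^T)^{-1} psi1^T V; in the certain-input case this
# is (Kmm + psi1^T diag(beta) psi1)^{-1} psi1Vf, obtained here with four triangular
# solves rather than an explicit inverse.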
def _compute_partial_for_likelihood(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, psi0, psi1, beta, data_fit, num_data, output_dim, trYYT):
# the partial derivative vector for the likelihood
if likelihood.size == 0:
@ -379,35 +434,3 @@ def _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het
lik_4 = 0.5 * data_fit
log_marginal = lik_1 + lik_2 + lik_3 + lik_4
return log_marginal
def _do_inference_on(kern, X, X_variance, Z, likelihood, uncertain_inputs, output_dim, beta, VVT_factor, trYYT):
het_noise = beta.size > 1 # more than one noise precision means heteroscedastic noise
num_inducing = Z.shape[0]
num_data = X.shape[0]
# kernel computations, using BGPLVM notation
Kmm, psi0, psi1, psi2 = _compute_Kmm(kern, X, X_variance, Z, uncertain_inputs)
#factor Kmm # TODO: cache?
Lm = jitchol(Kmm)
A = _compute_A(num_data, uncertain_inputs, beta, het_noise, psi1, psi2, Lm)
# factor B
B = np.eye(num_inducing) + A
LB = jitchol(B)
psi1Vf = np.dot(psi1.T, VVT_factor)
_LBi_Lmi_psi1Vf, Cpsi1Vf = _compute_psi1Vf(Lm, LB, psi1Vf)
# data fit and derivative of L w.r.t. Kmm
DBi_plus_BiPBi, data_fit, dL_dKmm = _compute_dL_dKmm(num_inducing, output_dim,
Lm, B, LB, _LBi_Lmi_psi1Vf)
# derivatives of L w.r.t. psi
dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm,
VVT_factor, Cpsi1Vf, DBi_plus_BiPBi,
psi1, het_noise, uncertain_inputs)
# log marginal likelihood
log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise,
psi0, A, LB, trYYT, data_fit)
#put the gradients in the right places
partial_for_likelihood = _compute_partial_for_likelihood(likelihood,
het_noise, uncertain_inputs, LB,
_LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A,
psi0, psi1, beta,
data_fit, num_data, output_dim, trYYT)
return dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Cpsi1Vf, psi1, Lm, LB, log_marginal, Kmm, partial_for_likelihood

View file

@ -1,9 +1,34 @@
# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from constructors import *
try:
from constructors import rbf_sympy, sympykern # these depend on sympy
except:
pass
from kern import *
from _src.rbf import RBF
from _src.white import White
from _src.kern import Kern
from _src.linear import Linear
from _src.bias import Bias
from _src.brownian import Brownian
from _src.stationary import Exponential, Matern32, Matern52, ExpQuad
#import coregionalize
#import exponential
#import eq_ode1
#import finite_dimensional
#import fixed
#import gibbs
#import hetero
#import hierarchical
#import independent_outputs
#import linear
#import Matern32
#import Matern52
#import mlp
#import ODE_1
#import periodic_exponential
#import periodic_Matern32
#import periodic_Matern52
#import poly
#import prod_orthogonal
#import prod
#import rational_quadratic
#import rbfcos
#import rbf
#import rbf_inv
#import spline
#import symmetric
#import white
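# A minimal sketch of how the re-exported kernels above might be used
# (illustration only; assumes RBF and Matern32 are importable from GPy.kern
# after this change):
#   from GPy.kern import RBF, Matern32
#   k = RBF(1) + Matern32(1)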

View file

215
GPy/kern/_src/add.py Normal file
View file

@ -0,0 +1,215 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import sys
import numpy as np
import itertools
from linear import Linear
from ...core.parameterization import Parameterized
from ...core.parameterization.param import Param
from kern import Kern
class Add(Kern):
def __init__(self, subkerns, tensor):
assert all([isinstance(k, Kern) for k in subkerns])
if tensor:
input_dim = sum([k.input_dim for k in subkerns])
self.input_slices = []
n = 0
for k in subkerns:
self.input_slices.append(slice(n, n+k.input_dim))
n += k.input_dim
else:
assert all([k.input_dim == subkerns[0].input_dim for k in subkerns])
input_dim = subkerns[0].input_dim
self.input_slices = [slice(None) for k in subkerns]
super(Add, self).__init__(input_dim, 'add')
self.add_parameters(*subkerns)
def K(self, X, X2=None):
"""
Compute the kernel function.
:param X: the first set of inputs to the kernel
:param X2: (optional) the second set of arguments to the kernel. If X2
is None, this is passed through to the 'part' object, which
handles this as X2 == X.
"""
assert X.shape[1] == self.input_dim
if X2 is None:
return sum([p.K(X[:, i_s], None) for p, i_s in zip(self._parameters_, self.input_slices)])
else:
return sum([p.K(X[:, i_s], X2[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)])
def update_gradients_full(self, dL_dK, X):
[p.update_gradients_full(dL_dK, X[:,i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
[p.update_gradients_sparse(dL_dKmm, dL_dKnm, dL_dKdiag, X[:,i_s], Z[:,i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
def gradients_X(self, dL_dK, X, X2=None):
"""Compute the gradient of the objective function with respect to X.
:param dL_dK: An array of gradients of the objective function with respect to the covariance function.
:type dL_dK: np.ndarray (num_samples x num_inducing)
:param X: Observed data inputs
:type X: np.ndarray (num_samples x input_dim)
:param X2: Observed data inputs (optional, defaults to X)
:type X2: np.ndarray (num_inducing x input_dim)"""
target = np.zeros_like(X)
if X2 is None:
[np.add(target[:,i_s], p.gradients_X(dL_dK, X[:, i_s], None), target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
else:
[np.add(target[:,i_s], p.gradients_X(dL_dK, X[:, i_s], X2[:,i_s]), target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
return target
def Kdiag(self, X):
assert X.shape[1] == self.input_dim
return sum([p.Kdiag(X[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)])
def psi0(self, Z, mu, S):
return np.sum([p.psi0(Z[:, i_s], mu[:, i_s], S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)],0)
def psi1(self, Z, mu, S):
return np.sum([p.psi1(Z[:, i_s], mu[:, i_s], S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)], 0)
def psi2(self, Z, mu, S):
psi2 = np.sum([p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)], 0)
# compute the "cross" terms
from white import White
from rbf import RBF
#from rbf_inv import RBFInv
from bias import Bias
from linear import Linear
#from fixed import Fixed
for (p1, i1), (p2, i2) in itertools.combinations(itertools.izip(self._parameters_, self.input_slices), 2):
# white doesn't combine with anything
if isinstance(p1, White) or isinstance(p2, White):
pass
# rbf X bias
#elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)):
elif isinstance(p1, Bias) and isinstance(p2, (RBF, Linear)):
tmp = p2.psi1(Z[:,i2], mu[:,i2], S[:,i2])
psi2 += p1.variance * (tmp[:, :, None] + tmp[:, None, :])
#elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
elif isinstance(p2, Bias) and isinstance(p1, (RBF, Linear)):
tmp = p1.psi1(Z[:,i1], mu[:,i1], S[:,i1])
psi2 += p2.variance * (tmp[:, :, None] + tmp[:, None, :])
else:
raise NotImplementedError, "psi2 cannot be computed for this kernel"
return psi2
def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
from white import White
from rbf import RBF
#from rbf_inv import RBFInv
from bias import Bias
from linear import Linear
#from fixed import Fixed
for p1, is1 in zip(self._parameters_, self.input_slices):
#compute the effective dL_dpsi1. Extra terms appear because of the cross terms in psi2!
eff_dL_dpsi1 = dL_dpsi1.copy()
for p2, is2 in zip(self._parameters_, self.input_slices):
if p2 is p1:
continue
if isinstance(p2, White):
continue
elif isinstance(p2, Bias):
eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
else:
eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z[:,is2], mu[:,is2], S[:,is2]) * 2.
p1.update_gradients_variational(dL_dKmm, dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], S[:,is1], Z[:,is1])
def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
from white import White
from rbf import RBF
#from rbf_inv import RBFInv
from bias import Bias
from linear import Linear
#from fixed import Fixed
target = np.zeros(Z.shape)
for p1, is1 in zip(self._parameters_, self.input_slices):
#compute the effective dL_dpsi1. Extra terms appear because of the cross terms in psi2!
eff_dL_dpsi1 = dL_dpsi1.copy()
for p2, is2 in zip(self._parameters_, self.input_slices):
if p2 is p1:
continue
if isinstance(p2, White):
continue
elif isinstance(p2, Bias):
eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
else:
eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z[:,is2], mu[:,is2], S[:,is2]) * 2.
target += p1.gradients_Z_variational(dL_dKmm, dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], S[:,is1], Z[:,is1])
return target
def gradients_muS_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
from white import White
from rbf import RBF
#from rbf_inv import RBFInv
from bias import Bias
from linear import Linear
#from fixed import Fixed
target_mu = np.zeros(mu.shape)
target_S = np.zeros(S.shape)
for p1, is1 in zip(self._parameters_, self.input_slices):
#compute the effective dL_dpsi1. Extra terms appear because of the cross terms in psi2!
eff_dL_dpsi1 = dL_dpsi1.copy()
for p2, is2 in zip(self._parameters_, self.input_slices):
if p2 is p1:
continue
if isinstance(p2, White):
continue
elif isinstance(p2, Bias):
eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
else:
eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z[:,is2], mu[:,is2], S[:,is2]) * 2.
a, b = p1.gradients_muS_variational(dL_dKmm, dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], S[:,is1], Z[:,is1])
target_mu += a
target_S += b
return target_mu, target_S
def plot(self, *args, **kwargs):
"""
See GPy.plotting.matplot_dep.plot
"""
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ...plotting.matplot_dep import kernel_plots
kernel_plots.plot(self,*args)
def _getstate(self):
"""
Get the current state of the class,
here just all the indices, rest can get recomputed
"""
return Parameterized._getstate(self) + [#self._parameters_,
self.input_dim,
self.input_slices,
self._param_slices_
]
def _setstate(self, state):
self._param_slices_ = state.pop()
self.input_slices = state.pop()
self.input_dim = state.pop()
Parameterized._setstate(self, state)
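# A minimal usage sketch for Add (illustration only; assumes RBF and Bias are
# re-exported from GPy.kern as in this changeset). The '+' operator on Kern
# builds an Add over the two parts and sums their covariances.
if __name__ == '__main__':
    import numpy as np
    from GPy.kern import RBF, Bias
    k = RBF(2) + Bias(2)            # same input space, so tensor=False
    X = np.random.randn(10, 2)
    print k.K(X).shape              # (10, 10): RBF part plus constant bias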

62
GPy/kern/_src/bias.py Normal file
View file

@ -0,0 +1,62 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
import numpy as np
class Bias(Kern):
def __init__(self,input_dim,variance=1.,name=None):
super(Bias, self).__init__(input_dim, name)
self.variance = Param("variance", variance, Logexp())
self.add_parameter(self.variance)
def K(self, X, X2=None):
shape = (X.shape[0], X.shape[0] if X2 is None else X2.shape[0])
ret = np.empty(shape, dtype=np.float64)
ret[:] = self.variance
return ret
def Kdiag(self,X):
ret = np.empty((X.shape[0],), dtype=np.float64)
ret[:] = self.variance
return ret
def update_gradients_full(self, dL_dK, X, X2=None):
self.variance.gradient = dL_dK.sum()
def update_gradients_diag(self, dL_dKdiag, X):
self.variance.gradient = dL_dKdiag.sum()
def gradients_X(self, dL_dK, X, X2=None):
return np.zeros(X.shape)
def gradients_X_diag(self, dL_dKdiag, X):
return np.zeros(X.shape)
#---------------------------------------#
# PSI statistics #
#---------------------------------------#
def psi0(self, Z, mu, S):
return self.Kdiag(mu)
def psi1(self, Z, mu, S):
return self.K(mu, Z)
def psi2(self, Z, mu, S):
ret = np.empty((mu.shape[0], Z.shape[0], Z.shape[0]), dtype=np.float64)
ret[:] = self.variance**2
return ret
def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
self.variance.gradient = dL_dKmm.sum() + dL_dpsi0.sum() + dL_dpsi1.sum() + 2.*self.variance*dL_dpsi2.sum()
def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
return np.zeros(Z.shape)
def gradients_muS_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
return np.zeros(mu.shape), np.zeros(S.shape)
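# A small sketch of the Bias kernel's behaviour (illustration only): the
# covariance is the constant `variance` for every pair of inputs.
if __name__ == '__main__':
    import numpy as np
    k = Bias(3, variance=2.)
    X = np.random.randn(5, 3)
    print k.K(X)        # 5 x 5 matrix filled with 2.0
    print k.Kdiag(X)    # length-5 vector filled with 2.0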

50
GPy/kern/_src/brownian.py Normal file
View file

@ -0,0 +1,50 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
import numpy as np
class Brownian(Kern):
"""
Brownian motion in 1D only.
Negative times are treated as a separate (backwards!) Brownian motion.
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance:
:type variance: float
"""
def __init__(self, input_dim=1, variance=1., name='Brownian'):
assert input_dim==1, "Brownian motion in 1D only"
super(Brownian, self).__init__(input_dim, name)
self.variance = Param('variance', variance, Logexp())
self.add_parameters(self.variance)
def K(self,X,X2=None):
if X2 is None:
X2 = X
return self.variance*np.where(np.sign(X)==np.sign(X2.T),np.fmin(np.abs(X),np.abs(X2.T)), 0.)
def Kdiag(self,X):
return self.variance*np.abs(X.flatten())
def update_gradients_full(self, dL_dK, X, X2=None):
if X2 is None:
X2 = X
self.variance.gradient = np.sum(dL_dK * np.where(np.sign(X)==np.sign(X2.T),np.fmin(np.abs(X),np.abs(X2.T)), 0.))
#def update_gradients_diag(self, dL_dKdiag, X):
#self.variance.gradient = np.dot(np.abs(X.flatten()), dL_dKdiag)
#def gradients_X(self, dL_dK, X, X2=None):
#if X2 is None:
#return np.sum(self.variance*dL_dK*np.abs(X),1)[:,None]
#else:
#return np.sum(np.where(np.logical_and(np.abs(X)<np.abs(X2.T), np.sign(X)==np.sign(X2)), self.variance*dL_dK,0.),1)[:,None]
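# Sketch of the Brownian covariance (illustration only): for two inputs with
# the same sign it is variance * min(|x|, |x'|), and zero otherwise.
if __name__ == '__main__':
    import numpy as np
    k = Brownian(1, variance=1.)
    X = np.array([[0.5], [1.0], [-2.0]])
    print k.K(X)        # K[0, 1] == 0.5; K[0, 2] == 0. (opposite signs)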

View file

@ -1,12 +1,13 @@
# Copyright (c) 2012, James Hensman and Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart from kern import Kern
import numpy as np
from scipy import weave
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
class Coregionalize(Kernpart): class Coregionalize(Kern):
""" """
Covariance function for intrinsic/linear coregionalization models Covariance function for intrinsic/linear coregionalization models
@ -37,7 +38,7 @@ class Coregionalize(Kernpart):
super(Coregionalize, self).__init__(input_dim=1, name=name)
self.output_dim = output_dim
self.rank = rank
if self.rank>output_dim-1: if self.rank>output_dim:
print("Warning: Unusual choice of rank, it should normally be less than the output_dim.")
if W is None:
W = 0.5*np.random.randn(self.output_dim, self.rank)/np.sqrt(self.rank)
@ -48,7 +49,7 @@ class Coregionalize(Kernpart):
kappa = 0.5*np.ones(self.output_dim)
else:
assert kappa.shape==(self.output_dim, )
self.kappa = Param('kappa', kappa) self.kappa = Param('kappa', kappa, Logexp())
self.add_parameters(self.W, self.kappa)
self.parameters_changed()
@ -56,8 +57,8 @@ class Coregionalize(Kernpart):
def parameters_changed(self):
self.B = np.dot(self.W, self.W.T) + np.diag(self.kappa)
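# (B is the output_dim x output_dim coregionalization matrix W W^T + diag(kappa);
#  K below simply looks up B[index_i, index_j] for the integer output indices stored in X.)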
def K(self,index,index2,target): def K(self, X, X2=None):
index = np.asarray(index,dtype=np.int) index = np.asarray(X, dtype=np.int)
#here's the old code (numpy)
#if index2 is None:
@ -69,41 +70,45 @@ class Coregionalize(Kernpart):
#ii, jj = ii.T, jj.T
#false_target += self.B[ii, jj]
if index2 is None:
if X2 is None:
target = np.empty((X.shape[0], X.shape[0]), dtype=np.float64)
code=""" code="""
for(int i=0;i<N; i++){ for(int i=0;i<N; i++){
target[i+i*N] += B[index[i]+output_dim*index[i]]; target[i+i*N] = B[index[i]+output_dim*index[i]];
for(int j=0; j<i; j++){ for(int j=0; j<i; j++){
target[j+i*N] += B[index[i]+output_dim*index[j]]; target[j+i*N] = B[index[i]+output_dim*index[j]];
target[i+j*N] += target[j+i*N]; target[i+j*N] = target[j+i*N];
}
}
"""
N, B, output_dim = index.size, self.B, self.output_dim
weave.inline(code, ['target', 'index', 'N', 'B', 'output_dim'])
else:
index2 = np.asarray(index2,dtype=np.int) index2 = np.asarray(X2, dtype=np.int)
target = np.empty((X.shape[0], X2.shape[0]), dtype=np.float64)
code=""" code="""
for(int i=0;i<num_inducing; i++){ for(int i=0;i<num_inducing; i++){
for(int j=0; j<N; j++){ for(int j=0; j<N; j++){
target[i+j*num_inducing] += B[output_dim*index[j]+index2[i]]; target[i+j*num_inducing] = B[output_dim*index[j]+index2[i]];
}
}
"""
N, num_inducing, B, output_dim = index.size, index2.size, self.B, self.output_dim
weave.inline(code, ['target', 'index', 'index2', 'N', 'num_inducing', 'B', 'output_dim'])
return target
def Kdiag(self,index,target): def Kdiag(self, X):
target += np.diag(self.B)[np.asarray(index,dtype=np.int).flatten()] return np.diag(self.B)[np.asarray(X, dtype=np.int).flatten()]
def update_gradients_full(self,dL_dK, index, index2=None): def update_gradients_full(self, dL_dK, X, X2=None):
index = np.asarray(index,dtype=np.int) index = np.asarray(X, dtype=np.int)
dL_dK_small = np.zeros_like(self.B)
if index2 is None: if X2 is None:
index2 = index
else:
index2 = np.asarray(index2,dtype=np.int) index2 = np.asarray(X2, dtype=np.int)
code="""
for(int i=0; i<num_inducing; i++){
@ -122,17 +127,15 @@ class Coregionalize(Kernpart):
self.W.gradient = dW
self.kappa.gradient = dkappa
def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): def update_gradients_diag(self, dL_dKdiag, X):
raise NotImplementedError, "some code below" index = np.asarray(X, dtype=np.int).flatten()
#def dKdiag_dtheta(self,dL_dKdiag,index,target): dL_dKdiag_small = np.array([dL_dKdiag[index==i] for i in xrange(output_dim)])
#index = np.asarray(index,dtype=np.int).flatten() self.W.gradient = 2.*self.W*dL_dKdiag_small[:, None]
#dL_dKdiag_small = np.zeros(self.output_dim) self.kappa.gradient = dL_dKdiag_small
#for i in range(self.output_dim):
#dL_dKdiag_small[i] += np.sum(dL_dKdiag[index==i]) def gradients_X(self, dL_dK, X, X2=None):
#dW = 2.*self.W*dL_dKdiag_small[:,None] return np.zeros(X.shape)
#dkappa = dL_dKdiag_small
#target += np.hstack([dW.flatten(),dkappa]) def gradients_X_diag(self, dL_dKdiag, X):
return np.zeros(X.shape)
def gradients_X(self,dL_dK,X,X2,target):
#NOTE In this case, pass is equivalent to returning zero.
pass

328
GPy/kern/_src/kern.py Normal file
View file

@ -0,0 +1,328 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import sys
import numpy as np
import itertools
from ...core.parameterization import Parameterized
from ...core.parameterization.param import Param
class Kern(Parameterized):
def __init__(self, input_dim, name, *a, **kw):
"""
The base class for a kernel: a positive definite function
which forms a covariance function (kernel).
:param input_dim: the number of input dimensions to the function
:type input_dim: int
Do not instantiate.
"""
super(Kern, self).__init__(name=name, *a, **kw)
self.input_dim = input_dim
def K(self, X, X2):
raise NotImplementedError
def Kdiag(self, Xa):
raise NotImplementedError
def psi0(self,Z,posterior_variational):
raise NotImplementedError
def psi1(self,Z,posterior_variational):
raise NotImplementedError
def psi2(self,Z,posterior_variational):
raise NotImplementedError
def gradients_X(self, dL_dK, X, X2):
raise NotImplementedError
def gradients_X_diag(self, dL_dK, X):
raise NotImplementedError
def update_gradients_full(self, dL_dK, X):
"""Set the gradients of all parameters when doing full (N) inference."""
raise NotImplementedError
def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
target = np.zeros(self.size)
self.update_gradients_diag(dL_dKdiag, X)
self._collect_gradient(target)
self.update_gradients_full(dL_dKnm, X, Z)
self._collect_gradient(target)
self.update_gradients_full(dL_dKmm, Z, None)
self._collect_gradient(target)
self._set_gradient(target)
def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, posterior_variational):
"""Set the gradients of all parameters when doing variational (M) inference with uncertain inputs."""
raise NotImplementedError
def gradients_Z_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
grad = self.gradients_X(dL_dKmm, Z)
grad += self.gradients_X(dL_dKnm.T, Z, X)
return grad
def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, posterior_variational):
raise NotImplementedError
def gradients_q_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, posterior_variational):
raise NotImplementedError
def plot_ARD(self, *args):
"""If an ARD kernel is present, plot a bar representation using matplotlib
See GPy.plotting.matplot_dep.plot_ARD
"""
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ...plotting.matplot_dep import kernel_plots
return kernel_plots.plot_ARD(self,*args)
def __add__(self, other):
""" Overloading of the '+' operator. for more control, see self.add """
return self.add(other)
def add(self, other, tensor=False):
"""
Add another kernel to this one.
If tensor is False, both kernels are defined on the same _space_. Then
the created kernel will have the same number of inputs as self and
other (which must be the same).
If tensor is True, then the dimensions are stacked 'horizontally', so
that the resulting kernel has self.input_dim + other.input_dim inputs.
:param other: the other kernel to be added
:type other: GPy.kern
"""
assert isinstance(other, Kern), "only kernels can be added to kernels..."
from add import Add
return Add([self, other], tensor)
def __call__(self, X, X2=None):
return self.K(X, X2)
def __mul__(self, other):
""" Here we overload the '*' operator. See self.prod for more information"""
return self.prod(other)
def __pow__(self, other, tensor=False):
"""
Shortcut for tensor `prod`.
"""
return self.prod(other, tensor=True)
def prod(self, other, tensor=False):
"""
Multiply two kernels (either on the same space, or on the tensor product of the input space).
:param other: the other kernel to be multiplied
:type other: GPy.kern
:param tensor: whether or not to use the tensor space (default is false).
:type tensor: bool
"""
assert isinstance(other, Kern), "only kernels can be multiplied by kernels..."
from prod import Prod
return Prod(self, other, tensor)
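# A short sketch of the composition rules described above (illustration only;
# assumes the RBF and Linear kernels shipped with this package):
if __name__ == '__main__':
    from GPy.kern import RBF, Linear
    k_sum = RBF(2) + Linear(2)                       # shared 2-D input space
    k_tensor = RBF(2).add(Linear(3), tensor=True)    # stacked 5-D input space
    print k_sum.input_dim, k_tensor.input_dim        # 2 5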
from GPy.core.model import Model
class Kern_check_model(Model):
"""This is a dummy model class used as a base class for checking that the gradients of a given kernel are implemented correctly. It enables checkgradient() to be called independently on a kernel."""
def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
from GPy.kern import RBF
Model.__init__(self, 'kernel_test_model')
num_samples = 20
num_samples2 = 10
if kernel==None:
kernel = RBF(1)
if X==None:
X = np.random.randn(num_samples, kernel.input_dim)
if dL_dK==None:
if X2==None:
dL_dK = np.ones((X.shape[0], X.shape[0]))
else:
dL_dK = np.ones((X.shape[0], X2.shape[0]))
self.kernel=kernel
self.add_parameter(kernel)
self.X = X
self.X2 = X2
self.dL_dK = dL_dK
def is_positive_definite(self):
v = np.linalg.eig(self.kernel.K(self.X))[0]
if any(v<-10*sys.float_info.epsilon):
return False
else:
return True
def log_likelihood(self):
return (self.dL_dK*self.kernel.K(self.X, self.X2)).sum()
def _log_likelihood_gradients(self):
raise NotImplementedError, "This needs to be implemented to use the kern_check_model class."
class Kern_check_dK_dtheta(Kern_check_model):
"""This class allows gradient checks for the gradient of a kernel with respect to parameters. """
def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2)
def _log_likelihood_gradients(self):
return self.kernel._param_grad_helper(self.dL_dK, self.X, self.X2)
class Kern_check_dKdiag_dtheta(Kern_check_model):
"""This class allows gradient checks of the gradient of the diagonal of a kernel with respect to the parameters."""
def __init__(self, kernel=None, dL_dK=None, X=None):
Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None)
if dL_dK==None:
self.dL_dK = np.ones((self.X.shape[0]))
def parameters_changed(self):
self.kernel.update_gradients_full(self.dL_dK, self.X)
def log_likelihood(self):
return (self.dL_dK*self.kernel.Kdiag(self.X)).sum()
def _log_likelihood_gradients(self):
return self.kernel.dKdiag_dtheta(self.dL_dK, self.X)
class Kern_check_dK_dX(Kern_check_model):
"""This class allows gradient checks for the gradient of a kernel with respect to X. """
def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2)
self.remove_parameter(kernel)
self.X = Param('X', self.X)
self.add_parameter(self.X)
def _log_likelihood_gradients(self):
return self.kernel.gradients_X(self.dL_dK, self.X, self.X2).flatten()
class Kern_check_dKdiag_dX(Kern_check_dK_dX):
"""This class allows gradient checks for the gradient of a kernel diagonal with respect to X. """
def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
Kern_check_dK_dX.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None)
if dL_dK==None:
self.dL_dK = np.ones((self.X.shape[0]))
def log_likelihood(self):
return (self.dL_dK*self.kernel.Kdiag(self.X)).sum()
def _log_likelihood_gradients(self):
return self.kernel.dKdiag_dX(self.dL_dK, self.X).flatten()
def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False):
"""
This function runs on kernels to check the correctness of their
implementation. It checks that the covariance function is positive definite
for a randomly generated data set.
:param kern: the kernel to be tested.
:type kern: GPy.kern.Kern
:param X: X input values to test the covariance function.
:type X: ndarray
:param X2: X2 input values to test the covariance function.
:type X2: ndarray
"""
pass_checks = True
if X==None:
X = np.random.randn(10, kern.input_dim)
if output_ind is not None:
X[:, output_ind] = np.random.randint(kern.output_dim, size=X.shape[0])
if X2==None:
X2 = np.random.randn(20, kern.input_dim)
if output_ind is not None:
X2[:, output_ind] = np.random.randint(kern.output_dim, size=X2.shape[0])
if verbose:
print("Checking covariance function is positive definite.")
result = Kern_check_model(kern, X=X).is_positive_definite()
if result and verbose:
print("Check passed.")
if not result:
print("Positive definite check failed for " + kern.name + " covariance function.")
pass_checks = False
return False
if verbose:
print("Checking gradients of K(X, X) wrt theta.")
result = Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=verbose)
if result and verbose:
print("Check passed.")
if not result:
print("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:")
Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=True)
pass_checks = False
return False
if verbose:
print("Checking gradients of K(X, X2) wrt theta.")
result = Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=verbose)
if result and verbose:
print("Check passed.")
if not result:
print("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:")
Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=True)
pass_checks = False
return False
if verbose:
print("Checking gradients of Kdiag(X) wrt theta.")
result = Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=verbose)
if result and verbose:
print("Check passed.")
if not result:
print("Gradient of Kdiag(X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:")
Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=True)
pass_checks = False
return False
if verbose:
print("Checking gradients of K(X, X) wrt X.")
try:
result = Kern_check_dK_dX(kern, X=X, X2=None).checkgrad(verbose=verbose)
except NotImplementedError:
result=True
if verbose:
print("gradients_X not implemented for " + kern.name)
if result and verbose:
print("Check passed.")
if not result:
print("Gradient of K(X, X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")
Kern_check_dK_dX(kern, X=X, X2=None).checkgrad(verbose=True)
pass_checks = False
return False
if verbose:
print("Checking gradients of K(X, X2) wrt X.")
try:
result = Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=verbose)
except NotImplementedError:
result=True
if verbose:
print("gradients_X not implemented for " + kern.name)
if result and verbose:
print("Check passed.")
if not result:
print("Gradient of K(X, X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")
Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=True)
pass_checks = False
return False
if verbose:
print("Checking gradients of Kdiag(X) wrt X.")
try:
result = Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=verbose)
except NotImplementedError:
result=True
if verbose:
print("gradients_X not implemented for " + kern.name)
if result and verbose:
print("Check passed.")
if not result:
print("Gradient of Kdiag(X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")
Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=True)
pass_checks = False
return False
return pass_checks
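# Example of how the checks above might be driven (illustrative sketch):
if __name__ == '__main__':
    from GPy.kern import RBF
    ok = kern_test(RBF(2), verbose=True)   # positive-definiteness plus gradient checks
    print 'all checks passed:', ok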

254
GPy/kern/_src/linear.py Normal file
View file

@ -0,0 +1,254 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from scipy import weave
from kern import Kern
from ...util.linalg import tdot
from ...util.misc import fast_array_equal, param_to_array
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
from ...util.caching import cache_this
class Linear(Kern):
"""
Linear kernel
.. math::
k(x,y) = \sum_{i=1}^input_dim \sigma^2_i x_iy_i
:param input_dim: the number of input dimensions
:type input_dim: int
:param variances: the vector of variances :math:`\sigma^2_i`
:type variances: array or list of the appropriate size (or float if there is only one variance parameter)
:param ARD: Auto Relevance Determination. If equal to "False", the kernel has only one variance parameter \sigma^2, otherwise there is one variance parameter per dimension.
:type ARD: Boolean
:rtype: kernel object
"""
def __init__(self, input_dim, variances=None, ARD=False, name='linear'):
super(Linear, self).__init__(input_dim, name)
self.ARD = ARD
if ARD == False:
if variances is not None:
variances = np.asarray(variances)
assert variances.size == 1, "Only one variance needed for non-ARD kernel"
else:
variances = np.ones(1)
self._Xcache, self._X2cache = np.empty(shape=(2,))
else:
if variances is not None:
variances = np.asarray(variances)
assert variances.size == self.input_dim, "bad number of variances, need one ARD variance per input_dim"
else:
variances = np.ones(self.input_dim)
self.variances = Param('variances', variances, Logexp())
self.add_parameter(self.variances)
self.variances.add_observer(self, self._on_changed)
def _on_changed(self, obj):
#TODO: move this to base class? isn't it just for the caching?
self._notify_observers()
#@cache_this(limit=3, reset_on_self=True)
def K(self, X, X2=None):
if self.ARD:
if X2 is None:
return tdot(X*np.sqrt(self.variances))
else:
rv = np.sqrt(self.variances)
return np.dot(X*rv, (X2*rv).T)
else:
return self._dot_product(X, X2) * self.variances
#@cache_this(limit=3, reset_on_self=False)
def _dot_product(self, X, X2=None):
if X2 is None:
return tdot(X)
else:
return np.dot(X, X2.T)
def Kdiag(self, X):
return np.sum(self.variances * np.square(X), -1)
def update_gradients_full(self, dL_dK, X, X2=None):
if self.ARD:
if X2 is None:
self.variances.gradient = np.array([np.sum(dL_dK * tdot(X[:, i:i + 1])) for i in range(self.input_dim)])
else:
product = X[:, None, :] * X2[None, :, :]
self.variances.gradient = (dL_dK[:, :, None] * product).sum(0).sum(0)
else:
self.variances.gradient = np.sum(self._dot_product(X, X2) * dL_dK)
def update_gradients_diag(self, dL_dKdiag, X):
tmp = dL_dKdiag[:, None] * X ** 2
if self.ARD:
self.variances.gradient = tmp.sum(0)
else:
self.variances.gradient = np.atleast_1d(tmp.sum())
def gradients_X(self, dL_dK, X, X2=None):
if X2 is None:
return 2.*(((X[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
else:
return (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
def gradients_X_diag(self, dL_dKdiag, X):
return 2.*self.variances*dL_dKdiag[:,None]*X
#---------------------------------------#
# PSI statistics #
# variational #
#---------------------------------------#
def psi0(self, Z, posterior_variational):
return np.sum(self.variances * self._mu2S(posterior_variational), 1)
def psi1(self, Z, posterior_variational):
return self.K(posterior_variational.mean, Z) #the variance, it does nothing
def psi2(self, Z, posterior_variational):
ZA = Z * self.variances
ZAinner = self._ZAinner(posterior_variational, Z)
return np.dot(ZAinner, ZA.T)
def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, posterior_variational, Z):
mu, S = posterior_variational.mean, posterior_variational.variance
# psi0:
tmp = dL_dpsi0[:, None] * self._mu2S(posterior_variational)
if self.ARD: grad = tmp.sum(0)
else: grad = np.atleast_1d(tmp.sum())
#psi1
self.update_gradients_full(dL_dpsi1, mu, Z)
grad += self.variances.gradient
#psi2
tmp = dL_dpsi2[:, :, :, None] * (self._ZAinner(posterior_variational, Z)[:, :, None, :] * (2. * Z)[None, None, :, :])
if self.ARD: grad += tmp.sum(0).sum(0).sum(0)
else: grad += tmp.sum()
#from Kmm
self.update_gradients_full(dL_dKmm, Z, None)
self.variances.gradient += grad
def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, posterior_variational, Z):
# Kmm
grad = self.gradients_X(dL_dKmm, Z, None)
#psi1
grad += self.gradients_X(dL_dpsi1.T, Z, posterior_variational.mean)
#psi2
self._weave_dpsi2_dZ(dL_dpsi2, Z, posterior_variational, grad)
return grad
def gradients_q_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, posterior_variational, Z):
grad_mu, grad_S = np.zeros(posterior_variational.mean.shape), np.zeros(posterior_variational.mean.shape)
# psi0
grad_mu += dL_dpsi0[:, None] * (2.0 * posterior_variational.mean * self.variances)
grad_S += dL_dpsi0[:, None] * self.variances
# psi1
grad_mu += (dL_dpsi1[:, :, None] * (Z * self.variances)).sum(1)
# psi2
self._weave_dpsi2_dmuS(dL_dpsi2, Z, posterior_variational, grad_mu, grad_S)
return grad_mu, grad_S
#--------------------------------------------------#
# Helpers for psi statistics #
#--------------------------------------------------#
def _weave_dpsi2_dmuS(self, dL_dpsi2, Z, pv, target_mu, target_S):
# Think N,num_inducing,num_inducing,input_dim
ZA = Z * self.variances
AZZA = ZA.T[:, None, :, None] * ZA[None, :, None, :]
AZZA = AZZA + AZZA.swapaxes(1, 2)
AZZA_2 = AZZA/2.
#Using weave, we can exploit the symmetry of this problem:
code = """
int n, m, mm,q,qq;
double factor,tmp;
#pragma omp parallel for private(m,mm,q,qq,factor,tmp)
for(n=0;n<N;n++){
for(m=0;m<num_inducing;m++){
for(mm=0;mm<=m;mm++){
//add in a factor of 2 for the off-diagonal terms (and then count them only once)
if(m==mm)
factor = dL_dpsi2(n,m,mm);
else
factor = 2.0*dL_dpsi2(n,m,mm);
for(q=0;q<input_dim;q++){
//take the dot product of mu[n,:] and AZZA[:,m,mm,q] TODO: blas!
tmp = 0.0;
for(qq=0;qq<input_dim;qq++){
tmp += mu(n,qq)*AZZA(qq,m,mm,q);
}
target_mu(n,q) += factor*tmp;
target_S(n,q) += factor*AZZA_2(q,m,mm,q);
}
}
}
}
"""
support_code = """
#include <omp.h>
#include <math.h>
"""
weave_options = {'headers' : ['<omp.h>'],
'extra_compile_args': ['-fopenmp -O3'], #-march=native'],
'extra_link_args' : ['-lgomp']}
mu = pv.mean
N,num_inducing,input_dim,mu = mu.shape[0],Z.shape[0],mu.shape[1],param_to_array(mu)
weave.inline(code, support_code=support_code, libraries=['gomp'],
arg_names=['N','num_inducing','input_dim','mu','AZZA','AZZA_2','target_mu','target_S','dL_dpsi2'],
type_converters=weave.converters.blitz,**weave_options)
def _weave_dpsi2_dZ(self, dL_dpsi2, Z, pv, target):
AZA = self.variances*self._ZAinner(pv, Z)
code="""
int n,m,mm,q;
#pragma omp parallel for private(n,mm,q)
for(m=0;m<num_inducing;m++){
for(q=0;q<input_dim;q++){
for(mm=0;mm<num_inducing;mm++){
for(n=0;n<N;n++){
target(m,q) += 2*dL_dpsi2(n,m,mm)*AZA(n,mm,q);
}
}
}
}
"""
support_code = """
#include <omp.h>
#include <math.h>
"""
weave_options = {'headers' : ['<omp.h>'],
'extra_compile_args': ['-fopenmp -O3'], #-march=native'],
'extra_link_args' : ['-lgomp']}
N,num_inducing,input_dim = pv.mean.shape[0],Z.shape[0],pv.mean.shape[1]
mu = param_to_array(pv.mean)
weave.inline(code, support_code=support_code, libraries=['gomp'],
arg_names=['N','num_inducing','input_dim','AZA','target','dL_dpsi2'],
type_converters=weave.converters.blitz,**weave_options)
def _mu2S(self, pv):
return np.square(pv.mean) + pv.variance
def _ZAinner(self, pv, Z):
ZA = Z*self.variances
inner = (pv.mean[:, None, :] * pv.mean[:, :, None])
diag_indices = np.diag_indices(pv.mean.shape[1], 2)
inner[:, diag_indices[0], diag_indices[1]] += pv.variance
return np.dot(ZA, inner).swapaxes(0, 1) # NOTE: self.ZAinner \in [num_inducing x N x input_dim]!
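# Minimal sketch (illustration only): with ARD=False the linear kernel is just
# variance * X X^T, and its diagonal is variance * sum_i x_i^2.
if __name__ == '__main__':
    import numpy as np
    k = Linear(2, variances=3., ARD=False)
    X = np.random.randn(4, 2)
    print np.allclose(k.K(X), 3. * np.dot(X, X.T))           # True
    print np.allclose(k.Kdiag(X), 3. * np.sum(X ** 2, -1))   # True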

65
GPy/kern/_src/prod.py Normal file
View file

@ -0,0 +1,65 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kern import Kern
import numpy as np
class Prod(Kern):
"""
Computes the product of 2 kernels
:param k1, k2: the kernels to multiply
:type k1, k2: Kern
:param tensor: The kernels are either multiplied as functions defined on the same input space (default) or on the product of the input spaces
:type tensor: Boolean
:rtype: kernel object
"""
def __init__(self, k1, k2, tensor=False):
if tensor:
super(Prod, self).__init__(k1.input_dim + k2.input_dim, k1.name + '_xx_' + k2.name)
self.slice1 = slice(0,k1.input_dim)
self.slice2 = slice(k1.input_dim,k1.input_dim+k2.input_dim)
else:
assert k1.input_dim == k2.input_dim, "Error: The input spaces of the kernels to multiply don't have the same dimension."
super(Prod, self).__init__(k1.input_dim, k1.name + '_x_' + k2.name)
self.slice1 = slice(0, self.input_dim)
self.slice2 = slice(0, self.input_dim)
self.k1 = k1
self.k2 = k2
self.add_parameters(self.k1, self.k2)
def K(self, X, X2=None):
if X2 is None:
return self.k1.K(X[:,self.slice1], None) * self.k2.K(X[:,self.slice2], None)
else:
return self.k1.K(X[:,self.slice1], X2[:,self.slice1]) * self.k2.K(X[:,self.slice2], X2[:,self.slice2])
def Kdiag(self, X):
return self.k1.Kdiag(X[:,self.slice1]) * self.k2.Kdiag(X[:,self.slice2])
def update_gradients_full(self, dL_dK, X):
self.k1.update_gradients_full(dL_dK*self.k2(X[:,self.slice2]), X[:,self.slice1])
self.k2.update_gradients_full(dL_dK*self.k1(X[:,self.slice1]), X[:,self.slice2])
def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
self.k1.update_gradients_sparse(dL_dKmm * self.k2.K(Z[:,self.slice2]), dL_dKnm * self.k2(X[:,self.slice2], Z[:,self.slice2]), dL_dKdiag * self.k2.Kdiag(X[:,self.slice2]), X[:,self.slice1], Z[:,self.slice1] )
self.k2.update_gradients_sparse(dL_dKmm * self.k1.K(Z[:,self.slice1]), dL_dKnm * self.k1(X[:,self.slice1], Z[:,self.slice1]), dL_dKdiag * self.k1.Kdiag(X[:,self.slice1]), X[:,self.slice2], Z[:,self.slice2] )
def gradients_X(self, dL_dK, X, X2=None):
target = np.zeros(X.shape)
if X2 is None:
target[:,self.slice1] += self.k1.gradients_X(dL_dK*self.k2(X[:,self.slice2]), X[:,self.slice1], None)
target[:,self.slice2] += self.k2.gradients_X(dL_dK*self.k1(X[:,self.slice1]), X[:,self.slice2], None)
else:
target[:,self.slice1] += self.k1.gradients_X(dL_dK*self.k2(X[:,self.slice2], X2[:,self.slice2]), X[:,self.slice1], X2[:,self.slice1])
target[:,self.slice2] += self.k2.gradients_X(dL_dK*self.k1(X[:,self.slice1], X2[:,self.slice1]), X[:,self.slice2], X2[:,self.slice2])
return target
def gradients_X_diag(self, dL_dKdiag, X):
target = np.zeros(X.shape)
target[:,self.slice1] = self.k1.gradients_X(dL_dKdiag*self.k2.Kdiag(X[:,self.slice2]), X[:,self.slice1])
target[:,self.slice2] += self.k2.gradients_X(dL_dKdiag*self.k1.Kdiag(X[:,self.slice1]), X[:,self.slice2])
return target
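# Minimal sketch (illustration only): a same-space Prod multiplies the two part
# covariances elementwise.
if __name__ == '__main__':
    import numpy as np
    from GPy.kern import RBF, Linear
    k1, k2 = RBF(2), Linear(2)
    k = k1 * k2
    X = np.random.randn(6, 2)
    print np.allclose(k.K(X), k1.K(X) * k2.K(X))   # True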

View file

@ -4,13 +4,13 @@
import numpy as np
from scipy import weave
from kernpart import Kernpart from kern import Kern
from ...util.linalg import tdot
from ...util.misc import fast_array_equal, param_to_array
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
class RBF(Kernpart): class RBF(Kern):
""" """
Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel: Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel:
@ -60,22 +60,8 @@ class RBF(Kernpart):
self.add_parameters(self.variance, self.lengthscale)
self.parameters_changed() # initializes cache
#self.update_inv_lengthscale(self.lengthscale)
#self.parameters_changed()
# initialize cache
#self._Z, self._mu, self._S = np.empty(shape=(3, 1))
#self._X, self._X2, self._params_save = np.empty(shape=(3, 1))
# a set of optional args to pass to weave
# self.weave_options = {'headers' : ['<omp.h>'],
# 'extra_compile_args': ['-fopenmp -O3'], # -march=native'],
# 'extra_link_args' : ['-lgomp']}
self.weave_options = {} self.weave_options = {}
def on_input_change(self, X):
#self._K_computations(X, None)
pass
def update_lengthscale(self, l):
self.lengthscale2 = np.square(self.lengthscale)
@ -84,23 +70,32 @@ class RBF(Kernpart):
self._X, self._X2 = np.empty(shape=(2, 1))
self._Z, self._mu, self._S = np.empty(shape=(3, 1)) # cached versions of Z,mu,S
def K(self, X, X2, target): def K(self, X, X2=None):
self._K_computations(X, X2) self._K_computations(X, X2)
target += self.variance * self._K_dvar return self.variance * self._K_dvar
def Kdiag(self, X, target): def Kdiag(self, X):
np.add(target, self.variance, target) ret = np.ones(X.shape[0])
ret[:] = self.variance
return ret
def psi0(self, Z, mu, S, target): def psi0(self, Z, posterior_variational):
target += self.variance mu = posterior_variational.mean
ret = np.empty(mu.shape[0], dtype=np.float64)
ret[:] = self.variance
return ret
def psi1(self, Z, mu, S, target): def psi1(self, Z, posterior_variational):
mu = posterior_variational.mean
S = posterior_variational.variance
self._psi_computations(Z, mu, S) self._psi_computations(Z, mu, S)
target += self._psi1 return self._psi1
def psi2(self, Z, mu, S, target): def psi2(self, Z, posterior_variational):
mu = posterior_variational.mean
S = posterior_variational.variance
self._psi_computations(Z, mu, S) self._psi_computations(Z, mu, S)
target += self._psi2 return self._psi2
def update_gradients_full(self, dL_dK, X):
self._K_computations(X, None)
@ -131,7 +126,9 @@ class RBF(Kernpart):
else:
self.lengthscale.gradient += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm)
def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, posterior_variational):
mu = posterior_variational.mean
S = posterior_variational.variance
self._psi_computations(Z, mu, S)
#contributions from psi0:
@ -165,7 +162,43 @@ class RBF(Kernpart):
else:
self.lengthscale.gradient += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm)
def gradients_X(self, dL_dK, X, X2, target): def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, posterior_variational):
mu = posterior_variational.mean
S = posterior_variational.variance
self._psi_computations(Z, mu, S)
#psi1
denominator = (self.lengthscale2 * (self._psi1_denom))
dpsi1_dZ = -self._psi1[:, :, None] * ((self._psi1_dist / denominator))
grad = np.sum(dL_dpsi1[:, :, None] * dpsi1_dZ, 0)
#psi2
term1 = self._psi2_Zdist / self.lengthscale2 # num_inducing, num_inducing, input_dim
term2 = self._psi2_mudist / self._psi2_denom / self.lengthscale2 # N, num_inducing, num_inducing, input_dim
dZ = self._psi2[:, :, :, None] * (term1[None] + term2)
grad += 2*(dL_dpsi2[:, :, :, None] * dZ).sum(0).sum(0)
grad += self.gradients_X(dL_dKmm, Z, None)
return grad
def update_gradients_q_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, posterior_variational):
mu = posterior_variational.mean
S = posterior_variational.variance
self._psi_computations(Z, mu, S)
#psi1
tmp = self._psi1[:, :, None] / self.lengthscale2 / self._psi1_denom
grad_mu = np.sum(dL_dpsi1[:, :, None] * tmp * self._psi1_dist, 1)
grad_S = np.sum(dL_dpsi1[:, :, None] * 0.5 * tmp * (self._psi1_dist_sq - 1), 1)
#psi2
tmp = self._psi2[:, :, :, None] / self.lengthscale2 / self._psi2_denom
grad_mu += -2.*(dL_dpsi2[:, :, :, None] * tmp * self._psi2_mudist).sum(1).sum(1)
grad_S += (dL_dpsi2[:, :, :, None] * tmp * (2.*self._psi2_mudist_sq - 1)).sum(1).sum(1)
posterior_variational.mean.gradient = grad_mu
posterior_variational.variance.gradient = grad_S
def gradients_X(self, dL_dK, X, X2=None):
#if self._X is None or X.base is not self._X.base or X2 is not None:
self._K_computations(X, X2)
if X2 is None:
@ -173,44 +206,15 @@ class RBF(Kernpart):
else:
_K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
gradients_X = (-self.variance / self.lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2))
target += np.sum(gradients_X * dL_dK.T[:, :, None], 0) return np.sum(gradients_X * dL_dK.T[:, :, None], 0)
def dKdiag_dX(self, dL_dKdiag, X, target): def dKdiag_dX(self, dL_dKdiag, X):
pass return np.zeros(X.shape[0])
#---------------------------------------#
# PSI statistics #
#---------------------------------------#
def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S, target_mu, target_S):
pass
def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target):
self._psi_computations(Z, mu, S)
denominator = (self.lengthscale2 * (self._psi1_denom))
dpsi1_dZ = -self._psi1[:, :, None] * ((self._psi1_dist / denominator))
target += np.sum(dL_dpsi1[:, :, None] * dpsi1_dZ, 0)
def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S):
self._psi_computations(Z, mu, S)
tmp = self._psi1[:, :, None] / self.lengthscale2 / self._psi1_denom
target_mu += np.sum(dL_dpsi1[:, :, None] * tmp * self._psi1_dist, 1)
target_S += np.sum(dL_dpsi1[:, :, None] * 0.5 * tmp * (self._psi1_dist_sq - 1), 1)
def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target):
self._psi_computations(Z, mu, S)
term1 = self._psi2_Zdist / self.lengthscale2 # num_inducing, num_inducing, input_dim
term2 = self._psi2_mudist / self._psi2_denom / self.lengthscale2 # N, num_inducing, num_inducing, input_dim
dZ = self._psi2[:, :, :, None] * (term1[None] + term2)
target += (dL_dpsi2[:, :, :, None] * dZ).sum(0).sum(0)
def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
"""Think N,num_inducing,num_inducing,input_dim """
self._psi_computations(Z, mu, S)
tmp = self._psi2[:, :, :, None] / self.lengthscale2 / self._psi2_denom
target_mu += -2.*(dL_dpsi2[:, :, :, None] * tmp * self._psi2_mudist).sum(1).sum(1)
target_S += (dL_dpsi2[:, :, :, None] * tmp * (2.*self._psi2_mudist_sq - 1)).sum(1).sum(1)
#---------------------------------------#
# Precomputations #
#---------------------------------------#
@ -373,6 +377,7 @@ class RBF(Kernpart):
#include <omp.h>
#include <math.h>
"""
mu = param_to_array(mu)
weave.inline(code, support_code=support_code, libraries=['gomp'],
arg_names=['N', 'num_inducing', 'input_dim', 'mu', 'Zhat', 'mudist_sq', 'mudist', 'lengthscale2', '_psi2_denom', 'psi2_Zdist_sq', 'psi2_exponent', 'half_log_psi2_denom', 'psi2', 'variance_sq'],
type_converters=weave.converters.blitz, **self.weave_options)

211
GPy/kern/_src/stationary.py Normal file
View file

@ -0,0 +1,211 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
from ... import util
import numpy as np
from scipy import integrate
class Stationary(Kern):
def __init__(self, input_dim, variance, lengthscale, ARD, name):
super(Stationary, self).__init__(input_dim, name)
self.ARD = ARD
if not ARD:
if lengthscale is None:
lengthscale = np.ones(1)
else:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == 1, "Only lengthscale needed for non-ARD kernel"
else:
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size in [1, input_dim], "Bad lengthscales"
if lengthscale.size != input_dim:
lengthscale = np.ones(input_dim)*lengthscale
else:
lengthscale = np.ones(self.input_dim)
self.lengthscale = Param('lengthscale', lengthscale, Logexp())
self.variance = Param('variance', variance, Logexp())
assert self.variance.size==1
self.add_parameters(self.variance, self.lengthscale)
def _dist(self, X, X2):
if X2 is None:
X2 = X
return X[:, None, :] - X2[None, :, :]
def _scaled_dist(self, X, X2=None):
return np.sqrt(np.sum(np.square(self._dist(X, X2) / self.lengthscale), -1))
def Kdiag(self, X):
ret = np.empty(X.shape[0])
ret[:] = self.variance
return ret
def update_gradients_diag(self, dL_dKdiag, X):
self.variance.gradient = np.sum(dL_dKdiag)
self.lengthscale.gradient = 0.
def update_gradients_full(self, dL_dK, X, X2=None):
K = self.K(X, X2)
self.variance.gradient = np.sum(K * dL_dK)/self.variance
rinv = self._inv_dist(X, X2)
dL_dr = self.dK_dr(X, X2) * dL_dK
x_xl3 = np.square(self._dist(X, X2)) / self.lengthscale**3
if self.ARD:
self.lengthscale.gradient = -((dL_dr*rinv)[:,:,None]*x_xl3).sum(0).sum(0)
else:
self.lengthscale.gradient = -((dL_dr*rinv)[:,:,None]*x_xl3).sum()
def _inv_dist(self, X, X2=None):
dist = self._scaled_dist(X, X2)
if X2 is None:
nondiag = util.diag.offdiag_view(dist)
nondiag[:] = 1./nondiag
return dist
else:
return 1./np.where(dist != 0., dist, np.inf)
def gradients_X(self, dL_dK, X, X2=None):
dL_dr = self.dK_dr(X, X2) * dL_dK
invdist = self._inv_dist(X, X2)
ret = np.sum((invdist*dL_dr)[:,:,None]*self._dist(X, X2),1)/self.lengthscale**2
if X2 is None:
ret *= 2.
return ret
def gradients_X_diag(self, dL_dKdiag, X):
return np.zeros(X.shape)
class Exponential(Stationary):
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, name='Exponential'):
super(Exponential, self).__init__(input_dim, variance, lengthscale, ARD, name)
def K(self, X, X2=None):
dist = self._scaled_dist(X, X2)
return self.variance * np.exp(-0.5 * dist)
def dK_dr(self, X, X2):
return -0.5*self.K(X, X2)
class Matern32(Stationary):
"""
Matern 3/2 kernel:
.. math::
k(r) = \\sigma^2 (1 + \\sqrt{3} r) \exp(- \sqrt{3} r) \\ \\ \\ \\ \\text{ where } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} }
"""
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, name='Mat32'):
super(Matern32, self).__init__(input_dim, variance, lengthscale, ARD, name)
def K(self, X, X2=None):
dist = self._scaled_dist(X, X2)
return self.variance * (1. + np.sqrt(3.) * dist) * np.exp(-np.sqrt(3.) * dist)
def dK_dr(self, X, X2):
dist = self._scaled_dist(X, X2)
return -3.*self.variance*dist*np.exp(-np.sqrt(3.)*dist)
def Gram_matrix(self, F, F1, F2, lower, upper):
"""
Return the Gram matrix of the vector of functions F with respect to the
RKHS norm. The use of this function is limited to input_dim=1.
:param F: vector of functions
:type F: np.array
:param F1: vector of derivatives of F
:type F1: np.array
:param F2: vector of second derivatives of F
:type F2: np.array
:param lower,upper: boundaries of the input domain
:type lower,upper: floats
"""
assert self.input_dim == 1
def L(x, i):
return(3. / self.lengthscale ** 2 * F[i](x) + 2 * np.sqrt(3) / self.lengthscale * F1[i](x) + F2[i](x))
n = F.shape[0]
G = np.zeros((n, n))
for i in range(n):
for j in range(i, n):
G[i, j] = G[j, i] = integrate.quad(lambda x : L(x, i) * L(x, j), lower, upper)[0]
Flower = np.array([f(lower) for f in F])[:, None]
F1lower = np.array([f(lower) for f in F1])[:, None]
return(self.lengthscale ** 3 / (12.*np.sqrt(3) * self.variance) * G + 1. / self.variance * np.dot(Flower, Flower.T) + self.lengthscale ** 2 / (3.*self.variance) * np.dot(F1lower, F1lower.T))
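# A hedged usage sketch (the names `kernel`, F, F1 and F2 below are illustrative,
# not part of this file); only valid for input_dim == 1:
#     F  = np.array([np.sin]); F1 = np.array([np.cos]); F2 = np.array([lambda x: -np.sin(x)])
#     G  = kernel.Gram_matrix(F, F1, F2, lower=0., upper=1.)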
class Matern52(Stationary):
"""
Matern 5/2 kernel:
.. math::
k(r) = \\sigma^2 (1 + \\sqrt{5} r + \\frac{5}{3} r^2) \\exp(-\\sqrt{5} r) \\qquad \\text{where } r = \\sqrt{\\sum_{i=1}^{\\text{input\\_dim}} \\frac{(x_i - y_i)^2}{\\ell_i^2}}
"""
def K(self, X, X2=None):
r = self._scaled_dist(X, X2)
return self.variance*(1+np.sqrt(5.)*r+5./3*r**2)*np.exp(-np.sqrt(5.)*r)
def dK_dr(self, X, X2):
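# Differentiating K(r) = variance * (1 + sqrt(5) r + 5/3 r^2) * exp(-sqrt(5) r) gives
# dK/dr = variance * (10/3 r - 5 r - 5 sqrt(5)/3 r^2) * exp(-sqrt(5) r).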
r = self._scaled_dist(X, X2)
return self.variance*(10./3*r -5.*r -5.*np.sqrt(5.)/3*r**2)*np.exp(-np.sqrt(5.)*r)
def Gram_matrix(self,F,F1,F2,F3,lower,upper):
"""
Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to input_dim=1.
:param F: vector of functions
:type F: np.array
:param F1: vector of derivatives of F
:type F1: np.array
:param F2: vector of second derivatives of F
:type F2: np.array
:param F3: vector of third derivatives of F
:type F3: np.array
:param lower,upper: boundaries of the input domain
:type lower,upper: floats
"""
assert self.input_dim == 1
def L(x,i):
return(5*np.sqrt(5)/self.lengthscale**3*F[i](x) + 15./self.lengthscale**2*F1[i](x)+ 3*np.sqrt(5)/self.lengthscale*F2[i](x) + F3[i](x))
n = F.shape[0]
G = np.zeros((n,n))
for i in range(n):
for j in range(i,n):
G[i,j] = G[j,i] = integrate.quad(lambda x : L(x,i)*L(x,j),lower,upper)[0]
G_coef = 3.*self.lengthscale**5/(400*np.sqrt(5))
Flower = np.array([f(lower) for f in F])[:,None]
F1lower = np.array([f(lower) for f in F1])[:,None]
F2lower = np.array([f(lower) for f in F2])[:,None]
orig = 9./8*np.dot(Flower,Flower.T) + 9.*self.lengthscale**4/200*np.dot(F2lower,F2lower.T)
orig2 = 3./5*self.lengthscale**2 * ( np.dot(F1lower,F1lower.T) + 1./8*np.dot(Flower,F2lower.T) + 1./8*np.dot(F2lower,Flower.T))
return(1./self.variance* (G_coef*G + orig + orig2))
class ExpQuad(Stationary):
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, name='ExpQuad'):
super(ExpQuad, self).__init__(input_dim, variance, lengthscale, ARD, name)
def K(self, X, X2=None):
r = self._scaled_dist(X, X2)
return self.variance * np.exp(-0.5 * r**2)
def dK_dr(self, X, X2):
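# With K(r) = variance * exp(-r^2 / 2), dK/dr = -r * K.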
dist = self._scaled_dist(X, X2)
return -dist*self.K(X, X2)
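# A minimal usage sketch (X and dL_dK below are illustrative, not defined in this file):
# a subclass only provides K() and dK_dr(); the Stationary base class turns dK/dr into
# parameter and input gradients.
#     k = ExpQuad(input_dim=2, variance=1., lengthscale=[1., 2.], ARD=True)
#     K = k.K(X)                           # (N, N) covariance matrix
#     k.update_gradients_full(dL_dK, X)    # fills k.variance.gradient and k.lengthscale.gradient
#     dX = k.gradients_X(dL_dK, X)         # (N, 2) gradient wrt the inputs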

563
GPy/kern/_src/sympykern.py Normal file
View file

@ -0,0 +1,563 @@
# Check Matthew Rocklin's blog post.
try:
import sympy as sp
sympy_available=True
except ImportError:
sympy_available=False
exit()
from sympy.core.cache import clear_cache
from sympy.utilities.codegen import codegen
try:
from scipy import weave
weave_available = True
except ImportError:
weave_available = False
import os
current_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
import sys
import numpy as np
import re
import tempfile
import pdb
import ast
from kernpart import Kernpart
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
# TODO: have this set up in a setup file!
user_code_storage = tempfile.gettempdir()
class spkern(Kernpart):
"""
A kernel object, where all the hard work is done by sympy.
:param k: the covariance function
:type k: a positive definite sympy function of x_0, z_0, x_1, z_1, x_2, z_2...
To construct a new sympy kernel, you'll need to define:
- a kernel function using a sympy object. Ensure that the kernel is of the form k(x,z).
- that's it! we'll extract the variables from the function k.
Note:
- to handle multiple inputs, name them x_0, z_0, x_1, z_1, etc.
- to handle multiple correlated outputs, you'll need to add parameters with an index, such as lengthscale_i and lengthscale_j.
"""
def __init__(self, input_dim, k=None, output_dim=1, name=None, param=None):
if name is None:
name='sympykern'
if k is None:
raise ValueError, "You must provide an argument for the covariance function."
super(spkern, self).__init__(input_dim, name)
self._sp_k = k
# pull the variable names out of the symbolic covariance function.
sp_vars = [e for e in k.atoms() if e.is_Symbol]
self._sp_x= sorted([e for e in sp_vars if e.name[0:2]=='x_'],key=lambda x:int(x.name[2:]))
self._sp_z= sorted([e for e in sp_vars if e.name[0:2]=='z_'],key=lambda z:int(z.name[2:]))
# Check that variable names make sense.
assert all([x.name=='x_%i'%i for i,x in enumerate(self._sp_x)])
assert all([z.name=='z_%i'%i for i,z in enumerate(self._sp_z)])
assert len(self._sp_x)==len(self._sp_z)
x_dim=len(self._sp_x)
# If it is a multi-output covariance, add an input for indexing the outputs.
self._real_input_dim = x_dim
# Check input dim is number of xs + 1 if output_dim is >1
assert self.input_dim == x_dim + int(output_dim > 1)
self.output_dim = output_dim
# extract parameter names from the covariance
thetas = sorted([e for e in sp_vars if not (e.name[0:2]=='x_' or e.name[0:2]=='z_')],key=lambda e:e.name)
# Look for parameters with index (subscripts), they are associated with different outputs.
if self.output_dim>1:
self._sp_theta_i = sorted([e for e in thetas if (e.name[-2:]=='_i')], key=lambda e:e.name)
self._sp_theta_j = sorted([e for e in thetas if (e.name[-2:]=='_j')], key=lambda e:e.name)
# Make sure parameter appears with both indices!
assert len(self._sp_theta_i)==len(self._sp_theta_j)
assert all([theta_i.name[:-2]==theta_j.name[:-2] for theta_i, theta_j in zip(self._sp_theta_i, self._sp_theta_j)])
# Extract names of shared parameters (those without a subscript)
self._sp_theta = [theta for theta in thetas if theta not in self._sp_theta_i and theta not in self._sp_theta_j]
self.num_split_params = len(self._sp_theta_i)
self._split_theta_names = ["%s"%theta.name[:-2] for theta in self._sp_theta_i]
for theta in self._split_theta_names:
setattr(self, theta, Param(theta, np.ones(self.output_dim), None))
self.add_parameters(getattr(self, theta))
#setattr(self, theta, np.ones(self.output_dim))
self.num_shared_params = len(self._sp_theta)
#self.num_params = self.num_shared_params+self.num_split_params*self.output_dim
else:
self.num_split_params = 0
self._split_theta_names = []
self._sp_theta = thetas
self.num_shared_params = len(self._sp_theta)
#self.num_params = self.num_shared_params
# Add parameters to the model.
for theta in self._sp_theta:
val = 1.0
if param is not None:
if param.has_key(theta):
val = param[theta]
#setattr(self, theta.name, val)
setattr(self, theta.name, Param(theta.name, val, None))
self.add_parameters(getattr(self, theta.name))
#deal with param
#self._set_params(self._get_params())
# Differentiate with respect to parameters.
self._sp_dk_dtheta = [sp.diff(k,theta).simplify() for theta in self._sp_theta]
if self.output_dim > 1:
self._sp_dk_dtheta_i = [sp.diff(k,theta).simplify() for theta in self._sp_theta_i]
# differentiate with respect to input variables.
self._sp_dk_dx = [sp.diff(k,xi).simplify() for xi in self._sp_x]
# psi_stats aren't yet implemented.
if False:
self.compute_psi_stats()
self._code = {}
# generate the code for the covariance functions
self._gen_code()
if weave_available:
if False:
extra_compile_args = ['-ftree-vectorize', '-mssse3', '-ftree-vectorizer-verbose=5']
else:
extra_compile_args = []
self.weave_kwargs = {
'support_code': None, #self._function_code,
'include_dirs':[user_code_storage, os.path.join(current_dir,'parts/')],
'headers':['"sympy_helpers.h"', '"'+self.name+'.h"'],
'sources':[os.path.join(current_dir,"parts/sympy_helpers.cpp"), os.path.join(user_code_storage, self.name+'.cpp')],
'extra_compile_args':extra_compile_args,
'extra_link_args':['-lgomp'],
'verbose':True}
self.parameters_changed() # initializes caches
def __add__(self,other):
return spkern(self._sp_k+other._sp_k)
def _gen_code(self):
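# Uses sympy's codegen to turn the symbolic covariance and its derivatives into C
# (or plain Python when weave is unavailable), writes the generated sources to
# user_code_storage, and assembles the looping code strings stored in self._code,
# keyed by 'K', 'Kdiag', 'dK_d<param>', 'dK_dX' and so on.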
argument_sequence = self._sp_x+self._sp_z+self._sp_theta
code_list = [('k',self._sp_k)]
# gradients with respect to covariance input
code_list += [('dk_d%s'%x.name,dx) for x,dx in zip(self._sp_x,self._sp_dk_dx)]
# gradient with respect to parameters
code_list += [('dk_d%s'%theta.name,dtheta) for theta,dtheta in zip(self._sp_theta,self._sp_dk_dtheta)]
# gradient with respect to multiple output parameters
if self.output_dim > 1:
argument_sequence += self._sp_theta_i + self._sp_theta_j
code_list += [('dk_d%s'%theta.name,dtheta) for theta,dtheta in zip(self._sp_theta_i,self._sp_dk_dtheta_i)]
# generate c functions from sympy objects
if weave_available:
code_type = "C"
else:
code_type = "PYTHON"
# Need to add the sympy_helpers header in here.
(foo_c,self._function_code), (foo_h,self._function_header) = \
codegen(code_list,
code_type,
self.name,
argument_sequence=argument_sequence)
# Use weave to compute the underlying functions.
if weave_available:
# put the header file where we can find it
f = file(os.path.join(user_code_storage, self.name + '.h'),'w')
f.write(self._function_header)
f.close()
if weave_available:
# Substitute any known derivatives which sympy doesn't compute
self._function_code = re.sub('DiracDelta\(.+?,.+?\)','0.0',self._function_code)
# put the cpp file in user code storage (defaults to temp file location)
f = file(os.path.join(user_code_storage, self.name + '.cpp'),'w')
else:
# put the python file in user code storage
f = file(os.path.join(user_code_storage, self.name + '.py'),'w')
f.write(self._function_code)
f.close()
if weave_available:
# arg_list will store the arguments required for the C code.
input_arg_list = (["X2(i, %s)"%x.name[2:] for x in self._sp_x]
+ ["Z2(j, %s)"%z.name[2:] for z in self._sp_z])
# for multiple outputs reverse argument list is also required
if self.output_dim>1:
reverse_input_arg_list = list(input_arg_list)
reverse_input_arg_list.reverse()
# This gives the parameters for the arg list.
param_arg_list = [shared_params.name for shared_params in self._sp_theta]
arg_list = input_arg_list + param_arg_list
precompute_list=[]
if self.output_dim > 1:
reverse_arg_list= reverse_input_arg_list + list(param_arg_list)
# For multiple outputs, also need the split parameters.
split_param_arg_list = ["%s1(%s)"%(theta.name[:-2].upper(),index) for index in ['ii', 'jj'] for theta in self._sp_theta_i]
split_param_reverse_arg_list = ["%s1(%s)"%(theta.name[:-2].upper(),index) for index in ['jj', 'ii'] for theta in self._sp_theta_i]
arg_list += split_param_arg_list
reverse_arg_list += split_param_reverse_arg_list
# Extract the right output indices from the inputs.
c_define_output_indices = [' '*16 + "int %s=(int)%s(%s, %i);"%(index, var, index2, self.input_dim-1) for index, var, index2 in zip(['ii', 'jj'], ['X2', 'Z2'], ['i', 'j'])]
precompute_list += c_define_output_indices
reverse_arg_string = ", ".join(reverse_arg_list)
arg_string = ", ".join(arg_list)
precompute_string = "\n".join(precompute_list)
# Now we use the arguments in code that computes the separate parts.
# Any precomputations will be done here eventually.
self._precompute = \
"""
// Precompute code would go here. It will be called when parameters are updated.
"""
# Here's the code to do the looping for K
self._code['K'] =\
"""
// _K_code
// Code for computing the covariance function.
int i;
int j;
int n = target_array->dimensions[0];
int num_inducing = target_array->dimensions[1];
int input_dim = X_array->dimensions[1];
//#pragma omp parallel for private(j)
for (i=0;i<n;i++){
for (j=0;j<num_inducing;j++){
%s
//target[i*num_inducing+j] =
TARGET2(i, j) += k(%s);
}
}
%s
"""%(precompute_string,arg_string,"/*"+str(self._sp_k)+"*/")
# adding a string representation of the function in the
# comment forces recompile when needed
self._code['K_X'] = self._code['K'].replace('Z2(', 'X2(')
# Code to compute diagonal of covariance.
diag_arg_string = re.sub('Z','X',arg_string)
diag_arg_string = re.sub('int jj','//int jj',diag_arg_string)
diag_arg_string = re.sub('j','i',diag_arg_string)
diag_precompute_string = re.sub('int jj','//int jj',precompute_string)
diag_precompute_string = re.sub('Z','X',diag_precompute_string)
diag_precompute_string = re.sub('j','i',diag_precompute_string)
# Code to do the looping for Kdiag
self._code['Kdiag'] =\
"""
// _code['Kdiag']
// Code for computing diagonal of covariance function.
int i;
int n = target_array->dimensions[0];
int input_dim = X_array->dimensions[1];
//#pragma omp parallel for
for (i=0;i<n;i++){
%s
//target[i] =
TARGET1(i)=k(%s);
}
%s
"""%(diag_precompute_string,diag_arg_string,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
# Code to compute gradients
if self.output_dim>1:
for i, theta in enumerate(self._sp_theta_i):
grad_func_list = [' '*26 + 'TARGET1(ii) += PARTIAL2(i, j)*dk_d%s(%s);'%(theta.name, arg_string)]
grad_func_list += [' '*26 + 'TARGET1(jj) += PARTIAL2(i, j)*dk_d%s(%s);'%(theta.name, reverse_arg_string)]
grad_func_list = c_define_output_indices+grad_func_list
grad_func_string = '\n'.join(grad_func_list)
self._code['dK_d' + theta.name] =\
"""
int i;
int j;
int n = partial_array->dimensions[0];
int num_inducing = partial_array->dimensions[1];
int input_dim = X_array->dimensions[1];
//#pragma omp parallel for private(j)
for (i=0;i<n;i++){
for (j=0;j<num_inducing;j++){
%s
}
}
%s
"""%(grad_func_string,"/*"+str(self._sp_k)+"*/") # adding a string representation forces recompile when needed
self._code['dK_d' +theta.name + '_X'] = self._code['dK_d' + theta.name].replace('Z2(', 'X2(')
# Code to compute gradients for Kdiag TODO: needs clean up
diag_grad_func_string = re.sub('Z','X',grad_func_string,count=0)
diag_grad_func_string = re.sub('int jj','//int jj',diag_grad_func_string)
diag_grad_func_string = re.sub('j','i',diag_grad_func_string)
diag_grad_func_string = re.sub('PARTIAL2\(i, i\)','PARTIAL(i)',diag_grad_func_string)
self._code['dKdiag_d' + theta.name] =\
"""
// _dKdiag_dtheta_code
// Code for computing gradient of diagonal with respect to parameters.
int i;
int n = partial_array->dimensions[0];
int input_dim = X_array->dimensions[1];
for (i=0;i<n;i++){
%s
}
%s
"""%(diag_grad_func_string,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
for i, theta in enumerate(self._sp_theta):
grad_func_list = [' '*26 + 'TARGET1(%i) += PARTIAL2(i, j)*dk_d%s(%s);'%(i,theta.name,arg_string)]
grad_func_string = '\n'.join(grad_func_list)
self._code['dK_d' + theta.name] =\
"""
// _dK_dtheta_code
// Code for computing gradient of covariance with respect to parameters.
int i;
int j;
int n = partial_array->dimensions[0];
int num_inducing = partial_array->dimensions[1];
int input_dim = X_array->dimensions[1];
//#pragma omp parallel for private(j)
for (i=0;i<n;i++){
for (j=0;j<num_inducing;j++){
%s
}
}
%s
"""%(grad_func_string,"/*"+str(self._sp_k)+"*/") # adding a string representation forces recompile when needed
self._code['dK_d' + theta.name +'_X'] = self._code['dK_d' + theta.name].replace('Z2(', 'X2(')
# Code to compute gradients for Kdiag TODO: needs clean up
diag_grad_func_string = re.sub('Z','X',grad_func_string,count=0)
diag_grad_func_string = re.sub('int jj','//int jj',diag_grad_func_string)
diag_grad_func_string = re.sub('j','i',diag_grad_func_string)
diag_grad_func_string = re.sub('PARTIAL2\(i, i\)','PARTIAL(i)',diag_grad_func_string)
self._code['dKdiag_d' + theta.name] =\
"""
// _dKdiag_dtheta_code
// Code for computing gradient of diagonal with respect to parameters.
int i;
int n = partial_array->dimensions[0];
int input_dim = X_array->dimensions[1];
for (i=0;i<n;i++){
%s
}
%s
"""%(diag_grad_func_string,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
# Code for gradients wrt X, TODO: may need to deal with special case where one input is actually an output.
gradX_func_list = []
if self.output_dim>1:
gradX_func_list += c_define_output_indices
gradX_func_list += ["TARGET2(i, %i) += partial[i*num_inducing+j]*dk_dx_%i(%s);"%(q,q,arg_string) for q in range(self._real_input_dim)]
gradX_func_string = "\n".join(gradX_func_list)
self._code['dK_dX'] = \
"""
// _dK_dX_code
// Code for computing gradient of covariance with respect to inputs.
int i;
int j;
int n = partial_array->dimensions[0];
int num_inducing = partial_array->dimensions[1];
int input_dim = X_array->dimensions[1];
//#pragma omp parallel for private(j)
for (i=0;i<n; i++){
for (j=0; j<num_inducing; j++){
%s
}
}
%s
"""%(gradX_func_string,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
self._code['dK_dX_X'] = self._code['dK_dX'].replace('Z2(', 'X2(')
diag_gradX_func_string = re.sub('Z','X',gradX_func_string,count=0)
diag_gradX_func_string = re.sub('int jj','//int jj',diag_gradX_func_string)
diag_gradX_func_string = re.sub('j','i',diag_gradX_func_string)
diag_gradX_func_string = re.sub('PARTIAL2\(i\, i\)','2*PARTIAL(i)',diag_gradX_func_string)
# Code for gradients of Kdiag wrt X
self._code['dKdiag_dX'] = \
"""
// _dKdiag_dX_code
// Code for computing gradient of diagonal with respect to inputs.
int n = partial_array->dimensions[0];
int input_dim = X_array->dimensions[1];
for (int i=0;i<n; i++){
%s
}
%s
"""%(diag_gradX_func_string,"/*"+str(self._sp_k)+"*/") #adding a
# string representation forces recompile when needed Get rid
# of Zs in argument for diagonal. TODO: Why wasn't
# diag_func_string called here? Need to check that.
#TODO: insert multiple functions here via string manipulation
#TODO: similar functions for psi_stats
#TODO: similar functions when cython available.
#TODO: similar functions when only python available.
def _get_arg_names(self, target=None, Z=None, partial=None):
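# Builds the argument-name list handed to weave.inline: always 'X', then the optional
# 'target' buffer, the shared parameter names, the optional 'Z' and 'partial' arrays,
# and for multi-output kernels the per-output parameter names plus 'output_dim'.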
arg_names = ['X']
if target is not None:
arg_names += ['target']
for shared_params in self._sp_theta:
arg_names += [shared_params.name]
if Z is not None:
arg_names += ['Z']
if partial is not None:
arg_names += ['partial']
if self.output_dim>1:
arg_names += self._split_theta_names
arg_names += ['output_dim']
return arg_names
def _generate_inline(self, code, X, target=None, Z=None, partial=None):
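# Copies the current parameter values into local variables (weave.inline looks its
# arguments up by name in the calling frame) before running the generated code.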
output_dim = self.output_dim
# Need to extract parameters to local variables first
for shared_params in self._sp_theta:
locals()[shared_params.name] = getattr(self, shared_params.name)
for split_params in self._split_theta_names:
locals()[split_params] = np.asarray(getattr(self, split_params))
arg_names = self._get_arg_names(target, Z, partial)
if weave_available:
return weave.inline(code=code, arg_names=arg_names,**self.weave_kwargs)
else:
raise RuntimeError('Weave not available and other variants of sympy covariance not yet implemented')
def K(self,X,Z,target):
if Z is None:
self._generate_inline(self._code['K_X'], X, target)
else:
self._generate_inline(self._code['K'], X, target, Z)
def Kdiag(self,X,target):
self._generate_inline(self._code['Kdiag'], X, target)
def _param_grad_helper(self,partial,X,Z,target):
if Z is None:
self._generate_inline(self._code['dK_dtheta_X'], X, target, Z, partial)
else:
self._generate_inline(self._code['dK_dtheta'], X, target, Z, partial)
def dKdiag_dtheta(self,partial,X,target):
self._generate_inline(self._code['dKdiag_dtheta'], X, target, Z=None, partial=partial)
def gradients_X(self,partial,X,Z,target):
if Z is None:
self._generate_inline(self._code['dK_dX_X'], X, target, Z, partial)
else:
self._generate_inline(self._code['dK_dX'], X, target, Z, partial)
def dKdiag_dX(self,partial,X,target):
self._generate_inline(self._code['dKdiag_dX'], X, target, Z=None, partial=partial)
def compute_psi_stats(self):
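# Symbolically integrates the covariance against Gaussian densities over the inputs to
# obtain the psi statistics; currently disabled in __init__ (see the `if False:` guard).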
#define some normal distributions
mus = [sp.var('mu_%i'%i,real=True) for i in range(self.input_dim)]
Ss = [sp.var('S_%i'%i,positive=True) for i in range(self.input_dim)]
normals = [(2*sp.pi*Si)**(-0.5)*sp.exp(-0.5*(xi-mui)**2/Si) for xi, mui, Si in zip(self._sp_x, mus, Ss)]
#do some integration!
#self._sp_psi0 = ??
self._sp_psi1 = self._sp_k
for i in range(self.input_dim):
print 'performing integrals %i of %i'%(i+1,2*self.input_dim)
sys.stdout.flush()
self._sp_psi1 *= normals[i]
self._sp_psi1 = sp.integrate(self._sp_psi1,(self._sp_x[i],-sp.oo,sp.oo))
clear_cache()
self._sp_psi1 = self._sp_psi1.simplify()
#and here's psi2 (eek!)
zprime = [sp.Symbol('zp%i'%i) for i in range(self.input_dim)]
self._sp_psi2 = self._sp_k.copy()*self._sp_k.copy().subs(zip(self._sp_z,zprime))
for i in range(self.input_dim):
print 'performing integrals %i of %i'%(self.input_dim+i+1,2*self.input_dim)
sys.stdout.flush()
self._sp_psi2 *= normals[i]
self._sp_psi2 = sp.integrate(self._sp_psi2,(self._sp_x[i],-sp.oo,sp.oo))
clear_cache()
self._sp_psi2 = self._sp_psi2.simplify()
def parameters_changed(self):
# Reset the caches
self._cache, self._cache2 = np.empty(shape=(2, 1))
self._cache3, self._cache4, self._cache5 = np.empty(shape=(3, 1))
def update_gradients_full(self, dL_dK, X):
# Need to extract parameters to local variables first
self._K_computations(X, None)
for shared_params in self._sp_theta:
parameter = getattr(self, shared_params.name)
code = self._code['dK_d' + shared_params.name]
setattr(parameter, 'gradient', self._generate_inline(code, X, target=None, Z=None, partial=dL_dK))
for split_params in self._split_theta_names:
parameter = getattr(self, split_params)
code = self._code['dK_d' + split_params + '_i']
setattr(parameter, 'gradient', self._generate_inline(code, X, target=None, Z=None, partial=dL_dK))
# def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
# #contributions from Kdiag
# self.variance.gradient = np.sum(dL_dKdiag)
# #from Knm
# self._K_computations(X, Z)
# self.variance.gradient += np.sum(dL_dKnm * self._K_dvar)
# if self.ARD:
# self.lengthscale.gradient = self._dL_dlengthscales_via_K(dL_dKnm, X, Z)
# else:
# self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKnm)
# #from Kmm
# self._K_computations(Z, None)
# self.variance.gradient += np.sum(dL_dKmm * self._K_dvar)
# if self.ARD:
# self.lengthscale.gradient += self._dL_dlengthscales_via_K(dL_dKmm, Z, None)
# else:
# self.lengthscale.gradient += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm)
#---------------------------------------#
# Precomputations #
#---------------------------------------#
def _K_computations(self, X, Z):
if Z is None:
self._generate_inline(self._precompute, X)
else:
self._generate_inline(self._precompute, X, Z=Z)

View file

@ -1,12 +1,12 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart from kern import Kern
import numpy as np import numpy as np
from ...core.parameterization import Param from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp from ...core.parameterization.transformations import Logexp
class White(Kernpart): class White(Kern):
""" """
White noise kernel. White noise kernel.
@ -20,14 +20,17 @@ class White(Kernpart):
self.input_dim = input_dim self.input_dim = input_dim
self.variance = Param('variance', variance, Logexp()) self.variance = Param('variance', variance, Logexp())
self.add_parameters(self.variance) self.add_parameters(self.variance)
self._psi1 = 0 # TODO: more elegance here
def K(self,X,X2,target): def K(self, X, X2=None):
if X2 is None: if X2 is None:
target += np.eye(X.shape[0])*self.variance return np.eye(X.shape[0])*self.variance
else:
return np.zeros((X.shape[0], X2.shape[0]))
def Kdiag(self,X,target): def Kdiag(self,X):
target += self.variance ret = np.ones(X.shape[0])
ret[:] = self.variance
return ret
def update_gradients_full(self, dL_dK, X): def update_gradients_full(self, dL_dK, X):
self.variance.gradient = np.trace(dL_dK) self.variance.gradient = np.trace(dL_dK)
@ -38,14 +41,8 @@ class White(Kernpart):
def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
raise NotImplementedError raise NotImplementedError
def dKdiag_dtheta(self,dL_dKdiag,X,target): def gradients_X(self,dL_dK,X,X2):
target += np.sum(dL_dKdiag) return np.zeros_like(X)
def gradients_X(self,dL_dK,X,X2,target):
pass
def dKdiag_dX(self,dL_dKdiag,X,target):
pass
def psi0(self,Z,mu,S,target): def psi0(self,Z,mu,S,target):
pass # target += self.variance pass # target += self.variance

View file

@ -1,680 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import sys
import numpy as np
import itertools
from parts.prod import Prod as prod
from parts.linear import Linear
from parts.kernpart import Kernpart
from ..core.parameterization import Parameterized
from GPy.core.parameterization.param import Param
class kern(Parameterized):
def __init__(self, input_dim, parts=[], input_slices=None):
"""
This is the main kernel class for GPy. It handles multiple
(additive) kernel functions, and keeps track of various things
like which parameters live where.
The technical code for kernels is divided into _parts_ (see
e.g. rbf.py). This object contains a list of parts, which are
computed additively. For multiplication, special _prod_ parts
are used.
:param input_dim: The dimensionality of the kernel's input space
:type input_dim: int
:param parts: the 'parts' (PD functions) of the kernel
:type parts: list of Kernpart objects
:param input_slices: the slices on the inputs which apply to each kernel
:type input_slices: list of slice objects, or list of bools
"""
super(kern, self).__init__('kern')
self.add_parameters(*parts)
self.input_dim = input_dim
if input_slices is None:
self.input_slices = [slice(None) for p in self._parameters_]
else:
assert len(input_slices) == len(self._parameters_)
self.input_slices = [sl if type(sl) is slice else slice(None) for sl in input_slices]
for p in self._parameters_:
assert isinstance(p, Kernpart), "bad kernel part"
def parameters_changed(self):
[p.parameters_changed() for p in self._parameters_]
def connect_input(self, Xparam):
[p.connect_input(Xparam) for p in self._parameters_]
def _getstate(self):
"""
Get the current state of the class,
here just all the indices, rest can get recomputed
"""
return Parameterized._getstate(self) + [#self._parameters_,
#self.num_params,
self.input_dim,
self.input_slices,
self._param_slices_
]
def _setstate(self, state):
self._param_slices_ = state.pop()
self.input_slices = state.pop()
self.input_dim = state.pop()
#self.num_params = state.pop()
#self._parameters_ = state.pop()
Parameterized._setstate(self, state)
def plot_ARD(self, *args):
"""If an ARD kernel is present, plot a bar representation using matplotlib
See GPy.plotting.matplot_dep.plot_ARD
"""
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ..plotting.matplot_dep import kernel_plots
return kernel_plots.plot_ARD(self,*args)
# def _transform_gradients(self, g):
# """
# Apply the transformations of the kernel so that the returned vector
# represents the gradient in the transformed space (i.e. that given by
# get_params_transformed())
#
# :param g: the gradient vector for the current model, usually created by _param_grad_helper
# """
# x = self._get_params()
# [np.place(g, index, g[index] * constraint.gradfactor(x[index]))
# for constraint, index in self.constraints.iteritems() if constraint is not __fixed__]
# # for constraint, index in self.constraints.iteritems():
# # if constraint != __fixed__:
# # g[index] = g[index] * constraint.gradfactor(x[index])
# #[np.put(g, i, v) for i, v in [(t[0], np.sum(g[t])) for t in self.tied_indices]]
# [np.put(g, i, v) for i, v in [[i, t.sum()] for p in self._parameters_ for t,i in p._tied_to_me_.iteritems()]]
# # if len(self.tied_indices) or len(self.fixed_indices):
# # to_remove = np.hstack((self.fixed_indices + [t[1:] for t in self.tied_indices]))
# # return np.delete(g, to_remove)
# # else:
# if self._fixes_ is not None: return g[self._fixes_]
# return g
# x = self._get_params()
# [np.put(x, i, x * t.gradfactor(x[i])) for i, t in zip(self.constrained_indices, self.constraints)]
# [np.put(g, i, v) for i, v in [(t[0], np.sum(g[t])) for t in self.tied_indices]]
# if len(self.tied_indices) or len(self.fixed_indices):
# to_remove = np.hstack((self.fixed_indices + [t[1:] for t in self.tied_indices]))
# return np.delete(g, to_remove)
# else:
# return g
def __add__(self, other):
""" Overloading of the '+' operator. for more control, see self.add """
return self.add(other)
def add(self, other, tensor=False):
"""
Add another kernel to this one.
If tensor is False, both kernels are defined on the same _space_; then
the created kernel will have the same number of inputs as self and
other (which must be the same).
If tensor is True, then the dimensions are stacked 'horizontally', so
that the resulting kernel has self.input_dim + other.input_dim
:param other: the other kernel to be added
:type other: GPy.kern
"""
if tensor:
D = self.input_dim + other.input_dim
self_input_slices = [slice(*sl.indices(self.input_dim)) for sl in self.input_slices]
other_input_indices = [sl.indices(other.input_dim) for sl in other.input_slices]
other_input_slices = [slice(i[0] + self.input_dim, i[1] + self.input_dim, i[2]) for i in other_input_indices]
newkern = kern(D, self._parameters_ + other._parameters_, self_input_slices + other_input_slices)
# transfer constraints:
# newkern.constrained_indices = self.constrained_indices + [x + self.num_params for x in other.constrained_indices]
# newkern.constraints = self.constraints + other.constraints
# newkern.fixed_indices = self.fixed_indices + [self.num_params + x for x in other.fixed_indices]
# newkern.fixed_values = self.fixed_values + other.fixed_values
# newkern.constraints = self.constraints + other.constraints
# newkern.tied_indices = self.tied_indices + [self.num_params + x for x in other.tied_indices]
else:
assert self.input_dim == other.input_dim
newkern = kern(self.input_dim, self._parameters_ + other._parameters_, self.input_slices + other.input_slices)
# transfer constraints:
# newkern.constrained_indices = self.constrained_indices + [i + self.num_params for i in other.constrained_indices]
# newkern.constraints = self.constraints + other.constraints
# newkern.fixed_indices = self.fixed_indices + [self.num_params + x for x in other.fixed_indices]
# newkern.fixed_values = self.fixed_values + other.fixed_values
# newkern.tied_indices = self.tied_indices + [self.num_params + x for x in other.tied_indices]
[newkern.constraints.add(transform, ind) for transform, ind in self.constraints.iteritems()]
[newkern.constraints.add(transform, ind+self.size) for transform, ind in other.constraints.iteritems()]
newkern._fixes_ = ((self._fixes_ or 0) + (other._fixes_ or 0)) or None
return newkern
def __call__(self, X, X2=None):
return self.K(X, X2)
def __mul__(self, other):
""" Here we overload the '*' operator. See self.prod for more information"""
return self.prod(other)
def __pow__(self, other, tensor=False):
"""
Shortcut for tensor `prod`.
"""
return self.prod(other, tensor=True)
def prod(self, other, tensor=False):
"""
Multiply two kernels (either on the same space, or on the tensor product of the input space).
:param other: the other kernel to be added
:type other: GPy.kern
:param tensor: whether or not to use the tensor space (default is false).
:type tensor: bool
"""
K1 = self
K2 = other
#K1 = self.copy()
#K2 = other.copy()
slices = []
for sl1, sl2 in itertools.product(K1.input_slices, K2.input_slices):
s1, s2 = [False] * K1.input_dim, [False] * K2.input_dim
s1[sl1], s2[sl2] = [True], [True]
slices += [s1 + s2]
newkernparts = [prod(k1, k2, tensor) for k1, k2 in itertools.product(K1._parameters_, K2._parameters_)]
if tensor:
newkern = kern(K1.input_dim + K2.input_dim, newkernparts, slices)
else:
newkern = kern(K1.input_dim, newkernparts, slices)
#newkern._follow_constrains(K1, K2)
return newkern
# def _follow_constrains(self, K1, K2):
#
# # Build the array that allows to go from the initial indices of the param to the new ones
# K1_param = []
# n = 0
# for k1 in K1.parts:
# K1_param += [range(n, n + k1.num_params)]
# n += k1.num_params
# n = 0
# K2_param = []
# for k2 in K2.parts:
# K2_param += [range(K1.num_params + n, K1.num_params + n + k2.num_params)]
# n += k2.num_params
# index_param = []
# for p1 in K1_param:
# for p2 in K2_param:
# index_param += p1 + p2
# index_param = np.array(index_param)
#
# # Get the ties and constrains of the kernels before the multiplication
# prev_ties = K1.tied_indices + [arr + K1.num_params for arr in K2.tied_indices]
#
# prev_constr_ind = [K1.constrained_indices] + [K1.num_params + i for i in K2.constrained_indices]
# prev_constr = K1.constraints + K2.constraints
#
# # prev_constr_fix = K1.fixed_indices + [arr + K1.num_params for arr in K2.fixed_indices]
# # prev_constr_fix_values = K1.fixed_values + K2.fixed_values
#
# # follow the previous ties
# for arr in prev_ties:
# for j in arr:
# index_param[np.where(index_param == j)[0]] = arr[0]
#
# # ties and constrains
# for i in range(K1.num_params + K2.num_params):
# index = np.where(index_param == i)[0]
# if index.size > 1:
# self.tie_params(index)
# for i, t in zip(prev_constr_ind, prev_constr):
# self.constrain(np.where(index_param == i)[0], t)
#
# def _get_params(self):
# return np.hstack(self._parameters_)
# return np.hstack([p._get_params() for p in self._parameters_])
# def _set_params(self, x):
# import ipdb;ipdb.set_trace()
# [p._set_params(x[s]) for p, s in zip(self._parameters_, self._param_slices_)]
# def _get_param_names(self):
# # this is a bit nasty: we want to distinguish between parts with the same name by appending a count
# part_names = np.array([k.name for k in self._parameters_], dtype=np.str)
# counts = [np.sum(part_names == ni) for i, ni in enumerate(part_names)]
# cum_counts = [np.sum(part_names[i:] == ni) for i, ni in enumerate(part_names)]
# names = [name + '_' + str(cum_count) if count > 1 else name for name, count, cum_count in zip(part_names, counts, cum_counts)]
#
# return sum([[name + '_' + n for n in k._get_param_names()] for name, k in zip(names, self._parameters_)], [])
def K(self, X, X2=None, which_parts='all'):
"""
Compute the kernel function.
:param X: the first set of inputs to the kernel
:param X2: (optional) the second set of arguments to the kernel. If X2
is None, this is passed through to the 'part' object, which
handles this as X2 == X.
:param which_parts: a list of booleans detailing whether to include
each of the part functions. By default, 'all'
indicates all parts
"""
if which_parts == 'all':
which_parts = [True] * self.size
assert X.shape[1] == self.input_dim
if X2 is None:
target = np.zeros((X.shape[0], X.shape[0]))
[p.K(X[:, i_s], None, target=target) for p, i_s, part_i_used in zip(self._parameters_, self.input_slices, which_parts) if part_i_used]
else:
target = np.zeros((X.shape[0], X2.shape[0]))
[p.K(X[:, i_s], X2[:, i_s], target=target) for p, i_s, part_i_used in zip(self._parameters_, self.input_slices, which_parts) if part_i_used]
return target
def update_gradients_full(self, dL_dK, X):
[p.update_gradients_full(dL_dK, X) for p in self._parameters_]
def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
[p.update_gradients_sparse(dL_dKmm, dL_dKnm, dL_dKdiag, X, Z) for p in self._parameters_]
def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
[p.update_gradients_variational(dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z) for p in self._parameters_]
def _param_grad_helper(self, dL_dK, X, X2=None):
"""
Compute the gradient of the covariance function with respect to the parameters.
:param dL_dK: An array of gradients of the objective function with respect to the covariance function.
:type dL_dK: Np.ndarray (num_samples x num_inducing)
:param X: Observed data inputs
:type X: np.ndarray (num_samples x input_dim)
:param X2: Observed data inputs (optional, defaults to X)
:type X2: np.ndarray (num_inducing x input_dim)
returns: dL_dtheta
"""
assert X.shape[1] == self.input_dim
target = np.zeros(self.size)
if X2 is None:
[p._param_grad_helper(dL_dK, X[:, i_s], None, target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self._param_slices_)]
else:
[p._param_grad_helper(dL_dK, X[:, i_s], X2[:, i_s], target[ps]) for p, i_s, ps, in zip(self._parameters_, self.input_slices, self._param_slices_)]
return self._transform_gradients(target)
def gradients_X(self, dL_dK, X, X2=None):
"""Compute the gradient of the objective function with respect to X.
:param dL_dK: An array of gradients of the objective function with respect to the covariance function.
:type dL_dK: np.ndarray (num_samples x num_inducing)
:param X: Observed data inputs
:type X: np.ndarray (num_samples x input_dim)
:param X2: Observed data inputs (optional, defaults to X)
:type X2: np.ndarray (num_inducing x input_dim)"""
target = np.zeros_like(X)
if X2 is None:
[p.gradients_X(dL_dK, X[:, i_s], None, target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
else:
[p.gradients_X(dL_dK, X[:, i_s], X2[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
return target
def Kdiag(self, X, which_parts='all'):
"""Compute the diagonal of the covariance function for inputs X."""
if which_parts == 'all':
which_parts = [True] * self.size
assert X.shape[1] == self.input_dim
target = np.zeros(X.shape[0])
[p.Kdiag(X[:, i_s], target=target) for p, i_s, part_on in zip(self._parameters_, self.input_slices, which_parts) if part_on]
return target
def dKdiag_dtheta(self, dL_dKdiag, X):
"""Compute the gradient of the diagonal of the covariance function with respect to the parameters."""
assert X.shape[1] == self.input_dim
assert dL_dKdiag.size == X.shape[0]
target = np.zeros(self.size)
[p.dKdiag_dtheta(dL_dKdiag, X[:, i_s], target[ps]) for p, i_s, ps in zip(self._parameters_, self.input_slices, self._param_slices_)]
return self._transform_gradients(target)
def dKdiag_dX(self, dL_dKdiag, X):
assert X.shape[1] == self.input_dim
target = np.zeros_like(X)
[p.dKdiag_dX(dL_dKdiag, X[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
return target
def psi0(self, Z, mu, S):
target = np.zeros(mu.shape[0])
[p.psi0(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self._parameters_, self.input_slices)]
return target
def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S):
target = np.zeros(self.size)
[p.dpsi0_dtheta(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self._parameters_, self._param_slices_, self.input_slices)]
return self._transform_gradients(target)
def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S):
target_mu, target_S = np.zeros_like(mu), np.zeros_like(S)
[p.dpsi0_dmuS(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
return target_mu, target_S
def psi1(self, Z, mu, S):
target = np.zeros((mu.shape[0], Z.shape[0]))
[p.psi1(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self._parameters_, self.input_slices)]
return target
def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S):
target = np.zeros((self.size))
[p.dpsi1_dtheta(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self._parameters_, self._param_slices_, self.input_slices)]
return self._transform_gradients(target)
def dpsi1_dZ(self, dL_dpsi1, Z, mu, S):
target = np.zeros_like(Z)
[p.dpsi1_dZ(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
return target
def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S):
"""return shapes are num_samples,num_inducing,input_dim"""
target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1]))
[p.dpsi1_dmuS(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
return target_mu, target_S
def psi2(self, Z, mu, S):
"""
Compute the psi2 statistics for the covariance function.
:param Z: np.ndarray of inducing inputs (num_inducing x input_dim)
:param mu, S: np.ndarrays of means and variances (each num_samples x input_dim)
:returns psi2: np.ndarray (num_samples,num_inducing,num_inducing)
"""
target = np.zeros((mu.shape[0], Z.shape[0], Z.shape[0]))
[p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self._parameters_, self.input_slices)]
# compute the "cross" terms
# TODO: input_slices needed
crossterms = 0
for [p1, i_s1], [p2, i_s2] in itertools.combinations(zip(self._parameters_, self.input_slices), 2):
if i_s1 == i_s2:
# TODO psi1 this must be faster/better/precached/more nice
tmp1 = np.zeros((mu.shape[0], Z.shape[0]))
p1.psi1(Z[:, i_s1], mu[:, i_s1], S[:, i_s1], tmp1)
tmp2 = np.zeros((mu.shape[0], Z.shape[0]))
p2.psi1(Z[:, i_s2], mu[:, i_s2], S[:, i_s2], tmp2)
prod = np.multiply(tmp1, tmp2)
crossterms += prod[:, :, None] + prod[:, None, :]
target += crossterms
return target
def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S):
"""Gradient of the psi2 statistics with respect to the parameters."""
target = np.zeros(self.size)
[p.dpsi2_dtheta(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, i_s, ps in zip(self._parameters_, self.input_slices, self._param_slices_)]
# compute the "cross" terms
# TODO: better looping, input_slices
for i1, i2 in itertools.permutations(range(len(self._parameters_)), 2):
p1, p2 = self._parameters_[i1], self._parameters_[i2]
# ipsl1, ipsl2 = self.input_slices[i1], self.input_slices[i2]
ps1, ps2 = self._param_slices_[i1], self._param_slices_[i2]
tmp = np.zeros((mu.shape[0], Z.shape[0]))
p1.psi1(Z, mu, S, tmp)
p2.dpsi1_dtheta((tmp[:, None, :] * dL_dpsi2).sum(1) * 2., Z, mu, S, target[ps2])
return self._transform_gradients(target)
def dpsi2_dZ(self, dL_dpsi2, Z, mu, S):
target = np.zeros_like(Z)
[p.dpsi2_dZ(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
# target *= 2
# compute the "cross" terms
# TODO: we need input_slices here.
for p1, p2 in itertools.permutations(self._parameters_, 2):
# if p1.name == 'linear' and p2.name == 'linear':
# raise NotImplementedError("We don't handle linear/linear cross-terms")
tmp = np.zeros((mu.shape[0], Z.shape[0]))
p1.psi1(Z, mu, S, tmp)
p2.dpsi1_dZ((tmp[:, None, :] * dL_dpsi2).sum(1), Z, mu, S, target)
return target * 2
def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S):
target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1]))
[p.dpsi2_dmuS(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
# compute the "cross" terms
# TODO: we need input_slices here.
for p1, p2 in itertools.permutations(self._parameters_, 2):
# if p1.name == 'linear' and p2.name == 'linear':
# raise NotImplementedError("We don't handle linear/linear cross-terms")
tmp = np.zeros((mu.shape[0], Z.shape[0]))
p1.psi1(Z, mu, S, tmp)
p2.dpsi1_dmuS((tmp[:, None, :] * dL_dpsi2).sum(1) * 2., Z, mu, S, target_mu, target_S)
return target_mu, target_S
def plot(self, *args, **kwargs):
"""
See GPy.plotting.matplot_dep.plot
"""
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ..plotting.matplot_dep import kernel_plots
kernel_plots.plot(self,*args)
from GPy.core.model import Model
class Kern_check_model(Model):
"""This is a dummy model class used as a base class for checking that the gradients of a given kernel are implemented correctly. It enables checkgradient() to be called independently on a kernel."""
def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
Model.__init__(self, 'kernel_test_model')
num_samples = 20
num_samples2 = 10
if kernel==None:
kernel = GPy.kern.rbf(1)
if X==None:
X = np.random.randn(num_samples, kernel.input_dim)
if dL_dK==None:
if X2==None:
dL_dK = np.ones((X.shape[0], X.shape[0]))
else:
dL_dK = np.ones((X.shape[0], X2.shape[0]))
self.kernel=kernel
self.add_parameter(kernel)
self.X = X
self.X2 = X2
self.dL_dK = dL_dK
def is_positive_definite(self):
v = np.linalg.eig(self.kernel.K(self.X))[0]
if any(v<-10*sys.float_info.epsilon):
return False
else:
return True
def log_likelihood(self):
return (self.dL_dK*self.kernel.K(self.X, self.X2)).sum()
def _log_likelihood_gradients(self):
raise NotImplementedError, "This needs to be implemented to use the kern_check_model class."
class Kern_check_dK_dtheta(Kern_check_model):
"""This class allows gradient checks for the gradient of a kernel with respect to parameters. """
def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2)
def _log_likelihood_gradients(self):
return self.kernel._param_grad_helper(self.dL_dK, self.X, self.X2)
class Kern_check_dKdiag_dtheta(Kern_check_model):
"""This class allows gradient checks of the gradient of the diagonal of a kernel with respect to the parameters."""
def __init__(self, kernel=None, dL_dK=None, X=None):
Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None)
if dL_dK==None:
self.dL_dK = np.ones((self.X.shape[0]))
def parameters_changed(self):
self.kernel.update_gradients_full(self.dL_dK, self.X)
def log_likelihood(self):
return (self.dL_dK*self.kernel.Kdiag(self.X)).sum()
def _log_likelihood_gradients(self):
return self.kernel.dKdiag_dtheta(self.dL_dK, self.X)
class Kern_check_dK_dX(Kern_check_model):
"""This class allows gradient checks for the gradient of a kernel with respect to X. """
def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2)
self.remove_parameter(kernel)
self.X = Param('X', self.X)
self.add_parameter(self.X)
def _log_likelihood_gradients(self):
return self.kernel.gradients_X(self.dL_dK, self.X, self.X2).flatten()
class Kern_check_dKdiag_dX(Kern_check_dK_dX):
"""This class allows gradient checks for the gradient of a kernel diagonal with respect to X. """
def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
Kern_check_dK_dX.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None)
if dL_dK==None:
self.dL_dK = np.ones((self.X.shape[0]))
def log_likelihood(self):
return (self.dL_dK*self.kernel.Kdiag(self.X)).sum()
def _log_likelihood_gradients(self):
return self.kernel.dKdiag_dX(self.dL_dK, self.X).flatten()
def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False):
"""
This function runs on kernels to check the correctness of their
implementation. It checks that the covariance function is positive definite
for a randomly generated data set.
:param kern: the kernel to be tested.
:type kern: GPy.kern.Kernpart
:param X: X input values to test the covariance function.
:type X: ndarray
:param X2: X2 input values to test the covariance function.
:type X2: ndarray
"""
pass_checks = True
if X==None:
X = np.random.randn(10, kern.input_dim)
if output_ind is not None:
X[:, output_ind] = np.random.randint(kern.output_dim, size=X.shape[0])
if X2==None:
X2 = np.random.randn(20, kern.input_dim)
if output_ind is not None:
X2[:, output_ind] = np.random.randint(kern.output_dim, size=X2.shape[0])
if verbose:
print("Checking covariance function is positive definite.")
result = Kern_check_model(kern, X=X).is_positive_definite()
if result and verbose:
print("Check passed.")
if not result:
print("Positive definite check failed for " + kern.name + " covariance function.")
pass_checks = False
return False
if verbose:
print("Checking gradients of K(X, X) wrt theta.")
result = Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=verbose)
if result and verbose:
print("Check passed.")
if not result:
print("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:")
Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=True)
pass_checks = False
return False
if verbose:
print("Checking gradients of K(X, X2) wrt theta.")
result = Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=verbose)
if result and verbose:
print("Check passed.")
if not result:
print("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:")
Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=True)
pass_checks = False
return False
if verbose:
print("Checking gradients of Kdiag(X) wrt theta.")
result = Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=verbose)
if result and verbose:
print("Check passed.")
if not result:
print("Gradient of Kdiag(X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:")
Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=True)
pass_checks = False
return False
if verbose:
print("Checking gradients of K(X, X) wrt X.")
try:
result = Kern_check_dK_dX(kern, X=X, X2=None).checkgrad(verbose=verbose)
except NotImplementedError:
result=True
if verbose:
print("gradients_X not implemented for " + kern.name)
if result and verbose:
print("Check passed.")
if not result:
print("Gradient of K(X, X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")
Kern_check_dK_dX(kern, X=X, X2=None).checkgrad(verbose=True)
pass_checks = False
return False
if verbose:
print("Checking gradients of K(X, X2) wrt X.")
try:
result = Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=verbose)
except NotImplementedError:
result=True
if verbose:
print("gradients_X not implemented for " + kern.name)
if result and verbose:
print("Check passed.")
if not result:
print("Gradient of K(X, X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")
Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=True)
pass_checks = False
return False
if verbose:
print("Checking gradients of Kdiag(X) wrt X.")
try:
result = Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=verbose)
except NotImplementedError:
result=True
if verbose:
print("gradients_X not implemented for " + kern.name)
if result and verbose:
print("Check passed.")
if not result:
print("Gradient of Kdiag(X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")
Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=True)
pass_checks = False
return False
return pass_checks

View file

@ -1,65 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
def theta(x):
"""Heavisdie step function"""
return np.where(x>=0.,1.,0.)
class Brownian(Kernpart):
"""
Brownian Motion kernel.
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance:
:type variance: float
"""
def __init__(self,input_dim,variance=1.):
self.input_dim = input_dim
assert self.input_dim==1, "Brownian motion in 1D only"
self.num_params = 1
self.name = 'Brownian'
self._set_params(np.array([variance]).flatten())
def _get_params(self):
return self.variance
def _set_params(self,x):
assert x.shape==(1,)
self.variance = x
def _get_param_names(self):
return ['variance']
def K(self,X,X2,target):
if X2 is None:
X2 = X
target += self.variance*np.fmin(X,X2.T)
def Kdiag(self,X,target):
target += self.variance*X.flatten()
def _param_grad_helper(self,dL_dK,X,X2,target):
if X2 is None:
X2 = X
target += np.sum(np.fmin(X,X2.T)*dL_dK)
def dKdiag_dtheta(self,dL_dKdiag,X,target):
target += np.dot(X.flatten(), dL_dKdiag)
def gradients_X(self,dL_dK,X,X2,target):
raise NotImplementedError, "TODO"
#target += self.variance
#target -= self.variance*theta(X-X2.T)
#if X.shape==X2.shape:
#if np.all(X==X2):
#np.add(target[:,:,0],self.variance*np.diag(X2.flatten()-X.flatten()),target[:,:,0])
def dKdiag_dX(self,dL_dKdiag,X,target):
target += self.variance*dL_dKdiag[:,None]

View file

@ -1,139 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
from scipy import integrate
class Matern32(Kernpart):
"""
Matern 3/2 kernel:
.. math::
k(r) = \\sigma^2 (1 + \\sqrt{3} r) \\exp(-\\sqrt{3} r) \\qquad \\text{where } r = \\sqrt{\\sum_{i=1}^{\\text{input\\_dim}} \\frac{(x_i - y_i)^2}{\\ell_i^2}}
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance :math:`\sigma^2`
:type variance: float
:param lengthscale: the vector of lengthscale :math:`\ell_i`
:type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
:type ARD: Boolean
:rtype: kernel object
"""
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False):
self.input_dim = input_dim
self.ARD = ARD
if ARD == False:
self.num_params = 2
self.name = 'Mat32'
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
else:
lengthscale = np.ones(1)
else:
self.num_params = self.input_dim + 1
self.name = 'Mat32'
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == self.input_dim, "bad number of lengthscales"
else:
lengthscale = np.ones(self.input_dim)
self._set_params(np.hstack((variance, lengthscale.flatten())))
def _get_params(self):
"""return the value of the parameters."""
return np.hstack((self.variance, self.lengthscale))
def _set_params(self, x):
"""set the value of the parameters."""
assert x.size == self.num_params
self.variance = x[0]
self.lengthscale = x[1:]
def _get_param_names(self):
"""return parameter names."""
if self.num_params == 2:
return ['variance', 'lengthscale']
else:
return ['variance'] + ['lengthscale_%i' % i for i in range(self.lengthscale.size)]
def K(self, X, X2, target):
"""Compute the covariance matrix between X and X2."""
if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))
np.add(self.variance * (1 + np.sqrt(3.) * dist) * np.exp(-np.sqrt(3.) * dist), target, target)
def Kdiag(self, X, target):
"""Compute the diagonal of the covariance matrix associated to X."""
np.add(target, self.variance, target)
def _param_grad_helper(self, dL_dK, X, X2, target):
"""derivative of the covariance matrix with respect to the parameters."""
if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))
dvar = (1 + np.sqrt(3.) * dist) * np.exp(-np.sqrt(3.) * dist)
invdist = 1. / np.where(dist != 0., dist, np.inf)
dist2M = np.square(X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 3
# dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
target[0] += np.sum(dvar * dL_dK)
if self.ARD == True:
dl = (self.variance * 3 * dist * np.exp(-np.sqrt(3.) * dist))[:, :, np.newaxis] * dist2M * invdist[:, :, np.newaxis]
# dl = self.variance*dvar[:,:,None]*dist2M*invdist[:,:,None]
target[1:] += (dl * dL_dK[:, :, None]).sum(0).sum(0)
else:
dl = (self.variance * 3 * dist * np.exp(-np.sqrt(3.) * dist)) * dist2M.sum(-1) * invdist
# dl = self.variance*dvar*dist2M.sum(-1)*invdist
target[1] += np.sum(dl * dL_dK)
def dKdiag_dtheta(self, dL_dKdiag, X, target):
"""derivative of the diagonal of the covariance matrix with respect to the parameters."""
target[0] += np.sum(dL_dKdiag)
def gradients_X(self, dL_dK, X, X2, target):
"""derivative of the covariance matrix with respect to X."""
if X2 is None:
dist = np.sqrt(np.sum(np.square((X[:, None, :] - X[None, :, :]) / self.lengthscale), -1))[:, :, None]
ddist_dX = 2*(X[:, None, :] - X[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
else:
dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None]
ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
gradients_X = -np.transpose(3 * self.variance * dist * np.exp(-np.sqrt(3) * dist) * ddist_dX, (1, 0, 2))
target += np.sum(gradients_X * dL_dK.T[:, :, None], 0)
def dKdiag_dX(self, dL_dKdiag, X, target):
pass
def Gram_matrix(self, F, F1, F2, lower, upper):
"""
Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to input_dim=1.
:param F: vector of functions
:type F: np.array
:param F1: vector of derivatives of F
:type F1: np.array
:param F2: vector of second derivatives of F
:type F2: np.array
:param lower,upper: boundaries of the input domain
:type lower,upper: floats
"""
assert self.input_dim == 1
def L(x, i):
return(3. / self.lengthscale ** 2 * F[i](x) + 2 * np.sqrt(3) / self.lengthscale * F1[i](x) + F2[i](x))
n = F.shape[0]
G = np.zeros((n, n))
for i in range(n):
for j in range(i, n):
G[i, j] = G[j, i] = integrate.quad(lambda x : L(x, i) * L(x, j), lower, upper)[0]
Flower = np.array([f(lower) for f in F])[:, None]
F1lower = np.array([f(lower) for f in F1])[:, None]
# print "OLD \n", np.dot(F1lower,F1lower.T), "\n \n"
# return(G)
return(self.lengthscale ** 3 / (12.*np.sqrt(3) * self.variance) * G + 1. / self.variance * np.dot(Flower, Flower.T) + self.lengthscale ** 2 / (3.*self.variance) * np.dot(F1lower, F1lower.T))
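# Illustrative sketch only (not part of the original file): the Matern 3/2 covariance
# written directly in numpy, following the same "accumulate into a preallocated target"
# convention as K above. The helper name matern32_cov is hypothetical.
import numpy as np

def matern32_cov(X, X2, variance, lengthscale, target):
    """Add variance * (1 + sqrt(3) r) * exp(-sqrt(3) r) into target, in place."""
    if X2 is None:
        X2 = X
    r = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / lengthscale), -1))
    target += variance * (1. + np.sqrt(3.) * r) * np.exp(-np.sqrt(3.) * r)

X_demo = np.random.randn(5, 2)
K_demo = np.zeros((5, 5))
matern32_cov(X_demo, None, 1., np.ones(2), K_demo)
assert np.allclose(K_demo, K_demo.T) and np.allclose(np.diag(K_demo), 1.)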

View file

@ -1,145 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
import hashlib
from scipy import integrate
class Matern52(Kernpart):
"""
Matern 5/2 kernel:
.. math::
k(r) = \sigma^2 (1 + \sqrt{5} r + \\frac{5}{3} r^2) \exp(- \sqrt{5} r) \ \ \ \ \ \\text{ where } r = \sqrt{\sum_{i=1}^{\\text{input\_dim}} \\frac{(x_i-y_i)^2}{\ell_i^2} }
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance :math:`\sigma^2`
:type variance: float
:param lengthscale: the vector of lengthscale :math:`\ell_i`
:type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
:type ARD: Boolean
:rtype: kernel object
"""
def __init__(self,input_dim,variance=1.,lengthscale=None,ARD=False):
self.input_dim = input_dim
self.ARD = ARD
if ARD == False:
self.num_params = 2
self.name = 'Mat52'
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
else:
lengthscale = np.ones(1)
else:
self.num_params = self.input_dim + 1
self.name = 'Mat52'
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == self.input_dim, "bad number of lengthscales"
else:
lengthscale = np.ones(self.input_dim)
self._set_params(np.hstack((variance,lengthscale.flatten())))
def _get_params(self):
"""return the value of the parameters."""
return np.hstack((self.variance,self.lengthscale))
def _set_params(self,x):
"""set the value of the parameters."""
assert x.size == self.num_params
self.variance = x[0]
self.lengthscale = x[1:]
def _get_param_names(self):
"""return parameter names."""
if self.num_params == 2:
return ['variance','lengthscale']
else:
return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscale.size)]
def K(self,X,X2,target):
"""Compute the covariance matrix between X and X2."""
if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
np.add(self.variance*(1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist), target,target)
def Kdiag(self,X,target):
"""Compute the diagonal of the covariance matrix associated to X."""
np.add(target,self.variance,target)
def _param_grad_helper(self,dL_dK,X,X2,target):
"""derivative of the covariance matrix with respect to the parameters."""
if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
invdist = 1./np.where(dist!=0.,dist,np.inf)
dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscale**3
dvar = (1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist)
dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
target[0] += np.sum(dvar*dL_dK)
if self.ARD:
dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
#dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
target[1:] += (dl*dL_dK[:,:,None]).sum(0).sum(0)
else:
dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist)) * dist2M.sum(-1)*invdist
#dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist)) * dist2M.sum(-1)*invdist
target[1] += np.sum(dl*dL_dK)
def dKdiag_dtheta(self,dL_dKdiag,X,target):
"""derivative of the diagonal of the covariance matrix with respect to the parameters."""
target[0] += np.sum(dL_dKdiag)
def gradients_X(self,dL_dK,X,X2,target):
"""derivative of the covariance matrix with respect to X."""
if X2 is None:
dist = np.sqrt(np.sum(np.square((X[:,None,:]-X[None,:,:])/self.lengthscale),-1))[:,:,None]
ddist_dX = 2*(X[:,None,:]-X[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
else:
dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None]
ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
gradients_X = - np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2))
target += np.sum(gradients_X*dL_dK.T[:,:,None],0)
def dKdiag_dX(self,dL_dKdiag,X,target):
pass
def Gram_matrix(self,F,F1,F2,F3,lower,upper):
"""
Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to input_dim=1.
:param F: vector of functions
:type F: np.array
:param F1: vector of derivatives of F
:type F1: np.array
:param F2: vector of second derivatives of F
:type F2: np.array
:param F3: vector of third derivatives of F
:type F3: np.array
:param lower,upper: boundaries of the input domain
:type lower,upper: floats
"""
assert self.input_dim == 1
def L(x,i):
return(5*np.sqrt(5)/self.lengthscale**3*F[i](x) + 15./self.lengthscale**2*F1[i](x)+ 3*np.sqrt(5)/self.lengthscale*F2[i](x) + F3[i](x))
n = F.shape[0]
G = np.zeros((n,n))
for i in range(n):
for j in range(i,n):
G[i,j] = G[j,i] = integrate.quad(lambda x : L(x,i)*L(x,j),lower,upper)[0]
G_coef = 3.*self.lengthscale**5/(400*np.sqrt(5))
Flower = np.array([f(lower) for f in F])[:,None]
F1lower = np.array([f(lower) for f in F1])[:,None]
F2lower = np.array([f(lower) for f in F2])[:,None]
orig = 9./8*np.dot(Flower,Flower.T) + 9.*self.lengthscale**4/200*np.dot(F2lower,F2lower.T)
orig2 = 3./5*self.lengthscale**2 * ( np.dot(F1lower,F1lower.T) + 1./8*np.dot(Flower,F2lower.T) + 1./8*np.dot(F2lower,Flower.T))
return(1./self.variance* (G_coef*G + orig + orig2))
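# Illustrative sketch only: the Matern 5/2 profile from the docstring,
# k(r) = sigma^2 (1 + sqrt(5) r + 5/3 r^2) exp(-sqrt(5) r), evaluated on a grid of r
# values as a quick sanity check of the formula (isotropic case assumed).
import numpy as np

def matern52_profile(r, variance=1.):
    return variance * (1. + np.sqrt(5.) * r + 5. / 3. * r ** 2) * np.exp(-np.sqrt(5.) * r)

r_demo = np.linspace(0., 3., 13)
k_demo = matern52_profile(r_demo)
assert np.allclose(k_demo[0], 1.) and np.all(np.diff(k_demo) < 0.)  # k(0) = sigma^2, then decays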

View file

@ -1,29 +0,0 @@
import bias
import Brownian
import coregionalize
import exponential
import eq_ode1
import finite_dimensional
import fixed
import gibbs
import hetero
import hierarchical
import independent_outputs
import linear
import Matern32
import Matern52
import mlp
import ODE_1
import periodic_exponential
import periodic_Matern32
import periodic_Matern52
import poly
import prod_orthogonal
import prod
import rational_quadratic
import rbfcos
import rbf
import rbf_inv
import spline
import symmetric
import white

View file

@ -1,81 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
from ...core.parameterization import Param
class Bias(Kernpart):
def __init__(self,input_dim,variance=1.,name=None):
"""
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
"""
super(Bias, self).__init__(input_dim, name)
from ...core.parameterization.transformations import Logexp
self.variance = Param("variance", variance, Logexp())
self.add_parameter(self.variance)
def K(self,X,X2,target):
target += self.variance
def Kdiag(self,X,target):
target += self.variance
#def dK_dtheta(self,dL_dKdiag,X,X2,target):
#target += dL_dKdiag.sum()
def update_gradients_full(self, dL_dK, X):
self.variance.gradient = dL_dK.sum()
def dKdiag_dtheta(self,dL_dKdiag,X,target):
target += dL_dKdiag.sum()
def gradients_X(self, dL_dK,X, X2, target):
pass
def dKdiag_dX(self,dL_dKdiag,X,target):
pass
#---------------------------------------#
# PSI statistics #
#---------------------------------------#
def psi0(self, Z, mu, S, target):
target += self.variance
def psi1(self, Z, mu, S, target):
self._psi1 = self.variance
target += self._psi1
def psi2(self, Z, mu, S, target):
target += self.variance**2
def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S, target):
target += dL_dpsi0.sum()
def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target):
target += dL_dpsi1.sum()
def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target):
target += 2.*self.variance*dL_dpsi2.sum()
def dpsi0_dZ(self, dL_dpsi0, Z, mu, S, target):
pass
def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S, target_mu, target_S):
pass
def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target):
pass
def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S):
pass
def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target):
pass
def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
pass
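# Illustrative check only (standalone, not GPy code): for the bias kernel K = variance * ones,
# dL/dvariance = sum_ij dL_dK_ij, which is exactly what update_gradients_full sets above.
# Verified here with a central finite difference on a linearised objective.
import numpy as np

N_demo = 4
dL_dK_demo = np.random.randn(N_demo, N_demo)
objective = lambda v: np.sum(dL_dK_demo * (v * np.ones((N_demo, N_demo))))
eps = 1e-6
numerical = (objective(0.7 + eps) - objective(0.7 - eps)) / (2. * eps)
assert np.allclose(numerical, dL_dK_demo.sum())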

View file

@ -1,129 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
from scipy import integrate
class Exponential(Kernpart):
"""
Exponential kernel (aka Ornstein-Uhlenbeck or Matern 1/2)
.. math::
k(r) = \sigma^2 \exp(- r) \ \ \ \ \ \\text{ where } r = \sqrt{\sum_{i=1}^{\\text{input\_dim}} \\frac{(x_i-y_i)^2}{\ell_i^2} }
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance :math:`\sigma^2`
:type variance: float
:param lengthscale: the vector of lengthscale :math:`\ell_i`
:type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
:type ARD: Boolean
:param name: the name of the kernel
:rtype: kernel object
"""
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, name='exp'):
self.input_dim = input_dim
self.ARD = ARD
self.variance = variance
self.name = name
if ARD == False:
self.num_params = 2
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
else:
lengthscale = np.ones(1)
else:
self.num_params = self.input_dim + 1
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == self.input_dim, "bad number of lengthscales"
else:
lengthscale = np.ones(self.input_dim)
#self._set_params(np.hstack((variance, lengthscale.flatten())))
self.lengthscale = lengthscale
self.set_as_parameter('variance', 'lengthscale')
# def _get_params(self):
# """return the value of the parameters."""
# return np.hstack((self.variance, self.lengthscale))
#
# def _set_params(self, x):
# """set the value of the parameters."""
# assert x.size == self.num_params
# self.variance = x[0]
# self.lengthscale = x[1:]
#
# def _get_param_names(self):
# """return parameter names."""
# if self.num_params == 2:
# return ['variance', 'lengthscale']
# else:
# return ['variance'] + ['lengthscale_%i' % i for i in range(self.lengthscale.size)]
def K(self, X, X2, target):
"""Compute the covariance matrix between X and X2."""
if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))
np.add(self.variance * np.exp(-dist), target, target)
def Kdiag(self, X, target):
"""Compute the diagonal of the covariance matrix associated to X."""
np.add(target, self.variance, target)
def _param_grad_helper(self, dL_dK, X, X2, target):
"""derivative of the covariance matrix with respect to the parameters."""
if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))
invdist = 1. / np.where(dist != 0., dist, np.inf)
dist2M = np.square(X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 3
dvar = np.exp(-dist)
target[0] += np.sum(dvar * dL_dK)
if self.ARD:
dl = self.variance * dvar[:, :, None] * dist2M * invdist[:, :, None]
target[1:] += (dl * dL_dK[:, :, None]).sum(0).sum(0)
else:
dl = self.variance * dvar * dist2M.sum(-1) * invdist
target[1] += np.sum(dl * dL_dK)
def dKdiag_dtheta(self, dL_dKdiag, X, target):
"""derivative of the diagonal of the covariance matrix with respect to the parameters."""
# NB: derivative of diagonal elements wrt lengthscale is 0
target[0] += np.sum(dL_dKdiag)
def gradients_X(self, dL_dK, X, X2, target):
"""derivative of the covariance matrix with respect to X."""
if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None]
ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
gradients_X = -np.transpose(self.variance * np.exp(-dist) * ddist_dX, (1, 0, 2))
target += np.sum(gradients_X * dL_dK.T[:, :, None], 0)
def dKdiag_dX(self, dL_dKdiag, X, target):
pass
def Gram_matrix(self, F, F1, lower, upper):
"""
Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to input_dim=1.
:param F: vector of functions
:type F: np.array
:param F1: vector of derivatives of F
:type F1: np.array
:param lower,upper: boundaries of the input domain
:type lower,upper: floats
"""
assert self.input_dim == 1
def L(x, i):
return(1. / self.lengthscale * F[i](x) + F1[i](x))
n = F.shape[0]
G = np.zeros((n, n))
for i in range(n):
for j in range(i, n):
G[i, j] = G[j, i] = integrate.quad(lambda x : L(x, i) * L(x, j), lower, upper)[0]
Flower = np.array([f(lower) for f in F])[:, None]
return(self.lengthscale / 2. / self.variance * G + 1. / self.variance * np.dot(Flower, Flower.T))

View file

@ -1,176 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
#from ...core.parameterized.Parameterized import set_as_parameter
from ...core.parameterization import Parameterized
import numpy as np
class Kernpart(Parameterized):
def __init__(self,input_dim,name):
"""
The base class for a kernpart: a positive definite function
which forms part of a covariance function (kernel).
:param input_dim: the number of input dimensions to the function
:type input_dim: int
Do not instantiate.
"""
super(Kernpart, self).__init__(name)
# the input dimensionality for the covariance
self.input_dim = input_dim
# the number of optimisable parameters
# the name of the covariance function.
# link to parameterized objects
#self._X = None
def connect_input(self, X):
X.add_observer(self, self.on_input_change)
#self._X = X
def on_input_change(self, X):
"""
During optimization this function will be called when
the inputs X changed. Use this to update caches dependent
on the inputs X.
"""
# overwrite this to update kernel when inputs X change
pass
# def set_as_parameter_named(self, name, gradient, index=None, *args, **kwargs):
# """
# :param names: name of parameter to set as parameter
# :param gradient: gradient method to get the gradient of this parameter
# :param index: index of where to place parameter in printing
# :param args, kwargs: additional arguments to gradient
#
# Convenience method to connect Kernpart parameters:
# parameter with name (attribute of this Kernpart) will be set as parameter with following name:
#
# kernel_name + _ + parameter_name
#
# To add the kernels name to the parameter name use this method to
# add parameters.
# """
# self.set_as_parameter(name, getattr(self, name), gradient, index, *args, **kwargs)
# def set_as_parameter(self, name, array, gradient, index=None, *args, **kwargs):
# """
# See :py:func:`GPy.core.parameterized.Parameterized.set_as_parameter`
#
# Note: this method adds the kernels name in front of the parameter.
# """
# p = Param(self.name+"_"+name, array, gradient, *args, **kwargs)
# if index is None:
# self._parameters_.append(p)
# else:
# self._parameters_.insert(index, p)
# self.__dict__[name] = p
#set_as_parameter.__doc__ += set_as_parameter.__doc__ # @UndefinedVariable
# def _get_params(self):
# raise NotImplementedError
# def _set_params(self,x):
# raise NotImplementedError
# def _get_param_names(self):
# raise NotImplementedError
def K(self,X,X2,target):
raise NotImplementedError
def Kdiag(self,X,target):
raise NotImplementedError
def _param_grad_helper(self,dL_dK,X,X2,target):
raise NotImplementedError
def dKdiag_dtheta(self,dL_dKdiag,X,target):
# In the base case compute this by calling _param_grad_helper. Need to
# override for stationary covariances (for example) to save
# time.
for i in range(X.shape[0]):
self._param_grad_helper(dL_dKdiag[i], X[i, :][None, :], X2=None, target=target)
def psi0(self,Z,mu,S,target):
raise NotImplementedError
def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target):
raise NotImplementedError
def dpsi0_dmuS(self,dL_dpsi0,Z,mu,S,target_mu,target_S):
raise NotImplementedError
def psi1(self,Z,mu,S,target):
raise NotImplementedError
def dpsi1_dtheta(self,Z,mu,S,target):
raise NotImplementedError
def dpsi1_dZ(self,dL_dpsi1,Z,mu,S,target):
raise NotImplementedError
def dpsi1_dmuS(self,dL_dpsi1,Z,mu,S,target_mu,target_S):
raise NotImplementedError
def psi2(self,Z,mu,S,target):
raise NotImplementedError
def dpsi2_dZ(self,dL_dpsi2,Z,mu,S,target):
raise NotImplementedError
def dpsi2_dtheta(self,dL_dpsi2,Z,mu,S,target):
raise NotImplementedError
def dpsi2_dmuS(self,dL_dpsi2,Z,mu,S,target_mu,target_S):
raise NotImplementedError
def gradients_X(self, dL_dK, X, X2, target):
raise NotImplementedError
def dKdiag_dX(self, dL_dK, X, target):
raise NotImplementedError
def update_gradients_full(self, dL_dK, X):
"""Set the gradients of all parameters when doing full (N) inference."""
raise NotImplementedError
def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
"""Set the gradients of all parameters when doing sparse (M) inference."""
raise NotImplementedError
def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
"""Set the gradients of all parameters when doing variational (M) inference with uncertain inputs."""
raise NotImplementedError
class Kernpart_stationary(Kernpart):
def __init__(self, input_dim, lengthscale=None, ARD=False):
self.input_dim = input_dim
self.ARD = ARD
if not ARD:
self.num_params = 2
if lengthscale is not None:
self.lengthscale = np.asarray(lengthscale)
assert self.lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
else:
self.lengthscale = np.ones(1)
else:
self.num_params = self.input_dim + 1
if lengthscale is not None:
self.lengthscale = np.asarray(lengthscale)
assert self.lengthscale.size == self.input_dim, "bad number of lengthscales"
else:
self.lengthscale = np.ones(self.input_dim)
# initialize cache
self._Z, self._mu, self._S = np.empty(shape=(3, 1))
self._X, self._X2, self._parameters_ = np.empty(shape=(3, 1))
def _set_params(self, x):
self.lengthscale = x
self.lengthscale2 = np.square(self.lengthscale)
# reset cached results
self._X, self._X2, self._parameters_ = np.empty(shape=(3, 1))
self._Z, self._mu, self._S = np.empty(shape=(3, 1)) # cached versions of Z,mu,S
def dKdiag_dtheta(self, dL_dKdiag, X, target):
# For stationary covariances, derivative of diagonal elements
# wrt lengthscale is 0.
target[0] += np.sum(dL_dKdiag)
def dKdiag_dX(self, dL_dK, X, target):
pass # true for all stationary kernels
class Kernpart_inner(Kernpart):
def __init__(self, input_dim, name):
"""
The base class for a kernpart_inner: a positive definite function which forms part of a kernel that is based on the inner product between inputs.
:param input_dim: the number of input dimensions to the function
:type input_dim: int
Do not instantiate.
"""
Kernpart.__init__(self, input_dim, name)
# initialize cache
self._Z, self._mu, self._S = np.empty(shape=(3, 1))
self._X, self._X2, self._parameters_ = np.empty(shape=(3, 1))
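# Minimal standalone sketch of the Kernpart contract documented above: K and Kdiag add
# their contribution into a caller-supplied target, and update_gradients_full stores the
# parameter gradient. ToyWhite is a hypothetical example, not one of GPy's kernparts.
class ToyWhite(object):
    """White-noise covariance part: K = variance * I, zero cross-covariance."""
    def __init__(self, input_dim, variance=1.):
        self.input_dim = input_dim
        self.variance = variance
        self.variance_gradient = 0.

    def K(self, X, X2, target):
        if X2 is None:
            target += self.variance * np.eye(X.shape[0])

    def Kdiag(self, X, target):
        target += self.variance

    def update_gradients_full(self, dL_dK, X):
        self.variance_gradient = np.trace(dL_dK)

k_demo = ToyWhite(2, variance=0.5)
K_demo = np.zeros((3, 3))
k_demo.K(np.random.randn(3, 2), None, K_demo)
assert np.allclose(K_demo, 0.5 * np.eye(3))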

View file

@ -1,306 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from scipy import weave
from kernpart import Kernpart
from ...util.linalg import tdot
from ...util.misc import fast_array_equal, param_to_array
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
class Linear(Kernpart):
"""
Linear kernel
.. math::
k(x,y) = \sum_{i=1}^{\\text{input\_dim}} \sigma^2_i x_iy_i
:param input_dim: the number of input dimensions
:type input_dim: int
:param variances: the vector of variances :math:`\sigma^2_i`
:type variances: array or list of the appropriate size (or float if there is only one variance parameter)
:param ARD: Auto Relevance Determination. If equal to "False", the kernel has only one variance parameter \sigma^2, otherwise there is one variance parameter per dimension.
:type ARD: Boolean
:rtype: kernel object
"""
def __init__(self, input_dim, variances=None, ARD=False, name='linear'):
super(Linear, self).__init__(input_dim, name)
self.ARD = ARD
if ARD == False:
if variances is not None:
variances = np.asarray(variances)
assert variances.size == 1, "Only one variance needed for non-ARD kernel"
else:
variances = np.ones(1)
self._Xcache, self._X2cache = np.empty(shape=(2,))
else:
if variances is not None:
variances = np.asarray(variances)
assert variances.size == self.input_dim, "bad number of variances, need one ARD variance per input_dim"
else:
variances = np.ones(self.input_dim)
self.variances = Param('variances', variances, Logexp())
self.variances.gradient = np.zeros(self.variances.shape)
self.add_parameter(self.variances)
self.variances.add_observer(self, self.update_variance)
# initialize cache
self._Z, self._mu, self._S = np.empty(shape=(3, 1))
self._X, self._X2 = np.empty(shape=(2, 1))
self._variances = np.empty(shape=(1,)) # cached copy of variances for _psi_computations
def update_variance(self, v):
self.variances2 = np.square(self.variances)
def on_input_change(self, X):
self._K_computations(X, None)
def update_gradients_full(self, dL_dK, X):
self.variances.gradient[:] = 0
self._param_grad_helper(dL_dK, X, None, self.variances.gradient)
def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
tmp = dL_dKdiag[:, None] * X ** 2
if self.ARD:
self.variances.gradient = tmp.sum(0)
else:
self.variances.gradient = tmp.sum()
self._param_grad_helper(dL_dKmm, Z, None, self.variances.gradient)
self._param_grad_helper(dL_dKnm, X, Z, self.variances.gradient)
def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
self._psi_computations(Z, mu, S)
# psi0:
tmp = dL_dpsi0[:, None] * self.mu2_S
if self.ARD: self.variances.gradient[:] = tmp.sum(0)
else: self.variances.gradient[:] = tmp.sum()
#psi1
self._param_grad_helper(dL_dpsi1, mu, Z, self.variances.gradient)
#psi2
tmp = dL_dpsi2[:, :, :, None] * (self.ZAinner[:, :, None, :] * (2 * Z)[None, None, :, :])
if self.ARD: self.variances.gradient += tmp.sum(0).sum(0).sum(0)
else: self.variances.gradient += tmp.sum()
#from Kmm
self._K_computations(Z, None)
self._param_grad_helper(dL_dKmm, Z, None, self.variances.gradient)
def K(self, X, X2, target):
if self.ARD:
XX = X * np.sqrt(self.variances)
if X2 is None:
target += tdot(XX)
else:
XX2 = X2 * np.sqrt(self.variances)
target += np.dot(XX, XX2.T)
else:
if X is not self._X or X2 is not None:
self._K_computations(X, X2)
target += self.variances * self._dot_product
def Kdiag(self, X, target):
np.add(target, np.sum(self.variances * np.square(X), -1), target)
def _param_grad_helper(self, dL_dK, X, X2, target):
if self.ARD:
if X2 is None:
[np.add(target[i:i + 1], np.sum(dL_dK * tdot(X[:, i:i + 1])), target[i:i + 1]) for i in range(self.input_dim)]
else:
product = X[:, None, :] * X2[None, :, :]
target += (dL_dK[:, :, None] * product).sum(0).sum(0)
else:
if X is not self._X or X2 is not None:
self._K_computations(X, X2)
target += np.sum(self._dot_product * dL_dK)
def gradients_X(self, dL_dK, X, X2, target):
if X2 is None:
target += 2*(((X[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
else:
target += (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
def dKdiag_dX(self,dL_dKdiag,X,target):
target += 2.*self.variances*dL_dKdiag[:,None]*X
#---------------------------------------#
# PSI statistics #
#---------------------------------------#
def psi0(self, Z, mu, S, target):
self._psi_computations(Z, mu, S)
target += np.sum(self.variances * self.mu2_S, 1)
def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S, target_mu, target_S):
target_mu += dL_dpsi0[:, None] * (2.0 * mu * self.variances)
target_S += dL_dpsi0[:, None] * self.variances
def psi1(self, Z, mu, S, target):
"""the variance, it does nothing"""
self._psi1 = self.K(mu, Z, target)
def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S):
"""Do nothing for S, it does not affect psi1"""
self._psi_computations(Z, mu, S)
target_mu += (dL_dpsi1[:, :, None] * (Z * self.variances)).sum(1)
def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target):
self.gradients_X(dL_dpsi1.T, Z, mu, target)
def psi2(self, Z, mu, S, target):
self._psi_computations(Z, mu, S)
target += self._psi2
def psi2_new(self,Z,mu,S,target):
tmp = np.zeros((mu.shape[0], Z.shape[0]))
self.K(mu,Z,tmp)
target += tmp[:,:,None]*tmp[:,None,:] + np.sum(S[:,None,None,:]*self.variances**2*Z[None,:,None,:]*Z[None,None,:,:],-1)
def dpsi2_dtheta_new(self, dL_dpsi2, Z, mu, S, target):
tmp = np.zeros((mu.shape[0], Z.shape[0]))
self.K(mu,Z,tmp)
self._param_grad_helper(2.*np.sum(dL_dpsi2*tmp[:,None,:],2),mu,Z,target)
result= 2.*(dL_dpsi2[:,:,:,None]*S[:,None,None,:]*self.variances*Z[None,:,None,:]*Z[None,None,:,:]).sum(0).sum(0).sum(0)
if self.ARD:
target += result
else:
target += result.sum()
def dpsi2_dmuS_new(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
tmp = np.zeros((mu.shape[0], Z.shape[0]))
self.K(mu,Z,tmp)
self.gradients_X(2.*np.sum(dL_dpsi2*tmp[:,None,:],2),mu,Z,target_mu)
Zs = Z*self.variances
Zs_sq = Zs[:,None,:]*Zs[None,:,:]
target_S += (dL_dpsi2[:,:,:,None]*Zs_sq[None,:,:,:]).sum(1).sum(1)
def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
"""Think N,num_inducing,num_inducing,input_dim """
self._psi_computations(Z, mu, S)
AZZA = self.ZA.T[:, None, :, None] * self.ZA[None, :, None, :]
AZZA = AZZA + AZZA.swapaxes(1, 2)
AZZA_2 = AZZA/2.
#muAZZA = np.tensordot(mu,AZZA,(-1,0))
#target_mu_dummy, target_S_dummy = np.zeros_like(target_mu), np.zeros_like(target_S)
#target_mu_dummy += (dL_dpsi2[:, :, :, None] * muAZZA).sum(1).sum(1)
#target_S_dummy += (dL_dpsi2[:, :, :, None] * self.ZA[None, :, None, :] * self.ZA[None, None, :, :]).sum(1).sum(1)
#Using weave, we can exploiut the symmetry of this problem:
code = """
int n, m, mm,q,qq;
double factor,tmp;
#pragma omp parallel for private(m,mm,q,qq,factor,tmp)
for(n=0;n<N;n++){
for(m=0;m<num_inducing;m++){
for(mm=0;mm<=m;mm++){
//add in a factor of 2 for the off-diagonal terms (and then count them only once)
if(m==mm)
factor = dL_dpsi2(n,m,mm);
else
factor = 2.0*dL_dpsi2(n,m,mm);
for(q=0;q<input_dim;q++){
//take the dot product of mu[n,:] and AZZA[:,m,mm,q] TODO: blas!
tmp = 0.0;
for(qq=0;qq<input_dim;qq++){
tmp += mu(n,qq)*AZZA(qq,m,mm,q);
}
target_mu(n,q) += factor*tmp;
target_S(n,q) += factor*AZZA_2(q,m,mm,q);
}
}
}
}
"""
support_code = """
#include <omp.h>
#include <math.h>
"""
weave_options = {'headers' : ['<omp.h>'],
'extra_compile_args': ['-fopenmp -O3'], #-march=native'],
'extra_link_args' : ['-lgomp']}
N,num_inducing,input_dim,mu = mu.shape[0],Z.shape[0],mu.shape[1],param_to_array(mu)
weave.inline(code, support_code=support_code, libraries=['gomp'],
arg_names=['N','num_inducing','input_dim','mu','AZZA','AZZA_2','target_mu','target_S','dL_dpsi2'],
type_converters=weave.converters.blitz,**weave_options)
def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target):
self._psi_computations(Z, mu, S)
#psi2_dZ = dL_dpsi2[:, :, :, None] * self.variances * self.ZAinner[:, :, None, :]
#dummy_target = np.zeros_like(target)
#dummy_target += psi2_dZ.sum(0).sum(0)
AZA = self.variances*self.ZAinner
code="""
int n,m,mm,q;
#pragma omp parallel for private(n,mm,q)
for(m=0;m<num_inducing;m++){
for(q=0;q<input_dim;q++){
for(mm=0;mm<num_inducing;mm++){
for(n=0;n<N;n++){
target(m,q) += dL_dpsi2(n,m,mm)*AZA(n,mm,q);
}
}
}
}
"""
support_code = """
#include <omp.h>
#include <math.h>
"""
weave_options = {'headers' : ['<omp.h>'],
'extra_compile_args': ['-fopenmp -O3'], #-march=native'],
'extra_link_args' : ['-lgomp']}
N,num_inducing,input_dim = mu.shape[0],Z.shape[0],mu.shape[1]
mu, AZA, target, dL_dpsi2 = param_to_array(mu, AZA, target, dL_dpsi2)
weave.inline(code, support_code=support_code, libraries=['gomp'],
arg_names=['N','num_inducing','input_dim','AZA','target','dL_dpsi2'],
type_converters=weave.converters.blitz,**weave_options)
#---------------------------------------#
# Precomputations #
#---------------------------------------#
def _K_computations(self, X, X2):
if not (fast_array_equal(X, self._X) and fast_array_equal(X2, self._X2)):
self._X = X.copy()
if X2 is None:
self._dot_product = tdot(param_to_array(X))
self._X2 = None
else:
self._X2 = X2.copy()
self._dot_product = np.dot(param_to_array(X), param_to_array(X2.T))
def _psi_computations(self, Z, mu, S):
# here are the "statistics" for psi1 and psi2
Zv_changed = not (fast_array_equal(Z, self._Z) and fast_array_equal(self.variances, self._variances))
muS_changed = not (fast_array_equal(mu, self._mu) and fast_array_equal(S, self._S))
if Zv_changed:
# Z has changed, compute Z specific stuff
# self.ZZ = Z[:,None,:]*Z[None,:,:] # num_inducing,num_inducing,input_dim
# self.ZZ = np.empty((Z.shape[0], Z.shape[0], Z.shape[1]), order='F')
# [tdot(Z[:, i:i + 1], self.ZZ[:, :, i].T) for i in xrange(Z.shape[1])]
self.ZA = Z * self.variances
self._Z = Z.copy()
self._variances = self.variances.copy()
if muS_changed:
self.mu2_S = np.square(mu) + S
self.inner = (mu[:, None, :] * mu[:, :, None])
diag_indices = np.diag_indices(mu.shape[1], 2)
self.inner[:, diag_indices[0], diag_indices[1]] += S
self._mu, self._S = mu.copy(), S.copy()
if Zv_changed or muS_changed:
self.ZAinner = np.dot(self.ZA, self.inner).swapaxes(0, 1) # NOTE: self.ZAinner \in [num_inducing x N x input_dim]!
self._psi2 = np.dot(self.ZAinner, self.ZA.T)
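# Illustrative sketch only: the psi statistics that the cached computations above produce
# for the linear kernel, written out directly. For q(x_n) = N(mu_n, diag(S_n)) and inducing
# inputs Z, psi0 has shape (N,) and psi1 has shape (N, M).
N_d, M_d, Q_d = 4, 3, 2
variances_d = 0.5 * np.ones(Q_d)
mu_d = np.random.randn(N_d, Q_d)
S_d = np.random.rand(N_d, Q_d)
Z_d = np.random.randn(M_d, Q_d)

psi0_d = np.sum(variances_d * (np.square(mu_d) + S_d), 1)  # E[k(x_n, x_n)]
psi1_d = np.dot(mu_d * variances_d, Z_d.T)                 # E[k(x_n, z_m)]
assert psi0_d.shape == (N_d,) and psi1_d.shape == (N_d, M_d)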

View file

@ -1,125 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
from coregionalize import Coregionalize
import numpy as np
import hashlib
class Prod(Kernpart):
"""
Computes the product of 2 kernels
:param k1, k2: the kernels to multiply
:type k1, k2: Kernpart
:param tensor: The kernels are either multiply as functions defined on the same input space (default) or on the product of the input spaces
:type tensor: Boolean
:rtype: kernel object
"""
def __init__(self,k1,k2,tensor=False):
if tensor:
super(Prod, self).__init__(k1.input_dim + k2.input_dim, k1.name + '_xx_' + k2.name)
self.slice1 = slice(0,k1.input_dim)
self.slice2 = slice(k1.input_dim,k1.input_dim+k2.input_dim)
else:
assert k1.input_dim == k2.input_dim, "Error: The input spaces of the kernels to multiply don't have the same dimension."
super(Prod, self).__init__(k1.input_dim, k1.name + '_x_' + k2.name)
self.slice1 = slice(0,self.input_dim)
self.slice2 = slice(0,self.input_dim)
self.k1 = k1
self.k2 = k2
self.add_parameters(self.k1, self.k2)
#initialize cache
self._X, self._X2 = np.empty(shape=(2,1))
self._params = None
def K(self,X,X2,target):
self._K_computations(X,X2)
target += self._K1 * self._K2
def K1(self,X, X2):
"""Compute the part of the kernel associated with k1."""
self._K_computations(X, X2)
return self._K1
def K2(self, X, X2):
"""Compute the part of the kernel associated with k2."""
self._K_computations(X, X2)
return self._K2
def update_gradients_full(self, dL_dK, X):
self._K_computations(X, None)
self.k1.update_gradients_full(dL_dK*self._K2, X[:,self.slice1])
self.k2.update_gradients_full(dL_dK*self._K1, X[:,self.slice2])
def _param_grad_helper(self,dL_dK,X,X2,target):
"""Derivative of the covariance matrix with respect to the parameters."""
self._K_computations(X,X2)
if X2 is None:
self.k1._param_grad_helper(dL_dK*self._K2, X[:,self.slice1], None, target[:self.k1.num_params])
self.k2._param_grad_helper(dL_dK*self._K1, X[:,self.slice2], None, target[self.k1.num_params:])
else:
self.k1._param_grad_helper(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:self.k1.num_params])
self.k2._param_grad_helper(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[self.k1.num_params:])
def Kdiag(self,X,target):
"""Compute the diagonal of the covariance matrix associated to X."""
target1 = np.zeros(X.shape[0])
target2 = np.zeros(X.shape[0])
self.k1.Kdiag(X[:,self.slice1],target1)
self.k2.Kdiag(X[:,self.slice2],target2)
target += target1 * target2
def dKdiag_dtheta(self,dL_dKdiag,X,target):
K1 = np.zeros(X.shape[0])
K2 = np.zeros(X.shape[0])
self.k1.Kdiag(X[:,self.slice1],K1)
self.k2.Kdiag(X[:,self.slice2],K2)
self.k1.dKdiag_dtheta(dL_dKdiag*K2,X[:,self.slice1],target[:self.k1.num_params])
self.k2.dKdiag_dtheta(dL_dKdiag*K1,X[:,self.slice2],target[self.k1.num_params:])
def gradients_X(self,dL_dK,X,X2,target):
"""derivative of the covariance matrix with respect to X."""
self._K_computations(X,X2)
if X2 is None:
if not isinstance(self.k1,Coregionalize) and not isinstance(self.k2,Coregionalize):
self.k1.gradients_X(dL_dK*self._K2, X[:,self.slice1], None, target[:,self.slice1])
self.k2.gradients_X(dL_dK*self._K1, X[:,self.slice2], None, target[:,self.slice2])
else:#if isinstance(self.k1,Coregionalize) or isinstance(self.k2,Coregionalize):
#NOTE The indices column in the inputs makes the ki.gradients_X fail when passing None instead of X[:,self.slicei]
X2 = X
self.k1.gradients_X(2.*dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:,self.slice1])
self.k2.gradients_X(2.*dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[:,self.slice2])
else:
self.k1.gradients_X(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:,self.slice1])
self.k2.gradients_X(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[:,self.slice2])
def dKdiag_dX(self, dL_dKdiag, X, target):
K1 = np.zeros(X.shape[0])
K2 = np.zeros(X.shape[0])
self.k1.Kdiag(X[:,self.slice1],K1)
self.k2.Kdiag(X[:,self.slice2],K2)
self.k1.dKdiag_dX(dL_dKdiag*K2, X[:,self.slice1], target[:,self.slice1])
self.k2.dKdiag_dX(dL_dKdiag*K1, X[:,self.slice2], target[:,self.slice2])
def _K_computations(self,X,X2):
if not (np.array_equal(X,self._X) and np.array_equal(X2,self._X2) and np.array_equal(self._params , self._get_params())):
self._X = X.copy()
self._params = self._get_params().copy()
if X2 is None:
self._X2 = None
self._K1 = np.zeros((X.shape[0],X.shape[0]))
self._K2 = np.zeros((X.shape[0],X.shape[0]))
self.k1.K(X[:,self.slice1],None,self._K1)
self.k2.K(X[:,self.slice2],None,self._K2)
else:
self._X2 = X2.copy()
self._K1 = np.zeros((X.shape[0],X2.shape[0]))
self._K2 = np.zeros((X.shape[0],X2.shape[0]))
self.k1.K(X[:,self.slice1],X2[:,self.slice1],self._K1)
self.k2.K(X[:,self.slice2],X2[:,self.slice2],self._K2)
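# Standalone sketch of the product rule used by update_gradients_full above: with
# K = K1 * K2 (elementwise), the gradient signal forwarded to k1 is dL_dK * K2 and the
# one forwarded to k2 is dL_dK * K1. The rbf_cov helper below is purely illustrative.
import numpy as np

def rbf_cov(X, lengthscale):
    d2 = np.sum(np.square((X[:, None, :] - X[None, :, :]) / lengthscale), -1)
    return np.exp(-0.5 * d2)

X_demo = np.random.randn(5, 2)
K1_demo, K2_demo = rbf_cov(X_demo, 1.0), rbf_cov(X_demo, 2.0)
K_demo = K1_demo * K2_demo
dL_dK_demo = np.random.randn(5, 5)
dL_dK1_demo = dL_dK_demo * K2_demo  # what would be passed down to k1
dL_dK2_demo = dL_dK_demo * K1_demo  # what would be passed down to k2
assert np.allclose(K_demo, K_demo.T)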

View file

@ -1,352 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from scipy import weave
from kernpart import Kernpart
from ...util.linalg import tdot
from ...util.misc import fast_array_equal, param_to_array
from ...core.parameterization import Param
class SS_RBF(Kernpart):
"""
The RBF kernel for Spike-and-Slab GPLVM
Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel:
.. math::
k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \ \ \ \ \ \\text{ where } r^2 = \sum_{i=1}^d \\frac{ (x_i-x^\prime_i)^2}{\ell_i^2}
where \ell_i is the lengthscale, \sigma^2 the variance and d the dimensionality of the input.
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
:param lengthscale: the vector of lengthscale of the kernel
:type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
:rtype: kernel object
"""
def __init__(self, input_dim, variance=1., lengthscale=None, name='rbf'):
super(SS_RBF, self).__init__(input_dim, name)
self.input_dim = input_dim
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == self.input_dim, "bad number of lengthscales"
else:
lengthscale = np.ones(self.input_dim)
self.variance = Param('variance', variance)
self.lengthscale = Param('lengthscale', lengthscale)
self.lengthscale.add_observer(self, self.update_lengthscale)
self.add_parameters(self.variance, self.lengthscale)
self.ARD = True # assumed: one lengthscale per input dimension
self.weave_options = {} # extra keyword arguments passed on to weave.inline
self.parameters_changed() # initializes cache
def on_input_change(self, X):
#self._K_computations(X, None)
pass
def update_lengthscale(self, l):
self.lengthscale2 = np.square(self.lengthscale)
def parameters_changed(self):
# reset cached results
self._X, self._X2 = np.empty(shape=(2, 1))
self._Z, self._mu, self._S = np.empty(shape=(3, 1)) # cached versions of Z,mu,S
def K(self, X, X2, target):
self._K_computations(X, X2)
target += self.variance * self._K_dvar
def Kdiag(self, X, target):
np.add(target, self.variance, target)
def psi0(self, Z, mu, S, target):
target += self.variance
def psi1(self, Z, mu, S, target):
self._psi_computations(Z, mu, S)
target += self._psi1
def psi2(self, Z, mu, S, target):
self._psi_computations(Z, mu, S)
target += self._psi2
def update_gradients_full(self, dL_dK, X):
self._K_computations(X, None)
self.variance.gradient = np.sum(self._K_dvar * dL_dK)
if self.ARD:
self.lengthscale.gradient = self._dL_dlengthscales_via_K(dL_dK, X, None)
else:
self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dK)
def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
#contributions from Kdiag
self.variance.gradient = np.sum(dL_dKdiag)
#from Knm
self._K_computations(X, Z)
self.variance.gradient += np.sum(dL_dKnm * self._K_dvar)
if self.ARD:
self.lengthscale.gradient = self._dL_dlengthscales_via_K(dL_dKnm, X, Z)
else:
self.lengthscale.gradient = (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKnm)
#from Kmm
self._K_computations(Z, None)
self.variance.gradient += np.sum(dL_dKmm * self._K_dvar)
if self.ARD:
self.lengthscale.gradient += self._dL_dlengthscales_via_K(dL_dKmm, Z, None)
else:
self.lengthscale.gradient += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm)
def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
self._psi_computations(Z, mu, S)
#contributions from psi0:
self.variance.gradient = np.sum(dL_dpsi0)
#from psi1
self.variance.gradient += np.sum(dL_dpsi1 * self._psi1 / self.variance)
d_length = self._psi1[:,:,None] * ((self._psi1_dist_sq - 1.)/(self.lengthscale*self._psi1_denom) +1./self.lengthscale)
dpsi1_dlength = d_length * dL_dpsi1[:, :, None]
if not self.ARD:
self.lengthscale.gradient = dpsi1_dlength.sum()
else:
self.lengthscale.gradient = dpsi1_dlength.sum(0).sum(0)
#from psi2
d_var = 2.*self._psi2 / self.variance
d_length = 2.*self._psi2[:, :, :, None] * (self._psi2_Zdist_sq * self._psi2_denom + self._psi2_mudist_sq + S[:, None, None, :] / self.lengthscale2) / (self.lengthscale * self._psi2_denom)
self.variance.gradient += np.sum(dL_dpsi2 * d_var)
dpsi2_dlength = d_length * dL_dpsi2[:, :, :, None]
if not self.ARD:
self.lengthscale.gradient += dpsi2_dlength.sum()
else:
self.lengthscale.gradient += dpsi2_dlength.sum(0).sum(0).sum(0)
#from Kmm
self._K_computations(Z, None)
self.variance.gradient += np.sum(dL_dKmm * self._K_dvar)
if self.ARD:
self.lengthscale.gradient += self._dL_dlengthscales_via_K(dL_dKmm, Z, None)
else:
self.lengthscale.gradient += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dKmm)
def gradients_X(self, dL_dK, X, X2, target):
#if self._X is None or X.base is not self._X.base or X2 is not None:
self._K_computations(X, X2)
if X2 is None:
_K_dist = 2*(X[:, None, :] - X[None, :, :])
else:
_K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
gradients_X = (-self.variance / self.lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2))
target += np.sum(gradients_X * dL_dK.T[:, :, None], 0)
def dKdiag_dX(self, dL_dKdiag, X, target):
pass
#---------------------------------------#
# PSI statistics #
#---------------------------------------#
def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S, target_mu, target_S):
pass
def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target):
self._psi_computations(Z, mu, S)
denominator = (self.lengthscale2 * (self._psi1_denom))
dpsi1_dZ = -self._psi1[:, :, None] * ((self._psi1_dist / denominator))
target += np.sum(dL_dpsi1[:, :, None] * dpsi1_dZ, 0)
def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S):
self._psi_computations(Z, mu, S)
tmp = self._psi1[:, :, None] / self.lengthscale2 / self._psi1_denom
target_mu += np.sum(dL_dpsi1[:, :, None] * tmp * self._psi1_dist, 1)
target_S += np.sum(dL_dpsi1[:, :, None] * 0.5 * tmp * (self._psi1_dist_sq - 1), 1)
def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target):
self._psi_computations(Z, mu, S)
term1 = self._psi2_Zdist / self.lengthscale2 # num_inducing, num_inducing, input_dim
term2 = self._psi2_mudist / self._psi2_denom / self.lengthscale2 # N, num_inducing, num_inducing, input_dim
dZ = self._psi2[:, :, :, None] * (term1[None] + term2)
target += (dL_dpsi2[:, :, :, None] * dZ).sum(0).sum(0)
def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
"""Think N,num_inducing,num_inducing,input_dim """
self._psi_computations(Z, mu, S)
tmp = self._psi2[:, :, :, None] / self.lengthscale2 / self._psi2_denom
target_mu += -2.*(dL_dpsi2[:, :, :, None] * tmp * self._psi2_mudist).sum(1).sum(1)
target_S += (dL_dpsi2[:, :, :, None] * tmp * (2.*self._psi2_mudist_sq - 1)).sum(1).sum(1)
#---------------------------------------#
# Precomputations #
#---------------------------------------#
def _K_computations(self, X, X2):
#params = self._get_params()
if not (fast_array_equal(X, self._X) and fast_array_equal(X2, self._X2)):# and fast_array_equal(self._params_save , params)):
#self._X = X.copy()
#self._params_save = params.copy()
if X2 is None:
self._X2 = None
X = X / self.lengthscale
Xsquare = np.sum(np.square(X), 1)
self._K_dist2 = -2.*tdot(X) + (Xsquare[:, None] + Xsquare[None, :])
else:
self._X2 = X2.copy()
X = X / self.lengthscale
X2 = X2 / self.lengthscale
self._K_dist2 = -2.*np.dot(X, X2.T) + (np.sum(np.square(X), 1)[:, None] + np.sum(np.square(X2), 1)[None, :])
self._K_dvar = np.exp(-0.5 * self._K_dist2)
def _dL_dlengthscales_via_K(self, dL_dK, X, X2):
"""
A helper function for update_gradients_* methods
Computes the derivative of the objective L wrt the lengthscales via
dL_dl = sum_{i,j}(dL_dK_{ij} dK_dl)
assumes self._K_computations has just been called.
This is only valid if self.ARD=True
"""
target = np.zeros(self.input_dim)
dvardLdK = self._K_dvar * dL_dK
var_len3 = self.variance / np.power(self.lengthscale, 3)
if X2 is None:
# save computation for the symmetrical case
dvardLdK = dvardLdK + dvardLdK.T
code = """
int q,i,j;
double tmp;
for(q=0; q<input_dim; q++){
tmp = 0;
for(i=0; i<num_data; i++){
for(j=0; j<i; j++){
tmp += (X(i,q)-X(j,q))*(X(i,q)-X(j,q))*dvardLdK(i,j);
}
}
target(q) += var_len3(q)*tmp;
}
"""
num_data, num_inducing, input_dim = X.shape[0], X.shape[0], self.input_dim
X, dvardLdK = param_to_array(X, dvardLdK)
weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'target', 'dvardLdK', 'var_len3'], type_converters=weave.converters.blitz, **self.weave_options)
else:
code = """
int q,i,j;
double tmp;
for(q=0; q<input_dim; q++){
tmp = 0;
for(i=0; i<num_data; i++){
for(j=0; j<num_inducing; j++){
tmp += (X(i,q)-X2(j,q))*(X(i,q)-X2(j,q))*dvardLdK(i,j);
}
}
target(q) += var_len3(q)*tmp;
}
"""
num_data, num_inducing, input_dim = X.shape[0], X2.shape[0], self.input_dim
X, X2, dvardLdK = param_to_array(X, X2, dvardLdK)
weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'X2', 'target', 'dvardLdK', 'var_len3'], type_converters=weave.converters.blitz, **self.weave_options)
return target
def _psi_computations(self, Z, mu, S):
# here are the "statistics" for psi1 and psi2
Z_changed = not fast_array_equal(Z, self._Z)
if Z_changed:
# Z has changed, compute Z specific stuff
self._psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
self._psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
self._psi2_Zdist_sq = np.square(self._psi2_Zdist / self.lengthscale) # M,M,Q
if Z_changed or not fast_array_equal(mu, self._mu) or not fast_array_equal(S, self._S):
# something's changed. recompute EVERYTHING
# psi1
self._psi1_denom = S[:, None, :] / self.lengthscale2 + 1.
self._psi1_dist = Z[None, :, :] - mu[:, None, :]
self._psi1_dist_sq = np.square(self._psi1_dist) / self.lengthscale2 / self._psi1_denom
self._psi1_exponent = -0.5 * np.sum(self._psi1_dist_sq + np.log(self._psi1_denom), -1)
self._psi1 = self.variance * np.exp(self._psi1_exponent)
# psi2
self._psi2_denom = 2.*S[:, None, None, :] / self.lengthscale2 + 1. # N,M,M,Q
self._psi2_mudist, self._psi2_mudist_sq, self._psi2_exponent, _ = self.weave_psi2(mu, self._psi2_Zhat)
# self._psi2_mudist = mu[:,None,None,:]-self._psi2_Zhat #N,M,M,Q
# self._psi2_mudist_sq = np.square(self._psi2_mudist)/(self.lengthscale2*self._psi2_denom)
# self._psi2_exponent = np.sum(-self._psi2_Zdist_sq -self._psi2_mudist_sq -0.5*np.log(self._psi2_denom),-1) #N,M,M,Q
self._psi2 = np.square(self.variance) * np.exp(self._psi2_exponent) # N,M,M,Q
# store matrices for caching
self._Z, self._mu, self._S = Z, mu, S
def weave_psi2(self, mu, Zhat):
N, input_dim = mu.shape
num_inducing = Zhat.shape[0]
mudist = np.empty((N, num_inducing, num_inducing, input_dim))
mudist_sq = np.empty((N, num_inducing, num_inducing, input_dim))
psi2_exponent = np.zeros((N, num_inducing, num_inducing))
psi2 = np.empty((N, num_inducing, num_inducing))
psi2_Zdist_sq = self._psi2_Zdist_sq
_psi2_denom = self._psi2_denom.squeeze().reshape(N, self.input_dim)
half_log_psi2_denom = 0.5 * np.log(self._psi2_denom).squeeze().reshape(N, self.input_dim)
variance_sq = float(np.square(self.variance))
if self.ARD:
lengthscale2 = self.lengthscale2
else:
lengthscale2 = np.ones(input_dim) * self.lengthscale2
code = """
double tmp;
#pragma omp parallel for private(tmp)
for (int n=0; n<N; n++){
for (int m=0; m<num_inducing; m++){
for (int mm=0; mm<(m+1); mm++){
for (int q=0; q<input_dim; q++){
//compute mudist
tmp = mu(n,q) - Zhat(m,mm,q);
mudist(n,m,mm,q) = tmp;
mudist(n,mm,m,q) = tmp;
//now mudist_sq
tmp = tmp*tmp/lengthscale2(q)/_psi2_denom(n,q);
mudist_sq(n,m,mm,q) = tmp;
mudist_sq(n,mm,m,q) = tmp;
//now psi2_exponent
tmp = -psi2_Zdist_sq(m,mm,q) - tmp - half_log_psi2_denom(n,q);
psi2_exponent(n,mm,m) += tmp;
if (m !=mm){
psi2_exponent(n,m,mm) += tmp;
}
//psi2 would be computed like this, but np is faster
//tmp = variance_sq*exp(psi2_exponent(n,m,mm));
//psi2(n,m,mm) = tmp;
//psi2(n,mm,m) = tmp;
}
}
}
}
"""
support_code = """
#include <omp.h>
#include <math.h>
"""
weave.inline(code, support_code=support_code, libraries=['gomp'],
arg_names=['N', 'num_inducing', 'input_dim', 'mu', 'Zhat', 'mudist_sq', 'mudist', 'lengthscale2', '_psi2_denom', 'psi2_Zdist_sq', 'psi2_exponent', 'half_log_psi2_denom', 'psi2', 'variance_sq'],
type_converters=weave.converters.blitz, **self.weave_options)
return mudist, mudist_sq, psi2_exponent, psi2
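# Illustrative sketch only: the scaled squared-distance identity that _K_computations
# exploits above, -2 (X/l)(X/l)^T plus broadcast row/column squared norms, checked
# against the direct pairwise computation.
import numpy as np

X_demo = np.random.randn(6, 3)
l_demo = np.array([0.5, 1.0, 2.0])
Xs = X_demo / l_demo
Xsq = np.sum(np.square(Xs), 1)
dist2_fast = -2. * np.dot(Xs, Xs.T) + Xsq[:, None] + Xsq[None, :]
dist2_direct = np.sum(np.square(Xs[:, None, :] - Xs[None, :, :]), -1)
assert np.allclose(dist2_fast, dist2_direct)
K_demo = np.exp(-0.5 * dist2_fast)  # unit-variance RBF covariance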

View file

@ -1,423 +0,0 @@
import numpy as np
import sympy as sp
from sympy.utilities.codegen import codegen
from sympy.core.cache import clear_cache
from scipy import weave
import re
import os
import sys
current_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
import tempfile
import pdb
import ast
from kernpart import Kernpart
class spkern(Kernpart):
"""
A kernel object, where all the hard work in done by sympy.
:param k: the covariance function
:type k: a positive definite sympy function of x_0, z_0, x_1, z_1, x_2, z_2...
To construct a new sympy kernel, you'll need to define:
- a kernel function using a sympy object. Ensure that the kernel is of the form k(x,z).
- that's it! we'll extract the variables from the function k.
Note:
- to handle multiple inputs, call them x_1, z_1, etc
- to handle multpile correlated outputs, you'll need to add parameters with an index, such as lengthscale_i and lengthscale_j.
"""
def __init__(self, input_dim, k=None, output_dim=1, name=None, param=None):
if name is None:
self.name='sympykern'
else:
self.name = name
if k is None:
raise ValueError, "You must provide an argument for the covariance function."
self._sp_k = k
sp_vars = [e for e in k.atoms() if e.is_Symbol]
self._sp_x= sorted([e for e in sp_vars if e.name[0:2]=='x_'],key=lambda x:int(x.name[2:]))
self._sp_z= sorted([e for e in sp_vars if e.name[0:2]=='z_'],key=lambda z:int(z.name[2:]))
# Check that variable names make sense.
assert all([x.name=='x_%i'%i for i,x in enumerate(self._sp_x)])
assert all([z.name=='z_%i'%i for i,z in enumerate(self._sp_z)])
assert len(self._sp_x)==len(self._sp_z)
self.input_dim = len(self._sp_x)
self._real_input_dim = self.input_dim
if output_dim > 1:
self.input_dim += 1
assert self.input_dim == input_dim
self.output_dim = output_dim
# extract parameter names
thetas = sorted([e for e in sp_vars if not (e.name[0:2]=='x_' or e.name[0:2]=='z_')],key=lambda e:e.name)
# Look for parameters with index.
if self.output_dim>1:
self._sp_theta_i = sorted([e for e in thetas if (e.name[-2:]=='_i')], key=lambda e:e.name)
self._sp_theta_j = sorted([e for e in thetas if (e.name[-2:]=='_j')], key=lambda e:e.name)
# Make sure parameter appears with both indices!
assert len(self._sp_theta_i)==len(self._sp_theta_j)
assert all([theta_i.name[:-2]==theta_j.name[:-2] for theta_i, theta_j in zip(self._sp_theta_i, self._sp_theta_j)])
# Extract names of shared parameters
self._sp_theta = [theta for theta in thetas if theta not in self._sp_theta_i and theta not in self._sp_theta_j]
self.num_split_params = len(self._sp_theta_i)
self._split_theta_names = ["%s"%theta.name[:-2] for theta in self._sp_theta_i]
for theta in self._split_theta_names:
setattr(self, theta, np.ones(self.output_dim))
self.num_shared_params = len(self._sp_theta)
self.num_params = self.num_shared_params+self.num_split_params*self.output_dim
else:
self.num_split_params = 0
self._split_theta_names = []
self._sp_theta = thetas
self.num_shared_params = len(self._sp_theta)
self.num_params = self.num_shared_params
for theta in self._sp_theta:
val = 1.0
if param is not None:
if param.has_key(theta):
val = param[theta]
setattr(self, theta.name, val)
#deal with param
self._set_params(self._get_params())
#Differentiate!
self._sp_dk_dtheta = [sp.diff(k,theta).simplify() for theta in self._sp_theta]
if self.output_dim > 1:
self._sp_dk_dtheta_i = [sp.diff(k,theta).simplify() for theta in self._sp_theta_i]
self._sp_dk_dx = [sp.diff(k,xi).simplify() for xi in self._sp_x]
if False:
self.compute_psi_stats()
self._gen_code()
if False:
extra_compile_args = ['-ftree-vectorize', '-mssse3', '-ftree-vectorizer-verbose=5']
else:
extra_compile_args = []
self.weave_kwargs = {
'support_code':self._function_code,
'include_dirs':[tempfile.gettempdir(), os.path.join(current_dir,'parts/')],
'headers':['"sympy_helpers.h"'],
'sources':[os.path.join(current_dir,"parts/sympy_helpers.cpp")],
'extra_compile_args':extra_compile_args,
'extra_link_args':['-lgomp'],
'verbose':True}
def __add__(self,other):
return spkern(self.input_dim, self._sp_k + other._sp_k)
def _gen_code(self):
#generate c functions from sympy objects
argument_sequence = self._sp_x+self._sp_z+self._sp_theta
code_list = [('k',self._sp_k)]
# gradients with respect to covariance input
code_list += [('dk_d%s'%x.name,dx) for x,dx in zip(self._sp_x,self._sp_dk_dx)]
# gradient with respect to parameters
code_list += [('dk_d%s'%theta.name,dtheta) for theta,dtheta in zip(self._sp_theta,self._sp_dk_dtheta)]
# gradient with respect to multiple output parameters
if self.output_dim > 1:
argument_sequence += self._sp_theta_i + self._sp_theta_j
code_list += [('dk_d%s'%theta.name,dtheta) for theta,dtheta in zip(self._sp_theta_i,self._sp_dk_dtheta_i)]
(foo_c,self._function_code), (foo_h,self._function_header) = \
codegen(code_list, "C",'foobar',argument_sequence=argument_sequence)
#put the header file where we can find it
f = file(os.path.join(tempfile.gettempdir(),'foobar.h'),'w')
f.write(self._function_header)
f.close()
# Substitute any known derivatives which sympy doesn't compute
self._function_code = re.sub('DiracDelta\(.+?,.+?\)','0.0',self._function_code)
# This is the basic argument construction for the C code.
#arg_list = (["X[i*input_dim+%s]"%x.name[2:] for x in self._sp_x]
# + ["Z[j*input_dim+%s]"%z.name[2:] for z in self._sp_z])
arg_list = (["X2(i, %s)"%x.name[2:] for x in self._sp_x]
+ ["Z2(j, %s)"%z.name[2:] for z in self._sp_z])
if self.output_dim>1:
reverse_arg_list = list(arg_list)
reverse_arg_list.reverse()
param_arg_list = [shared_params.name for shared_params in self._sp_theta]
arg_list += param_arg_list
precompute_list=[]
if self.output_dim > 1:
reverse_arg_list+=list(param_arg_list)
split_param_arg_list = ["%s1(%s)"%(theta.name[:-2].upper(),index) for index in ['ii', 'jj'] for theta in self._sp_theta_i]
split_param_reverse_arg_list = ["%s1(%s)"%(theta.name[:-2].upper(),index) for index in ['jj', 'ii'] for theta in self._sp_theta_i]
arg_list += split_param_arg_list
reverse_arg_list += split_param_reverse_arg_list
# Extract the right output indices from the inputs.
c_define_output_indices = [' '*16 + "int %s=(int)%s(%s, %i);"%(index, var, index2, self.input_dim-1) for index, var, index2 in zip(['ii', 'jj'], ['X2', 'Z2'], ['i', 'j'])]
precompute_list += c_define_output_indices
reverse_arg_string = ", ".join(reverse_arg_list)
arg_string = ", ".join(arg_list)
precompute_string = "\n".join(precompute_list)
# Here's the code to do the looping for K
self._K_code =\
"""
// _K_code
// Code for computing the covariance function.
int i;
int j;
int N = target_array->dimensions[0];
int num_inducing = target_array->dimensions[1];
int input_dim = X_array->dimensions[1];
//#pragma omp parallel for private(j)
for (i=0;i<N;i++){
for (j=0;j<num_inducing;j++){
%s
//target[i*num_inducing+j] =
TARGET2(i, j) += k(%s);
}
}
%s
"""%(precompute_string,arg_string,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
# Code to compute diagonal of covariance.
diag_arg_string = re.sub('Z','X',arg_string)
diag_arg_string = re.sub('int jj','//int jj',diag_arg_string)
diag_arg_string = re.sub('j','i',diag_arg_string)
diag_precompute_string = re.sub('int jj','//int jj',precompute_string)
diag_precompute_string = re.sub('Z','X',diag_precompute_string)
diag_precompute_string = re.sub('j','i',diag_precompute_string)
# Code to do the looping for Kdiag
self._Kdiag_code =\
"""
// _Kdiag_code
// Code for computing diagonal of covariance function.
int i;
int N = target_array->dimensions[0];
int input_dim = X_array->dimensions[1];
//#pragma omp parallel for
for (i=0;i<N;i++){
%s
//target[i] =
TARGET1(i)=k(%s);
}
%s
"""%(diag_precompute_string,diag_arg_string,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
# Code to compute gradients
grad_func_list = []
if self.output_dim>1:
grad_func_list += c_define_output_indices
grad_func_list += [' '*16 + 'TARGET1(%i+ii) += partial[i*num_inducing+j]*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, arg_string) for i, theta in enumerate(self._sp_theta_i)]
grad_func_list += [' '*16 + 'TARGET1(%i+jj) += partial[i*num_inducing+j]*dk_d%s(%s);'%(self.num_shared_params+i*self.output_dim, theta.name, reverse_arg_string) for i, theta in enumerate(self._sp_theta_i)]
grad_func_list += ([' '*16 + 'TARGET1(%i) += partial[i*num_inducing+j]*dk_d%s(%s);'%(i,theta.name,arg_string) for i,theta in enumerate(self._sp_theta)])
grad_func_string = '\n'.join(grad_func_list)
self._dK_dtheta_code =\
"""
// _dK_dtheta_code
// Code for computing gradient of covariance with respect to parameters.
int i;
int j;
int N = partial_array->dimensions[0];
int num_inducing = partial_array->dimensions[1];
int input_dim = X_array->dimensions[1];
//#pragma omp parallel for private(j)
for (i=0;i<N;i++){
for (j=0;j<num_inducing;j++){
%s
}
}
%s
"""%(grad_func_string,"/*"+str(self._sp_k)+"*/") # adding a string representation forces recompile when needed
# Code to compute gradients for Kdiag TODO: needs clean up
diag_grad_func_string = re.sub('Z','X',grad_func_string,count=0)
diag_grad_func_string = re.sub('int jj','//int jj',diag_grad_func_string)
diag_grad_func_string = re.sub('j','i',diag_grad_func_string)
diag_grad_func_string = re.sub('partial\[i\*num_inducing\+i\]','partial[i]',diag_grad_func_string)
self._dKdiag_dtheta_code =\
"""
// _dKdiag_dtheta_code
// Code for computing gradient of diagonal with respect to parameters.
int i;
int N = partial_array->dimensions[0];
int input_dim = X_array->dimensions[1];
for (i=0;i<N;i++){
%s
}
%s
"""%(diag_grad_func_string,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
# Code for gradients wrt X, TODO: may need to deal with special case where one input is actually an output.
gradX_func_list = []
if self.output_dim>1:
gradX_func_list += c_define_output_indices
gradX_func_list += ["TARGET2(i, %i) += partial[i*num_inducing+j]*dk_dx_%i(%s);"%(q,q,arg_string) for q in range(self._real_input_dim)]
gradX_func_string = "\n".join(gradX_func_list)
self._dK_dX_code = \
"""
// _dK_dX_code
// Code for computing gradient of covariance with respect to inputs.
int i;
int j;
int N = partial_array->dimensions[0];
int num_inducing = partial_array->dimensions[1];
int input_dim = X_array->dimensions[1];
//#pragma omp parallel for private(j)
for (i=0;i<N; i++){
for (j=0; j<num_inducing; j++){
%s
}
}
%s
"""%(gradX_func_string,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
diag_gradX_func_string = re.sub('Z','X',gradX_func_string,count=0)
diag_gradX_func_string = re.sub('int jj','//int jj',diag_gradX_func_string)
diag_gradX_func_string = re.sub('j','i',diag_gradX_func_string)
diag_gradX_func_string = re.sub(r'partial\[i\*num_inducing\+i\]','2*partial[i]',diag_gradX_func_string)
# Code for gradients of Kdiag wrt X
self._dKdiag_dX_code= \
"""
// _dKdiag_dX_code
// Code for computing gradient of diagonal with respect to inputs.
int N = partial_array->dimensions[0];
int input_dim = X_array->dimensions[1];
for (int i=0;i<N; i++){
%s
}
%s
"""%(diag_gradX_func_string,"/*"+str(self._sp_k)+"*/") #adding a
# string representation forces recompile when needed Get rid
# of Zs in argument for diagonal. TODO: Why wasn't
# diag_func_string called here? Need to check that.
#self._dKdiag_dX_code = self._dKdiag_dX_code.replace('Z[j', 'X[i')
# Code to use when only X is provided.
# Chain the replacements for both spellings of the Z array so the later
# assignments do not clobber the earlier ones; the dK_dX contribution is
# doubled because X then appears in both arguments of the kernel.
self._K_code_X = self._K_code.replace('Z[', 'X[').replace('Z2(', 'X2(')
self._dK_dtheta_code_X = self._dK_dtheta_code.replace('Z[', 'X[').replace('Z2(', 'X2(')
self._dK_dX_code_X = self._dK_dX_code.replace('Z[', 'X[').replace('Z2(', 'X2(').replace('+= partial[', '+= 2*partial[')
#TODO: insert multiple functions here via string manipulation
#TODO: similar functions for psi_stats
def _get_arg_names(self, Z=None, partial=None):
arg_names = ['target','X']
for shared_params in self._sp_theta:
arg_names += [shared_params.name]
if Z is not None:
arg_names += ['Z']
if partial is not None:
arg_names += ['partial']
if self.output_dim>1:
arg_names += self._split_theta_names
arg_names += ['output_dim']
return arg_names
def _weave_inline(self, code, X, target, Z=None, partial=None):
output_dim = self.output_dim
# weave.inline resolves the names listed in arg_names from the local scope,
# so pull the current parameter values in as local variables first.
for shared_params in self._sp_theta:
locals()[shared_params.name] = getattr(self, shared_params.name)
for split_params in self._split_theta_names:
locals()[split_params] = getattr(self, split_params)
arg_names = self._get_arg_names(Z, partial)
weave.inline(code=code, arg_names=arg_names,**self.weave_kwargs)
def K(self,X,Z,target):
if Z is None:
self._weave_inline(self._K_code_X, X, target)
else:
self._weave_inline(self._K_code, X, target, Z)
def Kdiag(self,X,target):
self._weave_inline(self._Kdiag_code, X, target)
def _param_grad_helper(self,partial,X,Z,target):
if Z is None:
self._weave_inline(self._dK_dtheta_code_X, X, target, Z, partial)
else:
self._weave_inline(self._dK_dtheta_code, X, target, Z, partial)
def dKdiag_dtheta(self,partial,X,target):
self._weave_inline(self._dKdiag_dtheta_code, X, target, Z=None, partial=partial)
def gradients_X(self,partial,X,Z,target):
if Z is None:
self._weave_inline(self._dK_dX_code_X, X, target, Z, partial)
else:
self._weave_inline(self._dK_dX_code, X, target, Z, partial)
def dKdiag_dX(self,partial,X,target):
self._weave_inline(self._dKdiag_dX_code, X, target, Z=None, partial=partial)
def compute_psi_stats(self):
#define some normal distributions
mus = [sp.var('mu_%i'%i,real=True) for i in range(self.input_dim)]
Ss = [sp.var('S_%i'%i,positive=True) for i in range(self.input_dim)]
normals = [(2*sp.pi*Si)**(-0.5)*sp.exp(-0.5*(xi-mui)**2/Si) for xi, mui, Si in zip(self._sp_x, mus, Ss)]
#do some integration!
#self._sp_psi0 = ??
self._sp_psi1 = self._sp_k
for i in range(self.input_dim):
print 'performing integrals %i of %i'%(i+1,2*self.input_dim)
sys.stdout.flush()
self._sp_psi1 *= normals[i]
self._sp_psi1 = sp.integrate(self._sp_psi1,(self._sp_x[i],-sp.oo,sp.oo))
clear_cache()
self._sp_psi1 = self._sp_psi1.simplify()
#and here's psi2 (eek!)
zprime = [sp.Symbol('zp%i'%i) for i in range(self.input_dim)]
self._sp_psi2 = self._sp_k.copy()*self._sp_k.copy().subs(zip(self._sp_z,zprime))
for i in range(self.input_dim):
print 'performing integrals %i of %i'%(self.input_dim+i+1,2*self.input_dim)
sys.stdout.flush()
self._sp_psi2 *= normals[i]
self._sp_psi2 = sp.integrate(self._sp_psi2,(self._sp_x[i],-sp.oo,sp.oo))
clear_cache()
self._sp_psi2 = self._sp_psi2.simplify()
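# In other words, the two loops above evaluate, one input dimension at a
# time, the expectations of the kernel under the factorised Gaussian q(x)
# defined by mus and Ss:
#     psi1(z)     = \int k(x, z) \prod_i N(x_i | mu_i, S_i) dx
#     psi2(z, z') = \int k(x, z) k(x, z') \prod_i N(x_i | mu_i, S_i) dx
# (psi0 is left unimplemented above: self._sp_psi0 = ??)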
def _set_params(self,param):
assert param.size == (self.num_params)
for i, shared_params in enumerate(self._sp_theta):
setattr(self, shared_params.name, param[i])
if self.output_dim>1:
for i, split_params in enumerate(self._split_theta_names):
start = self.num_shared_params + i*self.output_dim
end = self.num_shared_params + (i+1)*self.output_dim
setattr(self, split_params, param[start:end])
def _get_params(self):
params = np.zeros(0)
for shared_params in self._sp_theta:
params = np.hstack((params, getattr(self, shared_params.name)))
if self.output_dim>1:
for split_params in self._split_theta_names:
params = np.hstack((params, getattr(self, split_params).flatten()))
return params
def _get_param_names(self):
if self.output_dim>1:
return [x.name for x in self._sp_theta] + [x.name[:-2] + str(i) for x in self._sp_theta_i for i in range(self.output_dim)]
else:
return [x.name for x in self._sp_theta]
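# A hedged usage sketch of the covariance code generated above (the names
# `spkern`, `X`, `Z` and `dL_dK` are placeholders, since the constructor is
# not shown in this listing): every generated snippet writes into a
# preallocated target array supplied by the caller, e.g.
#
#     K = np.zeros((X.shape[0], Z.shape[0]))
#     spkern.K(X, Z, K)            # TARGET2(i, j) += k(...), so K must start at zero
#     Kdiag = np.zeros(X.shape[0])
#     spkern.Kdiag(X, Kdiag)       # TARGET1(i) = k(...)
#     grad = np.zeros(spkern.num_params)
#     spkern._param_grad_helper(dL_dK, X, Z, grad)   # accumulates dL/dtheta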

View file

@ -8,7 +8,7 @@ from ..core import SparseGP
from ..likelihoods import Gaussian from ..likelihoods import Gaussian
from ..inference.optimization import SCG from ..inference.optimization import SCG
from ..util import linalg from ..util import linalg
from ..core.parameterization.variational import Normal from ..core.parameterization.variational import NormalPosterior, NormalPrior
class BayesianGPLVM(SparseGP, GPLVM): class BayesianGPLVM(SparseGP, GPLVM):
""" """
@ -29,18 +29,20 @@ class BayesianGPLVM(SparseGP, GPLVM):
self.init = init self.init = init
if X_variance is None: if X_variance is None:
X_variance = np.clip((np.ones_like(X) * 0.5) + .01 * np.random.randn(*X.shape), 0.001, 1) X_variance = np.random.uniform(0,.1,X.shape)
if Z is None: if Z is None:
Z = np.random.permutation(X.copy())[:num_inducing] Z = np.random.permutation(X.copy())[:num_inducing]
assert Z.shape[1] == X.shape[1] assert Z.shape[1] == X.shape[1]
if kernel is None: if kernel is None:
kernel = kern.rbf(input_dim) # + kern.white(input_dim) kernel = kern.RBF(input_dim) # + kern.white(input_dim)
if likelihood is None: if likelihood is None:
likelihood = Gaussian() likelihood = Gaussian()
self.q = Normal(X, X_variance) self.q = NormalPosterior(X, X_variance)
self.variational_prior = NormalPrior()
SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, X_variance, name, **kwargs) SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, X_variance, name, **kwargs)
self.add_parameter(self.q, index=0) self.add_parameter(self.q, index=0)
#self.ensure_default_constraints() #self.ensure_default_constraints()
@ -57,34 +59,15 @@ class BayesianGPLVM(SparseGP, GPLVM):
self.init = state.pop() self.init = state.pop()
SparseGP._setstate(self, state) SparseGP._setstate(self, state)
def dL_dmuS(self):
dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi0_dmuS(self.grad_dict['dL_dpsi0'], self.Z, self.X, self.X_variance)
dL_dmu_psi1, dL_dS_psi1 = self.kern.dpsi1_dmuS(self.grad_dict['dL_dpsi1'], self.Z, self.X, self.X_variance)
dL_dmu_psi2, dL_dS_psi2 = self.kern.dpsi2_dmuS(self.grad_dict['dL_dpsi2'], self.Z, self.X, self.X_variance)
dL_dmu = dL_dmu_psi0 + dL_dmu_psi1 + dL_dmu_psi2
dL_dS = dL_dS_psi0 + dL_dS_psi1 + dL_dS_psi2
return dL_dmu, dL_dS
def KL_divergence(self):
var_mean = np.square(self.X).sum()
var_S = np.sum(self.X_variance - np.log(self.X_variance))
return 0.5 * (var_mean + var_S) - 0.5 * self.input_dim * self.num_data
def parameters_changed(self): def parameters_changed(self):
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.X_variance, self.Z, self.likelihood, self.Y) super(BayesianGPLVM, self).parameters_changed()
self._update_gradients_Z(add=False) self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.q)
self._log_marginal_likelihood -= self.KL_divergence() self.kern.update_gradients_q_variational(posterior_variational=self.q, Z=self.Z, **self.grad_dict)
dL_dmu, dL_dS = self.dL_dmuS()
# dL: # update for the KL divergence
self.q.mean.gradient = dL_dmu self.variational_prior.update_gradients_KL(self.q)
self.q.variance.gradient = dL_dS
# dKL:
self.q.mean.gradient -= self.X
self.q.variance.gradient -= (1. - (1. / (self.X_variance))) * 0.5
def plot_latent(self, plot_inducing=True, *args, **kwargs): def plot_latent(self, plot_inducing=True, *args, **kwargs):
""" """
@ -157,6 +140,7 @@ class BayesianGPLVM(SparseGP, GPLVM):
""" """
See GPy.plotting.matplot_dep.dim_reduction_plots.plot_steepest_gradient_map See GPy.plotting.matplot_dep.dim_reduction_plots.plot_steepest_gradient_map
""" """
import sys
assert "matplotlib" in sys.modules, "matplotlib package has not been imported." assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ..plotting.matplot_dep import dim_reduction_plots from ..plotting.matplot_dep import dim_reduction_plots

View file

@ -23,7 +23,7 @@ class GPRegression(GP):
def __init__(self, X, Y, kernel=None): def __init__(self, X, Y, kernel=None):
if kernel is None: if kernel is None:
kernel = kern.rbf(X.shape[1]) kernel = kern.RBF(X.shape[1])
likelihood = likelihoods.Gaussian() likelihood = likelihoods.Gaussian()

View file

@ -7,9 +7,25 @@ from GPy.util.linalg import PCA
import numpy import numpy
import itertools import itertools
import pylab import pylab
from GPy.kern.kern import kern from GPy.kern import Kern
from GPy.models.bayesian_gplvm import BayesianGPLVM from GPy.models.bayesian_gplvm import BayesianGPLVM
class MRD2(Model):
"""
Apply MRD to all given datasets Y in Ylist.
Y_i in [n x p_i]
The samples n in the datasets need
to match up, whereas the dimensionality p_d can differ.
:param [array-like] Ylist: List of datasets to apply MRD on
:param array-like q_mean: mean of starting latent space q in [n x q]
:param array-like q_variance: variance of starting latent space q in [n x q]
:param :class:`~GPy.inference.latent_function_inference
"""
class MRD(Model): class MRD(Model):
""" """
Do MRD on given Datasets in Ylist. Do MRD on given Datasets in Ylist.
@ -48,11 +64,11 @@ class MRD(Model):
# sort out the kernels # sort out the kernels
if kernels is None: if kernels is None:
kernels = [None] * len(likelihood_or_Y_list) kernels = [None] * len(likelihood_or_Y_list)
elif isinstance(kernels, kern): elif isinstance(kernels, Kern):
kernels = [kernels.copy() for i in range(len(likelihood_or_Y_list))] kernels = [kernels.copy() for i in range(len(likelihood_or_Y_list))]
else: else:
assert len(kernels) == len(likelihood_or_Y_list), "need one kernel per output" assert len(kernels) == len(likelihood_or_Y_list), "need one kernel per output"
assert all([isinstance(k, kern) for k in kernels]), "invalid kernel object detected!" assert all([isinstance(k, Kern) for k in kernels]), "invalid kernel object detected!"
assert not ('kernel' in kw), "pass kernels through `kernels` argument" assert not ('kernel' in kw), "pass kernels through `kernels` argument"
self.input_dim = input_dim self.input_dim = input_dim

View file

@ -1,8 +1,8 @@
import pylab as pb import pylab as pb
import numpy as np import numpy as np
from ... import util
from latent_space_visualizations.controllers.imshow_controller import ImshowController,ImAnnotateController from latent_space_visualizations.controllers.imshow_controller import ImshowController,ImAnnotateController
from GPy.util.misc import param_to_array from ...util.misc import param_to_array
from .base_plots import x_frame2D
import itertools import itertools
import Tango import Tango
from matplotlib.cm import get_cmap from matplotlib.cm import get_cmap
@ -37,7 +37,7 @@ def plot_latent(model, labels=None, which_indices=None,
if ax is None: if ax is None:
fig = pb.figure(num=fignum) fig = pb.figure(num=fignum)
ax = fig.add_subplot(111) ax = fig.add_subplot(111)
util.plot.Tango.reset() Tango.reset()
if labels is None: if labels is None:
labels = np.ones(model.num_data) labels = np.ones(model.num_data)
@ -46,7 +46,7 @@ def plot_latent(model, labels=None, which_indices=None,
X = param_to_array(model.X) X = param_to_array(model.X)
# first, plot the output variance as a function of the latent space # first, plot the output variance as a function of the latent space
Xtest, xx, yy, xmin, xmax = util.plot.x_frame2D(X[:, [input_1, input_2]], resolution=resolution) Xtest, xx, yy, xmin, xmax = x_frame2D(X[:, [input_1, input_2]], resolution=resolution)
Xtest_full = np.zeros((Xtest.shape[0], model.X.shape[1])) Xtest_full = np.zeros((Xtest.shape[0], model.X.shape[1]))
def plot_function(x): def plot_function(x):
@ -87,7 +87,7 @@ def plot_latent(model, labels=None, which_indices=None,
else: else:
x = X[index, input_1] x = X[index, input_1]
y = X[index, input_2] y = X[index, input_2]
ax.scatter(x, y, marker=m, s=s, color=util.plot.Tango.nextMedium(), label=this_label) ax.scatter(x, y, marker=m, s=s, color=Tango.nextMedium(), label=this_label)
ax.set_xlabel('latent dimension %i' % input_1) ax.set_xlabel('latent dimension %i' % input_1)
ax.set_ylabel('latent dimension %i' % input_2) ax.set_ylabel('latent dimension %i' % input_2)
@ -120,7 +120,7 @@ def plot_magnification(model, labels=None, which_indices=None,
if ax is None: if ax is None:
fig = pb.figure(num=fignum) fig = pb.figure(num=fignum)
ax = fig.add_subplot(111) ax = fig.add_subplot(111)
util.plot.Tango.reset() Tango.reset()
if labels is None: if labels is None:
labels = np.ones(model.num_data) labels = np.ones(model.num_data)
@ -128,7 +128,7 @@ def plot_magnification(model, labels=None, which_indices=None,
input_1, input_2 = most_significant_input_dimensions(model, which_indices) input_1, input_2 = most_significant_input_dimensions(model, which_indices)
# first, plot the output variance as a function of the latent space # first, plot the output variance as a function of the latent space
Xtest, xx, yy, xmin, xmax = util.plot.x_frame2D(model.X[:, [input_1, input_2]], resolution=resolution) Xtest, xx, yy, xmin, xmax = x_frame2D(model.X[:, [input_1, input_2]], resolution=resolution)
Xtest_full = np.zeros((Xtest.shape[0], model.X.shape[1])) Xtest_full = np.zeros((Xtest.shape[0], model.X.shape[1]))
def plot_function(x): def plot_function(x):
@ -165,7 +165,7 @@ def plot_magnification(model, labels=None, which_indices=None,
else: else:
x = model.X[index, input_1] x = model.X[index, input_1]
y = model.X[index, input_2] y = model.X[index, input_2]
ax.scatter(x, y, marker=m, s=s, color=util.plot.Tango.nextMedium(), label=this_label) ax.scatter(x, y, marker=m, s=s, color=Tango.nextMedium(), label=this_label)
ax.set_xlabel('latent dimension %i' % input_1) ax.set_xlabel('latent dimension %i' % input_1)
ax.set_ylabel('latent dimension %i' % input_2) ax.set_ylabel('latent dimension %i' % input_2)
@ -205,7 +205,7 @@ def plot_steepest_gradient_map(model, fignum=None, ax=None, which_indices=None,
return dmu_dX[indices, argmax], np.array(labels)[argmax] return dmu_dX[indices, argmax], np.array(labels)[argmax]
if ax is None: if ax is None:
fig = pyplot.figure(num=fignum) fig = pb.figure(num=fignum)
ax = fig.add_subplot(111) ax = fig.add_subplot(111)
if data_labels is None: if data_labels is None:
@ -241,7 +241,7 @@ def plot_steepest_gradient_map(model, fignum=None, ax=None, which_indices=None,
ax.legend() ax.legend()
ax.figure.tight_layout() ax.figure.tight_layout()
if updates: if updates:
pyplot.show() pb.show()
clear = raw_input('Enter to continue') clear = raw_input('Enter to continue')
if clear.lower() in 'yes' or clear == '': if clear.lower() in 'yes' or clear == '':
controller.deactivate() controller.deactivate()

View file

@ -1,13 +1,12 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import sys
import numpy as np import numpy as np
import pylab as pb import pylab as pb
import Tango import Tango
from matplotlib.textpath import TextPath from matplotlib.textpath import TextPath
from matplotlib.transforms import offset_copy from matplotlib.transforms import offset_copy
from ...kern.parts.linear import Linear from ...kern import Linear
def plot_ARD(kernel, fignum=None, ax=None, title='', legend=False): def plot_ARD(kernel, fignum=None, ax=None, title='', legend=False):
@ -29,7 +28,8 @@ def plot_ARD(kernel, fignum=None, ax=None, title='', legend=False):
xticklabels = [] xticklabels = []
bars = [] bars = []
x0 = 0 x0 = 0
for p in kernel._parameters_: #for p in kernel._parameters_:
p = kernel
c = Tango.nextMedium() c = Tango.nextMedium()
if hasattr(p, 'ARD') and p.ARD: if hasattr(p, 'ARD') and p.ARD:
if title is None: if title is None:
@ -40,9 +40,9 @@ def plot_ARD(kernel, fignum=None, ax=None, title='', legend=False):
ard_params = p.variances ard_params = p.variances
else: else:
ard_params = 1. / p.lengthscale ard_params = 1. / p.lengthscale
x = np.arange(x0, x0 + len(ard_params)) x = np.arange(x0, x0 + len(ard_params))
bars.append(ax.bar(x, ard_params, align='center', color=c, edgecolor='k', linewidth=1.2, label=p.name.replace("_"," "))) from ...util.misc import param_to_array
bars.append(ax.bar(x, param_to_array(ard_params), align='center', color=c, edgecolor='k', linewidth=1.2, label=p.name.replace("_"," ")))
xticklabels.extend([r"$\mathrm{{{name}}}\ {x}$".format(name=p.name, x=i) for i in np.arange(len(ard_params))]) xticklabels.extend([r"$\mathrm{{{name}}}\ {x}$".format(name=p.name, x=i) for i in np.arange(len(ard_params))])
x0 += len(ard_params) x0 += len(ard_params)
x = np.arange(x0) x = np.arange(x0)

View file

@ -9,7 +9,7 @@ from ...util.misc import param_to_array
def plot_fit(model, plot_limits=None, which_data_rows='all', def plot_fit(model, plot_limits=None, which_data_rows='all',
which_data_ycols='all', which_parts='all', fixed_inputs=[], which_data_ycols='all', fixed_inputs=[],
levels=20, samples=0, fignum=None, ax=None, resolution=None, levels=20, samples=0, fignum=None, ax=None, resolution=None,
plot_raw=False, plot_raw=False,
linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']): linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
@ -20,7 +20,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
- In higher dimensions, use fixed_inputs to plot the GP with some of the inputs fixed. - In higher dimensions, use fixed_inputs to plot the GP with some of the inputs fixed.
Can plot only part of the data and part of the posterior functions Can plot only part of the data and part of the posterior functions
using which_data_rowsm which_data_ycols and which_parts using which_data_rowsm which_data_ycols.
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
:type plot_limits: np.array :type plot_limits: np.array
@ -28,8 +28,6 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
:type which_data_rows: 'all' or a slice object to slice model.X, model.Y :type which_data_rows: 'all' or a slice object to slice model.X, model.Y
:param which_data_ycols: when the data has several columns (independant outputs), only plot these :param which_data_ycols: when the data has several columns (independant outputs), only plot these
:type which_data_rows: 'all' or a list of integers :type which_data_rows: 'all' or a list of integers
:param which_parts: which of the kernel functions to plot (additively)
:type which_parts: 'all', or list of bools
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v. :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
:type fixed_inputs: a list of tuples :type fixed_inputs: a list of tuples
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
@ -59,6 +57,9 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
fig = pb.figure(num=fignum) fig = pb.figure(num=fignum)
ax = fig.add_subplot(111) ax = fig.add_subplot(111)
X, Y, Z = param_to_array(model.X, model.Y, model.Z)
if model.has_uncertain_inputs(): X_variance = param_to_array(model.q.variance)
#work out what the inputs are for plotting (1D or 2D) #work out what the inputs are for plotting (1D or 2D)
fixed_dims = np.array([i for i,v in fixed_inputs]) fixed_dims = np.array([i for i,v in fixed_inputs])
free_dims = np.setdiff1d(np.arange(model.input_dim),fixed_dims) free_dims = np.setdiff1d(np.arange(model.input_dim),fixed_dims)
@ -68,7 +69,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
#define the frame on which to plot #define the frame on which to plot
resolution = resolution or 200 resolution = resolution or 200
Xnew, xmin, xmax = x_frame1D(model.X[:,free_dims], plot_limits=plot_limits) Xnew, xmin, xmax = x_frame1D(X[:,free_dims], plot_limits=plot_limits)
Xgrid = np.empty((Xnew.shape[0],model.input_dim)) Xgrid = np.empty((Xnew.shape[0],model.input_dim))
Xgrid[:,free_dims] = Xnew Xgrid[:,free_dims] = Xnew
for i,v in fixed_inputs: for i,v in fixed_inputs:
@ -76,30 +77,30 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
#make a prediction on the frame and plot it #make a prediction on the frame and plot it
if plot_raw: if plot_raw:
m, v = model._raw_predict(Xgrid, which_parts=which_parts) m, v = model._raw_predict(Xgrid)
lower = m - 2*np.sqrt(v) lower = m - 2*np.sqrt(v)
upper = m + 2*np.sqrt(v) upper = m + 2*np.sqrt(v)
Y = model.Y Y = Y
else: else:
m, v, lower, upper = model.predict(Xgrid, which_parts=which_parts) m, v, lower, upper = model.predict(Xgrid)
Y = model.Y Y = Y
for d in which_data_ycols: for d in which_data_ycols:
gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol) gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
ax.plot(model.X[which_data_rows,free_dims], Y[which_data_rows, d], 'kx', mew=1.5) ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], 'kx', mew=1.5)
#optionally plot some samples #optionally plot some samples
if samples: #NOTE not tested with fixed_inputs if samples: #NOTE not tested with fixed_inputs
Ysim = model.posterior_samples(Xgrid, samples, which_parts=which_parts) Ysim = model.posterior_samples(Xgrid, samples)
for yi in Ysim.T: for yi in Ysim.T:
ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25) ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
#ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs. #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
#add error bars for uncertain (if input uncertainty is being modelled) #add error bars for uncertain (if input uncertainty is being modelled)
if hasattr(model,"has_uncertain_inputs"): #if hasattr(model,"has_uncertain_inputs") and model.has_uncertain_inputs():
ax.errorbar(model.X[which_data, free_dims], model.likelihood.data[which_data, 0], # ax.errorbar(X[which_data_rows, free_dims].flatten(), Y[which_data_rows, which_data_ycols].flatten(),
xerr=2 * np.sqrt(model.X_variance[which_data, free_dims]), # xerr=2 * np.sqrt(X_variance[which_data_rows, free_dims].flatten()),
ecolor='k', fmt=None, elinewidth=.5, alpha=.5) # ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
#set the limits of the plot to some sensible values #set the limits of the plot to some sensible values
@ -111,7 +112,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
#add inducing inputs (if a sparse model is used) #add inducing inputs (if a sparse model is used)
if hasattr(model,"Z"): if hasattr(model,"Z"):
#Zu = model.Z[:,free_dims] * model._Xscale[:,free_dims] + model._Xoffset[:,free_dims] #Zu = model.Z[:,free_dims] * model._Xscale[:,free_dims] + model._Xoffset[:,free_dims]
Zu = param_to_array(model.Z[:,free_dims]) Zu = Z[:,free_dims]
z_height = ax.get_ylim()[0] z_height = ax.get_ylim()[0]
ax.plot(Zu, np.zeros_like(Zu) + z_height, 'r|', mew=1.5, markersize=12) ax.plot(Zu, np.zeros_like(Zu) + z_height, 'r|', mew=1.5, markersize=12)
@ -122,7 +123,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
#define the frame for plotting on #define the frame for plotting on
resolution = resolution or 50 resolution = resolution or 50
Xnew, _, _, xmin, xmax = x_frame2D(model.X[:,free_dims], plot_limits, resolution) Xnew, _, _, xmin, xmax = x_frame2D(X[:,free_dims], plot_limits, resolution)
Xgrid = np.empty((Xnew.shape[0],model.input_dim)) Xgrid = np.empty((Xnew.shape[0],model.input_dim))
Xgrid[:,free_dims] = Xnew Xgrid[:,free_dims] = Xnew
for i,v in fixed_inputs: for i,v in fixed_inputs:
@ -131,15 +132,15 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
#predict on the frame and plot #predict on the frame and plot
if plot_raw: if plot_raw:
m, _ = model._raw_predict(Xgrid, which_parts=which_parts) m, _ = model._raw_predict(Xgrid)
Y = model.Y Y = Y
else: else:
m, _, _, _ = model.predict(Xgrid, which_parts=which_parts) m, _, _, _ = model.predict(Xgrid)
Y = model.data Y = Y
for d in which_data_ycols: for d in which_data_ycols:
m_d = m[:,d].reshape(resolution, resolution).T m_d = m[:,d].reshape(resolution, resolution).T
ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
ax.scatter(model.X[which_data_rows, free_dims[0]], model.X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
#set the limits of the plot to some sensible values #set the limits of the plot to some sensible values
ax.set_xlim(xmin[0], xmax[0]) ax.set_xlim(xmin[0], xmax[0])
@ -151,7 +152,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
#add inducing inputs (if a sparse model is used) #add inducing inputs (if a sparse model is used)
if hasattr(model,"Z"): if hasattr(model,"Z"):
#Zu = model.Z[:,free_dims] * model._Xscale[:,free_dims] + model._Xoffset[:,free_dims] #Zu = model.Z[:,free_dims] * model._Xscale[:,free_dims] + model._Xoffset[:,free_dims]
Zu = model.Z[:,free_dims] Zu = Z[:,free_dims]
ax.plot(Zu[:,free_dims[0]], Zu[:,free_dims[1]], 'wo') ax.plot(Zu[:,free_dims[0]], Zu[:,free_dims[1]], 'wo')
else: else:

View file

@ -24,6 +24,18 @@ class Test(unittest.TestCase):
self.param_index.remove(one, [1]) self.param_index.remove(one, [1])
self.assertListEqual(self.param_index[one].tolist(), [3]) self.assertListEqual(self.param_index[one].tolist(), [3])
def test_shift_left(self):
self.param_index.shift_left(1, 2)
self.assertListEqual(self.param_index[three].tolist(), [2,5])
self.assertListEqual(self.param_index[two].tolist(), [0,3])
self.assertListEqual(self.param_index[one].tolist(), [1])
def test_shift_right(self):
self.param_index.shift_right(5, 2)
self.assertListEqual(self.param_index[three].tolist(), [2,4,9])
self.assertListEqual(self.param_index[two].tolist(), [0,7])
self.assertListEqual(self.param_index[one].tolist(), [3])
def test_index_view(self): def test_index_view(self):
#======================================================================= #=======================================================================
# 0 1 2 3 4 5 6 7 8 9 # 0 1 2 3 4 5 6 7 8 9

View file

@ -10,8 +10,8 @@ import numpy as np
class Test(unittest.TestCase): class Test(unittest.TestCase):
def setUp(self): def setUp(self):
self.rbf = GPy.kern.rbf(1) self.rbf = GPy.kern.RBF(1)
self.white = GPy.kern.white(1) self.white = GPy.kern.White(1)
from GPy.core.parameterization import Param from GPy.core.parameterization import Param
from GPy.core.parameterization.transformations import Logistic from GPy.core.parameterization.transformations import Logistic
self.param = Param('param', np.random.rand(25,2), Logistic(0, 1)) self.param = Param('param', np.random.rand(25,2), Logistic(0, 1))
@ -39,14 +39,13 @@ class Test(unittest.TestCase):
def test_remove_parameter(self): def test_remove_parameter(self):
from GPy.core.parameterization.transformations import FIXED, UNFIXED, __fixed__ from GPy.core.parameterization.transformations import FIXED, UNFIXED, __fixed__, Logexp
self.white.fix() self.white.fix()
self.test1.remove_parameter(self.white) self.test1.remove_parameter(self.white)
self.assertIs(self.test1._fixes_,None) self.assertIs(self.test1._fixes_,None)
self.assertListEqual(self.white._fixes_.tolist(), [FIXED]) self.assertListEqual(self.white._fixes_.tolist(), [FIXED])
self.assertIs(self.white.constraints,self.white.white.constraints._param_index_ops) self.assertEquals(self.white.constraints._offset, 0)
self.assertEquals(self.white.white.constraints._offset, 0)
self.assertIs(self.test1.constraints, self.rbf.constraints._param_index_ops) self.assertIs(self.test1.constraints, self.rbf.constraints._param_index_ops)
self.assertIs(self.test1.constraints, self.param.constraints._param_index_ops) self.assertIs(self.test1.constraints, self.param.constraints._param_index_ops)
@ -57,18 +56,19 @@ class Test(unittest.TestCase):
self.assertListEqual(self.test1.constraints[__fixed__].tolist(), [0]) self.assertListEqual(self.test1.constraints[__fixed__].tolist(), [0])
self.assertIs(self.white._fixes_,None) self.assertIs(self.white._fixes_,None)
self.assertListEqual(self.test1._fixes_.tolist(),[FIXED] + [UNFIXED] * 52) self.assertListEqual(self.test1._fixes_.tolist(),[FIXED] + [UNFIXED] * 52)
self.test1.remove_parameter(self.white) self.test1.remove_parameter(self.white)
self.assertIs(self.test1._fixes_,None) self.assertIs(self.test1._fixes_,None)
self.assertListEqual(self.white._fixes_.tolist(), [FIXED]) self.assertListEqual(self.white._fixes_.tolist(), [FIXED])
self.assertIs(self.white.constraints,self.white.white.constraints._param_index_ops)
self.assertIs(self.test1.constraints, self.rbf.constraints._param_index_ops) self.assertIs(self.test1.constraints, self.rbf.constraints._param_index_ops)
self.assertIs(self.test1.constraints, self.param.constraints._param_index_ops) self.assertIs(self.test1.constraints, self.param.constraints._param_index_ops)
self.assertListEqual(self.test1.constraints[Logexp()].tolist(), [0,1])
def test_add_parameter_already_in_hirarchy(self): def test_add_parameter_already_in_hirarchy(self):
self.test1.add_parameter(self.white._parameters_[0]) self.test1.add_parameter(self.white._parameters_[0])
def test_default_constraints(self): def test_default_constraints(self):
self.assertIs(self.rbf.rbf.variance.constraints._param_index_ops, self.rbf.constraints._param_index_ops) self.assertIs(self.rbf.variance.constraints._param_index_ops, self.rbf.constraints._param_index_ops)
self.assertIs(self.test1.constraints, self.rbf.constraints._param_index_ops) self.assertIs(self.test1.constraints, self.rbf.constraints._param_index_ops)
self.assertListEqual(self.rbf.constraints.indices()[0].tolist(), range(2)) self.assertListEqual(self.rbf.constraints.indices()[0].tolist(), range(2))
from GPy.core.parameterization.transformations import Logexp from GPy.core.parameterization.transformations import Logexp

View file

@ -12,6 +12,7 @@ import decorators
import classification import classification
import subarray_and_sorting import subarray_and_sorting
import caching import caching
import diag
try: try:
import sympy import sympy

View file

@ -1,44 +1,86 @@
from ..core.parameterization.array_core import ObservableArray, ParamList from ..core.parameterization.parameter_core import Observable
class Cacher(object): class Cacher(object):
def __init__(self, operation, limit=5): def __init__(self, operation, limit=5, reset_on_first=False):
self.limit = int(limit) self.limit = int(limit)
self._reset_on_first = reset_on_first
self.operation=operation self.operation=operation
self.cached_inputs = ParamList([]) self.cached_inputs = []
self.cached_outputs = [] self.cached_outputs = []
self.inputs_changed = [] self.inputs_changed = []
def __call__(self, X): def __call__(self, *args):
assert isinstance(X, ObservableArray) if self._reset_on_first:
if X in self.cached_inputs: assert isinstance(args[0], Observable)
i = self.cached_inputs.index(X) args[0].add_observer(self, self.reset)
cached_args = args
else:
cached_args = args[1:]
if not all([isinstance(arg, Observable) for arg in cached_args]):
return self.operation(*args)
if cached_args in self.cached_inputs:
i = self.cached_inputs.index(cached_args)
if self.inputs_changed[i]: if self.inputs_changed[i]:
self.cached_outputs[i] = self.operation(X) self.cached_outputs[i] = self.operation(*args)
self.inputs_changed[i] = False self.inputs_changed[i] = False
return self.cached_outputs[i] return self.cached_outputs[i]
else: else:
if len(self.cached_inputs) == self.limit: if len(self.cached_inputs) == self.limit:
X_ = self.cached_inputs.pop(0) args_ = self.cached_inputs.pop(0)
X_.remove_observer(self) [a.remove_observer(self, self.on_cache_changed) for a in args_]
self.inputs_changed.pop(0) self.inputs_changed.pop(0)
self.cached_outputs.pop(0) self.cached_outputs.pop(0)
self.cached_inputs.append(X) self.cached_inputs.append(cached_args)
self.cached_outputs.append(self.operation(X)) self.cached_outputs.append(self.operation(*args))
self.inputs_changed.append(False) self.inputs_changed.append(False)
X.add_observer(self, self.on_cache_changed) [a.add_observer(self, self.on_cache_changed) for a in args]
return self.cached_outputs[-1] return self.cached_outputs[-1]
def on_cache_changed(self, X): def on_cache_changed(self, arg):
#print id(X) self.inputs_changed = [any([a is arg for a in args]) or old_ic for args, old_ic in zip(self.cached_inputs, self.inputs_changed)]
Xbase = X
while Xbase is not None: def reset(self, obj):
try: [[a.remove_observer(self, self.reset) for a in args] for args in self.cached_inputs]
i = self.cached_inputs.index(X) self.cached_inputs = []
break self.cached_outputs = []
except ValueError: self.inputs_changed = []
Xbase = X.base
continue
self.inputs_changed[i] = True
def cache_this(limit=5, reset_on_self=False):
def limited_cache(f):
c = Cacher(f, limit, reset_on_first=reset_on_self)
def f_wrap(*args):
return c(*args)
f_wrap._cacher = c
return f_wrap
return limited_cache
#Xbase = X
#while Xbase is not None:
#try:
#i = self.cached_inputs.index(X)
#break
#except ValueError:
#Xbase = X.base
#continue
#self.inputs_changed[i] = True
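A short usage sketch of the cache_this decorator added above (the import path GPy.util.caching is an assumption; results are only cached when the cached arguments are Observable, otherwise the call falls straight through to the wrapped function):

import numpy as np
from GPy.util.caching import cache_this  # assumed location of the module above

class ToyKern(object):
    @cache_this(limit=3)
    def K(self, X):
        # pretend this is expensive; it is only re-run when an Observable X changes
        return np.dot(X, X.T)

k = ToyKern()
X = np.random.randn(10, 2)
K1 = k.K(X)  # plain ndarrays are not Observable, so this call is computed directly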

View file

@ -513,8 +513,8 @@ def toy_rbf_1d(seed=default_seed, num_samples=500):
num_in = 1 num_in = 1
X = np.random.uniform(low= -1.0, high=1.0, size=(num_samples, num_in)) X = np.random.uniform(low= -1.0, high=1.0, size=(num_samples, num_in))
X.sort(axis=0) X.sort(axis=0)
rbf = GPy.kern.rbf(num_in, variance=1., lengthscale=np.array((0.25,))) rbf = GPy.kern.RBF(num_in, variance=1., lengthscale=np.array((0.25,)))
white = GPy.kern.white(num_in, variance=1e-2) white = GPy.kern.White(num_in, variance=1e-2)
kernel = rbf + white kernel = rbf + white
K = kernel.K(X) K = kernel.K(X)
y = np.reshape(np.random.multivariate_normal(np.zeros(num_samples), K), (num_samples, 1)) y = np.reshape(np.random.multivariate_normal(np.zeros(num_samples), K), (num_samples, 1))

View file

@ -44,6 +44,12 @@ def view(A, offset=0):
else: else:
return as_strided(A, shape=(A.shape[0], ), strides=((A.shape[0]+1)*A.itemsize, )) return as_strided(A, shape=(A.shape[0], ), strides=((A.shape[0]+1)*A.itemsize, ))
def offdiag_view(A, offset=0):
from numpy.lib.stride_tricks import as_strided
assert A.ndim == 2, "only implemented for 2 dimensions"
Af = as_strided(A, shape=(A.size,), strides=(A.itemsize,))
return as_strided(Af[(1+offset):], shape=(A.shape[0]-1, A.shape[1]), strides=(A.strides[0] + A.itemsize, A.strides[1]))
def _diag_ufunc(A,b,offset,func): def _diag_ufunc(A,b,offset,func):
dA = view(A, offset); func(dA,b,dA) dA = view(A, offset); func(dA,b,dA)
return A return A
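A small self-contained check of the offdiag_view helper added above (its body is repeated here so the snippet runs on its own):

import numpy as np
from numpy.lib.stride_tricks import as_strided

def offdiag_view(A, offset=0):
    # writeable strided view containing every off-diagonal entry of A
    assert A.ndim == 2, "only implemented for 2 dimensions"
    Af = as_strided(A, shape=(A.size,), strides=(A.itemsize,))
    return as_strided(Af[(1 + offset):], shape=(A.shape[0] - 1, A.shape[1]),
                      strides=(A.strides[0] + A.itemsize, A.strides[1]))

A = np.arange(9.).reshape(3, 3)
print offdiag_view(A)     # [[ 1.  2.  3.] [ 5.  6.  7.]] -- everything except the diagonal
offdiag_view(A)[:] = 0.   # the view writes through to A, zeroing the off-diagonal entries
print np.diag(A)          # [ 0.  4.  8.] -- the diagonal is untouched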

View file

@ -3,8 +3,6 @@
import numpy as np import numpy as np
import scipy as sp
import pylab as plt
class WarpingFunction(object): class WarpingFunction(object):
""" """
@ -39,6 +37,7 @@ class WarpingFunction(object):
def plot(self, psi, xmin, xmax): def plot(self, psi, xmin, xmax):
y = np.arange(xmin, xmax, 0.01) y = np.arange(xmin, xmax, 0.01)
f_y = self.f(y, psi) f_y = self.f(y, psi)
from matplotlib import pyplot as plt
plt.figure() plt.figure()
plt.plot(y, f_y) plt.plot(y, f_y)
plt.xlabel('y') plt.xlabel('y')