Merge branch 'devel' of github.com:SheffieldML/GPy into devel

2026-05-15 06:52:39 +02:00 · 2014-05-15 18:47:26 +01:00 · 2014-05-15 18:47:26 +01:00 · 382645ff37
commit 382645ff37
parent f820e3b0bf 1851a821c7
35 changed files with 943 additions and 508 deletions
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@ -10,7 +10,7 @@ from model import Model
 from parameterization import ObsAr
 from .. import likelihoods
 from ..likelihoods.gaussian import Gaussian
-from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
+from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation, LatentFunctionInference
 from parameterization.variational import VariationalPosterior
 class GP(Model):
@ -21,6 +21,7 @@ class GP(Model):
    :param Y: output observations
    :param kernel: a GPy kernel, defaults to rbf+white
    :param likelihood: a GPy likelihood
    :param :class:`~GPy.inference.latent_function_inference.LatentFunctionInference` inference_method: The inference method to use for this GP
    :rtype: model object
    .. Note:: Multiple independent outputs are allowed using columns of Y
@ -220,3 +221,20 @@ class GP(Model):
        """
        return self.kern.input_sensitivity()
    def optimize(self, optimizer=None, start=None, **kwargs):
        """
        Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
        kwargs are passed to the optimizer. They can be:
        :param max_f_eval: maximum number of function evaluations
        :type max_f_eval: int
        :messages: whether to display during optimisation
        :type messages: bool
        :param optimizer: which optimizer to use (defaults to self.preferred optimizer)
        :type optimizer: string
        TODO: valid args
        """
        self.inference_method.on_optimization_start()
        super(GP, self).optimize(optimizer, start, **kwargs)
        self.inference_method.on_optimization_end()
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@ -220,7 +220,7 @@ class Model(Parameterized):
        if self.is_fixed:
            raise RuntimeError, "Cannot optimize, when everything is fixed"
        if self.size == 0:
-            raise RuntimeError, "Model without parameters cannot be minimized"
+            raise RuntimeError, "Model without parameters cannot be optimized"
        if optimizer is None:
            optimizer = self.preferred_optimizer
--- a/GPy/core/parameterization/lists_and_dicts.py
+++ b/GPy/core/parameterization/lists_and_dicts.py
@ -38,7 +38,12 @@ class ArrayList(list):
        raise ValueError, "{} is not in list".format(item)
    pass
-class ObservablesList(object):
+class ObserverList(object):
    """
    A list which containts the observables.
    It only holds weak references to observers, such that unbound
    observers dont dangle in memory.
    """
    def __init__(self):
        self._poc = []
@ -46,27 +51,30 @@ class ObservablesList(object):
        p,o,c = self._poc[ind]
        return p, o(), c
-    def remove(self, priority, observable, callble):
+    def remove(self, priority, observer, callble):
        """
        Remove one observer, which had priority and callble.
        """
        self.flush()
        for i in range(len(self) - 1, -1, -1):
            p,o,c = self[i]
-            if priority==p and observable==o and callble==c:
+            if priority==p and observer==o and callble==c:
                del self._poc[i]
    def __repr__(self):
        return self._poc.__repr__()
-    def add(self, priority, observable, callble):
+    def add(self, priority, observer, callble):
-        if observable is not None:
+        """
        Add an observer with priority and callble
        """
        if observer is not None:
            ins = 0
            for pr, _, _ in self:
                if priority > pr:
                    break
                ins += 1
-            self._poc.insert(ins, (priority, weakref.ref(observable), callble))
+            self._poc.insert(ins, (priority, weakref.ref(observer), callble))
    def __str__(self):
        ret = []
@ -83,6 +91,9 @@ class ObservablesList(object):
        return '\n'.join(ret)
    def flush(self):
        """
        Make sure all weak references, which point to nothing are flushed (deleted)
        """
        self._poc = [(p,o,c) for p,o,c in self._poc if o() is not None]
    def __iter__(self):
@ -95,7 +106,7 @@ class ObservablesList(object):
        return self._poc.__len__()
    def __deepcopy__(self, memo):
-        s = ObservablesList()
+        s = ObserverList()
        for p,o,c in self:
            import copy
            s.add(p, copy.deepcopy(o, memo), copy.deepcopy(c, memo))
--- a/GPy/core/parameterization/param.py
+++ b/GPy/core/parameterization/param.py
@ -4,7 +4,7 @@
 import itertools
 import numpy
 np = numpy
-from parameter_core import OptimizationHandlable, adjust_name_for_printing
+from parameter_core import Parameterizable, adjust_name_for_printing
 from observable_array import ObsAr
 ###### printing
@ -16,7 +16,7 @@ __precision__ = numpy.get_printoptions()['precision'] # numpy printing precision
 __print_threshold__ = 5
 ######
-class Param(OptimizationHandlable, ObsAr):
+class Param(Parameterizable, ObsAr):
    """
    Parameter object for GPy models.
@ -42,7 +42,7 @@ class Param(OptimizationHandlable, ObsAr):
    """
    __array_priority__ = -1 # Never give back Param
    _fixes_ = None
-    _parameters_ = []
+    parameters = []
    def __new__(cls, name, input_array, default_constraint=None):
        obj = numpy.atleast_1d(super(Param, cls).__new__(cls, input_array=input_array))
        obj._current_slice_ = (slice(obj.shape[0]),)
@ -87,6 +87,9 @@ class Param(OptimizationHandlable, ObsAr):
    @property
    def param_array(self):
        """
        As we are a leaf, this just returns self
        """
        return self
    @property
@ -139,6 +142,9 @@ class Param(OptimizationHandlable, ObsAr):
    def _raveled_index_for(self, obj):
        return self._raveled_index()
    #===========================================================================
    # Index recreation
    #===========================================================================
    def _expand_index(self, slice_index=None):
        # this calculates the full indexing arrays from the slicing objects given by get_item for _real..._ attributes
        # it basically translates slices to their respective index arrays and turns negative indices around
@ -147,6 +153,8 @@ class Param(OptimizationHandlable, ObsAr):
            slice_index = self._current_slice_
        def f(a):
            a, b = a
            if isinstance(a, numpy.ndarray) and a.dtype == bool:
                raise ValueError, "Boolean indexing not implemented, use Param[np.where(index)] to index by boolean arrays!"
            if a not in (slice(None), Ellipsis):
                if isinstance(a, slice):
                    start, stop, step = a.indices(b)
@ -175,15 +183,17 @@ class Param(OptimizationHandlable, ObsAr):
        This will function will just call visit on self, as Param are leaf nodes.
        """
        self.__visited = True
        visit(self, *args, **kwargs)
-    
+        self.__visited = False
    def traverse_parents(self, visit, *args, **kwargs):
        """
        Traverse the hierarchy upwards, visiting all parents and their children, except self.
        See "visitor pattern" in literature. This is implemented in pre-order fashion.
-    
+
        Example:
-    
+
        parents = []
        self.traverse_parents(parents.append)
        print parents
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@ -17,7 +17,7 @@ from transformations import Logexp, NegativeLogexp, Logistic, __fixed__, FIXED,
 import numpy as np
 import re
-__updated__ = '2014-05-12'
+__updated__ = '2014-05-15'
 class HierarchyError(Exception):
    """
@ -52,13 +52,23 @@ class Observable(object):
    _updated = True
    def __init__(self, *args, **kwargs):
        super(Observable, self).__init__()
-        from lists_and_dicts import ObservablesList
+        from lists_and_dicts import ObserverList
-        self.observers = ObservablesList()
+        self.observers = ObserverList()
    def add_observer(self, observer, callble, priority=0):
        """
        Add an observer `observer` with the callback `callble` 
        and priority `priority` to this observers list.
        """
        self.observers.add(priority, observer, callble)
    def remove_observer(self, observer, callble=None):
        """
        Either (if callble is None) remove all callables, 
        which were added alongside observer,
        or remove callable `callble` which was added alongside
        the observer `observer`.
        """
        to_remove = []
        for poc in self.observers:
            _, obs, clble = poc
@ -91,10 +101,6 @@ class Observable(object):
                    break
                callble(self, which=which)
 #===============================================================================
 # Foundation framework for parameterized and param objects:
 #===============================================================================
 class Parentable(object):
    """
    Enable an Object to have a parent.
@ -172,7 +178,11 @@ class Pickleable(object):
    # copy and pickling
    #===========================================================================
    def copy(self):
-        """Returns a (deep) copy of the current model"""
+        """
        Returns a (deep) copy of the current parameter handle. 
        All connections to parents of the copy will be cut.
        """
        #raise NotImplementedError, "Copy is not yet implemented, TODO: Observable hierarchy"
        import copy
        memo = {}
@ -196,12 +206,11 @@ class Pickleable(object):
        return s
    def __getstate__(self):
-        ignore_list = ([#'_parent_', '_parent_index_',
+        ignore_list = ['_param_array_', # parameters get set from bottom to top
-                        #'observers',
+                       '_gradient_array_', # as well as gradients
-                        '_param_array_', '_gradient_array_', '_fixes_',
+                       '_fixes_', # and fixes
-                        '_Cacher_wrap__cachers']
+                       '_Cacher_wrap__cachers', # never pickle cachers
-                       #+ self.parameter_names(recursive=False)
+                       ]
                       )
        dc = dict()
        for k,v in self.__dict__.iteritems():
            if k not in ignore_list:
@ -246,7 +255,6 @@ class Gradcheckable(Pickleable, Parentable):
        """
        raise HierarchyError, "This parameter is not in a model with a likelihood, and, therefore, cannot be gradient checked!"
 class Nameable(Gradcheckable):
    """
    Make an object nameable inside the hierarchy.
@ -285,41 +293,8 @@ class Nameable(Gradcheckable):
            return self._parent_.hierarchy_name() + "." + adjust(self.name)
        return adjust(self.name)
 class Indexable(object):
    """
    Enable enraveled indexes and offsets for this object.
    The raveled index of an object is the index for its parameters in a flattened int array.
    """
    def __init__(self, *a, **kw):
        super(Indexable, self).__init__()
-    def _raveled_index(self):
+class Indexable(Nameable, Observable):
        """
        Flattened array of ints, specifying the index of this object.
        This has to account for shaped parameters!
        """
        raise NotImplementedError, "Need to be able to get the raveled Index"
    def _offset_for(self, param):
        """
        Return the offset of the param inside this parameterized object.
        This does not need to account for shaped parameters, as it
        basically just sums up the parameter sizes which come before param.
        """
        return 0
        #raise NotImplementedError, "shouldnt happen, offset required from non parameterization object?"
    def _raveled_index_for(self, param):
        """
        get the raveled index for a param
        that is an int array, containing the indexes for the flattened
        param inside this parameterized logic.
        """
        return param._raveled_index()
        #raise NotImplementedError, "shouldnt happen, raveld index transformation required from non parameterization object?"
 class Constrainable(Nameable, Indexable, Observable):
    """
    Make an object constrainable with Priors and Transformations.
    TODO: Mappings!!
@ -330,7 +305,7 @@ class Constrainable(Nameable, Indexable, Observable):
    :func:`constrain()` and :func:`unconstrain()` are main methods here
    """
    def __init__(self, name, default_constraint=None, *a, **kw):
-        super(Constrainable, self).__init__(name=name, *a, **kw)
+        super(Indexable, self).__init__(name=name, *a, **kw)
        self._default_constraint_ = default_constraint
        from index_operations import ParameterIndexOperations
        self.constraints = ParameterIndexOperations()
@ -352,6 +327,39 @@ class Constrainable(Nameable, Indexable, Observable):
        self._connect_fixes()
        self._notify_parent_change()
    #===========================================================================
    # Indexable
    #===========================================================================
    def _offset_for(self, param):
        """
        Return the offset of the param inside this parameterized object.
        This does not need to account for shaped parameters, as it
        basically just sums up the parameter sizes which come before param.
        """
        if param.has_parent():
            if param._parent_._get_original(param) in self.parameters:
                return self._param_slices_[param._parent_._get_original(param)._parent_index_].start
            return self._offset_for(param._parent_) + param._parent_._offset_for(param)
        return 0
    def _raveled_index_for(self, param):
        """
        get the raveled index for a param
        that is an int array, containing the indexes for the flattened
        param inside this parameterized logic.
        """
        from param import ParamConcatenation
        if isinstance(param, ParamConcatenation):
            return np.hstack((self._raveled_index_for(p) for p in param.params))
        return param._raveled_index() + self._offset_for(param)
    def _raveled_index(self):
        """
        Flattened array of ints, specifying the index of this object.
        This has to account for shaped parameters!
        """
        return np.r_[:self.size]
    #===========================================================================
    # Fixing Parameters:
    #===========================================================================
@ -406,9 +414,24 @@ class Constrainable(Nameable, Indexable, Observable):
            self._fixes_ = None
            del self.constraints[__fixed__]
    #===========================================================================
    # Convenience for fixed
    #===========================================================================
    def _has_fixes(self):
        return hasattr(self, "_fixes_") and self._fixes_ is not None and self._fixes_.size == self.size
    @property
    def is_fixed(self):
        for p in self.parameters:
            if not p.is_fixed: return False
        return True
    def _get_original(self, param):
        # if advanced indexing is activated it happens that the array is a copy
        # you can retrieve the original param through this method, by passing
        # the copy here
        return self.parameters[param._parent_index_]
    #===========================================================================
    # Prior Operations
    #===========================================================================
@ -432,8 +455,7 @@ class Constrainable(Nameable, Indexable, Observable):
    def unset_priors(self, *priors):
        """
-        Un-set all priors given from this parameter handle.
+        Un-set all priors given (in *priors) from this parameter handle.
        """
        return self._remove_from_index_operations(self.priors, priors)
@ -535,7 +557,7 @@ class Constrainable(Nameable, Indexable, Observable):
        self.constraints = ParameterIndexOperationsView(parent.constraints, parent._offset_for(self), self.size)
        self.priors = ParameterIndexOperationsView(parent.priors, parent._offset_for(self), self.size)
        self._fixes_ = None
-        for p in self._parameters_:
+        for p in self.parameters:
            p._parent_changed(parent)
    def _add_to_index_operations(self, which, reconstrained, what, warning):
@ -563,14 +585,13 @@ class Constrainable(Nameable, Indexable, Observable):
        removed = np.empty((0,), dtype=int)
        for t in transforms:
            unconstrained = which.remove(t, self._raveled_index())
            print unconstrained
            removed = np.union1d(removed, unconstrained)
            if t is __fixed__:
                self._highest_parent_._set_unfixed(self, unconstrained)
        return removed
-class OptimizationHandlable(Constrainable):
+class OptimizationHandlable(Indexable):
    """
    This enables optimization handles on an Object as done in GPy 0.4.
@ -580,7 +601,7 @@ class OptimizationHandlable(Constrainable):
        super(OptimizationHandlable, self).__init__(name, default_constraint=default_constraint, *a, **kw)
    def _get_params_transformed(self):
-        # transformed parameters (apply transformation rules)
+        # transformed parameters (apply un-transformation rules)
        p = self.param_array.copy()
        [np.put(p, ind, c.finv(p[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__]
        if self.has_parent() and self.constraints[__fixed__].size != 0:
@ -592,6 +613,11 @@ class OptimizationHandlable(Constrainable):
        return p
    def _set_params_transformed(self, p):
        """
        Set parameters p, but make sure they get transformed before setting.
        This means, the optimizer sees p, whereas the model sees transformed(p), 
        such that, the parameters the model sees are in the right domain.
        """
        if not(p is self.param_array):
            if self.has_parent() and self.constraints[__fixed__].size != 0:
                fixes = np.ones(self.size).astype(bool)
@ -604,12 +630,33 @@ class OptimizationHandlable(Constrainable):
        self._trigger_params_changed()
    def _trigger_params_changed(self, trigger_parent=True):
-        [p._trigger_params_changed(trigger_parent=False) for p in self._parameters_]
+        """
        First tell all children to update,
        then update yourself.
        If trigger_parent is True, we will tell the parent, otherwise not.
        """
        [p._trigger_params_changed(trigger_parent=False) for p in self.parameters]
        self.notify_observers(None, None if trigger_parent else -np.inf)
    def _size_transformed(self):
        """
        As fixes are not passed to the optimiser, the size of the model for the optimiser
        is the size of all parameters minus the size of the fixes.
        """
        return self.size - self.constraints[__fixed__].size
    def _transform_gradients(self, g):
        """
        Transform the gradients by multiplying the gradient factor for each
        constraint to it.
        """
        if self.has_parent():
            return g
        [np.put(g, i, g[i] * c.gradfactor(self.param_array[i])) for c, i in self.constraints.iteritems() if c != __fixed__]
        if self._has_fixes(): return g[self._fixes_]
        return g
    @property
    def num_params(self):
        """
@ -628,8 +675,8 @@ class OptimizationHandlable(Constrainable):
        """
        if adjust_for_printing: adjust = lambda x: adjust_name_for_printing(x)
        else: adjust = lambda x: x
-        if recursive: names = [xi for x in self._parameters_ for xi in x.parameter_names(add_self=True, adjust_for_printing=adjust_for_printing)]
+        if recursive: names = [xi for x in self.parameters for xi in x.parameter_names(add_self=True, adjust_for_printing=adjust_for_printing)]
-        else: names = [adjust(x.name) for x in self._parameters_]
+        else: names = [adjust(x.name) for x in self.parameters]
        if add_self: names = map(lambda x: adjust(self.name) + "." + x, names)
        return names
@ -651,7 +698,7 @@ class OptimizationHandlable(Constrainable):
        Randomize the model.
        Make this draw from the prior if one exists, else draw from given random generator
-        :param rand_gen: numpy random number generator which takes args and kwargs
+        :param rand_gen: np random number generator which takes args and kwargs
        :param flaot loc: loc parameter for random number generator
        :param float scale: scale parameter for random number generator
        :param args, kwargs: will be passed through to random number generator
@ -667,7 +714,7 @@ class OptimizationHandlable(Constrainable):
    # for all parameterized objects
    #===========================================================================
    @property
-    def full_gradient(self):
+    def gradient_full(self):
        """
        Note to users:
        This does not return the gradient in the right shape! Use self.gradient
@ -681,26 +728,43 @@ class OptimizationHandlable(Constrainable):
        return self._gradient_array_
    def _propagate_param_grad(self, parray, garray):
        """
        For propagating the param_array and gradient_array.
        This ensures the in memory view of each subsequent array.
        1.) connect param_array of children to self.param_array
        2.) tell all children to propagate further
        """
        pi_old_size = 0
-        for pi in self._parameters_:
+        for pi in self.parameters:
            pislice = slice(pi_old_size, pi_old_size + pi.size)
            self.param_array[pislice] = pi.param_array.flat  # , requirements=['C', 'W']).flat
-            self.full_gradient[pislice] = pi.full_gradient.flat  # , requirements=['C', 'W']).flat
+            self.gradient_full[pislice] = pi.gradient_full.flat  # , requirements=['C', 'W']).flat
            pi.param_array.data = parray[pislice].data
-            pi.full_gradient.data = garray[pislice].data
+            pi.gradient_full.data = garray[pislice].data
            pi._propagate_param_grad(parray[pislice], garray[pislice])
            pi_old_size += pi.size
 class Parameterizable(OptimizationHandlable):
    """
    A parameterisable class.
    This class provides the parameters list (ArrayList) and standard parameter handling,
    such as {add|remove}_parameter(), traverse hierarchy and param_array, gradient_array
    and the empty parameters_changed().
    This class is abstract and should not be instantiated.
    Use GPy.core.Parameterized() as node (or leaf) in the parameterized hierarchy.
    Use GPy.core.Param() for a leaf in the parameterized hierarchy.
    """
    def __init__(self, *args, **kwargs):
        super(Parameterizable, self).__init__(*args, **kwargs)
        from GPy.core.parameterization.lists_and_dicts import ArrayList
-        self._parameters_ = ArrayList()
+        self.parameters = ArrayList()
        self._param_array_ = None
        self.size = 0
        self._added_names_ = set()
        self.__visited = False # for traversing in reverse order we need to know if we were here already
@ -735,7 +799,7 @@ class Parameterizable(OptimizationHandlable):
        if not self.__visited:
            visit(self, *args, **kwargs)
            self.__visited = True
-            for c in self._parameters_:
+            for c in self.parameters:
                c.traverse(visit, *args, **kwargs)
            self.__visited = False
@ -743,9 +807,9 @@ class Parameterizable(OptimizationHandlable):
        """
        Traverse the hierarchy upwards, visiting all parents and their children except self.
        See "visitor pattern" in literature. This is implemented in pre-order fashion.
-    
+
        Example:
-    
+
        parents = []
        self.traverse_parents(parents.append)
        print parents
@ -754,7 +818,7 @@ class Parameterizable(OptimizationHandlable):
            self.__visited = True
            self._parent_._traverse_parents(visit, *args, **kwargs)
            self.__visited = False
-        
+
    def _traverse_parents(self, visit, *args, **kwargs):
        if not self.__visited:
            self.__visited = True
@ -779,7 +843,7 @@ class Parameterizable(OptimizationHandlable):
    @property
    def num_params(self):
-        return len(self._parameters_)
+        return len(self.parameters)
    def _add_parameter_name(self, param, ignore_added_names=False):
        pname = adjust_name_for_printing(param.name)
@ -812,132 +876,6 @@ class Parameterizable(OptimizationHandlable):
        self._remove_parameter_name(None, old_name)
        self._add_parameter_name(param)
    def add_parameter(self, param, index=None, _ignore_added_names=False):
        """
        :param parameters:  the parameters to add
        :type parameters:   list of or one :py:class:`GPy.core.param.Param`
        :param [index]:     index of where to put parameters
        :param bool _ignore_added_names: whether the name of the parameter overrides a possibly existing field
        Add all parameters to this param class, you can insert parameters
        at any given index using the :func:`list.insert` syntax
        """
        if param in self._parameters_ and index is not None:
            self.remove_parameter(param)
            self.add_parameter(param, index)
        # elif param.has_parent():
        #    raise HierarchyError, "parameter {} already in another model ({}), create new object (or copy) for adding".format(param._short(), param._highest_parent_._short())
        elif param not in self._parameters_:
            if param.has_parent():
                def visit(parent, self):
                    if parent is self:
                        raise HierarchyError, "You cannot add a parameter twice into the hierarchy"
                param.traverse_parents(visit, self)
                param._parent_.remove_parameter(param)
            # make sure the size is set
            if index is None:
                self.constraints.update(param.constraints, self.size)
                self.priors.update(param.priors, self.size)
                self._parameters_.append(param)
            else:
                start = sum(p.size for p in self._parameters_[:index])
                self.constraints.shift_right(start, param.size)
                self.priors.shift_right(start, param.size)
                self.constraints.update(param.constraints, start)
                self.priors.update(param.priors, start)
                self._parameters_.insert(index, param)
            param.add_observer(self, self._pass_through_notify_observers, -np.inf)
            parent = self
            while parent is not None:
                parent.size += param.size
                parent = parent._parent_
            self._connect_parameters()
            self._highest_parent_._connect_parameters(ignore_added_names=_ignore_added_names)
            self._highest_parent_._notify_parent_change()
            self._highest_parent_._connect_fixes()
        else:
            raise HierarchyError, """Parameter exists already and no copy made"""
    def add_parameters(self, *parameters):
        """
        convenience method for adding several
        parameters without gradient specification
        """
        [self.add_parameter(p) for p in parameters]
    def remove_parameter(self, param):
        """
        :param param: param object to remove from being a parameter of this parameterized object.
        """
        if not param in self._parameters_:
            raise RuntimeError, "Parameter {} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name)
        start = sum([p.size for p in self._parameters_[:param._parent_index_]])
        self._remove_parameter_name(param)
        self.size -= param.size
        del self._parameters_[param._parent_index_]
        param._disconnect_parent()
        param.remove_observer(self, self._pass_through_notify_observers)
        self.constraints.shift_left(start, param.size)
        self._connect_parameters()
        self._notify_parent_change()
        parent = self._parent_
        while parent is not None:
            parent.size -= param.size
            parent = parent._parent_
        self._highest_parent_._connect_parameters()
        self._highest_parent_._connect_fixes()
        self._highest_parent_._notify_parent_change()
    def _connect_parameters(self, ignore_added_names=False):
        # connect parameterlist to this parameterized object
        # This just sets up the right connection for the params objects
        # to be used as parameters
        # it also sets the constraints for each parameter to the constraints
        # of their respective parents
        if not hasattr(self, "_parameters_") or len(self._parameters_) < 1:
            # no parameters for this class
            return
        if self.param_array.size != self.size:
            self.param_array = np.empty(self.size, dtype=np.float64)
        if self.gradient.size != self.size:
            self._gradient_array_ = np.empty(self.size, dtype=np.float64)
        old_size = 0
        self._param_slices_ = []
        for i, p in enumerate(self._parameters_):
            p._parent_ = self
            p._parent_index_ = i
            pslice = slice(old_size, old_size + p.size)
            # first connect all children
            p._propagate_param_grad(self.param_array[pslice], self.full_gradient[pslice])
            # then connect children to self
            self.param_array[pslice] = p.param_array.flat  # , requirements=['C', 'W']).ravel(order='C')
            self.full_gradient[pslice] = p.full_gradient.flat  # , requirements=['C', 'W']).ravel(order='C')
            if not p.param_array.flags['C_CONTIGUOUS']:
                raise ValueError, "This should not happen! Please write an email to the developers with the code, which reproduces this error. All parameter arrays must be C_CONTIGUOUS"
            p.param_array.data = self.param_array[pslice].data
            p.full_gradient.data = self.full_gradient[pslice].data
            self._param_slices_.append(pslice)
            self._add_parameter_name(p, ignore_added_names=ignore_added_names)
            old_size += p.size
    #===========================================================================
    # notification system
    #===========================================================================
@ -947,30 +885,13 @@ class Parameterizable(OptimizationHandlable):
        self.notify_observers(which=which)
    #===========================================================================
    # Pickling
    #===========================================================================
    def __setstate__(self, state):
        super(Parameterizable, self).__setstate__(state)
        self._connect_parameters()
        self._connect_fixes()
        self._notify_parent_change()
        self.parameters_changed()
    def copy(self):
        c = super(Parameterizable, self).copy()
        c._connect_parameters()
        c._connect_fixes()
        c._notify_parent_change()
        return c
    #===========================================================================
    # From being parentable, we have to define the parent_change notification
    #===========================================================================
    def _notify_parent_change(self):
        """
        Notify all parameters that the parent has changed
        """
-        for p in self._parameters_:
+        for p in self.parameters:
            p._parent_changed(self)
    def parameters_changed(self):
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@ -3,13 +3,10 @@
 import numpy; np = numpy
 import cPickle
 import itertools
 from re import compile, _pattern_type
 from param import ParamConcatenation
-from parameter_core import Pickleable, Parameterizable, adjust_name_for_printing
+from parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing
 from transformations import __fixed__
 from lists_and_dicts import ArrayList
 class ParametersChangedMeta(type):
    def __call__(self, *args, **kw):
@ -68,8 +65,7 @@ class Parameterized(Parameterizable):
    def __init__(self, name=None, parameters=[], *a, **kw):
        super(Parameterized, self).__init__(name=name, *a, **kw)
        self._in_init_ = True
-        self._parameters_ = ArrayList()
+        self.size = sum(p.size for p in self.parameters)
        self.size = sum(p.size for p in self._parameters_)
        self.add_observer(self, self._parameters_changed_notification, -100)
        if not self._has_fixes():
            self._fixes_ = None
@ -86,7 +82,7 @@ class Parameterized(Parameterizable):
            iamroot=True
        node = pydot.Node(id(self), shape='box', label=self.name)#, color='white')
        G.add_node(node)
-        for child in self._parameters_:
+        for child in self.parameters:
            child_node = child.build_pydot(G)
            G.add_edge(pydot.Edge(node, child_node))#, color='white'))
@ -102,58 +98,133 @@ class Parameterized(Parameterizable):
        return node
    #===========================================================================
-    # Gradient control
+    # Add remove parameters:
    #===========================================================================
-    def _transform_gradients(self, g):
+    def add_parameter(self, param, index=None, _ignore_added_names=False):
        if self.has_parent():
            return g
        [numpy.put(g, i, g[i] * c.gradfactor(self.param_array[i])) for c, i in self.constraints.iteritems() if c != __fixed__]
        if self._has_fixes(): return g[self._fixes_]
        return g
    #===========================================================================
    # Indexable
    #===========================================================================
    def _offset_for(self, param):
        # get the offset in the parameterized index array for param
        if param.has_parent():
            if param._parent_._get_original(param) in self._parameters_:
                return self._param_slices_[param._parent_._get_original(param)._parent_index_].start
            return self._offset_for(param._parent_) + param._parent_._offset_for(param)
        return 0
    def _raveled_index_for(self, param):
        """
-        get the raveled index for a param
+        :param parameters:  the parameters to add
-        that is an int array, containing the indexes for the flattened
+        :type parameters:   list of or one :py:class:`GPy.core.param.Param`
-        param inside this parameterized logic.
+        :param [index]:     index of where to put parameters
        """
        if isinstance(param, ParamConcatenation):
            return numpy.hstack((self._raveled_index_for(p) for p in param.params))
        return param._raveled_index() + self._offset_for(param)
-    def _raveled_index(self):
+        :param bool _ignore_added_names: whether the name of the parameter overrides a possibly existing field
        """
        get the raveled index for this object,
        this is not in the global view of things!
        """
        return numpy.r_[:self.size]
-    #===========================================================================
+        Add all parameters to this param class, you can insert parameters
-    # Convenience for fixed, tied checking of param:
+        at any given index using the :func:`list.insert` syntax
-    #===========================================================================
+        """
-    @property
+        if param in self.parameters and index is not None:
-    def is_fixed(self):
+            self.remove_parameter(param)
-        for p in self._parameters_:
+            self.add_parameter(param, index)
-            if not p.is_fixed: return False
+        # elif param.has_parent():
-        return True
+        #    raise HierarchyError, "parameter {} already in another model ({}), create new object (or copy) for adding".format(param._short(), param._highest_parent_._short())
        elif param not in self.parameters:
            if param.has_parent():
                def visit(parent, self):
                    if parent is self:
                        raise HierarchyError, "You cannot add a parameter twice into the hierarchy"
                param.traverse_parents(visit, self)
                param._parent_.remove_parameter(param)
            # make sure the size is set
            if index is None:
                self.constraints.update(param.constraints, self.size)
                self.priors.update(param.priors, self.size)
                self.parameters.append(param)
            else:
                start = sum(p.size for p in self.parameters[:index])
                self.constraints.shift_right(start, param.size)
                self.priors.shift_right(start, param.size)
                self.constraints.update(param.constraints, start)
                self.priors.update(param.priors, start)
                self.parameters.insert(index, param)
-    def _get_original(self, param):
+            param.add_observer(self, self._pass_through_notify_observers, -np.inf)
-        # if advanced indexing is activated it happens that the array is a copy
+
-        # you can retrieve the original param through this method, by passing
+            parent = self
-        # the copy here
+            while parent is not None:
-        return self._parameters_[param._parent_index_]
+                parent.size += param.size
                parent = parent._parent_
            self._connect_parameters()
            self._highest_parent_._connect_parameters(ignore_added_names=_ignore_added_names)
            self._highest_parent_._notify_parent_change()
            self._highest_parent_._connect_fixes()
        else:
            raise HierarchyError, """Parameter exists already and no copy made"""
    def add_parameters(self, *parameters):
        """
        convenience method for adding several
        parameters without gradient specification
        """
        [self.add_parameter(p) for p in parameters]
    def remove_parameter(self, param):
        """
        :param param: param object to remove from being a parameter of this parameterized object.
        """
        if not param in self.parameters:
            raise RuntimeError, "Parameter {} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name)
        start = sum([p.size for p in self.parameters[:param._parent_index_]])
        self._remove_parameter_name(param)
        self.size -= param.size
        del self.parameters[param._parent_index_]
        param._disconnect_parent()
        param.remove_observer(self, self._pass_through_notify_observers)
        self.constraints.shift_left(start, param.size)
        self._connect_parameters()
        self._notify_parent_change()
        parent = self._parent_
        while parent is not None:
            parent.size -= param.size
            parent = parent._parent_
        self._highest_parent_._connect_parameters()
        self._highest_parent_._connect_fixes()
        self._highest_parent_._notify_parent_change()
    def _connect_parameters(self, ignore_added_names=False):
        # connect parameterlist to this parameterized object
        # This just sets up the right connection for the params objects
        # to be used as parameters
        # it also sets the constraints for each parameter to the constraints
        # of their respective parents
        if not hasattr(self, "parameters") or len(self.parameters) < 1:
            # no parameters for this class
            return
        if self.param_array.size != self.size:
            self.param_array = np.empty(self.size, dtype=np.float64)
        if self.gradient.size != self.size:
            self._gradient_array_ = np.empty(self.size, dtype=np.float64)
        old_size = 0
        self._param_slices_ = []
        for i, p in enumerate(self.parameters):
            p._parent_ = self
            p._parent_index_ = i
            pslice = slice(old_size, old_size + p.size)
            # first connect all children
            p._propagate_param_grad(self.param_array[pslice], self.gradient_full[pslice])
            # then connect children to self
            self.param_array[pslice] = p.param_array.flat  # , requirements=['C', 'W']).ravel(order='C')
            self.gradient_full[pslice] = p.gradient_full.flat  # , requirements=['C', 'W']).ravel(order='C')
            if not p.param_array.flags['C_CONTIGUOUS']:
                raise ValueError, "This should not happen! Please write an email to the developers with the code, which reproduces this error. All parameter arrays must be C_CONTIGUOUS"
            p.param_array.data = self.param_array[pslice].data
            p.gradient_full.data = self.gradient_full[pslice].data
            self._param_slices_.append(pslice)
            self._add_parameter_name(p, ignore_added_names=ignore_added_names)
            old_size += p.size
    #===========================================================================
    # Get/set parameters:
@ -200,10 +271,28 @@ class Parameterized(Parameterizable):
    def __setattr__(self, name, val):
        # override the default behaviour, if setting a param, so broadcasting can by used
-        if hasattr(self, "_parameters_"):
+        if hasattr(self, "parameters"):
            pnames = self.parameter_names(False, adjust_for_printing=True, recursive=False)
-            if name in pnames: self._parameters_[pnames.index(name)][:] = val; return
+            if name in pnames: self.parameters[pnames.index(name)][:] = val; return
        object.__setattr__(self, name, val);
    #===========================================================================
    # Pickling
    #===========================================================================
    def __setstate__(self, state):
        super(Parameterized, self).__setstate__(state)
        self._connect_parameters()
        self._connect_fixes()
        self._notify_parent_change()
        self.parameters_changed()
    def copy(self):
        c = super(Parameterized, self).copy()
        c._connect_parameters()
        c._connect_fixes()
        c._notify_parent_change()
        return c
    #===========================================================================
    # Printing:
    #===========================================================================
@ -211,22 +300,22 @@ class Parameterized(Parameterizable):
        return self.hierarchy_name()
    @property
    def flattened_parameters(self):
-        return [xi for x in self._parameters_ for xi in x.flattened_parameters]
+        return [xi for x in self.parameters for xi in x.flattened_parameters]
    @property
    def _parameter_sizes_(self):
-        return [x.size for x in self._parameters_]
+        return [x.size for x in self.parameters]
    @property
    def parameter_shapes(self):
-        return [xi for x in self._parameters_ for xi in x.parameter_shapes]
+        return [xi for x in self.parameters for xi in x.parameter_shapes]
    @property
    def _constraints_str(self):
-        return [cs for p in self._parameters_ for cs in p._constraints_str]
+        return [cs for p in self.parameters for cs in p._constraints_str]
    @property
    def _priors_str(self):
-        return [cs for p in self._parameters_ for cs in p._priors_str]
+        return [cs for p in self.parameters for cs in p._priors_str]
    @property
    def _description_str(self):
-        return [xi for x in self._parameters_ for xi in x._description_str]
+        return [xi for x in self.parameters for xi in x._description_str]
    @property
    def _ties_str(self):
        return [','.join(x._ties_str) for x in self.flattened_parameters]
@ -246,7 +335,7 @@ class Parameterized(Parameterizable):
        to_print = []
        for n, d, c, t, p in itertools.izip(names, desc, constrs, ts, prirs):
            to_print.append(format_spec.format(name=n, desc=d, const=c, t=t, pri=p))
-        # to_print = [format_spec.format(p=p, const=c, t=t) if isinstance(p, Param) else p.__str__(header=False) for p, c, t in itertools.izip(self._parameters_, constrs, ts)]
+        # to_print = [format_spec.format(p=p, const=c, t=t) if isinstance(p, Param) else p.__str__(header=False) for p, c, t in itertools.izip(self.parameters, constrs, ts)]
        sep = '-' * (nl + sl + cl + + pl + tl + 8 * 2 + 3)
        if header:
            header = "  {{0:<{0}s}}  |  {{1:^{1}s}}  |  {{2:^{2}s}}  |  {{3:^{3}s}}  |  {{4:^{4}s}}".format(nl, sl, cl, pl, tl).format(name, "Value", "Constraint", "Prior", "Tied to")
--- a/GPy/core/parameterization/variational.py
+++ b/GPy/core/parameterization/variational.py
@ -81,7 +81,7 @@ class VariationalPosterior(Parameterized):
    def _raveled_index(self):
        index = np.empty(dtype=int, shape=0)
        size = 0
-        for p in self._parameters_:
+        for p in self.parameters:
            index = np.hstack((index, p._raveled_index()+size))
            size += p._realsize_ if hasattr(p, '_realsize_') else p.size
        return index
@ -96,10 +96,10 @@ class VariationalPosterior(Parameterized):
            dc = self.__dict__.copy()
            dc['mean'] = self.mean[s]
            dc['variance'] = self.variance[s]
-            dc['_parameters_'] = copy.copy(self._parameters_)
+            dc['parameters'] = copy.copy(self.parameters)
            n.__dict__.update(dc)
-            n._parameters_[dc['mean']._parent_index_] = dc['mean']
+            n.parameters[dc['mean']._parent_index_] = dc['mean']
-            n._parameters_[dc['variance']._parent_index_] = dc['variance']
+            n.parameters[dc['variance']._parent_index_] = dc['variance']
            n._gradient_array_ = None
            oversize = self.size - self.mean.size - self.variance.size
            n.size = n.mean.size + n.variance.size + oversize
@ -150,11 +150,11 @@ class SpikeAndSlabPosterior(VariationalPosterior):
            dc['mean'] = self.mean[s]
            dc['variance'] = self.variance[s]
            dc['binary_prob'] = self.binary_prob[s]
-            dc['_parameters_'] = copy.copy(self._parameters_)
+            dc['parameters'] = copy.copy(self.parameters)
            n.__dict__.update(dc)
-            n._parameters_[dc['mean']._parent_index_] = dc['mean']
+            n.parameters[dc['mean']._parent_index_] = dc['mean']
-            n._parameters_[dc['variance']._parent_index_] = dc['variance']
+            n.parameters[dc['variance']._parent_index_] = dc['variance']
-            n._parameters_[dc['binary_prob']._parent_index_] = dc['binary_prob']
+            n.parameters[dc['binary_prob']._parent_index_] = dc['binary_prob']
            n.ndim = n.mean.ndim
            n.shape = n.mean.shape
            n.num_data = n.mean.shape[0]
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@ -66,7 +66,7 @@ class SparseGP(GP):
            #gradients wrt Z
            self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z)
            self.Z.gradient += self.kern.gradients_Z_expectations(
-                               self.grad_dict['dL_dpsi1'], self.grad_dict['dL_dpsi2'], Z=self.Z, variational_posterior=self.X)
+                               self.grad_dict['dL_dpsi0'], self.grad_dict['dL_dpsi1'], self.grad_dict['dL_dpsi2'], Z=self.Z, variational_posterior=self.X)
        else:
            #gradients wrt kernel
            self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X)
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@ -96,15 +96,11 @@ def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=
    # Optimize
    if optimize:
        #m.update_likelihood_approximation()
        # Parameters optimization:
        try:
            m.optimize('scg', messages=1)
        except Exception as e:
            return m
        #m.pseudo_EM()
    # Plot
    if plot:
        fig, axes = pb.subplots(2, 1)
@ -133,10 +129,7 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti
    # Optimize
    if optimize:
-        #m.update_likelihood_approximation()
+        m.optimize()
        # Parameters optimization:
        #m.optimize()
        m.pseudo_EM()
    # Plot
    if plot:
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@ -414,9 +414,10 @@ def stick(kernel=None, optimize=True, verbose=True, plot=True):
        ax = m.plot_latent()
        y = m.Y[0, :]
        data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
-        vis = GPy.plotting.matplot_dep.visualize.lvm(m.X[:1, :].copy(), m, data_show, latent_axes=ax)
+        lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[:1, :].copy(), m, data_show, latent_axes=ax)
        raw_input('Press enter to finish')
-
+        lvm_visualizer.close()
        data_show.close()
    return m
 def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True):
@ -516,9 +517,10 @@ def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose
        ax = m.plot_latent()
        y = m.Y[0, :]
        data_show = GPy.plotting.matplot_dep.visualize.skeleton_show(y[None, :], data['skel'])
-        lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+        lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[0].copy(), m, data_show, latent_axes=ax)
        raw_input('Press enter to finish')
        lvm_visualizer.close()
        data_show.close()
    return m
--- a/GPy/inference/latent_function_inference/init.py
+++ b/GPy/inference/latent_function_inference/init.py
@ -25,6 +25,20 @@ etc.
 """
 class LatentFunctionInference(object):
    def on_optimization_start(self):
        """
        This function gets called, just before the optimization loop to start.
        """
        pass
    def on_optimization_end(self):
        """
        This function gets called, just after the optimization loop ended.
        """
        pass
 from exact_gaussian_inference import ExactGaussianInference
 from laplace import Laplace
 from GPy.inference.latent_function_inference.var_dtc import VarDTC
@ -38,11 +52,26 @@ from var_dtc_gpu import VarDTC_GPU
 # class FullLatentFunctionData(object):
 #
 #
-# class LatentFunctionInference(object):
+
-#     def inference(self, kern, X, likelihood, Y, Y_metadata=None):
+# class EMLikeLatentFunctionInference(LatentFunctionInference):
 #     def update_approximation(self):
 #         """
 #         This function gets called when the 
 #         """
 #     
 #     def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
 #         """
 #         Do inference on the latent functions given a covariance function `kern`,
-#         inputs and outputs `X` and `Y`, and a likelihood `likelihood`.
+#         inputs and outputs `X` and `Y`, inducing_inputs `Z`, and a likelihood `likelihood`.
 #         Additional metadata for the outputs `Y` can be given in `Y_metadata`.
 #         """
 #         raise NotImplementedError, "Abstract base class for full inference"
 # 
 # class VariationalLatentFunctionInference(LatentFunctionInference):
 #     def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
 #         """
 #         Do inference on the latent functions given a covariance function `kern`,
 #         inputs and outputs `X` and `Y`, inducing_inputs `Z`, and a likelihood `likelihood`.
 #         Additional metadata for the outputs `Y` can be given in `Y_metadata`.
 #         """
 #         raise NotImplementedError, "Abstract base class for full inference"
--- a/GPy/inference/latent_function_inference/dtc.py
+++ b/GPy/inference/latent_function_inference/dtc.py
@ -4,9 +4,10 @@
 from posterior import Posterior
 from ...util.linalg import jitchol, tdot, dtrtrs, dpotri, pdinv
 import numpy as np
 from . import LatentFunctionInference
 log_2_pi = np.log(2*np.pi)
-class DTC(object):
+class DTC(LatentFunctionInference):
    """
    An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
--- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py
+++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
@ -5,10 +5,11 @@ from posterior import Posterior
 from ...util.linalg import pdinv, dpotrs, tdot
 from ...util import diag
 import numpy as np
 from . import LatentFunctionInference
 log_2_pi = np.log(2*np.pi)
-class ExactGaussianInference(object):
+class ExactGaussianInference(LatentFunctionInference):
    """
    An object for inference when the likelihood is Gaussian.
--- a/GPy/inference/latent_function_inference/expectation_propagation.py
+++ b/GPy/inference/latent_function_inference/expectation_propagation.py
@ -1,9 +1,10 @@
 import numpy as np
 from ...util.linalg import pdinv,jitchol,DSYR,tdot,dtrtrs, dpotrs
 from posterior import Posterior
 from . import LatentFunctionInference
 log_2_pi = np.log(2*np.pi)
-class EP(object):
+class EP(LatentFunctionInference):
    def __init__(self, epsilon=1e-6, eta=1., delta=1.):
        """
        The expectation-propagation algorithm.
@ -21,6 +22,14 @@ class EP(object):
    def reset(self):
        self.old_mutilde, self.old_vtilde = None, None
        self._ep_approximation = None
    def on_optimization_start(self):
        self._ep_approximation = None
    def on_optimization_end(self):
        # TODO: update approximation in the end as well? Maybe even with a switch?
        pass
    def inference(self, kern, X, likelihood, Y, Y_metadata=None, Z=None):
        num_data, output_dim = X.shape
@ -28,7 +37,10 @@ class EP(object):
        K = kern.K(X)
-        mu, Sigma, mu_tilde, tau_tilde, Z_hat = self.expectation_propagation(K, Y, likelihood, Y_metadata)
+        if self._ep_approximation is None:
            mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata)
        else:
            mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation
        Wi, LW, LWi, W_logdet = pdinv(K + np.diag(1./tau_tilde))
@ -42,8 +54,6 @@ class EP(object):
        return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}
    def expectation_propagation(self, K, Y, likelihood, Y_metadata):
        num_data, data_dim = Y.shape
@ -108,4 +118,3 @@ class EP(object):
        mu_tilde = v_tilde/tau_tilde
        return mu, Sigma, mu_tilde, tau_tilde, Z_hat
--- a/GPy/inference/latent_function_inference/expectation_propagation_dtc.py
+++ b/GPy/inference/latent_function_inference/expectation_propagation_dtc.py
@ -1,11 +1,59 @@
 import numpy as np
-from ...util.linalg import pdinv,jitchol,DSYR,tdot,dtrtrs, dpotrs
+from ...util import diag
-from expectation_propagation import EP
+from ...util.linalg import mdot, jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify, DSYR
 from ...util.misc import param_to_array
 from ...core.parameterization.variational import VariationalPosterior
 from . import LatentFunctionInference
 from posterior import Posterior
 log_2_pi = np.log(2*np.pi)
-class EPDTC(EP):
+class EPDTC(LatentFunctionInference):
-    #def __init__(self, epsilon=1e-6, eta=1., delta=1.):
+    const_jitter = 1e-6
    def __init__(self, epsilon=1e-6, eta=1., delta=1., limit=1):
        from ...util.caching import Cacher
        self.limit = limit
        self.get_trYYT = Cacher(self._get_trYYT, limit)
        self.get_YYTfactor = Cacher(self._get_YYTfactor, limit)
        self.epsilon, self.eta, self.delta = epsilon, eta, delta
        self.reset()
    def set_limit(self, limit):
        self.get_trYYT.limit = limit
        self.get_YYTfactor.limit = limit
    def _get_trYYT(self, Y):
        return param_to_array(np.sum(np.square(Y)))
    def __getstate__(self):
        # has to be overridden, as Cacher objects cannot be pickled.
        return self.limit
    def __setstate__(self, state):
        # has to be overridden, as Cacher objects cannot be pickled.
        self.limit = state
        from ...util.caching import Cacher
        self.get_trYYT = Cacher(self._get_trYYT, self.limit)
        self.get_YYTfactor = Cacher(self._get_YYTfactor, self.limit)
    def _get_YYTfactor(self, Y):
        """
        find a matrix L which satisfies LLT = YYT.
        Note that L may have fewer columns than Y.
        """
        N, D = Y.shape
        if (N>=D):
            return param_to_array(Y)
        else:
            return jitchol(tdot(Y))
    def get_VVTfactor(self, Y, prec):
        return Y * prec # TODO chache this, and make it effective
    def reset(self):
        self.old_mutilde, self.old_vtilde = None, None
        self._ep_approximation = None
    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
        num_data, output_dim = X.shape
@ -14,26 +62,131 @@ class EPDTC(EP):
        Kmm = kern.K(Z)
        Kmn = kern.K(Z,X)
        if self._ep_approximation is None:
            mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata)
        else:
            mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation
        if isinstance(X, VariationalPosterior):
            uncertain_inputs = True
            psi0 = kern.psi0(Z, X)
            psi1 = Kmn.T#kern.psi1(Z, X)
            psi2 = kern.psi2(Z, X)
        else:
            uncertain_inputs = False
            psi0 = kern.Kdiag(X)
            psi1 = Kmn.T#kern.K(X, Z)
            psi2 = None
        #see whether we're using variational uncertain inputs
        _, output_dim = Y.shape
        #see whether we've got a different noise variance for each datum
        #beta = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6)
        beta = tau_tilde
        VVT_factor = beta[:,None]*mu_tilde[:,None]
        trYYT = self.get_trYYT(mu_tilde[:,None])
        # do the inference:
        het_noise = beta.size > 1
        num_inducing = Z.shape[0]
        num_data = Y.shape[0]
        # kernel computations, using BGPLVM notation
        Kmm = kern.K(Z).copy()
        diag.add(Kmm, self.const_jitter)
        Lm = jitchol(Kmm)
-        Lmi = dtrtrs(Lm,np.eye(Lm.shape[0]))[0]
+
-        Kmmi = np.dot(Lmi.T,Lmi)
+        # The rather complex computations of A
-        KmmiKmn = np.dot(Kmmi,Kmn)
+        if uncertain_inputs:
-        K = np.dot(Kmn.T,KmmiKmn)
+            if het_noise:
                psi2_beta = psi2 * (beta.flatten().reshape(num_data, 1, 1)).sum(0)
            else:
                psi2_beta = psi2.sum(0) * beta
            LmInv = dtrtri(Lm)
            A = LmInv.dot(psi2_beta.dot(LmInv.T))
        else:
            if het_noise:
                tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
            else:
                tmp = psi1 * (np.sqrt(beta))
            tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
            A = tdot(tmp) #print A.sum()
        # factor B
        B = np.eye(num_inducing) + A
        LB = jitchol(B)
        psi1Vf = np.dot(psi1.T, VVT_factor)
        # back substutue C into psi1Vf
        tmp, _ = dtrtrs(Lm, psi1Vf, lower=1, trans=0)
        _LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
        tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
        Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
        # data fit and derivative of L w.r.t. Kmm
        delit = tdot(_LBi_Lmi_psi1Vf)
        data_fit = np.trace(delit)
        DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
        delit = -0.5 * DBi_plus_BiPBi
        delit += -0.5 * B * output_dim
        delit += output_dim * np.eye(num_inducing)
        # Compute dL_dKmm
        dL_dKmm = backsub_both_sides(Lm, delit)
        # derivatives of L w.r.t. psi
        dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm,
            VVT_factor, Cpsi1Vf, DBi_plus_BiPBi,
            psi1, het_noise, uncertain_inputs)
        # log marginal likelihood
        log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise,
            psi0, A, LB, trYYT, data_fit, VVT_factor)
        #put the gradients in the right places
        dL_dR = _compute_dL_dR(likelihood,
            het_noise, uncertain_inputs, LB,
            _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A,
            psi0, psi1, beta,
            data_fit, num_data, output_dim, trYYT, mu_tilde[:,None])
        dL_dthetaL = 0#likelihood.exact_inference_gradients(dL_dR,Y_metadata)
        if uncertain_inputs:
            grad_dict = {'dL_dKmm': dL_dKmm,
                         'dL_dpsi0':dL_dpsi0,
                         'dL_dpsi1':dL_dpsi1,
                         'dL_dpsi2':dL_dpsi2,
                         'dL_dthetaL':dL_dthetaL}
        else:
            grad_dict = {'dL_dKmm': dL_dKmm,
                         'dL_dKdiag':dL_dpsi0,
                         'dL_dKnm':dL_dpsi1,
                         'dL_dthetaL':dL_dthetaL}
        #get sufficient things for posterior prediction
        #TODO: do we really want to do this in  the loop?
        if VVT_factor.shape[1] == Y.shape[1]:
            woodbury_vector = Cpsi1Vf # == Cpsi1V
        else:
            print 'foobar'
            psi1V = np.dot(mu_tilde[:,None].T*beta, psi1).T
            tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
            tmp, _ = dpotrs(LB, tmp, lower=1)
            woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
        Bi, _ = dpotri(LB, lower=1)
        symmetrify(Bi)
        Bi = -dpotri(LB, lower=1)[0]
        diag.add(Bi, 1)
        woodbury_inv = backsub_both_sides(Lm, Bi)
        #construct a posterior object
        post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
        return post, log_marginal, grad_dict
        mu, Sigma, mu_tilde, tau_tilde, Z_hat = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata)
        Wi, LW, LWi, W_logdet = pdinv(K + np.diag(1./tau_tilde))
        alpha, _ = dpotrs(LW, mu_tilde, lower=1)
        log_marginal =  0.5*(-num_data * log_2_pi - W_logdet - np.sum(alpha * mu_tilde)) # TODO: add log Z_hat??
        dL_dK = 0.5 * (tdot(alpha[:,None]) - Wi)
        dL_dthetaL = np.zeros(likelihood.size)#TODO: derivatives of the likelihood parameters
        return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}
@ -121,3 +274,69 @@ class EPDTC(EP):
        mu_tilde = v_tilde/tau_tilde
        return mu, Sigma, mu_tilde, tau_tilde, Z_hat
 def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, psi1, het_noise, uncertain_inputs):
    dL_dpsi0 = -0.5 * output_dim * (beta[:,None] * np.ones([num_data, 1])).flatten()
    dL_dpsi1 = np.dot(VVT_factor, Cpsi1Vf.T)
    dL_dpsi2_beta = 0.5 * backsub_both_sides(Lm, output_dim * np.eye(num_inducing) - DBi_plus_BiPBi)
    if het_noise:
        if uncertain_inputs:
            dL_dpsi2 = beta[:, None, None] * dL_dpsi2_beta[None, :, :]
        else:
            dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * beta.reshape(num_data, 1)).T).T
            dL_dpsi2 = None
    else:
        dL_dpsi2 = beta * dL_dpsi2_beta
        if uncertain_inputs:
            # repeat for each of the N psi_2 matrices
            dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], num_data, axis=0)
        else:
            # subsume back into psi1 (==Kmn)
            dL_dpsi1 += 2.*np.dot(psi1, dL_dpsi2)
            dL_dpsi2 = None
    return dL_dpsi0, dL_dpsi1, dL_dpsi2
 def _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, psi0, psi1, beta, data_fit, num_data, output_dim, trYYT, Y):
    # the partial derivative vector for the likelihood
    if likelihood.size == 0:
        # save computation here.
        dL_dR = None
    elif het_noise:
        if uncertain_inputs:
            raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented"
        else:
            #from ...util.linalg import chol_inv
            #LBi = chol_inv(LB)
            LBi, _ = dtrtrs(LB,np.eye(LB.shape[0]))
            Lmi_psi1, nil = dtrtrs(Lm, psi1.T, lower=1, trans=0)
            _LBi_Lmi_psi1, _ = dtrtrs(LB, Lmi_psi1, lower=1, trans=0)
            dL_dR = -0.5 * beta + 0.5 * (beta*Y)**2
            dL_dR += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * beta**2
            dL_dR += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*beta**2
            dL_dR += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * Y * beta**2
            dL_dR += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * beta**2
    else:
        # likelihood is not heteroscedatic
        dL_dR = -0.5 * num_data * output_dim * beta + 0.5 * trYYT * beta ** 2
        dL_dR += 0.5 * output_dim * (psi0.sum() * beta ** 2 - np.trace(A) * beta)
        dL_dR += beta * (0.5 * np.sum(A * DBi_plus_BiPBi) - data_fit)
    return dL_dR
 def _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, psi0, A, LB, trYYT, data_fit,Y):
    #compute log marginal likelihood
    if het_noise:
        lik_1 = -0.5 * num_data * output_dim * np.log(2. * np.pi) + 0.5 * np.sum(np.log(beta)) - 0.5 * np.sum(beta * np.square(Y).sum(axis=-1))
        lik_2 = -0.5 * output_dim * (np.sum(beta.flatten() * psi0) - np.trace(A))
    else:
        lik_1 = -0.5 * num_data * output_dim * (np.log(2. * np.pi) - np.log(beta)) - 0.5 * beta * trYYT
        lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A))
    lik_3 = -output_dim * (np.sum(np.log(np.diag(LB))))
    lik_4 = 0.5 * data_fit
    log_marginal = lik_1 + lik_2 + lik_3 + lik_4
    return log_marginal
--- a/GPy/inference/latent_function_inference/fitc.py
+++ b/GPy/inference/latent_function_inference/fitc.py
@ -5,9 +5,10 @@ from posterior import Posterior
 from ...util.linalg import jitchol, tdot, dtrtrs, dpotri, pdinv
 from ...util import diag
 import numpy as np
 from . import LatentFunctionInference
 log_2_pi = np.log(2*np.pi)
-class FITC(object):
+class FITC(LatentFunctionInference):
    """
    An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@ -16,8 +16,9 @@ from ...util.misc import param_to_array
 from posterior import Posterior
 import warnings
 from scipy import optimize
 from . import LatentFunctionInference
-class Laplace(object):
+class Laplace(LatentFunctionInference):
    def __init__(self):
        """
--- a/GPy/inference/latent_function_inference/var_dtc.py
+++ b/GPy/inference/latent_function_inference/var_dtc.py
@ -7,9 +7,10 @@ from ...util import diag
 from ...core.parameterization.variational import VariationalPosterior
 import numpy as np
 from ...util.misc import param_to_array
 from . import LatentFunctionInference
 log_2_pi = np.log(2*np.pi)
-class VarDTC(object):
+class VarDTC(LatentFunctionInference):
    """
    An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
@ -190,7 +191,7 @@ class VarDTC(object):
        post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
        return post, log_marginal, grad_dict
-class VarDTCMissingData(object):
+class VarDTCMissingData(LatentFunctionInference):
    const_jitter = 1e-6
    def __init__(self, limit=1, inan=None):
        from ...util.caching import Cacher
--- a/GPy/inference/latent_function_inference/var_dtc_gpu.py
+++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py
@ -7,6 +7,7 @@ from ...util import diag
 from ...core.parameterization.variational import VariationalPosterior
 import numpy as np
 from ...util.misc import param_to_array
 from . import LatentFunctionInference
 log_2_pi = np.log(2*np.pi)
 from ...util import gpu_init
@ -19,7 +20,7 @@ try:
 except:
    pass
-class VarDTC_GPU(object):
+class VarDTC_GPU(LatentFunctionInference):
    """
    An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
--- a/GPy/inference/latent_function_inference/var_dtc_parallel.py
+++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py
@ -7,9 +7,10 @@ from ...util import diag
 from ...core.parameterization.variational import VariationalPosterior
 import numpy as np
 from ...util.misc import param_to_array
 from . import LatentFunctionInference
 log_2_pi = np.log(2*np.pi)
-class VarDTC_minibatch(object):
+class VarDTC_minibatch(LatentFunctionInference):
    """
    An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
@ -70,12 +71,13 @@ class VarDTC_minibatch(object):
        #see whether we've got a different noise variance for each datum
        beta = 1./np.fmax(likelihood.variance, 1e-6)
        het_noise = beta.size > 1
        if het_noise:
            self.batchsize = 1
        # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
        #self.YYTfactor = beta*self.get_YYTfactor(Y)
        YYT_factor = Y
        trYYT = self.get_trYYT(Y)
        psi2_full = np.zeros((num_inducing,num_inducing))
        psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM
        psi0_full = 0
@ -104,19 +106,18 @@ class VarDTC_minibatch(object):
                YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum()
            else:
                psi0_full += psi0.sum()
-                psi1Y_full += np.dot(Y_slice.T,psi1) # DxM
+                psi1Y_full += np.dot(Y_slice.T,psi1) # DxM                
            if uncertain_inputs:
                if het_noise:
-                    psi2_full += np.einsum('n,nmo->mo',beta_slice,psi2)
+                    psi2_full += beta_slice*psi2
                else:
-                    psi2_full += psi2.sum(axis=0)
+                    psi2_full += psi2
            else:
                if het_noise:
-                    psi2_full += np.einsum('n,nm,no->mo',beta_slice,psi1,psi1)
+                    psi2_full += beta_slice*np.outer(psi1,psi1)
                else:
-                    psi2_full += tdot(psi1.T)
+                    psi2_full += np.outer(psi1,psi1)
        if not het_noise:
            psi0_full *= beta
@ -223,7 +224,7 @@ class VarDTC_minibatch(object):
            psi2 = None
        if het_noise:
-            beta = beta[n_start:n_end]
+            beta = beta[n_start] # assuming batchsize==1
        betaY = beta*Y_slice
        betapsi1 = np.einsum('n,nm->nm',beta,psi1)
@ -244,7 +245,7 @@ class VarDTC_minibatch(object):
        dL_dpsi1 = np.dot(betaY,v.T)
        if uncertain_inputs:
-            dL_dpsi2 = np.einsum('n,mo->nmo',beta * np.ones((n_end-n_start,)),dL_dpsi2R)
+            dL_dpsi2 = beta* dL_dpsi2R
        else:
            dL_dpsi1 += np.dot(betapsi1,dL_dpsi2R)*2.
            dL_dpsi2 = None
@ -262,11 +263,11 @@ class VarDTC_minibatch(object):
            dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1)
        else:
            if uncertain_inputs:
-                psiR = np.einsum('mo,nmo->',dL_dpsi2R,psi2)
+                psiR = np.einsum('mo,mo->',dL_dpsi2R,psi2)
            else:
                psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R)
-            dL_dthetaL = ((np.square(betaY)).sum() + np.square(beta)*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum()
+            dL_dthetaL = ((np.square(betaY)).sum() + beta*beta*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - beta*beta*psiR- (betaY*np.dot(betapsi1,v)).sum()
        if uncertain_inputs:
            grad_dict = {'dL_dpsi0':dL_dpsi0,
@ -296,7 +297,7 @@ def update_gradients(model):
    kern_grad = model.kern.gradient.copy()
    #gradients w.r.t. Z
-    model.Z.gradient[:,model.kern.active_dims] = model.kern.gradients_X(dL_dKmm, model.Z)
+    model.Z.gradient = model.kern.gradients_X(dL_dKmm, model.Z)
    isEnd = False
    while not isEnd:
@ -309,8 +310,8 @@ def update_gradients(model):
            kern_grad += model.kern.gradient
            #gradients w.r.t. Z
-            model.Z.gradient[:,model.kern.active_dims] += model.kern.gradients_Z_expectations(
+            model.Z.gradient += model.kern.gradients_Z_expectations(
-                               grad_dict['dL_dpsi1'], grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=X_slice)
+                               dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=X_slice)
            #gradients w.r.t. posterior parameters of X
            X_grad = model.kern.gradients_qX_expectations(variational_posterior=X_slice, Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'])
--- a/GPy/kern/_src/add.py
+++ b/GPy/kern/_src/add.py
@ -119,7 +119,7 @@ class Add(CombinationKernel):
                    eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
            p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
-    def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def gradients_Z_expectations(self, dL_psi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        from static import White, Bias
        target = np.zeros(Z.shape)
        for p1 in self.parts:
@ -141,10 +141,10 @@ class Add(CombinationKernel):
        from static import White, Bias
        target_mu = np.zeros(variational_posterior.shape)
        target_S = np.zeros(variational_posterior.shape)
-        for p1 in self._parameters_:
+        for p1 in self.parameters:
            #compute the effective dL_dpsi1. extra terms appear becaue of the cross terms in psi2!
            eff_dL_dpsi1 = dL_dpsi1.copy()
-            for p2 in self._parameters_:
+            for p2 in self.parameters:
                if p2 is p1:
                    continue
                if isinstance(p2, White):
@ -160,7 +160,7 @@ class Add(CombinationKernel):
    def add(self, other, name='sum'):
        if isinstance(other, Add):
-            other_params = other._parameters_[:]
+            other_params = other.parameters[:]
            for p in other_params:
                other.remove_parameter(p)
            self.add_parameters(*other_params)
@ -170,4 +170,4 @@ class Add(CombinationKernel):
        return self
    def input_sensitivity(self):
-        return reduce(np.add, [k.input_sensitivity() for k in self.parts])
+        return reduce(np.add, [k.input_sensitivity() for k in self.parts])
--- a/GPy/kern/_src/kern.py
+++ b/GPy/kern/_src/kern.py
@ -103,7 +103,7 @@ class Kern(Parameterized):
        """
        raise NotImplementedError
-    def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        """
        Returns the derivative of the objective wrt Z, using the chain rule
        through the expectation variables.
@ -183,9 +183,9 @@ class Kern(Parameterized):
        assert isinstance(other, Kern), "only kernels can be added to kernels..."
        from prod import Prod
        #kernels = []
-        #if isinstance(self, Prod): kernels.extend(self._parameters_)
+        #if isinstance(self, Prod): kernels.extend(self.parameters)
        #else: kernels.append(self)
-        #if isinstance(other, Prod): kernels.extend(other._parameters_)
+        #if isinstance(other, Prod): kernels.extend(other.parameters)
        #else: kernels.append(other)
        return Prod([self, other], name)
@ -222,7 +222,7 @@ class CombinationKernel(Kern):
    @property
    def parts(self):
-        return self._parameters_
+        return self.parameters
    def get_input_dim_active_dims(self, kernels, extra_dims = None):
        #active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int))
--- a/GPy/kern/_src/kernel_slice_operations.py
+++ b/GPy/kern/_src/kernel_slice_operations.py
@ -124,9 +124,9 @@ def _slice_update_gradients_expectations(f):
 def _slice_gradients_Z_expectations(f):
    @wraps(f)
-    def wrap(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        with _Slice_wrap(self, Z, variational_posterior) as s:
-            ret = s.handle_return_array(f(self, dL_dpsi1, dL_dpsi2, s.X, s.X2))
+            ret = s.handle_return_array(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2))
        return ret
    return wrap
--- a/GPy/kern/_src/linear.py
+++ b/GPy/kern/_src/linear.py
@ -169,7 +169,7 @@ class Linear(Kern):
            else:
                self.variances.gradient += 2.*np.sum(dL_dpsi2 * self.psi2(Z, variational_posterior))/self.variances
-    def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
            gamma = variational_posterior.binary_prob
            mu = variational_posterior.mean
--- a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py
@ -9,12 +9,23 @@ import numpy as np
 from GPy.util.caching import Cache_this
@Cache_this(limit=1)
-def _Z_distances(Z):
+def psicomputations(variance, lengthscale, Z, mu, S, gamma):
-    Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
+    """
-    Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
+    Z - MxQ
-    return Zhat, Zdist
+    mu - NxQ
    S - NxQ
    gamma - NxQ
    """
    # here are the "statistics" for psi0, psi1 and psi2
    # Produced intermediate results:
    # _psi1                NxM
    psi0 = np.empty(mu.shape[0])
    psi0[:] = variance
    psi1 = _psi1computations(variance, lengthscale, Z, mu, S, gamma)
    psi2 = _psi2computations(variance, lengthscale, Z, mu, S, gamma)
    return psi0, psi1, psi2
@Cache_this(limit=1)
 def _psi1computations(variance, lengthscale, Z, mu, S, gamma):
    """
    Z - MxQ
@ -22,15 +33,10 @@ def _psi1computations(variance, lengthscale, Z, mu, S, gamma):
    S - NxQ
    gamma - NxQ
    """
-    # here are the "statistics" for psi1 and psi2
+    # here are the "statistics" for psi1
    # Produced intermediate results:
    # _psi1                NxM
-    # _dpsi1_dvariance     NxM
+
    # _dpsi1_dlengthscale  NxMxQ
    # _dpsi1_dZ            NxMxQ
    # _dpsi1_dgamma        NxMxQ
    # _dpsi1_dmu           NxMxQ
    # _dpsi1_dS            NxMxQ
    lengthscale2 = np.square(lengthscale)
@ -40,25 +46,15 @@ def _psi1computations(variance, lengthscale, Z, mu, S, gamma):
    _psi1_dist = Z[None, :, :] - mu[:, None, :]  # NxMxQ
    _psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom) # NxMxQ
    _psi1_common = gamma[:,None,:] / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #Nx1xQ
-    _psi1_exponent1 = np.log(gamma[:,None,:]) -0.5 * (_psi1_dist_sq + np.log(_psi1_denom)) # NxMxQ
+    _psi1_exponent1 = np.log(gamma[:,None,:]) - (_psi1_dist_sq + np.log(_psi1_denom))/2. # NxMxQ
-    _psi1_exponent2 = np.log(1.-gamma[:,None,:]) -0.5 * (np.square(Z[None,:,:])/lengthscale2) # NxMxQ
+    _psi1_exponent2 = np.log(1.-gamma[:,None,:]) - (np.square(Z[None,:,:])/lengthscale2)/2. # NxMxQ
    _psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2)
    _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ
    _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM
    _psi1_exp_dist_sq = np.exp(-0.5*_psi1_dist_sq) # NxMxQ
    _psi1_exp_Z = np.exp(-0.5*np.square(Z[None,:,:])/lengthscale2) # 1xMxQ
    _psi1_q = variance * np.exp(_psi1_exp_sum[:,:,None] - _psi1_exponent) # NxMxQ
    _psi1 = variance * np.exp(_psi1_exp_sum) # NxM
    _dpsi1_dvariance = _psi1 / variance # NxM
    _dpsi1_dgamma = _psi1_q * (_psi1_exp_dist_sq/_psi1_denom_sqrt-_psi1_exp_Z) # NxMxQ
    _dpsi1_dmu = _psi1_q * (_psi1_exp_dist_sq * _psi1_dist * _psi1_common) # NxMxQ
    _dpsi1_dS = _psi1_q * (_psi1_exp_dist_sq * _psi1_common * 0.5 * (_psi1_dist_sq - 1.)) # NxMxQ
    _dpsi1_dZ = _psi1_q * (- _psi1_common * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z) # NxMxQ
    _dpsi1_dlengthscale = 2.*lengthscale*_psi1_q * (0.5*_psi1_common*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + 0.5*(1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z) # NxMxQ
-    return _psi1, _dpsi1_dvariance, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _dpsi1_dZ, _dpsi1_dlengthscale
+    return _psi1
@Cache_this(limit=1)
 def _psi2computations(variance, lengthscale, Z, mu, S, gamma):
    """
    Z - MxQ
@ -66,19 +62,14 @@ def _psi2computations(variance, lengthscale, Z, mu, S, gamma):
    S - NxQ
    gamma - NxQ
    """
-    # here are the "statistics" for psi1 and psi2
+    # here are the "statistics" for psi2
    # Produced intermediate results:
-    # _psi2                NxMxM
+    # _psi2                MxM
    # _psi2_dvariance      NxMxM
    # _psi2_dlengthscale   NxMxMxQ
    # _psi2_dZ             NxMxMxQ
    # _psi2_dgamma         NxMxMxQ
    # _psi2_dmu            NxMxMxQ
    # _psi2_dS             NxMxMxQ
    lengthscale2 = np.square(lengthscale)
-    _psi2_Zhat, _psi2_Zdist = _Z_distances(Z)
+    _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
    _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
    _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q
    _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ
@ -93,15 +84,130 @@ def _psi2computations(variance, lengthscale, Z, mu, S, gamma):
    _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2)
    _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max))
    _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM
-    _psi2_q = np.square(variance) * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ 
+    _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM
    return _psi2
 def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
    ARD = (len(lengthscale)!=1)
    dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1, dgamma_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
    dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2, dgamma_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
    dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
    dL_dlengscale = dl_psi1 + dl_psi2
    if not ARD:
        dL_dlengscale = dL_dlengscale.sum()
    dL_dgamma = dgamma_psi1 + dgamma_psi2
    dL_dmu = dmu_psi1 + dmu_psi2
    dL_dS = dS_psi1 + dS_psi2
    dL_dZ = dZ_psi1 + dZ_psi2
    return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma
 def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, gamma):
    """
    dL_dpsi1 - NxM
    Z - MxQ
    mu - NxQ
    S - NxQ
    gamma - NxQ
    """
    # here are the "statistics" for psi1
    # Produced intermediate results: dL_dparams w.r.t. psi1
    # _dL_dvariance     1
    # _dL_dlengthscale  Q
    # _dL_dZ            MxQ
    # _dL_dgamma        NxQ
    # _dL_dmu           NxQ
    # _dL_dS            NxQ
    lengthscale2 = np.square(lengthscale)
    # psi1
    _psi1_denom = S / lengthscale2 + 1.  # NxQ
    _psi1_denom_sqrt = np.sqrt(_psi1_denom) #NxQ
    _psi1_dist = Z[None, :, :] - mu[:, None, :]  # NxMxQ
    _psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom[:,None,:]) # NxMxQ
    _psi1_common = gamma / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #NxQ
    _psi1_exponent1 = np.log(gamma[:,None,:]) -0.5 * (_psi1_dist_sq + np.log(_psi1_denom[:, None,:])) # NxMxQ
    _psi1_exponent2 = np.log(1.-gamma[:,None,:]) -0.5 * (np.square(Z[None,:,:])/lengthscale2) # NxMxQ
    _psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2)
    _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ
    _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM
    _psi1_exp_dist_sq = np.exp(-0.5*_psi1_dist_sq) # NxMxQ
    _psi1_exp_Z = np.exp(-0.5*np.square(Z[None,:,:])/lengthscale2) # 1xMxQ
    _psi1_q = variance * np.exp(_psi1_exp_sum[:,:,None] - _psi1_exponent) # NxMxQ
    _psi1 = variance * np.exp(_psi1_exp_sum) # NxM
    _dL_dvariance = np.einsum('nm,nm->',dL_dpsi1, _psi1)/variance # 1
    _dL_dgamma = np.einsum('nm,nmq,nmq->nq',dL_dpsi1, _psi1_q, (_psi1_exp_dist_sq/_psi1_denom_sqrt[:,None,:]-_psi1_exp_Z)) # NxQ
    _dL_dmu = np.einsum('nm, nmq, nmq, nmq, nq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_dist,_psi1_common)  # NxQ
    _dL_dS = np.einsum('nm,nmq,nmq,nq,nmq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_common,(_psi1_dist_sq-1.))/2.  # NxQ
    _dL_dZ = np.einsum('nm,nmq,nmq->mq',dL_dpsi1,_psi1_q, (- _psi1_common[:,None,:] * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z))
    _dL_dlengthscale = lengthscale* np.einsum('nm,nmq,nmq->q',dL_dpsi1,_psi1_q,(_psi1_common[:,None,:]*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + (1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z))
 #     _dpsi1_dmu = _psi1_q * (_psi1_exp_dist_sq * _psi1_dist * _psi1_common) # NxMxQ
 #     _dpsi1_dS = _psi1_q * (_psi1_exp_dist_sq * _psi1_common * 0.5 * (_psi1_dist_sq - 1.)) # NxMxQ
 #     _dpsi1_dZ = _psi1_q * (- _psi1_common * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z) # NxMxQ
 #     _dpsi1_dlengthscale = 2.*lengthscale*_psi1_q * (0.5*_psi1_common*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + 0.5*(1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z) # NxMxQ
    return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma 
 def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, gamma):
    """
    Z - MxQ
    mu - NxQ
    S - NxQ
    gamma - NxQ
    dL_dpsi2 - MxM
    """
    # here are the "statistics" for psi2
    # Produced the derivatives w.r.t. psi2:
    # _dL_dvariance      1
    # _dL_dlengthscale   Q
    # _dL_dZ             MxQ
    # _dL_dgamma         NxQ
    # _dL_dmu            NxQ
    # _dL_dS             NxQ
    lengthscale2 = np.square(lengthscale)
    _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
    _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
    _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q
    _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ
    # psi2
    _psi2_denom = 2.*S / lengthscale2 + 1. # NxQ
    _psi2_denom_sqrt = np.sqrt(_psi2_denom)
    _psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q
    _psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom[:,None,None,:])
    _psi2_common = gamma/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # NxQ
    _psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom[:,None,None,:])+np.log(gamma[:,None,None,:]) #N,M,M,Q
    _psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ
    _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2)
    _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max))
    _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM
    _psi2_q = variance*variance * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ 
    _psi2_exp_dist_sq = np.exp(-_psi2_Zdist_sq -_psi2_mudist_sq) # NxMxMxQ
    _psi2_exp_Z = np.exp(-0.5*_psi2_Z_sq_sum) # MxMxQ
-    _psi2 = np.square(variance) * np.exp(_psi2_exp_sum) # N,M,M
+    _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM
-    _dpsi2_dvariance = 2. * _psi2/variance # NxMxM
+    _dL_dvariance = np.einsum('mo,mo->',dL_dpsi2,_psi2)*2./variance
-    _dpsi2_dgamma = _psi2_q * (_psi2_exp_dist_sq/_psi2_denom_sqrt - _psi2_exp_Z) # NxMxMxQ
+    _dL_dgamma = np.einsum('mo,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,(_psi2_exp_dist_sq/_psi2_denom_sqrt[:,None,None,:] - _psi2_exp_Z))
-    _dpsi2_dmu = _psi2_q * (-2.*_psi2_common*_psi2_mudist * _psi2_exp_dist_sq) # NxMxMxQ
+    _dL_dmu = -2.*np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,_psi2_common,_psi2_mudist,_psi2_exp_dist_sq)
-    _dpsi2_dS = _psi2_q * (_psi2_common * (2.*_psi2_mudist_sq - 1.) * _psi2_exp_dist_sq) # NxMxMxQ
+    _dL_dS = np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q, _psi2_common, (2.*_psi2_mudist_sq-1.), _psi2_exp_dist_sq)
-    _dpsi2_dZ = 2.*_psi2_q * (_psi2_common*(-_psi2_Zdist*_psi2_denom+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z) # NxMxMxQ
+    _dL_dZ = 2.*np.einsum('mo,nmoq,nmoq->mq',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(-_psi2_Zdist*_psi2_denom[:,None,None,:]+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z))
-    _dpsi2_dlengthscale = 2.*lengthscale* _psi2_q * (_psi2_common*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z) # NxMxMxQ
+#    print _psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq #+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z)
    _dL_dlengthscale = 2.*lengthscale* np.einsum('mo,nmoq,nmoq->q',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z))
 #     _dpsi2_dvariance = 2. * _psi2/variance # NxMxM
 #     _dpsi2_dgamma = _psi2_q * (_psi2_exp_dist_sq/_psi2_denom_sqrt - _psi2_exp_Z) # NxMxMxQ
 #     _dpsi2_dmu = _psi2_q * (-2.*_psi2_common*_psi2_mudist * _psi2_exp_dist_sq) # NxMxMxQ
 #     _dpsi2_dS = _psi2_q * (_psi2_common * (2.*_psi2_mudist_sq - 1.) * _psi2_exp_dist_sq) # NxMxMxQ
 #     _dpsi2_dZ = 2.*_psi2_q * (_psi2_common*(-_psi2_Zdist*_psi2_denom+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z) # NxMxMxQ
 #     _dpsi2_dlengthscale = 2.*lengthscale* _psi2_q * (_psi2_common*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z) # NxMxMxQ
-    return _psi2, _dpsi2_dvariance, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _dpsi2_dZ, _dpsi2_dlengthscale
+    return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma
--- a/GPy/kern/_src/rbf.py
+++ b/GPy/kern/_src/rbf.py
@ -42,9 +42,11 @@ class RBF(Stationary):
    #---------------------------------------#
    def psi0(self, Z, variational_posterior):
-        if self.useGPU:
+        if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
-            if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
+            if self.useGPU:
                return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[0]
            else:
                return ssrbf_psi_comp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[0]
        else:
            return self.Kdiag(variational_posterior.mean)
@ -53,7 +55,7 @@ class RBF(Stationary):
            if self.useGPU:
                return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[1]
            else:
-                psi1, _, _, _, _, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+                return ssrbf_psi_comp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[1]
        else:
            _, _, _, psi1 = self._psi1computations(Z, variational_posterior)
        return psi1
@ -63,7 +65,7 @@ class RBF(Stationary):
            if self.useGPU:
                return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[2]
            else:
-                psi2, _, _, _, _, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+                return ssrbf_psi_comp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[2]
        else:
            _, _, _, _, psi2 = self._psi2computations(Z, variational_posterior)
        return psi2
@ -74,26 +76,30 @@ class RBF(Stationary):
            if self.useGPU:
                self.psicomp.update_gradients_expectations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
            else:
 #                 dL_dvar, dL_dlengscale, dL_dZ, dL_dgamma, dL_dmu, dL_dS = ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
                dL_dvar, dL_dlengscale, _, _, _, _ = ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
                self.variance.gradient = dL_dvar
                self.lengthscale.gradient = dL_dlengscale
-                _, _dpsi1_dvariance, _, _, _, _, _dpsi1_dlengthscale = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+#                 _, _dpsi1_dvariance, _, _, _, _, _dpsi1_dlengthscale = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
-                _, _dpsi2_dvariance, _, _, _, _, _dpsi2_dlengthscale = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+#                 _, _dpsi2_dvariance, _, _, _, _, _dpsi2_dlengthscale = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
-    
+#     
-                #contributions from psi0:
+#                 #contributions from psi0:
-                self.variance.gradient = np.sum(dL_dpsi0)
+#                 self.variance.gradient = np.sum(dL_dpsi0)
-    
+#     
-                #from psi1
+#                 #from psi1
-                self.variance.gradient += np.sum(dL_dpsi1 * _dpsi1_dvariance)
+#                 self.variance.gradient += np.sum(dL_dpsi1 * _dpsi1_dvariance)
-                if self.ARD:
+#                 if self.ARD:
-                    self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).reshape(-1,self.input_dim).sum(axis=0)
+#                     self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).reshape(-1,self.input_dim).sum(axis=0)
-                else:
+#                 else:
-                    self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).sum()  
+#                     self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).sum()  
-    
+#     
-                #from psi2
+#                 #from psi2
-                self.variance.gradient += (dL_dpsi2 * _dpsi2_dvariance).sum()
+#                 self.variance.gradient += (dL_dpsi2 * _dpsi2_dvariance).sum()
-                if self.ARD:
+#                 if self.ARD:
-                    self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).reshape(-1,self.input_dim).sum(axis=0)
+#                     self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).reshape(-1,self.input_dim).sum(axis=0)
-                else:
+#                 else:
-                    self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).sum()
+#                     self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).sum()
        elif isinstance(variational_posterior, variational.NormalPosterior):
            l2 = self.lengthscale**2
@ -126,22 +132,25 @@ class RBF(Stationary):
        else:
            raise ValueError, "unknown distriubtion received for psi-statistics"
-    def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        # Spike-and-Slab GPLVM
        if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
            if self.useGPU:
                return self.psicomp.gradients_Z_expectations(dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
            else:
-                _, _, _, _, _, _dpsi1_dZ, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+                _, _, dL_dZ, _, _, _ = ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
-                _, _, _, _, _, _dpsi2_dZ, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+                return dL_dZ
-    
+                
-                #psi1
+#                 _, _, _, _, _, _dpsi1_dZ, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
-                grad = (dL_dpsi1[:, :, None] * _dpsi1_dZ).sum(axis=0)
+#                 _, _, _, _, _, _dpsi2_dZ, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
-    
+#     
-                #psi2
+#                 #psi1
-                grad += (dL_dpsi2[:, :, :, None] * _dpsi2_dZ).sum(axis=0).sum(axis=1)
+#                 grad = (dL_dpsi1[:, :, None] * _dpsi1_dZ).sum(axis=0)
-    
+#     
-                return grad
+#                 #psi2
 #                 grad += (dL_dpsi2[:, :, :, None] * _dpsi2_dZ).sum(axis=0).sum(axis=1)
 #     
 #                 return grad
        elif isinstance(variational_posterior, variational.NormalPosterior):
            l2 = self.lengthscale **2
@ -167,26 +176,29 @@ class RBF(Stationary):
        if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
            if self.useGPU:
                return self.psicomp.gradients_qX_expectations(dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
-            else:   
+            else:
-                ndata = variational_posterior.mean.shape[0]
+                _, _, _, dL_dmu, dL_dS, dL_dgamma = ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
-    
+                return dL_dmu, dL_dS, dL_dgamma
-                _, _, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+            
-                _, _, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+#                 ndata = variational_posterior.mean.shape[0]
-    
+#     
-                #psi1
+#                 _, _, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
-                grad_mu = (dL_dpsi1[:, :, None] * _dpsi1_dmu).sum(axis=1)
+#                 _, _, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
-                grad_S = (dL_dpsi1[:, :, None] * _dpsi1_dS).sum(axis=1)
+#     
-                grad_gamma = (dL_dpsi1[:,:,None] * _dpsi1_dgamma).sum(axis=1)
+#                 #psi1
-    
+#                 grad_mu = (dL_dpsi1[:, :, None] * _dpsi1_dmu).sum(axis=1)
-                #psi2
+#                 grad_S = (dL_dpsi1[:, :, None] * _dpsi1_dS).sum(axis=1)
-                grad_mu += (dL_dpsi2[:, :, :, None] * _dpsi2_dmu).reshape(ndata,-1,self.input_dim).sum(axis=1)
+#                 grad_gamma = (dL_dpsi1[:,:,None] * _dpsi1_dgamma).sum(axis=1)
-                grad_S += (dL_dpsi2[:, :, :, None] * _dpsi2_dS).reshape(ndata,-1,self.input_dim).sum(axis=1)
+#     
-                grad_gamma += (dL_dpsi2[:,:,:, None] * _dpsi2_dgamma).reshape(ndata,-1,self.input_dim).sum(axis=1)
+#                 #psi2
-                
+#                 grad_mu += (dL_dpsi2[:, :, :, None] * _dpsi2_dmu).reshape(ndata,-1,self.input_dim).sum(axis=1)
-                if self.group_spike_prob:
+#                 grad_S += (dL_dpsi2[:, :, :, None] * _dpsi2_dS).reshape(ndata,-1,self.input_dim).sum(axis=1)
-                    grad_gamma[:] = grad_gamma.mean(axis=0)
+#                 grad_gamma += (dL_dpsi2[:,:,:, None] * _dpsi2_dgamma).reshape(ndata,-1,self.input_dim).sum(axis=1)
-    
+#                 
-                return grad_mu, grad_S, grad_gamma
+#                 if self.group_spike_prob:
 #                     grad_gamma[:] = grad_gamma.mean(axis=0)
 #     
 #                 return grad_mu, grad_S, grad_gamma
        elif isinstance(variational_posterior, variational.NormalPosterior):
--- a/GPy/kern/_src/static.py
+++ b/GPy/kern/_src/static.py
@ -25,7 +25,7 @@ class Static(Kern):
    def gradients_X_diag(self, dL_dKdiag, X):
        return np.zeros(X.shape)
-    def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        return np.zeros(Z.shape)
    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
--- a/GPy/likelihoods/bernoulli.py
+++ b/GPy/likelihoods/bernoulli.py
@ -227,3 +227,6 @@ class Bernoulli(Likelihood):
        ns = np.ones_like(gp, dtype=int)
        Ysim = np.random.binomial(ns, self.gp_link.transf(gp))
        return Ysim.reshape(orig_shape)
    def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None):
        pass
--- a/GPy/models/ss_gplvm.py
+++ b/GPy/models/ss_gplvm.py
@ -11,7 +11,7 @@ from ..likelihoods import Gaussian
 from ..inference.optimization import SCG
 from ..util import linalg
 from ..core.parameterization.variational import SpikeAndSlabPrior, SpikeAndSlabPosterior
-from ..inference.latent_function_inference.var_dtc_parallel import update_gradients
+from ..inference.latent_function_inference.var_dtc_parallel import update_gradients, VarDTC_minibatch
 from ..inference.latent_function_inference.var_dtc_gpu import VarDTC_GPU
@ -41,7 +41,7 @@ class SSGPLVM(SparseGP):
        if X_variance is None: # The variance of the variational approximation (S)
            X_variance = np.random.uniform(0,.1,X.shape)
-        gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation
+        gamma = np.empty_like(X, order='F') # The posterior probabilities of the binary variable in the variational approximation
        gamma[:] = 0.5 + 0.01 * np.random.randn(X.shape[0], input_dim)
        if group_spike:
@ -60,12 +60,15 @@ class SSGPLVM(SparseGP):
        pi = np.empty((input_dim))
        pi[:] = 0.5
        self.variational_prior = SpikeAndSlabPrior(pi=pi) # the prior probability of the latent binary variable b
        X = np.asfortranarray(X)
        X_variance = np.asfortranarray(X_variance)
        gamma = np.asfortranarray(gamma)
        X = SpikeAndSlabPosterior(X, X_variance, gamma)
        if group_spike:
            kernel.group_spike_prob = True
            self.variational_prior.group_spike_prob = True
        SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, name, **kwargs)
        self.add_parameter(self.X, index=0)
@ -76,7 +79,7 @@ class SSGPLVM(SparseGP):
        X.mean.gradient, X.variance.gradient, X.binary_prob.gradient = X_grad
    def parameters_changed(self):
-        if isinstance(self.inference_method, VarDTC_GPU):
+        if isinstance(self.inference_method, VarDTC_GPU) or isinstance(self.inference_method, VarDTC_minibatch):
            update_gradients(self)
            return
--- a/GPy/plotting/matplot_dep/kernel_plots.py
+++ b/GPy/plotting/matplot_dep/kernel_plots.py
@ -68,7 +68,7 @@ def plot_ARD(kernel, fignum=None, ax=None, title='', legend=False):
    for i in range(ard_params.shape[0]):
        c = Tango.nextMedium()
-        bars.append(plot_bars(fig, ax, x, ard_params[i,:], c, kernel._parameters_[i].name, bottom=bottom))
+        bars.append(plot_bars(fig, ax, x, ard_params[i,:], c, kernel.parameters[i].name, bottom=bottom))
        bottom += ard_params[i,:]
    ax.set_xlim(-.5, kernel.input_dim - .5)
--- a/GPy/plotting/matplot_dep/visualize.py
+++ b/GPy/plotting/matplot_dep/visualize.py
@ -102,7 +102,8 @@ class lvm(matplotlib_show):
                vals = param_to_array(model.X.mean)
            else:
                vals = param_to_array(model.X)
-
+        if len(vals.shape)==1:
            vals = vals[None,:]
        matplotlib_show.__init__(self, vals, axes=latent_axes)
        if isinstance(latent_axes,mpl.axes.Axes):
@ -393,14 +394,13 @@ class mocap_data_show_vpython(vpython_show):
    def process_values(self):
        raise NotImplementedError, "this needs to be implemented to use the data_show class"
 class mocap_data_show(matplotlib_show):
    """Base class for visualizing motion capture data."""
    def __init__(self, vals, axes=None, connect=None):
        if axes==None:
            fig = plt.figure()
-            axes = fig.add_subplot(111, projection='3d')
+            axes = fig.add_subplot(111, projection='3d',aspect='equal')
        matplotlib_show.__init__(self, vals, axes)
        self.connect = connect
@ -445,11 +445,12 @@ class mocap_data_show(matplotlib_show):
    def process_values(self):
        raise NotImplementedError, "this needs to be implemented to use the data_show class"
-    def initialize_axes(self):
+    def initialize_axes(self, boundary=0.05):
        """Set up the axes with the right limits and scaling."""
-        self.x_lim = np.array([self.vals[:, 0].min(), self.vals[:, 0].max()])
+        bs = [(self.vals[:, i].max()-self.vals[:, i].min())*boundary for i in xrange(3)]
-        self.y_lim = np.array([self.vals[:, 1].min(), self.vals[:, 1].max()])
+        self.x_lim = np.array([self.vals[:, 0].min()-bs[0], self.vals[:, 0].max()+bs[0]])
-        self.z_lim = np.array([self.vals[:, 2].min(), self.vals[:, 2].max()])
+        self.y_lim = np.array([self.vals[:, 1].min()-bs[1], self.vals[:, 1].max()+bs[1]])
        self.z_lim = np.array([self.vals[:, 2].min()-bs[2], self.vals[:, 2].max()+bs[2]])
    def initialize_axes_modify(self):
        self.points_handle.remove()
@ -472,6 +473,8 @@ class mocap_data_show(matplotlib_show):
 class stick_show(mocap_data_show):
    """Show a three dimensional point cloud as a figure. Connect elements of the figure together using the matrix connect."""
    def __init__(self, vals, connect=None, axes=None):
        if len(vals.shape)==1:
            vals = vals[None,:]
        mocap_data_show.__init__(self, vals, axes=axes, connect=connect)
    def process_values(self):
--- a/GPy/testing/parameterized_tests.py
+++ b/GPy/testing/parameterized_tests.py
@ -95,7 +95,7 @@ class ParameterizedTest(unittest.TestCase):
        self.assertListEqual(self.test1.kern.param_array.tolist(), val[:2].tolist())
    def test_add_parameter_already_in_hirarchy(self):
-        self.assertRaises(HierarchyError, self.test1.add_parameter, self.white._parameters_[0])
+        self.assertRaises(HierarchyError, self.test1.add_parameter, self.white.parameters[0])
    def test_default_constraints(self):
        self.assertIs(self.rbf.variance.constraints._param_index_ops, self.rbf.constraints._param_index_ops)
--- a/GPy/testing/pickle_tests.py
+++ b/GPy/testing/pickle_tests.py
@ -89,28 +89,28 @@ class Test(ListDictTestCase):
        self.assertIs(pcopy.constraints, pcopy.rbf.lengthscale.constraints._param_index_ops)
        self.assertIs(pcopy.constraints, pcopy.linear.constraints._param_index_ops)
        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        self.assertListEqual(par.full_gradient.tolist(), pcopy.full_gradient.tolist())
+        self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist())
        self.assertSequenceEqual(str(par), str(pcopy))
        self.assertIsNot(par.param_array, pcopy.param_array)
-        self.assertIsNot(par.full_gradient, pcopy.full_gradient)
+        self.assertIsNot(par.gradient_full, pcopy.gradient_full)
        with tempfile.TemporaryFile('w+b') as f:
            par.pickle(f)
            f.seek(0)
            pcopy = pickle.load(f)
        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
        pcopy.gradient = 10
-        np.testing.assert_allclose(par.linear.full_gradient, pcopy.linear.full_gradient)
+        np.testing.assert_allclose(par.linear.gradient_full, pcopy.linear.gradient_full)
-        np.testing.assert_allclose(pcopy.linear.full_gradient, 10)
+        np.testing.assert_allclose(pcopy.linear.gradient_full, 10)
        self.assertSequenceEqual(str(par), str(pcopy))
    def test_model(self):
        par = toy_rbf_1d_50(optimize=0, plot=0)
        pcopy = par.copy()
        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        self.assertListEqual(par.full_gradient.tolist(), pcopy.full_gradient.tolist())
+        self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist())
        self.assertSequenceEqual(str(par), str(pcopy))
        self.assertIsNot(par.param_array, pcopy.param_array)
-        self.assertIsNot(par.full_gradient, pcopy.full_gradient)
+        self.assertIsNot(par.gradient_full, pcopy.gradient_full)
        self.assertTrue(pcopy.checkgrad())
        self.assert_(np.any(pcopy.gradient!=0.0))
        with tempfile.TemporaryFile('w+b') as f:
@ -118,7 +118,7 @@ class Test(ListDictTestCase):
            f.seek(0)
            pcopy = pickle.load(f)
        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        np.testing.assert_allclose(par.full_gradient, pcopy.full_gradient)
+        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
        self.assertSequenceEqual(str(par), str(pcopy))
        self.assert_(pcopy.checkgrad())
@ -126,10 +126,10 @@ class Test(ListDictTestCase):
        par = toy_rbf_1d_50(optimize=0, plot=0)
        pcopy = GPRegression(par.X.copy(), par.Y.copy(), kernel=par.kern.copy())
        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        self.assertListEqual(par.full_gradient.tolist(), pcopy.full_gradient.tolist())
+        self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist())
        self.assertSequenceEqual(str(par), str(pcopy))
        self.assertIsNot(par.param_array, pcopy.param_array)
-        self.assertIsNot(par.full_gradient, pcopy.full_gradient)
+        self.assertIsNot(par.gradient_full, pcopy.gradient_full)
        self.assertTrue(pcopy.checkgrad())
        self.assert_(np.any(pcopy.gradient!=0.0))
        pcopy.optimize('bfgs')
@ -140,7 +140,7 @@ class Test(ListDictTestCase):
            f.seek(0)
            pcopy = pickle.load(f)
        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        np.testing.assert_allclose(par.full_gradient, pcopy.full_gradient)
+        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
        self.assertSequenceEqual(str(par), str(pcopy))
        self.assert_(pcopy.checkgrad())
@ -151,18 +151,18 @@ class Test(ListDictTestCase):
        par.gradient = 10
        pcopy = par.copy()
        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        self.assertListEqual(par.full_gradient.tolist(), pcopy.full_gradient.tolist())
+        self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist())
        self.assertSequenceEqual(str(par), str(pcopy))
        self.assertIsNot(par.param_array, pcopy.param_array)
-        self.assertIsNot(par.full_gradient, pcopy.full_gradient)
+        self.assertIsNot(par.gradient_full, pcopy.gradient_full)
        with tempfile.TemporaryFile('w+b') as f:
            par.pickle(f)
            f.seek(0)
            pcopy = pickle.load(f)
        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
        pcopy.gradient = 10
-        np.testing.assert_allclose(par.full_gradient, pcopy.full_gradient)
+        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
-        np.testing.assert_allclose(pcopy.mean.full_gradient, 10)
+        np.testing.assert_allclose(pcopy.mean.gradient_full, 10)
        self.assertSequenceEqual(str(par), str(pcopy))
    def test_model_concat(self):
@ -170,10 +170,10 @@ class Test(ListDictTestCase):
        par.randomize()
        pcopy = par.copy()
        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        self.assertListEqual(par.full_gradient.tolist(), pcopy.full_gradient.tolist())
+        self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist())
        self.assertSequenceEqual(str(par), str(pcopy))
        self.assertIsNot(par.param_array, pcopy.param_array)
-        self.assertIsNot(par.full_gradient, pcopy.full_gradient)
+        self.assertIsNot(par.gradient_full, pcopy.gradient_full)
        self.assertTrue(pcopy.checkgrad())
        self.assert_(np.any(pcopy.gradient!=0.0))
        with tempfile.TemporaryFile('w+b') as f:
@ -181,7 +181,7 @@ class Test(ListDictTestCase):
            f.seek(0)
            pcopy = pickle.load(f)
        self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
-        np.testing.assert_allclose(par.full_gradient, pcopy.full_gradient)
+        np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
        self.assertSequenceEqual(str(par), str(pcopy))
        self.assert_(pcopy.checkgrad())
--- a/GPy/util/initialization.py
+++ b/GPy/util/initialization.py
@ -8,7 +8,7 @@ import numpy as np
 from GPy.util.pca import pca
 def initialize_latent(init, input_dim, Y):
-    Xr = np.random.randn(Y.shape[0], input_dim)
+    Xr = np.asfortranarray(np.random.randn(Y.shape[0], input_dim))
    if init == 'PCA':
        p = pca(Y)
        PC = p.project(Y, min(input_dim, Y.shape[1]))
@ -20,4 +20,4 @@ def initialize_latent(init, input_dim, Y):
    Xr -= Xr.mean(0)
    Xr /= Xr.var(0)
-    return Xr, var/var.max()
+    return Xr, var/var.max()
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@ -123,7 +123,7 @@ def dtrtrs(A, B, lower=1, trans=0, unitdiag=0):
    :returns:
    """
-    A = force_F_ordered(A)
+    A = np.asfortranarray(A)
    #Note: B does not seem to need to be F ordered!
    return lapack.dtrtrs(A, B, lower=lower, trans=trans, unitdiag=unitdiag)