Merge branch 'devel' of https://github.com/SheffieldML/GPy into devel

This commit is contained in:
Neil Lawrence 2014-05-24 15:31:50 +01:00
commit e3b6d9c9c5
49 changed files with 1817 additions and 867 deletions

View file

@ -10,7 +10,7 @@ from model import Model
from parameterization import ObsAr
from .. import likelihoods
from ..likelihoods.gaussian import Gaussian
from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation, LatentFunctionInference
from parameterization.variational import VariationalPosterior
class GP(Model):
@ -21,6 +21,7 @@ class GP(Model):
:param Y: output observations
:param kernel: a GPy kernel, defaults to rbf+white
:param likelihood: a GPy likelihood
:param :class:`~GPy.inference.latent_function_inference.LatentFunctionInference` inference_method: The inference method to use for this GP
:rtype: model object
.. Note:: Multiple independent outputs are allowed using columns of Y
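A minimal usage sketch for the constructor arguments documented above (not part of this commit); the RBF kernel class name and the GPRegression wrapper are assumptions about this branch:

# Hedged sketch: building a GP as described by the docstring above.
import numpy as np
import GPy   # assumes GPy.kern.RBF and GPy.models.GPRegression exist in this branch

X = np.random.rand(20, 1)                          # input observations
Y = np.sin(3 * X) + 0.05 * np.random.randn(20, 1)  # output observations (one column)
kernel = GPy.kern.RBF(input_dim=1)
m = GPy.models.GPRegression(X, Y, kernel=kernel)   # Gaussian likelihood, exact inference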
@ -32,13 +33,13 @@ class GP(Model):
assert X.ndim == 2
if isinstance(X, (ObsAr, VariationalPosterior)):
self.X = X
else: self.X = ObsAr(X)
self.X = X.copy()
else: self.X = ObsAr(X.copy())
self.num_data, self.input_dim = self.X.shape
assert Y.ndim == 2
self.Y = ObsAr(Y)
self.Y = ObsAr(Y.copy())
assert Y.shape[0] == self.num_data
_, self.output_dim = self.Y.shape
@ -179,40 +180,80 @@ class GP(Model):
return Ysim
def plot_f(self, *args, **kwargs):
def plot_f(self, plot_limits=None, which_data_rows='all',
which_data_ycols='all', fixed_inputs=[],
levels=20, samples=0, fignum=None, ax=None, resolution=None,
plot_raw=True,
linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx'):
"""
Plot the GP's view of the world, where the data is normalized and
before applying a likelihood.
This is a convenience function: arguments are passed to
GPy.plotting.matplot_dep.models_plots.plot_f_fit
Plot the GP's view of the world, where the data is normalized and the likelihood has not yet been applied.
This is a call to plot with plot_raw=True.
Data will not be plotted in this, as the GP's view of the world
may live in another space, or in different units than the data.
"""
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ..plotting.matplot_dep import models_plots
return models_plots.plot_fit_f(self,*args,**kwargs)
kw = {}
if linecol is not None:
kw['linecol'] = linecol
if fillcol is not None:
kw['fillcol'] = fillcol
return models_plots.plot_fit(self, plot_limits, which_data_rows,
which_data_ycols, fixed_inputs,
levels, samples, fignum, ax, resolution,
plot_raw=plot_raw, Y_metadata=Y_metadata,
data_symbol=data_symbol, **kw)
def plot(self, *args, **kwargs):
def plot(self, plot_limits=None, which_data_rows='all',
which_data_ycols='all', fixed_inputs=[],
levels=20, samples=0, fignum=None, ax=None, resolution=None,
plot_raw=False,
linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx'):
"""
Plot the posterior of the GP.
- In one dimension, the function is plotted with a shaded region
identifying two standard deviations.
- In two dimensions, a contour-plot shows the mean predicted
function
- In higher dimensions, use fixed_inputs to plot the GP with some of
the inputs fixed.
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
- In two dimensions, a contour-plot shows the mean predicted function
- In higher dimensions, use fixed_inputs to plot the GP with some of the inputs fixed.
Can plot only part of the data and part of the posterior functions
using which_data_rows which_data_ycols and which_parts
This is a convenience function: arguments are passed to
GPy.plotting.matplot_dep.models_plots.plot_fit
using which_data_rows, which_data_ycols.
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
:type plot_limits: np.array
:param which_data_rows: which of the training data to plot (default all)
:type which_data_rows: 'all' or a slice object to slice model.X, model.Y
:param which_data_ycols: when the data has several columns (independent outputs), only plot these
:type which_data_ycols: 'all' or a list of integers
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
:type fixed_inputs: a list of tuples
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
:type resolution: int
:param levels: number of levels to plot in a contour plot.
:type levels: int
:param samples: the number of a posteriori samples to plot
:type samples: int
:param fignum: figure to plot on.
:type fignum: figure number
:param ax: axes to plot on.
:type ax: axes handle
:type output: integer (first output is 0)
:param linecol: color of line to plot [Tango.colorsHex['darkBlue']]
:param fillcol: color of fill [Tango.colorsHex['lightBlue']]
:param levels: for 2D plotting, the number of contour levels to use; if ax is None, a new figure is created
"""
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ..plotting.matplot_dep import models_plots
return models_plots.plot_fit(self,*args,**kwargs)
kw = {}
if linecol is not None:
kw['linecol'] = linecol
if fillcol is not None:
kw['fillcol'] = fillcol
return models_plots.plot_fit(self, plot_limits, which_data_rows,
which_data_ycols, fixed_inputs,
levels, samples, fignum, ax, resolution,
plot_raw=plot_raw, Y_metadata=Y_metadata,
data_symbol=data_symbol, **kw)
def input_sensitivity(self):
"""
@ -220,3 +261,24 @@ class GP(Model):
"""
return self.kern.input_sensitivity()
def optimize(self, optimizer=None, start=None, **kwargs):
"""
Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
kwargs are passed to the optimizer. They can be:
:param max_f_eval: maximum number of function evaluations
:type max_f_eval: int
:param messages: whether to display messages during optimisation
:type messages: bool
:param optimizer: which optimizer to use (defaults to self.preferred_optimizer)
:type optimizer: string
TODO: valid args
"""
self.inference_method.on_optimization_start()
try:
super(GP, self).optimize(optimizer, start, **kwargs)
except KeyboardInterrupt:
print "KeyboardInterrupt caught, calling on_optimization_end() to round things up"
self.inference_method.on_optimization_end()
raise
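A hedged usage sketch of the optimize() wrapper above, continuing the GPRegression sketch m from the GP docstring; the keyword names follow the docstring and are otherwise assumptions:

# Hedged sketch: the wrapper brackets Model.optimize with the inference hooks.
m.optimize()                               # uses self.preferred_optimizer ('bfgs' below)
m.optimize(optimizer='scg', messages=True) # explicit optimizer, print progress
# A KeyboardInterrupt during optimization still calls
# inference_method.on_optimization_end() before the exception is re-raised.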

View file

@ -20,7 +20,7 @@ class Model(Parameterized):
super(Model, self).__init__(name) # Parameterized.__init__(self)
self.optimization_runs = []
self.sampling_runs = []
self.preferred_optimizer = 'scg'
self.preferred_optimizer = 'bfgs'
def log_likelihood(self):
raise NotImplementedError, "this needs to be implemented to use the model class"
@ -61,7 +61,7 @@ class Model(Parameterized):
on the current machine.
"""
initial_parameters = self._get_params_transformed()
initial_parameters = self.optimizer_array
if parallel:
try:
@ -124,13 +124,15 @@ class Model(Parameterized):
For probabilistic models this is the negative log_likelihood
(including the MAP prior), so we return it here. If your model is not
probabilistic, just return your objective here!
probabilistic, just return your objective to minimize here!
"""
return -float(self.log_likelihood()) - self.log_prior()
def objective_function_gradients(self):
"""
The gradients for the objective function for the given algorithm.
The gradients returned are those of the *negative* objective function, as
this framework works with *negative* log-likelihoods as a default.
You can find the gradients for the parameters in self.gradient at all times;
this is where the gradients for the parameters are stored.
@ -141,7 +143,7 @@ class Model(Parameterized):
For probabilistic models this is the gradient of the negative log_likelihood
(including the MAP prior), so we return it here. If your model is not
probabilistic, just return your gradient here!
probabilistic, just return your *negative* gradient here!
"""
return -(self._log_likelihood_gradients() + self._log_prior_gradients())
@ -157,7 +159,8 @@ class Model(Parameterized):
:type x: np.array
"""
try:
self._set_params_transformed(x)
# self._set_params_transformed(x)
self.optimizer_array = x
obj_grads = self._transform_gradients(self.objective_function_gradients())
self._fail_count = 0
except (LinAlgError, ZeroDivisionError, ValueError):
@ -180,7 +183,7 @@ class Model(Parameterized):
:type x: np.array
"""
try:
self._set_params_transformed(x)
self.optimizer_array = x
obj = self.objective_function()
self._fail_count = 0
except (LinAlgError, ZeroDivisionError, ValueError):
@ -192,7 +195,7 @@ class Model(Parameterized):
def _objective_grads(self, x):
try:
self._set_params_transformed(x)
self.optimizer_array = x
obj_f, obj_grads = self.objective_function(), self._transform_gradients(self.objective_function_gradients())
self._fail_count = 0
except (LinAlgError, ZeroDivisionError, ValueError):
@ -220,13 +223,13 @@ class Model(Parameterized):
if self.is_fixed:
raise RuntimeError, "Cannot optimize, when everything is fixed"
if self.size == 0:
raise RuntimeError, "Model without parameters cannot be minimized"
raise RuntimeError, "Model without parameters cannot be optimized"
if optimizer is None:
optimizer = self.preferred_optimizer
if start == None:
start = self._get_params_transformed()
start = self.optimizer_array
optimizer = optimization.get_optimizer(optimizer)
opt = optimizer(start, model=self, **kwargs)
@ -235,7 +238,7 @@ class Model(Parameterized):
self.optimization_runs.append(opt)
self._set_params_transformed(opt.x_opt)
self.optimizer_array = opt.x_opt
def optimize_SGD(self, momentum=0.1, learning_rate=0.01, iterations=20, **kwargs):
# assert self.Y.shape[1] > 1, "SGD only works with D > 1"
@ -260,7 +263,7 @@ class Model(Parameterized):
The gradient is considered correct if the ratio of the analytical
and numerical gradients is within <tolerance> of unity.
"""
x = self._get_params_transformed().copy()
x = self.optimizer_array.copy()
if not verbose:
# make sure only to test the selected parameters
@ -270,8 +273,8 @@ class Model(Parameterized):
transformed_index = self._raveled_index_for(target_param)
if self._has_fixes():
indices = np.r_[:self.size]
which = (transformed_index[:,None]==indices[self._fixes_][None,:]).nonzero()
transformed_index = (indices-(~self._fixes_).cumsum())[transformed_index[which[0]]]
which = (transformed_index[:, None] == indices[self._fixes_][None, :]).nonzero()
transformed_index = (indices - (~self._fixes_).cumsum())[transformed_index[which[0]]]
if transformed_index.size == 0:
print "No free parameters to check"
@ -290,7 +293,7 @@ class Model(Parameterized):
gradient = gradient[transformed_index]
denominator = (2 * np.dot(dx, gradient))
global_ratio = (f1 - f2) / np.where(denominator==0., 1e-32, denominator)
global_ratio = (f1 - f2) / np.where(denominator == 0., 1e-32, denominator)
global_diff = np.abs(f1 - f2) < tolerance and np.allclose(gradient, 0, atol=tolerance)
if global_ratio is np.nan:
global_ratio = 0
@ -319,10 +322,10 @@ class Model(Parameterized):
param_index = self._raveled_index_for(target_param)
if self._has_fixes():
indices = np.r_[:self.size]
which = (param_index[:,None]==indices[self._fixes_][None,:]).nonzero()
which = (param_index[:, None] == indices[self._fixes_][None, :]).nonzero()
param_index = param_index[which[0]]
transformed_index = (indices-(~self._fixes_).cumsum())[param_index]
#print param_index, transformed_index
transformed_index = (indices - (~self._fixes_).cumsum())[param_index]
# print param_index, transformed_index
else:
transformed_index = param_index
@ -340,7 +343,7 @@ class Model(Parameterized):
xx[xind] -= 2.*step
f2 = self._objective(xx)
numerical_gradient = (f1 - f2) / (2 * step)
if np.all(gradient[xind]==0): ratio = (f1-f2) == gradient[xind]
if np.all(gradient[xind] == 0): ratio = (f1 - f2) == gradient[xind]
else: ratio = (f1 - f2) / (2 * step * gradient[xind])
difference = np.abs((f1 - f2) / 2 / step - gradient[xind])
@ -358,7 +361,7 @@ class Model(Parameterized):
grad_string = "{0:<{c0}}|{1:^{c1}}|{2:^{c2}}|{3:^{c3}}|{4:^{c4}}".format(formatted_name, r, d, g, ng, c0=cols[0] + 9, c1=cols[1], c2=cols[2], c3=cols[3], c4=cols[4])
print grad_string
self._set_params_transformed(x)
self.optimizer_array = x
return ret

View file

@ -7,6 +7,20 @@ import numpy
from numpy.lib.function_base import vectorize
from lists_and_dicts import IntArrayDict
def extract_properties_to_index(index, props):
prop_index = dict()
for i, cl in enumerate(props):
for c in cl:
ind = prop_index.get(c, list())
ind.append(index[i])
prop_index[c] = ind
for c, i in prop_index.items():
prop_index[c] = numpy.array(i, dtype=int)
return prop_index
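A small worked example (not part of this commit) of what extract_properties_to_index computes; it mirrors the properties_to_index_dict docstring further down:

# Hedged sketch: invert a per-index property listing into a property -> indices dict.
index = [2, 3, 5]
props = [['one'], ['one', 'two'], ['two']]          # properties_for(index)-style input
extract_properties_to_index(index, props)
# -> {'one': array([2, 3]), 'two': array([3, 5])}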
class ParameterIndexOperations(object):
'''
Index operations for storing param index _properties
@ -66,8 +80,34 @@ class ParameterIndexOperations(object):
return self._properties.values()
def properties_for(self, index):
"""
Returns a list of properties, such that each entry in the list corresponds
to the respective element of the given index.
Example:
let properties: 'one':[1,2,3,4], 'two':[3,5,6]
>>> properties_for([2,3,5])
[['one'], ['one', 'two'], ['two']]
"""
return vectorize(lambda i: [prop for prop in self.iterproperties() if i in self[prop]], otypes=[list])(index)
def properties_to_index_dict(self, index):
"""
Return a dictionary containing properties as keys and indices as values.
Thus, the indices for each contained constraint are collected in
one dictionary.
Example:
let properties: 'one':[1,2,3,4], 'two':[3,5,6]
>>> properties_to_index_dict([2,3,5])
{'one':[2,3], 'two':[3,5]}
"""
props = self.properties_for(index)
prop_index = extract_properties_to_index(index, props)
return prop_index
def add(self, prop, indices):
self._properties[prop] = combine_indices(self._properties[prop], indices)
@ -174,8 +214,32 @@ class ParameterIndexOperationsView(object):
def properties_for(self, index):
"""
Returns a list of properties, such that each entry in the list corresponds
to the respective element of the given index.
Example:
let properties: 'one':[1,2,3,4], 'two':[3,5,6]
>>> properties_for([2,3,5])
[['one'], ['one', 'two'], ['two']]
"""
return vectorize(lambda i: [prop for prop in self.iterproperties() if i in self[prop]], otypes=[list])(index)
def properties_to_index_dict(self, index):
"""
Return a dictionary containing properties as keys and indices as values.
Thus, the indices for each contained constraint are collected in
one dictionary.
Example:
let properties: 'one':[1,2,3,4], 'two':[3,5,6]
>>> properties_to_index_dict([2,3,5])
{'one':[2,3], 'two':[3,5]}
"""
return extract_properties_to_index(index, self.properties_for(index))
def add(self, prop, indices):
self._param_index_ops.add(prop, indices+self._offset)

View file

@ -38,7 +38,12 @@ class ArrayList(list):
raise ValueError, "{} is not in list".format(item)
pass
class ObservablesList(object):
class ObserverList(object):
"""
A list which contains the observers.
It only holds weak references to observers, so that otherwise unreferenced
observers don't dangle in memory.
"""
def __init__(self):
self._poc = []
@ -46,31 +51,44 @@ class ObservablesList(object):
p,o,c = self._poc[ind]
return p, o(), c
def remove(self, priority, observable, callble):
def remove(self, priority, observer, callble):
"""
Remove one observer, which had priority and callble.
"""
self.flush()
for i in range(len(self) - 1, -1, -1):
p,o,c = self[i]
if priority==p and observable==o and callble==c:
if priority==p and observer==o and callble==c:
del self._poc[i]
def __repr__(self):
return self._poc.__repr__()
def add(self, priority, observable, callble):
if observable is not None:
def add(self, priority, observer, callble):
"""
Add an observer with priority and callble
"""
if observer is not None:
ins = 0
for pr, _, _ in self:
if priority > pr:
break
ins += 1
self._poc.insert(ins, (priority, weakref.ref(observable), callble))
self._poc.insert(ins, (priority, weakref.ref(observer), callble))
def __str__(self):
from . import ObsAr, Param
from parameter_core import Parameterizable
ret = []
curr_p = None
def frmt(o):
if isinstance(o, ObsAr):
return 'ObsAr <{}>'.format(hex(id(o)))
elif isinstance(o, (Param,Parameterizable)):
return '{}'.format(o.hierarchy_name())
else:
return repr(o)
for p, o, c in self:
curr = ''
if curr_p != p:
@ -79,10 +97,14 @@ class ObservablesList(object):
else: curr_pre = " "*len(pre)
curr_p = p
curr += curr_pre
ret.append(curr + ", ".join(map(repr, [o,c])))
return '\n'.join(ret)
ret.append(curr + ", ".join([frmt(o), str(c)]))
return '\n'.join(ret)
def flush(self):
"""
Make sure all weak references which point to nothing are flushed (deleted).
"""
self._poc = [(p,o,c) for p,o,c in self._poc if o() is not None]
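A hedged sketch (not part of this commit) of the weak-reference behaviour described in the ObserverList docstring; the import path is an assumption:

# Hedged sketch: observers are held by weak reference only.
from GPy.core.parameterization.lists_and_dicts import ObserverList  # path assumed

class Dummy(object):
    pass

def callback(me, which=None):
    pass

ol = ObserverList()
obs = Dummy()
ol.add(0, obs, callback)   # len(ol) == 1
del obs                    # the only strong reference to the observer is gone
ol.flush()                 # dead weak references are dropped, len(ol) == 0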
def __iter__(self):
@ -95,7 +117,7 @@ class ObservablesList(object):
return self._poc.__len__()
def __deepcopy__(self, memo):
s = ObservablesList()
s = ObserverList()
for p,o,c in self:
import copy
s.add(p, copy.deepcopy(o, memo), copy.deepcopy(c, memo))

View file

@ -30,16 +30,22 @@ class ObsAr(np.ndarray, Pickleable, Observable):
def __array_wrap__(self, out_arr, context=None):
return out_arr.view(np.ndarray)
def _setup_observers(self):
# do not setup anything, as observable arrays do not have default observers
pass
def copy(self):
from lists_and_dicts import ObserverList
memo = {}
memo[id(self)] = self
memo[id(self.observers)] = ObserverList()
return self.__deepcopy__(memo)
def __deepcopy__(self, memo):
s = self.__new__(self.__class__, input_array=self.view(np.ndarray).copy())
memo[id(self)] = s
import copy
s.__dict__.update(copy.deepcopy(self.__dict__, memo))
Pickleable.__setstate__(s, copy.deepcopy(self.__getstate__(), memo))
return s
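A hedged sketch (not part of this commit) of the ObsAr.copy() semantics above: the data is deep-copied while the copy starts with a fresh, empty observer list; the import path is an assumption:

# Hedged sketch: copying an ObsAr detaches it from the original's observers.
import numpy as np
from GPy.core.parameterization import ObsAr   # path assumed

a = ObsAr(np.arange(4.))
b = a.copy()
b[0] = -1.   # does not touch a, and does not notify a's observers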
def __reduce__(self):

View file

@ -4,7 +4,7 @@
import itertools
import numpy
np = numpy
from parameter_core import OptimizationHandlable, adjust_name_for_printing
from parameter_core import Parameterizable, adjust_name_for_printing, Pickleable
from observable_array import ObsAr
###### printing
@ -16,7 +16,7 @@ __precision__ = numpy.get_printoptions()['precision'] # numpy printing precision
__print_threshold__ = 5
######
class Param(OptimizationHandlable, ObsAr):
class Param(Parameterizable, ObsAr):
"""
Parameter object for GPy models.
@ -42,7 +42,7 @@ class Param(OptimizationHandlable, ObsAr):
"""
__array_priority__ = -1 # Never give back Param
_fixes_ = None
_parameters_ = []
parameters = []
def __new__(cls, name, input_array, default_constraint=None):
obj = numpy.atleast_1d(super(Param, cls).__new__(cls, input_array=input_array))
obj._current_slice_ = (slice(obj.shape[0]),)
@ -87,6 +87,9 @@ class Param(OptimizationHandlable, ObsAr):
@property
def param_array(self):
"""
As we are a leaf, this just returns self
"""
return self
@property
@ -139,6 +142,9 @@ class Param(OptimizationHandlable, ObsAr):
def _raveled_index_for(self, obj):
return self._raveled_index()
#===========================================================================
# Index recreation
#===========================================================================
def _expand_index(self, slice_index=None):
# this calculates the full indexing arrays from the slicing objects given by get_item for _real..._ attributes
# it basically translates slices to their respective index arrays and turns negative indices around
@ -147,6 +153,8 @@ class Param(OptimizationHandlable, ObsAr):
slice_index = self._current_slice_
def f(a):
a, b = a
if isinstance(a, numpy.ndarray) and a.dtype == bool:
raise ValueError, "Boolean indexing not implemented, use Param[np.where(index)] to index by boolean arrays!"
if a not in (slice(None), Ellipsis):
if isinstance(a, slice):
start, stop, step = a.indices(b)
@ -165,34 +173,6 @@ class Param(OptimizationHandlable, ObsAr):
def _ensure_fixes(self):
if not self._has_fixes(): self._fixes_ = numpy.ones(self._realsize_, dtype=bool)
#===========================================================================
# parameterizable
#===========================================================================
def traverse(self, visit, *args, **kwargs):
"""
Traverse the hierarchy performing visit(self, *args, **kwargs) at every node passed by.
See "visitor pattern" in literature. This is implemented in pre-order fashion.
This function will just call visit on self, as Param objects are leaf nodes.
"""
visit(self, *args, **kwargs)
def traverse_parents(self, visit, *args, **kwargs):
"""
Traverse the hierarchy upwards, visiting all parents and their children, except self.
See "visitor pattern" in literature. This is implemented in pre-order fashion.
Example:
parents = []
self.traverse_parents(parents.append)
print parents
"""
if self.has_parent():
self.__visited = True
self._parent_._traverse_parents(visit, *args, **kwargs)
self.__visited = False
#===========================================================================
# Convenience
#===========================================================================
@ -207,14 +187,24 @@ class Param(OptimizationHandlable, ObsAr):
#===========================================================================
# Pickling and copying
#===========================================================================
def copy(self):
return Parameterizable.copy(self, which=self)
def __deepcopy__(self, memo):
s = self.__new__(self.__class__, name=self.name, input_array=self.view(numpy.ndarray).copy())
memo[id(self)] = s
memo[id(self)] = s
import copy
s.__dict__.update(copy.deepcopy(self.__dict__, memo))
Pickleable.__setstate__(s, copy.deepcopy(self.__getstate__(), memo))
return s
def _setup_observers(self):
"""
Setup the default observers
1: pass through to parent, if present
"""
if self.has_parent():
self.add_observer(self._parent_, self._parent_._pass_through_notify_observers, -np.inf)
#===========================================================================
# Printing -> done
#===========================================================================
@ -316,7 +306,7 @@ class Param(OptimizationHandlable, ObsAr):
class ParamConcatenation(object):
def __init__(self, params):
"""
Parameter concatenation for convienience of printing regular expression matched arrays
Parameter concatenation for convenience of printing regular expression matched arrays
you can index this concatenation as if it was the flattened concatenation
of all the parameters it contains, same for setting parameters (Broadcasting enabled).

View file

@ -16,8 +16,9 @@ Observable Pattern for parameterization
from transformations import Logexp, NegativeLogexp, Logistic, __fixed__, FIXED, UNFIXED
import numpy as np
import re
import logging
__updated__ = '2014-05-12'
__updated__ = '2014-05-21'
class HierarchyError(Exception):
"""
@ -49,21 +50,49 @@ class Observable(object):
as an observer. Every time the observable changes, it sends a notification with
self as only argument to all its observers.
"""
_updated = True
_updates = True
def __init__(self, *args, **kwargs):
super(Observable, self).__init__()
from lists_and_dicts import ObservablesList
self.observers = ObservablesList()
from lists_and_dicts import ObserverList
self.observers = ObserverList()
@property
def updates(self):
p = getattr(self, '_highest_parent_', None)
if p is not None:
self._updates = p._updates
return self._updates
@updates.setter
def updates(self, ups):
assert isinstance(ups, bool), "updates are either on (True) or off (False)"
p = getattr(self, '_highest_parent_', None)
if p is not None:
p._updates = ups
else:
self._updates = ups
if ups:
self._trigger_params_changed()
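A hedged sketch of the intended use of the new updates switch, continuing the GPRegression sketch m from earlier in this diff; the rbf attribute and parameter names are assumptions:

# Hedged sketch: batch several parameter changes into one recomputation.
m.updates = False        # notify_observers returns early from here on
m.rbf.lengthscale = 2.   # hypothetical parameter names
m.rbf.variance = 0.5
m.updates = True         # the setter triggers a single _trigger_params_changed()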
def add_observer(self, observer, callble, priority=0):
"""
Add an observer `observer` with the callback `callble`
and priority `priority` to this observers list.
"""
self.observers.add(priority, observer, callble)
def remove_observer(self, observer, callble=None):
"""
Either (if callble is None) remove all callables,
which were added alongside observer,
or remove callable `callble` which was added alongside
the observer `observer`.
"""
to_remove = []
for poc in self.observers:
_, obs, clble = poc
if callble is not None:
if (obs == observer) and (callble == clble):
if (obs is observer) and (callble == clble):
to_remove.append(poc)
else:
if obs is observer:
@ -81,6 +110,8 @@ class Observable(object):
:param min_priority: only notify observers with priority > min_priority
if min_priority is None, notify all observers in order
"""
if not self.updates:
return
if which is None:
which = self
if min_priority is None:
@ -91,10 +122,6 @@ class Observable(object):
break
callble(self, which=which)
#===============================================================================
# Foundation framework for parameterized and param objects:
#===============================================================================
class Parentable(object):
"""
Enable an Object to have a parent.
@ -151,6 +178,7 @@ class Pickleable(object):
"""
def __init__(self, *a, **kw):
super(Pickleable, self).__init__()
#===========================================================================
# Pickling operations
#===========================================================================
@ -171,37 +199,49 @@ class Pickleable(object):
#===========================================================================
# copy and pickling
#===========================================================================
def copy(self):
"""Returns a (deep) copy of the current model"""
def copy(self, memo=None, which=None):
"""
Returns a (deep) copy of the current parameter handle.
All connections to parents of the copy will be cut.
:param dict memo: memo for deepcopy
:param Parameterized which: parameterized object which started the copy process [default: self]
"""
#raise NotImplementedError, "Copy is not yet implemented, TODO: Observable hierarchy"
if memo is None:
memo = {}
import copy
memo = {}
# the next part makes sure that we do not include parents in any form:
parents = []
self.traverse_parents(parents.append) # collect parents
if which is None:
which = self
which.traverse_parents(parents.append) # collect parents
for p in parents:
memo[id(p)] = None # set all parents to be None, so they will not be copied
memo[id(self.gradient)] = None # reset the gradient
memo[id(self.param_array)] = None # and param_array
memo[id(self._fixes_)] = None # fixes have to be reset, as this is now highest parent
c = copy.deepcopy(self, memo) # and start the copy
c._parent_index_ = None
return c
if not memo.has_key(id(p)):memo[id(p)] = None # set all parents to be None, so they will not be copied
if not memo.has_key(id(self.gradient)):memo[id(self.gradient)] = None # reset the gradient
if not memo.has_key(id(self._fixes_)):memo[id(self._fixes_)] = None # fixes have to be reset, as this is now highest parent
copy = copy.deepcopy(self, memo) # and start the copy
copy._parent_index_ = None
copy._trigger_params_changed()
return copy
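A hedged sketch of the copy() semantics documented above, again continuing the GPRegression sketch m; the rbf attribute name is an assumption:

# Hedged sketch: copies are standalone handles with their parent links cut.
k = m.rbf.copy()       # deep copy of a parameterized sub-object
k.has_parent()         # -> False: parents were mapped to None in the memo
k.lengthscale = 1.5    # changing the copy leaves m.rbf untouched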
def __deepcopy__(self, memo):
s = self.__new__(self.__class__) # fresh instance
memo[id(self)] = s # be sure to break all cycles --> self is already done
import copy
s.__dict__.update(copy.deepcopy(self.__dict__, memo)) # standard copy
s.__setstate__(copy.deepcopy(self.__getstate__(), memo)) # standard copy
return s
def __getstate__(self):
ignore_list = ([#'_parent_', '_parent_index_',
#'observers',
'_param_array_', '_gradient_array_', '_fixes_',
'_Cacher_wrap__cachers']
#+ self.parameter_names(recursive=False)
)
ignore_list = ['_param_array_', # parameters get set from bottom to top
'_gradient_array_', # as well as gradients
'_optimizer_copy_',
'logger',
'observers',
'_fixes_', # and fixes
'_Cacher_wrap__cachers', # never pickle cachers
]
dc = dict()
for k,v in self.__dict__.iteritems():
if k not in ignore_list:
@ -210,7 +250,11 @@ class Pickleable(object):
def __setstate__(self, state):
self.__dict__.update(state)
return self
from lists_and_dicts import ObserverList
self.observers = ObserverList()
self._setup_observers()
self._optimizer_copy_transformed = False
class Gradcheckable(Pickleable, Parentable):
"""
@ -246,7 +290,6 @@ class Gradcheckable(Pickleable, Parentable):
"""
raise HierarchyError, "This parameter is not in a model with a likelihood, and, therefore, cannot be gradient checked!"
class Nameable(Gradcheckable):
"""
Make an object nameable inside the hierarchy.
@ -285,41 +328,8 @@ class Nameable(Gradcheckable):
return self._parent_.hierarchy_name() + "." + adjust(self.name)
return adjust(self.name)
class Indexable(object):
"""
Enable raveled indexes and offsets for this object.
The raveled index of an object is the index for its parameters in a flattened int array.
"""
def __init__(self, *a, **kw):
super(Indexable, self).__init__()
def _raveled_index(self):
"""
Flattened array of ints, specifying the index of this object.
This has to account for shaped parameters!
"""
raise NotImplementedError, "Need to be able to get the raveled Index"
def _offset_for(self, param):
"""
Return the offset of the param inside this parameterized object.
This does not need to account for shaped parameters, as it
basically just sums up the parameter sizes which come before param.
"""
return 0
#raise NotImplementedError, "shouldnt happen, offset required from non parameterization object?"
def _raveled_index_for(self, param):
"""
get the raveled index for a param
that is an int array, containing the indexes for the flattened
param inside this parameterized logic.
"""
return param._raveled_index()
#raise NotImplementedError, "shouldnt happen, raveld index transformation required from non parameterization object?"
class Constrainable(Nameable, Indexable, Observable):
class Indexable(Nameable, Observable):
"""
Make an object constrainable with Priors and Transformations.
TODO: Mappings!!
@ -330,7 +340,7 @@ class Constrainable(Nameable, Indexable, Observable):
:func:`constrain()` and :func:`unconstrain()` are main methods here
"""
def __init__(self, name, default_constraint=None, *a, **kw):
super(Constrainable, self).__init__(name=name, *a, **kw)
super(Indexable, self).__init__(name=name, *a, **kw)
self._default_constraint_ = default_constraint
from index_operations import ParameterIndexOperations
self.constraints = ParameterIndexOperations()
@ -352,6 +362,39 @@ class Constrainable(Nameable, Indexable, Observable):
self._connect_fixes()
self._notify_parent_change()
#===========================================================================
# Indexable
#===========================================================================
def _offset_for(self, param):
"""
Return the offset of the param inside this parameterized object.
This does not need to account for shaped parameters, as it
basically just sums up the parameter sizes which come before param.
"""
if param.has_parent():
if param._parent_._get_original(param) in self.parameters:
return self._param_slices_[param._parent_._get_original(param)._parent_index_].start
return self._offset_for(param._parent_) + param._parent_._offset_for(param)
return 0
def _raveled_index_for(self, param):
"""
get the raveled index for a param
that is an int array, containing the indexes for the flattened
param inside this parameterized logic.
"""
from param import ParamConcatenation
if isinstance(param, ParamConcatenation):
return np.hstack((self._raveled_index_for(p) for p in param.params))
return param._raveled_index() + self._offset_for(param)
def _raveled_index(self):
"""
Flattened array of ints, specifying the index of this object.
This has to account for shaped parameters!
"""
return np.r_[:self.size]
#===========================================================================
# Fixing Parameters:
#===========================================================================
@ -363,8 +406,10 @@ class Constrainable(Nameable, Indexable, Observable):
"""
if value is not None:
self[:] = value
reconstrained = self.unconstrain()
index = self._add_to_index_operations(self.constraints, reconstrained, __fixed__, warning)
index = self._raveled_index()
#reconstrained = self.unconstrain()
index = self._add_to_index_operations(self.constraints, index, __fixed__, warning)
self._highest_parent_._set_fixed(self, index)
self.notify_observers(self, None if trigger_parent else -np.inf)
return index
@ -406,9 +451,24 @@ class Constrainable(Nameable, Indexable, Observable):
self._fixes_ = None
del self.constraints[__fixed__]
#===========================================================================
# Convenience for fixed
#===========================================================================
def _has_fixes(self):
return hasattr(self, "_fixes_") and self._fixes_ is not None and self._fixes_.size == self.size
@property
def is_fixed(self):
for p in self.parameters:
if not p.is_fixed: return False
return True
def _get_original(self, param):
# if advanced indexing is activated it happens that the array is a copy
# you can retrieve the original param through this method, by passing
# the copy here
return self.parameters[param._parent_index_]
#===========================================================================
# Prior Operations
#===========================================================================
@ -432,8 +492,7 @@ class Constrainable(Nameable, Indexable, Observable):
def unset_priors(self, *priors):
"""
Un-set all priors given from this parameter handle.
Un-set all priors given (in *priors) from this parameter handle.
"""
return self._remove_from_index_operations(self.priors, priors)
@ -535,7 +594,7 @@ class Constrainable(Nameable, Indexable, Observable):
self.constraints = ParameterIndexOperationsView(parent.constraints, parent._offset_for(self), self.size)
self.priors = ParameterIndexOperationsView(parent.priors, parent._offset_for(self), self.size)
self._fixes_ = None
for p in self._parameters_:
for p in self.parameters:
p._parent_changed(parent)
def _add_to_index_operations(self, which, reconstrained, what, warning):
@ -563,53 +622,142 @@ class Constrainable(Nameable, Indexable, Observable):
removed = np.empty((0,), dtype=int)
for t in transforms:
unconstrained = which.remove(t, self._raveled_index())
print unconstrained
removed = np.union1d(removed, unconstrained)
if t is __fixed__:
self._highest_parent_._set_unfixed(self, unconstrained)
return removed
class OptimizationHandlable(Constrainable):
class OptimizationHandlable(Indexable):
"""
This enables optimization handles on an Object as done in GPy 0.4.
`..._transformed`: make sure the transformations and constraints etc are handled
`..._optimizer_copy_transformed`: make sure the transformations and constraints etc are handled
"""
def __init__(self, name, default_constraint=None, *a, **kw):
super(OptimizationHandlable, self).__init__(name, default_constraint=default_constraint, *a, **kw)
self._optimizer_copy_ = None
self._optimizer_copy_transformed = False
def _get_params_transformed(self):
# transformed parameters (apply transformation rules)
p = self.param_array.copy()
[np.put(p, ind, c.finv(p[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__]
if self.has_parent() and self.constraints[__fixed__].size != 0:
fixes = np.ones(self.size).astype(bool)
fixes[self.constraints[__fixed__]] = FIXED
return p[fixes]
elif self._has_fixes():
return p[self._fixes_]
return p
def _set_params_transformed(self, p):
if not(p is self.param_array):
#===========================================================================
# Optimizer copy
#===========================================================================
@property
def optimizer_array(self):
"""
Array for the optimizer to work on.
This array always lives in the space of the optimizer: it holds the
untransformed values, obtained by going back through the Transformations.
Setting this array will make sure the transformed parameters of this model
are set accordingly. It has to be set with an array retrieved from
this property, as e.g. fixing will resize the array.
The optimizer should only interact with this array, so that the
transformations are kept intact.
"""
if self.__dict__.get('_optimizer_copy_', None) is None or self.size != self._optimizer_copy_.size:
self._optimizer_copy_ = np.empty(self.size)
if not self._optimizer_copy_transformed:
self._optimizer_copy_.flat = self.param_array.flat
[np.put(self._optimizer_copy_, ind, c.finv(self.param_array[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__]
if self.has_parent() and self.constraints[__fixed__].size != 0:
fixes = np.ones(self.size).astype(bool)
fixes[self.constraints[__fixed__]] = FIXED
self.param_array.flat[fixes] = p
elif self._has_fixes(): self.param_array.flat[self._fixes_] = p
else: self.param_array.flat = p
[np.put(self.param_array, ind, c.f(self.param_array.flat[ind]))
for c, ind in self.constraints.iteritems() if c != __fixed__]
return self._optimizer_copy_[fixes]
elif self._has_fixes():
return self._optimizer_copy_[self._fixes_]
self._optimizer_copy_transformed = True
return self._optimizer_copy_
@optimizer_array.setter
def optimizer_array(self, p):
"""
Make sure the optimizer copy itself does not get replaced; we only want to
set the values *inside* it, not the array itself.
We also update param_array in here.
"""
f = None
if self.has_parent() and self.constraints[__fixed__].size != 0:
f = np.ones(self.size).astype(bool)
f[self.constraints[__fixed__]] = FIXED
elif self._has_fixes():
f = self._fixes_
if f is None:
self.param_array.flat = p
[np.put(self.param_array, ind, c.f(self.param_array.flat[ind]))
for c, ind in self.constraints.iteritems() if c != __fixed__]
else:
self.param_array.flat[f] = p
[np.put(self.param_array, ind[f[ind]], c.f(self.param_array.flat[ind[f[ind]]]))
for c, ind in self.constraints.iteritems() if c != __fixed__]
self._optimizer_copy_transformed = False
self._trigger_params_changed()
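A hedged sketch of the round trip through optimizer_array, continuing the GPRegression sketch m: the optimizer sees unconstrained values, while param_array stays in the constrained domain (the parameter path is an assumption):

# Hedged sketch: optimizer_array lives in the optimizer's (untransformed) space.
m.rbf.variance.constrain_positive()   # e.g. a Logexp transformation
x = m.optimizer_array                 # c.finv() applied, fixed parameters removed
m.optimizer_array = x                 # the setter maps back through c.f() into param_array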
def _get_params_transformed(self):
raise DeprecationWarning, "_get|set_params{_optimizer_copy_transformed} is deprecated, use self.optimizer array insetad!"
# # transformed parameters (apply un-transformation rules)
# p = self.param_array.copy()
# [np.put(p, ind, c.finv(p[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__]
# if self.has_parent() and self.constraints[__fixed__].size != 0:
# fixes = np.ones(self.size).astype(bool)
# fixes[self.constraints[__fixed__]] = FIXED
# return p[fixes]
# elif self._has_fixes():
# return p[self._fixes_]
# return p
#
def _set_params_transformed(self, p):
raise DeprecationWarning, "_get|set_params_transformed is deprecated, use self.optimizer_array instead!"
# """
# Set parameters p, but make sure they get transformed before setting.
# This means, the optimizer sees p, whereas the model sees transformed(p),
# such that, the parameters the model sees are in the right domain.
# """
# if not(p is self.param_array):
# if self.has_parent() and self.constraints[__fixed__].size != 0:
# fixes = np.ones(self.size).astype(bool)
# fixes[self.constraints[__fixed__]] = FIXED
# self.param_array.flat[fixes] = p
# elif self._has_fixes(): self.param_array.flat[self._fixes_] = p
# else: self.param_array.flat = p
# [np.put(self.param_array, ind, c.f(self.param_array.flat[ind]))
# for c, ind in self.constraints.iteritems() if c != __fixed__]
# self._trigger_params_changed()
def _trigger_params_changed(self, trigger_parent=True):
[p._trigger_params_changed(trigger_parent=False) for p in self._parameters_]
"""
First tell all children to update,
then update yourself.
If trigger_parent is True, we will tell the parent, otherwise not.
"""
[p._trigger_params_changed(trigger_parent=False) for p in self.parameters]
self.notify_observers(None, None if trigger_parent else -np.inf)
def _size_transformed(self):
"""
As fixes are not passed to the optimiser, the size of the model for the optimiser
is the size of all parameters minus the size of the fixes.
"""
return self.size - self.constraints[__fixed__].size
def _transform_gradients(self, g):
"""
Transform the gradients by multiplying each gradient by the gradient
factor of its constraint.
"""
if self.has_parent():
return g
[np.put(g, i, g[i] * c.gradfactor(self.param_array[i])) for c, i in self.constraints.iteritems() if c != __fixed__]
if self._has_fixes(): return g[self._fixes_]
return g
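The gradfactor multiplication above is just the chain rule for the reparameterization theta = c.f(x); below is a self-contained toy check (not part of this commit) with an exponential transform rather than GPy's Logexp:

# Hedged toy check of the chain rule used in _transform_gradients.
# With theta = exp(x) we have dtheta/dx = theta, i.e. gradfactor(theta) = theta.
import numpy as np
theta = 1.3
dobj_dtheta = 0.7                  # gradient in the constrained space
dobj_dx = dobj_dtheta * theta      # gradient the optimizer should see
x, eps = np.log(theta), 1e-6
# finite difference of obj(x) = dobj_dtheta * exp(x) around x
fd = (dobj_dtheta * np.exp(x + eps) - dobj_dtheta * np.exp(x - eps)) / (2 * eps)
assert abs(fd - dobj_dx) < 1e-6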
@property
def num_params(self):
"""
@ -628,8 +776,8 @@ class OptimizationHandlable(Constrainable):
"""
if adjust_for_printing: adjust = lambda x: adjust_name_for_printing(x)
else: adjust = lambda x: x
if recursive: names = [xi for x in self._parameters_ for xi in x.parameter_names(add_self=True, adjust_for_printing=adjust_for_printing)]
else: names = [adjust(x.name) for x in self._parameters_]
if recursive: names = [xi for x in self.parameters for xi in x.parameter_names(add_self=True, adjust_for_printing=adjust_for_printing)]
else: names = [adjust(x.name) for x in self.parameters]
if add_self: names = map(lambda x: adjust(self.name) + "." + x, names)
return names
@ -651,7 +799,7 @@ class OptimizationHandlable(Constrainable):
Randomize the model.
Make this draw from the prior if one exists, else draw from given random generator
:param rand_gen: numpy random number generator which takes args and kwargs
:param rand_gen: np random number generator which takes args and kwargs
:param float loc: loc parameter for random number generator
:param float scale: scale parameter for random number generator
:param args, kwargs: will be passed through to random number generator
@ -660,14 +808,14 @@ class OptimizationHandlable(Constrainable):
x = rand_gen(loc=loc, scale=scale, size=self._size_transformed(), *args, **kwargs)
# now draw from prior where possible
[np.put(x, ind, p.rvs(ind.size)) for p, ind in self.priors.iteritems() if not p is None]
self._set_params_transformed(x) # makes sure all of the tied parameters get the same init (since there's only one prior object...)
self.optimizer_array = x # makes sure all of the tied parameters get the same init (since there's only one prior object...)
#===========================================================================
# For shared memory arrays. This does nothing in Param, but sets the memory
# for all parameterized objects
#===========================================================================
@property
def full_gradient(self):
def gradient_full(self):
"""
Note to users:
This does not return the gradient in the right shape! Use self.gradient
@ -681,27 +829,45 @@ class OptimizationHandlable(Constrainable):
return self._gradient_array_
def _propagate_param_grad(self, parray, garray):
"""
For propagating the param_array and gradient_array.
This ensures the in memory view of each subsequent array.
1.) connect param_array of children to self.param_array
2.) tell all children to propagate further
"""
pi_old_size = 0
for pi in self._parameters_:
for pi in self.parameters:
pislice = slice(pi_old_size, pi_old_size + pi.size)
self.param_array[pislice] = pi.param_array.flat # , requirements=['C', 'W']).flat
self.full_gradient[pislice] = pi.full_gradient.flat # , requirements=['C', 'W']).flat
self.gradient_full[pislice] = pi.gradient_full.flat # , requirements=['C', 'W']).flat
pi.param_array.data = parray[pislice].data
pi.full_gradient.data = garray[pislice].data
pi.gradient_full.data = garray[pislice].data
pi._propagate_param_grad(parray[pislice], garray[pislice])
pi_old_size += pi.size
class Parameterizable(OptimizationHandlable):
"""
A parameterisable class.
This class provides the parameters list (ArrayList) and standard parameter handling,
such as {add|remove}_parameter(), traverse hierarchy and param_array, gradient_array
and the empty parameters_changed().
This class is abstract and should not be instantiated.
Use GPy.core.Parameterized() as node (or leaf) in the parameterized hierarchy.
Use GPy.core.Param() for a leaf in the parameterized hierarchy.
"""
def __init__(self, *args, **kwargs):
super(Parameterizable, self).__init__(*args, **kwargs)
from GPy.core.parameterization.lists_and_dicts import ArrayList
self._parameters_ = ArrayList()
self.parameters = ArrayList()
self._param_array_ = None
self.size = 0
self._added_names_ = set()
self.logger = logging.getLogger(self.__class__.__name__)
self.__visited = False # for traversing in reverse order we need to know if we were here already
@property
@ -735,7 +901,7 @@ class Parameterizable(OptimizationHandlable):
if not self.__visited:
visit(self, *args, **kwargs)
self.__visited = True
for c in self._parameters_:
for c in self.parameters:
c.traverse(visit, *args, **kwargs)
self.__visited = False
@ -743,9 +909,9 @@ class Parameterizable(OptimizationHandlable):
"""
Traverse the hierarchy upwards, visiting all parents and their children except self.
See "visitor pattern" in literature. This is implemented in pre-order fashion.
Example:
parents = []
self.traverse_parents(parents.append)
print parents
@ -754,7 +920,7 @@ class Parameterizable(OptimizationHandlable):
self.__visited = True
self._parent_._traverse_parents(visit, *args, **kwargs)
self.__visited = False
def _traverse_parents(self, visit, *args, **kwargs):
if not self.__visited:
self.__visited = True
@ -779,7 +945,7 @@ class Parameterizable(OptimizationHandlable):
@property
def num_params(self):
return len(self._parameters_)
return len(self.parameters)
def _add_parameter_name(self, param, ignore_added_names=False):
pname = adjust_name_for_printing(param.name)
@ -812,131 +978,10 @@ class Parameterizable(OptimizationHandlable):
self._remove_parameter_name(None, old_name)
self._add_parameter_name(param)
def add_parameter(self, param, index=None, _ignore_added_names=False):
"""
:param parameters: the parameters to add
:type parameters: list of or one :py:class:`GPy.core.param.Param`
:param [index]: index of where to put parameters
:param bool _ignore_added_names: whether the name of the parameter overrides a possibly existing field
Add all parameters to this param class, you can insert parameters
at any given index using the :func:`list.insert` syntax
"""
if param in self._parameters_ and index is not None:
self.remove_parameter(param)
self.add_parameter(param, index)
# elif param.has_parent():
# raise HierarchyError, "parameter {} already in another model ({}), create new object (or copy) for adding".format(param._short(), param._highest_parent_._short())
elif param not in self._parameters_:
if param.has_parent():
def visit(parent, self):
if parent is self:
raise HierarchyError, "You cannot add a parameter twice into the hierarchy"
param.traverse_parents(visit, self)
param._parent_.remove_parameter(param)
# make sure the size is set
if index is None:
self.constraints.update(param.constraints, self.size)
self.priors.update(param.priors, self.size)
self._parameters_.append(param)
else:
start = sum(p.size for p in self._parameters_[:index])
self.constraints.shift_right(start, param.size)
self.priors.shift_right(start, param.size)
self.constraints.update(param.constraints, start)
self.priors.update(param.priors, start)
self._parameters_.insert(index, param)
param.add_observer(self, self._pass_through_notify_observers, -np.inf)
parent = self
while parent is not None:
parent.size += param.size
parent = parent._parent_
self._connect_parameters()
self._highest_parent_._connect_parameters(ignore_added_names=_ignore_added_names)
self._highest_parent_._notify_parent_change()
self._highest_parent_._connect_fixes()
else:
raise HierarchyError, """Parameter exists already and no copy made"""
def add_parameters(self, *parameters):
"""
convenience method for adding several
parameters without gradient specification
"""
[self.add_parameter(p) for p in parameters]
def remove_parameter(self, param):
"""
:param param: param object to remove from being a parameter of this parameterized object.
"""
if not param in self._parameters_:
raise RuntimeError, "Parameter {} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name)
start = sum([p.size for p in self._parameters_[:param._parent_index_]])
self._remove_parameter_name(param)
self.size -= param.size
del self._parameters_[param._parent_index_]
param._disconnect_parent()
param.remove_observer(self, self._pass_through_notify_observers)
self.constraints.shift_left(start, param.size)
self._connect_parameters()
self._notify_parent_change()
parent = self._parent_
while parent is not None:
parent.size -= param.size
parent = parent._parent_
self._highest_parent_._connect_parameters()
self._highest_parent_._connect_fixes()
self._highest_parent_._notify_parent_change()
def _connect_parameters(self, ignore_added_names=False):
# connect parameterlist to this parameterized object
# This just sets up the right connection for the params objects
# to be used as parameters
# it also sets the constraints for each parameter to the constraints
# of their respective parents
if not hasattr(self, "_parameters_") or len(self._parameters_) < 1:
# no parameters for this class
return
if self.param_array.size != self.size:
self.param_array = np.empty(self.size, dtype=np.float64)
if self.gradient.size != self.size:
self._gradient_array_ = np.empty(self.size, dtype=np.float64)
old_size = 0
self._param_slices_ = []
for i, p in enumerate(self._parameters_):
p._parent_ = self
p._parent_index_ = i
pslice = slice(old_size, old_size + p.size)
# first connect all children
p._propagate_param_grad(self.param_array[pslice], self.full_gradient[pslice])
# then connect children to self
self.param_array[pslice] = p.param_array.flat # , requirements=['C', 'W']).ravel(order='C')
self.full_gradient[pslice] = p.full_gradient.flat # , requirements=['C', 'W']).ravel(order='C')
if not p.param_array.flags['C_CONTIGUOUS']:
raise ValueError, "This should not happen! Please write an email to the developers with the code, which reproduces this error. All parameter arrays must be C_CONTIGUOUS"
p.param_array.data = self.param_array[pslice].data
p.full_gradient.data = self.full_gradient[pslice].data
self._param_slices_.append(pslice)
self._add_parameter_name(p, ignore_added_names=ignore_added_names)
old_size += p.size
def __setstate__(self, state):
super(Parameterizable, self).__setstate__(state)
self.logger = logging.getLogger(self.__class__.__name__)
return self
#===========================================================================
# notification system
@ -945,24 +990,16 @@ class Parameterizable(OptimizationHandlable):
self.parameters_changed()
def _pass_through_notify_observers(self, me, which=None):
self.notify_observers(which=which)
#===========================================================================
# Pickling
#===========================================================================
def __setstate__(self, state):
super(Parameterizable, self).__setstate__(state)
self._connect_parameters()
self._connect_fixes()
self._notify_parent_change()
self.parameters_changed()
def copy(self):
c = super(Parameterizable, self).copy()
c._connect_parameters()
c._connect_fixes()
c._notify_parent_change()
return c
def _setup_observers(self):
"""
Setup the default observers
1: parameters_changed_notify
2: pass through to parent, if present
"""
self.add_observer(self, self._parameters_changed_notification, -100)
if self.has_parent():
self.add_observer(self._parent_, self._parent_._pass_through_notify_observers, -np.inf)
#===========================================================================
# From being parentable, we have to define the parent_change notification
#===========================================================================
@ -970,7 +1007,7 @@ class Parameterizable(OptimizationHandlable):
"""
Notify all parameters that the parent has changed
"""
for p in self._parameters_:
for p in self.parameters:
p._parent_changed(self)
def parameters_changed(self):
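The empty parameters_changed() is the hook subclasses override to recompute cached quantities; a hedged sketch (not part of this commit) of that pattern, with assumed import paths:

# Hedged sketch: parameters_changed() is called through the notification chain
# whenever a parameter value is set.
import numpy as np
from GPy.core.parameterization import Parameterized, Param   # paths assumed

class Scaled(Parameterized):
    def __init__(self, name='scaled'):
        super(Scaled, self).__init__(name=name)
        self.scale = Param('scale', np.ones(1))
        self.add_parameter(self.scale)

    def parameters_changed(self):
        # recompute anything that depends on self.scale here
        self.scale_sq = float(self.scale) ** 2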

View file

@ -3,13 +3,10 @@
import numpy; np = numpy
import cPickle
import itertools
from re import compile, _pattern_type
from param import ParamConcatenation
from parameter_core import Pickleable, Parameterizable, adjust_name_for_printing
from transformations import __fixed__
from lists_and_dicts import ArrayList
from parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing
class ParametersChangedMeta(type):
def __call__(self, *args, **kw):
@ -68,8 +65,7 @@ class Parameterized(Parameterizable):
def __init__(self, name=None, parameters=[], *a, **kw):
super(Parameterized, self).__init__(name=name, *a, **kw)
self._in_init_ = True
self._parameters_ = ArrayList()
self.size = sum(p.size for p in self._parameters_)
self.size = sum(p.size for p in self.parameters)
self.add_observer(self, self._parameters_changed_notification, -100)
if not self._has_fixes():
self._fixes_ = None
@ -86,7 +82,7 @@ class Parameterized(Parameterizable):
iamroot=True
node = pydot.Node(id(self), shape='box', label=self.name)#, color='white')
G.add_node(node)
for child in self._parameters_:
for child in self.parameters:
child_node = child.build_pydot(G)
G.add_edge(pydot.Edge(node, child_node))#, color='white'))
@ -102,58 +98,133 @@ class Parameterized(Parameterizable):
return node
#===========================================================================
# Gradient control
# Add remove parameters:
#===========================================================================
def _transform_gradients(self, g):
if self.has_parent():
return g
[numpy.put(g, i, g[i] * c.gradfactor(self.param_array[i])) for c, i in self.constraints.iteritems() if c != __fixed__]
if self._has_fixes(): return g[self._fixes_]
return g
#===========================================================================
# Indexable
#===========================================================================
def _offset_for(self, param):
# get the offset in the parameterized index array for param
if param.has_parent():
if param._parent_._get_original(param) in self._parameters_:
return self._param_slices_[param._parent_._get_original(param)._parent_index_].start
return self._offset_for(param._parent_) + param._parent_._offset_for(param)
return 0
def _raveled_index_for(self, param):
def add_parameter(self, param, index=None, _ignore_added_names=False):
"""
get the raveled index for a param
that is an int array, containing the indexes for the flattened
param inside this parameterized logic.
"""
if isinstance(param, ParamConcatenation):
return numpy.hstack((self._raveled_index_for(p) for p in param.params))
return param._raveled_index() + self._offset_for(param)
:param parameters: the parameters to add
:type parameters: list of or one :py:class:`GPy.core.param.Param`
:param [index]: index of where to put parameters
def _raveled_index(self):
"""
get the raveled index for this object,
this is not in the global view of things!
"""
return numpy.r_[:self.size]
:param bool _ignore_added_names: whether the name of the parameter overrides a possibly existing field
#===========================================================================
# Convenience for fixed, tied checking of param:
#===========================================================================
@property
def is_fixed(self):
for p in self._parameters_:
if not p.is_fixed: return False
return True
Add all parameters to this param class, you can insert parameters
at any given index using the :func:`list.insert` syntax
"""
if param in self.parameters and index is not None:
self.remove_parameter(param)
self.add_parameter(param, index)
# elif param.has_parent():
# raise HierarchyError, "parameter {} already in another model ({}), create new object (or copy) for adding".format(param._short(), param._highest_parent_._short())
elif param not in self.parameters:
if param.has_parent():
def visit(parent, self):
if parent is self:
raise HierarchyError, "You cannot add a parameter twice into the hierarchy"
param.traverse_parents(visit, self)
param._parent_.remove_parameter(param)
# make sure the size is set
if index is None:
self.constraints.update(param.constraints, self.size)
self.priors.update(param.priors, self.size)
self.parameters.append(param)
else:
start = sum(p.size for p in self.parameters[:index])
self.constraints.shift_right(start, param.size)
self.priors.shift_right(start, param.size)
self.constraints.update(param.constraints, start)
self.priors.update(param.priors, start)
self.parameters.insert(index, param)
def _get_original(self, param):
# if advanced indexing is activated it happens that the array is a copy
# you can retrieve the original param through this method, by passing
# the copy here
return self._parameters_[param._parent_index_]
param.add_observer(self, self._pass_through_notify_observers, -np.inf)
parent = self
while parent is not None:
parent.size += param.size
parent = parent._parent_
self._connect_parameters()
self._highest_parent_._connect_parameters(ignore_added_names=_ignore_added_names)
self._highest_parent_._notify_parent_change()
self._highest_parent_._connect_fixes()
else:
raise HierarchyError, """Parameter exists already and no copy made"""
def add_parameters(self, *parameters):
"""
convenience method for adding several
parameters without gradient specification
"""
[self.add_parameter(p) for p in parameters]
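A hedged usage sketch (not part of this commit) of add_parameter/add_parameters on a fresh Parameterized container; the import paths are assumptions for this branch:

# Hedged sketch: building a small parameterized hierarchy by hand.
import numpy as np
from GPy.core.parameterization import Parameterized, Param   # paths assumed

node = Parameterized('node')
node.add_parameters(Param('weights', np.zeros(3)), Param('bias', np.ones(1)))
node.size       # -> 4; parent sizes are updated while adding
node.weights    # the Param is reachable as an attribute under its (adjusted) name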
def remove_parameter(self, param):
"""
:param param: param object to remove from being a parameter of this parameterized object.
"""
if not param in self.parameters:
raise RuntimeError, "Parameter {} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name)
start = sum([p.size for p in self.parameters[:param._parent_index_]])
self._remove_parameter_name(param)
self.size -= param.size
del self.parameters[param._parent_index_]
param._disconnect_parent()
param.remove_observer(self, self._pass_through_notify_observers)
self.constraints.shift_left(start, param.size)
self._connect_parameters()
self._notify_parent_change()
parent = self._parent_
while parent is not None:
parent.size -= param.size
parent = parent._parent_
self._highest_parent_._connect_parameters()
self._highest_parent_._connect_fixes()
self._highest_parent_._notify_parent_change()
def _connect_parameters(self, ignore_added_names=False):
# connect parameterlist to this parameterized object
# This just sets up the right connection for the params objects
# to be used as parameters
# it also sets the constraints for each parameter to the constraints
# of their respective parents
if not hasattr(self, "parameters") or len(self.parameters) < 1:
# no parameters for this class
return
if self.param_array.size != self.size:
self.param_array = np.empty(self.size, dtype=np.float64)
if self.gradient.size != self.size:
self._gradient_array_ = np.empty(self.size, dtype=np.float64)
old_size = 0
self._param_slices_ = []
for i, p in enumerate(self.parameters):
p._parent_ = self
p._parent_index_ = i
pslice = slice(old_size, old_size + p.size)
# first connect all children
p._propagate_param_grad(self.param_array[pslice], self.gradient_full[pslice])
# then connect children to self
self.param_array[pslice] = p.param_array.flat # , requirements=['C', 'W']).ravel(order='C')
self.gradient_full[pslice] = p.gradient_full.flat # , requirements=['C', 'W']).ravel(order='C')
if not p.param_array.flags['C_CONTIGUOUS']:
raise ValueError, "This should not happen! Please write an email to the developers with the code, which reproduces this error. All parameter arrays must be C_CONTIGUOUS"
p.param_array.data = self.param_array[pslice].data
p.gradient_full.data = self.gradient_full[pslice].data
self._param_slices_.append(pslice)
self._add_parameter_name(p, ignore_added_names=ignore_added_names)
old_size += p.size
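The net effect of _connect_parameters is that every child parameter writes into a slice of the parent's single flat buffer, so the optimizer sees one contiguous array. A minimal numpy sketch of that memory layout (illustrative only, not the GPy API):

    import numpy as np

    param_array = np.empty(5)           # parent's flat view of all parameters
    lengthscale = param_array[0:2]      # child 1 holds a slice (a view)
    variance = param_array[2:5]         # child 2 holds the next slice

    lengthscale[:] = 1.0
    variance[:] = 0.1
    print(param_array)                  # [1.  1.  0.1 0.1 0.1] -- one shared buffer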
#===========================================================================
# Get/set parameters:
@ -200,10 +271,38 @@ class Parameterized(Parameterizable):
def __setattr__(self, name, val):
# override the default behaviour, if setting a param, so broadcasting can by used
if hasattr(self, "_parameters_"):
pnames = self.parameter_names(False, adjust_for_printing=True, recursive=False)
if name in pnames: self._parameters_[pnames.index(name)][:] = val; return
if hasattr(self, "parameters"):
try:
pnames = self.parameter_names(False, adjust_for_printing=True, recursive=False)
if name in pnames: self.parameters[pnames.index(name)][:] = val; return
except AttributeError:
pass
object.__setattr__(self, name, val);
#===========================================================================
# Pickling
#===========================================================================
def __setstate__(self, state):
super(Parameterized, self).__setstate__(state)
try:
self._connect_parameters()
self._connect_fixes()
self._notify_parent_change()
self.parameters_changed()
except Exception as e:
print "WARNING: caught exception {!s}, trying to continue".format(e)
def copy(self, memo=None):
if memo is None:
memo = {}
memo[id(self.optimizer_array)] = None # and param_array
memo[id(self.param_array)] = None # and param_array
copy = super(Parameterized, self).copy(memo)
copy._connect_parameters()
copy._connect_fixes()
copy._notify_parent_change()
return copy
#===========================================================================
# Printing:
#===========================================================================
@ -211,22 +310,22 @@ class Parameterized(Parameterizable):
return self.hierarchy_name()
@property
def flattened_parameters(self):
return [xi for x in self._parameters_ for xi in x.flattened_parameters]
return [xi for x in self.parameters for xi in x.flattened_parameters]
@property
def _parameter_sizes_(self):
return [x.size for x in self._parameters_]
return [x.size for x in self.parameters]
@property
def parameter_shapes(self):
return [xi for x in self._parameters_ for xi in x.parameter_shapes]
return [xi for x in self.parameters for xi in x.parameter_shapes]
@property
def _constraints_str(self):
return [cs for p in self._parameters_ for cs in p._constraints_str]
return [cs for p in self.parameters for cs in p._constraints_str]
@property
def _priors_str(self):
return [cs for p in self._parameters_ for cs in p._priors_str]
return [cs for p in self.parameters for cs in p._priors_str]
@property
def _description_str(self):
return [xi for x in self._parameters_ for xi in x._description_str]
return [xi for x in self.parameters for xi in x._description_str]
@property
def _ties_str(self):
return [','.join(x._ties_str) for x in self.flattened_parameters]
@ -246,7 +345,7 @@ class Parameterized(Parameterizable):
to_print = []
for n, d, c, t, p in itertools.izip(names, desc, constrs, ts, prirs):
to_print.append(format_spec.format(name=n, desc=d, const=c, t=t, pri=p))
# to_print = [format_spec.format(p=p, const=c, t=t) if isinstance(p, Param) else p.__str__(header=False) for p, c, t in itertools.izip(self._parameters_, constrs, ts)]
# to_print = [format_spec.format(p=p, const=c, t=t) if isinstance(p, Param) else p.__str__(header=False) for p, c, t in itertools.izip(self.parameters, constrs, ts)]
sep = '-' * (nl + sl + cl + + pl + tl + 8 * 2 + 3)
if header:
header = " {{0:<{0}s}} | {{1:^{1}s}} | {{2:^{2}s}} | {{3:^{3}s}} | {{4:^{4}s}}".format(nl, sl, cl, pl, tl).format(name, "Value", "Constraint", "Prior", "Tied to")

View file

@ -81,7 +81,7 @@ class VariationalPosterior(Parameterized):
def _raveled_index(self):
index = np.empty(dtype=int, shape=0)
size = 0
for p in self._parameters_:
for p in self.parameters:
index = np.hstack((index, p._raveled_index()+size))
size += p._realsize_ if hasattr(p, '_realsize_') else p.size
return index
@ -96,10 +96,10 @@ class VariationalPosterior(Parameterized):
dc = self.__dict__.copy()
dc['mean'] = self.mean[s]
dc['variance'] = self.variance[s]
dc['_parameters_'] = copy.copy(self._parameters_)
dc['parameters'] = copy.copy(self.parameters)
n.__dict__.update(dc)
n._parameters_[dc['mean']._parent_index_] = dc['mean']
n._parameters_[dc['variance']._parent_index_] = dc['variance']
n.parameters[dc['mean']._parent_index_] = dc['mean']
n.parameters[dc['variance']._parent_index_] = dc['variance']
n._gradient_array_ = None
oversize = self.size - self.mean.size - self.variance.size
n.size = n.mean.size + n.variance.size + oversize
@ -150,11 +150,11 @@ class SpikeAndSlabPosterior(VariationalPosterior):
dc['mean'] = self.mean[s]
dc['variance'] = self.variance[s]
dc['binary_prob'] = self.binary_prob[s]
dc['_parameters_'] = copy.copy(self._parameters_)
dc['parameters'] = copy.copy(self.parameters)
n.__dict__.update(dc)
n._parameters_[dc['mean']._parent_index_] = dc['mean']
n._parameters_[dc['variance']._parent_index_] = dc['variance']
n._parameters_[dc['binary_prob']._parent_index_] = dc['binary_prob']
n.parameters[dc['mean']._parent_index_] = dc['mean']
n.parameters[dc['variance']._parent_index_] = dc['variance']
n.parameters[dc['binary_prob']._parent_index_] = dc['binary_prob']
n.ndim = n.mean.ndim
n.shape = n.mean.shape
n.num_data = n.mean.shape[0]

View file

@ -66,7 +66,11 @@ class SparseGP(GP):
#gradients wrt Z
self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z)
self.Z.gradient += self.kern.gradients_Z_expectations(
self.grad_dict['dL_dpsi1'], self.grad_dict['dL_dpsi2'], Z=self.Z, variational_posterior=self.X)
self.grad_dict['dL_dpsi0'],
self.grad_dict['dL_dpsi1'],
self.grad_dict['dL_dpsi2'],
Z=self.Z,
variational_posterior=self.X)
else:
#gradients wrt kernel
self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X)

View file

@ -96,15 +96,11 @@ def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=
# Optimize
if optimize:
#m.update_likelihood_approximation()
# Parameters optimization:
try:
m.optimize('scg', messages=1)
except Exception as e:
return m
#m.pseudo_EM()
# Plot
if plot:
fig, axes = pb.subplots(2, 1)
@ -133,10 +129,7 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti
# Optimize
if optimize:
#m.update_likelihood_approximation()
# Parameters optimization:
#m.optimize()
m.pseudo_EM()
m.optimize()
# Plot
if plot:

View file

@ -99,7 +99,7 @@ def sparse_gplvm_oil(optimize=True, verbose=0, plot=True, N=100, Q=6, num_induci
m.kern.plot_ARD()
return m
def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=15, Q=4, sigma=.2):
def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=25, Q=4, sigma=.2):
import GPy
from GPy.util.datasets import swiss_roll_generated
from GPy.models import BayesianGPLVM
@ -144,16 +144,15 @@ def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=15, Q=4
m = BayesianGPLVM(Y, Q, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel)
m.data_colors = c
m.data_t = t
m['noise_variance'] = Y.var() / 100.
if optimize:
m.optimize('scg', messages=verbose, max_iters=2e3)
m.optimize('bfgs', messages=verbose, max_iters=2e3)
if plot:
fig = plt.figure('fitted')
ax = fig.add_subplot(111)
s = m.input_sensitivity().argsort()[::-1][:2]
ax.scatter(*m.X.T[s], c=c)
ax.scatter(*m.X.mean.T[s], c=c)
return m
@ -172,14 +171,14 @@ def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40,
m.data_labels = data['Y'][:N].argmax(axis=1)
if optimize:
m.optimize('scg', messages=verbose, max_iters=max_iters, gtol=.05)
m.optimize('bfgs', messages=verbose, max_iters=max_iters, gtol=.05)
if plot:
fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
m.plot_latent(ax=latent_axes, labels=m.data_labels)
data_show = GPy.plotting.matplot_dep.visualize.vector_show((m.Y[0,:]))
lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(param_to_array(m.X.mean)[0:1,:], # @UnusedVariable
m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
m, data_show, latent_axes=latent_axes, sense_axes=sense_axes, labels=m.data_labels)
raw_input('Press enter to finish')
plt.close(fig)
return m
@ -303,9 +302,11 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1,
k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
inan = _np.random.binomial(1, .6, size=Y.shape).astype(bool)
m = BayesianGPLVM(Y.copy(), Q, init="random", num_inducing=num_inducing, kernel=k)
m.inference_method = VarDTCMissingData()
m.Y[inan] = _np.nan
Y[inan] = _np.nan
m = BayesianGPLVM(Y.copy(), Q, init="random", num_inducing=num_inducing,
inference_method=VarDTCMissingData(inan=inan), kernel=k)
m.X.variance[:] = _np.random.uniform(0,.01,m.X.shape)
m.likelihood.variance = .01
m.parameters_changed()
@ -338,7 +339,40 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
print "Optimizing Model:"
m.optimize(messages=verbose, max_iters=8e3, gtol=.1)
if plot:
m.plot_X_1d("MRD Latent Space 1D")
m.X.plot("MRD Latent Space 1D")
m.plot_scales("MRD Scales")
return m
def mrd_simulation_missing_data(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
from GPy import kern
from GPy.models import MRD
from GPy.inference.latent_function_inference.var_dtc import VarDTCMissingData
D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
_, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
#Ylist = [Ylist[0]]
k = kern.Linear(Q, ARD=True)
inanlist = []
for Y in Ylist:
inan = _np.random.binomial(1, .6, size=Y.shape).astype(bool)
inanlist.append(inan)
Y[inan] = _np.nan
imlist = []
for inan in inanlist:
imlist.append(VarDTCMissingData(limit=1, inan=inan))
m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing,
kernel=k, inference_method=imlist,
initx="random", initz='permute', **kw)
if optimize:
print "Optimizing Model:"
m.optimize('bfgs', messages=verbose, max_iters=8e3, gtol=.1)
if plot:
m.X.plot("MRD Latent Space 1D")
m.plot_scales("MRD Scales")
return m
@ -351,18 +385,17 @@ def brendan_faces(optimize=True, verbose=True, plot=True):
Yn = Y - Y.mean()
Yn /= Yn.std()
m = GPy.models.GPLVM(Yn, Q)
m = GPy.models.BayesianGPLVM(Yn, Q, num_inducing=20)
# optimize
m.constrain('rbf|noise|white', GPy.transformations.LogexpClipped())
if optimize: m.optimize('scg', messages=verbose, max_iters=1000)
if optimize: m.optimize('bfgs', messages=verbose, max_iters=1000)
if plot:
ax = m.plot_latent(which_indices=(0, 1))
y = m.likelihood.Y[0, :]
y = m.Y[0, :]
data_show = GPy.plotting.matplot_dep.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, order='F', invert=False, scale=False)
GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
lvm = GPy.plotting.matplot_dep.visualize.lvm(m.X.mean[0, :].copy(), m, data_show, ax)
raw_input('Press enter to finish')
return m
@ -376,13 +409,14 @@ def olivetti_faces(optimize=True, verbose=True, plot=True):
Yn = Y - Y.mean()
Yn /= Yn.std()
m = GPy.models.GPLVM(Yn, Q)
if optimize: m.optimize('scg', messages=verbose, max_iters=1000)
m = GPy.models.BayesianGPLVM(Yn, Q, num_inducing=20)
if optimize: m.optimize('bfgs', messages=verbose, max_iters=1000)
if plot:
ax = m.plot_latent(which_indices=(0, 1))
y = m.likelihood.Y[0, :]
data_show = GPy.plotting.matplot_dep.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False)
GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
lvm = GPy.plotting.matplot_dep.visualize.lvm(m.X.mean[0, :].copy(), m, data_show, ax)
raw_input('Press enter to finish')
return m
@ -414,9 +448,10 @@ def stick(kernel=None, optimize=True, verbose=True, plot=True):
ax = m.plot_latent()
y = m.Y[0, :]
data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
vis = GPy.plotting.matplot_dep.visualize.lvm(m.X[:1, :].copy(), m, data_show, latent_axes=ax)
lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[:1, :].copy(), m, data_show, latent_axes=ax)
raw_input('Press enter to finish')
lvm_visualizer.close()
data_show.close()
return m
def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True):
@ -464,9 +499,8 @@ def robot_wireless(optimize=True, verbose=True, plot=True):
data = GPy.util.datasets.robot_wireless()
# optimize
m = GPy.models.GPLVM(data['Y'], 2)
m = GPy.models.BayesianGPLVM(data['Y'], 4, num_inducing=25)
if optimize: m.optimize(messages=verbose, max_f_eval=10000)
m._set_params(m._get_params())
if plot:
m.plot_latent()
@ -482,21 +516,26 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
Q = 6
kernel = GPy.kern.RBF(Q, lengthscale=np.repeat(.5, Q), ARD=True)
m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel)
m.data = data
m.likelihood.variance = 0.001
# optimize
if optimize: m.optimize('bfgs', messages=verbose, max_iters=800, xtol=1e-300, ftol=1e-300)
try:
if optimize: m.optimize('bfgs', messages=verbose, max_iters=5e3, bfgs_factor=10)
except KeyboardInterrupt:
print "Keyboard interrupt, continuing to plot and return"
if plot:
plt.clf, (latent_axes, sense_axes) = plt.subplots(1, 2)
fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
plt.sca(latent_axes)
m.plot_latent(ax=latent_axes)
y = m.Y[:1, :].copy()
data_show = GPy.plotting.matplot_dep.visualize.stick_show(y, connect=data['connect'])
GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean[:1, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
plt.draw()
#raw_input('Press enter to finish')
dim_select = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean[:1, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
fig.canvas.draw()
fig.canvas.show()
raw_input('Press enter to finish')
return m
@ -515,9 +554,10 @@ def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose
ax = m.plot_latent()
y = m.Y[0, :]
data_show = GPy.plotting.matplot_dep.visualize.skeleton_show(y[None, :], data['skel'])
lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[0].copy(), m, data_show, latent_axes=ax)
raw_input('Press enter to finish')
lvm_visualizer.close()
data_show.close()
return m

View file

@ -25,6 +25,39 @@ etc.
"""
class LatentFunctionInference(object):
def on_optimization_start(self):
"""
This function gets called just before the optimization loop starts.
"""
pass
def on_optimization_end(self):
"""
This function gets called just after the optimization loop has ended.
"""
pass
class InferenceMethodList(LatentFunctionInference, list):
def on_optimization_start(self):
for inf in self:
inf.on_optimization_start()
def on_optimization_end(self):
for inf in self:
inf.on_optimization_end()
def __getstate__(self):
state = []
for inf in self:
state.append(inf)
return state
def __setstate__(self, state):
for inf in state:
self.append(inf)
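A minimal sketch of what these hooks buy a user-defined inference scheme (the subclass and its attribute are hypothetical, not part of this diff):

    class MyApproximation(LatentFunctionInference):
        def __init__(self):
            self._cached_site_params = None

        def on_optimization_start(self):
            # drop any stale approximation before the optimizer starts
            self._cached_site_params = None

        def on_optimization_end(self):
            # optionally refresh the approximation at the final hyperparameters
            pass

InferenceMethodList simply forwards both calls to each of its members, which is presumably how a multi-view model such as MRD can carry one inference object per output view (as in mrd_simulation_missing_data earlier in this diff).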
from exact_gaussian_inference import ExactGaussianInference
from laplace import Laplace
from GPy.inference.latent_function_inference.var_dtc import VarDTC
@ -38,11 +71,26 @@ from var_dtc_gpu import VarDTC_GPU
# class FullLatentFunctionData(object):
#
#
# class LatentFunctionInference(object):
# def inference(self, kern, X, likelihood, Y, Y_metadata=None):
# class EMLikeLatentFunctionInference(LatentFunctionInference):
# def update_approximation(self):
# """
# This function gets called when the
# """
#
# def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
# """
# Do inference on the latent functions given a covariance function `kern`,
# inputs and outputs `X` and `Y`, and a likelihood `likelihood`.
# inputs and outputs `X` and `Y`, inducing_inputs `Z`, and a likelihood `likelihood`.
# Additional metadata for the outputs `Y` can be given in `Y_metadata`.
# """
# raise NotImplementedError, "Abstract base class for full inference"
#
# class VariationalLatentFunctionInference(LatentFunctionInference):
# def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
# """
# Do inference on the latent functions given a covariance function `kern`,
# inputs and outputs `X` and `Y`, inducing_inputs `Z`, and a likelihood `likelihood`.
# Additional metadata for the outputs `Y` can be given in `Y_metadata`.
# """
# raise NotImplementedError, "Abstract base class for full inference"

View file

@ -4,9 +4,10 @@
from posterior import Posterior
from ...util.linalg import jitchol, tdot, dtrtrs, dpotri, pdinv
import numpy as np
from . import LatentFunctionInference
log_2_pi = np.log(2*np.pi)
class DTC(object):
class DTC(LatentFunctionInference):
"""
An object for inference when the likelihood is Gaussian, but we want to do sparse inference.

View file

@ -5,10 +5,11 @@ from posterior import Posterior
from ...util.linalg import pdinv, dpotrs, tdot
from ...util import diag
import numpy as np
from . import LatentFunctionInference
log_2_pi = np.log(2*np.pi)
class ExactGaussianInference(object):
class ExactGaussianInference(LatentFunctionInference):
"""
An object for inference when the likelihood is Gaussian.

View file

@ -1,9 +1,10 @@
import numpy as np
from ...util.linalg import pdinv,jitchol,DSYR,tdot,dtrtrs, dpotrs
from posterior import Posterior
from . import LatentFunctionInference
log_2_pi = np.log(2*np.pi)
class EP(object):
class EP(LatentFunctionInference):
def __init__(self, epsilon=1e-6, eta=1., delta=1.):
"""
The expectation-propagation algorithm.
@ -21,14 +22,25 @@ class EP(object):
def reset(self):
self.old_mutilde, self.old_vtilde = None, None
self._ep_approximation = None
def on_optimization_start(self):
self._ep_approximation = None
def on_optimization_end(self):
# TODO: update approximation in the end as well? Maybe even with a switch?
pass
def inference(self, kern, X, likelihood, Y, Y_metadata=None, Z=None):
num_data, output_dim = X.shape
num_data, output_dim = Y.shape
assert output_dim ==1, "ep in 1D only (for now!)"
K = kern.K(X)
mu, Sigma, mu_tilde, tau_tilde, Z_hat = self.expectation_propagation(K, Y, likelihood, Y_metadata)
if self._ep_approximation is None:
mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata)
else:
mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation
Wi, LW, LWi, W_logdet = pdinv(K + np.diag(1./tau_tilde))
@ -42,8 +54,6 @@ class EP(object):
return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}
def expectation_propagation(self, K, Y, likelihood, Y_metadata):
num_data, data_dim = Y.shape
@ -108,4 +118,3 @@ class EP(object):
mu_tilde = v_tilde/tau_tilde
return mu, Sigma, mu_tilde, tau_tilde, Z_hat
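Read together with on_optimization_start above, the caching in inference means the EP site parameters are computed once per optimization run and then held fixed while the hyperparameters move. A sketch of the control flow only (hypothetical driver code, not an actual GPy optimizer):

    ep = EP()
    ep.on_optimization_start()                 # clears ep._ep_approximation
    for step in range(100):                    # hyperparameter updates
        post, lml, grads = ep.inference(kern, X, likelihood, Y)
        # first call runs expectation_propagation(); later calls reuse the cache
    ep.on_optimization_end()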

View file

@ -1,11 +1,59 @@
import numpy as np
from ...util.linalg import pdinv,jitchol,DSYR,tdot,dtrtrs, dpotrs
from expectation_propagation import EP
from ...util import diag
from ...util.linalg import mdot, jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify, DSYR
from ...util.misc import param_to_array
from ...core.parameterization.variational import VariationalPosterior
from . import LatentFunctionInference
from posterior import Posterior
log_2_pi = np.log(2*np.pi)
class EPDTC(EP):
#def __init__(self, epsilon=1e-6, eta=1., delta=1.):
class EPDTC(LatentFunctionInference):
const_jitter = 1e-6
def __init__(self, epsilon=1e-6, eta=1., delta=1., limit=1):
from ...util.caching import Cacher
self.limit = limit
self.get_trYYT = Cacher(self._get_trYYT, limit)
self.get_YYTfactor = Cacher(self._get_YYTfactor, limit)
self.epsilon, self.eta, self.delta = epsilon, eta, delta
self.reset()
def set_limit(self, limit):
self.get_trYYT.limit = limit
self.get_YYTfactor.limit = limit
def _get_trYYT(self, Y):
return param_to_array(np.sum(np.square(Y)))
def __getstate__(self):
# has to be overridden, as Cacher objects cannot be pickled.
return self.limit
def __setstate__(self, state):
# has to be overridden, as Cacher objects cannot be pickled.
self.limit = state
from ...util.caching import Cacher
self.get_trYYT = Cacher(self._get_trYYT, self.limit)
self.get_YYTfactor = Cacher(self._get_YYTfactor, self.limit)
def _get_YYTfactor(self, Y):
"""
find a matrix L which satisfies LLT = YYT.
Note that L may have fewer columns than Y.
"""
N, D = Y.shape
if (N>=D):
return param_to_array(Y)
else:
return jitchol(tdot(Y))
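A quick way to see why the tall case can return Y unchanged while the wide case takes a Cholesky factor of the Gram matrix (a standalone numpy check, not part of the diff):

    import numpy as np

    Y = np.random.randn(3, 10)                    # N < D: wide
    L = np.linalg.cholesky(Y.dot(Y.T))            # N x N factor
    print(np.allclose(L.dot(L.T), Y.dot(Y.T)))    # True: L L^T = Y Y^T, with fewer columns than Y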
def get_VVTfactor(self, Y, prec):
return Y * prec # TODO: cache this, and make it effective
def reset(self):
self.old_mutilde, self.old_vtilde = None, None
self._ep_approximation = None
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
num_data, output_dim = X.shape
@ -14,26 +62,131 @@ class EPDTC(EP):
Kmm = kern.K(Z)
Kmn = kern.K(Z,X)
if self._ep_approximation is None:
mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata)
else:
mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation
if isinstance(X, VariationalPosterior):
uncertain_inputs = True
psi0 = kern.psi0(Z, X)
psi1 = Kmn.T#kern.psi1(Z, X)
psi2 = kern.psi2(Z, X)
else:
uncertain_inputs = False
psi0 = kern.Kdiag(X)
psi1 = Kmn.T#kern.K(X, Z)
psi2 = None
#see whether we're using variational uncertain inputs
_, output_dim = Y.shape
#see whether we've got a different noise variance for each datum
#beta = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6)
beta = tau_tilde
VVT_factor = beta[:,None]*mu_tilde[:,None]
trYYT = self.get_trYYT(mu_tilde[:,None])
# do the inference:
het_noise = beta.size > 1
num_inducing = Z.shape[0]
num_data = Y.shape[0]
# kernel computations, using BGPLVM notation
Kmm = kern.K(Z).copy()
diag.add(Kmm, self.const_jitter)
Lm = jitchol(Kmm)
Lmi = dtrtrs(Lm,np.eye(Lm.shape[0]))[0]
Kmmi = np.dot(Lmi.T,Lmi)
KmmiKmn = np.dot(Kmmi,Kmn)
K = np.dot(Kmn.T,KmmiKmn)
# The rather complex computations of A
if uncertain_inputs:
if het_noise:
psi2_beta = psi2 * (beta.flatten().reshape(num_data, 1, 1)).sum(0)
else:
psi2_beta = psi2.sum(0) * beta
LmInv = dtrtri(Lm)
A = LmInv.dot(psi2_beta.dot(LmInv.T))
else:
if het_noise:
tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
else:
tmp = psi1 * (np.sqrt(beta))
tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
A = tdot(tmp) #print A.sum()
# factor B
B = np.eye(num_inducing) + A
LB = jitchol(B)
psi1Vf = np.dot(psi1.T, VVT_factor)
# back substitute C into psi1Vf
tmp, _ = dtrtrs(Lm, psi1Vf, lower=1, trans=0)
_LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
# data fit and derivative of L w.r.t. Kmm
delit = tdot(_LBi_Lmi_psi1Vf)
data_fit = np.trace(delit)
DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
delit = -0.5 * DBi_plus_BiPBi
delit += -0.5 * B * output_dim
delit += output_dim * np.eye(num_inducing)
# Compute dL_dKmm
dL_dKmm = backsub_both_sides(Lm, delit)
# derivatives of L w.r.t. psi
dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm,
VVT_factor, Cpsi1Vf, DBi_plus_BiPBi,
psi1, het_noise, uncertain_inputs)
# log marginal likelihood
log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise,
psi0, A, LB, trYYT, data_fit, VVT_factor)
#put the gradients in the right places
dL_dR = _compute_dL_dR(likelihood,
het_noise, uncertain_inputs, LB,
_LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A,
psi0, psi1, beta,
data_fit, num_data, output_dim, trYYT, mu_tilde[:,None])
dL_dthetaL = 0#likelihood.exact_inference_gradients(dL_dR,Y_metadata)
if uncertain_inputs:
grad_dict = {'dL_dKmm': dL_dKmm,
'dL_dpsi0':dL_dpsi0,
'dL_dpsi1':dL_dpsi1,
'dL_dpsi2':dL_dpsi2,
'dL_dthetaL':dL_dthetaL}
else:
grad_dict = {'dL_dKmm': dL_dKmm,
'dL_dKdiag':dL_dpsi0,
'dL_dKnm':dL_dpsi1,
'dL_dthetaL':dL_dthetaL}
#get sufficient things for posterior prediction
#TODO: do we really want to do this in the loop?
if VVT_factor.shape[1] == Y.shape[1]:
woodbury_vector = Cpsi1Vf # == Cpsi1V
else:
print 'foobar'
psi1V = np.dot(mu_tilde[:,None].T*beta, psi1).T
tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
tmp, _ = dpotrs(LB, tmp, lower=1)
woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
Bi, _ = dpotri(LB, lower=1)
symmetrify(Bi)
Bi = -dpotri(LB, lower=1)[0]
diag.add(Bi, 1)
woodbury_inv = backsub_both_sides(Lm, Bi)
#construct a posterior object
post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
return post, log_marginal, grad_dict
mu, Sigma, mu_tilde, tau_tilde, Z_hat = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata)
Wi, LW, LWi, W_logdet = pdinv(K + np.diag(1./tau_tilde))
alpha, _ = dpotrs(LW, mu_tilde, lower=1)
log_marginal = 0.5*(-num_data * log_2_pi - W_logdet - np.sum(alpha * mu_tilde)) # TODO: add log Z_hat??
dL_dK = 0.5 * (tdot(alpha[:,None]) - Wi)
dL_dthetaL = np.zeros(likelihood.size)#TODO: derivatives of the likelihood parameters
return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}
@ -121,3 +274,69 @@ class EPDTC(EP):
mu_tilde = v_tilde/tau_tilde
return mu, Sigma, mu_tilde, tau_tilde, Z_hat
def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, psi1, het_noise, uncertain_inputs):
dL_dpsi0 = -0.5 * output_dim * (beta[:,None] * np.ones([num_data, 1])).flatten()
dL_dpsi1 = np.dot(VVT_factor, Cpsi1Vf.T)
dL_dpsi2_beta = 0.5 * backsub_both_sides(Lm, output_dim * np.eye(num_inducing) - DBi_plus_BiPBi)
if het_noise:
if uncertain_inputs:
dL_dpsi2 = beta[:, None, None] * dL_dpsi2_beta[None, :, :]
else:
dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * beta.reshape(num_data, 1)).T).T
dL_dpsi2 = None
else:
dL_dpsi2 = beta * dL_dpsi2_beta
if uncertain_inputs:
# repeat for each of the N psi_2 matrices
dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], num_data, axis=0)
else:
# subsume back into psi1 (==Kmn)
dL_dpsi1 += 2.*np.dot(psi1, dL_dpsi2)
dL_dpsi2 = None
return dL_dpsi0, dL_dpsi1, dL_dpsi2
def _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, psi0, psi1, beta, data_fit, num_data, output_dim, trYYT, Y):
# the partial derivative vector for the likelihood
if likelihood.size == 0:
# save computation here.
dL_dR = None
elif het_noise:
if uncertain_inputs:
raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented"
else:
#from ...util.linalg import chol_inv
#LBi = chol_inv(LB)
LBi, _ = dtrtrs(LB,np.eye(LB.shape[0]))
Lmi_psi1, nil = dtrtrs(Lm, psi1.T, lower=1, trans=0)
_LBi_Lmi_psi1, _ = dtrtrs(LB, Lmi_psi1, lower=1, trans=0)
dL_dR = -0.5 * beta + 0.5 * (beta*Y)**2
dL_dR += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * beta**2
dL_dR += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*beta**2
dL_dR += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * Y * beta**2
dL_dR += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * beta**2
else:
# likelihood is not heteroscedastic
dL_dR = -0.5 * num_data * output_dim * beta + 0.5 * trYYT * beta ** 2
dL_dR += 0.5 * output_dim * (psi0.sum() * beta ** 2 - np.trace(A) * beta)
dL_dR += beta * (0.5 * np.sum(A * DBi_plus_BiPBi) - data_fit)
return dL_dR
def _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, psi0, A, LB, trYYT, data_fit,Y):
#compute log marginal likelihood
if het_noise:
lik_1 = -0.5 * num_data * output_dim * np.log(2. * np.pi) + 0.5 * np.sum(np.log(beta)) - 0.5 * np.sum(beta * np.square(Y).sum(axis=-1))
lik_2 = -0.5 * output_dim * (np.sum(beta.flatten() * psi0) - np.trace(A))
else:
lik_1 = -0.5 * num_data * output_dim * (np.log(2. * np.pi) - np.log(beta)) - 0.5 * beta * trYYT
lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A))
lik_3 = -output_dim * (np.sum(np.log(np.diag(LB))))
lik_4 = 0.5 * data_fit
log_marginal = lik_1 + lik_2 + lik_3 + lik_4
return log_marginal
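For reference, the homoscedastic branch above assembles the collapsed bound as follows (a direct transcription of the code, with $N$ data points, $D$ outputs and $\beta$ the noise precision):

$$
\log Z \approx
-\tfrac{ND}{2}\left(\log 2\pi - \log\beta\right)
- \tfrac{\beta}{2}\,\operatorname{tr}\!\left(YY^\top\right)
- \tfrac{D}{2}\left(\beta\sum_n \psi_0^{(n)} - \operatorname{tr}(A)\right)
- D\sum_m \log (L_B)_{mm}
+ \tfrac{1}{2}\,\mathrm{data\_fit}
$$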

View file

@ -5,9 +5,10 @@ from posterior import Posterior
from ...util.linalg import jitchol, tdot, dtrtrs, dpotri, pdinv
from ...util import diag
import numpy as np
from . import LatentFunctionInference
log_2_pi = np.log(2*np.pi)
class FITC(object):
class FITC(LatentFunctionInference):
"""
An object for inference when the likelihood is Gaussian, but we want to do sparse inference.

View file

@ -16,8 +16,9 @@ from ...util.misc import param_to_array
from posterior import Posterior
import warnings
from scipy import optimize
from . import LatentFunctionInference
class Laplace(object):
class Laplace(LatentFunctionInference):
def __init__(self):
"""

View file

@ -95,7 +95,7 @@ class Posterior(object):
"""
if self._covariance is None:
#LiK, _ = dtrtrs(self.woodbury_chol, self._K, lower=1)
self._covariance = self._K - (np.tensordot(np.dot(np.atleast_3d(self.woodbury_inv).T, self._K), self._K, [1,0]).T).squeeze()
self._covariance = (np.atleast_3d(self._K) - np.tensordot(np.dot(np.atleast_3d(self.woodbury_inv).T, self._K), self._K, [1,0]).T).squeeze()
#self._covariance = self._K - self._K.dot(self.woodbury_inv).dot(self._K)
return self._covariance

View file

@ -7,9 +7,10 @@ from ...util import diag
from ...core.parameterization.variational import VariationalPosterior
import numpy as np
from ...util.misc import param_to_array
from . import LatentFunctionInference
log_2_pi = np.log(2*np.pi)
class VarDTC(object):
class VarDTC(LatentFunctionInference):
"""
An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
@ -190,7 +191,7 @@ class VarDTC(object):
post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
return post, log_marginal, grad_dict
class VarDTCMissingData(object):
class VarDTCMissingData(LatentFunctionInference):
const_jitter = 1e-6
def __init__(self, limit=1, inan=None):
from ...util.caching import Cacher
@ -201,6 +202,17 @@ class VarDTCMissingData(object):
def set_limit(self, limit):
self._Y.limit = limit
def __getstate__(self):
# has to be overridden, as Cacher objects cannot be pickled.
return self._Y.limit, self._inan
def __setstate__(self, state):
# has to be overridden, as Cacher objects cannot be pickled.
from ...util.caching import Cacher
self.limit = state[0]
self._inan = state[1]
self._Y = Cacher(self._subarray_computations, self.limit)
def _subarray_computations(self, Y):
if self._inan is None:
inan = np.isnan(Y)
@ -271,7 +283,11 @@ class VarDTCMissingData(object):
else: beta = beta_all
VVT_factor = (beta*y)
VVT_factor_all[v, ind].flat = VVT_factor.flat
try:
VVT_factor_all[v, ind].flat = VVT_factor.flat
except ValueError:
mult = np.ravel_multi_index((v.nonzero()[0][:,None],ind[None,:]), VVT_factor_all.shape)
VVT_factor_all.flat[mult] = VVT_factor
output_dim = y.shape[1]
psi0 = psi0_all[v]

View file

@ -7,6 +7,7 @@ from ...util import diag
from ...core.parameterization.variational import VariationalPosterior
import numpy as np
from ...util.misc import param_to_array
from . import LatentFunctionInference
log_2_pi = np.log(2*np.pi)
from ...util import gpu_init
@ -19,7 +20,7 @@ try:
except:
pass
class VarDTC_GPU(object):
class VarDTC_GPU(LatentFunctionInference):
"""
An object for inference when the likelihood is Gaussian, but we want to do sparse inference.

View file

@ -7,9 +7,10 @@ from ...util import diag
from ...core.parameterization.variational import VariationalPosterior
import numpy as np
from ...util.misc import param_to_array
from . import LatentFunctionInference
log_2_pi = np.log(2*np.pi)
class VarDTC_minibatch(object):
class VarDTC_minibatch(LatentFunctionInference):
"""
An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
@ -70,12 +71,13 @@ class VarDTC_minibatch(object):
#see whether we've got a different noise variance for each datum
beta = 1./np.fmax(likelihood.variance, 1e-6)
het_noise = beta.size > 1
if het_noise:
self.batchsize = 1
# VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
#self.YYTfactor = beta*self.get_YYTfactor(Y)
YYT_factor = Y
trYYT = self.get_trYYT(Y)
psi2_full = np.zeros((num_inducing,num_inducing))
psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM
psi0_full = 0
@ -104,19 +106,18 @@ class VarDTC_minibatch(object):
YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum()
else:
psi0_full += psi0.sum()
psi1Y_full += np.dot(Y_slice.T,psi1) # DxM
psi1Y_full += np.dot(Y_slice.T,psi1) # DxM
if uncertain_inputs:
if het_noise:
psi2_full += np.einsum('n,nmo->mo',beta_slice,psi2)
psi2_full += beta_slice*psi2
else:
psi2_full += psi2.sum(axis=0)
psi2_full += psi2
else:
if het_noise:
psi2_full += np.einsum('n,nm,no->mo',beta_slice,psi1,psi1)
psi2_full += beta_slice*np.outer(psi1,psi1)
else:
psi2_full += tdot(psi1.T)
psi2_full += np.outer(psi1,psi1)
if not het_noise:
psi0_full *= beta
@ -223,7 +224,7 @@ class VarDTC_minibatch(object):
psi2 = None
if het_noise:
beta = beta[n_start:n_end]
beta = beta[n_start] # assuming batchsize==1
betaY = beta*Y_slice
betapsi1 = np.einsum('n,nm->nm',beta,psi1)
@ -244,7 +245,7 @@ class VarDTC_minibatch(object):
dL_dpsi1 = np.dot(betaY,v.T)
if uncertain_inputs:
dL_dpsi2 = np.einsum('n,mo->nmo',beta * np.ones((n_end-n_start,)),dL_dpsi2R)
dL_dpsi2 = beta* dL_dpsi2R
else:
dL_dpsi1 += np.dot(betapsi1,dL_dpsi2R)*2.
dL_dpsi2 = None
@ -262,11 +263,11 @@ class VarDTC_minibatch(object):
dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1)
else:
if uncertain_inputs:
psiR = np.einsum('mo,nmo->',dL_dpsi2R,psi2)
psiR = np.einsum('mo,mo->',dL_dpsi2R,psi2)
else:
psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R)
dL_dthetaL = ((np.square(betaY)).sum() + np.square(beta)*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum()
dL_dthetaL = ((np.square(betaY)).sum() + beta*beta*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - beta*beta*psiR- (betaY*np.dot(betapsi1,v)).sum()
if uncertain_inputs:
grad_dict = {'dL_dpsi0':dL_dpsi0,
@ -296,7 +297,7 @@ def update_gradients(model):
kern_grad = model.kern.gradient.copy()
#gradients w.r.t. Z
model.Z.gradient[:,model.kern.active_dims] = model.kern.gradients_X(dL_dKmm, model.Z)
model.Z.gradient = model.kern.gradients_X(dL_dKmm, model.Z)
isEnd = False
while not isEnd:
@ -309,8 +310,8 @@ def update_gradients(model):
kern_grad += model.kern.gradient
#gradients w.r.t. Z
model.Z.gradient[:,model.kern.active_dims] += model.kern.gradients_Z_expectations(
grad_dict['dL_dpsi1'], grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=X_slice)
model.Z.gradient += model.kern.gradients_Z_expectations(
dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=X_slice)
#gradients w.r.t. posterior parameters of X
X_grad = model.kern.gradients_qX_expectations(variational_posterior=X_slice, Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'])

View file

@ -119,7 +119,7 @@ class Add(CombinationKernel):
eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
def gradients_Z_expectations(self, dL_psi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
from static import White, Bias
target = np.zeros(Z.shape)
for p1 in self.parts:
@ -134,17 +134,17 @@ class Add(CombinationKernel):
eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
else:
eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
target += p1.gradients_Z_expectations(eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
target += p1.gradients_Z_expectations(dL_psi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
return target
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
from static import White, Bias
target_mu = np.zeros(variational_posterior.shape)
target_S = np.zeros(variational_posterior.shape)
for p1 in self._parameters_:
for p1 in self.parameters:
#compute the effective dL_dpsi1. extra terms appear because of the cross terms in psi2!
eff_dL_dpsi1 = dL_dpsi1.copy()
for p2 in self._parameters_:
for p2 in self.parameters:
if p2 is p1:
continue
if isinstance(p2, White):
@ -160,7 +160,7 @@ class Add(CombinationKernel):
def add(self, other, name='sum'):
if isinstance(other, Add):
other_params = other._parameters_[:]
other_params = other.parameters[:]
for p in other_params:
other.remove_parameter(p)
self.add_parameters(*other_params)
@ -170,4 +170,4 @@ class Add(CombinationKernel):
return self
def input_sensitivity(self):
return reduce(np.add, [k.input_sensitivity() for k in self.parts])
return reduce(np.add, [k.input_sensitivity() for k in self.parts])

View file

@ -55,7 +55,7 @@ class Kern(Parameterized):
self._sliced_X = 0
self.useGPU = self._support_GPU and useGPU
@Cache_this(limit=10)
@Cache_this(limit=20)
def _slice_X(self, X):
return X[:, self.active_dims]
@ -103,7 +103,7 @@ class Kern(Parameterized):
"""
raise NotImplementedError
def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
"""
Returns the derivative of the objective wrt Z, using the chain rule
through the expectation variables.
@ -183,9 +183,9 @@ class Kern(Parameterized):
assert isinstance(other, Kern), "only kernels can be added to kernels..."
from prod import Prod
#kernels = []
#if isinstance(self, Prod): kernels.extend(self._parameters_)
#if isinstance(self, Prod): kernels.extend(self.parameters)
#else: kernels.append(self)
#if isinstance(other, Prod): kernels.extend(other._parameters_)
#if isinstance(other, Prod): kernels.extend(other.parameters)
#else: kernels.append(other)
return Prod([self, other], name)
@ -222,7 +222,7 @@ class CombinationKernel(Kern):
@property
def parts(self):
return self._parameters_
return self.parameters
def get_input_dim_active_dims(self, kernels, extra_dims = None):
#active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int))

View file

@ -124,9 +124,9 @@ def _slice_update_gradients_expectations(f):
def _slice_gradients_Z_expectations(f):
@wraps(f)
def wrap(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
with _Slice_wrap(self, Z, variational_posterior) as s:
ret = s.handle_return_array(f(self, dL_dpsi1, dL_dpsi2, s.X, s.X2))
ret = s.handle_return_array(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2))
return ret
return wrap

View file

@ -169,7 +169,7 @@ class Linear(Kern):
else:
self.variances.gradient += 2.*np.sum(dL_dpsi2 * self.psi2(Z, variational_posterior))/self.variances
def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
gamma = variational_posterior.binary_prob
mu = variational_posterior.mean

View file

@ -9,12 +9,23 @@ import numpy as np
from GPy.util.caching import Cache_this
@Cache_this(limit=1)
def _Z_distances(Z):
Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
return Zhat, Zdist
def psicomputations(variance, lengthscale, Z, mu, S, gamma):
"""
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
"""
# here are the "statistics" for psi0, psi1 and psi2
# Produced intermediate results:
# _psi1 NxM
psi0 = np.empty(mu.shape[0])
psi0[:] = variance
psi1 = _psi1computations(variance, lengthscale, Z, mu, S, gamma)
psi2 = _psi2computations(variance, lengthscale, Z, mu, S, gamma)
return psi0, psi1, psi2
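The expected array shapes, spelled out as a usage sketch (made-up sizes; the spike-and-slab posterior supplies mu, S and gamma):

    import numpy as np

    N, M, Q = 50, 10, 3                      # data points, inducing points, latent dims
    variance, lengthscale = 1.0, np.ones(Q)
    Z = np.random.randn(M, Q)
    mu, S = np.random.randn(N, Q), np.random.rand(N, Q)
    gamma = np.random.rand(N, Q)             # spike-and-slab inclusion probabilities

    psi0, psi1, psi2 = psicomputations(variance, lengthscale, Z, mu, S, gamma)
    # psi0: (N,)   psi1: (N, M)   psi2: (M, M)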
@Cache_this(limit=1)
def _psi1computations(variance, lengthscale, Z, mu, S, gamma):
"""
Z - MxQ
@ -22,15 +33,10 @@ def _psi1computations(variance, lengthscale, Z, mu, S, gamma):
S - NxQ
gamma - NxQ
"""
# here are the "statistics" for psi1 and psi2
# here are the "statistics" for psi1
# Produced intermediate results:
# _psi1 NxM
# _dpsi1_dvariance NxM
# _dpsi1_dlengthscale NxMxQ
# _dpsi1_dZ NxMxQ
# _dpsi1_dgamma NxMxQ
# _dpsi1_dmu NxMxQ
# _dpsi1_dS NxMxQ
lengthscale2 = np.square(lengthscale)
@ -40,25 +46,15 @@ def _psi1computations(variance, lengthscale, Z, mu, S, gamma):
_psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ
_psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom) # NxMxQ
_psi1_common = gamma[:,None,:] / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #Nx1xQ
_psi1_exponent1 = np.log(gamma[:,None,:]) -0.5 * (_psi1_dist_sq + np.log(_psi1_denom)) # NxMxQ
_psi1_exponent2 = np.log(1.-gamma[:,None,:]) -0.5 * (np.square(Z[None,:,:])/lengthscale2) # NxMxQ
_psi1_exponent1 = np.log(gamma[:,None,:]) - (_psi1_dist_sq + np.log(_psi1_denom))/2. # NxMxQ
_psi1_exponent2 = np.log(1.-gamma[:,None,:]) - (np.square(Z[None,:,:])/lengthscale2)/2. # NxMxQ
_psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2)
_psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ
_psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM
_psi1_exp_dist_sq = np.exp(-0.5*_psi1_dist_sq) # NxMxQ
_psi1_exp_Z = np.exp(-0.5*np.square(Z[None,:,:])/lengthscale2) # 1xMxQ
_psi1_q = variance * np.exp(_psi1_exp_sum[:,:,None] - _psi1_exponent) # NxMxQ
_psi1 = variance * np.exp(_psi1_exp_sum) # NxM
_dpsi1_dvariance = _psi1 / variance # NxM
_dpsi1_dgamma = _psi1_q * (_psi1_exp_dist_sq/_psi1_denom_sqrt-_psi1_exp_Z) # NxMxQ
_dpsi1_dmu = _psi1_q * (_psi1_exp_dist_sq * _psi1_dist * _psi1_common) # NxMxQ
_dpsi1_dS = _psi1_q * (_psi1_exp_dist_sq * _psi1_common * 0.5 * (_psi1_dist_sq - 1.)) # NxMxQ
_dpsi1_dZ = _psi1_q * (- _psi1_common * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z) # NxMxQ
_dpsi1_dlengthscale = 2.*lengthscale*_psi1_q * (0.5*_psi1_common*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + 0.5*(1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z) # NxMxQ
return _psi1, _dpsi1_dvariance, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _dpsi1_dZ, _dpsi1_dlengthscale
return _psi1
@Cache_this(limit=1)
def _psi2computations(variance, lengthscale, Z, mu, S, gamma):
"""
Z - MxQ
@ -66,19 +62,14 @@ def _psi2computations(variance, lengthscale, Z, mu, S, gamma):
S - NxQ
gamma - NxQ
"""
# here are the "statistics" for psi1 and psi2
# here are the "statistics" for psi2
# Produced intermediate results:
# _psi2 NxMxM
# _psi2_dvariance NxMxM
# _psi2_dlengthscale NxMxMxQ
# _psi2_dZ NxMxMxQ
# _psi2_dgamma NxMxMxQ
# _psi2_dmu NxMxMxQ
# _psi2_dS NxMxMxQ
# _psi2 MxM
lengthscale2 = np.square(lengthscale)
_psi2_Zhat, _psi2_Zdist = _Z_distances(Z)
_psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
_psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
_psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q
_psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ
@ -93,15 +84,130 @@ def _psi2computations(variance, lengthscale, Z, mu, S, gamma):
_psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2)
_psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max))
_psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM
_psi2_q = np.square(variance) * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ
_psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM
return _psi2
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
ARD = (len(lengthscale)!=1)
dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1, dgamma_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2, dgamma_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
dL_dlengscale = dl_psi1 + dl_psi2
if not ARD:
dL_dlengscale = dL_dlengscale.sum()
dL_dgamma = dgamma_psi1 + dgamma_psi2
dL_dmu = dmu_psi1 + dmu_psi2
dL_dS = dS_psi1 + dS_psi2
dL_dZ = dZ_psi1 + dZ_psi2
return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma
def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, gamma):
"""
dL_dpsi1 - NxM
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
"""
# here are the "statistics" for psi1
# Produced intermediate results: dL_dparams w.r.t. psi1
# _dL_dvariance 1
# _dL_dlengthscale Q
# _dL_dZ MxQ
# _dL_dgamma NxQ
# _dL_dmu NxQ
# _dL_dS NxQ
lengthscale2 = np.square(lengthscale)
# psi1
_psi1_denom = S / lengthscale2 + 1. # NxQ
_psi1_denom_sqrt = np.sqrt(_psi1_denom) #NxQ
_psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ
_psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom[:,None,:]) # NxMxQ
_psi1_common = gamma / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #NxQ
_psi1_exponent1 = np.log(gamma[:,None,:]) -0.5 * (_psi1_dist_sq + np.log(_psi1_denom[:, None,:])) # NxMxQ
_psi1_exponent2 = np.log(1.-gamma[:,None,:]) -0.5 * (np.square(Z[None,:,:])/lengthscale2) # NxMxQ
_psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2)
_psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ
_psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM
_psi1_exp_dist_sq = np.exp(-0.5*_psi1_dist_sq) # NxMxQ
_psi1_exp_Z = np.exp(-0.5*np.square(Z[None,:,:])/lengthscale2) # 1xMxQ
_psi1_q = variance * np.exp(_psi1_exp_sum[:,:,None] - _psi1_exponent) # NxMxQ
_psi1 = variance * np.exp(_psi1_exp_sum) # NxM
_dL_dvariance = np.einsum('nm,nm->',dL_dpsi1, _psi1)/variance # 1
_dL_dgamma = np.einsum('nm,nmq,nmq->nq',dL_dpsi1, _psi1_q, (_psi1_exp_dist_sq/_psi1_denom_sqrt[:,None,:]-_psi1_exp_Z)) # NxQ
_dL_dmu = np.einsum('nm, nmq, nmq, nmq, nq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_dist,_psi1_common) # NxQ
_dL_dS = np.einsum('nm,nmq,nmq,nq,nmq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_common,(_psi1_dist_sq-1.))/2. # NxQ
_dL_dZ = np.einsum('nm,nmq,nmq->mq',dL_dpsi1,_psi1_q, (- _psi1_common[:,None,:] * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z))
_dL_dlengthscale = lengthscale* np.einsum('nm,nmq,nmq->q',dL_dpsi1,_psi1_q,(_psi1_common[:,None,:]*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + (1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z))
# _dpsi1_dmu = _psi1_q * (_psi1_exp_dist_sq * _psi1_dist * _psi1_common) # NxMxQ
# _dpsi1_dS = _psi1_q * (_psi1_exp_dist_sq * _psi1_common * 0.5 * (_psi1_dist_sq - 1.)) # NxMxQ
# _dpsi1_dZ = _psi1_q * (- _psi1_common * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z) # NxMxQ
# _dpsi1_dlengthscale = 2.*lengthscale*_psi1_q * (0.5*_psi1_common*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + 0.5*(1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z) # NxMxQ
return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma
def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, gamma):
"""
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
dL_dpsi2 - MxM
"""
# here are the "statistics" for psi2
# Produced the derivatives w.r.t. psi2:
# _dL_dvariance 1
# _dL_dlengthscale Q
# _dL_dZ MxQ
# _dL_dgamma NxQ
# _dL_dmu NxQ
# _dL_dS NxQ
lengthscale2 = np.square(lengthscale)
_psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
_psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
_psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q
_psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ
# psi2
_psi2_denom = 2.*S / lengthscale2 + 1. # NxQ
_psi2_denom_sqrt = np.sqrt(_psi2_denom)
_psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q
_psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom[:,None,None,:])
_psi2_common = gamma/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # NxQ
_psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom[:,None,None,:])+np.log(gamma[:,None,None,:]) #N,M,M,Q
_psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ
_psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2)
_psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max))
_psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM
_psi2_q = variance*variance * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ
_psi2_exp_dist_sq = np.exp(-_psi2_Zdist_sq -_psi2_mudist_sq) # NxMxMxQ
_psi2_exp_Z = np.exp(-0.5*_psi2_Z_sq_sum) # MxMxQ
_psi2 = np.square(variance) * np.exp(_psi2_exp_sum) # N,M,M
_dpsi2_dvariance = 2. * _psi2/variance # NxMxM
_dpsi2_dgamma = _psi2_q * (_psi2_exp_dist_sq/_psi2_denom_sqrt - _psi2_exp_Z) # NxMxMxQ
_dpsi2_dmu = _psi2_q * (-2.*_psi2_common*_psi2_mudist * _psi2_exp_dist_sq) # NxMxMxQ
_dpsi2_dS = _psi2_q * (_psi2_common * (2.*_psi2_mudist_sq - 1.) * _psi2_exp_dist_sq) # NxMxMxQ
_dpsi2_dZ = 2.*_psi2_q * (_psi2_common*(-_psi2_Zdist*_psi2_denom+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z) # NxMxMxQ
_dpsi2_dlengthscale = 2.*lengthscale* _psi2_q * (_psi2_common*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z) # NxMxMxQ
_psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM
_dL_dvariance = np.einsum('mo,mo->',dL_dpsi2,_psi2)*2./variance
_dL_dgamma = np.einsum('mo,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,(_psi2_exp_dist_sq/_psi2_denom_sqrt[:,None,None,:] - _psi2_exp_Z))
_dL_dmu = -2.*np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,_psi2_common,_psi2_mudist,_psi2_exp_dist_sq)
_dL_dS = np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q, _psi2_common, (2.*_psi2_mudist_sq-1.), _psi2_exp_dist_sq)
_dL_dZ = 2.*np.einsum('mo,nmoq,nmoq->mq',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(-_psi2_Zdist*_psi2_denom[:,None,None,:]+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z))
# print _psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq #+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z)
_dL_dlengthscale = 2.*lengthscale* np.einsum('mo,nmoq,nmoq->q',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z))
# _dpsi2_dvariance = 2. * _psi2/variance # NxMxM
# _dpsi2_dgamma = _psi2_q * (_psi2_exp_dist_sq/_psi2_denom_sqrt - _psi2_exp_Z) # NxMxMxQ
# _dpsi2_dmu = _psi2_q * (-2.*_psi2_common*_psi2_mudist * _psi2_exp_dist_sq) # NxMxMxQ
# _dpsi2_dS = _psi2_q * (_psi2_common * (2.*_psi2_mudist_sq - 1.) * _psi2_exp_dist_sq) # NxMxMxQ
# _dpsi2_dZ = 2.*_psi2_q * (_psi2_common*(-_psi2_Zdist*_psi2_denom+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z) # NxMxMxQ
# _dpsi2_dlengthscale = 2.*lengthscale* _psi2_q * (_psi2_common*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z) # NxMxMxQ
return _psi2, _dpsi2_dvariance, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _dpsi2_dZ, _dpsi2_dlengthscale
return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma

View file

@ -42,9 +42,11 @@ class RBF(Stationary):
#---------------------------------------#
def psi0(self, Z, variational_posterior):
if self.useGPU:
if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
if self.useGPU:
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[0]
else:
return ssrbf_psi_comp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[0]
else:
return self.Kdiag(variational_posterior.mean)
@ -53,7 +55,7 @@ class RBF(Stationary):
if self.useGPU:
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[1]
else:
psi1, _, _, _, _, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
return ssrbf_psi_comp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[1]
else:
_, _, _, psi1 = self._psi1computations(Z, variational_posterior)
return psi1
@ -63,7 +65,7 @@ class RBF(Stationary):
if self.useGPU:
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[2]
else:
psi2, _, _, _, _, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
return ssrbf_psi_comp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[2]
else:
_, _, _, _, psi2 = self._psi2computations(Z, variational_posterior)
return psi2
@ -74,26 +76,30 @@ class RBF(Stationary):
if self.useGPU:
self.psicomp.update_gradients_expectations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
else:
# dL_dvar, dL_dlengscale, dL_dZ, dL_dgamma, dL_dmu, dL_dS = ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
dL_dvar, dL_dlengscale, _, _, _, _ = ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
self.variance.gradient = dL_dvar
self.lengthscale.gradient = dL_dlengscale
_, _dpsi1_dvariance, _, _, _, _, _dpsi1_dlengthscale = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
_, _dpsi2_dvariance, _, _, _, _, _dpsi2_dlengthscale = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
#contributions from psi0:
self.variance.gradient = np.sum(dL_dpsi0)
#from psi1
self.variance.gradient += np.sum(dL_dpsi1 * _dpsi1_dvariance)
if self.ARD:
self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).reshape(-1,self.input_dim).sum(axis=0)
else:
self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).sum()
#from psi2
self.variance.gradient += (dL_dpsi2 * _dpsi2_dvariance).sum()
if self.ARD:
self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).reshape(-1,self.input_dim).sum(axis=0)
else:
self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).sum()
# _, _dpsi1_dvariance, _, _, _, _, _dpsi1_dlengthscale = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
# _, _dpsi2_dvariance, _, _, _, _, _dpsi2_dlengthscale = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
#
# #contributions from psi0:
# self.variance.gradient = np.sum(dL_dpsi0)
#
# #from psi1
# self.variance.gradient += np.sum(dL_dpsi1 * _dpsi1_dvariance)
# if self.ARD:
# self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).reshape(-1,self.input_dim).sum(axis=0)
# else:
# self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).sum()
#
# #from psi2
# self.variance.gradient += (dL_dpsi2 * _dpsi2_dvariance).sum()
# if self.ARD:
# self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).reshape(-1,self.input_dim).sum(axis=0)
# else:
# self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).sum()
elif isinstance(variational_posterior, variational.NormalPosterior):
l2 = self.lengthscale**2
@ -126,22 +132,25 @@ class RBF(Stationary):
else:
raise ValueError, "unknown distriubtion received for psi-statistics"
def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
# Spike-and-Slab GPLVM
if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
if self.useGPU:
return self.psicomp.gradients_Z_expectations(dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
else:
_, _, _, _, _, _dpsi1_dZ, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
_, _, _, _, _, _dpsi2_dZ, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
#psi1
grad = (dL_dpsi1[:, :, None] * _dpsi1_dZ).sum(axis=0)
#psi2
grad += (dL_dpsi2[:, :, :, None] * _dpsi2_dZ).sum(axis=0).sum(axis=1)
return grad
_, _, dL_dZ, _, _, _ = ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
return dL_dZ
# _, _, _, _, _, _dpsi1_dZ, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
# _, _, _, _, _, _dpsi2_dZ, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
#
# #psi1
# grad = (dL_dpsi1[:, :, None] * _dpsi1_dZ).sum(axis=0)
#
# #psi2
# grad += (dL_dpsi2[:, :, :, None] * _dpsi2_dZ).sum(axis=0).sum(axis=1)
#
# return grad
elif isinstance(variational_posterior, variational.NormalPosterior):
l2 = self.lengthscale **2
@ -167,26 +176,29 @@ class RBF(Stationary):
if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
if self.useGPU:
return self.psicomp.gradients_qX_expectations(dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
else:
ndata = variational_posterior.mean.shape[0]
_, _, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
_, _, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
#psi1
grad_mu = (dL_dpsi1[:, :, None] * _dpsi1_dmu).sum(axis=1)
grad_S = (dL_dpsi1[:, :, None] * _dpsi1_dS).sum(axis=1)
grad_gamma = (dL_dpsi1[:,:,None] * _dpsi1_dgamma).sum(axis=1)
#psi2
grad_mu += (dL_dpsi2[:, :, :, None] * _dpsi2_dmu).reshape(ndata,-1,self.input_dim).sum(axis=1)
grad_S += (dL_dpsi2[:, :, :, None] * _dpsi2_dS).reshape(ndata,-1,self.input_dim).sum(axis=1)
grad_gamma += (dL_dpsi2[:,:,:, None] * _dpsi2_dgamma).reshape(ndata,-1,self.input_dim).sum(axis=1)
if self.group_spike_prob:
grad_gamma[:] = grad_gamma.mean(axis=0)
return grad_mu, grad_S, grad_gamma
else:
_, _, _, dL_dmu, dL_dS, dL_dgamma = ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
return dL_dmu, dL_dS, dL_dgamma
# ndata = variational_posterior.mean.shape[0]
#
# _, _, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
# _, _, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
#
# #psi1
# grad_mu = (dL_dpsi1[:, :, None] * _dpsi1_dmu).sum(axis=1)
# grad_S = (dL_dpsi1[:, :, None] * _dpsi1_dS).sum(axis=1)
# grad_gamma = (dL_dpsi1[:,:,None] * _dpsi1_dgamma).sum(axis=1)
#
# #psi2
# grad_mu += (dL_dpsi2[:, :, :, None] * _dpsi2_dmu).reshape(ndata,-1,self.input_dim).sum(axis=1)
# grad_S += (dL_dpsi2[:, :, :, None] * _dpsi2_dS).reshape(ndata,-1,self.input_dim).sum(axis=1)
# grad_gamma += (dL_dpsi2[:,:,:, None] * _dpsi2_dgamma).reshape(ndata,-1,self.input_dim).sum(axis=1)
#
# if self.group_spike_prob:
# grad_gamma[:] = grad_gamma.mean(axis=0)
#
# return grad_mu, grad_S, grad_gamma
elif isinstance(variational_posterior, variational.NormalPosterior):

View file

@ -25,7 +25,7 @@ class Static(Kern):
def gradients_X_diag(self, dL_dKdiag, X):
return np.zeros(X.shape)
def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
return np.zeros(Z.shape)
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):

View file

@ -180,7 +180,7 @@ class Stationary(Kern):
return np.zeros(X.shape)
def input_sensitivity(self):
return np.ones(self.input_dim)/self.lengthscale
return np.ones(self.input_dim)/self.lengthscale**2
class Exponential(Stationary):
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Exponential'):

View file

@ -227,3 +227,6 @@ class Bernoulli(Likelihood):
ns = np.ones_like(gp, dtype=int)
Ysim = np.random.binomial(ns, self.gp_link.transf(gp))
return Ysim.reshape(orig_shape)
def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None):
pass

View file

@ -10,6 +10,7 @@ from ..util import linalg
from ..core.parameterization.variational import NormalPosterior, NormalPrior, VariationalPosterior
from ..inference.latent_function_inference.var_dtc_parallel import update_gradients
from ..inference.latent_function_inference.var_dtc_gpu import VarDTC_GPU
import logging
class BayesianGPLVM(SparseGP):
"""
@ -25,8 +26,10 @@ class BayesianGPLVM(SparseGP):
"""
def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
Z=None, kernel=None, inference_method=None, likelihood=None, name='bayesian gplvm', **kwargs):
self.logger = logging.getLogger(self.__class__.__name__)
if X == None:
from ..util.initialization import initialize_latent
self.logger.info("initializing latent space X with method {}".format(init))
X, fracs = initialize_latent(init, input_dim, Y)
else:
fracs = np.ones(input_dim)
@ -36,7 +39,6 @@ class BayesianGPLVM(SparseGP):
if X_variance is None:
X_variance = np.random.uniform(0,.1,X.shape)
if Z is None:
Z = np.random.permutation(X.copy())[:num_inducing]
assert Z.shape[1] == X.shape[1]
@ -52,11 +54,14 @@ class BayesianGPLVM(SparseGP):
X = NormalPosterior(X, X_variance)
if inference_method is None:
if np.any(np.isnan(Y)):
inan = np.isnan(Y)
if np.any(inan):
from ..inference.latent_function_inference.var_dtc import VarDTCMissingData
inference_method = VarDTCMissingData()
self.logger.debug("creating inference_method with var_dtc missing data")
inference_method = VarDTCMissingData(inan=inan)
else:
from ..inference.latent_function_inference.var_dtc import VarDTC
self.logger.debug("creating inference_method var_dtc")
inference_method = VarDTC()
SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, name, **kwargs)
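# A minimal sketch of the missing-data dispatch above, assuming the model is
# exposed as GPy.models.BayesianGPLVM (shapes and the explicit X0 are made up;
# X is passed explicitly so the PCA initialisation never sees the NaNs):
import numpy as np, GPy
Y = np.random.randn(50, 12)
Y[::7, 3] = np.nan                                   # a few missing entries
X0 = np.random.randn(50, 3)
m = GPy.models.BayesianGPLVM(Y, input_dim=3, X=X0, num_inducing=10)
# because Y contains NaNs, VarDTCMissingData(inan=...) is chosen automatically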
@ -83,7 +88,7 @@ class BayesianGPLVM(SparseGP):
resolution=50, ax=None, marker='o', s=40,
fignum=None, plot_inducing=True, legend=True,
plot_limits=None,
aspect='auto', updates=False, **kwargs):
aspect='auto', updates=False, predict_kwargs={}, imshow_kwargs={}):
import sys
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ..plotting.matplot_dep import dim_reduction_plots
@ -91,7 +96,7 @@ class BayesianGPLVM(SparseGP):
return dim_reduction_plots.plot_latent(self, labels, which_indices,
resolution, ax, marker, s,
fignum, plot_inducing, legend,
plot_limits, aspect, updates, **kwargs)
plot_limits, aspect, updates, predict_kwargs, imshow_kwargs)
def do_test_latents(self, Y):
"""
@ -100,36 +105,41 @@ class BayesianGPLVM(SparseGP):
Notes:
This will only work with a univariate Gaussian likelihood (for now)
"""
assert not self.likelihood.is_heteroscedastic
N_test = Y.shape[0]
input_dim = self.Z.shape[1]
means = np.zeros((N_test, input_dim))
covars = np.zeros((N_test, input_dim))
dpsi0 = -0.5 * self.input_dim * self.likelihood.precision
dpsi2 = self.dL_dpsi2[0][None, :, :] # TODO: this may change if we ignore het. likelihoods
V = self.likelihood.precision * Y
dpsi0 = -0.5 * self.input_dim / self.likelihood.variance
dpsi2 = self.grad_dict['dL_dpsi2'][0][None, :, :] # TODO: this may change if we ignore het. likelihoods
V = Y/self.likelihood.variance
#compute CPsi1V
if self.Cpsi1V is None:
psi1V = np.dot(self.psi1.T, self.likelihood.V)
tmp, _ = linalg.dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0)
tmp, _ = linalg.dpotrs(self.LB, tmp, lower=1)
self.Cpsi1V, _ = linalg.dtrtrs(self._Lm, tmp, lower=1, trans=1)
#if self.Cpsi1V is None:
# psi1V = np.dot(self.psi1.T, self.likelihood.V)
# tmp, _ = linalg.dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0)
# tmp, _ = linalg.dpotrs(self.LB, tmp, lower=1)
# self.Cpsi1V, _ = linalg.dtrtrs(self._Lm, tmp, lower=1, trans=1)
dpsi1 = np.dot(self.Cpsi1V, V.T)
dpsi1 = np.dot(self.posterior.woodbury_vector, V.T)
start = np.zeros(self.input_dim * 2)
#start = np.zeros(self.input_dim * 2)
from scipy.optimize import minimize
for n, dpsi1_n in enumerate(dpsi1.T[:, :, None]):
args = (self.kern, self.Z, dpsi0, dpsi1_n.T, dpsi2)
xopt, fopt, neval, status = SCG(f=latent_cost, gradf=latent_grad, x=start, optargs=args, display=False)
args = (input_dim, self.kern.copy(), self.Z, dpsi0, dpsi1_n.T, dpsi2)
res = minimize(latent_cost_and_grad, jac=True, x0=np.hstack((means[n], covars[n])), args=args, method='BFGS')
xopt = res.x
mu, log_S = xopt.reshape(2, 1, -1)
means[n] = mu[0].copy()
covars[n] = np.exp(log_S[0]).copy()
return means, covars
X = NormalPosterior(means, covars)
return X
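# A hedged sketch of calling the updated do_test_latents on held-out rows,
# assuming the model is exposed as GPy.models.BayesianGPLVM (synthetic data):
import numpy as np, GPy
Y = np.random.randn(60, 8)
m = GPy.models.BayesianGPLVM(Y[:50], input_dim=2, num_inducing=10)
m.optimize(max_iters=20)
Xtest = m.do_test_latents(Y[50:])        # NormalPosterior over the test latents
print Xtest.mean.shape, Xtest.variance.shape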
def dmu_dX(self, Xnew):
"""
@ -161,57 +171,26 @@ class BayesianGPLVM(SparseGP):
return dim_reduction_plots.plot_steepest_gradient_map(self,*args,**kwargs)
def latent_cost_and_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
def latent_cost_and_grad(mu_S, input_dim, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
"""
objective function for fitting the latent variables for test points
(negative log-likelihood: should be minimised!)
"""
mu, log_S = mu_S.reshape(2, 1, -1)
mu = mu_S[:input_dim][None]
log_S = mu_S[input_dim:][None]
S = np.exp(log_S)
psi0 = kern.psi0(Z, mu, S)
psi1 = kern.psi1(Z, mu, S)
psi2 = kern.psi2(Z, mu, S)
X = NormalPosterior(mu, S)
lik = dL_dpsi0 * psi0 + np.dot(dL_dpsi1.flatten(), psi1.flatten()) + np.dot(dL_dpsi2.flatten(), psi2.flatten()) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S)
psi0 = kern.psi0(Z, X)
psi1 = kern.psi1(Z, X)
psi2 = kern.psi2(Z, X)
mu0, S0 = kern.dpsi0_dmuS(dL_dpsi0, Z, mu, S)
mu1, S1 = kern.dpsi1_dmuS(dL_dpsi1, Z, mu, S)
mu2, S2 = kern.dpsi2_dmuS(dL_dpsi2, Z, mu, S)
dmu = mu0 + mu1 + mu2 - mu
lik = dL_dpsi0 * psi0.sum() + np.einsum('ij,kj->...', dL_dpsi1, psi1) + np.einsum('ijk,lkj->...', dL_dpsi2, psi2) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S)
dLdmu, dLdS = kern.gradients_qX_expectations(dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, X)
dmu = dLdmu - mu
# dS = S0 + S1 + S2 -0.5 + .5/S
dlnS = S * (S0 + S1 + S2 - 0.5) + .5
dlnS = S * (dLdS - 0.5) + .5
return -lik, -np.hstack((dmu.flatten(), dlnS.flatten()))
def latent_cost(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
"""
objective function for fitting the latent variables (negative log-likelihood: should be minimised!)
This is the same as latent_cost_and_grad but only for the objective
"""
mu, log_S = mu_S.reshape(2, 1, -1)
S = np.exp(log_S)
psi0 = kern.psi0(Z, mu, S)
psi1 = kern.psi1(Z, mu, S)
psi2 = kern.psi2(Z, mu, S)
lik = dL_dpsi0 * psi0 + np.dot(dL_dpsi1.flatten(), psi1.flatten()) + np.dot(dL_dpsi2.flatten(), psi2.flatten()) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S)
return -float(lik)
def latent_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
"""
This is the same as latent_cost_and_grad but only for the grad
"""
mu, log_S = mu_S.reshape(2, 1, -1)
S = np.exp(log_S)
mu0, S0 = kern.dpsi0_dmuS(dL_dpsi0, Z, mu, S)
mu1, S1 = kern.dpsi1_dmuS(dL_dpsi1, Z, mu, S)
mu2, S2 = kern.dpsi2_dmuS(dL_dpsi2, Z, mu, S)
dmu = mu0 + mu1 + mu2 - mu
# dS = S0 + S1 + S2 -0.5 + .5/S
dlnS = S * (S0 + S1 + S2 - 0.5) + .5
return -np.hstack((dmu.flatten(), dlnS.flatten()))

View file

@ -2,23 +2,30 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import itertools
import pylab
import itertools, logging
from ..core import Model
from ..kern import Kern
from ..core.parameterization.variational import NormalPosterior, NormalPrior
from ..core.parameterization import Param, Parameterized
from ..core.parameterization.observable_array import ObsAr
from ..inference.latent_function_inference.var_dtc import VarDTCMissingData, VarDTC
from ..inference.latent_function_inference import InferenceMethodList
from ..likelihoods import Gaussian
from GPy.util.initialization import initialize_latent
from ..util.initialization import initialize_latent
from ..core.sparse_gp import SparseGP, GP
class MRD(Model):
class MRD(SparseGP):
"""
!WARNING: This is bleeding edge code and still in development.
Functionality may change fundamentally during development!
Apply MRD to all given datasets Y in Ylist.
Y_i in [n x p_i]
If Ylist is a dictionary, the keys of the dictionary are the names, and the
values are the different datasets to compare.
The samples n in the datasets need
to match up, whereas the dimensionality p_d can differ.
@ -39,40 +46,77 @@ class MRD(Model):
:param num_inducing: number of inducing inputs to use
:param Z: initial inducing inputs
:param kernel: list of kernels or kernel to copy for each output
:type kernel: [GPy.kern.kern] | GPy.kern.kern | None (default)
:param :class:`~GPy.inference.latent_function_inference inference_method: the inference method to use
:param :class:`~GPy.likelihoods.likelihood.Likelihood` likelihood: the likelihood to use
:type kernel: [GPy.kern.Kern] | GPy.kern.Kern | None (default)
:param :class:`~GPy.inference.latent_function_inference.LatentFunctionInference` inference_method:
InferenceMethodList of inferences, or one inference method for all
:param :class:`~GPy.likelihoods.likelihood.Likelihood` likelihoods: the likelihoods to use
:param str name: the name of this model
:param [str] Ynames: the names for the datasets given, must be of equal length as Ylist or None
"""
def __init__(self, Ylist, input_dim, X=None, X_variance=None,
initx = 'PCA', initz = 'permute',
num_inducing=10, Z=None, kernel=None,
inference_method=None, likelihood=None, name='mrd', Ynames=None):
super(MRD, self).__init__(name)
inference_method=None, likelihoods=None, name='mrd', Ynames=None):
super(GP, self).__init__(name)
self.logger = logging.getLogger(self.__class__.__name__)
self.input_dim = input_dim
self.num_inducing = num_inducing
self.Ylist = Ylist
if isinstance(Ylist, dict):
Ynames, Ylist = zip(*Ylist.items())
self.logger.debug("creating observable arrays")
self.Ylist = [ObsAr(Y) for Y in Ylist]
if Ynames is None:
self.logger.debug("creating Ynames")
Ynames = ['Y{}'.format(i) for i in range(len(Ylist))]
self.names = Ynames
assert len(self.names) == len(self.Ylist), "one name per dataset, or None if Ylist is a dict"
if inference_method is None:
self.inference_method= InferenceMethodList()
warned = False
for y in Ylist:
inan = np.isnan(y)
if np.any(inan):
if not warned:
self.logger.warn("WARNING: NaN values detected, make sure initx method can cope with NaN values or provide starting latent space X")
warned = True
self.inference_method.append(VarDTCMissingData(limit=1, inan=inan))
else:
self.inference_method.append(VarDTC(limit=1))
self.logger.debug("created inference method <{}>".format(hex(id(self.inference_method[-1]))))
else:
if not isinstance(inference_method, InferenceMethodList):
self.logger.debug("making inference_method an InferenceMethodList")
inference_method = InferenceMethodList(inference_method)
self.inference_method = inference_method
self._in_init_ = True
X, fracs = self._init_X(initx, Ylist)
if X is None:
X, fracs = self._init_X(initx, Ylist)
else:
fracs = [X.var(0)]*len(Ylist)
self.Z = Param('inducing inputs', self._init_Z(initz, X))
self.num_inducing = self.Z.shape[0] # ensure M==N if M>N
# sort out the kernels
self.logger.info("building kernels")
if kernel is None:
from ..kern import RBF
self.kern = [RBF(input_dim, ARD=1, lengthscale=fracs[i], name='rbf'.format(i)) for i in range(len(Ylist))]
kernels = [RBF(input_dim, ARD=1, lengthscale=fracs[i]) for i in range(len(Ylist))]
elif isinstance(kernel, Kern):
self.kern = []
kernels = []
for i in range(len(Ylist)):
k = kernel.copy()
self.kern.append(k)
kernels.append(k)
else:
assert len(kernel) == len(Ylist), "need one kernel per output"
assert all([isinstance(k, Kern) for k in kernel]), "invalid kernel object detected!"
self.kern = kernel
kernels = kernel
if X_variance is None:
X_variance = np.random.uniform(0.1, 0.2, X.shape)
@ -80,32 +124,28 @@ class MRD(Model):
self.variational_prior = NormalPrior()
self.X = NormalPosterior(X, X_variance)
if likelihood is None:
self.likelihood = [Gaussian(name='Gaussian_noise'.format(i)) for i in range(len(Ylist))]
else: self.likelihood = likelihood
if inference_method is None:
self.inference_method= []
for y in Ylist:
if np.any(np.isnan(y)):
self.inference_method.append(VarDTCMissingData(limit=1))
else:
self.inference_method.append(VarDTC(limit=1))
else:
self.inference_method = inference_method
self.inference_method.set_limit(len(Ylist))
if likelihoods is None:
likelihoods = [Gaussian(name='Gaussian_noise'.format(i)) for i in range(len(Ylist))]
else: likelihoods = likelihoods
self.logger.info("adding X and Z")
self.add_parameters(self.X, self.Z)
if Ynames is None:
Ynames = ['Y{}'.format(i) for i in range(len(Ylist))]
self.bgplvms = []
self.num_data = Ylist[0].shape[0]
for i, n, k, l in itertools.izip(itertools.count(), Ynames, self.kern, self.likelihood):
for i, n, k, l, Y in itertools.izip(itertools.count(), Ynames, kernels, likelihoods, Ylist):
assert Y.shape[0] == self.num_data, "All datasets need to share the number of datapoints, and those have to correspond to one another"
p = Parameterized(name=n)
p.add_parameter(k)
p.kern = k
p.add_parameter(l)
setattr(self, 'Y{}'.format(i), p)
p.likelihood = l
self.add_parameter(p)
self.bgplvms.append(p)
self.posterior = None
self.logger.info("init done")
self._in_init_ = False
def parameters_changed(self):
@ -113,14 +153,15 @@ class MRD(Model):
self.posteriors = []
self.Z.gradient[:] = 0.
self.X.gradient[:] = 0.
for y, k, l, i in itertools.izip(self.Ylist, self.kern, self.likelihood, self.inference_method):
for y, b, i in itertools.izip(self.Ylist, self.bgplvms, self.inference_method):
self.logger.info('working on im <{}>'.format(hex(id(i))))
k, l = b.kern, b.likelihood
posterior, lml, grad_dict = i.inference(k, self.X, self.Z, l, y)
self.posteriors.append(posterior)
self._log_marginal_likelihood += lml
# likelihood gradients
# likelihoods gradients
l.update_gradients(grad_dict.pop('dL_dthetaL'))
#gradients wrt kernel
@ -133,12 +174,19 @@ class MRD(Model):
#gradients wrt Z
self.Z.gradient += k.gradients_X(dL_dKmm, self.Z)
self.Z.gradient += k.gradients_Z_expectations(
grad_dict['dL_dpsi1'], grad_dict['dL_dpsi2'], Z=self.Z, variational_posterior=self.X)
grad_dict['dL_dpsi0'],
grad_dict['dL_dpsi1'],
grad_dict['dL_dpsi2'],
Z=self.Z, variational_posterior=self.X)
dL_dmean, dL_dS = k.gradients_qX_expectations(variational_posterior=self.X, Z=self.Z, **grad_dict)
self.X.mean.gradient += dL_dmean
self.X.variance.gradient += dL_dS
self.posterior = self.posteriors[0]
self.kern = self.bgplvms[0].kern
self.likelihood = self.bgplvms[0].likelihood
# update for the KL divergence
self.variational_prior.update_gradients_KL(self.X)
self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
@ -151,7 +199,7 @@ class MRD(Model):
Ylist = self.Ylist
if init in "PCA_concat":
X, fracs = initialize_latent('PCA', self.input_dim, np.hstack(Ylist))
fracs = [fracs]*self.input_dim
fracs = [fracs]*len(Ylist)
elif init in "PCA_single":
X = np.zeros((Ylist[0].shape[0], self.input_dim))
fracs = []
@ -162,7 +210,7 @@ class MRD(Model):
else: # init == 'random':
X = np.random.randn(Ylist[0].shape[0], self.input_dim)
fracs = X.var(0)
fracs = [fracs]*self.input_dim
fracs = [fracs]*len(Ylist)
X -= X.mean()
X /= X.std()
return X, fracs
@ -177,10 +225,12 @@ class MRD(Model):
return Z
def _handle_plotting(self, fignum, axes, plotf, sharex=False, sharey=False):
import matplotlib.pyplot as plt
if axes is None:
fig = pylab.figure(num=fignum)
fig = plt.figure(num=fignum)
sharex_ax = None
sharey_ax = None
plots = []
for i, g in enumerate(self.bgplvms):
try:
if sharex:
@ -197,26 +247,36 @@ class MRD(Model):
ax = axes[i]
else:
raise ValueError("Need one axes per latent dimension input_dim")
plotf(i, g, ax)
plots.append(plotf(i, g, ax))
if sharey_ax is not None:
pylab.setp(ax.get_yticklabels(), visible=False)
pylab.draw()
plt.setp(ax.get_yticklabels(), visible=False)
plt.draw()
if axes is None:
fig.tight_layout()
return fig
else:
return pylab.gcf()
try:
fig.tight_layout()
except:
pass
return plots
def plot_X(self, fignum=None, ax=None):
fig = self._handle_plotting(fignum, ax, lambda i, g, ax: ax.imshow(g.X))
return fig
def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None, Yindex=0):
"""
Prediction for dataset Yindex (default 0).
This predicts the output mean and variance for the dataset given in Ylist[Yindex],
e.g. mean, var = m.predict(Xnew, Yindex=1).
"""
self.posterior = self.posteriors[Yindex]
self.kern = self.bgplvms[0].kern
self.likelihood = self.bgplvms[0].likelihood
return super(MRD, self).predict(Xnew, full_cov, Y_metadata, kern)
def plot_predict(self, fignum=None, ax=None, sharex=False, sharey=False, **kwargs):
fig = self._handle_plotting(fignum,
ax,
lambda i, g, ax: ax.imshow(g. predict(g.X)[0], **kwargs),
sharex=sharex, sharey=sharey)
return fig
#===============================================================================
# TODO: Predict! Maybe even change to several bgplvms, which share an X?
#===============================================================================
# def plot_predict(self, fignum=None, ax=None, sharex=False, sharey=False, **kwargs):
# fig = self._handle_plotting(fignum,
# ax,
# lambda i, g, ax: ax.imshow(g.predict(g.X)[0], **kwargs),
# sharex=sharex, sharey=sharey)
# return fig
def plot_scales(self, fignum=None, ax=None, titles=None, sharex=False, sharey=True, *args, **kwargs):
"""
@ -228,28 +288,55 @@ class MRD(Model):
"""
if titles is None:
titles = [r'${}$'.format(name) for name in self.names]
ymax = reduce(max, [np.ceil(max(g.input_sensitivity())) for g in self.bgplvms])
ymax = reduce(max, [np.ceil(max(g.kern.input_sensitivity())) for g in self.bgplvms])
def plotf(i, g, ax):
ax.set_ylim([0,ymax])
g.kern.plot_ARD(ax=ax, title=titles[i], *args, **kwargs)
return g.kern.plot_ARD(ax=ax, title=titles[i], *args, **kwargs)
fig = self._handle_plotting(fignum, ax, plotf, sharex=sharex, sharey=sharey)
return fig
def plot_latent(self, fignum=None, ax=None, *args, **kwargs):
fig = self.gref.plot_latent(fignum=fignum, ax=ax, *args, **kwargs) # self._handle_plotting(fignum, ax, lambda i, g, ax: g.plot_latent(ax=ax, *args, **kwargs))
return fig
def plot_latent(self, labels=None, which_indices=None,
resolution=50, ax=None, marker='o', s=40,
fignum=None, plot_inducing=True, legend=True,
plot_limits=None,
aspect='auto', updates=False, predict_kwargs={}, imshow_kwargs={}):
"""
see plotting.matplot_dep.dim_reduction_plots.plot_latent
If predict_kwargs does not specify Yindex, the latent space of the 0th dataset (and kernel) is plotted;
pass predict_kwargs=dict(Yindex=i) to plot only the latent space of the i-th dataset.
"""
import sys
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from matplotlib import pyplot as plt
from ..plotting.matplot_dep import dim_reduction_plots
if "Yindex" not in predict_kwargs:
predict_kwargs['Yindex'] = 0
if ax is None:
fig = plt.figure(num=fignum)
ax = fig.add_subplot(111)
else:
fig = ax.figure
plot = dim_reduction_plots.plot_latent(self, labels, which_indices,
resolution, ax, marker, s,
fignum, plot_inducing, legend,
plot_limits, aspect, updates, predict_kwargs, imshow_kwargs)
ax.set_title(self.bgplvms[predict_kwargs['Yindex']].name)
try:
fig.tight_layout()
except:
pass
def _debug_plot(self):
self.plot_X_1d()
fig = pylab.figure("MRD DEBUG PLOT", figsize=(4 * len(self.bgplvms), 9))
fig.clf()
axes = [fig.add_subplot(3, len(self.bgplvms), i + 1) for i in range(len(self.bgplvms))]
self.plot_X(ax=axes)
axes = [fig.add_subplot(3, len(self.bgplvms), i + len(self.bgplvms) + 1) for i in range(len(self.bgplvms))]
self.plot_latent(ax=axes)
axes = [fig.add_subplot(3, len(self.bgplvms), i + 2 * len(self.bgplvms) + 1) for i in range(len(self.bgplvms))]
self.plot_scales(ax=axes)
pylab.draw()
fig.tight_layout()
return plot
def __getstate__(self):
state = super(MRD, self).__getstate__()
del state['kern']
del state['likelihood']
return state
def __setstate__(self, state):
# TODO:
super(MRD, self).__setstate__(state)
self.kern = self.bgplvms[0].kern
self.likelihood = self.bgplvms[0].likelihood
self.parameters_changed()

View file

@ -11,7 +11,7 @@ from ..likelihoods import Gaussian
from ..inference.optimization import SCG
from ..util import linalg
from ..core.parameterization.variational import SpikeAndSlabPrior, SpikeAndSlabPosterior
from ..inference.latent_function_inference.var_dtc_parallel import update_gradients
from ..inference.latent_function_inference.var_dtc_parallel import update_gradients, VarDTC_minibatch
from ..inference.latent_function_inference.var_dtc_gpu import VarDTC_GPU
@ -41,7 +41,7 @@ class SSGPLVM(SparseGP):
if X_variance is None: # The variance of the variational approximation (S)
X_variance = np.random.uniform(0,.1,X.shape)
gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation
gamma = np.empty_like(X, order='F') # The posterior probabilities of the binary variable in the variational approximation
gamma[:] = 0.5 + 0.01 * np.random.randn(X.shape[0], input_dim)
if group_spike:
@ -60,12 +60,15 @@ class SSGPLVM(SparseGP):
pi = np.empty((input_dim))
pi[:] = 0.5
self.variational_prior = SpikeAndSlabPrior(pi=pi) # the prior probability of the latent binary variable b
X = np.asfortranarray(X)
X_variance = np.asfortranarray(X_variance)
gamma = np.asfortranarray(gamma)
X = SpikeAndSlabPosterior(X, X_variance, gamma)
if group_spike:
kernel.group_spike_prob = True
self.variational_prior.group_spike_prob = True
SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, name, **kwargs)
self.add_parameter(self.X, index=0)
@ -76,7 +79,7 @@ class SSGPLVM(SparseGP):
X.mean.gradient, X.variance.gradient, X.binary_prob.gradient = X_grad
def parameters_changed(self):
if isinstance(self.inference_method, VarDTC_GPU):
if isinstance(self.inference_method, VarDTC_GPU) or isinstance(self.inference_method, VarDTC_minibatch):
update_gradients(self)
return

View file

@ -31,7 +31,7 @@ def plot_latent(model, labels=None, which_indices=None,
resolution=50, ax=None, marker='o', s=40,
fignum=None, plot_inducing=False, legend=True,
plot_limits=None,
aspect='auto', updates=False, **kwargs):
aspect='auto', updates=False, predict_kwargs={}, imshow_kwargs={}):
"""
:param labels: a np.array of size model.num_data containing labels for the points (can be number, strings, etc)
:param resolution: the resolution of the grid on which to evaluate the predictive variance
@ -60,7 +60,7 @@ def plot_latent(model, labels=None, which_indices=None,
def plot_function(x):
Xtest_full = np.zeros((x.shape[0], model.X.shape[1]))
Xtest_full[:, [input_1, input_2]] = x
_, var = model.predict(Xtest_full)
_, var = model.predict(Xtest_full, **predict_kwargs)
var = var[:, :1]
return np.log(var)
@ -81,7 +81,7 @@ def plot_latent(model, labels=None, which_indices=None,
view = ImshowController(ax, plot_function,
(xmin, ymin, xmax, ymax),
resolution, aspect=aspect, interpolation='bilinear',
cmap=pb.cm.binary, **kwargs)
cmap=pb.cm.binary, **imshow_kwargs)
# make sure labels are in order of input:
ulabels = []

View file

@ -68,7 +68,7 @@ def plot_ARD(kernel, fignum=None, ax=None, title='', legend=False):
for i in range(ard_params.shape[0]):
c = Tango.nextMedium()
bars.append(plot_bars(fig, ax, x, ard_params[i,:], c, kernel._parameters_[i].name, bottom=bottom))
bars.append(plot_bars(fig, ax, x, ard_params[i,:], c, kernel.parameters[i].name, bottom=bottom))
bottom += ard_params[i,:]
ax.set_xlim(-.5, kernel.input_dim - .5)

View file

@ -97,7 +97,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
for d in which_data_ycols:
plots['gpplot'] = gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], ax=ax, edgecol=linecol, fillcol=fillcol)
plots['dataplot'] = ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], data_symbol, mew=1.5)
if not plot_raw: plots['dataplot'] = ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], data_symbol, mew=1.5)
#optionally plot some samples
if samples: #NOTE not tested with fixed_inputs
@ -151,7 +151,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
for d in which_data_ycols:
m_d = m[:,d].reshape(resolution, resolution).T
plots['contour'] = ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
if not plot_raw: plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
#set the limits of the plot to some sensible values
ax.set_xlim(xmin[0], xmax[0])

View file

@ -88,7 +88,6 @@ class vector_show(matplotlib_show):
class lvm(matplotlib_show):
def __init__(self, vals, model, data_visualize, latent_axes=None, sense_axes=None, latent_index=[0,1]):
"""Visualize a latent variable model
@ -99,10 +98,11 @@ class lvm(matplotlib_show):
"""
if vals is None:
if isinstance(model.X, VariationalPosterior):
vals = param_to_array(model.X.mean)
vals = model.X.mean.values
else:
vals = param_to_array(model.X)
vals = model.X.values
if len(vals.shape)==1:
vals = vals[None,:]
matplotlib_show.__init__(self, vals, axes=latent_axes)
if isinstance(latent_axes,mpl.axes.Axes):
@ -133,7 +133,7 @@ class lvm(matplotlib_show):
def modify(self, vals):
"""When latent values are modified update the latent representation and ulso update the output visualization."""
self.vals = vals.copy()
self.vals = vals.view(np.ndarray).copy()
y = self.model.predict(self.vals)[0]
self.data_visualize.modify(y)
self.latent_handle.set_data(self.vals[0,self.latent_index[0]], self.vals[0,self.latent_index[1]])
@ -146,7 +146,6 @@ class lvm(matplotlib_show):
pass
def on_click(self, event):
print 'click!'
if event.inaxes!=self.latent_axes: return
self.move_on = not self.move_on
self.called = True
@ -219,11 +218,11 @@ class lvm_dimselect(lvm):
self.labels = labels
lvm.__init__(self,vals,model,data_visualize,latent_axes,sense_axes,latent_index)
self.show_sensitivities()
print "use left and right mouse butons to select dimensions"
print self.latent_values
print "use left and right mouse buttons to select dimensions"
def on_click(self, event):
if event.inaxes==self.sense_axes:
new_index = max(0,min(int(np.round(event.xdata-0.5)),self.model.input_dim-1))
if event.button == 1:
@ -249,6 +248,7 @@ class lvm_dimselect(lvm):
def on_leave(self,event):
print type(self.latent_values)
latent_values = self.latent_values.copy()
y = self.model.predict(latent_values[None,:])[0]
self.data_visualize.modify(y)
@ -393,14 +393,13 @@ class mocap_data_show_vpython(vpython_show):
def process_values(self):
raise NotImplementedError, "this needs to be implemented to use the data_show class"
class mocap_data_show(matplotlib_show):
"""Base class for visualizing motion capture data."""
def __init__(self, vals, axes=None, connect=None):
if axes==None:
fig = plt.figure()
axes = fig.add_subplot(111, projection='3d')
axes = fig.add_subplot(111, projection='3d',aspect='equal')
matplotlib_show.__init__(self, vals, axes)
self.connect = connect
@ -445,11 +444,12 @@ class mocap_data_show(matplotlib_show):
def process_values(self):
raise NotImplementedError, "this needs to be implemented to use the data_show class"
def initialize_axes(self):
def initialize_axes(self, boundary=0.05):
"""Set up the axes with the right limits and scaling."""
self.x_lim = np.array([self.vals[:, 0].min(), self.vals[:, 0].max()])
self.y_lim = np.array([self.vals[:, 1].min(), self.vals[:, 1].max()])
self.z_lim = np.array([self.vals[:, 2].min(), self.vals[:, 2].max()])
bs = [(self.vals[:, i].max()-self.vals[:, i].min())*boundary for i in xrange(3)]
self.x_lim = np.array([self.vals[:, 0].min()-bs[0], self.vals[:, 0].max()+bs[0]])
self.y_lim = np.array([self.vals[:, 1].min()-bs[1], self.vals[:, 1].max()+bs[1]])
self.z_lim = np.array([self.vals[:, 2].min()-bs[2], self.vals[:, 2].max()+bs[2]])
def initialize_axes_modify(self):
self.points_handle.remove()
@ -472,6 +472,8 @@ class mocap_data_show(matplotlib_show):
class stick_show(mocap_data_show):
"""Show a three dimensional point cloud as a figure. Connect elements of the figure together using the matrix connect."""
def __init__(self, vals, connect=None, axes=None):
if len(vals.shape)==1:
vals = vals[None,:]
mocap_data_show.__init__(self, vals, axes=axes, connect=connect)
def process_values(self):

View file

@ -94,22 +94,18 @@ class MiscTests(unittest.TestCase):
np.testing.assert_equal(m.log_likelihood(), m2.log_likelihood())
m.kern.lengthscale.randomize()
m._trigger_params_changed()
m2.kern.lengthscale = m.kern.lengthscale
np.testing.assert_equal(m.log_likelihood(), m2.log_likelihood())
m.kern.lengthscale.randomize()
m._trigger_params_changed()
m2['.*lengthscale'] = m.kern.lengthscale
np.testing.assert_equal(m.log_likelihood(), m2.log_likelihood())
m.kern.lengthscale.randomize()
m._trigger_params_changed()
m2['.*lengthscale'] = m.kern['.*lengthscale']
np.testing.assert_equal(m.log_likelihood(), m2.log_likelihood())
m.kern.lengthscale.randomize()
m._trigger_params_changed()
m2.kern.lengthscale = m.kern['.*lengthscale']
np.testing.assert_equal(m.log_likelihood(), m2.log_likelihood())
@ -130,6 +126,23 @@ class MiscTests(unittest.TestCase):
m2.kern[:] = m.kern[''].values()
np.testing.assert_equal(m.log_likelihood(), m2.log_likelihood())
def test_big_model(self):
m = GPy.examples.dimensionality_reduction.mrd_simulation(optimize=0, plot=0, plot_sim=0)
m.X.fix()
print m
m.unfix()
m.checkgrad()
print m
m.fix()
print m
m.inducing_inputs.unfix()
print m
m.checkgrad()
m.unfix()
m.checkgrad()
m.checkgrad()
print m
def test_model_set_params(self):
m = GPy.models.GPRegression(self.X, self.Y)
lengthscale = np.random.uniform()

View file

@ -95,7 +95,7 @@ class ParameterizedTest(unittest.TestCase):
self.assertListEqual(self.test1.kern.param_array.tolist(), val[:2].tolist())
def test_add_parameter_already_in_hirarchy(self):
self.assertRaises(HierarchyError, self.test1.add_parameter, self.white._parameters_[0])
self.assertRaises(HierarchyError, self.test1.add_parameter, self.white.parameters[0])
def test_default_constraints(self):
self.assertIs(self.rbf.variance.constraints._param_index_ops, self.rbf.constraints._param_index_ops)

View file

@ -4,7 +4,8 @@ Created on 13 Mar 2014
@author: maxz
'''
import unittest, itertools
import cPickle as pickle
#import cPickle as pickle
import pickle
import numpy as np
from GPy.core.parameterization.index_operations import ParameterIndexOperations,\
ParameterIndexOperationsView
@ -15,8 +16,7 @@ from GPy.core.parameterization.priors import Gaussian
from GPy.kern._src.rbf import RBF
from GPy.kern._src.linear import Linear
from GPy.kern._src.static import Bias, White
from GPy.examples.dimensionality_reduction import mrd_simulation,\
bgplvm_simulation
from GPy.examples.dimensionality_reduction import mrd_simulation
from GPy.examples.regression import toy_rbf_1d_50
from GPy.core.parameterization.variational import NormalPosterior
from GPy.models.gp_regression import GPRegression
@ -89,28 +89,29 @@ class Test(ListDictTestCase):
self.assertIs(pcopy.constraints, pcopy.rbf.lengthscale.constraints._param_index_ops)
self.assertIs(pcopy.constraints, pcopy.linear.constraints._param_index_ops)
self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
self.assertListEqual(par.full_gradient.tolist(), pcopy.full_gradient.tolist())
pcopy.gradient = 10 # gradient does not get copied anymore
self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist())
self.assertSequenceEqual(str(par), str(pcopy))
self.assertIsNot(par.param_array, pcopy.param_array)
self.assertIsNot(par.full_gradient, pcopy.full_gradient)
self.assertIsNot(par.gradient_full, pcopy.gradient_full)
with tempfile.TemporaryFile('w+b') as f:
par.pickle(f)
f.seek(0)
pcopy = pickle.load(f)
self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
pcopy.gradient = 10
np.testing.assert_allclose(par.linear.full_gradient, pcopy.linear.full_gradient)
np.testing.assert_allclose(pcopy.linear.full_gradient, 10)
np.testing.assert_allclose(par.linear.gradient_full, pcopy.linear.gradient_full)
np.testing.assert_allclose(pcopy.linear.gradient_full, 10)
self.assertSequenceEqual(str(par), str(pcopy))
def test_model(self):
par = toy_rbf_1d_50(optimize=0, plot=0)
pcopy = par.copy()
self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
self.assertListEqual(par.full_gradient.tolist(), pcopy.full_gradient.tolist())
self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist())
self.assertSequenceEqual(str(par), str(pcopy))
self.assertIsNot(par.param_array, pcopy.param_array)
self.assertIsNot(par.full_gradient, pcopy.full_gradient)
self.assertIsNot(par.gradient_full, pcopy.gradient_full)
self.assertTrue(pcopy.checkgrad())
self.assert_(np.any(pcopy.gradient!=0.0))
with tempfile.TemporaryFile('w+b') as f:
@ -118,18 +119,18 @@ class Test(ListDictTestCase):
f.seek(0)
pcopy = pickle.load(f)
self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
np.testing.assert_allclose(par.full_gradient, pcopy.full_gradient)
np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
self.assertSequenceEqual(str(par), str(pcopy))
self.assert_(pcopy.checkgrad())
def test_modelrecreation(self):
par = toy_rbf_1d_50(optimize=0, plot=0)
pcopy = GPRegression(par.X.copy(), par.Y.copy(), kernel=par.kern.copy())
self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
self.assertListEqual(par.full_gradient.tolist(), pcopy.full_gradient.tolist())
np.testing.assert_allclose(par.param_array, pcopy.param_array)
np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
self.assertSequenceEqual(str(par), str(pcopy))
self.assertIsNot(par.param_array, pcopy.param_array)
self.assertIsNot(par.full_gradient, pcopy.full_gradient)
self.assertIsNot(par.gradient_full, pcopy.gradient_full)
self.assertTrue(pcopy.checkgrad())
self.assert_(np.any(pcopy.gradient!=0.0))
pcopy.optimize('bfgs')
@ -139,8 +140,8 @@ class Test(ListDictTestCase):
par.pickle(f)
f.seek(0)
pcopy = pickle.load(f)
self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
np.testing.assert_allclose(par.full_gradient, pcopy.full_gradient)
np.testing.assert_allclose(par.param_array, pcopy.param_array)
np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
self.assertSequenceEqual(str(par), str(pcopy))
self.assert_(pcopy.checkgrad())
@ -150,19 +151,20 @@ class Test(ListDictTestCase):
par = NormalPosterior(X,Xv)
par.gradient = 10
pcopy = par.copy()
pcopy.gradient = 10
self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
self.assertListEqual(par.full_gradient.tolist(), pcopy.full_gradient.tolist())
self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist())
self.assertSequenceEqual(str(par), str(pcopy))
self.assertIsNot(par.param_array, pcopy.param_array)
self.assertIsNot(par.full_gradient, pcopy.full_gradient)
self.assertIsNot(par.gradient_full, pcopy.gradient_full)
with tempfile.TemporaryFile('w+b') as f:
par.pickle(f)
f.seek(0)
pcopy = pickle.load(f)
self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
pcopy.gradient = 10
np.testing.assert_allclose(par.full_gradient, pcopy.full_gradient)
np.testing.assert_allclose(pcopy.mean.full_gradient, 10)
np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
np.testing.assert_allclose(pcopy.mean.gradient_full, 10)
self.assertSequenceEqual(str(par), str(pcopy))
def test_model_concat(self):
@ -170,10 +172,11 @@ class Test(ListDictTestCase):
par.randomize()
pcopy = par.copy()
self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
self.assertListEqual(par.full_gradient.tolist(), pcopy.full_gradient.tolist())
self.assertListEqual(par.gradient_full.tolist(), pcopy.gradient_full.tolist())
self.assertSequenceEqual(str(par), str(pcopy))
self.assertIsNot(par.param_array, pcopy.param_array)
self.assertIsNot(par.full_gradient, pcopy.full_gradient)
self.assertIsNot(par.gradient_full, pcopy.gradient_full)
self.assertTrue(par.checkgrad())
self.assertTrue(pcopy.checkgrad())
self.assert_(np.any(pcopy.gradient!=0.0))
with tempfile.TemporaryFile('w+b') as f:
@ -181,7 +184,7 @@ class Test(ListDictTestCase):
f.seek(0)
pcopy = pickle.load(f)
self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
np.testing.assert_allclose(par.full_gradient, pcopy.full_gradient)
np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
self.assertSequenceEqual(str(par), str(pcopy))
self.assert_(pcopy.checkgrad())

View file

@ -1,9 +1,7 @@
from ..core.parameterization.parameter_core import Observable
import itertools, collections, weakref
import collections, weakref, logging
class Cacher(object):
def __init__(self, operation, limit=5, ignore_args=(), force_kwargs=()):
"""
Parameters:
@ -12,13 +10,15 @@ class Cacher(object):
:param int limit: depth of cacher
:param [int] ignore_args: list of indices, pointing at arguments to ignore in *args of operation(*args). This includes self!
:param [str] force_kwargs: list of kwarg names (strings). If a kwarg with that name is given, the cacher will force a recompute and won't cache anything.
:param int verbose: verbosity level. 0: no print outs, 1: casual print outs, 2: debug level print outs
"""
self.limit = int(limit)
self.ignore_args = ignore_args
self.force_kwargs = force_kwargs
self.operation=operation
self.operation = operation
self.order = collections.deque()
self.cached_inputs = {} # point from cache_ids to a list of [ind_ids], which were used in cache cache_id
self.cached_inputs = {} # point from cache_ids to a list of [ind_ids], which were used in cache cache_id
self.logger = logging.getLogger("cache")
#=======================================================================
# point from each ind_id to [ref(obj), cache_ids]
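# A minimal usage sketch of the cacher documented above. Assumptions: Cacher is
# importable from GPy.util.caching, ObsAr from GPy.core.parameterization, and
# ObsAr notifies its observers on in-place modification.
import numpy as np
from GPy.util.caching import Cacher
from GPy.core.parameterization import ObsAr

def expensive(X):
    print "computing"      # visible side effect, so recomputation is observable
    return float((X ** 2).sum())

cached = Cacher(expensive, limit=2)
X = ObsAr(np.arange(5.))
cached(X)   # computes and caches ("computing" is printed)
cached(X)   # served from the cache, nothing is printed
X[0] = 10.  # the observer callback marks this input as changed
cached(X)   # recomputes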
@ -27,53 +27,75 @@ class Cacher(object):
self.cached_input_ids = {}
#=======================================================================
self.cached_outputs = {} # point from cache_ids to outputs
self.inputs_changed = {} # point from cache_ids to bools
self.cached_outputs = {} # point from cache_ids to outputs
self.inputs_changed = {} # point from cache_ids to bools
def combine_args_kw(self, args, kw):
def id(self, obj):
"""returns the self.id of an object, to be used in caching individual self.ids"""
return hex(id(obj))
def combine_inputs(self, args, kw):
"Combines the args and kw in a unique way, such that ordering of kwargs does not lead to recompute"
self.logger.debug("combining args and kw")
return args + tuple(c[1] for c in sorted(kw.items(), key=lambda x: x[0]))
def preprocess(self, combined_args_kw, ignore_args):
"get the cacheid (conc. string of argument ids in order) ignoring ignore_args"
return "".join(str(id(a)) for i,a in enumerate(combined_args_kw) if i not in ignore_args)
def prepare_cache_id(self, combined_args_kw, ignore_args):
"get the cacheid (conc. string of argument self.ids in order) ignoring ignore_args"
cache_id = "".join(self.id(a) for i, a in enumerate(combined_args_kw) if i not in ignore_args)
self.logger.debug("cache_id={} was created".format(cache_id))
return cache_id
def ensure_cache_length(self, cache_id):
"Ensures the cache is within its limits and has one place free"
self.logger.debug("cache length gets ensured")
if len(self.order) == self.limit:
self.logger.debug("cache limit of l={} was reached".format(self.limit))
# we have reached the limit, so let's release one element
cache_id = self.order.popleft()
self.logger.debug("cache_id '{}' gets removed".format(cache_id))
combined_args_kw = self.cached_inputs[cache_id]
for ind in combined_args_kw:
ind_id = id(ind)
ref, cache_ids = self.cached_input_ids[ind_id]
if len(cache_ids) == 1 and ref() is not None:
ref().remove_observer(self, self.on_cache_changed)
del self.cached_input_ids[ind_id]
else:
cache_ids.remove(cache_id)
self.cached_input_ids[ind_id] = [ref, cache_ids]
if ind is not None:
ind_id = self.id(ind)
tmp = self.cached_input_ids.get(ind_id, None)
if tmp is not None:
ref, cache_ids = tmp
if len(cache_ids) == 1 and ref() is not None:
ref().remove_observer(self, self.on_cache_changed)
del self.cached_input_ids[ind_id]
else:
cache_ids.remove(cache_id)
self.cached_input_ids[ind_id] = [ref, cache_ids]
self.logger.debug("removing caches")
del self.cached_outputs[cache_id]
del self.inputs_changed[cache_id]
del self.cached_inputs[cache_id]
def add_to_cache(self, cache_id, combined_args_kw, output):
def add_to_cache(self, cache_id, inputs, output):
"""This adds cache_id to the cache, with inputs and output"""
self.inputs_changed[cache_id] = False
self.cached_outputs[cache_id] = output
self.order.append(cache_id)
self.cached_inputs[cache_id] = combined_args_kw
for a in combined_args_kw:
ind_id = id(a)
v = self.cached_input_ids.get(ind_id, [weakref.ref(a), []])
v[1].append(cache_id)
if len(v[1]) == 1:
a.add_observer(self, self.on_cache_changed)
self.cached_input_ids[ind_id] = v
self.cached_inputs[cache_id] = inputs
for a in inputs:
if a is not None:
ind_id = self.id(a)
v = self.cached_input_ids.get(ind_id, [weakref.ref(a), []])
self.logger.debug("cache_id '{}' gets stored".format(cache_id))
v[1].append(cache_id)
if len(v[1]) == 1:
self.logger.debug("adding observer to object {}".format(repr(a)))
a.add_observer(self, self.on_cache_changed)
self.cached_input_ids[ind_id] = v
def __call__(self, *args, **kw):
"""
A wrapper function for self.operation,
"""
#=======================================================================
# !WARNING CACHE OFFSWITCH!
# return self.operation(*args, **kw)
#=======================================================================
# 1: Check whether we have forced recompute arguments:
if len(self.force_kwargs) != 0:
@ -81,27 +103,33 @@ class Cacher(object):
if k in kw and kw[k] is not None:
return self.operation(*args, **kw)
# 2: preprocess and get the unique id string for this call
combined_args_kw = self.combine_args_kw(args, kw)
cache_id = self.preprocess(combined_args_kw, self.ignore_args)
# 2: prepare_cache_id and get the unique self.id string for this call
inputs = self.combine_inputs(args, kw)
cache_id = self.prepare_cache_id(inputs, self.ignore_args)
# 2: if anything is not cachable, we will just return the operation, without caching
if reduce(lambda a,b: a or (not isinstance(b, Observable)), combined_args_kw, False):
if reduce(lambda a, b: a or (not (isinstance(b, Observable) or b is None)), inputs, False):
self.logger.info("some inputs are not observable: returning without caching")
self.logger.debug(str(map(lambda x: isinstance(x, Observable) or x is None, inputs)))
self.logger.debug(str(map(repr, inputs)))
return self.operation(*args, **kw)
# 3&4: check whether this cache_id has been cached, then has it changed?
try:
if(self.inputs_changed[cache_id]):
# 4: This happens, when one element has changed for this cache id
self.logger.debug("{} already seen, but inputs changed. refreshing cacher".format(cache_id))
# 4: This happens when elements have changed for this cache_id
self.inputs_changed[cache_id] = False
self.cached_outputs[cache_id] = self.operation(*args, **kw)
except KeyError:
self.logger.info("{} never seen, creating cache entry".format(cache_id))
# 3: This is when we never saw this cache_id:
self.ensure_cache_length(cache_id)
self.add_to_cache(cache_id, combined_args_kw, self.operation(*args, **kw))
self.add_to_cache(cache_id, inputs, self.operation(*args, **kw))
except:
self.logger.error("an error occurred while trying to run caching for {}, resetting".format(cache_id))
self.reset()
raise
# 5: We have seen this cache_id and it is cached:
self.logger.info("returning cache {}".format(cache_id))
return self.cached_outputs[cache_id]
def on_cache_changed(self, direct, which=None):
@ -110,10 +138,13 @@ class Cacher(object):
this function gets 'hooked up' to the inputs when we cache them; when their elements change, the corresponding cache entries are marked as changed here.
"""
for ind_id in [id(direct), id(which)]:
_, cache_ids = self.cached_input_ids.get(ind_id, [None, []])
for cache_id in cache_ids:
self.inputs_changed[cache_id] = True
for what in [direct, which]:
if what is not None:
ind_id = self.id(what)
_, cache_ids = self.cached_input_ids.get(ind_id, [None, []])
for cache_id in cache_ids:
self.logger.info("callback from {} changed inputs from {}".format(ind_id, self.inputs_changed[cache_id]))
self.inputs_changed[cache_id] = True
def reset(self):
"""
@ -150,7 +181,7 @@ class Cacher_wrap(object):
return partial(self, obj)
def __call__(self, *args, **kwargs):
obj = args[0]
#import ipdb;ipdb.set_trace()
# import ipdb;ipdb.set_trace()
try:
caches = obj.__cachers
except AttributeError:

View file

@ -671,7 +671,7 @@ def osu_run1(data_set='osu_run1', sample_every=4):
return data_details_return({'Y': Y, 'connect' : connect}, data_set)
def swiss_roll_generated(num_samples=1000, sigma=0.0):
with open(os.path.join(data_path, 'swiss_roll.pickle')) as f:
with open(os.path.join(os.path.dirname(__file__), 'datasets', 'swiss_roll.pickle')) as f:
data = pickle.load(f)
Na = data['Y'].shape[0]
perm = np.random.permutation(np.r_[:Na])[:num_samples]
@ -723,14 +723,20 @@ def hapmap3(data_set='hapmap3'):
import bz2
except ImportError as i:
raise i, "Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset"
if not data_available(data_set):
download_data(data_set)
dirpath = os.path.join(data_path,'hapmap3')
hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly'
unpacked_files = [os.path.join(dirpath, hapmap_file_name+ending) for ending in ['.ped', '.map']]
unpacked_files_exist = reduce(lambda a, b:a and b, map(os.path.exists, unpacked_files))
if not unpacked_files_exist and not data_available(data_set):
download_data(data_set)
preprocessed_data_paths = [os.path.join(dirpath,hapmap_file_name + file_name) for file_name in \
['.snps.pickle',
'.info.pickle',
'.nan.pickle']]
if not reduce(lambda a,b: a and b, map(os.path.exists, preprocessed_data_paths)):
if not overide_manual_authorize and not prompt_user("Preprocessing requires ~25GB "
"of memory and can take a (very) long time, continue? [Y/n]"):
@ -744,8 +750,7 @@ def hapmap3(data_set='hapmap3'):
perc="="*int(20.*progress/100.))
stdout.write(status); stdout.flush()
return status
unpacked_files = [os.path.join(dirpath, hapmap_file_name+ending) for ending in ['.ped', '.map']]
if not reduce(lambda a,b: a and b, map(os.path.exists, unpacked_files)):
if not unpacked_files_exist:
status=write_status('unpacking...', 0, '')
curr = 0
for newfilepath in unpacked_files:
@ -762,6 +767,7 @@ def hapmap3(data_set='hapmap3'):
status=write_status('unpacking...', curr+12.*file_processed/(file_size), status)
curr += 12
status=write_status('unpacking...', curr, status)
os.remove(filepath)
status=write_status('reading .ped...', 25, status)
# Preprocess data:
snpstrnp = np.loadtxt(unpacked_files[0], dtype=str)
@ -832,7 +838,7 @@ def hapmap3(data_set='hapmap3'):
def singlecell(data_set='singlecell'):
if not data_available(data_set):
download_data(data_set)
from pandas import read_csv
dirpath = os.path.join(data_path, data_set)
filename = os.path.join(dirpath, 'singlecell.csv')

File diff suppressed because one or more lines are too long

View file

@ -8,7 +8,7 @@ import numpy as np
from GPy.util.pca import pca
def initialize_latent(init, input_dim, Y):
Xr = np.random.randn(Y.shape[0], input_dim)
Xr = np.asfortranarray(np.random.randn(Y.shape[0], input_dim))
if init == 'PCA':
p = pca(Y)
PC = p.project(Y, min(input_dim, Y.shape[1]))
@ -20,4 +20,4 @@ def initialize_latent(init, input_dim, Y):
Xr -= Xr.mean(0)
Xr /= Xr.var(0)
return Xr, var/var.max()
return Xr, var/var.max()

View file

@ -123,7 +123,7 @@ def dtrtrs(A, B, lower=1, trans=0, unitdiag=0):
:returns:
"""
A = force_F_ordered(A)
A = np.asfortranarray(A)
#Note: B does not seem to need to be F ordered!
return lapack.dtrtrs(A, B, lower=lower, trans=trans, unitdiag=unitdiag)

View file

@ -106,12 +106,14 @@ class pca(object):
ulabels.append(lab)
nlabels = len(ulabels)
if colors is None:
colors = [cmap(float(i) / nlabels) for i in range(nlabels)]
colors = iter([cmap(float(i) / nlabels) for i in range(nlabels)])
else:
colors = iter(colors)
X_ = self.project(X, self.Q)[:,dimensions]
kwargs.update(dict(s=s))
plots = list()
for i, l in enumerate(ulabels):
kwargs.update(dict(color=colors[i], marker=marker[i % len(marker)]))
kwargs.update(dict(color=colors.next(), marker=marker[i % len(marker)]))
plots.append(ax.scatter(*X_[labels == l, :].T, label=str(l), **kwargs))
ax.set_xlabel(r"PC$_1$")
ax.set_ylabel(r"PC$_2$")

View file

@ -4,9 +4,9 @@
.. moduleauthor:: Max Zwiessele <ibinbei@gmail.com>
'''
__updated__ = '2013-12-02'
__updated__ = '2014-05-21'
import numpy as np
import numpy as np, logging
def common_subarrays(X, axis=0):
"""
@ -14,11 +14,11 @@ def common_subarrays(X, axis=0):
Common subarrays are returned as a dictionary of <subarray, [index]> pairs, where
the subarray is a tuple representing the subarray and the index is the position
of that subarray in X along the remaining axis.
:param :class:`np.ndarray` X: 2d array to check for common subarrays in
:param int axis: axis to apply subarray detection over.
When axis is 0, rows are compared, otherwise columns.
Examples:
=========
@ -48,7 +48,17 @@ def common_subarrays(X, axis=0):
assert X.ndim == 2 and axis in (0,1), "Only implemented for 2D arrays"
subarrays = defaultdict(list)
cnt = count()
np.apply_along_axis(lambda x: iadd(subarrays[tuple(x)], [cnt.next()]), 1-axis, X)
logger = logging.getLogger("common_subarrays")
def accumulate(x, s, c):
logger.debug("creating tuple")
t = tuple(x)
logger.debug("tuple done")
col = c.next()
iadd(s[t], [col])
logger.debug("added col {}".format(col))
return None
if axis == 0: [accumulate(x, subarrays, cnt) for x in X]
else: [accumulate(x, subarrays, cnt) for x in X.T]
return subarrays
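# A small illustration of the row-grouping case (axis=0); the import path is an
# assumption, adjust it to wherever common_subarrays lives in the package:
import numpy as np
from GPy.util.subarray_and_sorting import common_subarrays
X = np.array([[1, 2, 3],
              [4, 5, 6],
              [1, 2, 3]])
d = common_subarrays(X, axis=0)
print dict(d)   # roughly {(1, 2, 3): [0, 2], (4, 5, 6): [1]} -- identical rows share an entry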
if __name__ == '__main__':