diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 29d9032a..ca2583e0 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -180,40 +180,80 @@ class GP(Model):
         return Ysim
 
-    def plot_f(self, *args, **kwargs):
+    def plot_f(self, plot_limits=None, which_data_rows='all',
+               which_data_ycols='all', fixed_inputs=[],
+               levels=20, samples=0, fignum=None, ax=None, resolution=None,
+               plot_raw=True,
+               linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx'):
         """
-
-        Plot the GP's view of the world, where the data is normalized and
-        before applying a likelihood.
-
-        This is a convenience function: arguments are passed to
-        GPy.plotting.matplot_dep.models_plots.plot_f_fit
-
+        Plot the GP's view of the world, where the data is normalized, before applying a likelihood.
+        This is a call to plot with plot_raw=True.
+        Data will not be plotted here, as the GP's view of the world
+        may live in another space, or in other units, than the data.
         """
         assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
         from ..plotting.matplot_dep import models_plots
-        return models_plots.plot_fit_f(self,*args,**kwargs)
+        kw = {}
+        if linecol is not None:
+            kw['linecol'] = linecol
+        if fillcol is not None:
+            kw['fillcol'] = fillcol
+        return models_plots.plot_fit(self, plot_limits, which_data_rows,
+                                     which_data_ycols, fixed_inputs,
+                                     levels, samples, fignum, ax, resolution,
+                                     plot_raw=plot_raw, Y_metadata=Y_metadata,
+                                     data_symbol=data_symbol, **kw)
 
-    def plot(self, *args, **kwargs):
+    def plot(self, plot_limits=None, which_data_rows='all',
+             which_data_ycols='all', fixed_inputs=[],
+             levels=20, samples=0, fignum=None, ax=None, resolution=None,
+             plot_raw=False,
+             linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx'):
         """
         Plot the posterior of the GP.
-          - In one dimension, the function is plotted with a shaded region
-            identifying two standard deviations.
-          - In two dimsensions, a contour-plot shows the mean predicted
-            function
-          - In higher dimensions, use fixed_inputs to plot the GP with some of
-            the inputs fixed.
+          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
+          - In two dimensions, a contour plot shows the mean predicted function.
+          - In higher dimensions, use fixed_inputs to plot the GP with some of the inputs fixed.
 
         Can plot only part of the data and part of the posterior functions
-        using which_data_rows which_data_ycols and which_parts
-
-        This is a convenience function: arguments are passed to
-        GPy.plotting.matplot_dep.models_plots.plot_fit
+        using which_data_rows, which_data_ycols.
+        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
+        :type plot_limits: np.array
+        :param which_data_rows: which of the training data to plot (default all)
+        :type which_data_rows: 'all' or a slice object to slice model.X, model.Y
+        :param which_data_ycols: when the data has several columns (independent outputs), only plot these
+        :type which_data_ycols: 'all' or a list of integers
+        :param fixed_inputs: a list of tuples [(i,v), (i,v)...], specifying that input index i should be set to value v.
+        :type fixed_inputs: a list of tuples
+        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
+        :type resolution: int
+        :param levels: number of levels to plot in a contour plot.
+        :type levels: int
+        :param samples: the number of a posteriori samples to plot
+        :type samples: int
+        :param fignum: figure to plot on.
+        :type fignum: figure number
+        :param ax: axes to plot on.
+        :type ax: axes handle
+        :param linecol: color of line to plot [Tango.colorsHex['darkBlue']]
+        :type linecol: matplotlib color
+        :param fillcol: color of fill [Tango.colorsHex['lightBlue']]
+        :param levels: for 2D plotting, the number of contour levels to use. If ax is None, a new figure is created.
         """
         assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
         from ..plotting.matplot_dep import models_plots
-        return models_plots.plot_fit(self,*args,**kwargs)
+        kw = {}
+        if linecol is not None:
+            kw['linecol'] = linecol
+        if fillcol is not None:
+            kw['fillcol'] = fillcol
+        return models_plots.plot_fit(self, plot_limits, which_data_rows,
+                                     which_data_ycols, fixed_inputs,
+                                     levels, samples, fignum, ax, resolution,
+                                     plot_raw=plot_raw, Y_metadata=Y_metadata,
+                                     data_symbol=data_symbol, **kw)
 
     def input_sensitivity(self):
         """
diff --git a/GPy/core/parameterization/index_operations.py b/GPy/core/parameterization/index_operations.py
index 1f3ac934..7fc76d07 100644
--- a/GPy/core/parameterization/index_operations.py
+++ b/GPy/core/parameterization/index_operations.py
@@ -7,6 +7,20 @@
 import numpy
 from numpy.lib.function_base import vectorize
 from lists_and_dicts import IntArrayDict
 
+def extract_properties_to_index(index, props):
+    prop_index = dict()
+    for i, cl in enumerate(props):
+        for c in cl:
+            ind = prop_index.get(c, list())
+            ind.append(index[i])
+            prop_index[c] = ind
+
+    for c, i in prop_index.items():
+        prop_index[c] = numpy.array(i, dtype=int)
+
+    return prop_index
+
+
 class ParameterIndexOperations(object):
     '''
     Index operations for storing param index _properties
@@ -66,8 +80,34 @@ class ParameterIndexOperations(object):
         return self._properties.values()
 
     def properties_for(self, index):
+        """
+        Returns a list of properties, such that each entry in the list corresponds
+        to an element of the given index.
+
+        Example:
+        let properties: 'one':[1,2,3,4], 'two':[3,5,6]
+
+        >>> properties_for([2,3,5])
+        [['one'], ['one', 'two'], ['two']]
+        """
         return vectorize(lambda i: [prop for prop in self.iterproperties() if i in self[prop]], otypes=[list])(index)
 
+    def properties_to_index_dict(self, index):
+        """
+        Return a dictionary with properties as keys and index arrays as values.
+        Thus, the indices covered by each property (e.g. each constraint) that
+        occurs in the given index are collected in one dictionary.
+
+        Example:
+        let properties: 'one':[1,2,3,4], 'two':[3,5,6]
+
+        >>> properties_to_index_dict([2,3,5])
+        {'one':[2,3], 'two':[3,5]}
+        """
+        props = self.properties_for(index)
+        prop_index = extract_properties_to_index(index, props)
+        return prop_index
+
     def add(self, prop, indices):
         self._properties[prop] = combine_indices(self._properties[prop], indices)
 
@@ -174,8 +214,32 @@ class ParameterIndexOperationsView(object):
 
     def properties_for(self, index):
+        """
+        Returns a list of properties, such that each entry in the list corresponds
+        to an element of the given index.
+
+        Example:
+        let properties: 'one':[1,2,3,4], 'two':[3,5,6]
+
+        >>> properties_for([2,3,5])
+        [['one'], ['one', 'two'], ['two']]
+        """
         return vectorize(lambda i: [prop for prop in self.iterproperties() if i in self[prop]], otypes=[list])(index)
 
+    def properties_to_index_dict(self, index):
+        """
+        Return a dictionary with properties as keys and index arrays as values.
+        Thus, the indices covered by each property (e.g. each constraint) that
+        occurs in the given index are collected in one dictionary.
+
+        Example:
+        let properties: 'one':[1,2,3,4], 'two':[3,5,6]
+
+        >>> properties_to_index_dict([2,3,5])
+        {'one':[2,3], 'two':[3,5]}
+        """
+        return extract_properties_to_index(index, self.properties_for(index))
+
     def add(self, prop, indices):
         self._param_index_ops.add(prop, indices+self._offset)
 
diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py
index f0fbf4ea..9cedc46d 100644
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@@ -17,7 +17,7 @@ from transformations import Logexp, NegativeLogexp, Logistic, __fixed__, FIXED,
 import numpy as np
 import re
 
-__updated__ = '2014-05-15'
+__updated__ = '2014-05-20'
 
 class HierarchyError(Exception):
     """
@@ -50,11 +50,24 @@ class Observable(object):
     self as only argument to all its observers.
     """
     _updated = True
+    _updates = True
     def __init__(self, *args, **kwargs):
         super(Observable, self).__init__()
         from lists_and_dicts import ObserverList
         self.observers = ObserverList()
 
+    @property
+    def updates(self):
+        self._updates = self._highest_parent_._updates
+        return self._updates
+
+    @updates.setter
+    def updates(self, ups):
+        assert isinstance(ups, bool), "updates are either on (True) or off (False)"
+        self._highest_parent_._updates = ups
+        if ups:
+            self._trigger_params_changed()
+
     def add_observer(self, observer, callble, priority=0):
         """
         Add an observer `observer` with the callback `callble`
@@ -91,6 +104,8 @@ class Observable(object):
         :param min_priority: only notify observers with priority > min_priority
                              if min_priority is None, notify all observers in order
         """
+        if not self.updates:
+            return
         if which is None:
             which = self
         if min_priority is None:
@@ -309,6 +324,7 @@ class Indexable(Nameable, Observable):
         self._default_constraint_ = default_constraint
         from index_operations import ParameterIndexOperations
         self.constraints = ParameterIndexOperations()
+        self._old_constraints = ParameterIndexOperations()
         self.priors = ParameterIndexOperations()
         if self._default_constraint_ is not None:
             self.constrain(self._default_constraint_)
@@ -371,8 +387,10 @@ class Indexable(Nameable, Observable):
         """
         if value is not None:
             self[:] = value
-        reconstrained = self.unconstrain()
-        index = self._add_to_index_operations(self.constraints, reconstrained, __fixed__, warning)
+
+        index = self._raveled_index()
+        # reconstrained = self.unconstrain()
+        index = self._add_to_index_operations(self.constraints, index, __fixed__, warning)
         self._highest_parent_._set_fixed(self, index)
         self.notify_observers(self, None if trigger_parent else -np.inf)
         return index
diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index cfb2171d..dd9a07c4 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -272,8 +272,11 @@ class Parameterized(Parameterizable):
     def __setattr__(self, name, val):
         # override the default behaviour, if setting a param, so broadcasting can by used
         if hasattr(self, "parameters"):
-            pnames = 
self.parameter_names(False, adjust_for_printing=True, recursive=False) - if name in pnames: self.parameters[pnames.index(name)][:] = val; return + try: + pnames = self.parameter_names(False, adjust_for_printing=True, recursive=False) + if name in pnames: self.parameters[pnames.index(name)][:] = val; return + except AttributeError: + pass object.__setattr__(self, name, val); #=========================================================================== @@ -281,11 +284,14 @@ class Parameterized(Parameterizable): #=========================================================================== def __setstate__(self, state): super(Parameterized, self).__setstate__(state) - self._connect_parameters() - self._connect_fixes() - self._notify_parent_change() + try: + self._connect_parameters() + self._connect_fixes() + self._notify_parent_change() + self.parameters_changed() + except Exception as e: + print "WARNING: caught exception {!s}, trying to continue".format(e) - self.parameters_changed() def copy(self): c = super(Parameterized, self).copy() c._connect_parameters() diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index 7c963913..ff67bb05 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -66,7 +66,11 @@ class SparseGP(GP): #gradients wrt Z self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z) self.Z.gradient += self.kern.gradients_Z_expectations( - self.grad_dict['dL_dpsi0'], self.grad_dict['dL_dpsi1'], self.grad_dict['dL_dpsi2'], Z=self.Z, variational_posterior=self.X) + self.grad_dict['dL_dpsi0'], + self.grad_dict['dL_dpsi1'], + self.grad_dict['dL_dpsi2'], + Z=self.Z, + variational_posterior=self.X) else: #gradients wrt kernel self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X) diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index 484970b8..a216eec6 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -303,9 +303,11 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1, k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) inan = _np.random.binomial(1, .6, size=Y.shape).astype(bool) - m = BayesianGPLVM(Y.copy(), Q, init="random", num_inducing=num_inducing, kernel=k) - m.inference_method = VarDTCMissingData() - m.Y[inan] = _np.nan + Y[inan] = _np.nan + + m = BayesianGPLVM(Y.copy(), Q, init="random", num_inducing=num_inducing, + inference_method=VarDTCMissingData(inan=inan), kernel=k) + m.X.variance[:] = _np.random.uniform(0,.01,m.X.shape) m.likelihood.variance = .01 m.parameters_changed() @@ -338,7 +340,40 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw): print "Optimizing Model:" m.optimize(messages=verbose, max_iters=8e3, gtol=.1) if plot: - m.plot_X_1d("MRD Latent Space 1D") + m.X.plot("MRD Latent Space 1D") + m.plot_scales("MRD Scales") + return m + +def mrd_simulation_missing_data(optimize=True, verbose=True, plot=True, plot_sim=True, **kw): + from GPy import kern + from GPy.models import MRD + from GPy.inference.latent_function_inference.var_dtc import VarDTCMissingData + + D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5 + _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) + + #Ylist = [Ylist[0]] + k = kern.Linear(Q, ARD=True) + inanlist = [] + + for Y in Ylist: + inan = _np.random.binomial(1, .6, size=Y.shape).astype(bool) + inanlist.append(inan) + Y[inan] = _np.nan + + imlist = [] + for inan in inanlist: + imlist.append(VarDTCMissingData(limit=1, 
inan=inan)) + + m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing, + kernel=k, inference_method=imlist, + initx="random", initz='permute', **kw) + + if optimize: + print "Optimizing Model:" + m.optimize('bfgs', messages=verbose, max_iters=8e3, gtol=.1) + if plot: + m.X.plot("MRD Latent Space 1D") m.plot_scales("MRD Scales") return m @@ -483,21 +518,22 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True): Q = 6 kernel = GPy.kern.RBF(Q, lengthscale=np.repeat(.5, Q), ARD=True) m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel) - + m.data = data m.likelihood.variance = 0.001 - + # optimize - if optimize: m.optimize('bfgs', messages=verbose, max_iters=800, xtol=1e-300, ftol=1e-300) + if optimize: m.optimize('bfgs', messages=verbose, max_iters=5e3, bfgs_factor=10) if plot: - plt.clf, (latent_axes, sense_axes) = plt.subplots(1, 2) + fig, (latent_axes, sense_axes) = plt.subplots(1, 2) plt.sca(latent_axes) m.plot_latent(ax=latent_axes) y = m.Y[:1, :].copy() data_show = GPy.plotting.matplot_dep.visualize.stick_show(y, connect=data['connect']) - GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean[:1, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes) - plt.draw() - #raw_input('Press enter to finish') + dim_select = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean[:1, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes) + fig.canvas.draw() + fig.canvas.show() + raw_input('Press enter to finish') return m diff --git a/GPy/inference/latent_function_inference/__init__.py b/GPy/inference/latent_function_inference/__init__.py index 878c1e4f..8589a60e 100644 --- a/GPy/inference/latent_function_inference/__init__.py +++ b/GPy/inference/latent_function_inference/__init__.py @@ -38,6 +38,25 @@ class LatentFunctionInference(object): """ pass +class InferenceMethodList(LatentFunctionInference, list): + + def on_optimization_start(self): + for inf in self: + inf.on_optimization_start() + + def on_optimization_end(self): + for inf in self: + inf.on_optimization_end() + + def __getstate__(self): + state = [] + for inf in self: + state.append(inf) + return state + + def __setstate__(self, state): + for inf in state: + self.append(inf) from exact_gaussian_inference import ExactGaussianInference from laplace import Laplace diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py index 309cb7e5..70b0e0ea 100644 --- a/GPy/inference/latent_function_inference/posterior.py +++ b/GPy/inference/latent_function_inference/posterior.py @@ -95,7 +95,7 @@ class Posterior(object): """ if self._covariance is None: #LiK, _ = dtrtrs(self.woodbury_chol, self._K, lower=1) - self._covariance = self._K - (np.tensordot(np.dot(np.atleast_3d(self.woodbury_inv).T, self._K), self._K, [1,0]).T).squeeze() + self._covariance = (np.atleast_3d(self._K) - np.tensordot(np.dot(np.atleast_3d(self.woodbury_inv).T, self._K), self._K, [1,0]).T).squeeze() #self._covariance = self._K - self._K.dot(self.woodbury_inv).dot(self._K) return self._covariance diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py index 3043a7e8..a9a137dc 100644 --- a/GPy/inference/latent_function_inference/var_dtc.py +++ b/GPy/inference/latent_function_inference/var_dtc.py @@ -202,6 +202,17 @@ class VarDTCMissingData(LatentFunctionInference): def set_limit(self, limit): self._Y.limit = limit + def __getstate__(self): + # has to be overridden, as Cacher 
objects cannot be pickled.
+        return self._Y.limit, self._inan
+
+    def __setstate__(self, state):
+        # has to be overridden, as Cacher objects cannot be pickled.
+        from ...util.caching import Cacher
+        self.limit = state[0]
+        self._inan = state[1]
+        self._Y = Cacher(self._subarray_computations, self.limit)
+
     def _subarray_computations(self, Y):
         if self._inan is None:
             inan = np.isnan(Y)
@@ -272,7 +283,11 @@
             else:
                 beta = beta_all
                 VVT_factor = (beta*y)
-            VVT_factor_all[v, ind].flat = VVT_factor.flat
+            try:
+                VVT_factor_all[v, ind].flat = VVT_factor.flat
+            except ValueError:
+                mult = np.ravel_multi_index((v.nonzero()[0][:,None],ind[None,:]), VVT_factor_all.shape)
+                VVT_factor_all.flat[mult] = VVT_factor
             output_dim = y.shape[1]
             psi0 = psi0_all[v]
diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py
index 1c237a6c..12f5d444 100644
--- a/GPy/kern/_src/add.py
+++ b/GPy/kern/_src/add.py
@@ -134,7 +134,7 @@ class Add(CombinationKernel):
                     eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.variance * 2.
                 else:
                     eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2.
-            target += p1.gradients_Z_expectations(eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
+            target += p1.gradients_Z_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
         return target
 
     def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
diff --git a/GPy/kern/_src/psi_comp/rbf_psi_comp.py b/GPy/kern/_src/psi_comp/rbf_psi_comp.py
new file mode 100644
index 00000000..d5cd8951
--- /dev/null
+++ b/GPy/kern/_src/psi_comp/rbf_psi_comp.py
@@ -0,0 +1,170 @@
+"""
+The module for psi-statistics for RBF kernel
+"""
+
+import numpy as np
+from . import PSICOMP
+from GPy.util.caching import Cache_this
+from ....util.misc import param_to_array
+from scipy import weave
+from ....util.config import *
+
+class PSICOMP_RBF(PSICOMP):
+
+    @Cache_this(limit=1, ignore_args=(0,))
+    def psicomputations(self, variance, lengthscale, Z, variational_posterior):
+        """
+        Z - MxQ
+        mu - NxQ
+        S - NxQ
+        gamma - NxQ
+        """
+        # here are the "statistics" for psi0, psi1 and psi2
+        # Produced intermediate results:
+        # _psi1    NxM
+        mu = variational_posterior.mean
+        S = variational_posterior.variance
+
+        psi0 = np.empty(mu.shape[0])
+        psi0[:] = variance
+        psi1 = self._psi1computations(variance, lengthscale, Z, mu, S)
+        psi2 = self._psi2computations(variance, lengthscale, Z, mu, S).sum(axis=0)
+        return psi0, psi1, psi2
+
+    @Cache_this(limit=1, ignore_args=(0,))
+    def _psi1computations(self, variance, lengthscale, Z, mu, S):
+        """
+        Z - MxQ
+        mu - NxQ
+        S - NxQ
+        gamma - NxQ
+        """
+        # here are the "statistics" for psi1
+        # Produced intermediate results:
+        # _psi1    NxM
+
+        lengthscale2 = np.square(lengthscale)
+
+        # psi1
+        _psi1_logdenom = np.log(S/lengthscale2+1.).sum(axis=-1) # N
+        _psi1_log = (_psi1_logdenom[:,None]+np.einsum('nmq,nq->nm',np.square(mu[:,None,:]-Z[None,:,:]),1./(S+lengthscale2)))/(-2.)
+        _psi1 = variance*np.exp(_psi1_log)
+
+        return _psi1
+
+    @Cache_this(limit=1, ignore_args=(0,))
+    def _psi2computations(self, variance, lengthscale, Z, mu, S):
+        """
+        Z - MxQ
+        mu - NxQ
+        S - NxQ
+        gamma - NxQ
+        """
+        # here are the "statistics" for psi2
+        # Produced intermediate results:
+        # _psi2    MxM
+
+        lengthscale2 = np.square(lengthscale)
+
+        _psi2_logdenom = np.log(2.*S/lengthscale2+1.).sum(axis=-1)/(-2.) # N
+        _psi2_exp1 = (np.square(Z[:,None,:]-Z[None,:,:])/lengthscale2).sum(axis=-1)/(-4.) #MxM
+        Z_hat = (Z[:,None,:]+Z[None,:,:])/2. 
#MxMxQ + denom = 1./(2.*S+lengthscale2) + _psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+2.*np.einsum('nq,moq,nq->nmo',mu,Z_hat,denom)-np.einsum('moq,nq->nmo',np.square(Z_hat),denom) + _psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2) + + + return _psi2 + + @Cache_this(limit=1, ignore_args=(0,1,2,3)) + def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): + ARD = (len(lengthscale)!=1) + + dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1 = self._psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance) + dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2 = self._psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance) + + dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2 + + dL_dlengscale = dl_psi1 + dl_psi2 + if not ARD: + dL_dlengscale = dL_dlengscale.sum() + + dL_dmu = dmu_psi1 + dmu_psi2 + dL_dS = dS_psi1 + dS_psi2 + dL_dZ = dZ_psi1 + dZ_psi2 + + return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS + + def _psi1compDer(self, dL_dpsi1, variance, lengthscale, Z, mu, S): + """ + dL_dpsi1 - NxM + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi1 + # Produced intermediate results: dL_dparams w.r.t. psi1 + # _dL_dvariance 1 + # _dL_dlengthscale Q + # _dL_dZ MxQ + # _dL_dgamma NxQ + # _dL_dmu NxQ + # _dL_dS NxQ + + lengthscale2 = np.square(lengthscale) + + _psi1 = self._psi1computations(variance, lengthscale, Z, mu, S) + Lpsi1 = dL_dpsi1*_psi1 + Zmu = Z[None,:,:]-mu[:,None,:] # NxMxQ + denom = 1./(S+lengthscale2) + Zmu2_denom = np.square(Zmu)*denom[:,None,:] #NxMxQ + _dL_dvar = Lpsi1.sum()/variance + _dL_dmu = np.einsum('nm,nmq,nq->nq',Lpsi1,Zmu,denom) + _dL_dS = np.einsum('nm,nmq,nq->nq',Lpsi1,(Zmu2_denom-1.),denom)/2. + _dL_dZ = -np.einsum('nm,nmq,nq->mq',Lpsi1,Zmu,denom) + _dL_dl = np.einsum('nm,nmq,nq->q',Lpsi1,(Zmu2_denom+(S/lengthscale2)[:,None,:]),denom*lengthscale) + + return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS + + def _psi2compDer(self, dL_dpsi2, variance, lengthscale, Z, mu, S): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + dL_dpsi2 - MxM + """ + # here are the "statistics" for psi2 + # Produced the derivatives w.r.t. 
psi2: + # _dL_dvariance 1 + # _dL_dlengthscale Q + # _dL_dZ MxQ + # _dL_dgamma NxQ + # _dL_dmu NxQ + # _dL_dS NxQ + + lengthscale2 = np.square(lengthscale) + denom = 1./(2*S+lengthscale2) + denom2 = np.square(denom) + + _psi2 = self._psi2computations(variance, lengthscale, Z, mu, S) # NxMxM + + Lpsi2 = dL_dpsi2[None,:,:]*_psi2 + Lpsi2sum = np.einsum('nmo->n',Lpsi2) #N + Lpsi2Z = np.einsum('nmo,oq->nq',Lpsi2,Z) #NxQ + Lpsi2Z2 = np.einsum('nmo,oq,oq->nq',Lpsi2,Z,Z) #NxQ + Lpsi2Z2p = np.einsum('nmo,mq,oq->nq',Lpsi2,Z,Z) #NxQ + Lpsi2Zhat = Lpsi2Z + Lpsi2Zhat2 = (Lpsi2Z2+Lpsi2Z2p)/2 + + _dL_dvar = Lpsi2sum.sum()*2/variance + _dL_dmu = (-2*denom) * (mu*Lpsi2sum[:,None]-Lpsi2Zhat) + _dL_dS = (2*np.square(denom))*(np.square(mu)*Lpsi2sum[:,None]-2*mu*Lpsi2Zhat+Lpsi2Zhat2) - denom*Lpsi2sum[:,None] + _dL_dZ = -np.einsum('nmo,oq->oq',Lpsi2,Z)/lengthscale2+np.einsum('nmo,oq->mq',Lpsi2,Z)/lengthscale2+ \ + 2*np.einsum('nmo,nq,nq->mq',Lpsi2,mu,denom) - np.einsum('nmo,nq,mq->mq',Lpsi2,denom,Z) - np.einsum('nmo,oq,nq->mq',Lpsi2,Z,denom) + _dL_dl = 2*lengthscale* ((S/lengthscale2*denom+np.square(mu*denom))*Lpsi2sum[:,None]+(Lpsi2Z2-Lpsi2Z2p)/(2*np.square(lengthscale2))- + (2*mu*denom2)*Lpsi2Zhat+denom2*Lpsi2Zhat2).sum(axis=0) + + return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS + diff --git a/GPy/kern/_src/stationary.py b/GPy/kern/_src/stationary.py index f561baa4..ef8900f9 100644 --- a/GPy/kern/_src/stationary.py +++ b/GPy/kern/_src/stationary.py @@ -180,7 +180,7 @@ class Stationary(Kern): return np.zeros(X.shape) def input_sensitivity(self): - return np.ones(self.input_dim)/self.lengthscale + return np.ones(self.input_dim)/self.lengthscale**2 class Exponential(Stationary): def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Exponential'): diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py index 2e7589a0..bc30bcfc 100644 --- a/GPy/models/bayesian_gplvm.py +++ b/GPy/models/bayesian_gplvm.py @@ -82,8 +82,8 @@ class BayesianGPLVM(SparseGP): def plot_latent(self, labels=None, which_indices=None, resolution=50, ax=None, marker='o', s=40, fignum=None, plot_inducing=True, legend=True, - plot_limits=None, - aspect='auto', updates=False, **kwargs): + plot_limits=None, + aspect='auto', updates=False, predict_kwargs={}, imshow_kwargs={}): import sys assert "matplotlib" in sys.modules, "matplotlib package has not been imported." 
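+        # predict_kwargs are forwarded to model.predict when evaluating the latent
+        # space variance; imshow_kwargs are passed on to the underlying imshow call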
from ..plotting.matplot_dep import dim_reduction_plots
@@ -91,7 +91,7 @@ class BayesianGPLVM(SparseGP):
         return dim_reduction_plots.plot_latent(self, labels, which_indices,
                                                resolution, ax, marker, s,
                                                fignum, plot_inducing, legend,
-                                               plot_limits, aspect, updates, **kwargs)
+                                               plot_limits, aspect, updates, predict_kwargs, imshow_kwargs)
 
     def do_test_latents(self, Y):
         """
diff --git a/GPy/models/mrd.py b/GPy/models/mrd.py
index 458a70a1..0e6547bb 100644
--- a/GPy/models/mrd.py
+++ b/GPy/models/mrd.py
@@ -9,16 +9,25 @@
 from ..core import Model
 from ..kern import Kern
 from ..core.parameterization.variational import NormalPosterior, NormalPrior
 from ..core.parameterization import Param, Parameterized
+from ..core.parameterization.observable_array import ObsAr
 from ..inference.latent_function_inference.var_dtc import VarDTCMissingData, VarDTC
+from ..inference.latent_function_inference import InferenceMethodList
 from ..likelihoods import Gaussian
-from GPy.util.initialization import initialize_latent
+from ..util.initialization import initialize_latent
+from ..core.sparse_gp import SparseGP, GP
 
-class MRD(Model):
+class MRD(SparseGP):
     """
+    !WARNING: This is bleeding edge code and still in development.
+    Functionality may change fundamentally during development!
+
     Apply MRD to all given datasets Y in Ylist.
     Y_i in [n x p_i]
+
+    If Ylist is a dictionary, the keys of the dictionary are the names, and the
+    values are the different datasets to compare.
+
     The samples n in the datasets need to match up, whereas the dimensionality
     p_d can differ.
@@ -39,40 +48,71 @@ class MRD(Model):
     :param num_inducing: number of inducing inputs to use
     :param Z: initial inducing inputs
     :param kernel: list of kernels or kernel to copy for each output
-    :type kernel: [GPy.kern.kern] | GPy.kern.kern | None (default)
-    :param :class:`~GPy.inference.latent_function_inference inference_method: the inference method to use
-    :param :class:`~GPy.likelihoods.likelihood.Likelihood` likelihood: the likelihood to use
+    :type kernel: [GPy.kern.Kern] | GPy.kern.Kern | None (default)
+    :param :class:`~GPy.inference.latent_function_inference inference_method:
+        InferenceMethodList of inferences, or one inference method for all
+    :param :class:`~GPy.likelihoods.likelihood.Likelihood` likelihoods: the likelihoods to use
     :param str name: the name of this model
     :param [str] Ynames: the names for the datasets given, must be of equal length as Ylist or None
     """
     def __init__(self, Ylist, input_dim, X=None, X_variance=None,
                  initx = 'PCA', initz = 'permute',
                  num_inducing=10, Z=None, kernel=None,
-                 inference_method=None, likelihood=None, name='mrd', Ynames=None):
-        super(MRD, self).__init__(name)
+                 inference_method=None, likelihoods=None, name='mrd', Ynames=None):
+        super(GP, self).__init__(name)
 
         self.input_dim = input_dim
         self.num_inducing = num_inducing
 
-        self.Ylist = Ylist
+        if isinstance(Ylist, dict):
+            Ynames, Ylist = zip(*Ylist.items())
+
+        self.Ylist = [ObsAr(Y) for Y in Ylist]
+
+        if Ynames is None:
+            Ynames = ['Y{}'.format(i) for i in range(len(Ylist))]
+        self.names = Ynames
+        assert len(self.names) == len(self.Ylist), "one name per dataset, or None if Ylist is a dict"
+
+        if inference_method is None:
+            self.inference_method = InferenceMethodList()
+            warned = False
+            for y in Ylist:
+                inan = np.isnan(y)
+                if np.any(inan):
+                    if not warned:
+                        print "WARNING: NaN values detected, make sure initx method can cope with NaN values or provide starting latent space X"
+                        warned = True
self.inference_method.append(VarDTCMissingData(limit=1, inan=inan)) + else: + self.inference_method.append(VarDTC(limit=1)) + else: + if not isinstance(inference_method, InferenceMethodList): + inference_method = InferenceMethodList(inference_method) + self.inference_method = inference_method + + self._in_init_ = True - X, fracs = self._init_X(initx, Ylist) + if X is None: + X, fracs = self._init_X(initx, Ylist) + else: + fracs = [X.var(0)]*len(Ylist) self.Z = Param('inducing inputs', self._init_Z(initz, X)) self.num_inducing = self.Z.shape[0] # ensure M==N if M>N # sort out the kernels if kernel is None: from ..kern import RBF - self.kern = [RBF(input_dim, ARD=1, lengthscale=fracs[i], name='rbf'.format(i)) for i in range(len(Ylist))] + self.kernels = [RBF(input_dim, ARD=1, lengthscale=fracs[i]) for i in range(len(Ylist))] elif isinstance(kernel, Kern): - self.kern = [] + self.kernels = [] for i in range(len(Ylist)): k = kernel.copy() - self.kern.append(k) + self.kernels.append(k) else: assert len(kernel) == len(Ylist), "need one kernel per output" assert all([isinstance(k, Kern) for k in kernel]), "invalid kernel object detected!" - self.kern = kernel + self.kernels = kernel if X_variance is None: X_variance = np.random.uniform(0.1, 0.2, X.shape) @@ -80,32 +120,27 @@ class MRD(Model): self.variational_prior = NormalPrior() self.X = NormalPosterior(X, X_variance) - if likelihood is None: - self.likelihood = [Gaussian(name='Gaussian_noise'.format(i)) for i in range(len(Ylist))] - else: self.likelihood = likelihood - - if inference_method is None: - self.inference_method= [] - for y in Ylist: - if np.any(np.isnan(y)): - self.inference_method.append(VarDTCMissingData(limit=1)) - else: - self.inference_method.append(VarDTC(limit=1)) - else: - self.inference_method = inference_method - self.inference_method.set_limit(len(Ylist)) + if likelihoods is None: + self.likelihoods = [Gaussian(name='Gaussian_noise'.format(i)) for i in range(len(Ylist))] + else: self.likelihoods = likelihoods self.add_parameters(self.X, self.Z) - if Ynames is None: - Ynames = ['Y{}'.format(i) for i in range(len(Ylist))] + self.bgplvms = [] + self.num_data = Ylist[0].shape[0] + + for i, n, k, l, Y in itertools.izip(itertools.count(), Ynames, self.kernels, self.likelihoods, self.Ylist): + assert Y.shape[0] == self.num_data, "All datasets need to share the number of datapoints, and those have to correspond to one another" - for i, n, k, l in itertools.izip(itertools.count(), Ynames, self.kern, self.likelihood): p = Parameterized(name=n) p.add_parameter(k) + p.kern = k p.add_parameter(l) - setattr(self, 'Y{}'.format(i), p) + p.likelihood = l self.add_parameter(p) + self.bgplvms.append(p) + + self.posterior = None self._in_init_ = False def parameters_changed(self): @@ -114,13 +149,13 @@ class MRD(Model): self.Z.gradient[:] = 0. self.X.gradient[:] = 0. 
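+        # each dataset keeps its own kernel, likelihood and inference method,
+        # while X and Z are shared: the loop below accumulates every dataset's
+        # gradient contribution into the shared latent space and inducing inputs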
- for y, k, l, i in itertools.izip(self.Ylist, self.kern, self.likelihood, self.inference_method): + for y, k, l, i in itertools.izip(self.Ylist, self.kernels, self.likelihoods, self.inference_method): posterior, lml, grad_dict = i.inference(k, self.X, self.Z, l, y) self.posteriors.append(posterior) self._log_marginal_likelihood += lml - # likelihood gradients + # likelihoods gradients l.update_gradients(grad_dict.pop('dL_dthetaL')) #gradients wrt kernel @@ -133,13 +168,20 @@ class MRD(Model): #gradients wrt Z self.Z.gradient += k.gradients_X(dL_dKmm, self.Z) self.Z.gradient += k.gradients_Z_expectations( - grad_dict['dL_dpsi1'], grad_dict['dL_dpsi2'], Z=self.Z, variational_posterior=self.X) + grad_dict['dL_dpsi0'], + grad_dict['dL_dpsi1'], + grad_dict['dL_dpsi2'], + Z=self.Z, variational_posterior=self.X) dL_dmean, dL_dS = k.gradients_qX_expectations(variational_posterior=self.X, Z=self.Z, **grad_dict) self.X.mean.gradient += dL_dmean self.X.variance.gradient += dL_dS # update for the KL divergence + self.posterior = self.posteriors[0] + self.kern = self.kernels[0] + self.likelihood = self.likelihoods[0] + self.variational_prior.update_gradients_KL(self.X) self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X) @@ -151,7 +193,7 @@ class MRD(Model): Ylist = self.Ylist if init in "PCA_concat": X, fracs = initialize_latent('PCA', self.input_dim, np.hstack(Ylist)) - fracs = [fracs]*self.input_dim + fracs = [fracs]*len(Ylist) elif init in "PCA_single": X = np.zeros((Ylist[0].shape[0], self.input_dim)) fracs = [] @@ -162,7 +204,7 @@ class MRD(Model): else: # init == 'random': X = np.random.randn(Ylist[0].shape[0], self.input_dim) fracs = X.var(0) - fracs = [fracs]*self.input_dim + fracs = [fracs]*len(Ylist) X -= X.mean() X /= X.std() return X, fracs @@ -181,6 +223,7 @@ class MRD(Model): fig = pylab.figure(num=fignum) sharex_ax = None sharey_ax = None + plots = [] for i, g in enumerate(self.bgplvms): try: if sharex: @@ -197,26 +240,36 @@ class MRD(Model): ax = axes[i] else: raise ValueError("Need one axes per latent dimension input_dim") - plotf(i, g, ax) + plots.append(plotf(i, g, ax)) if sharey_ax is not None: pylab.setp(ax.get_yticklabels(), visible=False) pylab.draw() if axes is None: - fig.tight_layout() - return fig - else: - return pylab.gcf() + try: + fig.tight_layout() + except: + pass + return plots - def plot_X(self, fignum=None, ax=None): - fig = self._handle_plotting(fignum, ax, lambda i, g, ax: ax.imshow(g.X)) - return fig + def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None, Yindex=0): + """ + Prediction for data set Yindex[default=0]. + This predicts the output mean and variance for the dataset given in Ylist[Yindex] + """ + self.posterior = self.posteriors[Yindex] + self.kern = self.kernels[Yindex] + self.likelihood = self.likelihoods[Yindex] + return super(MRD, self).predict(Xnew, full_cov, Y_metadata, kern) - def plot_predict(self, fignum=None, ax=None, sharex=False, sharey=False, **kwargs): - fig = self._handle_plotting(fignum, - ax, - lambda i, g, ax: ax.imshow(g. predict(g.X)[0], **kwargs), - sharex=sharex, sharey=sharey) - return fig + #=============================================================================== + # TODO: Predict! Maybe even change to several bgplvms, which share an X? 
+ #=============================================================================== + # def plot_predict(self, fignum=None, ax=None, sharex=False, sharey=False, **kwargs): + # fig = self._handle_plotting(fignum, + # ax, + # lambda i, g, ax: ax.imshow(g.predict(g.X)[0], **kwargs), + # sharex=sharex, sharey=sharey) + # return fig def plot_scales(self, fignum=None, ax=None, titles=None, sharex=False, sharey=True, *args, **kwargs): """ @@ -228,28 +281,58 @@ class MRD(Model): """ if titles is None: titles = [r'${}$'.format(name) for name in self.names] - ymax = reduce(max, [np.ceil(max(g.input_sensitivity())) for g in self.bgplvms]) + ymax = reduce(max, [np.ceil(max(g.kern.input_sensitivity())) for g in self.bgplvms]) def plotf(i, g, ax): ax.set_ylim([0,ymax]) - g.kern.plot_ARD(ax=ax, title=titles[i], *args, **kwargs) + return g.kern.plot_ARD(ax=ax, title=titles[i], *args, **kwargs) fig = self._handle_plotting(fignum, ax, plotf, sharex=sharex, sharey=sharey) return fig - def plot_latent(self, fignum=None, ax=None, *args, **kwargs): - fig = self.gref.plot_latent(fignum=fignum, ax=ax, *args, **kwargs) # self._handle_plotting(fignum, ax, lambda i, g, ax: g.plot_latent(ax=ax, *args, **kwargs)) - return fig + def plot_latent(self, labels=None, which_indices=None, + resolution=50, ax=None, marker='o', s=40, + fignum=None, plot_inducing=True, legend=True, + plot_limits=None, + aspect='auto', updates=False, predict_kwargs={}, imshow_kwargs={}): + """ + see plotting.matplot_dep.dim_reduction_plots.plot_latent + if predict_kwargs is None, will plot latent spaces for 0th dataset (and kernel), otherwise give + predict_kwargs=dict(Yindex='index') for plotting only the latent space of dataset with 'index'. + """ + import sys + assert "matplotlib" in sys.modules, "matplotlib package has not been imported." 
+ from ..plotting.matplot_dep import dim_reduction_plots + if "Yindex" not in predict_kwargs: + predict_kwargs['Yindex'] = 0 + if ax is None: + fig = pylab.figure(num=fignum) + ax = fig.add_subplot(111) + else: + fig = ax.figure + plot = dim_reduction_plots.plot_latent(self, labels, which_indices, + resolution, ax, marker, s, + fignum, plot_inducing, legend, + plot_limits, aspect, updates, predict_kwargs, imshow_kwargs) + ax.set_title(self.bgplvms[predict_kwargs['Yindex']].name) + try: + fig.tight_layout() + except: + pass - def _debug_plot(self): - self.plot_X_1d() - fig = pylab.figure("MRD DEBUG PLOT", figsize=(4 * len(self.bgplvms), 9)) - fig.clf() - axes = [fig.add_subplot(3, len(self.bgplvms), i + 1) for i in range(len(self.bgplvms))] - self.plot_X(ax=axes) - axes = [fig.add_subplot(3, len(self.bgplvms), i + len(self.bgplvms) + 1) for i in range(len(self.bgplvms))] - self.plot_latent(ax=axes) - axes = [fig.add_subplot(3, len(self.bgplvms), i + 2 * len(self.bgplvms) + 1) for i in range(len(self.bgplvms))] - self.plot_scales(ax=axes) - pylab.draw() - fig.tight_layout() + return plot + def __getstate__(self): + # TODO: + import copy + state = copy.copy(self.__dict__) + del state['kernels'] + del state['kern'] + del state['likelihood'] + return state + def __setstate__(self, state): + # TODO: + super(MRD, self).__setstate__(state) + self.kernels = [p.kern for p in self.bgplvms] + self.kern = self.kernels[0] + self.likelihood = self.likelihoods[0] + self.parameters_changed() \ No newline at end of file diff --git a/GPy/plotting/matplot_dep/dim_reduction_plots.py b/GPy/plotting/matplot_dep/dim_reduction_plots.py index f8413671..0ba082df 100644 --- a/GPy/plotting/matplot_dep/dim_reduction_plots.py +++ b/GPy/plotting/matplot_dep/dim_reduction_plots.py @@ -31,7 +31,7 @@ def plot_latent(model, labels=None, which_indices=None, resolution=50, ax=None, marker='o', s=40, fignum=None, plot_inducing=False, legend=True, plot_limits=None, - aspect='auto', updates=False, **kwargs): + aspect='auto', updates=False, predict_kwargs={}, imshow_kwargs={}): """ :param labels: a np.array of size model.num_data containing labels for the points (can be number, strings, etc) :param resolution: the resolution of the grid on which to evaluate the predictive variance @@ -60,7 +60,7 @@ def plot_latent(model, labels=None, which_indices=None, def plot_function(x): Xtest_full = np.zeros((x.shape[0], model.X.shape[1])) Xtest_full[:, [input_1, input_2]] = x - _, var = model.predict(Xtest_full) + _, var = model.predict(Xtest_full, **predict_kwargs) var = var[:, :1] return np.log(var) @@ -81,7 +81,7 @@ def plot_latent(model, labels=None, which_indices=None, view = ImshowController(ax, plot_function, (xmin, ymin, xmax, ymax), resolution, aspect=aspect, interpolation='bilinear', - cmap=pb.cm.binary, **kwargs) + cmap=pb.cm.binary, **imshow_kwargs) # make sure labels are in order of input: ulabels = [] diff --git a/GPy/plotting/matplot_dep/models_plots.py b/GPy/plotting/matplot_dep/models_plots.py index 84747d05..8f3e55b0 100644 --- a/GPy/plotting/matplot_dep/models_plots.py +++ b/GPy/plotting/matplot_dep/models_plots.py @@ -97,7 +97,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', for d in which_data_ycols: plots['gpplot'] = gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], ax=ax, edgecol=linecol, fillcol=fillcol) - plots['dataplot'] = ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], data_symbol, mew=1.5) + if not plot_raw: plots['dataplot'] = ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, 
d], data_symbol, mew=1.5) #optionally plot some samples if samples: #NOTE not tested with fixed_inputs @@ -151,7 +151,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', for d in which_data_ycols: m_d = m[:,d].reshape(resolution, resolution).T plots['contour'] = ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) - plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) + if not plot_raw: plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) #set the limits of the plot to some sensible values ax.set_xlim(xmin[0], xmax[0]) diff --git a/GPy/plotting/matplot_dep/visualize.py b/GPy/plotting/matplot_dep/visualize.py index ee505af9..557729da 100644 --- a/GPy/plotting/matplot_dep/visualize.py +++ b/GPy/plotting/matplot_dep/visualize.py @@ -88,7 +88,6 @@ class vector_show(matplotlib_show): class lvm(matplotlib_show): - def __init__(self, vals, model, data_visualize, latent_axes=None, sense_axes=None, latent_index=[0,1], disable_drag=False): """Visualize a latent variable model @@ -150,7 +149,6 @@ class lvm(matplotlib_show): pass def on_click(self, event): - print 'click!' if event.inaxes!=self.latent_axes: return if self.disable_drag: self.move_on = True @@ -228,11 +226,10 @@ class lvm_dimselect(lvm): self.labels = labels lvm.__init__(self,vals,model,data_visualize,latent_axes,sense_axes,latent_index) self.show_sensitivities() - print "use left and right mouse butons to select dimensions" + print "use left and right mouse buttons to select dimensions" def on_click(self, event): - if event.inaxes==self.sense_axes: new_index = max(0,min(int(np.round(event.xdata-0.5)),self.model.input_dim-1)) if event.button == 1: diff --git a/GPy/testing/pickle_tests.py b/GPy/testing/pickle_tests.py index 94504645..13a67744 100644 --- a/GPy/testing/pickle_tests.py +++ b/GPy/testing/pickle_tests.py @@ -4,7 +4,8 @@ Created on 13 Mar 2014 @author: maxz ''' import unittest, itertools -import cPickle as pickle +#import cPickle as pickle +import pickle import numpy as np from GPy.core.parameterization.index_operations import ParameterIndexOperations,\ ParameterIndexOperationsView diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py index 4d89ece2..d140fe3a 100644 --- a/GPy/util/datasets.py +++ b/GPy/util/datasets.py @@ -687,14 +687,20 @@ def hapmap3(data_set='hapmap3'): import bz2 except ImportError as i: raise i, "Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset" - if not data_available(data_set): - download_data(data_set) + dirpath = os.path.join(data_path,'hapmap3') hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly' + unpacked_files = [os.path.join(dirpath, hapmap_file_name+ending) for ending in ['.ped', '.map']] + unpacked_files_exist = reduce(lambda a, b:a and b, map(os.path.exists, unpacked_files)) + + if not unpacked_files_exist and not data_available(data_set): + download_data(data_set) + preprocessed_data_paths = [os.path.join(dirpath,hapmap_file_name + file_name) for file_name in \ ['.snps.pickle', '.info.pickle', '.nan.pickle']] + if not reduce(lambda a,b: a and b, map(os.path.exists, preprocessed_data_paths)): if not overide_manual_authorize and not prompt_user("Preprocessing requires ~25GB " "of memory and can take a (very) long time, 
continue? [Y/n]"): @@ -708,8 +714,7 @@ def hapmap3(data_set='hapmap3'): perc="="*int(20.*progress/100.)) stdout.write(status); stdout.flush() return status - unpacked_files = [os.path.join(dirpath, hapmap_file_name+ending) for ending in ['.ped', '.map']] - if not reduce(lambda a,b: a and b, map(os.path.exists, unpacked_files)): + if not unpacked_files_exist: status=write_status('unpacking...', 0, '') curr = 0 for newfilepath in unpacked_files: @@ -726,6 +731,7 @@ def hapmap3(data_set='hapmap3'): status=write_status('unpacking...', curr+12.*file_processed/(file_size), status) curr += 12 status=write_status('unpacking...', curr, status) + os.remove(filepath) status=write_status('reading .ped...', 25, status) # Preprocess data: snpstrnp = np.loadtxt(unpacked_files[0], dtype=str) @@ -796,7 +802,7 @@ def hapmap3(data_set='hapmap3'): def singlecell(data_set='singlecell'): if not data_available(data_set): download_data(data_set) - + from pandas import read_csv dirpath = os.path.join(data_path, data_set) filename = os.path.join(dirpath, 'singlecell.csv') diff --git a/GPy/util/pca.py b/GPy/util/pca.py index 6c548b3d..967d0e1b 100644 --- a/GPy/util/pca.py +++ b/GPy/util/pca.py @@ -106,12 +106,14 @@ class pca(object): ulabels.append(lab) nlabels = len(ulabels) if colors is None: - colors = [cmap(float(i) / nlabels) for i in range(nlabels)] + colors = iter([cmap(float(i) / nlabels) for i in range(nlabels)]) + else: + colors = iter(colors) X_ = self.project(X, self.Q)[:,dimensions] kwargs.update(dict(s=s)) plots = list() for i, l in enumerate(ulabels): - kwargs.update(dict(color=colors[i], marker=marker[i % len(marker)])) + kwargs.update(dict(color=colors.next(), marker=marker[i % len(marker)])) plots.append(ax.scatter(*X_[labels == l, :].T, label=str(l), **kwargs)) ax.set_xlabel(r"PC$_1$") ax.set_ylabel(r"PC$_2$") diff --git a/GPy/util/subarray_and_sorting.py b/GPy/util/subarray_and_sorting.py index ab7b7672..f1707394 100644 --- a/GPy/util/subarray_and_sorting.py +++ b/GPy/util/subarray_and_sorting.py @@ -4,7 +4,7 @@ .. moduleauthor:: Max Zwiessele ''' -__updated__ = '2013-12-02' +__updated__ = '2014-05-20' import numpy as np
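
A note on the `updates` switch added to Observable in parameter_core.py above: notify_observers now returns early while updates is False, so several parameter changes can be batched without recomputing the model after each one; setting updates back to True triggers a single recomputation via _trigger_params_changed(). A minimal usage sketch (the GPRegression setup is illustrative only, not part of this patch):

    import numpy as np
    import GPy

    X = np.random.rand(20, 1)
    Y = np.sin(X) + np.random.randn(20, 1) * 0.05
    m = GPy.models.GPRegression(X, Y)

    m.updates = False             # silence observer notifications
    m.kern.lengthscale = 0.5      # no recomputation here ...
    m.kern.variance = 2.          # ... nor here
    m.likelihood.variance = 0.01
    m.updates = True              # one recomputation for all three changes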