Mirror of https://github.com/SheffieldML/GPy.git (synced 2026-05-10 04:22:38 +02:00)

Commit 3eb2fa3a8f: Merge branch 'devel' of github.com:SheffieldML/GPy into devel

12 changed files with 717 additions and 332 deletions

@@ -26,39 +26,39 @@ class ParameterIndexOperations(object):

    This object wraps a dictionary, whose keys are _operations_ that we'd like
    to apply to a parameter array, and whose values are np integer arrays which
    index the parameter array appropriately.

    A model instance will contain one instance of this class for each thing
    that needs indexing (i.e. constraints, ties and priors). Parameters within
    the model contain instances of the ParameterIndexOperationsView class,
    which can map from a 'local' index (starting at 0) to this global index.

    Here's an illustration:

    #=======================================================================
    model : 0 1 2 3 4 5 6 7 8 9
    key1: 4 5
    key2: 7 8

    param1: 0 1 2 3 4 5
    key1: 2 3
    key2: 5

    param2: 0 1 2 3 4
    key1: 0
    key2: 2 3
    #=======================================================================

    The views of this global index have a subset of the keys in this global
    (model) index.

    Adding a new key (e.g. a constraint) to a view will cause the view to pass
    the new key to the global index, along with the local index and an offset.
    This global index then stores the key and the appropriate global index
    (which can be seen by the view).

    See also:
    ParameterIndexOperationsView
    """
    _offset = 0
    def __init__(self, constraints=None):
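For illustration, the global/local mapping described in this docstring can be sketched in a few self-contained lines. The class and method names below are made up for the sketch only and are not the GPy implementation.

import numpy as np

class ToyGlobalIndex(object):
    """Maps operation keys to global integer indices into the model's parameter array."""
    def __init__(self):
        self._index = {}  # key -> np.ndarray of global indices

    def add(self, key, indices, offset):
        # a view hands over its local indices together with its offset into the model array
        new = np.asarray(indices, dtype=int) + offset
        self._index[key] = np.union1d(self._index.get(key, np.empty(0, dtype=int)), new)

class ToyIndexView(object):
    """A 'local' view starting at 0, backed by the shared global index."""
    def __init__(self, global_index, offset):
        self._global, self._offset = global_index, offset

    def add(self, key, local_indices):
        self._global.add(key, local_indices, self._offset)

# usage: if param1 occupies model indices 2..7, its view has offset 2, so adding
# local indices [2, 3] stores global indices [4, 5], as in the picture above
gi = ToyGlobalIndex()
param1_view = ToyIndexView(gi, offset=2)
param1_view.add('key1', [2, 3])
print gi._index['key1']   # -> [4 5]   (Python 2 print, matching this codebase)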
@@ -221,8 +221,6 @@ class ParameterIndexOperationsView(object):

    def shift_left(self, start, size):
        self._param_index_ops.shift_left(start+self._offset, size)
        self._offset -= size
        self._size -= size

    def clear(self):
        for i, ind in self.items():

@@ -18,7 +18,7 @@ import numpy as np
import re
import logging

-__updated__ = '2014-10-28'
+__updated__ = '2014-11-03'

class HierarchyError(Exception):
    """

@@ -924,7 +924,7 @@ class Parameterizable(OptimizationHandlable):
        !WARNING!: setting the parameter array MUST always be done in memory:
        m.param_array[:] = m_copy.param_array
        """
-       if self.__dict__.get('_param_array_', None) is None:
+       if (self.__dict__.get('_param_array_', None) is None) or (self._param_array_.size != self.size):
            self._param_array_ = np.empty(self.size, dtype=np.float64)
        return self._param_array_

@@ -1002,7 +1002,7 @@ class Parameterizable(OptimizationHandlable):
    #=========================================================================
    @property
    def gradient(self):
-       if self.__dict__.get('_gradient_array_', None) is None:
+       if (self.__dict__.get('_gradient_array_', None) is None) or self._gradient_array_.size != self.size:
            self._gradient_array_ = np.empty(self.size, dtype=np.float64)
        return self._gradient_array_
|
||||
|
||||
|
|
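Both hunks above apply the same lazy-buffer pattern: the cached array is now re-allocated whenever the model's size changes, not only when it is missing. A minimal sketch of that pattern, with made-up names:

import numpy as np

class Holder(object):
    size = 3
    @property
    def gradient(self):
        buf = self.__dict__.get('_gradient_array_', None)
        if buf is None or buf.size != self.size:   # re-allocate on size change, not just on first access
            self._gradient_array_ = np.empty(self.size, dtype=np.float64)
        return self._gradient_array_

h = Holder()
assert h.gradient.size == 3
h.size = 5                       # e.g. a new parameter was linked to the model
assert h.gradient.size == 5      # the cached buffer no longer goes stale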

@@ -9,6 +9,7 @@ from param import ParamConcatenation
from parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing

import logging
from GPy.core.parameterization.index_operations import ParameterIndexOperationsView
logger = logging.getLogger("parameters changed meta")

class ParametersChangedMeta(type):

@@ -20,7 +21,7 @@ class ParametersChangedMeta(type):
        self._in_init_ = False
        logger.debug("connecting parameters")
        self._highest_parent_._connect_parameters()
-       self._highest_parent_._notify_parent_change()
+       #self._highest_parent_._notify_parent_change()
        self._highest_parent_._connect_fixes()
        logger.debug("calling parameters changed")
        self.parameters_changed()
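The hunk above appears to sit inside the metaclass's __call__, which runs connection and update hooks on the freshly built instance after its __init__ returns. A toy sketch of that pattern (not GPy code):

class AfterInitMeta(type):
    """Run connection/update hooks automatically after every __init__."""
    def __call__(cls, *args, **kw):
        self = super(AfterInitMeta, cls).__call__(*args, **kw)   # creates the instance, runs __init__
        self._connect_parameters()      # in GPy: connect parameters, fixes, ...
        self.parameters_changed()       # ... then trigger the user hook
        return self

class ToyModel(object):
    __metaclass__ = AfterInitMeta       # Python 2 syntax, matching this codebase
    def __init__(self):
        self.connected = False
    def _connect_parameters(self):
        self.connected = True
    def parameters_changed(self):
        print "parameters changed, connected =", self.connected

ToyModel()   # prints: parameters changed, connected = True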
|
@ -140,6 +141,8 @@ class Parameterized(Parameterizable):
|
|||
self.priors.shift_right(start, param.size)
|
||||
self.constraints.update(param.constraints, self.size)
|
||||
self.priors.update(param.priors, self.size)
|
||||
param._parent_ = self
|
||||
param._parent_index_ = len(self.parameters)
|
||||
self.parameters.append(param)
|
||||
else:
|
||||
start = sum(p.size for p in self.parameters[:index])
|
||||
|
|
@ -147,19 +150,23 @@ class Parameterized(Parameterizable):
|
|||
self.priors.shift_right(start, param.size)
|
||||
self.constraints.update(param.constraints, start)
|
||||
self.priors.update(param.priors, start)
|
||||
param._parent_ = self
|
||||
param._parent_index_ = index if index>=0 else len(self.parameters[:index])
|
||||
for p in self.parameters[index:]:
|
||||
p._parent_index_ += 1
|
||||
self.parameters.insert(index, param)
|
||||
|
||||
self._notify_parent_change()
|
||||
param.add_observer(self, self._pass_through_notify_observers, -np.inf)
|
||||
|
||||
parent = self
|
||||
while parent is not None:
|
||||
parent.size += param.size
|
||||
parent = parent._parent_
|
||||
self._notify_parent_change()
|
||||
|
||||
if not self._in_init_:
|
||||
self._connect_parameters()
|
||||
self._notify_parent_change()
|
||||
#self._connect_parameters()
|
||||
#self._notify_parent_change()
|
||||
|
||||
self._highest_parent_._connect_parameters(ignore_added_names=_ignore_added_names)
|
||||
self._highest_parent_._notify_parent_change()
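The while-loop shown in this hunk keeps the size attribute consistent along the whole parameter hierarchy when a new parameter is linked. A toy sketch of that bookkeeping (not GPy code):

class Node(object):
    def __init__(self, parent=None):
        self.size, self._parent_ = 0, parent

def link(node, param_size):
    # every ancestor up to the highest parent grows by the new parameter's size
    parent = node
    while parent is not None:
        parent.size += param_size
        parent = parent._parent_

root = Node()
child = Node(parent=root)
link(child, 3)
assert child.size == 3 and root.size == 3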

@@ -40,8 +40,7 @@ class SparseGP(GP):
    """

    def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
-                name='sparse gp', Y_metadata=None, normalizer=False,
-                missing_data=False, stochastic=False, batchsize=1):
+                name='sparse gp', Y_metadata=None, normalizer=False):
        #pick a sensible inference method
        if inference_method is None:
            if isinstance(likelihood, likelihoods.Gaussian):
|
||||
|
|
@ -55,260 +54,51 @@ class SparseGP(GP):
|
|||
self.num_inducing = Z.shape[0]
|
||||
|
||||
GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
|
||||
self.missing_data = missing_data
|
||||
|
||||
if stochastic and missing_data:
|
||||
self.missing_data = True
|
||||
self.ninan = ~np.isnan(Y)
|
||||
self.stochastics = SparseGPStochastics(self, batchsize)
|
||||
elif stochastic and not missing_data:
|
||||
self.missing_data = False
|
||||
self.stochastics = SparseGPStochastics(self, batchsize)
|
||||
elif missing_data:
|
||||
self.missing_data = True
|
||||
self.ninan = ~np.isnan(Y)
|
||||
self.stochastics = SparseGPMissing(self)
|
||||
else:
|
||||
self.stochastics = False
|
||||
|
||||
logger.info("Adding Z as parameter")
|
||||
self.link_parameter(self.Z, index=0)
|
||||
if self.missing_data:
|
||||
self.Ylist = []
|
||||
overall = self.Y_normalized.shape[1]
|
||||
m_f = lambda i: "Precomputing Y for missing data: {: >7.2%}".format(float(i+1)/overall)
|
||||
message = m_f(-1)
|
||||
print message,
|
||||
for d in xrange(overall):
|
||||
self.Ylist.append(self.Y_normalized[self.ninan[:, d], d][:, None])
|
||||
print ' '*(len(message)+1) + '\r',
|
||||
message = m_f(d)
|
||||
print message,
|
||||
print ''
|
||||
|
||||
self.posterior = None
|
||||
|
||||
def has_uncertain_inputs(self):
|
||||
return isinstance(self.X, VariationalPosterior)
|
||||
|
||||
def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None):
|
||||
"""
|
||||
This is the standard part, which usually belongs in parameters_changed.
|
||||
|
||||
For automatic handling of subsampling (such as missing_data, stochastics etc.), we need to put this into an inner
|
||||
loop, in order to ensure a different handling of gradients etc of different
|
||||
subsets of data.
|
||||
|
||||
The dict in current_values will be passed aroung as current_values for
|
||||
the rest of the algorithm, so this is the place to store current values,
|
||||
such as subsets etc, if necessary.
|
||||
|
||||
If Lm and dL_dKmm can be precomputed (or only need to be computed once)
|
||||
pass them in here, so they will be passed to the inference_method.
|
||||
|
||||
subset_indices is a dictionary of indices. you can put the indices however you
|
||||
like them into this dictionary for inner use of the indices inside the
|
||||
algorithm.
|
||||
"""
|
||||
try:
|
||||
posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=None)
|
||||
except:
|
||||
posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata)
|
||||
current_values = {}
|
||||
likelihood.update_gradients(grad_dict['dL_dthetaL'])
|
||||
current_values['likgrad'] = likelihood.gradient.copy()
|
||||
if subset_indices is None:
|
||||
subset_indices = {}
|
||||
if isinstance(X, VariationalPosterior):
|
||||
#gradients wrt kernel
|
||||
dL_dKmm = grad_dict['dL_dKmm']
|
||||
kern.update_gradients_full(dL_dKmm, Z, None)
|
||||
current_values['kerngrad'] = kern.gradient.copy()
|
||||
kern.update_gradients_expectations(variational_posterior=X,
|
||||
Z=Z,
|
||||
dL_dpsi0=grad_dict['dL_dpsi0'],
|
||||
dL_dpsi1=grad_dict['dL_dpsi1'],
|
||||
dL_dpsi2=grad_dict['dL_dpsi2'])
|
||||
current_values['kerngrad'] += kern.gradient
|
||||
|
||||
#gradients wrt Z
|
||||
current_values['Zgrad'] = kern.gradients_X(dL_dKmm, Z)
|
||||
current_values['Zgrad'] += kern.gradients_Z_expectations(
|
||||
grad_dict['dL_dpsi0'],
|
||||
grad_dict['dL_dpsi1'],
|
||||
grad_dict['dL_dpsi2'],
|
||||
Z=Z,
|
||||
variational_posterior=X)
|
||||
else:
|
||||
#gradients wrt kernel
|
||||
kern.update_gradients_diag(grad_dict['dL_dKdiag'], X)
|
||||
current_values['kerngrad'] = kern.gradient.copy()
|
||||
kern.update_gradients_full(grad_dict['dL_dKnm'], X, Z)
|
||||
current_values['kerngrad'] += kern.gradient
|
||||
kern.update_gradients_full(grad_dict['dL_dKmm'], Z, None)
|
||||
current_values['kerngrad'] += kern.gradient
|
||||
#gradients wrt Z
|
||||
current_values['Zgrad'] = kern.gradients_X(grad_dict['dL_dKmm'], Z)
|
||||
current_values['Zgrad'] += kern.gradients_X(grad_dict['dL_dKnm'].T, Z, X)
|
||||
return posterior, log_marginal_likelihood, grad_dict, current_values, subset_indices
|
||||
|
||||
def _inner_take_over_or_update(self, full_values=None, current_values=None, value_indices=None):
|
||||
"""
|
||||
This is for automatic updates of values in the inner loop of missing
|
||||
data handling. Both arguments are dictionaries and the values in
|
||||
full_values will be updated by the current_gradients.
|
||||
|
||||
If a key from current_values does not exist in full_values, it will be
|
||||
initialized to the value in current_values.
|
||||
|
||||
If there is indices needed for the update, value_indices can be used for
|
||||
that. If value_indices has the same key, as current_values, the update
|
||||
in full_values will be indexed by the indices in value_indices.
|
||||
|
||||
grads:
|
||||
dictionary of standing gradients (you will have to carefully make sure, that
|
||||
the ordering is right!). The values in here will be updated such that
|
||||
full_values[key] += current_values[key] forall key in full_gradients.keys()
|
||||
|
||||
gradients:
|
||||
dictionary of gradients in the current set of parameters.
|
||||
|
||||
value_indices:
|
||||
dictionary holding indices for the update in full_values.
|
||||
if the key exists the update rule is:def df(x):
|
||||
full_values[key][value_indices[key]] += current_values[key]
|
||||
"""
|
||||
for key in current_values.keys():
|
||||
if value_indices is not None and value_indices.has_key(key):
|
||||
index = value_indices[key]
|
||||
else:
|
||||
index = slice(None)
|
||||
if full_values.has_key(key):
|
||||
full_values[key][index] += current_values[key]
|
||||
else:
|
||||
full_values[key] = current_values[key]
|
||||
|
||||
def _inner_values_update(self, current_values):
|
||||
"""
|
||||
This exists if there is more to do with the current values.
|
||||
It will be called allways in the inner loop, so that
|
||||
you can do additional inner updates for the inside of the missing data
|
||||
loop etc. This can also be used for stochastic updates, when only working on
|
||||
one dimension of the output.
|
||||
"""
|
||||
pass
|
||||
|
||||
def _outer_values_update(self, full_values):
|
||||
"""
|
||||
Here you put the values, which were collected before in the right places.
|
||||
E.g. set the gradients of parameters, etc.
|
||||
"""
|
||||
self.likelihood.gradient = full_values['likgrad']
|
||||
self.kern.gradient = full_values['kerngrad']
|
||||
self.Z.gradient = full_values['Zgrad']
|
||||
|
||||
def _outer_init_full_values(self):
|
||||
"""
|
||||
If full_values has indices in values_indices, we might want to initialize
|
||||
the full_values differently, so that subsetting is possible.
|
||||
|
||||
Here you can initialize the full_values for the values needed.
|
||||
|
||||
Keep in mind, that if a key does not exist in full_values when updating
|
||||
values, it will be set (so e.g. for Z there is no need to initialize Zgrad,
|
||||
as there is no subsetting needed. For X in BGPLVM on the other hand we probably need
|
||||
to initialize the gradients for the mean and the variance in order to
|
||||
have the full gradient for indexing)
|
||||
"""
|
||||
return {}
|
||||
|
||||
def _outer_loop_for_missing_data(self):
|
||||
Lm = None
|
||||
dL_dKmm = None
|
||||
|
||||
self._log_marginal_likelihood = 0
|
||||
self.full_values = self._outer_init_full_values()
|
||||
|
||||
if self.posterior is None:
|
||||
woodbury_inv = np.zeros((self.num_inducing, self.num_inducing, self.output_dim))
|
||||
woodbury_vector = np.zeros((self.num_inducing, self.output_dim))
|
||||
else:
|
||||
woodbury_inv = self.posterior._woodbury_inv
|
||||
woodbury_vector = self.posterior._woodbury_vector
|
||||
|
||||
if not self.stochastics:
|
||||
m_f = lambda i: "Inference with missing_data: {: >7.2%}".format(float(i+1)/self.output_dim)
|
||||
message = m_f(-1)
|
||||
print message,
|
||||
|
||||
for d in self.stochastics.d:
|
||||
ninan = self.ninan[:, d]
|
||||
|
||||
if not self.stochastics:
|
||||
print ' '*(len(message)) + '\r',
|
||||
message = m_f(d)
|
||||
print message,
|
||||
|
||||
posterior, log_marginal_likelihood, \
|
||||
grad_dict, current_values, value_indices = self._inner_parameters_changed(
|
||||
self.kern, self.X[ninan],
|
||||
self.Z, self.likelihood,
|
||||
self.Ylist[d], self.Y_metadata,
|
||||
Lm, dL_dKmm,
|
||||
subset_indices=dict(outputs=d, samples=ninan))
|
||||
|
||||
self._inner_take_over_or_update(self.full_values, current_values, value_indices)
|
||||
self._inner_values_update(current_values)
|
||||
|
||||
Lm = posterior.K_chol
|
||||
dL_dKmm = grad_dict['dL_dKmm']
|
||||
woodbury_inv[:, :, d] = posterior.woodbury_inv
|
||||
woodbury_vector[:, d:d+1] = posterior.woodbury_vector
|
||||
self._log_marginal_likelihood += log_marginal_likelihood
|
||||
if not self.stochastics:
|
||||
print ''
|
||||
|
||||
if self.posterior is None:
|
||||
self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
|
||||
K=posterior._K, mean=None, cov=None, K_chol=posterior.K_chol)
|
||||
self._outer_values_update(self.full_values)
|
||||
|
||||
def _outer_loop_without_missing_data(self):
|
||||
self._log_marginal_likelihood = 0
|
||||
|
||||
if self.posterior is None:
|
||||
woodbury_inv = np.zeros((self.num_inducing, self.num_inducing, self.output_dim))
|
||||
woodbury_vector = np.zeros((self.num_inducing, self.output_dim))
|
||||
else:
|
||||
woodbury_inv = self.posterior._woodbury_inv
|
||||
woodbury_vector = self.posterior._woodbury_vector
|
||||
|
||||
d = self.stochastics.d
|
||||
posterior, log_marginal_likelihood, \
|
||||
grad_dict, self.full_values, _ = self._inner_parameters_changed(
|
||||
self.kern, self.X,
|
||||
self.Z, self.likelihood,
|
||||
self.Y_normalized[:, d], self.Y_metadata)
|
||||
self.grad_dict = grad_dict
|
||||
|
||||
self._log_marginal_likelihood += log_marginal_likelihood
|
||||
|
||||
self._outer_values_update(self.full_values)
|
||||
|
||||
woodbury_inv[:, :, d] = posterior.woodbury_inv[:, :, None]
|
||||
woodbury_vector[:, d] = posterior.woodbury_vector
|
||||
if self.posterior is None:
|
||||
self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
|
||||
K=posterior._K, mean=None, cov=None, K_chol=posterior.K_chol)
|
||||
|
||||
def parameters_changed(self):
|
||||
if self.missing_data:
|
||||
self._outer_loop_for_missing_data()
|
||||
elif self.stochastics:
|
||||
self._outer_loop_without_missing_data()
|
||||
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata)
|
||||
|
||||
self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
|
||||
|
||||
if isinstance(self.X, VariationalPosterior):
|
||||
#gradients wrt kernel
|
||||
dL_dKmm = self.grad_dict['dL_dKmm']
|
||||
self.kern.update_gradients_full(dL_dKmm, self.Z, None)
|
||||
kerngrad = self.kern.gradient.copy()
|
||||
self.kern.update_gradients_expectations(variational_posterior=self.X,
|
||||
Z=self.Z,
|
||||
dL_dpsi0=self.grad_dict['dL_dpsi0'],
|
||||
dL_dpsi1=self.grad_dict['dL_dpsi1'],
|
||||
dL_dpsi2=self.grad_dict['dL_dpsi2'])
|
||||
self.kern.gradient += kerngrad
|
||||
|
||||
#gradients wrt Z
|
||||
self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z)
|
||||
self.Z.gradient += self.kern.gradients_Z_expectations(
|
||||
self.grad_dict['dL_dpsi0'],
|
||||
self.grad_dict['dL_dpsi1'],
|
||||
self.grad_dict['dL_dpsi2'],
|
||||
Z=self.Z,
|
||||
variational_posterior=self.X)
|
||||
else:
|
||||
self.posterior, self._log_marginal_likelihood, self.grad_dict, self.full_values, _ = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata)
|
||||
self._outer_values_update(self.full_values)
|
||||
#gradients wrt kernel
|
||||
self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X)
|
||||
kerngrad = self.kern.gradient.copy()
|
||||
self.kern.update_gradients_full(self.grad_dict['dL_dKnm'], self.X, self.Z)
|
||||
kerngrad += self.kern.gradient
|
||||
self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z, None)
|
||||
self.kern.gradient += kerngrad
|
||||
#gradients wrt Z
|
||||
self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
|
||||
self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X)
|
||||
|
||||
|
||||
def _raw_predict(self, Xnew, full_cov=False, kern=None):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -34,8 +34,7 @@ class SparseGP_MPI(SparseGP):
|
|||
|
||||
"""
|
||||
|
||||
def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp mpi', Y_metadata=None, mpi_comm=None, normalizer=False,
|
||||
missing_data=False, stochastic=False, batchsize=1):
|
||||
def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp mpi', Y_metadata=None, mpi_comm=None, normalizer=False):
|
||||
self._IN_OPTIMIZATION_ = False
|
||||
if mpi_comm != None:
|
||||
if inference_method is None:
|
||||
|
|
@ -43,8 +42,7 @@ class SparseGP_MPI(SparseGP):
|
|||
else:
|
||||
assert isinstance(inference_method, VarDTC_minibatch), 'inference_method has to support MPI!'
|
||||
|
||||
super(SparseGP_MPI, self).__init__(X, Y, Z, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer,
|
||||
missing_data=missing_data, stochastic=stochastic, batchsize=batchsize)
|
||||
super(SparseGP_MPI, self).__init__(X, Y, Z, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
|
||||
self.update_model(False)
|
||||
self.link_parameter(self.X, index=0)
|
||||
if variational_prior is not None:
|
||||
|
|
|
|||
|
|
@@ -468,7 +468,7 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):

    k = GPy.kern.RBF(1)
    # create simple GP Model - no input uncertainty on this one
-   m = GPy.models.SparseGPRegression(X, Y, kernel=GPy.kern.RBF(1), Z=Z)
+   m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z)

    if optimize:
        m.optimize('scg', messages=1, max_iters=max_iters)
|
||||
|
|
|
|||
|
|
@@ -14,6 +14,9 @@ import numpy as np
from ...util.linalg import mdot, jitchol, dpotrs, dtrtrs, dpotri, symmetrify, pdinv
from posterior import Posterior
import warnings
def warning_on_one_line(message, category, filename, lineno, file=None, line=None):
    return ' %s:%s: %s:%s\n' % (filename, lineno, category.__name__, message)
warnings.formatwarning = warning_on_one_line
from scipy import optimize
from . import LatentFunctionInference
|
||||
|
||||
|
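The warning_on_one_line hook added above condenses Python's usual multi-line warning output into a single line. A quick illustration of its effect:

import warnings

def warning_on_one_line(message, category, filename, lineno, file=None, line=None):
    return ' %s:%s: %s:%s\n' % (filename, lineno, category.__name__, message)

warnings.formatwarning = warning_on_one_line
warnings.warn("Not perfect mode found (f_hat). difference: 0.01")
# emits a single line, roughly:
#  example.py:8: UserWarning:Not perfect mode found (f_hat). difference: 0.01

Note that assigning warnings.formatwarning at import time changes warning formatting process-wide, not just for this module.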
|
@@ -29,8 +32,11 @@ class Laplace(LatentFunctionInference):
        """
        self._mode_finding_tolerance = 1e-7
-       self._mode_finding_max_iter = 40
-       self.bad_fhat = True
+       self._mode_finding_max_iter = 60
+       self.bad_fhat = False
+       #Store whether it is the first run of the inference so that we can choose whether we need
+       #to calculate things or reuse old variables
+       self.first_run = True
+       self._previous_Ki_fhat = None

    def inference(self, kern, X, likelihood, Y, Y_metadata=None):
|
||||
|
|

@@ -42,8 +48,9 @@ class Laplace(LatentFunctionInference):
        K = kern.K(X)

        #Find mode
-       if self.bad_fhat:
+       if self.bad_fhat or self.first_run:
            Ki_f_init = np.zeros_like(Y)
+           first_run = False
        else:
            Ki_f_init = self._previous_Ki_fhat
|
||||
|
||||
|
|

@@ -123,11 +130,11 @@ class Laplace(LatentFunctionInference):
        #Warn of bad fits
        if difference > self._mode_finding_tolerance:
            if not self.bad_fhat:
-               warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
+               warnings.warn("Not perfect mode found (f_hat). difference: {}, iteration: {} out of max {}".format(difference, iteration, self._mode_finding_max_iter))
            self.bad_fhat = True
        elif self.bad_fhat:
            self.bad_fhat = False
-           warnings.warn("f_hat now fine again")
+           warnings.warn("f_hat now fine again. difference: {}, iteration: {} out of max {}".format(difference, iteration, self._mode_finding_max_iter))

        return f, Ki_f
|
||||
|
||||
|
|
|
|||
|
|
@ -75,8 +75,7 @@ class BayesianGPLVM(SparseGP_MPI):
|
|||
name=name, inference_method=inference_method,
|
||||
normalizer=normalizer, mpi_comm=mpi_comm,
|
||||
variational_prior=self.variational_prior,
|
||||
missing_data=missing_data, stochastic=stochastic,
|
||||
batchsize=batchsize)
|
||||
)
|
||||
|
||||
def set_X_gradients(self, X, X_grad):
|
||||
"""Set the gradients of the posterior distribution of X in its specific form."""
|
||||
|
|
@ -86,55 +85,22 @@ class BayesianGPLVM(SparseGP_MPI):
|
|||
"""Get the gradients of the posterior distribution of X in its specific form."""
|
||||
return X.mean.gradient, X.variance.gradient
|
||||
|
||||
def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None):
|
||||
posterior, log_marginal_likelihood, grad_dict, current_values, value_indices = super(BayesianGPLVM, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, subset_indices=subset_indices)
|
||||
|
||||
kl_fctr = 1.
|
||||
if self.missing_data:
|
||||
d = self.output_dim
|
||||
log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X)/d
|
||||
else:
|
||||
log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X)
|
||||
|
||||
current_values['meangrad'], current_values['vargrad'] = self.kern.gradients_qX_expectations(
|
||||
variational_posterior=X,
|
||||
Z=Z, dL_dpsi0=grad_dict['dL_dpsi0'],
|
||||
dL_dpsi1=grad_dict['dL_dpsi1'],
|
||||
dL_dpsi2=grad_dict['dL_dpsi2'])
|
||||
|
||||
# Subsetting Variational Posterior objects, makes the gradients
|
||||
# empty. We need them to be 0 though:
|
||||
X.mean.gradient[:] = 0
|
||||
X.variance.gradient[:] = 0
|
||||
|
||||
self.variational_prior.update_gradients_KL(X)
|
||||
if self.missing_data:
|
||||
current_values['meangrad'] += kl_fctr*X.mean.gradient/d
|
||||
current_values['vargrad'] += kl_fctr*X.variance.gradient/d
|
||||
else:
|
||||
current_values['meangrad'] += kl_fctr*X.mean.gradient
|
||||
current_values['vargrad'] += kl_fctr*X.variance.gradient
|
||||
|
||||
if subset_indices is not None:
|
||||
value_indices['meangrad'] = subset_indices['samples']
|
||||
value_indices['vargrad'] = subset_indices['samples']
|
||||
return posterior, log_marginal_likelihood, grad_dict, current_values, value_indices
|
||||
|
||||
def _outer_values_update(self, full_values):
|
||||
"""
|
||||
Here you put the values, which were collected before in the right places.
|
||||
E.g. set the gradients of parameters, etc.
|
||||
"""
|
||||
super(BayesianGPLVM, self)._outer_values_update(full_values)
|
||||
self.X.mean.gradient = full_values['meangrad']
|
||||
self.X.variance.gradient = full_values['vargrad']
|
||||
|
||||
def _outer_init_full_values(self):
|
||||
return dict(meangrad=np.zeros(self.X.mean.shape),
|
||||
vargrad=np.zeros(self.X.variance.shape))
|
||||
|
||||
def parameters_changed(self):
|
||||
super(BayesianGPLVM,self).parameters_changed()
|
||||
|
||||
kl_fctr = 1.
|
||||
self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)
|
||||
|
||||
self.X.mean.gradient, self.X.variance.gradient = self.kern.gradients_qX_expectations(
|
||||
variational_posterior=self.X,
|
||||
Z=self.Z,
|
||||
dL_dpsi0=self.grad_dict['dL_dpsi0'],
|
||||
dL_dpsi1=self.grad_dict['dL_dpsi1'],
|
||||
dL_dpsi2=self.grad_dict['dL_dpsi2'])
|
||||
|
||||
self.variational_prior.update_gradients_KL(self.X)
|
||||
|
||||
|
||||
if isinstance(self.inference_method, VarDTC_minibatch):
|
||||
return
|
||||
|
||||
|
|
|
|||
GPy/models/bayesian_gplvm_minibatch.py (new file, 267 lines)
|
|
@ -0,0 +1,267 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from .. import kern
|
||||
from ..likelihoods import Gaussian
|
||||
from ..core.parameterization.variational import NormalPosterior, NormalPrior
|
||||
from ..inference.latent_function_inference.var_dtc_parallel import VarDTC_minibatch
|
||||
from ..inference.latent_function_inference.var_dtc_gpu import VarDTC_GPU
|
||||
import logging
|
||||
from GPy.models.sparse_gp_minibatch import SparseGPMiniBatch
|
||||
|
||||
class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
|
||||
"""
|
||||
Bayesian Gaussian Process Latent Variable Model
|
||||
|
||||
:param Y: observed data (np.ndarray) or GPy.likelihood
|
||||
:type Y: np.ndarray| GPy.likelihood instance
|
||||
:param input_dim: latent dimensionality
|
||||
:type input_dim: int
|
||||
:param init: initialisation method for the latent space
|
||||
:type init: 'PCA'|'random'
|
||||
|
||||
"""
|
||||
def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
|
||||
Z=None, kernel=None, inference_method=None, likelihood=None,
|
||||
name='bayesian gplvm', normalizer=None,
|
||||
missing_data=False, stochastic=False, batchsize=1):
|
||||
self.logger = logging.getLogger(self.__class__.__name__)
|
||||
if X is None:
|
||||
from ..util.initialization import initialize_latent
|
||||
self.logger.info("initializing latent space X with method {}".format(init))
|
||||
X, fracs = initialize_latent(init, input_dim, Y)
|
||||
else:
|
||||
fracs = np.ones(input_dim)
|
||||
|
||||
self.init = init
|
||||
|
||||
if X_variance is None:
|
||||
self.logger.info("initializing latent space variance ~ uniform(0,.1)")
|
||||
X_variance = np.random.uniform(0,.1,X.shape)
|
||||
|
||||
if Z is None:
|
||||
self.logger.info("initializing inducing inputs")
|
||||
Z = np.random.permutation(X.copy())[:num_inducing]
|
||||
assert Z.shape[1] == X.shape[1]
|
||||
|
||||
if kernel is None:
|
||||
self.logger.info("initializing kernel RBF")
|
||||
kernel = kern.RBF(input_dim, lengthscale=1./fracs, ARD=True) #+ kern.Bias(input_dim) + kern.White(input_dim)
|
||||
|
||||
if likelihood is None:
|
||||
likelihood = Gaussian()
|
||||
|
||||
self.variational_prior = NormalPrior()
|
||||
X = NormalPosterior(X, X_variance)
|
||||
|
||||
if inference_method is None:
|
||||
from ..inference.latent_function_inference.var_dtc import VarDTC
|
||||
self.logger.debug("creating inference_method var_dtc")
|
||||
inference_method = VarDTC(limit=1 if not missing_data else Y.shape[1])
|
||||
|
||||
if kernel.useGPU and isinstance(inference_method, VarDTC_GPU):
|
||||
kernel.psicomp.GPU_direct = True
|
||||
|
||||
super(BayesianGPLVMMiniBatch,self).__init__(X, Y, Z, kernel, likelihood=likelihood,
|
||||
name=name, inference_method=inference_method,
|
||||
normalizer=normalizer,
|
||||
missing_data=missing_data, stochastic=stochastic,
|
||||
batchsize=batchsize)
|
||||
self.X = X
|
||||
self.link_parameter(self.X, 0)
|
||||
|
||||
def set_X_gradients(self, X, X_grad):
|
||||
"""Set the gradients of the posterior distribution of X in its specific form."""
|
||||
X.mean.gradient, X.variance.gradient = X_grad
|
||||
|
||||
def get_X_gradients(self, X):
|
||||
"""Get the gradients of the posterior distribution of X in its specific form."""
|
||||
return X.mean.gradient, X.variance.gradient
|
||||
|
||||
def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None):
|
||||
posterior, log_marginal_likelihood, grad_dict, current_values, value_indices = super(BayesianGPLVMMiniBatch, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, subset_indices=subset_indices)
|
||||
|
||||
kl_fctr = 1.
|
||||
if self.missing_data:
|
||||
d = self.output_dim
|
||||
log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X)/d
|
||||
else:
|
||||
log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X)
|
||||
|
||||
current_values['meangrad'], current_values['vargrad'] = self.kern.gradients_qX_expectations(
|
||||
variational_posterior=X,
|
||||
Z=Z, dL_dpsi0=grad_dict['dL_dpsi0'],
|
||||
dL_dpsi1=grad_dict['dL_dpsi1'],
|
||||
dL_dpsi2=grad_dict['dL_dpsi2'])
|
||||
|
||||
# Subsetting Variational Posterior objects makes the gradients
# empty. We need them to be 0 though:
|
||||
X.mean.gradient[:] = 0
|
||||
X.variance.gradient[:] = 0
|
||||
|
||||
self.variational_prior.update_gradients_KL(X)
|
||||
if self.missing_data:
|
||||
current_values['meangrad'] += kl_fctr*X.mean.gradient/d
|
||||
current_values['vargrad'] += kl_fctr*X.variance.gradient/d
|
||||
else:
|
||||
current_values['meangrad'] += kl_fctr*X.mean.gradient
|
||||
current_values['vargrad'] += kl_fctr*X.variance.gradient
|
||||
|
||||
if subset_indices is not None:
|
||||
value_indices['meangrad'] = subset_indices['samples']
|
||||
value_indices['vargrad'] = subset_indices['samples']
|
||||
return posterior, log_marginal_likelihood, grad_dict, current_values, value_indices
|
||||
|
||||
def _outer_values_update(self, full_values):
|
||||
"""
|
||||
Here you put the values, which were collected before in the right places.
|
||||
E.g. set the gradients of parameters, etc.
|
||||
"""
|
||||
super(BayesianGPLVMMiniBatch, self)._outer_values_update(full_values)
|
||||
self.X.mean.gradient = full_values['meangrad']
|
||||
self.X.variance.gradient = full_values['vargrad']
|
||||
|
||||
def _outer_init_full_values(self):
|
||||
return dict(meangrad=np.zeros(self.X.mean.shape),
|
||||
vargrad=np.zeros(self.X.variance.shape))
|
||||
|
||||
def parameters_changed(self):
|
||||
super(BayesianGPLVMMiniBatch,self).parameters_changed()
|
||||
if isinstance(self.inference_method, VarDTC_minibatch):
|
||||
return
|
||||
|
||||
#super(BayesianGPLVM, self).parameters_changed()
|
||||
#self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
|
||||
|
||||
#self.X.mean.gradient, self.X.variance.gradient = self.kern.gradients_qX_expectations(variational_posterior=self.X, Z=self.Z, dL_dpsi0=self.grad_dict['dL_dpsi0'], dL_dpsi1=self.grad_dict['dL_dpsi1'], dL_dpsi2=self.grad_dict['dL_dpsi2'])
|
||||
|
||||
# This is testing code -------------------------
|
||||
# i = np.random.randint(self.X.shape[0])
|
||||
# X_ = self.X.mean
|
||||
# which = np.sqrt(((X_ - X_[i:i+1])**2).sum(1)).argsort()>(max(0, self.X.shape[0]-51))
|
||||
# _, _, grad_dict = self.inference_method.inference(self.kern, self.X[which], self.Z, self.likelihood, self.Y[which], self.Y_metadata)
|
||||
# grad = self.kern.gradients_qX_expectations(variational_posterior=self.X[which], Z=self.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'])
|
||||
#
|
||||
# self.X.mean.gradient[:] = 0
|
||||
# self.X.variance.gradient[:] = 0
|
||||
# self.X.mean.gradient[which] = grad[0]
|
||||
# self.X.variance.gradient[which] = grad[1]
|
||||
|
||||
# update for the KL divergence
|
||||
# self.variational_prior.update_gradients_KL(self.X, which)
|
||||
# -----------------------------------------------
|
||||
|
||||
# update for the KL divergence
|
||||
#self.variational_prior.update_gradients_KL(self.X)
|
||||
|
||||
def plot_latent(self, labels=None, which_indices=None,
|
||||
resolution=50, ax=None, marker='o', s=40,
|
||||
fignum=None, plot_inducing=True, legend=True,
|
||||
plot_limits=None,
|
||||
aspect='auto', updates=False, predict_kwargs={}, imshow_kwargs={}):
|
||||
import sys
|
||||
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
|
||||
from ..plotting.matplot_dep import dim_reduction_plots
|
||||
|
||||
return dim_reduction_plots.plot_latent(self, labels, which_indices,
|
||||
resolution, ax, marker, s,
|
||||
fignum, plot_inducing, legend,
|
||||
plot_limits, aspect, updates, predict_kwargs, imshow_kwargs)
|
||||
|
||||
def do_test_latents(self, Y):
|
||||
"""
|
||||
Compute the latent representation for a set of new points Y
|
||||
|
||||
Notes:
|
||||
This will only work with a univariate Gaussian likelihood (for now)
|
||||
"""
|
||||
N_test = Y.shape[0]
|
||||
input_dim = self.Z.shape[1]
|
||||
|
||||
means = np.zeros((N_test, input_dim))
|
||||
covars = np.zeros((N_test, input_dim))
|
||||
|
||||
dpsi0 = -0.5 * self.input_dim / self.likelihood.variance
|
||||
dpsi2 = self.grad_dict['dL_dpsi2'][0][None, :, :] # TODO: this may change if we ignore het. likelihoods
|
||||
V = Y/self.likelihood.variance
|
||||
|
||||
#compute CPsi1V
|
||||
#if self.Cpsi1V is None:
|
||||
# psi1V = np.dot(self.psi1.T, self.likelihood.V)
|
||||
# tmp, _ = linalg.dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0)
|
||||
# tmp, _ = linalg.dpotrs(self.LB, tmp, lower=1)
|
||||
# self.Cpsi1V, _ = linalg.dtrtrs(self._Lm, tmp, lower=1, trans=1)
|
||||
|
||||
dpsi1 = np.dot(self.posterior.woodbury_vector, V.T)
|
||||
|
||||
#start = np.zeros(self.input_dim * 2)
|
||||
|
||||
|
||||
from scipy.optimize import minimize
|
||||
|
||||
for n, dpsi1_n in enumerate(dpsi1.T[:, :, None]):
|
||||
args = (input_dim, self.kern.copy(), self.Z, dpsi0, dpsi1_n.T, dpsi2)
|
||||
res = minimize(latent_cost_and_grad, jac=True, x0=np.hstack((means[n], covars[n])), args=args, method='BFGS')
|
||||
xopt = res.x
|
||||
mu, log_S = xopt.reshape(2, 1, -1)
|
||||
means[n] = mu[0].copy()
|
||||
covars[n] = np.exp(log_S[0]).copy()
|
||||
|
||||
X = NormalPosterior(means, covars)
|
||||
|
||||
return X
|
||||
|
||||
def dmu_dX(self, Xnew):
|
||||
"""
|
||||
Calculate the gradient of the prediction at Xnew w.r.t Xnew.
|
||||
"""
|
||||
dmu_dX = np.zeros_like(Xnew)
|
||||
for i in range(self.Z.shape[0]):
|
||||
dmu_dX += self.kern.gradients_X(self.grad_dict['dL_dpsi1'][i:i + 1, :], Xnew, self.Z[i:i + 1, :])
|
||||
return dmu_dX
|
||||
|
||||
def dmu_dXnew(self, Xnew):
|
||||
"""
|
||||
Individual gradient of prediction at Xnew w.r.t. each sample in Xnew
|
||||
"""
|
||||
gradients_X = np.zeros((Xnew.shape[0], self.num_inducing))
|
||||
ones = np.ones((1, 1))
|
||||
for i in range(self.Z.shape[0]):
|
||||
gradients_X[:, i] = self.kern.gradients_X(ones, Xnew, self.Z[i:i + 1, :]).sum(-1)
|
||||
return np.dot(gradients_X, self.grad_dict['dL_dpsi1'])
|
||||
|
||||
def plot_steepest_gradient_map(self, *args, ** kwargs):
|
||||
"""
|
||||
See GPy.plotting.matplot_dep.dim_reduction_plots.plot_steepest_gradient_map
|
||||
"""
|
||||
import sys
|
||||
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
|
||||
from ..plotting.matplot_dep import dim_reduction_plots
|
||||
|
||||
return dim_reduction_plots.plot_steepest_gradient_map(self,*args,**kwargs)
|
||||
|
||||
|
||||
def latent_cost_and_grad(mu_S, input_dim, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
|
||||
"""
|
||||
objective function for fitting the latent variables for test points
|
||||
(negative log-likelihood: should be minimised!)
|
||||
"""
|
||||
mu = mu_S[:input_dim][None]
|
||||
log_S = mu_S[input_dim:][None]
|
||||
S = np.exp(log_S)
|
||||
|
||||
X = NormalPosterior(mu, S)
|
||||
|
||||
psi0 = kern.psi0(Z, X)
|
||||
psi1 = kern.psi1(Z, X)
|
||||
psi2 = kern.psi2(Z, X)
|
||||
|
||||
lik = dL_dpsi0 * psi0.sum() + np.einsum('ij,kj->...', dL_dpsi1, psi1) + np.einsum('ijk,lkj->...', dL_dpsi2, psi2) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S)
|
||||
|
||||
dLdmu, dLdS = kern.gradients_qX_expectations(dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, X)
|
||||
dmu = dLdmu - mu
|
||||
# dS = S0 + S1 + S2 -0.5 + .5/S
|
||||
dlnS = S * (dLdS - 0.5) + .5
|
||||
|
||||
return -lik, -np.hstack((dmu.flatten(), dlnS.flatten()))
|
||||
GPy/models/sparse_gp_minibatch.py (new file, 347 lines)
|
|
@ -0,0 +1,347 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from ..core.parameterization.param import Param
|
||||
from ..core.gp import GP
|
||||
from ..inference.latent_function_inference import var_dtc
|
||||
from .. import likelihoods
|
||||
from ..core.parameterization.variational import VariationalPosterior
|
||||
|
||||
import logging
|
||||
from GPy.inference.latent_function_inference.posterior import Posterior
|
||||
from GPy.inference.optimization.stochastics import SparseGPStochastics,\
|
||||
SparseGPMissing
|
||||
#no stochastics.py file added! from GPy.inference.optimization.stochastics import SparseGPStochastics,\
|
||||
#SparseGPMissing
|
||||
logger = logging.getLogger("sparse gp")
|
||||
|
||||
class SparseGPMiniBatch(GP):
|
||||
"""
|
||||
A general purpose Sparse GP model
|
||||
'''
|
||||
Created on 3 Nov 2014
|
||||
|
||||
@author: maxz
|
||||
'''
|
||||
|
||||
This model allows (approximate) inference using variational DTC or FITC
|
||||
(Gaussian likelihoods) as well as non-conjugate sparse methods based on
|
||||
these.
|
||||
|
||||
:param X: inputs
|
||||
:type X: np.ndarray (num_data x input_dim)
|
||||
:param likelihood: a likelihood instance, containing the observed data
|
||||
:type likelihood: GPy.likelihood.(Gaussian | EP | Laplace)
|
||||
:param kernel: the kernel (covariance function). See link kernels
|
||||
:type kernel: a GPy.kern.kern instance
|
||||
:param X_variance: The uncertainty in the measurements of X (Gaussian variance)
|
||||
:type X_variance: np.ndarray (num_data x input_dim) | None
|
||||
:param Z: inducing inputs
|
||||
:type Z: np.ndarray (num_inducing x input_dim)
|
||||
:param num_inducing: Number of inducing points (optional, default 10. Ignored if Z is not None)
|
||||
:type num_inducing: int
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
|
||||
name='sparse gp', Y_metadata=None, normalizer=False,
|
||||
missing_data=False, stochastic=False, batchsize=1):
|
||||
#pick a sensible inference method
|
||||
if inference_method is None:
|
||||
if isinstance(likelihood, likelihoods.Gaussian):
|
||||
inference_method = var_dtc.VarDTC(limit=1 if not self.missing_data else Y.shape[1])
|
||||
else:
|
||||
#inference_method = ??
|
||||
raise NotImplementedError, "what to do what to do?"
|
||||
print "defaulting to ", inference_method, "for latent function inference"
|
||||
|
||||
self.Z = Param('inducing inputs', Z)
|
||||
self.num_inducing = Z.shape[0]
|
||||
|
||||
GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
|
||||
self.missing_data = missing_data
|
||||
|
||||
if stochastic and missing_data:
|
||||
self.missing_data = True
|
||||
self.ninan = ~np.isnan(Y)
|
||||
self.stochastics = SparseGPStochastics(self, batchsize)
|
||||
elif stochastic and not missing_data:
|
||||
self.missing_data = False
|
||||
self.stochastics = SparseGPStochastics(self, batchsize)
|
||||
elif missing_data:
|
||||
self.missing_data = True
|
||||
self.ninan = ~np.isnan(Y)
|
||||
self.stochastics = SparseGPMissing(self)
|
||||
else:
|
||||
self.stochastics = False
|
||||
|
||||
logger.info("Adding Z as parameter")
|
||||
self.link_parameter(self.Z, index=0)
|
||||
if self.missing_data:
|
||||
self.Ylist = []
|
||||
overall = self.Y_normalized.shape[1]
|
||||
m_f = lambda i: "Precomputing Y for missing data: {: >7.2%}".format(float(i+1)/overall)
|
||||
message = m_f(-1)
|
||||
print message,
|
||||
for d in xrange(overall):
|
||||
self.Ylist.append(self.Y_normalized[self.ninan[:, d], d][:, None])
|
||||
print ' '*(len(message)+1) + '\r',
|
||||
message = m_f(d)
|
||||
print message,
|
||||
print ''
|
||||
|
||||
self.posterior = None
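A hypothetical way to construct the new class directly, with the signature taken from the __init__ above; the data, kernel and settings are made up and this sketch is untested:

import numpy as np
import GPy
from GPy.models.sparse_gp_minibatch import SparseGPMiniBatch

X = np.random.rand(100, 1)
Y = np.sin(X * 6) + np.random.randn(100, 1) * 0.05
Z = X[np.random.permutation(100)[:10]].copy()

# stochastic=True selects SparseGPStochastics; missing_data=True would select SparseGPMissing
m = SparseGPMiniBatch(X, Y, Z, GPy.kern.RBF(1), GPy.likelihoods.Gaussian(),
                      stochastic=True, batchsize=2)
m.optimize(messages=1)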
|
||||
|
||||
def has_uncertain_inputs(self):
|
||||
return isinstance(self.X, VariationalPosterior)
|
||||
|
||||
def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None):
|
||||
"""
|
||||
This is the standard part, which usually belongs in parameters_changed.
|
||||
|
||||
For automatic handling of subsampling (such as missing_data, stochastics etc.), we need to put this into an inner
|
||||
loop, in order to ensure a different handling of gradients etc of different
|
||||
subsets of data.
|
||||
|
||||
The dict in current_values will be passed aroung as current_values for
|
||||
the rest of the algorithm, so this is the place to store current values,
|
||||
such as subsets etc, if necessary.
|
||||
|
||||
If Lm and dL_dKmm can be precomputed (or only need to be computed once)
|
||||
pass them in here, so they will be passed to the inference_method.
|
||||
|
||||
subset_indices is a dictionary of indices. you can put the indices however you
|
||||
like them into this dictionary for inner use of the indices inside the
|
||||
algorithm.
|
||||
"""
|
||||
try:
|
||||
posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=None)
|
||||
except:
|
||||
posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata)
|
||||
current_values = {}
|
||||
likelihood.update_gradients(grad_dict['dL_dthetaL'])
|
||||
current_values['likgrad'] = likelihood.gradient.copy()
|
||||
if subset_indices is None:
|
||||
subset_indices = {}
|
||||
if isinstance(X, VariationalPosterior):
|
||||
#gradients wrt kernel
|
||||
dL_dKmm = grad_dict['dL_dKmm']
|
||||
kern.update_gradients_full(dL_dKmm, Z, None)
|
||||
current_values['kerngrad'] = kern.gradient.copy()
|
||||
kern.update_gradients_expectations(variational_posterior=X,
|
||||
Z=Z,
|
||||
dL_dpsi0=grad_dict['dL_dpsi0'],
|
||||
dL_dpsi1=grad_dict['dL_dpsi1'],
|
||||
dL_dpsi2=grad_dict['dL_dpsi2'])
|
||||
current_values['kerngrad'] += kern.gradient
|
||||
|
||||
#gradients wrt Z
|
||||
current_values['Zgrad'] = kern.gradients_X(dL_dKmm, Z)
|
||||
current_values['Zgrad'] += kern.gradients_Z_expectations(
|
||||
grad_dict['dL_dpsi0'],
|
||||
grad_dict['dL_dpsi1'],
|
||||
grad_dict['dL_dpsi2'],
|
||||
Z=Z,
|
||||
variational_posterior=X)
|
||||
else:
|
||||
#gradients wrt kernel
|
||||
kern.update_gradients_diag(grad_dict['dL_dKdiag'], X)
|
||||
current_values['kerngrad'] = kern.gradient.copy()
|
||||
kern.update_gradients_full(grad_dict['dL_dKnm'], X, Z)
|
||||
current_values['kerngrad'] += kern.gradient
|
||||
kern.update_gradients_full(grad_dict['dL_dKmm'], Z, None)
|
||||
current_values['kerngrad'] += kern.gradient
|
||||
#gradients wrt Z
|
||||
current_values['Zgrad'] = kern.gradients_X(grad_dict['dL_dKmm'], Z)
|
||||
current_values['Zgrad'] += kern.gradients_X(grad_dict['dL_dKnm'].T, Z, X)
|
||||
return posterior, log_marginal_likelihood, grad_dict, current_values, subset_indices
|
||||
|
||||
def _inner_take_over_or_update(self, full_values=None, current_values=None, value_indices=None):
|
||||
"""
|
||||
This is for automatic updates of values in the inner loop of missing
|
||||
data handling. Both arguments are dictionaries and the values in
|
||||
full_values will be updated by the current_gradients.
|
||||
|
||||
If a key from current_values does not exist in full_values, it will be
|
||||
initialized to the value in current_values.
|
||||
|
||||
If there is indices needed for the update, value_indices can be used for
|
||||
that. If value_indices has the same key, as current_values, the update
|
||||
in full_values will be indexed by the indices in value_indices.
|
||||
|
||||
grads:
|
||||
dictionary of standing gradients (you will have to carefully make sure, that
|
||||
the ordering is right!). The values in here will be updated such that
|
||||
full_values[key] += current_values[key] forall key in full_gradients.keys()
|
||||
|
||||
gradients:
|
||||
dictionary of gradients in the current set of parameters.
|
||||
|
||||
value_indices:
|
||||
dictionary holding indices for the update in full_values.
|
||||
if the key exists the update rule is:def df(x):
|
||||
full_values[key][value_indices[key]] += current_values[key]
|
||||
"""
|
||||
for key in current_values.keys():
|
||||
if value_indices is not None and value_indices.has_key(key):
|
||||
index = value_indices[key]
|
||||
else:
|
||||
index = slice(None)
|
||||
if full_values.has_key(key):
|
||||
full_values[key][index] += current_values[key]
|
||||
else:
|
||||
full_values[key] = current_values[key]
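A tiny, hypothetical illustration of the accumulation rule implemented above, using made-up keys:

import numpy as np

full_values = {'meangrad': np.zeros(5)}
current_values = {'meangrad': np.ones(2), 'likgrad': np.array([0.3])}
value_indices = {'meangrad': np.array([1, 3])}   # only these rows belong to this subset

# keyed update: only the indexed rows of the standing gradient are touched
full_values['meangrad'][value_indices['meangrad']] += current_values['meangrad']
# -> array([0., 1., 0., 1., 0.])

# missing key: it is simply initialized from current_values
if 'likgrad' not in full_values:
    full_values['likgrad'] = current_values['likgrad']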
|
||||
|
||||
def _inner_values_update(self, current_values):
|
||||
"""
|
||||
This exists if there is more to do with the current values.
|
||||
It will always be called in the inner loop, so that
|
||||
you can do additional inner updates for the inside of the missing data
|
||||
loop etc. This can also be used for stochastic updates, when only working on
|
||||
one dimension of the output.
|
||||
"""
|
||||
pass
|
||||
|
||||
def _outer_values_update(self, full_values):
|
||||
"""
|
||||
Here you put the values, which were collected before in the right places.
|
||||
E.g. set the gradients of parameters, etc.
|
||||
"""
|
||||
self.likelihood.gradient = full_values['likgrad']
|
||||
self.kern.gradient = full_values['kerngrad']
|
||||
self.Z.gradient = full_values['Zgrad']
|
||||
|
||||
def _outer_init_full_values(self):
|
||||
"""
|
||||
If full_values has indices in values_indices, we might want to initialize
|
||||
the full_values differently, so that subsetting is possible.
|
||||
|
||||
Here you can initialize the full_values for the values needed.
|
||||
|
||||
Keep in mind that if a key does not exist in full_values when updating
values, it will be set automatically (so e.g. for Z there is no need to
initialize Zgrad, as no subsetting is needed). For X in BGPLVM, on the other
hand, we probably need to initialize the gradients for the mean and the
variance in order to have the full gradient for indexing.
|
||||
"""
|
||||
return {}
|
||||
|
||||
def _outer_loop_for_missing_data(self):
|
||||
Lm = None
|
||||
dL_dKmm = None
|
||||
|
||||
self._log_marginal_likelihood = 0
|
||||
self.full_values = self._outer_init_full_values()
|
||||
|
||||
if self.posterior is None:
|
||||
woodbury_inv = np.zeros((self.num_inducing, self.num_inducing, self.output_dim))
|
||||
woodbury_vector = np.zeros((self.num_inducing, self.output_dim))
|
||||
else:
|
||||
woodbury_inv = self.posterior._woodbury_inv
|
||||
woodbury_vector = self.posterior._woodbury_vector
|
||||
|
||||
if not self.stochastics:
|
||||
m_f = lambda i: "Inference with missing_data: {: >7.2%}".format(float(i+1)/self.output_dim)
|
||||
message = m_f(-1)
|
||||
print message,
|
||||
|
||||
for d in self.stochastics.d:
|
||||
ninan = self.ninan[:, d]
|
||||
|
||||
if not self.stochastics:
|
||||
print ' '*(len(message)) + '\r',
|
||||
message = m_f(d)
|
||||
print message,
|
||||
|
||||
posterior, log_marginal_likelihood, \
|
||||
grad_dict, current_values, value_indices = self._inner_parameters_changed(
|
||||
self.kern, self.X[ninan],
|
||||
self.Z, self.likelihood,
|
||||
self.Ylist[d], self.Y_metadata,
|
||||
Lm, dL_dKmm,
|
||||
subset_indices=dict(outputs=d, samples=ninan))
|
||||
|
||||
self._inner_take_over_or_update(self.full_values, current_values, value_indices)
|
||||
self._inner_values_update(current_values)
|
||||
|
||||
Lm = posterior.K_chol
|
||||
dL_dKmm = grad_dict['dL_dKmm']
|
||||
woodbury_inv[:, :, d] = posterior.woodbury_inv
|
||||
woodbury_vector[:, d:d+1] = posterior.woodbury_vector
|
||||
self._log_marginal_likelihood += log_marginal_likelihood
|
||||
if not self.stochastics:
|
||||
print ''
|
||||
|
||||
if self.posterior is None:
|
||||
self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
|
||||
K=posterior._K, mean=None, cov=None, K_chol=posterior.K_chol)
|
||||
self._outer_values_update(self.full_values)
|
||||
|
||||
def _outer_loop_without_missing_data(self):
|
||||
self._log_marginal_likelihood = 0
|
||||
|
||||
if self.posterior is None:
|
||||
woodbury_inv = np.zeros((self.num_inducing, self.num_inducing, self.output_dim))
|
||||
woodbury_vector = np.zeros((self.num_inducing, self.output_dim))
|
||||
else:
|
||||
woodbury_inv = self.posterior._woodbury_inv
|
||||
woodbury_vector = self.posterior._woodbury_vector
|
||||
|
||||
d = self.stochastics.d
|
||||
posterior, log_marginal_likelihood, \
|
||||
grad_dict, self.full_values, _ = self._inner_parameters_changed(
|
||||
self.kern, self.X,
|
||||
self.Z, self.likelihood,
|
||||
self.Y_normalized[:, d], self.Y_metadata)
|
||||
self.grad_dict = grad_dict
|
||||
|
||||
self._log_marginal_likelihood += log_marginal_likelihood
|
||||
|
||||
self._outer_values_update(self.full_values)
|
||||
|
||||
woodbury_inv[:, :, d] = posterior.woodbury_inv[:, :, None]
|
||||
woodbury_vector[:, d] = posterior.woodbury_vector
|
||||
if self.posterior is None:
|
||||
self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
|
||||
K=posterior._K, mean=None, cov=None, K_chol=posterior.K_chol)
|
||||
|
||||
def parameters_changed(self):
|
||||
if self.missing_data:
|
||||
self._outer_loop_for_missing_data()
|
||||
elif self.stochastics:
|
||||
self._outer_loop_without_missing_data()
|
||||
else:
|
||||
self.posterior, self._log_marginal_likelihood, self.grad_dict, self.full_values, _ = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata)
|
||||
self._outer_values_update(self.full_values)
|
||||
|
||||
def _raw_predict(self, Xnew, full_cov=False, kern=None):
|
||||
"""
|
||||
Make a prediction for the latent function values
|
||||
"""
|
||||
|
||||
if kern is None: kern = self.kern
|
||||
|
||||
if not isinstance(Xnew, VariationalPosterior):
|
||||
Kx = kern.K(self.Z, Xnew)
|
||||
mu = np.dot(Kx.T, self.posterior.woodbury_vector)
|
||||
if full_cov:
|
||||
Kxx = kern.K(Xnew)
|
||||
if self.posterior.woodbury_inv.ndim == 2:
|
||||
var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
|
||||
elif self.posterior.woodbury_inv.ndim == 3:
|
||||
var = Kxx[:,:,None] - np.tensordot(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx).T, Kx, [1,0]).swapaxes(1,2)
|
||||
var = var
|
||||
else:
|
||||
Kxx = kern.Kdiag(Xnew)
|
||||
var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
|
||||
else:
|
||||
Kx = kern.psi1(self.Z, Xnew)
|
||||
mu = np.dot(Kx, self.posterior.woodbury_vector)
|
||||
if full_cov:
|
||||
raise NotImplementedError, "TODO"
|
||||
else:
|
||||
Kxx = kern.psi0(self.Z, Xnew)
|
||||
psi2 = kern.psi2(self.Z, Xnew)
|
||||
var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
|
||||
return mu, var
|
||||
|
|
@ -112,7 +112,7 @@ class MiscTests(unittest.TestCase):
|
|||
|
||||
def test_missing_data(self):
|
||||
from GPy import kern
|
||||
from GPy.models import BayesianGPLVM
|
||||
from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch
|
||||
from GPy.examples.dimensionality_reduction import _simulate_matern
|
||||
|
||||
D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 400, 3, 4
|
||||
|
|
@ -124,12 +124,12 @@ class MiscTests(unittest.TestCase):
|
|||
Ymissing[inan] = np.nan
|
||||
|
||||
k = kern.Linear(Q, ARD=True) + kern.White(Q, np.exp(-2)) # + kern.bias(Q)
|
||||
m = BayesianGPLVM(Ymissing, Q, init="random", num_inducing=num_inducing,
|
||||
m = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
|
||||
kernel=k, missing_data=True)
|
||||
assert(m.checkgrad())
|
||||
|
||||
k = kern.RBF(Q, ARD=True) + kern.White(Q, np.exp(-2)) # + kern.bias(Q)
|
||||
m = BayesianGPLVM(Ymissing, Q, init="random", num_inducing=num_inducing,
|
||||
m = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
|
||||
kernel=k, missing_data=True)
|
||||
assert(m.checkgrad())
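A hypothetical extra case, not part of this commit's tests, showing how the stochastic minibatch path of the new class would presumably be exercised in the same way:

m = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
                           kernel=k, missing_data=True, stochastic=True, batchsize=2)
assert(m.checkgrad())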
|
||||
|
||||
|
|
|
|||
|
|
@ -17,10 +17,15 @@ from GPy.kern._src.rbf import RBF
|
|||
from GPy.kern._src.linear import Linear
|
||||
from GPy.kern._src.static import Bias, White
|
||||
from GPy.examples.dimensionality_reduction import mrd_simulation
|
||||
from GPy.examples.regression import toy_rbf_1d_50
|
||||
from GPy.core.parameterization.variational import NormalPosterior
|
||||
from GPy.models.gp_regression import GPRegression
|
||||
|
||||
def toy_model():
|
||||
X = np.linspace(0,1,50)[:, None]
|
||||
Y = np.sin(X)
|
||||
m = GPRegression(X=X, Y=Y)
|
||||
return m
|
||||
|
||||
class ListDictTestCase(unittest.TestCase):
|
||||
def assertListDictEquals(self, d1, d2, msg=None):
|
||||
for k,v in d1.iteritems():
|
||||
|
|
@ -105,7 +110,7 @@ class Test(ListDictTestCase):
|
|||
self.assertSequenceEqual(str(par), str(pcopy))
|
||||
|
||||
def test_model(self):
|
||||
par = toy_rbf_1d_50(optimize=0, plot=0)
|
||||
par = toy_model()
|
||||
pcopy = par.copy()
|
||||
self.assertListEqual(par.param_array.tolist(), pcopy.param_array.tolist())
|
||||
np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
|
||||
|
|
@ -124,7 +129,7 @@ class Test(ListDictTestCase):
|
|||
self.assert_(pcopy.checkgrad())
|
||||
|
||||
def test_modelrecreation(self):
|
||||
par = toy_rbf_1d_50(optimize=0, plot=0)
|
||||
par = toy_model()
|
||||
pcopy = GPRegression(par.X.copy(), par.Y.copy(), kernel=par.kern.copy())
|
||||
np.testing.assert_allclose(par.param_array, pcopy.param_array)
|
||||
np.testing.assert_allclose(par.gradient_full, pcopy.gradient_full)
|
||||
|
|
@ -135,7 +140,7 @@ class Test(ListDictTestCase):
|
|||
self.assert_(np.any(pcopy.gradient!=0.0))
|
||||
pcopy.optimize('bfgs')
|
||||
par.optimize('bfgs')
|
||||
np.testing.assert_allclose(pcopy.param_array, par.param_array, atol=.001)
|
||||
np.testing.assert_allclose(pcopy.param_array, par.param_array, atol=1e-6)
|
||||
with tempfile.TemporaryFile('w+b') as f:
|
||||
par.pickle(f)
|
||||
f.seek(0)
|
||||
|
|
@ -193,7 +198,7 @@ class Test(ListDictTestCase):
|
|||
|
||||
@unittest.skip
|
||||
def test_add_observer(self):
|
||||
par = toy_rbf_1d_50(optimize=0, plot=0)
|
||||
par = toy_model()
|
||||
par.name = "original"
|
||||
par.count = 0
|
||||
par.add_observer(self, self._callback, 1)
|
||||
|
|
@ -211,4 +216,4 @@ class Test(ListDictTestCase):
|
|||
|
||||
if __name__ == "__main__":
|
||||
#import sys;sys.argv = ['', 'Test.test_parameter_index_operations']
|
||||
unittest.main()
|
||||
unittest.main()
|
||||
|