[missing data] general implementation for subsetting data

2026-06-11 15:15:15 +02:00 · 2014-10-08 12:03:51 +01:00 · 2014-10-08 12:03:51 +01:00 · fa7807ee6f
commit fa7807ee6f
parent d93fac8c13
7 changed files with 329 additions and 108 deletions
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@ -9,6 +9,7 @@ from .. import likelihoods
 from parameterization.variational import VariationalPosterior

 import logging
+from GPy.inference.latent_function_inference.posterior import Posterior
 logger = logging.getLogger("sparse gp")

 class SparseGP(GP):
@ -34,12 +35,18 @@ class SparseGP(GP):

    """

-    def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None, name='sparse gp', Y_metadata=None, normalizer=False):
+    def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
+                 name='sparse gp', Y_metadata=None, normalizer=False,
+                 missing_data=False):
+
+        self.missing_data = missing_data
+        if self.missing_data:
+            self.ninan = ~np.isnan(Y)

        #pick a sensible inference method
        if inference_method is None:
            if isinstance(likelihood, likelihoods.Gaussian):
-                inference_method = var_dtc.VarDTC()
+                inference_method = var_dtc.VarDTC(limit=1 if not self.missing_data else Y.shape[1])
            else:
                #inference_method = ??
                raise NotImplementedError, "what to do what to do?"
@ -51,44 +58,200 @@ class SparseGP(GP):
        GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
        logger.info("Adding Z as parameter")
        self.link_parameter(self.Z, index=0)
+        if self.missing_data:
+            self.Ylist = []
+            overall = self.Y_normalized.shape[1]
+            m_f = lambda i: "Precomputing Y for missing data: {: >7.2%}".format(float(i+1)/overall)
+            message = m_f(-1)
+            print message,
+            for d in xrange(overall):
+                self.Ylist.append(self.Y_normalized[self.ninan[:, d], d][:, None])
+                print ' '*(len(message)+1) + '\r',
+                message = m_f(d)
+                print message,
+            print ''
+

    def has_uncertain_inputs(self):
        return isinstance(self.X, VariationalPosterior)

-    def parameters_changed(self):
-        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata)
-        self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
-        if isinstance(self.X, VariationalPosterior):
+    def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None):
+        """
+        This is the standard part, which usually belongs in parameters_changed.
+
+        For automatic handling of subsampling (such as missing_data, stochastics etc.), we need to put this into an inner
+        loop, in order to ensure a different handling of gradients etc of different
+        subsets of data.
+
+        The dict in current_values will be passed aroung as current_values for
+        the rest of the algorithm, so this is the place to store current values,
+        such as subsets etc, if necessary.
+
+        If Lm and dL_dKmm can be precomputed (or only need to be computed once)
+        pass them in here, so they will be passed to the inference_method.
+
+        subset_indices is a dictionary of indices. you can put the indices however you
+        like them into this dictionary for inner use of the indices inside the
+        algorithm.
+        """
+        posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=None)
+        current_values = {}
+        likelihood.update_gradients(grad_dict['dL_dthetaL'])
+        current_values['likgrad'] = likelihood.gradient.copy()
+        if subset_indices is None:
+            subset_indices = {}
+        if isinstance(X, VariationalPosterior):
            #gradients wrt kernel
-            dL_dKmm = self.grad_dict['dL_dKmm']
-            self.kern.update_gradients_full(dL_dKmm, self.Z, None)
-            target = self.kern.gradient.copy()
-            self.kern.update_gradients_expectations(variational_posterior=self.X,
-                                                    Z=self.Z,
-                                                    dL_dpsi0=self.grad_dict['dL_dpsi0'],
-                                                    dL_dpsi1=self.grad_dict['dL_dpsi1'],
-                                                    dL_dpsi2=self.grad_dict['dL_dpsi2'])
-            self.kern.gradient += target
+            dL_dKmm = grad_dict['dL_dKmm']
+            kern.update_gradients_full(dL_dKmm, Z, None)
+            current_values['kerngrad'] = kern.gradient.copy()
+            kern.update_gradients_expectations(variational_posterior=X,
+                                                    Z=Z,
+                                                    dL_dpsi0=grad_dict['dL_dpsi0'],
+                                                    dL_dpsi1=grad_dict['dL_dpsi1'],
+                                                    dL_dpsi2=grad_dict['dL_dpsi2'])
+            current_values['kerngrad'] += kern.gradient

            #gradients wrt Z
-            self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z)
-            self.Z.gradient += self.kern.gradients_Z_expectations(
-                               self.grad_dict['dL_dpsi0'],
-                               self.grad_dict['dL_dpsi1'],
-                               self.grad_dict['dL_dpsi2'],
-                               Z=self.Z,
-                               variational_posterior=self.X)
+            current_values['Zgrad'] = kern.gradients_X(dL_dKmm, Z)
+            current_values['Zgrad'] += kern.gradients_Z_expectations(
+                               grad_dict['dL_dpsi0'],
+                               grad_dict['dL_dpsi1'],
+                               grad_dict['dL_dpsi2'],
+                               Z=Z,
+                               variational_posterior=X)
        else:
            #gradients wrt kernel
-            self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X)
-            target = self.kern.gradient.copy()
-            self.kern.update_gradients_full(self.grad_dict['dL_dKnm'], self.X, self.Z)
-            target += self.kern.gradient
-            self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z, None)
-            self.kern.gradient += target
+            kern.update_gradients_diag(grad_dict['dL_dKdiag'], X)
+            current_values['kerngrad'] = kern.gradient.copy()
+            kern.update_gradients_full(grad_dict['dL_dKnm'], X, Z)
+            current_values['kerngrad'] += kern.gradient
+            kern.update_gradients_full(grad_dict['dL_dKmm'], Z, None)
+            current_values['kerngrad'] += kern.gradient
            #gradients wrt Z
-            self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
-            self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X)
+            current_values['Zgrad'] = kern.gradients_X(grad_dict['dL_dKmm'], Z)
+            current_values['Zgrad'] += kern.gradients_X(grad_dict['dL_dKnm'].T, Z, X)
+        return posterior, log_marginal_likelihood, grad_dict, current_values, subset_indices
+
+    def _inner_take_over_or_update(self, full_values=None, current_values=None, value_indices=None):
+        """
+        This is for automatic updates of values in the inner loop of missing
+        data handling. Both arguments are dictionaries and the values in
+        full_values will be updated by the current_gradients.
+
+        If a key from current_values does not exist in full_values, it will be
+        initialized to the value in current_values.
+
+        If there is indices needed for the update, value_indices can be used for
+        that. If value_indices has the same key, as current_values, the update
+        in full_values will be indexed by the indices in value_indices.
+
+        grads:
+            dictionary of standing gradients (you will have to carefully make sure, that
+            the ordering is right!). The values in here will be updated such that
+            full_values[key] += current_values[key]  forall key in full_gradients.keys()
+
+        gradients:
+            dictionary of gradients in the current set of parameters.
+
+        value_indices:
+            dictionary holding indices for the update in full_values.
+            if the key exists the update rule is:
+            full_values[key][value_indices[key]] += current_values[key]
+        """
+        for key in current_values.keys():
+            if value_indices is not None and value_indices.has_key(key):
+                index = value_indices[key]
+            else:
+                index = slice(None)
+            if full_values.has_key(key):
+                full_values[key][index] += current_values[key]
+            else:
+                full_values[key] = current_values[key]
+
+    def _inner_values_update(self, current_values):
+        """
+        This exists if there is more to do with the current values.
+        It will be called allways in the inner loop, so that
+        you can do additional inner updates for the inside of the missing data
+        loop etc. This can also be used for stochastic updates, when only working on
+        one dimension of the output.
+        """
+        pass
+
+    def _outer_values_update(self, full_values):
+        """
+        Here you put the values, which were collected before in the right places.
+        E.g. set the gradients of parameters, etc.
+        """
+        self.likelihood.gradient = full_values['likgrad']
+        self.kern.gradient = full_values['kerngrad']
+        self.Z.gradient = full_values['Zgrad']
+
+    def _outer_init_full_values(self):
+        """
+        If full_values has indices in values_indices, we might want to initialize
+        the full_values differently, so that subsetting is possible.
+
+        Here you can initialize the full_values for the values needed.
+
+        Keep in mind, that if a key does not exist in full_values when updating
+        values, it will be set (so e.g. for Z there is no need to initialize Zgrad,
+        as there is no subsetting needed. For X in BGPLVM on the other hand we probably need
+        to initialize the gradients for the mean and the variance in order to
+        have the full gradient for indexing)
+        """
+        return {}
+
+    def _outer_loop_for_missing_data(self):
+        Lm = None
+        dL_dKmm = None
+        Kmm = None
+
+        self._log_marginal_likelihood = 0
+        full_values = self._outer_init_full_values()
+
+        woodbury_inv = np.zeros((self.num_inducing, self.num_inducing, self.output_dim))
+        woodbury_vector = np.zeros((self.num_inducing, self.output_dim))
+        m_f = lambda i: "Inference with missing data: {: >7.2%}".format(float(i+1)/self.output_dim)
+        message = m_f(-1)
+        print message,
+        for d in xrange(self.output_dim):
+            ninan = self.ninan[:, d]
+            print ' '*(len(message)) + '\r',
+            message = m_f(d)
+            print message,
+
+            posterior, log_marginal_likelihood, \
+                grad_dict, current_values, value_indices = self._inner_parameters_changed(
+                                self.kern, self.X[ninan],
+                                self.Z, self.likelihood,
+                                self.Ylist[d], self.Y_metadata,
+                                Lm, dL_dKmm,
+                                subset_indices=dict(outputs=d, samples=ninan))
+
+            self._inner_take_over_or_update(full_values, current_values, value_indices)
+            self._inner_values_update(current_values)
+
+            Lm = posterior.K_chol
+            dL_dKmm = grad_dict['dL_dKmm']
+            Kmm = posterior._K
+            woodbury_inv[:, :, d] = posterior.woodbury_inv
+            woodbury_vector[:, d:d+1] = posterior.woodbury_vector
+            self._log_marginal_likelihood += log_marginal_likelihood
+        print ''
+
+        self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
+                                   K=Kmm, mean=None, cov=None, K_chol=Lm)
+        self._outer_values_update(full_values)
+
+    def parameters_changed(self):
+        if self.missing_data:
+            self._outer_loop_for_missing_data()
+        else:
+            self.posterior, self._log_marginal_likelihood, self.grad_dict, gradients, _ = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata)
+            self.kern.gradient = gradients['kerngrad']
+            self.Z.gradient = gradients['Zgrad']

    def _raw_predict(self, Xnew, full_cov=False, kern=None):
        """
--- a/GPy/core/sparse_gp_mpi.py
+++ b/GPy/core/sparse_gp_mpi.py
@ -34,15 +34,15 @@ class SparseGP_MPI(SparseGP):

    """

-    def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp mpi', Y_metadata=None, mpi_comm=None, normalizer=False):
+    def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp mpi', Y_metadata=None, mpi_comm=None, normalizer=False, missing_data=False):
        self._IN_OPTIMIZATION_ = False
        if mpi_comm != None:
            if inference_method is None:
                inference_method = VarDTC_minibatch(mpi_comm=mpi_comm)
            else:
                assert isinstance(inference_method, VarDTC_minibatch), 'inference_method has to support MPI!'
-                        
-        super(SparseGP_MPI, self).__init__(X, Y, Z, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
+
+        super(SparseGP_MPI, self).__init__(X, Y, Z, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer, missing_data=missing_data)
        self.update_model(False)
        self.link_parameter(self.X, index=0)
        if variational_prior is not None:
@ -75,18 +75,18 @@ class SparseGP_MPI(SparseGP):
        return dc

    #=====================================================
-    # The MPI parallelization 
+    # The MPI parallelization
    #     - can move to model at some point
    #=====================================================
-    
+
    @SparseGP.optimizer_array.setter
    def optimizer_array(self, p):
        if self.mpi_comm != None:
            if self._IN_OPTIMIZATION_ and self.mpi_comm.rank==0:
                self.mpi_comm.Bcast(np.int32(1),root=0)
-            self.mpi_comm.Bcast(p, root=0)        
+            self.mpi_comm.Bcast(p, root=0)
        SparseGP.optimizer_array.fset(self,p)
-        
+
    def optimize(self, optimizer=None, start=None, **kwargs):
        self._IN_OPTIMIZATION_ = True
        if self.mpi_comm==None:
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@ -367,14 +367,14 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1,
    Y = Ylist[0]
    k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)

-    inan = _np.random.binomial(1, .8, size=Y.shape).astype(bool) # 80% missing data
+    inan = _np.random.binomial(1, .2, size=Y.shape).astype(bool) # 80% missing data
    Ymissing = Y.copy()
    Ymissing[inan] = _np.nan

    m = BayesianGPLVM(Ymissing, Q, init="random", num_inducing=num_inducing,
-                      inference_method=VarDTCMissingData(inan=inan), kernel=k)
+                      kernel=k, missing_data=True)

-    m.X.variance[:] = _np.random.uniform(0,.01,m.X.shape)
+    m.X.variance[:] = _np.random.uniform(0,.1,m.X.shape)
    m.likelihood.variance = .01
    m.parameters_changed()
    m.Yreal = Y
--- a/GPy/inference/latent_function_inference/var_dtc.py
+++ b/GPy/inference/latent_function_inference/var_dtc.py
@ -62,7 +62,9 @@ class VarDTC(LatentFunctionInference):
    def get_VVTfactor(self, Y, prec):
        return Y * prec # TODO chache this, and make it effective

-    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
+
+
+    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None):

        _, output_dim = Y.shape
        uncertain_inputs = isinstance(X, VariationalPosterior)
@ -84,8 +86,8 @@ class VarDTC(LatentFunctionInference):

        Kmm = kern.K(Z).copy()
        diag.add(Kmm, self.const_jitter)
-        Lm = jitchol(Kmm)
-
+        if Lm is None:
+            Lm = jitchol(Kmm)

        # The rather complex computations of A, and the psi stats
        if uncertain_inputs:
@ -100,7 +102,6 @@ class VarDTC(LatentFunctionInference):
        else:
            psi0 = kern.Kdiag(X)
            psi1 = kern.K(X, Z)
-            psi2 = None
            if het_noise:
                tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
            else:
@ -122,11 +123,12 @@ class VarDTC(LatentFunctionInference):
        delit = tdot(_LBi_Lmi_psi1Vf)
        data_fit = np.trace(delit)
        DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
-        delit = -0.5 * DBi_plus_BiPBi
-        delit += -0.5 * B * output_dim
-        delit += output_dim * np.eye(num_inducing)
-        # Compute dL_dKmm
-        dL_dKmm = backsub_both_sides(Lm, delit)
+        if dL_dKmm is None:
+            delit = -0.5 * DBi_plus_BiPBi
+            delit += -0.5 * B * output_dim
+            delit += output_dim * np.eye(num_inducing)
+            # Compute dL_dKmm
+            dL_dKmm = backsub_both_sides(Lm, delit)

        # derivatives of L w.r.t. psi
        dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm,
@ -208,6 +210,7 @@ class VarDTCMissingData(LatentFunctionInference):
            inan = np.isnan(Y)
            has_none = inan.any()
            self._inan = ~inan
+            inan = self._inan
        else:
            inan = self._inan
            has_none = True
@ -241,7 +244,7 @@ class VarDTCMissingData(LatentFunctionInference):
            self._subarray_indices = [[slice(None),slice(None)]]
            return [Y], [(Y**2).sum()]

-    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
+    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None):
        if isinstance(X, VariationalPosterior):
            uncertain_inputs = True
            psi0_all = kern.psi0(Z, X)
@ -260,7 +263,7 @@ class VarDTCMissingData(LatentFunctionInference):
        dL_dpsi0_all = np.zeros(Y.shape[0])
        dL_dpsi1_all = np.zeros((Y.shape[0], num_inducing))
        if uncertain_inputs:
-            dL_dpsi2_all = np.zeros((num_inducing, num_inducing))
+            dL_dpsi2_all = np.zeros((Y.shape[0], num_inducing, num_inducing))

        dL_dR = 0
        woodbury_vector = np.zeros((num_inducing, Y.shape[1]))
@ -271,31 +274,31 @@ class VarDTCMissingData(LatentFunctionInference):
        Kmm = kern.K(Z).copy()
        diag.add(Kmm, self.const_jitter)
        #factor Kmm
-        Lm = jitchol(Kmm)
+        if Lm is None:
+            Lm = jitchol(Kmm)
        if uncertain_inputs: LmInv = dtrtri(Lm)

-        size = Y.shape[1]
+        size = len(Ys)
        next_ten = 0
-
        for i, [y, v, trYYT] in enumerate(itertools.izip(Ys, self._inan.T, traces)):
            if ((i+1.)/size) >= next_ten:
                logger.info('inference {:> 6.1%}'.format((i+1.)/size))
                next_ten += .1
            if het_noise: beta = beta_all[i]
            else: beta = beta_all
-
            VVT_factor = (y*beta)
            output_dim = 1#len(ind)

            psi0 = psi0_all[v]
            psi1 = psi1_all[v, :]
-            if uncertain_inputs: psi2 = kern.psi2(Z, X[v, :])
+            if uncertain_inputs:
+                psi2 = kern.psi2(Z, X[v, :])
            else: psi2 = None
            num_data = psi1.shape[0]

            if uncertain_inputs:
                if het_noise: psi2_beta = psi2 * (beta.flatten().reshape(num_data, 1, 1)).sum(0)
-                else: psi2_beta = psi2 * beta
+                else: psi2_beta = psi2 * (beta)
                A = LmInv.dot(psi2_beta.dot(LmInv.T))
            else:
                if het_noise: tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
@ -330,11 +333,11 @@ class VarDTCMissingData(LatentFunctionInference):
            dL_dpsi0_all[v] += dL_dpsi0
            dL_dpsi1_all[v, :] += dL_dpsi1
            if uncertain_inputs:
-                dL_dpsi2_all += dL_dpsi2
+                dL_dpsi2_all[v, :, :] += dL_dpsi2

            # log marginal likelihood
            log_marginal += _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise,
-                psi0, A, LB, trYYT, data_fit,VVT_factor)
+                psi0, A, LB, trYYT, data_fit, VVT_factor)

            #put the gradients in the right places
            dL_dR += _compute_dL_dR(likelihood,
@ -393,7 +396,6 @@ def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, C
            # subsume back into psi1 (==Kmn)
            dL_dpsi1 += 2.*np.dot(psi1, dL_dpsi2)
            dL_dpsi2 = None
-
    return dL_dpsi0, dL_dpsi1, dL_dpsi2


--- a/GPy/kern/_src/psi_comp/rbf_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/rbf_psi_comp.py
@ -17,7 +17,7 @@ def psicomputations(variance, lengthscale, Z, variational_posterior):
    # _psi1                NxM
    mu = variational_posterior.mean
    S = variational_posterior.variance
-    
+
    psi0 = np.empty(mu.shape[0])
    psi0[:] = variance
    psi1 = _psi1computations(variance, lengthscale, Z, mu, S)
@ -41,7 +41,7 @@ def __psi1computations(variance, lengthscale, Z, mu, S):
    _psi1_logdenom = np.log(S/lengthscale2+1.).sum(axis=-1) # N
    _psi1_log = (_psi1_logdenom[:,None]+np.einsum('nmq,nq->nm',np.square(mu[:,None,:]-Z[None,:,:]),1./(S+lengthscale2)))/(-2.)
    _psi1 = variance*np.exp(_psi1_log)
-        
+
    return _psi1

 def __psi2computations(variance, lengthscale, Z, mu, S):
@ -54,27 +54,27 @@ def __psi2computations(variance, lengthscale, Z, mu, S):
    # here are the "statistics" for psi2
    # Produced intermediate results:
    # _psi2                MxM
-    
+
    lengthscale2 = np.square(lengthscale)
-    
+
    _psi2_logdenom = np.log(2.*S/lengthscale2+1.).sum(axis=-1)/(-2.) # N
    _psi2_exp1 = (np.square(Z[:,None,:]-Z[None,:,:])/lengthscale2).sum(axis=-1)/(-4.) #MxM
    Z_hat = (Z[:,None,:]+Z[None,:,:])/2. #MxMxQ
    denom = 1./(2.*S+lengthscale2)
    _psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+2.*np.einsum('nq,moq,nq->nmo',mu,Z_hat,denom)-np.einsum('moq,nq->nmo',np.square(Z_hat),denom)
    _psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2)
-    
+

    return _psi2

 def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
    ARD = (len(lengthscale)!=1)
-    
+
    dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance)
    dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance)

    dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
-    
+
    dL_dlengscale = dl_psi1 + dl_psi2
    if not ARD:
        dL_dlengscale = dL_dlengscale.sum()
@ -82,7 +82,7 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscal
    dL_dmu = dmu_psi1 + dmu_psi2
    dL_dS = dS_psi1 + dS_psi2
    dL_dZ = dZ_psi1 + dZ_psi2
-    
+
    return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS

 def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S):
@ -101,9 +101,9 @@ def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S):
    # _dL_dgamma        NxQ
    # _dL_dmu           NxQ
    # _dL_dS            NxQ
-    
+
    lengthscale2 = np.square(lengthscale)
-    
+
    _psi1 = _psi1computations(variance, lengthscale, Z, mu, S)
    Lpsi1 = dL_dpsi1*_psi1
    Zmu = Z[None,:,:]-mu[:,None,:] # NxMxQ
@ -114,7 +114,7 @@ def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S):
    _dL_dS = np.einsum('nm,nmq,nq->nq',Lpsi1,(Zmu2_denom-1.),denom)/2.
    _dL_dZ = -np.einsum('nm,nmq,nq->mq',Lpsi1,Zmu,denom)
    _dL_dl = np.einsum('nm,nmq,nq->q',Lpsi1,(Zmu2_denom+(S/lengthscale2)[:,None,:]),denom*lengthscale)
-    
+
    return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS

 def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
@ -133,13 +133,12 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
    # _dL_dgamma         NxQ
    # _dL_dmu            NxQ
    # _dL_dS             NxQ
-    
+
    lengthscale2 = np.square(lengthscale)
    denom = 1./(2*S+lengthscale2)
    denom2 = np.square(denom)

    _psi2 = _psi2computations(variance, lengthscale, Z, mu, S) # NxMxM
-    
    Lpsi2 = dL_dpsi2[None,:,:]*_psi2
    Lpsi2sum = np.einsum('nmo->n',Lpsi2) #N
    Lpsi2Z = np.einsum('nmo,oq->nq',Lpsi2,Z) #NxQ
@ -147,7 +146,7 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
    Lpsi2Z2p = np.einsum('nmo,mq,oq->nq',Lpsi2,Z,Z) #NxQ
    Lpsi2Zhat = Lpsi2Z
    Lpsi2Zhat2 = (Lpsi2Z2+Lpsi2Z2p)/2
-    
+
    _dL_dvar = Lpsi2sum.sum()*2/variance
    _dL_dmu = (-2*denom) * (mu*Lpsi2sum[:,None]-Lpsi2Zhat)
    _dL_dS = (2*np.square(denom))*(np.square(mu)*Lpsi2sum[:,None]-2*mu*Lpsi2Zhat+Lpsi2Zhat2) - denom*Lpsi2sum[:,None]
@ -157,6 +156,6 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
                             (2*mu*denom2)*Lpsi2Zhat+denom2*Lpsi2Zhat2).sum(axis=0)

    return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
-    
+
 _psi1computations = Cacher(__psi1computations, limit=1)
 _psi2computations = Cacher(__psi2computations, limit=1)
--- a/GPy/models/bayesian_gplvm.py
+++ b/GPy/models/bayesian_gplvm.py
@ -25,7 +25,9 @@ class BayesianGPLVM(SparseGP_MPI):

    """
    def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
-                 Z=None, kernel=None, inference_method=None, likelihood=None, name='bayesian gplvm', mpi_comm=None, normalizer=None):
+                 Z=None, kernel=None, inference_method=None, likelihood=None,
+                 name='bayesian gplvm', mpi_comm=None, normalizer=None,
+                 missing_data=False):
        self.mpi_comm = mpi_comm
        self.__IN_OPTIMIZATION__ = False

@ -59,24 +61,23 @@ class BayesianGPLVM(SparseGP_MPI):
        X = NormalPosterior(X, X_variance)

        if inference_method is None:
-            inan = np.isnan(Y)
-            if np.any(inan):
-                from ..inference.latent_function_inference.var_dtc import VarDTCMissingData
-                self.logger.debug("creating inference_method with var_dtc missing data")
-                inference_method = VarDTCMissingData(inan=inan)
-            elif mpi_comm is not None:
+            if mpi_comm is not None:
                inference_method = VarDTC_minibatch(mpi_comm=mpi_comm)
            else:
                from ..inference.latent_function_inference.var_dtc import VarDTC
                self.logger.debug("creating inference_method var_dtc")
-                inference_method = VarDTC()
+                inference_method = VarDTC(limit=1 if not missing_data else Y.shape[1])
        if isinstance(inference_method,VarDTC_minibatch):
            inference_method.mpi_comm = mpi_comm

        if kernel.useGPU and isinstance(inference_method, VarDTC_GPU):
            kernel.psicomp.GPU_direct = True

-        super(BayesianGPLVM,self).__init__(X, Y, Z, kernel, likelihood=likelihood, name=name, inference_method=inference_method, normalizer=normalizer, mpi_comm=mpi_comm, variational_prior=self.variational_prior)
+        super(BayesianGPLVM,self).__init__(X, Y, Z, kernel, likelihood=likelihood,
+                                           name=name, inference_method=inference_method,
+                                           normalizer=normalizer, mpi_comm=mpi_comm,
+                                           variational_prior=self.variational_prior,
+                                           missing_data=missing_data)

    def set_X_gradients(self, X, X_grad):
        """Set the gradients of the posterior distribution of X in its specific form."""
@ -86,15 +87,48 @@ class BayesianGPLVM(SparseGP_MPI):
        """Get the gradients of the posterior distribution of X in its specific form."""
        return X.mean.gradient, X.variance.gradient

+    def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None):
+        posterior, log_marginal_likelihood, grad_dict, current_values, value_indices = super(BayesianGPLVM, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, subset_indices=subset_indices)
+
+        log_marginal_likelihood -= self.variational_prior.KL_divergence(X)
+
+        current_values['meangrad'], current_values['vargrad'] = self.kern.gradients_qX_expectations(
+                                            variational_posterior=X,
+                                            Z=Z, dL_dpsi0=grad_dict['dL_dpsi0'],
+                                            dL_dpsi1=grad_dict['dL_dpsi1'],
+                                            dL_dpsi2=grad_dict['dL_dpsi2'])
+        X.mean.gradient[:] = 0
+        X.variance.gradient[:] = 0
+        self.variational_prior.update_gradients_KL(X)
+        current_values['meangrad'] += X.mean.gradient
+        current_values['vargrad'] += X.variance.gradient
+
+        value_indices['meangrad'] = subset_indices['samples']
+        value_indices['vargrad'] = subset_indices['samples']
+        return posterior, log_marginal_likelihood, grad_dict, current_values, value_indices
+
+    def _outer_values_update(self, full_values):
+        """
+        Here you put the values, which were collected before in the right places.
+        E.g. set the gradients of parameters, etc.
+        """
+        super(BayesianGPLVM, self)._outer_values_update(full_values)
+        self.X.mean.gradient = full_values['meangrad']
+        self.X.variance.gradient = full_values['vargrad']
+
+    def _outer_init_full_values(self):
+        return dict(meangrad=np.zeros(self.X.mean.shape),
+                    vargrad=np.zeros(self.X.variance.shape))
+
    def parameters_changed(self):
        super(BayesianGPLVM,self).parameters_changed()
        if isinstance(self.inference_method, VarDTC_minibatch):
            return

-        super(BayesianGPLVM, self).parameters_changed()
-        self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
+        #super(BayesianGPLVM, self).parameters_changed()
+        #self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)

-        self.X.mean.gradient, self.X.variance.gradient = self.kern.gradients_qX_expectations(variational_posterior=self.X, Z=self.Z, dL_dpsi0=self.grad_dict['dL_dpsi0'], dL_dpsi1=self.grad_dict['dL_dpsi1'], dL_dpsi2=self.grad_dict['dL_dpsi2'])
+        #self.X.mean.gradient, self.X.variance.gradient = self.kern.gradients_qX_expectations(variational_posterior=self.X, Z=self.Z, dL_dpsi0=self.grad_dict['dL_dpsi0'], dL_dpsi1=self.grad_dict['dL_dpsi1'], dL_dpsi2=self.grad_dict['dL_dpsi2'])

        # This is testing code -------------------------
 #         i = np.random.randint(self.X.shape[0])
@ -113,7 +147,7 @@ class BayesianGPLVM(SparseGP_MPI):
        # -----------------------------------------------

        # update for the KL divergence
-        self.variational_prior.update_gradients_KL(self.X)
+        #self.variational_prior.update_gradients_KL(self.X)

    def plot_latent(self, labels=None, which_indices=None,
                resolution=50, ax=None, marker='o', s=40,
@ -178,7 +212,7 @@ class BayesianGPLVM(SparseGP_MPI):
        """
        dmu_dX = np.zeros_like(Xnew)
        for i in range(self.Z.shape[0]):
-            dmu_dX += self.kern.gradients_X(self.Cpsi1Vf[i:i + 1, :], Xnew, self.Z[i:i + 1, :])
+            dmu_dX += self.kern.gradients_X(self.grad_dict['dL_dpsi1'][i:i + 1, :], Xnew, self.Z[i:i + 1, :])
        return dmu_dX

    def dmu_dXnew(self, Xnew):
@ -189,7 +223,7 @@ class BayesianGPLVM(SparseGP_MPI):
        ones = np.ones((1, 1))
        for i in range(self.Z.shape[0]):
            gradients_X[:, i] = self.kern.gradients_X(ones, Xnew, self.Z[i:i + 1, :]).sum(-1)
-        return np.dot(gradients_X, self.Cpsi1Vf)
+        return np.dot(gradients_X, self.grad_dict['dL_dpsi1'])

    def plot_steepest_gradient_map(self, *args, ** kwargs):
        """
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@ -20,7 +20,7 @@ class MiscTests(unittest.TestCase):
        m = GPy.models.GPRegression(self.X, self.Y, kernel=k)
        m.randomize()
        m.likelihood.variance = .5
-        Kinv = np.linalg.pinv(k.K(self.X) + np.eye(self.N)*m.likelihood.variance)
+        Kinv = np.linalg.pinv(k.K(self.X) + np.eye(self.N) * m.likelihood.variance)
        K_hat = k.K(self.X_new) - k.K(self.X_new, self.X).dot(Kinv).dot(k.K(self.X, self.X_new))
        mu_hat = k.K(self.X_new, self.X).dot(Kinv).dot(m.Y_normalized)

@ -43,7 +43,7 @@ class MiscTests(unittest.TestCase):
        Z = m.Z[:]
        X = self.X[:]

-        #Not easy to check if woodbury_inv is correct in itself as it requires a large derivation and expression
+        # Not easy to check if woodbury_inv is correct in itself as it requires a large derivation and expression
        Kinv = m.posterior.woodbury_inv
        K_hat = k.K(self.X_new) - k.K(self.X_new, Z).dot(Kinv).dot(k.K(Z, self.X_new))

@ -51,13 +51,13 @@ class MiscTests(unittest.TestCase):
        self.assertEquals(mu.shape, (self.N_new, self.D))
        self.assertEquals(covar.shape, (self.N_new, self.N_new))
        np.testing.assert_almost_equal(K_hat, covar)
-        #np.testing.assert_almost_equal(mu_hat, mu)
+        # np.testing.assert_almost_equal(mu_hat, mu)

        mu, var = m._raw_predict(self.X_new)
        self.assertEquals(mu.shape, (self.N_new, self.D))
        self.assertEquals(var.shape, (self.N_new, 1))
        np.testing.assert_almost_equal(np.diag(K_hat)[:, None], var)
-        #np.testing.assert_almost_equal(mu_hat, mu)
+        # np.testing.assert_almost_equal(mu_hat, mu)

    def test_likelihood_replicate(self):
        m = GPy.models.GPRegression(self.X, self.Y)
@ -110,6 +110,29 @@ class MiscTests(unittest.TestCase):
        m2.kern.lengthscale = m.kern['.*lengthscale']
        np.testing.assert_equal(m.log_likelihood(), m2.log_likelihood())

+    def test_missing_data(self):
+        from GPy import kern
+        from GPy.models import BayesianGPLVM
+        from GPy.examples.dimensionality_reduction import _simulate_matern
+
+        D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 400, 3, 4
+        _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, False)
+        Y = Ylist[0]
+
+        inan = np.random.binomial(1, .9, size=Y.shape).astype(bool) # 80% missing data
+        Ymissing = Y.copy()
+        Ymissing[inan] = np.nan
+
+        k = kern.Linear(Q, ARD=True) + kern.White(Q, np.exp(-2)) # + kern.bias(Q)
+        m = BayesianGPLVM(Ymissing, Q, init="random", num_inducing=num_inducing,
+                          kernel=k, missing_data=True)
+        assert(m.checkgrad())
+
+        k = kern.RBF(Q, ARD=True) + kern.White(Q, np.exp(-2)) # + kern.bias(Q)
+        m = BayesianGPLVM(Ymissing, Q, init="random", num_inducing=num_inducing,
+                          kernel=k, missing_data=True)
+        assert(m.checkgrad())
+
    def test_likelihood_replicate_kern(self):
        m = GPy.models.GPRegression(self.X, self.Y)
        m2 = GPy.models.GPRegression(self.X, self.Y)
@ -158,7 +181,7 @@ class MiscTests(unittest.TestCase):
    def test_model_optimize(self):
        X = np.random.uniform(-3., 3., (20, 1))
        Y = np.sin(X) + np.random.randn(20, 1) * 0.05
-        m = GPy.models.GPRegression(X,Y)
+        m = GPy.models.GPRegression(X, Y)
        m.optimize()
        print m

@ -219,8 +242,8 @@ class GradientTests(np.testing.TestCase):
        mlp = GPy.kern.MLP(1)
        self.check_model(mlp, model_type='GPRegression', dimension=1)

-    #TODO:
-    #def test_GPRegression_poly_1d(self):
+    # TODO:
+    # def test_GPRegression_poly_1d(self):
    #    ''' Testing the GP regression with polynomial kernel with white kernel on 1d data '''
    #    mlp = GPy.kern.Poly(1, degree=5)
    #    self.check_model(mlp, model_type='GPRegression', dimension=1)
@ -417,11 +440,11 @@ class GradientTests(np.testing.TestCase):

    def test_gp_heteroscedastic_regression(self):
        num_obs = 25
-        X = np.random.randint(0,140,num_obs)
-        X = X[:,None]
-        Y = 25. + np.sin(X/20.) * 2. + np.random.rand(num_obs)[:,None]
+        X = np.random.randint(0, 140, num_obs)
+        X = X[:, None]
+        Y = 25. + np.sin(X / 20.) * 2. + np.random.rand(num_obs)[:, None]
        kern = GPy.kern.Bias(1) + GPy.kern.RBF(1)
-        m = GPy.models.GPHeteroscedasticRegression(X,Y,kern)
+        m = GPy.models.GPHeteroscedasticRegression(X, Y, kern)
        self.assertTrue(m.checkgrad())

    def test_gp_kronecker_gaussian(self):
@ -432,12 +455,12 @@ class GradientTests(np.testing.TestCase):
        k1 = GPy.kern.RBF(1)  # + GPy.kern.White(1)
        k2 = GPy.kern.RBF(1)  # + GPy.kern.White(1)
        Y = np.random.randn(N1, N2)
-        Y = Y-Y.mean(0)
-        Y = Y/Y.std(0)
+        Y = Y - Y.mean(0)
+        Y = Y / Y.std(0)
        m = GPy.models.GPKroneckerGaussianRegression(X1, X2, Y, k1, k2)

        # build the model the dumb way
-        assert (N1*N2<1000), "too much data for standard GPs!"
+        assert (N1 * N2 < 1000), "too much data for standard GPs!"
        yy, xx = np.meshgrid(X2, X1)
        Xgrid = np.vstack((xx.flatten(order='F'), yy.flatten(order='F'))).T
        kg = GPy.kern.RBF(1, active_dims=[0]) * GPy.kern.RBF(1, active_dims=[1])
@ -458,11 +481,11 @@ class GradientTests(np.testing.TestCase):

    def test_gp_VGPC(self):
        num_obs = 25
-        X = np.random.randint(0,140,num_obs)
-        X = X[:,None]
-        Y = 25. + np.sin(X/20.) * 2. + np.random.rand(num_obs)[:,None]
+        X = np.random.randint(0, 140, num_obs)
+        X = X[:, None]
+        Y = 25. + np.sin(X / 20.) * 2. + np.random.rand(num_obs)[:, None]
        kern = GPy.kern.Bias(1) + GPy.kern.RBF(1)
-        m = GPy.models.GPVariationalGaussianApproximation(X,Y,kern)
+        m = GPy.models.GPVariationalGaussianApproximation(X, Y, kern)
        self.assertTrue(m.checkgrad())