From 12c335c62e65593b9741028b81d053400f674195 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 25 Aug 2015 17:30:09 +0300 Subject: [PATCH 01/13] Fixed array2string bug for N > 1000 default printing --- GPy/inference/optimization/stochastics.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/GPy/inference/optimization/stochastics.py b/GPy/inference/optimization/stochastics.py index 645c5f64..f7ad1245 100644 --- a/GPy/inference/optimization/stochastics.py +++ b/GPy/inference/optimization/stochastics.py @@ -5,7 +5,7 @@ class StochasticStorage(object): ''' This is a container for holding the stochastic parameters, such as subset indices or step length and so on. - + self.d has to be a list of lists: [dimension indices, nan indices for those dimensions] so that the minibatches can be used as efficiently as possible.10 @@ -38,16 +38,17 @@ class SparseGPMissing(StochasticStorage): import numpy as np self.Y = model.Y_normalized bdict = {} + #For N > 1000 array2string default crops + opt = np.get_printoptions() + np.set_printoptions(threshold='nan') for d in range(self.Y.shape[1]): - inan = np.isnan(self.Y[:, d]) - arr_str = np.array2string(inan, - np.inf, 0, - True, '', - formatter={'bool':lambda x: '1' if x else '0'}) + inan = np.isnan(self.Y)[:, d] + arr_str = np.array2string(~inan, np.inf, 0, True, '', formatter={'bool':lambda x: '1' if x else '0'}) try: bdict[arr_str][0].append(d) except: bdict[arr_str] = [[d], ~inan] + np.set_printoptions(**opt) self.d = bdict.values() class SparseGPStochastics(StochasticStorage): @@ -70,16 +71,19 @@ class SparseGPStochastics(StochasticStorage): import numpy as np self.d = np.random.choice(self.output_dim, size=self.batchsize, replace=False) bdict = {} + opt = np.get_printoptions() + np.set_printoptions(threshold='nan') for d in self.d: inan = np.isnan(self.Y[:, d]) - arr_str = int(np.array2string(inan, - np.inf, 0, - True, '', + arr_str = int(np.array2string(inan, + np.inf, 0, + True, '', formatter={'bool':lambda x: '1' if x else '0'}), 2) try: bdict[arr_str][0].append(d) except: bdict[arr_str] = [[d], ~inan] + np.set_printoptions(**opt) self.d = bdict.values() def reset(self): From 4143f005403808d11928b9772a9a40679a4d441e Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Thu, 27 Aug 2015 17:56:08 +0300 Subject: [PATCH 02/13] Optimizing missing data model, needs tidying but now much faster --- .../latent_function_inference/var_dtc.py | 27 +++-- GPy/kern/_src/kern.py | 27 +++-- GPy/kern/_src/kernel_slice_operations.py | 18 ++- GPy/kern/_src/psi_comp/__init__.py | 21 ++-- GPy/kern/_src/psi_comp/linear_psi_comp.py | 17 +-- GPy/kern/_src/psi_comp/rbf_psi_comp.py | 29 +++-- GPy/kern/_src/psi_comp/ssrbf_psi_comp.py | 108 ++++++++--------- GPy/kern/_src/rbf.py | 17 +-- GPy/models/bayesian_gplvm_minibatch.py | 95 ++++++++------- GPy/models/sparse_gp_minibatch.py | 111 +++++++++++------- 10 files changed, 275 insertions(+), 195 deletions(-) diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py index 97d8dfe3..e50daa24 100644 --- a/GPy/inference/latent_function_inference/var_dtc.py +++ b/GPy/inference/latent_function_inference/var_dtc.py @@ -64,9 +64,7 @@ class VarDTC(LatentFunctionInference): def get_VVTfactor(self, Y, prec): return Y * prec # TODO chache this, and make it effective - - - def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None): + def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None, 
psi0=None, psi1=None, psi2=None): _, output_dim = Y.shape uncertain_inputs = isinstance(X, VariationalPosterior) @@ -95,17 +93,28 @@ class VarDTC(LatentFunctionInference): # The rather complex computations of A, and the psi stats if uncertain_inputs: - psi0 = kern.psi0(Z, X) - psi1 = kern.psi1(Z, X) + if psi0 is None: + psi0 = kern.psi0(Z, X) + if psi1 is None: + psi1 = kern.psi1(Z, X) if het_noise: - psi2_beta = np.sum([kern.psi2(Z,X[i:i+1,:]) * beta_i for i,beta_i in enumerate(beta)],0) + if psi2 is None: + assert len(psi2.shape) == 3 # Need to have not summed out N + #FIXME: Need testing + psi2_beta = np.sum([psi2[X[i:i+1,:], :, :] * beta_i for i,beta_i in enumerate(beta)],0) + else: + psi2_beta = np.sum([kern.psi2(Z,X[i:i+1,:]) * beta_i for i,beta_i in enumerate(beta)],0) else: - psi2_beta = kern.psi2(Z,X) * beta + if psi2 is None: + psi2 = kern.psi2(Z,X) + psi2_beta = psi2 * beta LmInv = dtrtri(Lm) A = LmInv.dot(psi2_beta.dot(LmInv.T)) else: - psi0 = kern.Kdiag(X) - psi1 = kern.K(X, Z) + if psi0 is None: + psi0 = kern.Kdiag(X) + if psi1 is None: + psi1 = kern.K(X, Z) if het_noise: tmp = psi1 * (np.sqrt(beta)) else: diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index 5af3fb1c..fb12030b 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -59,7 +59,12 @@ class Kern(Parameterized): self._sliced_X = 0 self.useGPU = self._support_GPU and useGPU self._return_psi2_n_flag = ObsAr(np.zeros(1)).astype(bool) - + + #FIXME: temporary solution + from ...core.parameterization.lists_and_dicts import ObserverList + self._return_psi2_n_flag.observers = ObserverList() + self._return_psi2_n_flag._update_on = True + from .psi_comp import PSICOMP_GH self.psicomp = PSICOMP_GH() @@ -97,7 +102,7 @@ class Kern(Parameterized): def psi1(self, Z, variational_posterior): return self.psicomp.psicomputations(self, Z, variational_posterior)[1] def psi2(self, Z, variational_posterior): - return self.psicomp.psicomputations(self, Z, variational_posterior)[2] + return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=self.return_psi2_n)[2] def gradients_X(self, dL_dK, X, X2): raise NotImplementedError def gradients_X_diag(self, dL_dKdiag, X): @@ -111,7 +116,8 @@ class Kern(Parameterized): """Set the gradients of all parameters when doing full (N) inference.""" raise NotImplementedError - def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, + psi0=None, psi1=None, psi2=None): """ Set the gradients of all parameters when doing inference with uncertain inputs, using expectations of the kernel. @@ -122,22 +128,27 @@ class Kern(Parameterized): dL_dpsi1 * dpsi1_d{theta_i} + dL_dpsi2 * dpsi2_d{theta_i} """ - dtheta = self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[0] + dtheta = self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, + psi0=psi0, psi1=psi1, psi2=psi2)[0] self.gradient[:] = dtheta - def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, + psi0=None, psi1=None, psi2=None): """ Returns the derivative of the objective wrt Z, using the chain rule through the expectation variables. 
""" - return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[1] + return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, + psi0=psi0, psi1=psi1, psi2=psi2)[1] - def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, + psi0=None, psi1=None, psi2=None): """ Compute the gradients wrt the parameters of the variational distruibution q(X), chain-ruling via the expectations of the kernel """ - return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)[2:] + return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, + psi0=psi0, psi1=psi1, psi2=psi2)[2:] def plot(self, x=None, fignum=None, ax=None, title=None, plot_limits=None, resolution=None, **mpl_kwargs): """ diff --git a/GPy/kern/_src/kernel_slice_operations.py b/GPy/kern/_src/kernel_slice_operations.py index 3473ffce..a2058ca8 100644 --- a/GPy/kern/_src/kernel_slice_operations.py +++ b/GPy/kern/_src/kernel_slice_operations.py @@ -116,25 +116,31 @@ def _slice_psi(f): def _slice_update_gradients_expectations(f): @wraps(f) - def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, + psi0=None, psi1=None, psi2=None): with _Slice_wrap(self, Z, variational_posterior) as s: - ret = f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2) + ret = f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2, + psi0=psi0, psi1=psi1, psi2=psi2) return ret return wrap def _slice_gradients_Z_expectations(f): @wraps(f) - def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, + psi0=None, psi1=None, psi2=None): with _Slice_wrap(self, Z, variational_posterior) as s: - ret = s.handle_return_array(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2)) + ret = s.handle_return_array(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2, + psi0=psi0, psi1=psi1, psi2=psi2)) return ret return wrap def _slice_gradients_qX_expectations(f): @wraps(f) - def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, + psi0=None, psi1=None, psi2=None): with _Slice_wrap(self, variational_posterior, Z) as s: - ret = list(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X2, s.X)) + ret = list(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X2, s.X, + psi0=psi0, psi1=psi1, psi2=psi2)) r2 = ret[:2] ret[0] = s.handle_return_array(r2[0]) ret[1] = s.handle_return_array(r2[1]) diff --git a/GPy/kern/_src/psi_comp/__init__.py b/GPy/kern/_src/psi_comp/__init__.py index 6d7222be..6849ed81 100644 --- a/GPy/kern/_src/psi_comp/__init__.py +++ b/GPy/kern/_src/psi_comp/__init__.py @@ -12,18 +12,22 @@ from .gaussherm import PSICOMP_GH class PSICOMP_RBF(Pickleable): @Cache_this(limit=10, ignore_args=(0,)) - def psicomputations(self, variance, lengthscale, Z, variational_posterior): + def psicomputations(self, variance, lengthscale, Z, variational_posterior, return_psi2_n=False): if isinstance(variational_posterior, variational.NormalPosterior): - return rbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior) + return rbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior, return_psi2_n=return_psi2_n) elif 
isinstance(variational_posterior, variational.SpikeAndSlabPosterior): + if return_psi2_n: + raise NotImplementedError('However this function seems to return it by default') return ssrbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior) else: raise ValueError("unknown distriubtion received for psi-statistics") @Cache_this(limit=10, ignore_args=(0,1,2,3)) - def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): + def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior, + psi0=None, psi1=None, psi2=None): if isinstance(variational_posterior, variational.NormalPosterior): - return rbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior) + return rbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior, + psi0=psi0, psi1=psi1, psi2=psi2) elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior): return ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior) else: @@ -35,7 +39,9 @@ class PSICOMP_RBF(Pickleable): class PSICOMP_Linear(Pickleable): @Cache_this(limit=10, ignore_args=(0,)) - def psicomputations(self, variance, Z, variational_posterior): + def psicomputations(self, variance, Z, variational_posterior, return_psi2_n=False): + if return_psi2_n: + raise NotImplementedError if isinstance(variational_posterior, variational.NormalPosterior): return linear_psi_comp.psicomputations(variance, Z, variational_posterior) elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior): @@ -44,8 +50,9 @@ class PSICOMP_Linear(Pickleable): raise ValueError("unknown distriubtion received for psi-statistics") @Cache_this(limit=10, ignore_args=(0,1,2,3)) - def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior): + def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior, psi0=None, psi1=None, psi2=None): if isinstance(variational_posterior, variational.NormalPosterior): + #Should pass psi in return linear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior) elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior): return sslinear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior) @@ -53,4 +60,4 @@ class PSICOMP_Linear(Pickleable): raise ValueError("unknown distriubtion received for psi-statistics") def _setup_observers(self): - pass \ No newline at end of file + pass diff --git a/GPy/kern/_src/psi_comp/linear_psi_comp.py b/GPy/kern/_src/psi_comp/linear_psi_comp.py index 50090428..cd5d8fa0 100644 --- a/GPy/kern/_src/psi_comp/linear_psi_comp.py +++ b/GPy/kern/_src/psi_comp/linear_psi_comp.py @@ -8,7 +8,7 @@ The package for the Psi statistics computation of the linear kernel for Bayesian import numpy as np from ....util.linalg import tdot -def psicomputations(variance, Z, variational_posterior): +def psicomputations(variance, Z, variational_posterior, return_psi2_n=False): """ Compute psi-statistics for ss-linear kernel """ @@ -22,7 +22,10 @@ def psicomputations(variance, Z, variational_posterior): psi0 = (variance*(np.square(mu)+S)).sum(axis=1) psi1 = np.dot(mu,(variance*Z).T) - psi2 = np.dot(S.sum(axis=0)*np.square(variance)*Z,Z.T)+ tdot(psi1.T) + if 
sum_N_psi2: + psi2 = np.dot(S.sum(axis=0)*np.square(variance)*Z,Z.T)+ tdot(psi1.T) + else: + raise NotImplementedError return psi0, psi1, psi2 @@ -40,7 +43,7 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variati dL_dmu += 2.*dL_dpsi0_var*mu+np.dot(dL_dpsi1,Z)*variance dL_dS += dL_dpsi0_var dL_dZ += dL_dpsi1_mu*variance - + return dL_dvar, dL_dZ, dL_dmu, dL_dS def _psi2computations(dL_dpsi2, variance, Z, mu, S): @@ -56,7 +59,7 @@ def _psi2computations(dL_dpsi2, variance, Z, mu, S): # _psi2_dZ MxQ # _psi2_dmu NxQ # _psi2_dS NxQ - + variance2 = np.square(variance) common_sum = np.dot(mu,(variance*Z).T) Z_expect = (np.dot(dL_dpsi2,Z)*Z).sum(axis=0) @@ -66,12 +69,12 @@ def _psi2computations(dL_dpsi2, variance, Z, mu, S): Z1_expect = np.dot(dL_dpsi2T,Z) dL_dvar = 2.*S.sum(axis=0)*variance*Z_expect+(common_expect*mu).sum(axis=0) - + dL_dmu = common_expect*variance - + dL_dS = np.empty(S.shape) dL_dS[:] = Z_expect*variance2 - + dL_dZ = variance2*S.sum(axis=0)*Z1_expect+np.dot(Z2_expect.T,variance*mu) return dL_dvar, dL_dmu, dL_dS, dL_dZ diff --git a/GPy/kern/_src/psi_comp/rbf_psi_comp.py b/GPy/kern/_src/psi_comp/rbf_psi_comp.py index 16bada72..de958a37 100644 --- a/GPy/kern/_src/psi_comp/rbf_psi_comp.py +++ b/GPy/kern/_src/psi_comp/rbf_psi_comp.py @@ -5,7 +5,7 @@ The module for psi-statistics for RBF kernel import numpy as np from GPy.util.caching import Cacher -def psicomputations(variance, lengthscale, Z, variational_posterior): +def psicomputations(variance, lengthscale, Z, variational_posterior, return_psi2_n=False): """ Z - MxQ mu - NxQ @@ -21,7 +21,9 @@ def psicomputations(variance, lengthscale, Z, variational_posterior): psi0 = np.empty(mu.shape[0]) psi0[:] = variance psi1 = _psi1computations(variance, lengthscale, Z, mu, S) - psi2 = _psi2computations(variance, lengthscale, Z, mu, S).sum(axis=0) + psi2 = _psi2computations(variance, lengthscale, Z, mu, S) + if not return_psi2_n: + psi2 = psi2.sum(axis=0) return psi0, psi1, psi2 def __psi1computations(variance, lengthscale, Z, mu, S): @@ -66,11 +68,12 @@ def __psi2computations(variance, lengthscale, Z, mu, S): _psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2) return _psi2 -def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): +def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior, + psi0=None, psi1=None, psi2=None): ARD = (len(lengthscale)!=1) - dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance) - dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance) + dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, psi1=psi1) + dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, psi2=psi2) dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2 @@ -84,7 +87,7 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscal return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS -def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S): +def __psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, psi1=None): """ dL_dpsi1 - NxM Z - MxQ @@ -103,8 
+106,9 @@ def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S): lengthscale2 = np.square(lengthscale) - _psi1 = _psi1computations(variance, lengthscale, Z, mu, S) - Lpsi1 = dL_dpsi1*_psi1 + if psi1 is None: + psi1 = _psi1computations(variance, lengthscale, Z, mu, S) + Lpsi1 = dL_dpsi1*psi1 Zmu = Z[None,:,:]-mu[:,None,:] # NxMxQ denom = 1./(S+lengthscale2) Zmu2_denom = np.square(Zmu)*denom[:,None,:] #NxMxQ @@ -116,7 +120,7 @@ def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S): return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS -def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S): +def __psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, psi2=None): """ Z - MxQ mu - NxQ @@ -137,8 +141,9 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S): denom = 1./(2*S+lengthscale2) denom2 = np.square(denom) - _psi2 = _psi2computations(variance, lengthscale, Z, mu, S) # NxMxM - Lpsi2 = dL_dpsi2*_psi2 # dL_dpsi2 is MxM, using broadcast to multiply N out + if psi2 is None: + psi2 = _psi2computations(variance, lengthscale, Z, mu, S) # NxMxM + Lpsi2 = dL_dpsi2*psi2 # dL_dpsi2 is MxM, using broadcast to multiply N out Lpsi2sum = np.einsum('nmo->n',Lpsi2) #N Lpsi2Z = np.einsum('nmo,oq->nq',Lpsi2,Z) #NxQ Lpsi2Z2 = np.einsum('nmo,oq,oq->nq',Lpsi2,Z,Z) #NxQ @@ -158,3 +163,5 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S): _psi1computations = Cacher(__psi1computations, limit=5) _psi2computations = Cacher(__psi2computations, limit=5) +_psi1compDer = Cacher(__psi1compDer, limit=5) +_psi2compDer = Cacher(__psi2compDer, limit=5) diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py index f6a24c86..10ea95e4 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py @@ -9,7 +9,7 @@ import numpy as np try: from scipy import weave - + def _psicomputations(variance, lengthscale, Z, variational_posterior): """ Z - MxQ @@ -23,7 +23,7 @@ try: mu = variational_posterior.mean S = variational_posterior.variance gamma = variational_posterior.binary_prob - + N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1] l2 = np.square(lengthscale) log_denom1 = np.log(S/l2+1) @@ -35,13 +35,13 @@ try: psi0[:] = variance psi1 = np.empty((N,M)) psi2n = np.empty((N,M,M)) - + from ....util.misc import param_to_array S = param_to_array(S) mu = param_to_array(mu) gamma = param_to_array(gamma) Z = param_to_array(Z) - + support_code = """ #include """ @@ -56,11 +56,11 @@ try: double lq = l2(q); double Zm1q = Z(m1,q); double Zm2q = Z(m2,q); - + if(m2==0) { // Compute Psi_1 double muZ = mu(n,q)-Z(m1,q); - + double psi1_exp1 = log_gamma(n,q) - (muZ*muZ/(Snq+lq) +log_denom1(n,q))/2.; double psi1_exp2 = log_gamma1(n,q) -Zm1q*Zm1q/(2.*lq); log_psi1 += (psi1_exp1>psi1_exp2)?psi1_exp1+log1p(exp(psi1_exp2-psi1_exp1)):psi1_exp2+log1p(exp(psi1_exp1-psi1_exp2)); @@ -69,10 +69,10 @@ try: double muZhat = mu(n,q) - (Zm1q+Zm2q)/2.; double Z2 = Zm1q*Zm1q+ Zm2q*Zm2q; double dZ = Zm1q - Zm2q; - + double psi2_exp1 = dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2(n,q)/2. 
+ log_gamma(n,q); double psi2_exp2 = log_gamma1(n,q) - Z2/(2.*lq); - log_psi2_n += (psi2_exp1>psi2_exp2)?psi2_exp1+log1p(exp(psi2_exp2-psi2_exp1)):psi2_exp2+log1p(exp(psi2_exp1-psi2_exp2)); + log_psi2_n += (psi2_exp1>psi2_exp2)?psi2_exp1+log1p(exp(psi2_exp2-psi2_exp1)):psi2_exp2+log1p(exp(psi2_exp1-psi2_exp2)); } double exp_psi2_n = exp(log_psi2_n); psi2n(n,m1,m2) = variance*variance*exp_psi2_n; @@ -83,18 +83,18 @@ try: } """ weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz) - + psi2 = psi2n.sum(axis=0) return psi0,psi1,psi2,psi2n - + from GPy.util.caching import Cacher psicomputations = Cacher(_psicomputations, limit=1) - + def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): ARD = (len(lengthscale)!=1) - + _,psi1,_,psi2n = psicomputations(variance, lengthscale, Z, variational_posterior) - + mu = variational_posterior.mean S = variational_posterior.variance gamma = variational_posterior.binary_prob @@ -105,7 +105,7 @@ try: log_gamma = np.log(gamma) log_gamma1 = np.log(1.-gamma) variance = float(variance) - + dvar = np.zeros(1) dmu = np.zeros((N,Q)) dS = np.zeros((N,Q)) @@ -113,13 +113,13 @@ try: dl = np.zeros(Q) dZ = np.zeros((M,Q)) dvar += np.sum(dL_dpsi0) - + from ....util.misc import param_to_array S = param_to_array(S) mu = param_to_array(mu) gamma = param_to_array(gamma) Z = param_to_array(Z) - + support_code = """ #include """ @@ -136,16 +136,16 @@ try: double Zm2q = Z(m2,q); double gnq = gamma(n,q); double mu_nq = mu(n,q); - + if(m2==0) { - // Compute Psi_1 + // Compute Psi_1 double lpsi1 = psi1(n,m1)*dL_dpsi1(n,m1); if(q==0) {dvar(0) += lpsi1/variance;} - + double Zmu = Zm1q - mu_nq; double denom = Snq+lq; double Zmu2_denom = Zmu*Zmu/denom; - + double exp1 = log_gamma(n,q)-(Zmu*Zmu/(Snq+lq)+log_denom1(n,q))/(2.); double exp2 = log_gamma1(n,q)-Zm1q*Zm1q/(2.*lq); double d_exp1,d_exp2; @@ -157,7 +157,7 @@ try: d_exp2 = 1.; } double exp_sum = d_exp1+d_exp2; - + dmu(n,q) += lpsi1*Zmu*d_exp1/(denom*exp_sum); dS(n,q) += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum)/2.; dgamma(n,q) += lpsi1*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum; @@ -167,13 +167,13 @@ try: // Compute Psi_2 double lpsi2 = psi2n(n,m1,m2)*dL_dpsi2(m1,m2); if(q==0) {dvar(0) += lpsi2*2/variance;} - + double dZm1m2 = Zm1q - Zm2q; double Z2 = Zm1q*Zm1q+Zm2q*Zm2q; double muZhat = mu_nq - (Zm1q + Zm2q)/2.; double denom = 2.*Snq+lq; double muZhat2_denom = muZhat*muZhat/denom; - + double exp1 = dZm1m2*dZm1m2/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2(n,q)/2. 
+ log_gamma(n,q); double exp2 = log_gamma1(n,q) - Z2/(2.*lq); double d_exp1,d_exp2; @@ -185,23 +185,23 @@ try: d_exp2 = 1.; } double exp_sum = d_exp1+d_exp2; - + dmu(n,q) += -2.*lpsi2*muZhat/denom*d_exp1/exp_sum; dS(n,q) += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum; dgamma(n,q) += lpsi2*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum; dl(q) += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZm1m2*dZm1m2/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum; - dZ(m1,q) += 2.*lpsi2*((muZhat/denom-dZm1m2/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum; + dZ(m1,q) += 2.*lpsi2*((muZhat/denom-dZm1m2/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum; } } } } """ weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz) - + dl *= 2.*lengthscale if not ARD: dl = dl.sum() - + return dvar, dl, dZ, dmu, dS, dgamma except: @@ -219,13 +219,13 @@ except: mu = variational_posterior.mean S = variational_posterior.variance gamma = variational_posterior.binary_prob - + psi0 = np.empty(mu.shape[0]) psi0[:] = variance psi1 = _psi1computations(variance, lengthscale, Z, mu, S, gamma) psi2 = _psi2computations(variance, lengthscale, Z, mu, S, gamma) return psi0, psi1, psi2 - + def _psi1computations(variance, lengthscale, Z, mu, S, gamma): """ Z - MxQ @@ -236,9 +236,9 @@ except: # here are the "statistics" for psi1 # Produced intermediate results: # _psi1 NxM - + lengthscale2 = np.square(lengthscale) - + # psi1 _psi1_denom = S[:, None, :] / lengthscale2 + 1. # Nx1xQ _psi1_denom_sqrt = np.sqrt(_psi1_denom) #Nx1xQ @@ -251,9 +251,9 @@ except: _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM _psi1 = variance * np.exp(_psi1_exp_sum) # NxM - + return _psi1 - + def _psi2computations(variance, lengthscale, Z, mu, S, gamma): """ Z - MxQ @@ -264,14 +264,14 @@ except: # here are the "statistics" for psi2 # Produced intermediate results: # _psi2 MxM - + lengthscale2 = np.square(lengthscale) - + _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ - + # psi2 _psi2_denom = 2.*S[:, None, None, :] / lengthscale2 + 1. 
# Nx1x1xQ _psi2_denom_sqrt = np.sqrt(_psi2_denom) @@ -284,28 +284,28 @@ except: _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max)) _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM - + return _psi2 - + def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): ARD = (len(lengthscale)!=1) - + dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1, dgamma_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2, dgamma_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - + dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2 - + dL_dlengscale = dl_psi1 + dl_psi2 if not ARD: dL_dlengscale = dL_dlengscale.sum() - + dL_dgamma = dgamma_psi1 + dgamma_psi2 dL_dmu = dmu_psi1 + dmu_psi2 dL_dS = dS_psi1 + dS_psi2 dL_dZ = dZ_psi1 + dZ_psi2 - + return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma - + def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, gamma): """ dL_dpsi1 - NxM @@ -322,9 +322,9 @@ except: # _dL_dgamma NxQ # _dL_dmu NxQ # _dL_dS NxQ - + lengthscale2 = np.square(lengthscale) - + # psi1 _psi1_denom = S / lengthscale2 + 1. # NxQ _psi1_denom_sqrt = np.sqrt(_psi1_denom) #NxQ @@ -346,9 +346,9 @@ except: _dL_dS = np.einsum('nm,nmq,nmq,nq,nmq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_common,(_psi1_dist_sq-1.))/2. # NxQ _dL_dZ = np.einsum('nm,nmq,nmq->mq',dL_dpsi1,_psi1_q, (- _psi1_common[:,None,:] * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z)) _dL_dlengthscale = lengthscale* np.einsum('nm,nmq,nmq->q',dL_dpsi1,_psi1_q,(_psi1_common[:,None,:]*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + (1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z)) - - return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma - + + return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma + def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, gamma): """ Z - MxQ @@ -365,14 +365,14 @@ except: # _dL_dgamma NxQ # _dL_dmu NxQ # _dL_dS NxQ - + lengthscale2 = np.square(lengthscale) - + _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ - + # psi2 _psi2_denom = 2.*S / lengthscale2 + 1. 
# NxQ _psi2_denom_sqrt = np.sqrt(_psi2_denom) @@ -384,7 +384,7 @@ except: _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2) _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max)) _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM - _psi2_q = variance*variance * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ + _psi2_q = variance*variance * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ _psi2_exp_dist_sq = np.exp(-_psi2_Zdist_sq -_psi2_mudist_sq) # NxMxMxQ _psi2_exp_Z = np.exp(-0.5*_psi2_Z_sq_sum) # MxMxQ _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM @@ -394,5 +394,5 @@ except: _dL_dS = np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q, _psi2_common, (2.*_psi2_mudist_sq-1.), _psi2_exp_dist_sq) _dL_dZ = 2.*np.einsum('mo,nmoq,nmoq->mq',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(-_psi2_Zdist*_psi2_denom[:,None,None,:]+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z)) _dL_dlengthscale = 2.*lengthscale* np.einsum('mo,nmoq,nmoq->q',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z)) - + return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index 73f2d0a4..357c48ec 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -56,16 +56,19 @@ class RBF(Stationary): return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[1] def psi2(self, Z, variational_posterior): - return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[2] + return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior, return_psi2_n=self.return_psi2_n)[2] - def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): - dL_dvar, dL_dlengscale = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[:2] + def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, + psi0=None, psi1=None, psi2=None): + dL_dvar, dL_dlengscale = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2)[:2] self.variance.gradient = dL_dvar self.lengthscale.gradient = dL_dlengscale - def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): - return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[2] + def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, + psi0=None, psi1=None, psi2=None): + return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2)[2] - def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): - return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[3:] + def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, + psi0=None, psi1=None, 
psi2=None): + return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2)[3:] diff --git a/GPy/models/bayesian_gplvm_minibatch.py b/GPy/models/bayesian_gplvm_minibatch.py index 7a7bb0e8..47c425a4 100644 --- a/GPy/models/bayesian_gplvm_minibatch.py +++ b/GPy/models/bayesian_gplvm_minibatch.py @@ -80,45 +80,9 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): """Get the gradients of the posterior distribution of X in its specific form.""" return X.mean.gradient, X.variance.gradient - def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None, **kw): - posterior, log_marginal_likelihood, grad_dict, current_values, value_indices = super(BayesianGPLVMMiniBatch, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, subset_indices=subset_indices, **kw) - - if self.has_uncertain_inputs(): - current_values['meangrad'], current_values['vargrad'] = self.kern.gradients_qX_expectations( - variational_posterior=X, - Z=Z, dL_dpsi0=grad_dict['dL_dpsi0'], - dL_dpsi1=grad_dict['dL_dpsi1'], - dL_dpsi2=grad_dict['dL_dpsi2']) - else: - current_values['Xgrad'] = self.kern.gradients_X(grad_dict['dL_dKnm'], X, Z) - current_values['Xgrad'] += self.kern.gradients_X_diag(grad_dict['dL_dKdiag'], X) - if subset_indices is not None: - value_indices['Xgrad'] = subset_indices['samples'] - - kl_fctr = self.kl_factr - if self.has_uncertain_inputs(): - if self.missing_data: - d = self.output_dim - log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X)/d - else: - log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X) - - # Subsetting Variational Posterior objects, makes the gradients - # empty. We need them to be 0 though: - X.mean.gradient[:] = 0 - X.variance.gradient[:] = 0 - - self.variational_prior.update_gradients_KL(X) - if self.missing_data: - current_values['meangrad'] += kl_fctr*X.mean.gradient/d - current_values['vargrad'] += kl_fctr*X.variance.gradient/d - else: - current_values['meangrad'] += kl_fctr*X.mean.gradient - current_values['vargrad'] += kl_fctr*X.variance.gradient - - if subset_indices is not None: - value_indices['meangrad'] = subset_indices['samples'] - value_indices['vargrad'] = subset_indices['samples'] + def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None, psi0=None, psi1=None, psi2=None, **kw): + posterior, log_marginal_likelihood, grad_dict, current_values, value_indices = super(BayesianGPLVMMiniBatch, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, + psi0=psi0, psi1=psi1, psi2=psi2, subset_indices=subset_indices, **kw) return posterior, log_marginal_likelihood, grad_dict, current_values, value_indices def _outer_values_update(self, full_values): @@ -127,6 +91,47 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): E.g. set the gradients of parameters, etc. 
""" super(BayesianGPLVMMiniBatch, self)._outer_values_update(full_values) + current_values = full_values + grad_dict = current_values + if self.has_uncertain_inputs(): + current_values['meangrad'], current_values['vargrad'] = self.kern.gradients_qX_expectations( + variational_posterior=self.X, + Z=self.Z, dL_dpsi0=grad_dict['dL_dpsi0'], + dL_dpsi1=grad_dict['dL_dpsi1'], + dL_dpsi2=grad_dict['dL_dpsi2'], + psi0=self.psi0, psi1=self.psi1, psi2=self.psi2) + else: + current_values['Xgrad'] = self.kern.gradients_X(grad_dict['dL_dKnm'], self.X, self.Z) + current_values['Xgrad'] += self.kern.gradients_X_diag(grad_dict['dL_dKdiag'], self.X) + if subset_indices is not None: + value_indices['Xgrad'] = subset_indices['samples'] + + kl_fctr = self.kl_factr + if self.has_uncertain_inputs(): + #if self.missing_data: + #d = self.output_dim + #log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)/d + #else: + self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X) + + # Subsetting Variational Posterior objects, makes the gradients + # empty. We need them to be 0 though: + self.X.mean.gradient[:] = 0 + self.X.variance.gradient[:] = 0 + + self.variational_prior.update_gradients_KL(self.X) + #if self.missing_data: + #current_values['meangrad'] += kl_fctr*self.X.mean.gradient/d + #current_values['vargrad'] += kl_fctr*self.X.variance.gradient/d + #else: + current_values['meangrad'] += kl_fctr*self.X.mean.gradient + current_values['vargrad'] += kl_fctr*self.X.variance.gradient + + #if subset_indices is not None: + #value_indices['meangrad'] = subset_indices['samples'] + #value_indices['vargrad'] = subset_indices['samples'] + + full_values = current_values if self.has_uncertain_inputs(): self.X.mean.gradient = full_values['meangrad'] self.X.variance.gradient = full_values['vargrad'] @@ -134,11 +139,13 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): self.X.gradient = full_values['Xgrad'] def _outer_init_full_values(self): - if self.has_uncertain_inputs(): - return dict(meangrad=np.zeros(self.X.mean.shape), - vargrad=np.zeros(self.X.variance.shape)) - else: - return dict(Xgrad=np.zeros(self.X.shape)) + full_values = super(BayesianGPLVMMiniBatch, self)._outer_init_full_values() + #if self.has_uncertain_inputs(): + #return dict(meangrad=np.zeros(self.X.mean.shape), + #vargrad=np.zeros(self.X.variance.shape)) + #else: + #return dict(Xgrad=np.zeros(self.X.shape)) + return full_values def parameters_changed(self): super(BayesianGPLVMMiniBatch,self).parameters_changed() diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py index 07295255..f8322b4e 100644 --- a/GPy/models/sparse_gp_minibatch.py +++ b/GPy/models/sparse_gp_minibatch.py @@ -44,7 +44,7 @@ class SparseGPMiniBatch(SparseGP): def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None, name='sparse gp', Y_metadata=None, normalizer=False, missing_data=False, stochastic=False, batchsize=1): - + # pick a sensible inference method if inference_method is None: if isinstance(likelihood, likelihoods.Gaussian): @@ -80,7 +80,7 @@ class SparseGPMiniBatch(SparseGP): def has_uncertain_inputs(self): return isinstance(self.X, VariationalPosterior) - def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None, **kwargs): + def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None, psi0=None, psi1=None, psi2=None, missing_inds=None, **kwargs): """ This is the 
standard part, which usually belongs in parameters_changed. @@ -99,46 +99,22 @@ class SparseGPMiniBatch(SparseGP): like them into this dictionary for inner use of the indices inside the algorithm. """ - try: - posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=None, **kwargs) - except: - posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata) + if psi2 is None: + psi2_sum_n = None + else: + psi2_sum_n = psi2.sum(axis=0) + posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=None, psi0=psi0, psi1=psi1, psi2=psi2_sum_n, **kwargs) current_values = {} likelihood.update_gradients(grad_dict['dL_dthetaL']) current_values['likgrad'] = likelihood.gradient.copy() if subset_indices is None: subset_indices = {} - if isinstance(X, VariationalPosterior): - #gradients wrt kernel - dL_dKmm = grad_dict['dL_dKmm'] - kern.update_gradients_full(dL_dKmm, Z, None) - current_values['kerngrad'] = kern.gradient.copy() - kern.update_gradients_expectations(variational_posterior=X, - Z=Z, - dL_dpsi0=grad_dict['dL_dpsi0'], - dL_dpsi1=grad_dict['dL_dpsi1'], - dL_dpsi2=grad_dict['dL_dpsi2']) - current_values['kerngrad'] += kern.gradient + current_values['dL_dpsi0'] = grad_dict['dL_dpsi0'] + current_values['dL_dpsi1'] = grad_dict['dL_dpsi1'] + current_values['dL_dpsi2'] = grad_dict['dL_dpsi2'] + current_values['dL_dKmm'] = grad_dict['dL_dKmm'] - #gradients wrt Z - current_values['Zgrad'] = kern.gradients_X(dL_dKmm, Z) - current_values['Zgrad'] += kern.gradients_Z_expectations( - grad_dict['dL_dpsi0'], - grad_dict['dL_dpsi1'], - grad_dict['dL_dpsi2'], - Z=Z, - variational_posterior=X) - else: - #gradients wrt kernel - kern.update_gradients_diag(grad_dict['dL_dKdiag'], X) - current_values['kerngrad'] = kern.gradient.copy() - kern.update_gradients_full(grad_dict['dL_dKnm'], X, Z) - current_values['kerngrad'] += kern.gradient - kern.update_gradients_full(grad_dict['dL_dKmm'], Z, None) - current_values['kerngrad'] += kern.gradient - #gradients wrt Z - current_values['Zgrad'] = kern.gradients_X(grad_dict['dL_dKmm'], Z) - current_values['Zgrad'] += kern.gradients_X(grad_dict['dL_dKnm'].T, Z, X) + #current_values = grad_dict return posterior, log_marginal_likelihood, grad_dict, current_values, subset_indices def _inner_take_over_or_update(self, full_values=None, current_values=None, value_indices=None): @@ -192,9 +168,43 @@ class SparseGPMiniBatch(SparseGP): Here you put the values, which were collected before in the right places. E.g. set the gradients of parameters, etc. 
""" - self.likelihood.gradient = full_values['likgrad'] - self.kern.gradient = full_values['kerngrad'] - self.Z.gradient = full_values['Zgrad'] + grad_dict = full_values + current_values = full_values + #current_values = {} + if isinstance(self.X, VariationalPosterior): + #gradients wrt kernel + dL_dKmm = grad_dict['dL_dKmm'] + self.kern.update_gradients_full(dL_dKmm, self.Z, None) + current_values['kerngrad'] = self.kern.gradient.copy() + self.kern.update_gradients_expectations(variational_posterior=self.X, + Z=self.Z, + dL_dpsi0=grad_dict['dL_dpsi0'], + dL_dpsi1=grad_dict['dL_dpsi1'], + dL_dpsi2=grad_dict['dL_dpsi2']) + current_values['kerngrad'] += self.kern.gradient + + #gradients wrt Z + current_values['Zgrad'] = self.kern.gradients_X(dL_dKmm, self.Z) + current_values['Zgrad'] += self.kern.gradients_Z_expectations( + grad_dict['dL_dpsi0'], + grad_dict['dL_dpsi1'], + grad_dict['dL_dpsi2'], + Z=self.Z, + variational_posterior=self.X) + else: + #gradients wrt kernel + kern.update_gradients_diag(grad_dict['dL_dKdiag'], self.X) + current_values['kerngrad'] = self.kern.gradient.copy() + kern.update_gradients_full(grad_dict['dL_dKnm'], self.X, self.Z) + current_values['kerngrad'] += kern.gradient + kern.update_gradients_full(grad_dict['dL_dKmm'], self.Z, None) + current_values['kerngrad'] += kern.gradient + #gradients wrt Z + current_values['Zgrad'] = kern.gradients_X(grad_dict['dL_dKmm'], self.Z) + current_values['Zgrad'] += kern.gradients_X(grad_dict['dL_dKnm'].T, self.Z, self.X) + self.likelihood.gradient = current_values['likgrad'] + self.kern.gradient = current_values['kerngrad'] + self.Z.gradient = current_values['Zgrad'] def _outer_init_full_values(self): """ @@ -209,7 +219,8 @@ class SparseGPMiniBatch(SparseGP): to initialize the gradients for the mean and the variance in order to have the full gradient for indexing) """ - return {} + return {'dL_dpsi0': np.zeros(self.X.shape[0]), + 'dL_dpsi1': np.zeros((self.X.shape[0], self.Z.shape[0]))} def _outer_loop_for_missing_data(self): Lm = None @@ -230,20 +241,35 @@ class SparseGPMiniBatch(SparseGP): message = m_f(-1) print(message, end=' ') - for d, ninan in self.stochastics.d: + #Compute the psi statistics for N once, but don't sum out N in psi2 + self.kern.return_psi2_n = True + psi0 = self.kern.psi0(self.Z, self.X) + psi1 = self.kern.psi1(self.Z, self.X) + psi2 = self.kern.psi2(self.Z, self.X) + self.psi0 = psi0 + self.psi1 = psi1 + self.psi2 = psi2 + for d, ninan in self.stochastics.d: if not self.stochastics: print(' '*(len(message)) + '\r', end=' ') message = m_f(d) print(message, end=' ') + psi0ni = psi0[ninan] + psi1ni = psi1[ninan] + psi2ni = psi2[ninan] + posterior, log_marginal_likelihood, \ grad_dict, current_values, value_indices = self._inner_parameters_changed( self.kern, self.X[ninan], self.Z, self.likelihood, self.Y_normalized[ninan][:, d], self.Y_metadata, Lm, dL_dKmm, - subset_indices=dict(outputs=d, samples=ninan)) + subset_indices=dict(outputs=d, samples=ninan, + dL_dpsi0=ninan, + dL_dpsi1=ninan), + psi0=psi0ni, psi1=psi1ni, psi2=psi2ni) self._inner_take_over_or_update(self.full_values, current_values, value_indices) self._inner_values_update(current_values) @@ -253,6 +279,7 @@ class SparseGPMiniBatch(SparseGP): woodbury_inv[:, :, d] = posterior.woodbury_inv[:,:,None] woodbury_vector[:, d] = posterior.woodbury_vector self._log_marginal_likelihood += log_marginal_likelihood + if not self.stochastics: print('') From 5f119d09185ae64bd5474935ceffca644d598881 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Thu, 27 Aug 2015 
19:08:11 +0300 Subject: [PATCH 03/13] Tidied but broken --- GPy/models/bayesian_gplvm_minibatch.py | 46 +++------- GPy/models/sparse_gp_minibatch.py | 115 ++++++++++++------------- 2 files changed, 69 insertions(+), 92 deletions(-) diff --git a/GPy/models/bayesian_gplvm_minibatch.py b/GPy/models/bayesian_gplvm_minibatch.py index 47c425a4..b7e0c9b2 100644 --- a/GPy/models/bayesian_gplvm_minibatch.py +++ b/GPy/models/bayesian_gplvm_minibatch.py @@ -80,10 +80,10 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): """Get the gradients of the posterior distribution of X in its specific form.""" return X.mean.gradient, X.variance.gradient - def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None, psi0=None, psi1=None, psi2=None, **kw): - posterior, log_marginal_likelihood, grad_dict, current_values, value_indices = super(BayesianGPLVMMiniBatch, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, - psi0=psi0, psi1=psi1, psi2=psi2, subset_indices=subset_indices, **kw) - return posterior, log_marginal_likelihood, grad_dict, current_values, value_indices + def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None,psi0=None, psi1=None, psi2=None, **kw): + posterior, log_marginal_likelihood, grad_dict = super(BayesianGPLVMMiniBatch, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, + psi0=psi0, psi1=psi1, psi2=psi2, **kw) + return posterior, log_marginal_likelihood, grad_dict def _outer_values_update(self, full_values): """ @@ -91,27 +91,19 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): E.g. set the gradients of parameters, etc. """ super(BayesianGPLVMMiniBatch, self)._outer_values_update(full_values) - current_values = full_values - grad_dict = current_values if self.has_uncertain_inputs(): - current_values['meangrad'], current_values['vargrad'] = self.kern.gradients_qX_expectations( + full_values['meangrad'], full_values['vargrad'] = self.kern.gradients_qX_expectations( variational_posterior=self.X, - Z=self.Z, dL_dpsi0=grad_dict['dL_dpsi0'], - dL_dpsi1=grad_dict['dL_dpsi1'], - dL_dpsi2=grad_dict['dL_dpsi2'], + Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'], + dL_dpsi1=full_values['dL_dpsi1'], + dL_dpsi2=full_values['dL_dpsi2'], psi0=self.psi0, psi1=self.psi1, psi2=self.psi2) else: - current_values['Xgrad'] = self.kern.gradients_X(grad_dict['dL_dKnm'], self.X, self.Z) - current_values['Xgrad'] += self.kern.gradients_X_diag(grad_dict['dL_dKdiag'], self.X) - if subset_indices is not None: - value_indices['Xgrad'] = subset_indices['samples'] + full_values['Xgrad'] = self.kern.gradients_X(full_values['dL_dKnm'], self.X, self.Z) + full_values['Xgrad'] += self.kern.gradients_X_diag(full_values['dL_dKdiag'], self.X) kl_fctr = self.kl_factr if self.has_uncertain_inputs(): - #if self.missing_data: - #d = self.output_dim - #log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)/d - #else: self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X) # Subsetting Variational Posterior objects, makes the gradients @@ -120,18 +112,9 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): self.X.variance.gradient[:] = 0 self.variational_prior.update_gradients_KL(self.X) - #if self.missing_data: - #current_values['meangrad'] += kl_fctr*self.X.mean.gradient/d - #current_values['vargrad'] += kl_fctr*self.X.variance.gradient/d - #else: - current_values['meangrad'] += 
kl_fctr*self.X.mean.gradient - current_values['vargrad'] += kl_fctr*self.X.variance.gradient + full_values['meangrad'] += kl_fctr*self.X.mean.gradient + full_values['vargrad'] += kl_fctr*self.X.variance.gradient - #if subset_indices is not None: - #value_indices['meangrad'] = subset_indices['samples'] - #value_indices['vargrad'] = subset_indices['samples'] - - full_values = current_values if self.has_uncertain_inputs(): self.X.mean.gradient = full_values['meangrad'] self.X.variance.gradient = full_values['vargrad'] @@ -140,11 +123,6 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): def _outer_init_full_values(self): full_values = super(BayesianGPLVMMiniBatch, self)._outer_init_full_values() - #if self.has_uncertain_inputs(): - #return dict(meangrad=np.zeros(self.X.mean.shape), - #vargrad=np.zeros(self.X.variance.shape)) - #else: - #return dict(Xgrad=np.zeros(self.X.shape)) return full_values def parameters_changed(self): diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py index f8322b4e..eab8d8f3 100644 --- a/GPy/models/sparse_gp_minibatch.py +++ b/GPy/models/sparse_gp_minibatch.py @@ -80,7 +80,7 @@ class SparseGPMiniBatch(SparseGP): def has_uncertain_inputs(self): return isinstance(self.X, VariationalPosterior) - def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None, psi0=None, psi1=None, psi2=None, missing_inds=None, **kwargs): + def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, psi0=None, psi1=None, psi2=None, **kwargs): """ This is the standard part, which usually belongs in parameters_changed. @@ -103,19 +103,10 @@ class SparseGPMiniBatch(SparseGP): psi2_sum_n = None else: psi2_sum_n = psi2.sum(axis=0) - posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=None, psi0=psi0, psi1=psi1, psi2=psi2_sum_n, **kwargs) - current_values = {} - likelihood.update_gradients(grad_dict['dL_dthetaL']) - current_values['likgrad'] = likelihood.gradient.copy() - if subset_indices is None: - subset_indices = {} - current_values['dL_dpsi0'] = grad_dict['dL_dpsi0'] - current_values['dL_dpsi1'] = grad_dict['dL_dpsi1'] - current_values['dL_dpsi2'] = grad_dict['dL_dpsi2'] - current_values['dL_dKmm'] = grad_dict['dL_dKmm'] + posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, + dL_dKmm=dL_dKmm, psi0=psi0, psi1=psi1, psi2=psi2_sum_n, **kwargs) - #current_values = grad_dict - return posterior, log_marginal_likelihood, grad_dict, current_values, subset_indices + return posterior, log_marginal_likelihood, grad_dict def _inner_take_over_or_update(self, full_values=None, current_values=None, value_indices=None): """ @@ -149,7 +140,10 @@ class SparseGPMiniBatch(SparseGP): else: index = slice(None) if key in full_values: - full_values[key][index] += current_values[key] + if np.isscalar(current_values[key]): + full_values[key] += current_values[key] + else: + full_values[key][index] += current_values[key] else: full_values[key] = current_values[key] @@ -168,43 +162,44 @@ class SparseGPMiniBatch(SparseGP): Here you put the values, which were collected before in the right places. E.g. set the gradients of parameters, etc. 
""" - grad_dict = full_values - current_values = full_values - #current_values = {} - if isinstance(self.X, VariationalPosterior): + if self.has_uncertain_inputs(): #gradients wrt kernel - dL_dKmm = grad_dict['dL_dKmm'] + dL_dKmm = full_values['dL_dKmm'] self.kern.update_gradients_full(dL_dKmm, self.Z, None) - current_values['kerngrad'] = self.kern.gradient.copy() + full_values['kerngrad'] = self.kern.gradient.copy() self.kern.update_gradients_expectations(variational_posterior=self.X, Z=self.Z, - dL_dpsi0=grad_dict['dL_dpsi0'], - dL_dpsi1=grad_dict['dL_dpsi1'], - dL_dpsi2=grad_dict['dL_dpsi2']) - current_values['kerngrad'] += self.kern.gradient + dL_dpsi0=full_values['dL_dpsi0'], + dL_dpsi1=full_values['dL_dpsi1'], + dL_dpsi2=full_values['dL_dpsi2']) + full_values['kerngrad'] += self.kern.gradient #gradients wrt Z - current_values['Zgrad'] = self.kern.gradients_X(dL_dKmm, self.Z) - current_values['Zgrad'] += self.kern.gradients_Z_expectations( - grad_dict['dL_dpsi0'], - grad_dict['dL_dpsi1'], - grad_dict['dL_dpsi2'], + full_values['Zgrad'] = self.kern.gradients_X(dL_dKmm, self.Z) + full_values['Zgrad'] += self.kern.gradients_Z_expectations( + full_values['dL_dpsi0'], + full_values['dL_dpsi1'], + full_values['dL_dpsi2'], Z=self.Z, variational_posterior=self.X) else: #gradients wrt kernel - kern.update_gradients_diag(grad_dict['dL_dKdiag'], self.X) - current_values['kerngrad'] = self.kern.gradient.copy() - kern.update_gradients_full(grad_dict['dL_dKnm'], self.X, self.Z) - current_values['kerngrad'] += kern.gradient - kern.update_gradients_full(grad_dict['dL_dKmm'], self.Z, None) - current_values['kerngrad'] += kern.gradient + self.kern.update_gradients_diag(full_values['dL_dKdiag'], self.X) + full_values['kerngrad'] = self.kern.gradient.copy() + self.kern.update_gradients_full(full_values['dL_dKnm'], self.X, self.Z) + full_values['kerngrad'] += self.kern.gradient + self.kern.update_gradients_full(full_values['dL_dKmm'], self.Z, None) + full_values['kerngrad'] += self.kern.gradient #gradients wrt Z - current_values['Zgrad'] = kern.gradients_X(grad_dict['dL_dKmm'], self.Z) - current_values['Zgrad'] += kern.gradients_X(grad_dict['dL_dKnm'].T, self.Z, self.X) - self.likelihood.gradient = current_values['likgrad'] - self.kern.gradient = current_values['kerngrad'] - self.Z.gradient = current_values['Zgrad'] + full_values['Zgrad'] = self.kern.gradients_X(full_values['dL_dKmm'], self.Z) + full_values['Zgrad'] += self.kern.gradients_X(full_values['dL_dKnm'].T, self.Z, self.X) + + self.likelihood.update_gradients(full_values['dL_dthetaL']) + full_values['likgrad'] = self.likelihood.gradient.copy() + + self.likelihood.gradient = full_values['likgrad'] + self.kern.gradient = full_values['kerngrad'] + self.Z.gradient = full_values['Zgrad'] def _outer_init_full_values(self): """ @@ -242,10 +237,18 @@ class SparseGPMiniBatch(SparseGP): print(message, end=' ') #Compute the psi statistics for N once, but don't sum out N in psi2 - self.kern.return_psi2_n = True - psi0 = self.kern.psi0(self.Z, self.X) - psi1 = self.kern.psi1(self.Z, self.X) - psi2 = self.kern.psi2(self.Z, self.X) + if self.has_uncertain_inputs(): + self.kern.return_psi2_n = True + psi0 = self.kern.psi0(self.Z, self.X) + psi1 = self.kern.psi1(self.Z, self.X) + psi2 = self.kern.psi2(self.Z, self.X) + else: + if psi0 is None: + psi0 = kern.Kdiag(X) + if psi1 is None: + psi1 = kern.K(X, Z) + psi2 = None + self.psi0 = psi0 self.psi1 = psi1 self.psi2 = psi2 @@ -259,23 +262,20 @@ class SparseGPMiniBatch(SparseGP): psi0ni = psi0[ninan] psi1ni = 
psi1[ninan] psi2ni = psi2[ninan] + value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan) - posterior, log_marginal_likelihood, \ - grad_dict, current_values, value_indices = self._inner_parameters_changed( + posterior, log_marginal_likelihood, grad_dict = self._inner_parameters_changed( self.kern, self.X[ninan], self.Z, self.likelihood, self.Y_normalized[ninan][:, d], self.Y_metadata, Lm, dL_dKmm, - subset_indices=dict(outputs=d, samples=ninan, - dL_dpsi0=ninan, - dL_dpsi1=ninan), psi0=psi0ni, psi1=psi1ni, psi2=psi2ni) - self._inner_take_over_or_update(self.full_values, current_values, value_indices) - self._inner_values_update(current_values) + # Fill out the full values by adding in the apporpriate grad_dict + # values + self._inner_take_over_or_update(self.full_values, grad_dict, value_indices) + self._inner_values_update(grad_dict) # What is this for? - Lm = posterior.K_chol - dL_dKmm = grad_dict['dL_dKmm'] woodbury_inv[:, :, d] = posterior.woodbury_inv[:,:,None] woodbury_vector[:, d] = posterior.woodbury_vector self._log_marginal_likelihood += log_marginal_likelihood @@ -299,8 +299,7 @@ class SparseGPMiniBatch(SparseGP): woodbury_vector = self.posterior._woodbury_vector d = self.stochastics.d - posterior, log_marginal_likelihood, \ - grad_dict, self.full_values, _ = self._inner_parameters_changed( + posterior, log_marginal_likelihood, grad_dict= self._inner_parameters_changed( self.kern, self.X, self.Z, self.likelihood, self.Y_normalized[:, d], self.Y_metadata) @@ -308,7 +307,7 @@ class SparseGPMiniBatch(SparseGP): self._log_marginal_likelihood += log_marginal_likelihood - self._outer_values_update(self.full_values) + self._outer_values_update(self.grad_dict) woodbury_inv[:, :, d] = posterior.woodbury_inv[:, :, None] woodbury_vector[:, d] = posterior.woodbury_vector @@ -322,5 +321,5 @@ class SparseGPMiniBatch(SparseGP): elif self.stochastics: self._outer_loop_without_missing_data() else: - self.posterior, self._log_marginal_likelihood, self.grad_dict, self.full_values, _ = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata) - self._outer_values_update(self.full_values) + self.posterior, self._log_marginal_likelihood, self.grad_dict = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata) + self._outer_values_update(self.grad_dict) From afba8d5c5e4b8742f5f28af4963bab021810f4fa Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Fri, 28 Aug 2015 10:07:19 +0300 Subject: [PATCH 04/13] Made sparse gp work again --- GPy/models/bayesian_gplvm_minibatch.py | 2 ++ GPy/models/sparse_gp_minibatch.py | 18 ++++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/GPy/models/bayesian_gplvm_minibatch.py b/GPy/models/bayesian_gplvm_minibatch.py index b7e0c9b2..5058f17b 100644 --- a/GPy/models/bayesian_gplvm_minibatch.py +++ b/GPy/models/bayesian_gplvm_minibatch.py @@ -123,6 +123,8 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): def _outer_init_full_values(self): full_values = super(BayesianGPLVMMiniBatch, self)._outer_init_full_values() + full_values['dL_dpsi0'] = np.zeros(self.X.shape[0]) + full_values['dL_dpsi1'] = np.zeros((self.X.shape[0], self.Z.shape[0])) return full_values def parameters_changed(self): diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py index eab8d8f3..3065b260 100644 --- a/GPy/models/sparse_gp_minibatch.py +++ b/GPy/models/sparse_gp_minibatch.py @@ -214,8 +214,8 @@ class 
SparseGPMiniBatch(SparseGP): to initialize the gradients for the mean and the variance in order to have the full gradient for indexing) """ - return {'dL_dpsi0': np.zeros(self.X.shape[0]), - 'dL_dpsi1': np.zeros((self.X.shape[0], self.Z.shape[0]))} + return {'dL_dKdiag': np.zeros(self.X.shape[0]), + 'dL_dKnm': np.zeros((self.X.shape[0], self.Z.shape[0]))} def _outer_loop_for_missing_data(self): Lm = None @@ -243,10 +243,8 @@ class SparseGPMiniBatch(SparseGP): psi1 = self.kern.psi1(self.Z, self.X) psi2 = self.kern.psi2(self.Z, self.X) else: - if psi0 is None: - psi0 = kern.Kdiag(X) - if psi1 is None: - psi1 = kern.K(X, Z) + psi0 = self.kern.Kdiag(self.X) + psi1 = self.kern.K(self.X, self.Z) psi2 = None self.psi0 = psi0 @@ -261,8 +259,12 @@ class SparseGPMiniBatch(SparseGP): psi0ni = psi0[ninan] psi1ni = psi1[ninan] - psi2ni = psi2[ninan] - value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan) + if self.has_uncertain_inputs(): + psi2ni = psi2[ninan] + value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan) + else: + psi2ni = None + value_indices = dict(outputs=d, samples=ninan, dL_dKdiag=ninan, dL_dKnm=ninan) posterior, log_marginal_likelihood, grad_dict = self._inner_parameters_changed( self.kern, self.X[ninan], From e12575c0c6eb224d01c84414a25c87a00c971762 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Fri, 28 Aug 2015 11:47:10 +0300 Subject: [PATCH 05/13] Gradients for X seem to match --- GPy/models/bayesian_gplvm_minibatch.py | 49 +++++++++++++++++++++----- GPy/models/sparse_gp_minibatch.py | 2 +- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/GPy/models/bayesian_gplvm_minibatch.py b/GPy/models/bayesian_gplvm_minibatch.py index 5058f17b..8b8c6feb 100644 --- a/GPy/models/bayesian_gplvm_minibatch.py +++ b/GPy/models/bayesian_gplvm_minibatch.py @@ -83,6 +83,35 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None,psi0=None, psi1=None, psi2=None, **kw): posterior, log_marginal_likelihood, grad_dict = super(BayesianGPLVMMiniBatch, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, psi0=psi0, psi1=psi1, psi2=psi2, **kw) + kl_fctr = self.kl_factr + if self.has_uncertain_inputs(): + if self.missing_data: + d = self.output_dim + log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X)/d + else: + log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X) + + X.mean.gradient[:] = 0 + X.variance.gradient[:] = 0 + + self.variational_prior.update_gradients_KL(X) + if self.missing_data: + grad_dict['meangrad'] = kl_fctr*X.mean.gradient/d + grad_dict['vargrad'] = kl_fctr*X.variance.gradient/d + else: + grad_dict['meangrad'] = kl_fctr*X.mean.gradient + grad_dict['vargrad'] = kl_fctr*X.variance.gradient + + #if subset_indices is not None: + #value_indices['meangrad'] = subset_indices['samples'] + #value_indices['vargrad'] = subset_indices['samples'] + + #grad_dict['meangrad'] = kl_fctr*self.X.mean.gradient + #grad_dict['vargrad'] = kl_fctr*self.X.variance.gradient + #self.X.mean.gradient[:] = 0 + #self.X.variance.gradient[:] = 0 + #import ipdb; ipdb.set_trace() # XXX BREAKPOINT + return posterior, log_marginal_likelihood, grad_dict def _outer_values_update(self, full_values): @@ -92,28 +121,28 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): """ super(BayesianGPLVMMiniBatch, self)._outer_values_update(full_values) if self.has_uncertain_inputs(): - 
full_values['meangrad'], full_values['vargrad'] = self.kern.gradients_qX_expectations( + meangrad_tmp, vargrad_tmp = self.kern.gradients_qX_expectations( variational_posterior=self.X, Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'], dL_dpsi1=full_values['dL_dpsi1'], dL_dpsi2=full_values['dL_dpsi2'], psi0=self.psi0, psi1=self.psi1, psi2=self.psi2) + full_values['meangrad'] += meangrad_tmp + full_values['vargrad'] += vargrad_tmp else: full_values['Xgrad'] = self.kern.gradients_X(full_values['dL_dKnm'], self.X, self.Z) full_values['Xgrad'] += self.kern.gradients_X_diag(full_values['dL_dKdiag'], self.X) - kl_fctr = self.kl_factr - if self.has_uncertain_inputs(): - self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X) + #kl_fctr = self.kl_factr + #if self.has_uncertain_inputs(): + #self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X) # Subsetting Variational Posterior objects, makes the gradients # empty. We need them to be 0 though: - self.X.mean.gradient[:] = 0 - self.X.variance.gradient[:] = 0 + #self.X.mean.gradient[:] = 0 + #self.X.variance.gradient[:] = 0 - self.variational_prior.update_gradients_KL(self.X) - full_values['meangrad'] += kl_fctr*self.X.mean.gradient - full_values['vargrad'] += kl_fctr*self.X.variance.gradient + #self.variational_prior.update_gradients_KL(self.X) if self.has_uncertain_inputs(): self.X.mean.gradient = full_values['meangrad'] @@ -123,6 +152,8 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): def _outer_init_full_values(self): full_values = super(BayesianGPLVMMiniBatch, self)._outer_init_full_values() + full_values['meangrad'] = np.zeros((self.X.shape[0], self.X.shape[1])) + full_values['vargrad'] = np.zeros((self.X.shape[0], self.X.shape[1])) full_values['dL_dpsi0'] = np.zeros(self.X.shape[0]) full_values['dL_dpsi1'] = np.zeros((self.X.shape[0], self.Z.shape[0])) return full_values diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py index 3065b260..a5676e9d 100644 --- a/GPy/models/sparse_gp_minibatch.py +++ b/GPy/models/sparse_gp_minibatch.py @@ -261,7 +261,7 @@ class SparseGPMiniBatch(SparseGP): psi1ni = psi1[ninan] if self.has_uncertain_inputs(): psi2ni = psi2[ninan] - value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan) + value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan, meangrad=ninan, vargrad=ninan) else: psi2ni = None value_indices = dict(outputs=d, samples=ninan, dL_dKdiag=ninan, dL_dKnm=ninan) From b89196c7cf25c4b4266133045762f3216ad998b6 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Fri, 28 Aug 2015 14:03:14 +0300 Subject: [PATCH 06/13] Checking kerngrads full_value --- GPy/models/bayesian_gplvm_minibatch.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/GPy/models/bayesian_gplvm_minibatch.py b/GPy/models/bayesian_gplvm_minibatch.py index 8b8c6feb..85b1e44f 100644 --- a/GPy/models/bayesian_gplvm_minibatch.py +++ b/GPy/models/bayesian_gplvm_minibatch.py @@ -120,6 +120,8 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): E.g. set the gradients of parameters, etc. 
""" super(BayesianGPLVMMiniBatch, self)._outer_values_update(full_values) + print full_values['kerngrad'] + import ipdb; ipdb.set_trace() # XXX BREAKPOINT if self.has_uncertain_inputs(): meangrad_tmp, vargrad_tmp = self.kern.gradients_qX_expectations( variational_posterior=self.X, From 1a2b194aa27cfe8169a2d716dc6b4c6b78dd020f Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Fri, 28 Aug 2015 14:23:44 +0300 Subject: [PATCH 07/13] Removed comment --- GPy/models/bayesian_gplvm_minibatch.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/GPy/models/bayesian_gplvm_minibatch.py b/GPy/models/bayesian_gplvm_minibatch.py index 85b1e44f..8b8c6feb 100644 --- a/GPy/models/bayesian_gplvm_minibatch.py +++ b/GPy/models/bayesian_gplvm_minibatch.py @@ -120,8 +120,6 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): E.g. set the gradients of parameters, etc. """ super(BayesianGPLVMMiniBatch, self)._outer_values_update(full_values) - print full_values['kerngrad'] - import ipdb; ipdb.set_trace() # XXX BREAKPOINT if self.has_uncertain_inputs(): meangrad_tmp, vargrad_tmp = self.kern.gradients_qX_expectations( variational_posterior=self.X, From c83f56723ec7ee3e5725a8db941b0edcb045e043 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Sun, 30 Aug 2015 22:22:27 +0300 Subject: [PATCH 08/13] passing psi statistics --- GPy/models/sparse_gp_minibatch.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py index a5676e9d..c9d13e6b 100644 --- a/GPy/models/sparse_gp_minibatch.py +++ b/GPy/models/sparse_gp_minibatch.py @@ -167,21 +167,27 @@ class SparseGPMiniBatch(SparseGP): dL_dKmm = full_values['dL_dKmm'] self.kern.update_gradients_full(dL_dKmm, self.Z, None) full_values['kerngrad'] = self.kern.gradient.copy() - self.kern.update_gradients_expectations(variational_posterior=self.X, - Z=self.Z, - dL_dpsi0=full_values['dL_dpsi0'], - dL_dpsi1=full_values['dL_dpsi1'], - dL_dpsi2=full_values['dL_dpsi2']) + self.kern.update_gradients_expectations( + variational_posterior=self.X, + Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'], + dL_dpsi1=full_values['dL_dpsi1'], + dL_dpsi2=full_values['dL_dpsi2'], + psi0=self.psi0, psi1=self.psi1, psi2=self.psi2) + #self.kern.update_gradients_expectations(variational_posterior=self.X, + #Z=self.Z, + #dL_dpsi0=full_values['dL_dpsi0'], + #dL_dpsi1=full_values['dL_dpsi1'], + #dL_dpsi2=full_values['dL_dpsi2']) full_values['kerngrad'] += self.kern.gradient #gradients wrt Z full_values['Zgrad'] = self.kern.gradients_X(dL_dKmm, self.Z) full_values['Zgrad'] += self.kern.gradients_Z_expectations( - full_values['dL_dpsi0'], - full_values['dL_dpsi1'], - full_values['dL_dpsi2'], - Z=self.Z, - variational_posterior=self.X) + variational_posterior=self.X, + Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'], + dL_dpsi1=full_values['dL_dpsi1'], + dL_dpsi2=full_values['dL_dpsi2'], + psi0=self.psi0, psi1=self.psi1, psi2=self.psi2) else: #gradients wrt kernel self.kern.update_gradients_diag(full_values['dL_dKdiag'], self.X) From 3818aa37452bea67e670616a400a363ae2c6edfb Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Mon, 31 Aug 2015 14:13:35 +0300 Subject: [PATCH 09/13] Try calculating dL_dpsi1*psi1 individually for each dimension as we go along --- GPy/kern/_src/kern.py | 12 ++++++------ GPy/kern/_src/kernel_slice_operations.py | 12 ++++++------ GPy/kern/_src/psi_comp/__init__.py | 4 ++-- GPy/kern/_src/psi_comp/rbf_psi_comp.py | 16 +++++++++------- GPy/kern/_src/rbf.py | 12 ++++++------ 
GPy/models/bayesian_gplvm_minibatch.py | 8 +++++++- GPy/models/sparse_gp_minibatch.py | 14 +++++++++++--- 7 files changed, 47 insertions(+), 31 deletions(-) diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index fb12030b..78488628 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -117,7 +117,7 @@ class Kern(Parameterized): raise NotImplementedError def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): """ Set the gradients of all parameters when doing inference with uncertain inputs, using expectations of the kernel. @@ -129,26 +129,26 @@ class Kern(Parameterized): dL_dpsi2 * dpsi2_d{theta_i} """ dtheta = self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=psi0, psi1=psi1, psi2=psi2)[0] + psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)[0] self.gradient[:] = dtheta def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): """ Returns the derivative of the objective wrt Z, using the chain rule through the expectation variables. """ return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=psi0, psi1=psi1, psi2=psi2)[1] + psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)[1] def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): """ Compute the gradients wrt the parameters of the variational distruibution q(X), chain-ruling via the expectations of the kernel """ return self.psicomp.psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=psi0, psi1=psi1, psi2=psi2)[2:] + psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)[2:] def plot(self, x=None, fignum=None, ax=None, title=None, plot_limits=None, resolution=None, **mpl_kwargs): """ diff --git a/GPy/kern/_src/kernel_slice_operations.py b/GPy/kern/_src/kernel_slice_operations.py index a2058ca8..49899512 100644 --- a/GPy/kern/_src/kernel_slice_operations.py +++ b/GPy/kern/_src/kernel_slice_operations.py @@ -117,30 +117,30 @@ def _slice_psi(f): def _slice_update_gradients_expectations(f): @wraps(f) def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): with _Slice_wrap(self, Z, variational_posterior) as s: ret = f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2, - psi0=psi0, psi1=psi1, psi2=psi2) + psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2) return ret return wrap def _slice_gradients_Z_expectations(f): @wraps(f) def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): with _Slice_wrap(self, Z, variational_posterior) as s: ret = s.handle_return_array(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2, - psi0=psi0, psi1=psi1, psi2=psi2)) + psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)) return ret return wrap def _slice_gradients_qX_expectations(f): @wraps(f) def wrap(self, dL_dpsi0, 
dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): with _Slice_wrap(self, variational_posterior, Z) as s: ret = list(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X2, s.X, - psi0=psi0, psi1=psi1, psi2=psi2)) + psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)) r2 = ret[:2] ret[0] = s.handle_return_array(r2[0]) ret[1] = s.handle_return_array(r2[1]) diff --git a/GPy/kern/_src/psi_comp/__init__.py b/GPy/kern/_src/psi_comp/__init__.py index 6849ed81..088b514c 100644 --- a/GPy/kern/_src/psi_comp/__init__.py +++ b/GPy/kern/_src/psi_comp/__init__.py @@ -24,10 +24,10 @@ class PSICOMP_RBF(Pickleable): @Cache_this(limit=10, ignore_args=(0,1,2,3)) def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): if isinstance(variational_posterior, variational.NormalPosterior): return rbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior, - psi0=psi0, psi1=psi1, psi2=psi2) + psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2) elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior): return ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior) else: diff --git a/GPy/kern/_src/psi_comp/rbf_psi_comp.py b/GPy/kern/_src/psi_comp/rbf_psi_comp.py index de958a37..eaf2b04d 100644 --- a/GPy/kern/_src/psi_comp/rbf_psi_comp.py +++ b/GPy/kern/_src/psi_comp/rbf_psi_comp.py @@ -69,11 +69,11 @@ def __psi2computations(variance, lengthscale, Z, mu, S): return _psi2 def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): ARD = (len(lengthscale)!=1) - dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, psi1=psi1) - dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, psi2=psi2) + dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, psi1=psi1, Lpsi1=Lpsi1) + dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, psi2=psi2, Lpsi2=Lpsi2) dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2 @@ -87,7 +87,7 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscal return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS -def __psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, psi1=None): +def __psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, psi1=None, Lpsi1=None): """ dL_dpsi1 - NxM Z - MxQ @@ -108,7 +108,8 @@ def __psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, psi1=None): if psi1 is None: psi1 = _psi1computations(variance, lengthscale, Z, mu, S) - Lpsi1 = dL_dpsi1*psi1 + if Lpsi1 is None: + Lpsi1 = dL_dpsi1*psi1 Zmu = Z[None,:,:]-mu[:,None,:] # NxMxQ denom = 1./(S+lengthscale2) Zmu2_denom = np.square(Zmu)*denom[:,None,:] #NxMxQ @@ -120,7 +121,7 @@ def __psi1compDer(dL_dpsi1, 
variance, lengthscale, Z, mu, S, psi1=None): return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS -def __psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, psi2=None): +def __psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, psi2=None, Lpsi2=None): """ Z - MxQ mu - NxQ @@ -143,7 +144,8 @@ def __psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, psi2=None): if psi2 is None: psi2 = _psi2computations(variance, lengthscale, Z, mu, S) # NxMxM - Lpsi2 = dL_dpsi2*psi2 # dL_dpsi2 is MxM, using broadcast to multiply N out + if Lpsi2 is None: + Lpsi2 = dL_dpsi2*psi2 # dL_dpsi2 is MxM, using broadcast to multiply N out Lpsi2sum = np.einsum('nmo->n',Lpsi2) #N Lpsi2Z = np.einsum('nmo,oq->nq',Lpsi2,Z) #NxQ Lpsi2Z2 = np.einsum('nmo,oq,oq->nq',Lpsi2,Z,Z) #NxQ diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index 357c48ec..3ba782b1 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -59,16 +59,16 @@ class RBF(Stationary): return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior, return_psi2_n=self.return_psi2_n)[2] def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): - dL_dvar, dL_dlengscale = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2)[:2] + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): + dL_dvar, dL_dlengscale = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)[:2] self.variance.gradient = dL_dvar self.lengthscale.gradient = dL_dlengscale def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): - return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2)[2] + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): + return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)[2] def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, - psi0=None, psi1=None, psi2=None): - return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2)[3:] + psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): + return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior, psi0=psi0, psi1=psi1, psi2=psi2, Lpsi0=Lpsi0, Lpsi1=Lpsi1, Lpsi2=Lpsi2)[3:] diff --git a/GPy/models/bayesian_gplvm_minibatch.py b/GPy/models/bayesian_gplvm_minibatch.py index 8b8c6feb..0828ed6b 100644 --- a/GPy/models/bayesian_gplvm_minibatch.py +++ b/GPy/models/bayesian_gplvm_minibatch.py @@ -126,7 +126,8 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'], dL_dpsi1=full_values['dL_dpsi1'], dL_dpsi2=full_values['dL_dpsi2'], - psi0=self.psi0, psi1=self.psi1, psi2=self.psi2) + psi0=self.psi0, psi1=self.psi1, psi2=self.psi2, + Lpsi0=full_values['Lpsi0'], Lpsi1=full_values['Lpsi1'], Lpsi2=full_values['Lpsi2']) 
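# A minimal sketch of what the new Lpsi0/Lpsi1/Lpsi2 keyword arguments carry: the
# precomputed products dL_dpsi_i * psi_i (this patch later sets, e.g.,
# grad_dict['Lpsi1'] = grad_dict['dL_dpsi1']*psi1 in _inner_parameters_changed), which can
# then be accumulated over the per-output missing-data subsets instead of being recomputed
# from the full psi statistics. Shapes and variable names below are made up for
# illustration and are not the real GPy objects or call signatures.
import numpy as np
N, M, D = 6, 4, 3
psi1 = np.random.rand(N, M)               # psi1 computed once for all N samples
Lpsi1 = np.zeros((N, M))                  # running sum of dL_dpsi1 * psi1
for d in range(D):                        # loop over output dimensions
    present = np.random.rand(N) > 0.3     # stand-in for ~np.isnan(Y[:, d])
    dL_dpsi1_d = np.random.rand(present.sum(), M)
    Lpsi1[present] += dL_dpsi1_d * psi1[present]
# Lpsi1 then plays the role of the Lpsi1 argument handed to the derivative computations.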
full_values['meangrad'] += meangrad_tmp full_values['vargrad'] += vargrad_tmp else: @@ -156,6 +157,11 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): full_values['vargrad'] = np.zeros((self.X.shape[0], self.X.shape[1])) full_values['dL_dpsi0'] = np.zeros(self.X.shape[0]) full_values['dL_dpsi1'] = np.zeros((self.X.shape[0], self.Z.shape[0])) + full_values['dL_dpsi2'] = np.zeros((self.Z.shape[0], self.Z.shape[0])) + + full_values['Lpsi0'] = np.zeros(self.X.shape[0]) + full_values['Lpsi1'] = np.zeros((self.X.shape[0], self.Z.shape[0])) + full_values['Lpsi2'] = np.zeros((self.X.shape[0], self.Z.shape[0], self.Z.shape[0])) return full_values def parameters_changed(self): diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py index c9d13e6b..e13d8061 100644 --- a/GPy/models/sparse_gp_minibatch.py +++ b/GPy/models/sparse_gp_minibatch.py @@ -106,6 +106,10 @@ class SparseGPMiniBatch(SparseGP): posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, psi0=psi0, psi1=psi1, psi2=psi2_sum_n, **kwargs) + if self.has_uncertain_inputs(): + grad_dict['Lpsi0'] = grad_dict['dL_dpsi0']*psi0 + grad_dict['Lpsi1'] = grad_dict['dL_dpsi1']*psi1 + grad_dict['Lpsi2'] = grad_dict['dL_dpsi2']*psi2 return posterior, log_marginal_likelihood, grad_dict def _inner_take_over_or_update(self, full_values=None, current_values=None, value_indices=None): @@ -172,7 +176,8 @@ class SparseGPMiniBatch(SparseGP): Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'], dL_dpsi1=full_values['dL_dpsi1'], dL_dpsi2=full_values['dL_dpsi2'], - psi0=self.psi0, psi1=self.psi1, psi2=self.psi2) + psi0=self.psi0, psi1=self.psi1, psi2=self.psi2, + Lpsi0=full_values['Lpsi0'], Lpsi1=full_values['Lpsi1'], Lpsi2=full_values['Lpsi2']) #self.kern.update_gradients_expectations(variational_posterior=self.X, #Z=self.Z, #dL_dpsi0=full_values['dL_dpsi0'], @@ -187,7 +192,8 @@ class SparseGPMiniBatch(SparseGP): Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'], dL_dpsi1=full_values['dL_dpsi1'], dL_dpsi2=full_values['dL_dpsi2'], - psi0=self.psi0, psi1=self.psi1, psi2=self.psi2) + psi0=self.psi0, psi1=self.psi1, psi2=self.psi2, + Lpsi0=full_values['Lpsi0'], Lpsi1=full_values['Lpsi1'], Lpsi2=full_values['Lpsi2']) else: #gradients wrt kernel self.kern.update_gradients_diag(full_values['dL_dKdiag'], self.X) @@ -267,7 +273,9 @@ class SparseGPMiniBatch(SparseGP): psi1ni = psi1[ninan] if self.has_uncertain_inputs(): psi2ni = psi2[ninan] - value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan, meangrad=ninan, vargrad=ninan) + #value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan, meangrad=ninan, vargrad=ninan) + value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan, meangrad=ninan, vargrad=ninan, + Lpsi0=ninan, Lpsi1=ninan, Lpsi2=ninan) else: psi2ni = None value_indices = dict(outputs=d, samples=ninan, dL_dKdiag=ninan, dL_dKnm=ninan) From 50b9e4dc82d7efc42e1f43ab1ffd47c4afd231a9 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Tue, 1 Sep 2015 19:17:56 +0300 Subject: [PATCH 10/13] corrected caching for psi derivatives --- GPy/kern/_src/kern.py | 7 +------ GPy/kern/_src/psi_comp/rbf_psi_comp.py | 12 ++++++++++-- GPy/models/bayesian_gplvm_minibatch.py | 26 +++++++++----------------- GPy/models/sparse_gp_minibatch.py | 18 +++++++++--------- 4 files changed, 29 insertions(+), 34 deletions(-) diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index 78488628..77f4d563 100644 --- 
a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -58,12 +58,7 @@ class Kern(Parameterized): self._sliced_X = 0 self.useGPU = self._support_GPU and useGPU - self._return_psi2_n_flag = ObsAr(np.zeros(1)).astype(bool) - - #FIXME: temporary solution - from ...core.parameterization.lists_and_dicts import ObserverList - self._return_psi2_n_flag.observers = ObserverList() - self._return_psi2_n_flag._update_on = True + self._return_psi2_n_flag = ObsAr(np.zeros(1).astype(bool)) from .psi_comp import PSICOMP_GH self.psicomp = PSICOMP_GH() diff --git a/GPy/kern/_src/psi_comp/rbf_psi_comp.py b/GPy/kern/_src/psi_comp/rbf_psi_comp.py index eaf2b04d..9d85c93e 100644 --- a/GPy/kern/_src/psi_comp/rbf_psi_comp.py +++ b/GPy/kern/_src/psi_comp/rbf_psi_comp.py @@ -68,6 +68,7 @@ def __psi2computations(variance, lengthscale, Z, mu, S): _psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2) return _psi2 +@profile def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior, psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): ARD = (len(lengthscale)!=1) @@ -121,6 +122,7 @@ def __psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, psi1=None, Lpsi1=No return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS +@profile def __psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, psi2=None, Lpsi2=None): """ Z - MxQ @@ -156,8 +158,14 @@ def __psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, psi2=None, Lpsi2=No _dL_dvar = Lpsi2sum.sum()*2/variance _dL_dmu = (-2*denom) * (mu*Lpsi2sum[:,None]-Lpsi2Zhat) _dL_dS = (2*np.square(denom))*(np.square(mu)*Lpsi2sum[:,None]-2*mu*Lpsi2Zhat+Lpsi2Zhat2) - denom*Lpsi2sum[:,None] - _dL_dZ = -np.einsum('nmo,oq->oq',Lpsi2,Z)/lengthscale2+np.einsum('nmo,oq->mq',Lpsi2,Z)/lengthscale2+ \ - 2*np.einsum('nmo,nq,nq->mq',Lpsi2,mu,denom) - np.einsum('nmo,nq,mq->mq',Lpsi2,denom,Z) - np.einsum('nmo,oq,nq->mq',Lpsi2,Z,denom) + _dL_dZ1 = -np.einsum('nmo,oq->oq',Lpsi2,Z)/lengthscale2 + _dL_dZ2 = np.einsum('nmo,oq->mq',Lpsi2,Z)/lengthscale2 + _dL_dZ3 = 2*np.einsum('nmo,nq,nq->mq',Lpsi2,mu,denom) + _dL_dZ4 = - np.einsum('nmo,nq,mq->mq',Lpsi2,denom,Z) + _dL_dZ5 = - np.einsum('nmo,oq,nq->mq',Lpsi2,Z,denom) + _dL_dZ = _dL_dZ1 + _dL_dZ2 + _dL_dZ3 + _dL_dZ4 + _dL_dZ5 + #_dL_dZ = -np.einsum('nmo,oq->oq',Lpsi2,Z)/lengthscale2+np.einsum('nmo,oq->mq',Lpsi2,Z)/lengthscale2+ \ + #2*np.einsum('nmo,nq,nq->mq',Lpsi2,mu,denom) - np.einsum('nmo,nq,mq->mq',Lpsi2,denom,Z) - np.einsum('nmo,oq,nq->mq',Lpsi2,Z,denom) _dL_dl = 2*lengthscale* ((S/lengthscale2*denom+np.square(mu*denom))*Lpsi2sum[:,None]+(Lpsi2Z2-Lpsi2Z2p)/(2*np.square(lengthscale2))- (2*mu*denom2)*Lpsi2Zhat+denom2*Lpsi2Zhat2).sum(axis=0) diff --git a/GPy/models/bayesian_gplvm_minibatch.py b/GPy/models/bayesian_gplvm_minibatch.py index 0828ed6b..ab4cc909 100644 --- a/GPy/models/bayesian_gplvm_minibatch.py +++ b/GPy/models/bayesian_gplvm_minibatch.py @@ -9,6 +9,7 @@ from ..inference.latent_function_inference.var_dtc_parallel import VarDTC_miniba import logging from GPy.models.sparse_gp_minibatch import SparseGPMiniBatch from GPy.core.parameterization.param import Param +from GPy.core.parameterization.observable_array import ObsAr class BayesianGPLVMMiniBatch(SparseGPMiniBatch): """ @@ -134,17 +135,6 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): full_values['Xgrad'] = self.kern.gradients_X(full_values['dL_dKnm'], self.X, self.Z) full_values['Xgrad'] += self.kern.gradients_X_diag(full_values['dL_dKdiag'], self.X) - #kl_fctr = self.kl_factr - #if 
self.has_uncertain_inputs(): - #self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X) - - # Subsetting Variational Posterior objects, makes the gradients - # empty. We need them to be 0 though: - #self.X.mean.gradient[:] = 0 - #self.X.variance.gradient[:] = 0 - - #self.variational_prior.update_gradients_KL(self.X) - if self.has_uncertain_inputs(): self.X.mean.gradient = full_values['meangrad'] self.X.variance.gradient = full_values['vargrad'] @@ -155,13 +145,15 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): full_values = super(BayesianGPLVMMiniBatch, self)._outer_init_full_values() full_values['meangrad'] = np.zeros((self.X.shape[0], self.X.shape[1])) full_values['vargrad'] = np.zeros((self.X.shape[0], self.X.shape[1])) - full_values['dL_dpsi0'] = np.zeros(self.X.shape[0]) - full_values['dL_dpsi1'] = np.zeros((self.X.shape[0], self.Z.shape[0])) - full_values['dL_dpsi2'] = np.zeros((self.Z.shape[0], self.Z.shape[0])) - full_values['Lpsi0'] = np.zeros(self.X.shape[0]) - full_values['Lpsi1'] = np.zeros((self.X.shape[0], self.Z.shape[0])) - full_values['Lpsi2'] = np.zeros((self.X.shape[0], self.Z.shape[0], self.Z.shape[0])) + #FIXME Hack + full_values['dL_dpsi0'] = ObsAr(np.zeros(self.X.shape[0])) + full_values['dL_dpsi1'] = ObsAr(np.zeros((self.X.shape[0], self.Z.shape[0]))) + full_values['dL_dpsi2'] = ObsAr(np.zeros((self.Z.shape[0], self.Z.shape[0]))) + + full_values['Lpsi0'] = ObsAr(np.zeros(self.X.shape[0])) + full_values['Lpsi1'] = ObsAr(np.zeros((self.X.shape[0], self.Z.shape[0]))) + full_values['Lpsi2'] = ObsAr(np.zeros((self.X.shape[0], self.Z.shape[0], self.Z.shape[0]))) return full_values def parameters_changed(self): diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py index e13d8061..1eebeeca 100644 --- a/GPy/models/sparse_gp_minibatch.py +++ b/GPy/models/sparse_gp_minibatch.py @@ -147,7 +147,11 @@ class SparseGPMiniBatch(SparseGP): if np.isscalar(current_values[key]): full_values[key] += current_values[key] else: - full_values[key][index] += current_values[key] + from ..core.parameterization.observable_array import ObsAr + if isinstance(full_values[key], ObsAr): + full_values[key].values[index] += current_values[key] + else: + full_values[key][index] += current_values[key] else: full_values[key] = current_values[key] @@ -178,11 +182,6 @@ class SparseGPMiniBatch(SparseGP): dL_dpsi2=full_values['dL_dpsi2'], psi0=self.psi0, psi1=self.psi1, psi2=self.psi2, Lpsi0=full_values['Lpsi0'], Lpsi1=full_values['Lpsi1'], Lpsi2=full_values['Lpsi2']) - #self.kern.update_gradients_expectations(variational_posterior=self.X, - #Z=self.Z, - #dL_dpsi0=full_values['dL_dpsi0'], - #dL_dpsi1=full_values['dL_dpsi1'], - #dL_dpsi2=full_values['dL_dpsi2']) full_values['kerngrad'] += self.kern.gradient #gradients wrt Z @@ -251,9 +250,10 @@ class SparseGPMiniBatch(SparseGP): #Compute the psi statistics for N once, but don't sum out N in psi2 if self.has_uncertain_inputs(): self.kern.return_psi2_n = True - psi0 = self.kern.psi0(self.Z, self.X) - psi1 = self.kern.psi1(self.Z, self.X) - psi2 = self.kern.psi2(self.Z, self.X) + from ..core.parameterization.observable_array import ObsAr + psi0 = ObsAr(self.kern.psi0(self.Z, self.X)) + psi1 = ObsAr(self.kern.psi1(self.Z, self.X)) + psi2 = ObsAr(self.kern.psi2(self.Z, self.X)) else: psi0 = self.kern.Kdiag(self.X) psi1 = self.kern.K(self.X, self.Z) From 81ef734908411ddaec45467b133b5b7a5d44eda2 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Wed, 2 Sep 2015 10:46:30 +0300 Subject: [PATCH 11/13] 
Reindented, did some profiling which looks promising --- GPy/kern/_src/psi_comp/rbf_psi_comp.py | 2 -- GPy/models/bayesian_gplvm_minibatch.py | 19 +++++++++---------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/GPy/kern/_src/psi_comp/rbf_psi_comp.py b/GPy/kern/_src/psi_comp/rbf_psi_comp.py index 9d85c93e..3fa2de94 100644 --- a/GPy/kern/_src/psi_comp/rbf_psi_comp.py +++ b/GPy/kern/_src/psi_comp/rbf_psi_comp.py @@ -68,7 +68,6 @@ def __psi2computations(variance, lengthscale, Z, mu, S): _psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2) return _psi2 -@profile def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior, psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): ARD = (len(lengthscale)!=1) @@ -122,7 +121,6 @@ def __psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, psi1=None, Lpsi1=No return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS -@profile def __psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, psi2=None, Lpsi2=None): """ Z - MxQ diff --git a/GPy/models/bayesian_gplvm_minibatch.py b/GPy/models/bayesian_gplvm_minibatch.py index ab4cc909..5fd9af60 100644 --- a/GPy/models/bayesian_gplvm_minibatch.py +++ b/GPy/models/bayesian_gplvm_minibatch.py @@ -122,15 +122,15 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): """ super(BayesianGPLVMMiniBatch, self)._outer_values_update(full_values) if self.has_uncertain_inputs(): - meangrad_tmp, vargrad_tmp = self.kern.gradients_qX_expectations( - variational_posterior=self.X, - Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'], - dL_dpsi1=full_values['dL_dpsi1'], - dL_dpsi2=full_values['dL_dpsi2'], - psi0=self.psi0, psi1=self.psi1, psi2=self.psi2, - Lpsi0=full_values['Lpsi0'], Lpsi1=full_values['Lpsi1'], Lpsi2=full_values['Lpsi2']) - full_values['meangrad'] += meangrad_tmp - full_values['vargrad'] += vargrad_tmp + meangrad_tmp, vargrad_tmp = self.kern.gradients_qX_expectations( + variational_posterior=self.X, + Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'], + dL_dpsi1=full_values['dL_dpsi1'], + dL_dpsi2=full_values['dL_dpsi2'], + psi0=self.psi0, psi1=self.psi1, psi2=self.psi2, + Lpsi0=full_values['Lpsi0'], Lpsi1=full_values['Lpsi1'], Lpsi2=full_values['Lpsi2']) + full_values['meangrad'] += meangrad_tmp + full_values['vargrad'] += vargrad_tmp else: full_values['Xgrad'] = self.kern.gradients_X(full_values['dL_dKnm'], self.X, self.Z) full_values['Xgrad'] += self.kern.gradients_X_diag(full_values['dL_dKdiag'], self.X) @@ -146,7 +146,6 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): full_values['meangrad'] = np.zeros((self.X.shape[0], self.X.shape[1])) full_values['vargrad'] = np.zeros((self.X.shape[0], self.X.shape[1])) - #FIXME Hack full_values['dL_dpsi0'] = ObsAr(np.zeros(self.X.shape[0])) full_values['dL_dpsi1'] = ObsAr(np.zeros((self.X.shape[0], self.Z.shape[0]))) full_values['dL_dpsi2'] = ObsAr(np.zeros((self.Z.shape[0], self.Z.shape[0]))) From b73932c35034bbc5a37cc374edd070f18a0d42ca Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Thu, 3 Sep 2015 13:32:59 +0300 Subject: [PATCH 12/13] Small edits for linear kernel --- GPy/kern/_src/psi_comp/linear_psi_comp.py | 2 +- GPy/models/sparse_gp_minibatch.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/GPy/kern/_src/psi_comp/linear_psi_comp.py b/GPy/kern/_src/psi_comp/linear_psi_comp.py index cd5d8fa0..a24b263a 100644 --- a/GPy/kern/_src/psi_comp/linear_psi_comp.py +++ b/GPy/kern/_src/psi_comp/linear_psi_comp.py @@ -22,7 +22,7 @@ def 
psicomputations(variance, Z, variational_posterior, return_psi2_n=False): psi0 = (variance*(np.square(mu)+S)).sum(axis=1) psi1 = np.dot(mu,(variance*Z).T) - if sum_N_psi2: + if return_psi2_n: psi2 = np.dot(S.sum(axis=0)*np.square(variance)*Z,Z.T)+ tdot(psi1.T) else: raise NotImplementedError diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py index 1eebeeca..92b2baf5 100644 --- a/GPy/models/sparse_gp_minibatch.py +++ b/GPy/models/sparse_gp_minibatch.py @@ -303,6 +303,8 @@ class SparseGPMiniBatch(SparseGP): self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=posterior._K, mean=None, cov=None, K_chol=posterior.K_chol) self._outer_values_update(self.full_values) + if self.has_uncertain_inputs(): + self.kern.return_psi2_n = False def _outer_loop_without_missing_data(self): self._log_marginal_likelihood = 0 From cb024f0bf834f707b3afa26c5fcdaa0a1f88144b Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Fri, 4 Sep 2015 10:33:32 +0100 Subject: [PATCH 13/13] [spgp minibatch] added new routine for psi NxMxM, much faster, little bigger mem footbprint --- GPy/inference/optimization/stochastics.py | 37 ++++---- GPy/kern/_src/kern.py | 18 +--- GPy/kern/_src/rbf.py | 5 +- GPy/models/bayesian_gplvm_minibatch.py | 78 ++++++---------- GPy/models/sparse_gp_minibatch.py | 109 ++++++++++------------ 5 files changed, 101 insertions(+), 146 deletions(-) diff --git a/GPy/inference/optimization/stochastics.py b/GPy/inference/optimization/stochastics.py index f7ad1245..0fc488a2 100644 --- a/GPy/inference/optimization/stochastics.py +++ b/GPy/inference/optimization/stochastics.py @@ -43,7 +43,7 @@ class SparseGPMissing(StochasticStorage): np.set_printoptions(threshold='nan') for d in range(self.Y.shape[1]): inan = np.isnan(self.Y)[:, d] - arr_str = np.array2string(~inan, np.inf, 0, True, '', formatter={'bool':lambda x: '1' if x else '0'}) + arr_str = np.array2string(inan, np.inf, 0, True, '', formatter={'bool':lambda x: '1' if x else '0'}) try: bdict[arr_str][0].append(d) except: @@ -56,35 +56,36 @@ class SparseGPStochastics(StochasticStorage): For the sparse gp we need to store the dimension we are in, and the indices corresponding to those """ - def __init__(self, model, batchsize=1): + def __init__(self, model, batchsize=1, missing_data=True): self.batchsize = batchsize self.output_dim = model.Y.shape[1] self.Y = model.Y_normalized + self.missing_data = missing_data self.reset() self.do_stochastics() def do_stochastics(self): + import numpy as np if self.batchsize == 1: self.current_dim = (self.current_dim+1)%self.output_dim - self.d = [[[self.current_dim], np.isnan(self.Y[:, self.d])]] + self.d = [[[self.current_dim], np.isnan(self.Y[:, self.current_dim]) if self.missing_data else None]] else: - import numpy as np self.d = np.random.choice(self.output_dim, size=self.batchsize, replace=False) bdict = {} - opt = np.get_printoptions() - np.set_printoptions(threshold='nan') - for d in self.d: - inan = np.isnan(self.Y[:, d]) - arr_str = int(np.array2string(inan, - np.inf, 0, - True, '', - formatter={'bool':lambda x: '1' if x else '0'}), 2) - try: - bdict[arr_str][0].append(d) - except: - bdict[arr_str] = [[d], ~inan] - np.set_printoptions(**opt) - self.d = bdict.values() + if self.missing_data: + opt = np.get_printoptions() + np.set_printoptions(threshold='nan') + for d in self.d: + inan = np.isnan(self.Y[:, d]) + arr_str = np.array2string(inan,np.inf, 0,True, '',formatter={'bool':lambda x: '1' if x else '0'}) + try: + bdict[arr_str][0].append(d) + 
except: + bdict[arr_str] = [[d], ~inan] + np.set_printoptions(**opt) + self.d = bdict.values() + else: + self.d = [[self.d, None]] def reset(self): self.current_dim = -1 diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index 77f4d563..f6971faf 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -58,24 +58,10 @@ class Kern(Parameterized): self._sliced_X = 0 self.useGPU = self._support_GPU and useGPU - self._return_psi2_n_flag = ObsAr(np.zeros(1).astype(bool)) from .psi_comp import PSICOMP_GH self.psicomp = PSICOMP_GH() - @property - def return_psi2_n(self): - """ - Flag whether to pass back psi2 as NxMxM or MxM, by summing out N. - """ - return self._return_psi2_n_flag[0] - @return_psi2_n.setter - def return_psi2_n(self, val): - def visit(self): - if isinstance(self, Kern): - self._return_psi2_n_flag[0]=val - self.traverse(visit) - @Cache_this(limit=20) def _slice_X(self, X): return X[:, self.active_dims] @@ -97,7 +83,9 @@ class Kern(Parameterized): def psi1(self, Z, variational_posterior): return self.psicomp.psicomputations(self, Z, variational_posterior)[1] def psi2(self, Z, variational_posterior): - return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=self.return_psi2_n)[2] + return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=False)[2] + def psi2n(self, Z, variational_posterior): + return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=True)[2] def gradients_X(self, dL_dK, X, X2): raise NotImplementedError def gradients_X_diag(self, dL_dKdiag, X): diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index 3ba782b1..145b9bfd 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -56,7 +56,10 @@ class RBF(Stationary): return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[1] def psi2(self, Z, variational_posterior): - return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior, return_psi2_n=self.return_psi2_n)[2] + return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior, return_psi2_n=False)[2] + + def psi2n(self, Z, variational_posterior): + return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior, return_psi2_n=True)[2] def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None): diff --git a/GPy/models/bayesian_gplvm_minibatch.py b/GPy/models/bayesian_gplvm_minibatch.py index 5fd9af60..1d06f07f 100644 --- a/GPy/models/bayesian_gplvm_minibatch.py +++ b/GPy/models/bayesian_gplvm_minibatch.py @@ -81,38 +81,9 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): """Get the gradients of the posterior distribution of X in its specific form.""" return X.mean.gradient, X.variance.gradient - def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None,psi0=None, psi1=None, psi2=None, **kw): + def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, psi0=None, psi1=None, psi2=None, **kw): posterior, log_marginal_likelihood, grad_dict = super(BayesianGPLVMMiniBatch, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, psi0=psi0, psi1=psi1, psi2=psi2, **kw) - kl_fctr = self.kl_factr - if self.has_uncertain_inputs(): - if self.missing_data: - d = self.output_dim - log_marginal_likelihood -= 
kl_fctr*self.variational_prior.KL_divergence(X)/d - else: - log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X) - - X.mean.gradient[:] = 0 - X.variance.gradient[:] = 0 - - self.variational_prior.update_gradients_KL(X) - if self.missing_data: - grad_dict['meangrad'] = kl_fctr*X.mean.gradient/d - grad_dict['vargrad'] = kl_fctr*X.variance.gradient/d - else: - grad_dict['meangrad'] = kl_fctr*X.mean.gradient - grad_dict['vargrad'] = kl_fctr*X.variance.gradient - - #if subset_indices is not None: - #value_indices['meangrad'] = subset_indices['samples'] - #value_indices['vargrad'] = subset_indices['samples'] - - #grad_dict['meangrad'] = kl_fctr*self.X.mean.gradient - #grad_dict['vargrad'] = kl_fctr*self.X.variance.gradient - #self.X.mean.gradient[:] = 0 - #self.X.variance.gradient[:] = 0 - #import ipdb; ipdb.set_trace() # XXX BREAKPOINT - return posterior, log_marginal_likelihood, grad_dict def _outer_values_update(self, full_values): @@ -127,36 +98,41 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch): Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'], dL_dpsi1=full_values['dL_dpsi1'], dL_dpsi2=full_values['dL_dpsi2'], - psi0=self.psi0, psi1=self.psi1, psi2=self.psi2, - Lpsi0=full_values['Lpsi0'], Lpsi1=full_values['Lpsi1'], Lpsi2=full_values['Lpsi2']) - full_values['meangrad'] += meangrad_tmp - full_values['vargrad'] += vargrad_tmp - else: - full_values['Xgrad'] = self.kern.gradients_X(full_values['dL_dKnm'], self.X, self.Z) - full_values['Xgrad'] += self.kern.gradients_X_diag(full_values['dL_dKdiag'], self.X) + psi0=self.psi0, psi1=self.psi1, psi2=self.psi2) + + kl_fctr = self.kl_factr + + self.X.mean.gradient[:] = 0 + self.X.variance.gradient[:] = 0 + self.variational_prior.update_gradients_KL(self.X) + + if self.missing_data or not self.stochastics: + self.X.mean.gradient = kl_fctr*self.X.mean.gradient + self.X.variance.gradient = kl_fctr*self.X.variance.gradient + else: + d = self.output_dim + self.X.mean.gradient = kl_fctr*self.X.mean.gradient*self.stochastics.batchsize/d + self.X.variance.gradient = kl_fctr*self.X.variance.gradient*self.stochastics.batchsize/d + self.X.mean.gradient += meangrad_tmp + self.X.variance.gradient += vargrad_tmp - if self.has_uncertain_inputs(): - self.X.mean.gradient = full_values['meangrad'] - self.X.variance.gradient = full_values['vargrad'] else: - self.X.gradient = full_values['Xgrad'] + self.X.gradient = self.kern.gradients_X(full_values['dL_dKnm'], self.X, self.Z) + self.X.gradient += self.kern.gradients_X_diag(full_values['dL_dKdiag'], self.X) def _outer_init_full_values(self): full_values = super(BayesianGPLVMMiniBatch, self)._outer_init_full_values() - full_values['meangrad'] = np.zeros((self.X.shape[0], self.X.shape[1])) - full_values['vargrad'] = np.zeros((self.X.shape[0], self.X.shape[1])) - - full_values['dL_dpsi0'] = ObsAr(np.zeros(self.X.shape[0])) - full_values['dL_dpsi1'] = ObsAr(np.zeros((self.X.shape[0], self.Z.shape[0]))) - full_values['dL_dpsi2'] = ObsAr(np.zeros((self.Z.shape[0], self.Z.shape[0]))) - - full_values['Lpsi0'] = ObsAr(np.zeros(self.X.shape[0])) - full_values['Lpsi1'] = ObsAr(np.zeros((self.X.shape[0], self.Z.shape[0]))) - full_values['Lpsi2'] = ObsAr(np.zeros((self.X.shape[0], self.Z.shape[0], self.Z.shape[0]))) return full_values def parameters_changed(self): super(BayesianGPLVMMiniBatch,self).parameters_changed() + kl_fctr = self.kl_factr + if self.missing_data or not self.stochastics: + self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X) + elif self.stochastics: + d = 
self.output_dim + self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)*self.stochastics.batchsize/d + if isinstance(self.inference_method, VarDTC_minibatch): return diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py index 92b2baf5..ecb53f0f 100644 --- a/GPy/models/sparse_gp_minibatch.py +++ b/GPy/models/sparse_gp_minibatch.py @@ -63,10 +63,10 @@ class SparseGPMiniBatch(SparseGP): if stochastic and missing_data: self.missing_data = True - self.stochastics = SparseGPStochastics(self, batchsize) + self.stochastics = SparseGPStochastics(self, batchsize, self.missing_data) elif stochastic and not missing_data: self.missing_data = False - self.stochastics = SparseGPStochastics(self, batchsize) + self.stochastics = SparseGPStochastics(self, batchsize, self.missing_data) elif missing_data: self.missing_data = True self.stochastics = SparseGPMissing(self) @@ -105,11 +105,6 @@ class SparseGPMiniBatch(SparseGP): psi2_sum_n = psi2.sum(axis=0) posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, psi0=psi0, psi1=psi1, psi2=psi2_sum_n, **kwargs) - - if self.has_uncertain_inputs(): - grad_dict['Lpsi0'] = grad_dict['dL_dpsi0']*psi0 - grad_dict['Lpsi1'] = grad_dict['dL_dpsi1']*psi1 - grad_dict['Lpsi2'] = grad_dict['dL_dpsi2']*psi2 return posterior, log_marginal_likelihood, grad_dict def _inner_take_over_or_update(self, full_values=None, current_values=None, value_indices=None): @@ -144,14 +139,10 @@ class SparseGPMiniBatch(SparseGP): else: index = slice(None) if key in full_values: - if np.isscalar(current_values[key]): + try: + full_values[key][index] += current_values[key] + except: full_values[key] += current_values[key] - else: - from ..core.parameterization.observable_array import ObsAr - if isinstance(full_values[key], ObsAr): - full_values[key].values[index] += current_values[key] - else: - full_values[key][index] += current_values[key] else: full_values[key] = current_values[key] @@ -174,43 +165,39 @@ class SparseGPMiniBatch(SparseGP): #gradients wrt kernel dL_dKmm = full_values['dL_dKmm'] self.kern.update_gradients_full(dL_dKmm, self.Z, None) - full_values['kerngrad'] = self.kern.gradient.copy() + kgrad = self.kern.gradient.copy() self.kern.update_gradients_expectations( variational_posterior=self.X, Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'], dL_dpsi1=full_values['dL_dpsi1'], dL_dpsi2=full_values['dL_dpsi2'], - psi0=self.psi0, psi1=self.psi1, psi2=self.psi2, - Lpsi0=full_values['Lpsi0'], Lpsi1=full_values['Lpsi1'], Lpsi2=full_values['Lpsi2']) - full_values['kerngrad'] += self.kern.gradient + psi0=self.psi0, psi1=self.psi1, psi2=self.psi2) + self.kern.gradient += kgrad + #gradients wrt Z - full_values['Zgrad'] = self.kern.gradients_X(dL_dKmm, self.Z) - full_values['Zgrad'] += self.kern.gradients_Z_expectations( + self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z) + self.Z.gradient += self.kern.gradients_Z_expectations( variational_posterior=self.X, Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'], dL_dpsi1=full_values['dL_dpsi1'], dL_dpsi2=full_values['dL_dpsi2'], - psi0=self.psi0, psi1=self.psi1, psi2=self.psi2, - Lpsi0=full_values['Lpsi0'], Lpsi1=full_values['Lpsi1'], Lpsi2=full_values['Lpsi2']) + psi0=self.psi0, psi1=self.psi1, psi2=self.psi2) else: #gradients wrt kernel self.kern.update_gradients_diag(full_values['dL_dKdiag'], self.X) - full_values['kerngrad'] = self.kern.gradient.copy() + kgrad = self.kern.gradient.copy() 
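# The kgrad stash above is needed because the kernel's update_gradients_* methods set
# self.kern.gradient rather than adding to it, so the contributions from successive calls
# are summed by hand and written back with self.kern.gradient += kgrad after the last call.
# A toy sketch of the pattern, using a hypothetical stand-in class rather than the real
# GPy kernel API:
import numpy as np

class _ToyKern(object):
    def __init__(self):
        self.gradient = np.zeros(2)
    def update_from_Knm(self):             # stand-in for update_gradients_full(dL_dKnm, X, Z)
        self.gradient = np.array([0.0, 2.0])
    def update_from_Kmm(self):             # stand-in for update_gradients_full(dL_dKmm, Z, None)
        self.gradient = np.array([1.0, 0.0])

kern = _ToyKern()
kern.update_from_Knm()
kgrad = kern.gradient.copy()               # stash the first contribution
kern.update_from_Kmm()                     # this call overwrites kern.gradient
kern.gradient += kgrad                     # total gradient is the sum of both contributions
assert np.allclose(kern.gradient, [1.0, 2.0])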
self.kern.update_gradients_full(full_values['dL_dKnm'], self.X, self.Z) - full_values['kerngrad'] += self.kern.gradient + kgrad += self.kern.gradient self.kern.update_gradients_full(full_values['dL_dKmm'], self.Z, None) - full_values['kerngrad'] += self.kern.gradient + self.kern.gradient += kgrad + #kgrad += self.kern.gradient + #gradients wrt Z - full_values['Zgrad'] = self.kern.gradients_X(full_values['dL_dKmm'], self.Z) - full_values['Zgrad'] += self.kern.gradients_X(full_values['dL_dKnm'].T, self.Z, self.X) + self.Z.gradient = self.kern.gradients_X(full_values['dL_dKmm'], self.Z) + self.Z.gradient += self.kern.gradients_X(full_values['dL_dKnm'].T, self.Z, self.X) self.likelihood.update_gradients(full_values['dL_dthetaL']) - full_values['likgrad'] = self.likelihood.gradient.copy() - - self.likelihood.gradient = full_values['likgrad'] - self.kern.gradient = full_values['kerngrad'] - self.Z.gradient = full_values['Zgrad'] def _outer_init_full_values(self): """ @@ -225,8 +212,15 @@ class SparseGPMiniBatch(SparseGP): to initialize the gradients for the mean and the variance in order to have the full gradient for indexing) """ - return {'dL_dKdiag': np.zeros(self.X.shape[0]), - 'dL_dKnm': np.zeros((self.X.shape[0], self.Z.shape[0]))} + retd = dict(dL_dKmm=np.zeros((self.Z.shape[0], self.Z.shape[0]))) + if self.has_uncertain_inputs(): + retd.update(dict(dL_dpsi0=np.zeros(self.X.shape[0]), + dL_dpsi1=np.zeros((self.X.shape[0], self.Z.shape[0])), + dL_dpsi2=np.zeros((self.X.shape[0], self.Z.shape[0], self.Z.shape[0])))) + else: + retd.update({'dL_dKdiag': np.zeros(self.X.shape[0]), + 'dL_dKnm': np.zeros((self.X.shape[0], self.Z.shape[0]))}) + return retd def _outer_loop_for_missing_data(self): Lm = None @@ -247,35 +241,17 @@ class SparseGPMiniBatch(SparseGP): message = m_f(-1) print(message, end=' ') - #Compute the psi statistics for N once, but don't sum out N in psi2 - if self.has_uncertain_inputs(): - self.kern.return_psi2_n = True - from ..core.parameterization.observable_array import ObsAr - psi0 = ObsAr(self.kern.psi0(self.Z, self.X)) - psi1 = ObsAr(self.kern.psi1(self.Z, self.X)) - psi2 = ObsAr(self.kern.psi2(self.Z, self.X)) - else: - psi0 = self.kern.Kdiag(self.X) - psi1 = self.kern.K(self.X, self.Z) - psi2 = None - - self.psi0 = psi0 - self.psi1 = psi1 - self.psi2 = psi2 - for d, ninan in self.stochastics.d: if not self.stochastics: print(' '*(len(message)) + '\r', end=' ') message = m_f(d) print(message, end=' ') - psi0ni = psi0[ninan] - psi1ni = psi1[ninan] + psi0ni = self.psi0[ninan] + psi1ni = self.psi1[ninan] if self.has_uncertain_inputs(): - psi2ni = psi2[ninan] - #value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan, meangrad=ninan, vargrad=ninan) - value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan, meangrad=ninan, vargrad=ninan, - Lpsi0=ninan, Lpsi1=ninan, Lpsi2=ninan) + psi2ni = self.psi2[ninan] + value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan, dL_dpsi2=ninan) else: psi2ni = None value_indices = dict(outputs=d, samples=ninan, dL_dKdiag=ninan, dL_dKnm=ninan) @@ -290,7 +266,7 @@ class SparseGPMiniBatch(SparseGP): # Fill out the full values by adding in the apporpriate grad_dict # values self._inner_take_over_or_update(self.full_values, grad_dict, value_indices) - self._inner_values_update(grad_dict) # What is this for? + self._inner_values_update(grad_dict) # What is this for? 
-> MRD woodbury_inv[:, :, d] = posterior.woodbury_inv[:,:,None] woodbury_vector[:, d] = posterior.woodbury_vector @@ -307,8 +283,6 @@ class SparseGPMiniBatch(SparseGP): self.kern.return_psi2_n = False def _outer_loop_without_missing_data(self): - self._log_marginal_likelihood = 0 - if self.posterior is None: woodbury_inv = np.zeros((self.num_inducing, self.num_inducing, self.output_dim)) woodbury_vector = np.zeros((self.num_inducing, self.output_dim)) @@ -316,14 +290,14 @@ class SparseGPMiniBatch(SparseGP): woodbury_inv = self.posterior._woodbury_inv woodbury_vector = self.posterior._woodbury_vector - d = self.stochastics.d + d = self.stochastics.d[0][0] posterior, log_marginal_likelihood, grad_dict= self._inner_parameters_changed( self.kern, self.X, self.Z, self.likelihood, self.Y_normalized[:, d], self.Y_metadata) self.grad_dict = grad_dict - self._log_marginal_likelihood += log_marginal_likelihood + self._log_marginal_likelihood = log_marginal_likelihood self._outer_values_update(self.grad_dict) @@ -334,6 +308,19 @@ class SparseGPMiniBatch(SparseGP): K=posterior._K, mean=None, cov=None, K_chol=posterior.K_chol) def parameters_changed(self): + #Compute the psi statistics for N once, but don't sum out N in psi2 + if self.has_uncertain_inputs(): + #psi0 = ObsAr(self.kern.psi0(self.Z, self.X)) + #psi1 = ObsAr(self.kern.psi1(self.Z, self.X)) + #psi2 = ObsAr(self.kern.psi2(self.Z, self.X)) + self.psi0 = self.kern.psi0(self.Z, self.X) + self.psi1 = self.kern.psi1(self.Z, self.X) + self.psi2 = self.kern.psi2n(self.Z, self.X) + else: + self.psi0 = self.kern.Kdiag(self.X) + self.psi1 = self.kern.K(self.X, self.Z) + self.psi2 = None + if self.missing_data: self._outer_loop_for_missing_data() elif self.stochastics: