mirror of https://github.com/SheffieldML/GPy.git
Merge branch 'devel' into gradients
commit 4e833a4f3a
13 changed files with 1570 additions and 41 deletions

@@ -1 +1 @@
-__version__ = "1.0.7"
+__version__ = "1.0.9"

@@ -148,14 +148,16 @@ class GP(Model):
             # LVM models
             if isinstance(self.X, VariationalPosterior):
                 assert isinstance(X, type(self.X)), "The given X must have the same type as the X in the model!"
+                index = self.X._parent_index_
                 self.unlink_parameter(self.X)
                 self.X = X
-                self.link_parameter(self.X)
+                self.link_parameter(self.X, index=index)
             else:
+                index = self.X._parent_index_
                 self.unlink_parameter(self.X)
                 from ..core import Param
-                self.X = Param('latent mean',X)
-                self.link_parameter(self.X)
+                self.X = Param('latent mean', X)
+                self.link_parameter(self.X, index=index)
         else:
             self.X = ObsAr(X)
         self.update_model(True)

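The `index=index` argument is the point of this hunk: `unlink_parameter` removes `self.X` from the model's ordered parameter list, and relinking without an index appends it at the end, silently permuting the flat parameter vector. Caching `_parent_index_` and passing it back restores X to its original slot. A minimal sketch of the mechanism with paramz (the names `m`, `p1`, `p2` are illustrative, not from this diff):

    from paramz import Parameterized, Param

    m = Parameterized('m')
    p1, p2 = Param('p1', 1.0), Param('p2', 2.0)
    m.link_parameters(p1, p2)                      # parameter order: [p1, p2]

    idx = p1._parent_index_                        # remember the original slot (0)
    m.unlink_parameter(p1)
    m.link_parameter(Param('p1', 3.0), index=idx)  # relinked at slot 0, not appended
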
@@ -340,7 +340,7 @@ def bgplvm_simulation(optimize=True, verbose=1,
                    gtol=.05)
     if plot:
         m.X.plot("BGPLVM Latent Space 1D")
-        m.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
+        m.kern.plot_ARD()
     return m

 def gplvm_simulation(optimize=True, verbose=1,

@@ -364,7 +364,7 @@ def gplvm_simulation(optimize=True, verbose=1,
                    gtol=.05)
     if plot:
         m.X.plot("BGPLVM Latent Space 1D")
-        m.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
+        m.kern.plot_ARD()
     return m
 def ssgplvm_simulation(optimize=True, verbose=1,
                        plot=True, plot_sim=False,

@@ -388,7 +388,7 @@ def ssgplvm_simulation(optimize=True, verbose=1,
                    gtol=.05)
     if plot:
         m.X.plot("SSGPLVM Latent Space 1D")
-        m.kern.plot_ARD('SSGPLVM Simulation ARD Parameters')
+        m.kern.plot_ARD()
     return m

 def bgplvm_simulation_missing_data(optimize=True, verbose=1,

@@ -418,7 +418,7 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1,
                    gtol=.05)
     if plot:
         m.X.plot("BGPLVM Latent Space 1D")
-        m.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
+        m.kern.plot_ARD()
     return m

 def bgplvm_simulation_missing_data_stochastics(optimize=True, verbose=1,

@@ -448,7 +448,7 @@ def bgplvm_simulation_missing_data_stochastics(optimize=True, verbose=1,
                    gtol=.05)
     if plot:
         m.X.plot("BGPLVM Latent Space 1D")
-        m.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
+        m.kern.plot_ARD()
     return m


@@ -469,7 +469,7 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
     m.optimize(messages=verbose, max_iters=8e3)
     if plot:
         m.X.plot("MRD Latent Space 1D")
-        m.plot_scales("MRD Scales")
+        m.plot_scales()
     return m

 def mrd_simulation_missing_data(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):

@@ -496,7 +496,7 @@ def mrd_simulation_missing_data(optimize=True, verbose=True, plot=True, plot_sim
     m.optimize('bfgs', messages=verbose, max_iters=8e3, gtol=.1)
     if plot:
         m.X.plot("MRD Latent Space 1D")
-        m.plot_scales("MRD Scales")
+        m.plot_scales()
     return m

 def brendan_faces(optimize=True, verbose=True, plot=True):

@@ -24,6 +24,10 @@ from .src.ODE_st import ODE_st
 from .src.ODE_t import ODE_t
 from .src.poly import Poly
 from .src.eq_ode2 import EQ_ODE2
+from .src.integral import Integral
+from .src.integral_limits import Integral_Limits
+from .src.multidimensional_integral_limits import Multidimensional_Integral_Limits
+from .src.eq_ode1 import EQ_ODE1
 from .src.trunclinear import TruncLinear,TruncLinear_inf
 from .src.splitKern import SplitKern,DEtime
 from .src.splitKern import DEtime as DiffGenomeKern

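With the four new imports above, the kernels added by this commit become reachable from the package namespace, for example (a sketch assuming a GPy build that contains this commit):

    import GPy

    k_lfm = GPy.kern.EQ_ODE1(input_dim=2, output_dim=2, rank=1)
    k_int = GPy.kern.Integral(input_dim=1, variances=1.0, lengthscale=1.0)
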
GPy/kern/src/eq_ode1.py (new file, 649 lines)

@@ -0,0 +1,649 @@
# Copyright (c) 2014, Cristian Guarnizo.
# Licensed under the BSD 3-clause license (see LICENSE.txt)

import numpy as np
from scipy.special import erf, erfcx
from .kern import Kern
from ...core.parameterization import Param
from paramz.transformations import Logexp
from paramz.caching import Cache_this

class EQ_ODE1(Kern):
    """
    Covariance function for a first order differential equation driven by an exponentiated quadratic covariance.

    The outputs of this kernel have the form
    .. math::
       \frac{\text{d}y_j}{\text{d}t} = \sum_{i=1}^R w_{j,i} u_i(t-\delta_j) - d_j y_j(t)

    where :math:`R` is the rank of the system, :math:`w_{j,i}` is the sensitivity of the :math:`j`th output to the :math:`i`th latent function, :math:`d_j` is the decay rate of the :math:`j`th output and :math:`u_i(t)` are independent latent Gaussian processes governed by an exponentiated quadratic covariance.

    :param output_dim: number of outputs driven by the latent functions.
    :type output_dim: int
    :param W: sensitivities of each output to the latent driving function.
    :type W: ndarray (output_dim x rank).
    :param rank: If rank is greater than 1 then there are assumed to be a total of rank latent forces independently driving the system, each with identical covariance.
    :type rank: int
    :param decay: decay rates for the first order system.
    :type decay: array of length output_dim.
    :param delay: delay between latent force and output response.
    :type delay: array of length output_dim.
    :param kappa: diagonal term that allows each latent output to have an independent component to the response.
    :type kappa: array of length output_dim.

    .. Note: see first order differential equation examples in GPy.examples.regression for some usage.
    """
    def __init__(self, input_dim=2, output_dim=1, rank=1, W=None, lengthscale=None, decay=None, active_dims=None, name='eq_ode1'):
        assert input_dim == 2, "only defined for 2 input dims"
        super(EQ_ODE1, self).__init__(input_dim=input_dim, active_dims=active_dims, name=name)

        self.rank = rank
        self.output_dim = output_dim

        if lengthscale is None:
            lengthscale = .5 + np.random.rand(self.rank)
        else:
            lengthscale = np.asarray(lengthscale)
            assert lengthscale.size in [1, self.rank], "Bad number of lengthscales"
            if lengthscale.size != self.rank:
                lengthscale = np.ones(self.rank)*lengthscale

        if W is None:
            W = .5*np.random.randn(self.output_dim, self.rank)/np.sqrt(self.rank)
        else:
            assert W.shape == (self.output_dim, self.rank)

        if decay is None:
            decay = np.ones(self.output_dim)
        else:
            decay = np.asarray(decay)
            assert decay.size in [1, self.output_dim], "Bad number of decay"
            if decay.size != self.output_dim:
                decay = np.ones(self.output_dim)*decay

        # if kappa is None:
        #     self.kappa = np.ones(self.output_dim)
        # else:
        #     kappa = np.asarray(kappa)
        #     assert kappa.size in [1, self.output_dim], "Bad number of kappa"
        #     if decay.size != self.output_dim:
        #         decay = np.ones(self.output_dim)*kappa

        #self.kappa = Param('kappa', kappa, Logexp())
        #self.delay = Param('delay', delay, Logexp())
        #self.is_normalized = True
        #self.is_stationary = False
        #self.gaussian_initial = False

        self.lengthscale = Param('lengthscale', lengthscale, Logexp())
        self.decay = Param('decay', decay, Logexp())
        self.W = Param('W', W)
        self.link_parameters(self.lengthscale, self.decay, self.W)

    @Cache_this(limit=3)
    def K(self, X, X2=None):
        #This way is not working, indexes are lost after using k._slice_X
        #index = np.asarray(X, dtype=np.int)
        #index = index.reshape(index.size,)
        if hasattr(X, 'values'):
            X = X.values
        index = np.int_(np.round(X[:, 1]))
        index = index.reshape(index.size,)
        X_flag = index[0] >= self.output_dim
        if X2 is None:
            if X_flag:
                #Calculate covariance function for the latent functions
                index -= self.output_dim
                return self._Kuu(X, index)
            else:
                raise NotImplementedError
        else:
            #This way is not working, indexes are lost after using k._slice_X
            #index2 = np.asarray(X2, dtype=np.int)
            #index2 = index2.reshape(index2.size,)
            if hasattr(X2, 'values'):
                X2 = X2.values
            index2 = np.int_(np.round(X2[:, 1]))
            index2 = index2.reshape(index2.size,)
            X2_flag = index2[0] >= self.output_dim
            #Calculate cross-covariance function
            if not X_flag and X2_flag:
                index2 -= self.output_dim
                return self._Kfu(X, index, X2, index2) #Kfu
            elif X_flag and not X2_flag:
                index -= self.output_dim
                return self._Kfu(X2, index2, X, index).T #Kuf
            elif X_flag and X2_flag:
                index -= self.output_dim
                index2 -= self.output_dim
                return self._Kusu(X, index, X2, index2) #Ku_s u
            else:
                raise NotImplementedError #Kf_s f

    #Calculate the covariance function for diag(Kff(X,X))
    def Kdiag(self, X):
        if hasattr(X, 'values'):
            index = np.int_(np.round(X[:, 1].values))
        else:
            index = np.int_(np.round(X[:, 1]))
        index = index.reshape(index.size,)
        X_flag = index[0] >= self.output_dim

        if X_flag: #Kuudiag
            return np.ones(X[:,0].shape)
        else: #Kffdiag
            kdiag = self._Kdiag(X)
            return np.sum(kdiag, axis=1)

    def _Kdiag(self, X):
        #This way is not working, indexes are lost after using k._slice_X
        #index = np.asarray(X, dtype=np.int)
        #index = index.reshape(index.size,)
        if hasattr(X, 'values'):
            X = X.values
        index = np.int_(X[:, 1])
        index = index.reshape(index.size,)

        #terms that move along t
        t = X[:, 0].reshape(X.shape[0], 1)
        d = np.unique(index) #Output Indexes
        B = self.decay.values[d]
        S = self.W.values[d, :]
        #Index transformation
        indd = np.arange(self.output_dim)
        indd[d] = np.arange(d.size)
        index = indd[index]

        B = B.reshape(B.size, 1)
        #Terms that move along q
        lq = self.lengthscale.values.reshape(1, self.rank)
        S2 = S*S
        kdiag = np.empty((t.size, ))

        #Dx1 terms
        c0 = (S2/B)*((.5*np.sqrt(np.pi))*lq)

        #DxQ terms
        nu = lq*(B*.5)
        nu2 = nu*nu
        #Nx1 terms
        gamt = -2.*B
        gamt = gamt[index]*t

        #NxQ terms
        t_lq = t/lq

        # Upsilon Calculations
        # Using wofz
        #erfnu = erf(nu)

        upm = np.exp(nu2[index, :] + lnDifErf(nu[index, :], t_lq + nu[index, :]))
        upm[t[:, 0] == 0, :] = 0.

        upv = np.exp(nu2[index, :] + gamt + lnDifErf(-t_lq + nu[index, :], nu[index, :]))
        upv[t[:, 0] == 0, :] = 0.

        #Covariance calculation
        #kdiag = np.sum(c0[index, :]*(upm-upv), axis=1)
        kdiag = c0[index, :]*(upm-upv)
        return kdiag

    def update_gradients_full(self, dL_dK, X, X2=None):
        #index = np.asarray(X, dtype=np.int)
        #index = index.reshape(index.size,)
        if hasattr(X, 'values'):
            X = X.values
        self.decay.gradient = np.zeros(self.decay.shape)
        self.W.gradient = np.zeros(self.W.shape)
        self.lengthscale.gradient = np.zeros(self.lengthscale.shape)
        index = np.int_(np.round(X[:, 1]))
        index = index.reshape(index.size,)
        X_flag = index[0] >= self.output_dim
        if X2 is None:
            if X_flag: #Kuu or Kmm
                index -= self.output_dim
                tmp = dL_dK*self._gkuu_lq(X, index)
                for q in np.unique(index):
                    ind = np.where(index == q)
                    self.lengthscale.gradient[q] = tmp[np.ix_(ind[0], ind[0])].sum()
            else:
                raise NotImplementedError
        else: #Kfu or Knm
            #index2 = np.asarray(X2, dtype=np.int)
            #index2 = index2.reshape(index2.size,)
            if hasattr(X2, 'values'):
                X2 = X2.values
            index2 = np.int_(np.round(X2[:, 1]))
            index2 = index2.reshape(index2.size,)
            X2_flag = index2[0] >= self.output_dim
            if not X_flag and X2_flag: #Kfu
                index2 -= self.output_dim
            else: #Kuf
                dL_dK = dL_dK.T #so we obtain dL_dKfu
                indtemp = index - self.output_dim
                Xtemp = X
                X = X2
                X2 = Xtemp
                index = index2
                index2 = indtemp
            glq, gSdq, gB = self._gkfu(X, index, X2, index2)
            tmp = dL_dK*glq
            for q in np.unique(index2):
                ind = np.where(index2 == q)
                self.lengthscale.gradient[q] = tmp[:, ind].sum()
            tmpB = dL_dK*gB
            tmp = dL_dK*gSdq
            for d in np.unique(index):
                ind = np.where(index == d)
                self.decay.gradient[d] = tmpB[ind, :].sum()
                for q in np.unique(index2):
                    ind2 = np.where(index2 == q)
                    self.W.gradient[d, q] = tmp[np.ix_(ind[0], ind2[0])].sum()

    def update_gradients_diag(self, dL_dKdiag, X):
        #index = np.asarray(X, dtype=np.int)
        #index = index.reshape(index.size,)
        if hasattr(X, 'values'):
            X = X.values
        self.decay.gradient = np.zeros(self.decay.shape)
        self.W.gradient = np.zeros(self.W.shape)
        self.lengthscale.gradient = np.zeros(self.lengthscale.shape)
        index = np.int_(X[:, 1])
        index = index.reshape(index.size,)

        glq, gS, gB = self._gkdiag(X, index)
        if dL_dKdiag.size == X.shape[0]:
            dL_dKdiag = np.reshape(dL_dKdiag, (index.size, 1))
        tmp = dL_dKdiag*glq
        self.lengthscale.gradient = tmp.sum(0)
        tmpB = dL_dKdiag*gB
        tmp = dL_dKdiag*gS
        for d in np.unique(index):
            ind = np.where(index == d)
            self.decay.gradient[d] = tmpB[ind, :].sum()
            self.W.gradient[d, :] = tmp[ind].sum(0)

    def gradients_X(self, dL_dK, X, X2=None):
        #index = np.asarray(X, dtype=np.int)
        #index = index.reshape(index.size,)
        if hasattr(X, 'values'):
            X = X.values
        index = np.int_(np.round(X[:, 1]))
        index = index.reshape(index.size,)
        X_flag = index[0] >= self.output_dim
        #If input_dim == 1, use this
        #gX = np.zeros((X.shape[0], 1))
        #Cheat to allow gradient for input_dim==2
        gX = np.zeros(X.shape)
        if X2 is None: #Kuu or Kmm
            if X_flag:
                index -= self.output_dim
                gX[:, 0] = 2.*(dL_dK*self._gkuu_X(X, index)).sum(0)
                return gX
            else:
                raise NotImplementedError
        else: #Kuf or Kmn
            #index2 = np.asarray(X2, dtype=np.int)
            #index2 = index2.reshape(index2.size,)
            if hasattr(X2, 'values'):
                X2 = X2.values
            index2 = np.int_(np.round(X2[:, 1]))
            index2 = index2.reshape(index2.size,)
            X2_flag = index2[0] >= self.output_dim
            if X_flag and not X2_flag: #gradient of Kuf(Z, X) wrt Z
                index -= self.output_dim
                gX[:, 0] = (dL_dK*self._gkfu_z(X2, index2, X, index).T).sum(1)
                return gX
            else:
                raise NotImplementedError

    #---------------------------------------#
    #           Helper functions            #
    #---------------------------------------#

    #Evaluation of squared exponential for LFM
    def _Kuu(self, X, index):
        index = index.reshape(index.size,)
        t = X[:, 0].reshape(X.shape[0],)
        lq = self.lengthscale.values.reshape(self.rank,)
        lq2 = lq*lq
        #Covariance matrix initialization
        kuu = np.zeros((t.size, t.size))
        #Assign 1. to diagonal terms
        kuu[np.diag_indices(t.size)] = 1.
        #Upper triangular indices
        indtri1, indtri2 = np.triu_indices(t.size, 1)
        #Block Diagonal indices among Upper Triangular indices
        ind = np.where(index[indtri1] == index[indtri2])
        indr = indtri1[ind]
        indc = indtri2[ind]
        r = t[indr] - t[indc]
        r2 = r*r
        #Calculation of covariance function
        kuu[indr, indc] = np.exp(-r2/lq2[index[indr]])
        #Completion of lower triangular part
        kuu[indc, indr] = kuu[indr, indc]
        return kuu

    def _Kusu(self, X, index, X2, index2):
        index = index.reshape(index.size,)
        index2 = index2.reshape(index2.size,)
        t = X[:, 0].reshape(X.shape[0], 1)
        t2 = X2[:, 0].reshape(1, X2.shape[0])
        lq = self.lengthscale.values.reshape(self.rank,)
        #Covariance matrix initialization
        kuu = np.zeros((t.size, t2.size))
        for q in range(self.rank):
            ind1 = index == q
            ind2 = index2 == q
            r = t[ind1]/lq[q] - t2[0, ind2]/lq[q]
            r2 = r*r
            #Calculation of covariance function
            kuu[np.ix_(ind1, ind2)] = np.exp(-r2)
        return kuu

    #Evaluation of cross-covariance function
    def _Kfu(self, X, index, X2, index2):
        #terms that move along t
        t = X[:, 0].reshape(X.shape[0], 1)
        d = np.unique(index) #Output Indexes
        B = self.decay.values[d]
        S = self.W.values[d, :]
        #Index transformation
        indd = np.arange(self.output_dim)
        indd[d] = np.arange(d.size)
        index = indd[index]
        #Output related variables must be column-wise
        B = B.reshape(B.size, 1)
        #Input related variables must be row-wise
        z = X2[:, 0].reshape(1, X2.shape[0])
        lq = self.lengthscale.values.reshape((1, self.rank))

        kfu = np.empty((t.size, z.size))

        #DxQ terms
        c0 = S*((.5*np.sqrt(np.pi))*lq)
        nu = B*(.5*lq)
        nu2 = nu**2
        #1xM terms
        z_lq = z/lq[0, index2]
        #NxM terms
        tz = t-z
        tz_lq = tz/lq[0, index2]

        # Upsilon Calculations
        fullind = np.ix_(index, index2)

        upsi = np.exp(nu2[fullind] - B[index]*tz + lnDifErf(-tz_lq + nu[fullind], z_lq + nu[fullind]))
        upsi[t[:, 0] == 0, :] = 0.
        #Covariance calculation
        kfu = c0[fullind]*upsi

        return kfu

    #Gradient of Kuu wrt lengthscale
    def _gkuu_lq(self, X, index):
        t = X[:, 0].reshape(X.shape[0],)
        index = index.reshape(X.shape[0],)
        lq = self.lengthscale.values.reshape(self.rank,)
        lq2 = lq*lq
        #Covariance matrix initialization
        glq = np.zeros((t.size, t.size))
        #Upper triangular indices
        indtri1, indtri2 = np.triu_indices(t.size, 1)
        #Block Diagonal indices among Upper Triangular indices
        ind = np.where(index[indtri1] == index[indtri2])
        indr = indtri1[ind]
        indc = indtri2[ind]
        r = t[indr] - t[indc]
        r2 = r*r
        r2_lq2 = r2/lq2[index[indr]]
        #Calculation of covariance function
        er2_lq2 = np.exp(-r2_lq2)
        #Gradient wrt lq
        c = 2.*r2_lq2/lq[index[indr]]
        glq[indr, indc] = er2_lq2*c
        #Complete the lower triangular
        glq[indc, indr] = glq[indr, indc]
        return glq

    #Careful: this derivative should be transposed
    def _gkuu_X(self, X, index): #Diagonal terms are always zero
        t = X[:, 0].reshape(X.shape[0],)
        index = index.reshape(index.size,)
        lq = self.lengthscale.values.reshape(self.rank,)
        lq2 = lq*lq
        #Covariance matrix initialization
        gt = np.zeros((t.size, t.size))
        #Upper triangular indices
        indtri1, indtri2 = np.triu_indices(t.size, 1) #Offset of 1 from the diagonal
        #Block Diagonal indices among Upper Triangular indices
        ind = np.where(index[indtri1] == index[indtri2])
        indr = indtri1[ind]
        indc = indtri2[ind]
        r = t[indr] - t[indc]
        r2 = r*r
        r2_lq2 = r2/(-lq2[index[indr]])
        #Calculation of covariance function
        er2_lq2 = np.exp(r2_lq2)
        #Gradient wrt t
        c = 2.*r/lq2[index[indr]]
        gt[indr, indc] = er2_lq2*c
        #Complete the lower triangular
        gt[indc, indr] = -gt[indr, indc]
        return gt

    #Gradients for Diagonal Kff
    def _gkdiag(self, X, index):
        index = index.reshape(index.size,)
        #terms that move along t
        d = np.unique(index)
        B = self.decay[d].values
        S = self.W[d, :].values
        #Index transformation
        indd = np.arange(self.output_dim)
        indd[d] = np.arange(d.size)
        index = indd[index]
        #Output related variables must be column-wise
        t = X[:, 0].reshape(X.shape[0], 1)
        B = B.reshape(B.size, 1)
        S2 = S*S

        #Input related variables must be row-wise
        lq = self.lengthscale.values.reshape(1, self.rank)

        gB = np.empty((t.size,))
        glq = np.empty((t.size, lq.size))
        gS = np.empty((t.size, lq.size))

        #Dx1 terms
        c0 = S2*lq*np.sqrt(np.pi)

        #DxQ terms
        nu = (.5*lq)*B
        nu2 = nu*nu

        #Nx1 terms
        gamt = -B[index]*t
        egamt = np.exp(gamt)
        e2gamt = egamt*egamt

        #NxQ terms
        t_lq = t/lq
        t2_lq2 = -t_lq*t_lq

        etlq2gamt = np.exp(t2_lq2 + gamt) #NxQ

        ##Upsilon calculations
        #erfnu = erf(nu) #TODO: This can be improved

        upm = np.exp(nu2[index, :] + lnDifErf(nu[index, :], t_lq + nu[index, :]))
        upm[t[:, 0] == 0, :] = 0.

        upv = np.exp(nu2[index, :] + 2.*gamt + lnDifErf(-t_lq + nu[index, :], nu[index, :])) #egamt*upv
        upv[t[:, 0] == 0, :] = 0.

        #Gradient wrt S
        c0_S = (S/B)*(lq*np.sqrt(np.pi))

        gS = c0_S[index]*(upm - upv)

        #For B
        CB1 = (.5*lq)**2 - .5/B**2 #DxQ
        lq2_2B = (.5*lq**2)*(S2/B) #DxQ
        CB2 = 2.*etlq2gamt - e2gamt - 1. #NxQ

        # gradient wrt B NxQ
        gB = c0[index, :]*(CB1[index, :]*upm - (CB1[index, :] - t/B[index])*upv) + \
            lq2_2B[index, :]*CB2

        #Gradient wrt lengthscale
        #DxQ terms
        c0 = (.5*np.sqrt(np.pi))*(S2/B)*(1.+.5*(lq*B)**2)
        Clq1 = S2*(lq*.5)
        glq = c0[index]*(upm - upv) + Clq1[index]*CB2

        return glq, gS, gB

    def _gkfu(self, X, index, Z, index2):
        index = index.reshape(index.size,)
        #TODO: reduce memory usage
        #terms that move along t
        d = np.unique(index)
        B = self.decay[d].values
        S = self.W[d, :].values

        #Index transformation
        indd = np.arange(self.output_dim)
        indd[d] = np.arange(d.size)
        index = indd[index]
        #t column
        t = X[:, 0].reshape(X.shape[0], 1)
        B = B.reshape(B.size, 1)
        #z row
        z = Z[:, 0].reshape(1, Z.shape[0])
        index2 = index2.reshape(index2.size,)
        lq = self.lengthscale.values.reshape((1, self.rank))

        #kfu = np.empty((t.size, z.size))
        glq = np.empty((t.size, z.size))
        gSdq = np.empty((t.size, z.size))
        gB = np.empty((t.size, z.size))

        #Dx1 terms
        B_2 = B*.5
        S_pi = S*(.5*np.sqrt(np.pi))
        #DxQ terms
        c0 = S_pi*lq #lq*Sdq*sqrt(pi)
        nu = B*lq*.5
        nu2 = nu*nu

        #1xM terms
        z_lq = z/lq[0, index2]

        #NxM terms
        tz = t-z
        tz_lq = tz/lq[0, index2]
        etz_lq2 = -np.exp(-tz_lq*tz_lq)
        ez_lq_Bt = np.exp(-z_lq*z_lq - B[index]*t)

        # Upsilon calculations
        fullind = np.ix_(index, index2)
        upsi = np.exp(nu2[fullind] - B[index]*tz + lnDifErf(-tz_lq + nu[fullind], z_lq + nu[fullind]))
        upsi[t[:, 0] == 0., :] = 0.

        #Gradient wrt S
        #DxQ term
        Sa1 = lq*(.5*np.sqrt(np.pi))

        gSdq = Sa1[0, index2]*upsi

        #Gradient wrt lq
        la1 = S_pi*(1. + 2.*nu2)
        Slq = S*lq
        uplq = etz_lq2*(tz_lq/lq[0, index2] + B_2[index])
        uplq += ez_lq_Bt*(-z_lq/lq[0, index2] + B_2[index])

        glq = la1[fullind]*upsi
        glq += Slq[fullind]*uplq

        #Gradient wrt B
        Slq = Slq*lq
        nulq = nu*lq
        upBd = etz_lq2 + ez_lq_Bt
        gB = c0[fullind]*(nulq[fullind] - tz)*upsi + .5*Slq[fullind]*upBd

        return glq, gSdq, gB

    #TODO: reduce memory usage
    def _gkfu_z(self, X, index, Z, index2): #Kfu(t,z)
        index = index.reshape(index.size,)
        #terms that move along t
        d = np.unique(index)
        B = self.decay[d].values
        S = self.W[d, :].values
        #Index transformation
        indd = np.arange(self.output_dim)
        indd[d] = np.arange(d.size)
        index = indd[index]

        #t column
        t = X[:, 0].reshape(X.shape[0], 1)
        B = B.reshape(B.size, 1)
        #z row
        z = Z[:, 0].reshape(1, Z.shape[0])
        index2 = index2.reshape(index2.size,)
        lq = self.lengthscale.values.reshape((1, self.rank))

        #kfu = np.empty((t.size, z.size))
        gz = np.empty((t.size, z.size))

        #Dx1 terms
        S_pi = S*(.5*np.sqrt(np.pi))
        #DxQ terms
        #Slq = S*lq
        c0 = S_pi*lq #lq*Sdq*sqrt(pi)
        nu = (.5*lq)*B
        nu2 = nu*nu

        #1xM terms
        z_lq = z/lq[0, index2]
        z_lq2 = -z_lq*z_lq
        #NxQ terms
        t_lq = t/lq
        #NxM terms
        zt_lq = z_lq - t_lq[:, index2]
        zt_lq2 = -zt_lq*zt_lq

        # Upsilon calculations
        fullind = np.ix_(index, index2)
        z2 = z_lq + nu[fullind]
        z1 = z2 - t_lq[:, index2]
        upsi = np.exp(nu2[fullind] - B[index]*(t-z) + lnDifErf(z1, z2))
        upsi[t[:, 0] == 0., :] = 0.

        #Gradient wrt z
        za1 = c0*B
        #za2 = S_w
        gz = za1[fullind]*upsi + S[fullind]*(np.exp(z_lq2 - B[index]*t) - np.exp(zt_lq2))

        return gz

def lnDifErf(z1, z2):
    #z2 is always positive
    logdiferf = np.zeros(z1.shape)
    ind = np.where(z1 > 0.)
    ind2 = np.where(z1 <= 0.)
    if ind[0].size > 0:
        z1i = z1[ind]
        z12 = z1i*z1i
        z2i = z2[ind]
        logdiferf[ind] = -z12 + np.log(erfcx(z1i) - erfcx(z2i)*np.exp(z12 - z2i**2))

    if ind2[0].size > 0:
        z1i = z1[ind2]
        z2i = z2[ind2]
        logdiferf[ind2] = np.log(erf(z2i) - erf(z1i))

    return logdiferf

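A note on the conventions used throughout eq_ode1.py: inputs carry the time in column 0 and an integer function index in column 1; rows whose index is below output_dim refer to outputs y_j, and rows whose index is output_dim or larger refer to latent forces u_i, which is what the X_flag/X2_flag dispatch in K tests. The module-level lnDifErf(z1, z2) evaluates log(erf(z2) - erf(z1)), switching to the scaled complementary error function erfcx when z1 > 0 so the difference does not underflow for large arguments. A construction sketch (shapes and hyperparameter values illustrative only):

    import numpy as np
    import GPy

    # Two outputs driven by one latent force.
    k = GPy.kern.EQ_ODE1(input_dim=2, output_dim=2, rank=1,
                         decay=[0.5, 2.0], lengthscale=[1.0])

    t = np.linspace(0.1, 5., 10)[:, None]
    Xf = np.hstack([t, np.zeros_like(t)])    # output 0: index 0
    Xu = np.hstack([t, 2.*np.ones_like(t)])  # latent force u_0: index = output_dim + 0

    Kfu = k.K(Xf, Xu)   # cross-covariance between output 0 and the latent force
    kff = k.Kdiag(Xf)   # diagonal of Kff for output 0
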
@@ -44,7 +44,7 @@ class EQ_ODE2(Kern):
             lengthscale = np.asarray(lengthscale)
             assert lengthscale.size in [1, self.rank], "Bad number of lengthscales"
             if lengthscale.size != self.rank:
-                lengthscale = np.ones(self.input_dim)*lengthscale
+                lengthscale = np.ones(self.rank)*lengthscale

         if W is None:
             #W = 0.5*np.random.randn(self.output_dim, self.rank)/np.sqrt(self.rank)

@@ -71,7 +71,7 @@ class EQ_ODE2(Kern):
         #index = index.reshape(index.size,)
         if hasattr(X, 'values'):
             X = X.values
-        index = np.int_(X[:, 1])
+        index = np.int_(np.round(X[:, 1]))
         index = index.reshape(index.size,)
         X_flag = index[0] >= self.output_dim
         if X2 is None:

@@ -79,7 +79,7 @@ class EQ_ODE2(Kern):
                 #Calculate covariance function for the latent functions
                 index -= self.output_dim
                 return self._Kuu(X, index)
-            else:
+            else: #Kff full
                 raise NotImplementedError
         else:
             #This way is not working, indexes are lost after using k._slice_X

@@ -87,19 +87,40 @@ class EQ_ODE2(Kern):
             #index2 = index2.reshape(index2.size,)
             if hasattr(X2, 'values'):
                 X2 = X2.values
-            index2 = np.int_(X2[:, 1])
+            index2 = np.int_(np.round(X2[:, 1]))
             index2 = index2.reshape(index2.size,)
             X2_flag = index2[0] >= self.output_dim
             #Calculate cross-covariance function
             if not X_flag and X2_flag:
                 index2 -= self.output_dim
                 return self._Kfu(X, index, X2, index2) #Kfu
-            else:
-                raise NotImplementedError
+            elif X_flag and not X2_flag:
+                index -= self.output_dim
+                return self._Kfu(X2, index2, X, index).T #Kuf
+            elif X_flag and X2_flag:
+                index -= self.output_dim
+                index2 -= self.output_dim
+                return self._Kusu(X, index, X2, index2) #Ku_s u
+            else:
+                raise NotImplementedError #Kf_s f
+
+    #Calculate the covariance function for diag(Kff(X,X))
+    def Kdiag(self, X):
+        if hasattr(X, 'values'):
+            index = np.int_(np.round(X[:, 1].values))
+        else:
+            index = np.int_(np.round(X[:, 1]))
+        index = index.reshape(index.size,)
+        X_flag = index[0] >= self.output_dim
+
+        if X_flag: #Kuudiag
+            return np.ones(X[:,0].shape)
+        else: #Kffdiag
+            kdiag = self._Kdiag(X)
+            return np.sum(kdiag, axis=1)

     #Calculate the covariance function for diag(Kff(X,X))
-    def Kdiag(self, X):
+    def _Kdiag(self, X):
         #This way is not working, indexes are lost after using k._slice_X
         #index = np.asarray(X, dtype=np.int)
         #index = index.reshape(index.size,)

@@ -132,7 +153,7 @@ class EQ_ODE2(Kern):
         #Terms that move along q
         lq = self.lengthscale.values.reshape(1, self.lengthscale.size)
         S2 = S*S
-        kdiag = np.empty((t.size, ))
+        kdiag = np.empty((t.size, lq.size))

         indD = np.arange(B.size)
         #(1) When wd is real

@@ -187,8 +208,8 @@ class EQ_ODE2(Kern):
             upv[t1[:, 0] == 0, :] = 0.

             #Covariance calculation
-            kdiag[ind3t] = np.sum(np.real(K01[ind]*upm), axis=1)
-            kdiag[ind3t] += np.sum(np.real((c0[ind]*ec)*upv), axis=1)
+            kdiag[ind3t] = np.real(K01[ind]*upm)
+            kdiag[ind3t] += np.real((c0[ind]*ec)*upv)

         #(2) When w_d is complex
         if np.any(wbool):

@@ -265,7 +286,7 @@ class EQ_ODE2(Kern):
             upvc[t1[:, 0] == 0, :] = 0.

             #Covariance calculation
-            kdiag[ind2t] = np.sum(K011[ind]*upm + K012[ind]*upmc + (c0[ind]*ec)*upv + (c0[ind]*ec2)*upvc, axis=1)
+            kdiag[ind2t] = K011[ind]*upm + K012[ind]*upmc + (c0[ind]*ec)*upv + (c0[ind]*ec2)*upvc
         return kdiag

     def update_gradients_full(self, dL_dK, X, X2 = None):

@@ -336,16 +357,17 @@ class EQ_ODE2(Kern):
         index = index.reshape(index.size,)

         glq, gS, gB, gC = self._gkdiag(X, index)
-        tmp = dL_dKdiag.reshape(index.size, 1)*glq
+        if dL_dKdiag.size == X.shape[0]:
+            dL_dKdiag = np.reshape(dL_dKdiag, (index.size, 1))
+        tmp = dL_dKdiag*glq
         self.lengthscale.gradient = tmp.sum(0)
-        #TODO: Avoid the reshape by a priori knowing the shape of dL_dKdiag
-        tmpB = dL_dKdiag*gB.reshape(dL_dKdiag.shape)
-        tmpC = dL_dKdiag*gC.reshape(dL_dKdiag.shape)
-        tmp = dL_dKdiag.reshape(index.size, 1)*gS
+        tmpB = dL_dKdiag*gB
+        tmpC = dL_dKdiag*gC
+        tmp = dL_dKdiag*gS
         for d in np.unique(index):
             ind = np.where(index == d)
-            self.B.gradient[d] = tmpB[ind].sum()
-            self.C.gradient[d] = tmpC[ind].sum()
+            self.B.gradient[d] = tmpB[ind, :].sum()
+            self.C.gradient[d] = tmpC[ind, :].sum()
             self.W.gradient[d, :] = tmp[ind].sum(0)

     def gradients_X(self, dL_dK, X, X2=None):

@@ -410,6 +432,23 @@ class EQ_ODE2(Kern):
         kuu[indc, indr] = kuu[indr, indc]
         return kuu

+    def _Kusu(self, X, index, X2, index2):
+        index = index.reshape(index.size,)
+        index2 = index2.reshape(index2.size,)
+        t = X[:, 0].reshape(X.shape[0],1)
+        t2 = X2[:, 0].reshape(1,X2.shape[0])
+        lq = self.lengthscale.values.reshape(self.rank,)
+        #Covariance matrix initialization
+        kuu = np.zeros((t.size, t2.size))
+        for q in range(self.rank):
+            ind1 = index == q
+            ind2 = index2 == q
+            r = t[ind1]/lq[q] - t2[0,ind2]/lq[q]
+            r2 = r*r
+            #Calculation of covariance function
+            kuu[np.ix_(ind1, ind2)] = np.exp(-r2)
+        return kuu
+
     #Evaluation of cross-covariance function
     def _Kfu(self, X, index, X2, index2):
         #terms that move along t

@@ -632,8 +671,8 @@ class EQ_ODE2(Kern):
         lq = self.lengthscale.values.reshape(1, self.rank)
         lq2 = lq*lq

-        gB = np.empty((t.size,))
-        gC = np.empty((t.size,))
+        gB = np.empty((t.size, lq.size))
+        gC = np.empty((t.size, lq.size))
         glq = np.empty((t.size, lq.size))
         gS = np.empty((t.size, lq.size))

@@ -723,8 +762,8 @@ class EQ_ODE2(Kern):
             Ba4_1 = (S2lq*lq)*dgam_dB/w2
             Ba4 = Ba4_1*c

-            gB[ind3t] = np.sum(np.real(Ba1[ind]*upm) - np.real(((Ba2_1[ind] + Ba2_2[ind]*t1)*egamt - Ba3[ind]*egamct)*upv)\
-                + np.real(Ba4[ind]*upmd) + np.real((Ba4_1[ind]*ec)*upvd), axis=1)
+            gB[ind3t] = np.real(Ba1[ind]*upm) - np.real(((Ba2_1[ind] + Ba2_2[ind]*t1)*egamt - Ba3[ind]*egamct)*upv)\
+                + np.real(Ba4[ind]*upmd) + np.real((Ba4_1[ind]*ec)*upvd)

             # gradient wrt C
             dw_dC = - alphad*dw_dB

@@ -738,8 +777,8 @@ class EQ_ODE2(Kern):
             Ca4_1 = (S2lq*lq)*dgam_dC/w2
             Ca4 = Ca4_1*c

-            gC[ind3t] = np.sum(np.real(Ca1[ind]*upm) - np.real(((Ca2_1[ind] + Ca2_2[ind]*t1)*egamt - (Ca3_1[ind] + Ca3_2[ind]*t1)*egamct)*upv)\
-                + np.real(Ca4[ind]*upmd) + np.real((Ca4_1[ind]*ec)*upvd), axis=1)
+            gC[ind3t] = np.real(Ca1[ind]*upm) - np.real(((Ca2_1[ind] + Ca2_2[ind]*t1)*egamt - (Ca3_1[ind] + Ca3_2[ind]*t1)*egamct)*upv)\
+                + np.real(Ca4[ind]*upmd) + np.real((Ca4_1[ind]*ec)*upvd)

             #Gradient wrt lengthscale
             #DxQ terms

@@ -868,10 +907,10 @@ class EQ_ODE2(Kern):
             Ba2_1c = c0*(dgamc_dB*(0.5/gamc2 - 0.25*lq2) + 0.5/(w2*gamc))
             Ba2_2c = c0*dgamc_dB/gamc

-            gB[ind2t] = np.sum(Ba1[ind]*upm - ((Ba2_1[ind] + Ba2_2[ind]*t1)*egamt - Ba3[ind]*egamct)*upv\
+            gB[ind2t] = Ba1[ind]*upm - ((Ba2_1[ind] + Ba2_2[ind]*t1)*egamt - Ba3[ind]*egamct)*upv\
                 + Ba4[ind]*upmd + (Ba4_1[ind]*ec)*upvd\
                 + Ba1c[ind]*upmc - ((Ba2_1c[ind] + Ba2_2c[ind]*t1)*egamct - Ba3c[ind]*egamt)*upvc\
-                + Ba4c[ind]*upmdc + (Ba4_1c[ind]*ec2)*upvdc, axis=1)
+                + Ba4c[ind]*upmdc + (Ba4_1c[ind]*ec2)*upvdc

             ##Gradient wrt C
             dw_dC = 0.5*alphad/w

@@ -895,10 +934,10 @@ class EQ_ODE2(Kern):
             Ca4_1c = S2lq2*(dgamc_dC/w2)
             Ca4c = Ca4_1c*c2

-            gC[ind2t] = np.sum(Ca1[ind]*upm - ((Ca2_1[ind] + Ca2_2[ind]*t1)*egamt - (Ca3_1[ind] + Ca3_2[ind]*t1)*egamct)*upv\
+            gC[ind2t] = Ca1[ind]*upm - ((Ca2_1[ind] + Ca2_2[ind]*t1)*egamt - (Ca3_1[ind] + Ca3_2[ind]*t1)*egamct)*upv\
                 + Ca4[ind]*upmd + (Ca4_1[ind]*ec)*upvd\
                 + Ca1c[ind]*upmc - ((Ca2_1c[ind] + Ca2_2c[ind]*t1)*egamct - (Ca3_1c[ind] + Ca3_2c[ind]*t1)*egamt)*upvc\
-                + Ca4c[ind]*upmdc + (Ca4_1c[ind]*ec2)*upvdc, axis=1)
+                + Ca4c[ind]*upmdc + (Ca4_1c[ind]*ec2)*upvdc

             #Gradient wrt lengthscale
             #DxQ terms

GPy/kern/src/integral.py (new file, 84 lines)

@@ -0,0 +1,84 @@
# Written by Mike Smith michaeltsmith.org.uk

import numpy as np
from .kern import Kern
from ...core.parameterization import Param
from paramz.transformations import Logexp
import math

class Integral(Kern): #todo do I need to inherit from Stationary
    """
    Integral kernel between...
    """

    def __init__(self, input_dim, variances=None, lengthscale=None, ARD=False, active_dims=None, name='integral'):
        super(Integral, self).__init__(input_dim, active_dims, name)

        if lengthscale is None:
            lengthscale = np.ones(1)
        else:
            lengthscale = np.asarray(lengthscale)

        self.lengthscale = Param('lengthscale', lengthscale, Logexp()) #Logexp - transforms to allow positive only values...
        self.variances = Param('variances', variances, Logexp()) #and here.
        self.link_parameters(self.variances, self.lengthscale) #this just takes a list of parameters we need to optimise.

    def h(self, z):
        return 0.5 * z * np.sqrt(math.pi) * math.erf(z) + np.exp(-(z**2))

    def dk_dl(self, t, tprime, l): #derivative of the kernel wrt lengthscale
        return l * ( self.h(t/l) - self.h((t - tprime)/l) + self.h(tprime/l) - 1)

    def update_gradients_full(self, dL_dK, X, X2=None):
        if X2 is None: #we're finding dK_xx/dTheta
            dK_dl = np.zeros([X.shape[0], X.shape[0]])
            dK_dv = np.zeros([X.shape[0], X.shape[0]])
            for i, x in enumerate(X):
                for j, x2 in enumerate(X):
                    dK_dl[i, j] = self.variances[0]*self.dk_dl(x[0], x2[0], self.lengthscale[0]) #TODO Multiple length scales
                    dK_dv[i, j] = self.k_xx(x[0], x2[0], self.lengthscale[0]) #the gradient wrt the variance is k_xx.
            self.lengthscale.gradient = np.sum(dK_dl * dL_dK)
            self.variances.gradient = np.sum(dK_dv * dL_dK)
            #print("V%0.5f" % self.variances.gradient)
            #print("L%0.5f" % self.lengthscale.gradient)
        else: #we're finding dK_xf/dTheta
            print("NEED TO HANDLE TODO!")

    #useful little function to help calculate the covariances.
    def g(self, z):
        return 1.0 * z * np.sqrt(math.pi) * math.erf(z) + np.exp(-(z**2))

    #covariance between gradients (it's the gradients that we want out... maybe we should have a way of getting K_ff too? Currently you get the diag of K_ff from Kdiag)
    def k_xx(self, t, tprime, l):
        return 0.5 * (l**2) * ( self.g(t/l) - self.g((t - tprime)/l) + self.g(tprime/l) - 1)

    def k_ff(self, t, tprime, l):
        return np.exp(-((t-tprime)**2)/(l**2)) #rbf

    #covariance between the gradient and the actual value
    def k_xf(self, t, tprime, l):
        return 0.5 * np.sqrt(math.pi) * l * (math.erf((t-tprime)/l) + math.erf(tprime/l))

    def K(self, X, X2=None):
        if X2 is None:
            K_xx = np.zeros([X.shape[0], X.shape[0]])
            for i, x in enumerate(X):
                for j, x2 in enumerate(X):
                    K_xx[i, j] = self.k_xx(x[0], x2[0], self.lengthscale[0])
            return K_xx * self.variances[0]
        else:
            K_xf = np.zeros([X.shape[0], X2.shape[0]])
            for i, x in enumerate(X):
                for j, x2 in enumerate(X2):
                    K_xf[i, j] = self.k_xf(x[0], x2[0], self.lengthscale[0])
            #print(self.variances[0])
            return K_xf * self.variances[0]

    def Kdiag(self, X):
        """I've used the fact that we call this method for K_ff when finding the covariance as a hack so
        I know if I should return K_ff or K_xx. In this case we're returning K_ff!!
        $K_{ff}^{post} = K_{ff} - K_{fx} K_{xx}^{-1} K_{xf}$"""
        K_ff = np.zeros(X.shape[0])
        for i, x in enumerate(X):
            K_ff[i] = self.k_ff(x[0], x[0], self.lengthscale[0])
        return K_ff * self.variances[0]
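In integral.py the latent function f has an RBF covariance (k_ff), and each observation is modelled as the integral of f from 0 to the input t; k_xx is then the covariance between two such integrals and k_xf the cross-covariance between an integral and a value of f itself. A usage sketch (data values illustrative only):

    import numpy as np
    import GPy

    X = np.array([[1.0], [2.0], [3.0]])   # each row: upper limit t of an integral of f from 0
    k = GPy.kern.Integral(input_dim=1, variances=1.0, lengthscale=1.0)

    K_xx = k.K(X)                 # covariance between the integral observations
    Xstar = np.array([[1.5], [2.5]])
    K_xf = k.K(X, Xstar)          # covariance between integrals and latent values f(Xstar)
    k_ff = k.Kdiag(Xstar)         # diagonal of Kff at the latent points
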
GPy/kern/src/integral_limits.py (new file, 94 lines)

@@ -0,0 +1,94 @@
# Written by Mike Smith michaeltsmith.org.uk

import numpy as np
from .kern import Kern
from ...core.parameterization import Param
from paramz.transformations import Logexp
import math

class Integral_Limits(Kern): #todo do I need to inherit from Stationary
    """
    Integral kernel, can include limits on each integral value.
    """

    def __init__(self, input_dim, variances=None, lengthscale=None, ARD=False, active_dims=None, name='integral'):
        super(Integral_Limits, self).__init__(input_dim, active_dims, name)

        if lengthscale is None:
            lengthscale = np.ones(1)
        else:
            lengthscale = np.asarray(lengthscale)

        self.lengthscale = Param('lengthscale', lengthscale, Logexp()) #Logexp - transforms to allow positive only values...
        self.variances = Param('variances', variances, Logexp()) #and here.
        self.link_parameters(self.variances, self.lengthscale) #this just takes a list of parameters we need to optimise.

    def h(self, z):
        return 0.5 * z * np.sqrt(math.pi) * math.erf(z) + np.exp(-(z**2))

    def dk_dl(self, t, tprime, s, sprime, l): #derivative of the kernel wrt lengthscale
        return l * ( self.h((t-sprime)/l) - self.h((t - tprime)/l) + self.h((tprime-s)/l) - self.h((s-sprime)/l))

    def update_gradients_full(self, dL_dK, X, X2=None):
        if X2 is None: #we're finding dK_xx/dTheta
            dK_dl = np.zeros([X.shape[0], X.shape[0]])
            dK_dv = np.zeros([X.shape[0], X.shape[0]])
            for i, x in enumerate(X):
                for j, x2 in enumerate(X):
                    dK_dl[i, j] = self.variances[0]*self.dk_dl(x[0], x2[0], x[1], x2[1], self.lengthscale[0])
                    dK_dv[i, j] = self.k_xx(x[0], x2[0], x[1], x2[1], self.lengthscale[0]) #the gradient wrt the variance is k_xx.
            self.lengthscale.gradient = np.sum(dK_dl * dL_dK)
            self.variances.gradient = np.sum(dK_dv * dL_dK)
            #print("V%0.5f" % self.variances.gradient)
            #print("L%0.5f" % self.lengthscale.gradient)
        else: #we're finding dK_xf/dTheta
            print("NEED TO HANDLE TODO!")

    #useful little function to help calculate the covariances.
    def g(self, z):
        return 1.0 * z * np.sqrt(math.pi) * math.erf(z) + np.exp(-(z**2))

    def k_xx(self, t, tprime, s, sprime, l):
        """Covariance between observed values.

        s and t are one domain of the integral (i.e. the integral between s and t)
        sprime and tprime are another domain of the integral (i.e. the integral between sprime and tprime)

        We're interested in how correlated these two integrals are.

        Note: We've not multiplied by the variance, this is done in K."""
        return 0.5 * (l**2) * ( self.g((t-sprime)/l) + self.g((tprime-s)/l) - self.g((t - tprime)/l) - self.g((s-sprime)/l))

    def k_ff(self, t, tprime, l):
        """Doesn't need s or sprime as we're looking at the 'derivatives', so no domains over which to integrate are required"""
        return np.exp(-((t-tprime)**2)/(l**2)) #rbf

    def k_xf(self, t, tprime, s, l):
        """Covariance between the gradient (latent value) and the actual (observed) value.

        Note that sprime isn't actually used in this expression, presumably because the 'primes' are the gradient (latent) values which don't
        involve an integration, and thus there is no domain over which they're integrated, just a single value that we want."""
        return 0.5 * np.sqrt(math.pi) * l * (math.erf((t-tprime)/l) + math.erf((tprime-s)/l))

    def K(self, X, X2=None):
        if X2 is None:
            K_xx = np.zeros([X.shape[0], X.shape[0]])
            for i, x in enumerate(X):
                for j, x2 in enumerate(X):
                    K_xx[i, j] = self.k_xx(x[0], x2[0], x[1], x2[1], self.lengthscale[0])
            return K_xx * self.variances[0]
        else:
            K_xf = np.zeros([X.shape[0], X2.shape[0]])
            for i, x in enumerate(X):
                for j, x2 in enumerate(X2):
                    K_xf[i, j] = self.k_xf(x[0], x2[0], x[1], self.lengthscale[0]) #x2[1] unused, see k_xf docstring for explanation.
            return K_xf * self.variances[0]

    def Kdiag(self, X):
        """I've used the fact that we call this method for K_ff when finding the covariance as a hack so
        I know if I should return K_ff or K_xx. In this case we're returning K_ff!!
        $K_{ff}^{post} = K_{ff} - K_{fx} K_{xx}^{-1} K_{xf}$"""
        K_ff = np.zeros(X.shape[0])
        for i, x in enumerate(X):
            K_ff[i] = self.k_ff(x[0], x[0], self.lengthscale[0])
        return K_ff * self.variances[0]
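Integral_Limits generalises the kernel above by letting each observation integrate over its own interval: each input row is a pair (t, s), with column 0 the upper limit and column 1 the lower limit. A sketch (values illustrative):

    import numpy as np
    import GPy

    X = np.array([[2.0, 0.0],    # integral of f over [0, 2]
                  [3.0, 1.0]])   # integral of f over [1, 3]
    k = GPy.kern.Integral_Limits(input_dim=2, variances=1.0, lengthscale=1.0)
    K_xx = k.K(X)                # overlapping domains yield a larger covariance
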
GPy/kern/src/multidimensional_integral_limits.py (new file, 120 lines)

@@ -0,0 +1,120 @@
# Written by Mike Smith michaeltsmith.org.uk

import numpy as np
from .kern import Kern
from ...core.parameterization import Param
from paramz.transformations import Logexp
import math

class Multidimensional_Integral_Limits(Kern): #todo do I need to inherit from Stationary
    """
    Integral kernel, can include limits on each integral value.
    """

    def __init__(self, input_dim, variances=None, lengthscale=None, ARD=False, active_dims=None, name='integral'):
        super(Multidimensional_Integral_Limits, self).__init__(input_dim, active_dims, name)

        if lengthscale is None:
            lengthscale = np.ones(1)
        else:
            lengthscale = np.asarray(lengthscale)

        self.lengthscale = Param('lengthscale', lengthscale, Logexp()) #Logexp - transforms to allow positive only values...
        self.variances = Param('variances', variances, Logexp()) #and here.
        self.link_parameters(self.variances, self.lengthscale) #this just takes a list of parameters we need to optimise.

    def h(self, z):
        return 0.5 * z * np.sqrt(math.pi) * math.erf(z) + np.exp(-(z**2))

    def dk_dl(self, t, tprime, s, sprime, l): #derivative of the kernel wrt lengthscale
        return l * ( self.h((t-sprime)/l) - self.h((t - tprime)/l) + self.h((tprime-s)/l) - self.h((s-sprime)/l))

    def update_gradients_full(self, dL_dK, X, X2=None):
        #print(self.variances)
        if X2 is None: #we're finding dK_xx/dTheta
            dK_dl_term = np.zeros([X.shape[0], X.shape[0], self.lengthscale.shape[0]])
            k_term = np.zeros([X.shape[0], X.shape[0], self.lengthscale.shape[0]])
            dK_dl = np.zeros([X.shape[0], X.shape[0], self.lengthscale.shape[0]])
            dK_dv = np.zeros([X.shape[0], X.shape[0]])
            for il, l in enumerate(self.lengthscale):
                idx = il*2
                for i, x in enumerate(X):
                    for j, x2 in enumerate(X):
                        dK_dl_term[i, j, il] = self.dk_dl(x[idx], x2[idx], x[idx+1], x2[idx+1], l)
                        k_term[i, j, il] = self.k_xx(x[idx], x2[idx], x[idx+1], x2[idx+1], l)
            for il, l in enumerate(self.lengthscale):
                dK_dl = self.variances[0] * dK_dl_term[:, :, il]
                for jl, l in enumerate(self.lengthscale):
                    if jl != il:
                        dK_dl *= k_term[:, :, jl]
                #dK_dl = np.dot(dK_dl, k_term[:,:,il])
                #print(k_term[:,:,il])
                self.lengthscale.gradient[il] = np.sum(dK_dl * dL_dK)
            dK_dv = self.calc_K_xx_wo_variance(X) #the gradient wrt the variance is k_xx.
            self.variances.gradient = np.sum(dK_dv * dL_dK)
        else: #we're finding dK_xf/dTheta
            print("NEED TO HANDLE TODO!")
        #print(self.variances[0], self.lengthscale[0], self.lengthscale[1]) #np.sum(dK_dv*dL_dK)

    #useful little function to help calculate the covariances.
    def g(self, z):
        return 1.0 * z * np.sqrt(math.pi) * math.erf(z) + np.exp(-(z**2))

    def k_xx(self, t, tprime, s, sprime, l):
        """Covariance between observed values.

        s and t are one domain of the integral (i.e. the integral between s and t)
        sprime and tprime are another domain of the integral (i.e. the integral between sprime and tprime)

        We're interested in how correlated these two integrals are.

        Note: We've not multiplied by the variance, this is done in K."""
        return 0.5 * (l**2) * ( self.g((t-sprime)/l) + self.g((tprime-s)/l) - self.g((t - tprime)/l) - self.g((s-sprime)/l))

    def k_ff(self, t, tprime, l):
        """Doesn't need s or sprime as we're looking at the 'derivatives', so no domains over which to integrate are required"""
        return np.exp(-((t-tprime)**2)/(l**2)) #rbf

    def k_xf(self, t, tprime, s, l):
        """Covariance between the gradient (latent value) and the actual (observed) value.

        Note that sprime isn't actually used in this expression, presumably because the 'primes' are the gradient (latent) values which don't
        involve an integration, and thus there is no domain over which they're integrated, just a single value that we want."""
        return 0.5 * np.sqrt(math.pi) * l * (math.erf((t-tprime)/l) + math.erf((tprime-s)/l))

    def calc_K_xx_wo_variance(self, X):
        """Calculates K_xx without the variance term"""
        K_xx = np.ones([X.shape[0], X.shape[0]]) #ones now as a product occurs over each dimension
        for i, x in enumerate(X):
            for j, x2 in enumerate(X):
                for il, l in enumerate(self.lengthscale):
                    idx = il*2 #each pair of input dimensions describe the limits on one actual dimension in the data
                    K_xx[i, j] *= self.k_xx(x[idx], x2[idx], x[idx+1], x2[idx+1], l)
        return K_xx

    def K(self, X, X2=None):
        if X2 is None:
            K_xx = self.calc_K_xx_wo_variance(X)
            return K_xx * self.variances[0]
        else:
            K_xf = np.ones([X.shape[0], X2.shape[0]])
            for i, x in enumerate(X):
                for j, x2 in enumerate(X2):
                    for il, l in enumerate(self.lengthscale):
                        idx = il*2
                        K_xf[i, j] *= self.k_xf(x[idx], x2[idx], x[idx+1], l)
            return K_xf * self.variances[0]

    def Kdiag(self, X):
        """I've used the fact that we call this method for K_ff when finding the covariance as a hack so
        I know if I should return K_ff or K_xx. In this case we're returning K_ff!!
        $K_{ff}^{post} = K_{ff} - K_{fx} K_{xx}^{-1} K_{xf}$"""
        K_ff = np.ones(X.shape[0])
        for i, x in enumerate(X):
            for il, l in enumerate(self.lengthscale):
                idx = il*2
                K_ff[i] *= self.k_ff(x[idx], x[idx], l)
        return K_ff * self.variances[0]
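Multidimensional_Integral_Limits applies the same construction per dimension and takes the product over dimensions: input columns 2i and 2i+1 hold the upper and lower limits for dimension i, with one lengthscale per dimension. A sketch (values illustrative):

    import numpy as np
    import GPy

    # Each row integrates f over the box [s1, t1] x [s2, t2]:
    # columns are (t1, s1, t2, s2).
    X = np.array([[2., 0., 1., 0.],
                  [3., 1., 2., 1.]])
    k = GPy.kern.Multidimensional_Integral_Limits(input_dim=4, variances=1.0,
                                                  lengthscale=[1.0, 2.0])
    K_xx = k.K(X)   # product of per-dimension k_xx factors, scaled by the variance
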

@@ -24,3 +24,5 @@ from .one_vs_all_sparse_classification import OneVsAllSparseClassification
 from .dpgplvm import DPBayesianGPLVM

 from .state_space_model import StateSpace
+
+from .ibp_lfm import IBPLFM

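After this hunk the new model class is importable from the package namespace (assuming a build containing this commit):

    from GPy.models import IBPLFM
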
535
GPy/models/ibp_lfm.py
Normal file
535
GPy/models/ibp_lfm.py
Normal file
|
|
@ -0,0 +1,535 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ..core.sparse_gp_mpi import SparseGP_MPI
|
||||
from .. import kern
|
||||
from ..util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, pdinv
|
||||
from ..util import diag
|
||||
from ..core.parameterization import Param
|
||||
from ..likelihoods import Gaussian
|
||||
from ..inference.latent_function_inference.var_dtc_parallel import VarDTC_minibatch
|
||||
from ..inference.latent_function_inference.posterior import Posterior
|
||||
from GPy.core.parameterization.variational import VariationalPrior
|
||||
from ..core.parameterization.parameterized import Parameterized
|
||||
from paramz.transformations import Logexp, Logistic, __fixed__
|
||||
log_2_pi = np.log(2*np.pi)
|
||||
|
||||
class VarDTC_minibatch_IBPLFM(VarDTC_minibatch):
|
||||
'''
|
||||
Modifications of VarDTC_minibatch for IBP LFM
|
||||
'''
|
||||
|
||||
def __init__(self, batchsize=None, limit=3, mpi_comm=None):
|
||||
super(VarDTC_minibatch_IBPLFM, self).__init__(batchsize, limit, mpi_comm)
|
||||
|
||||
def gatherPsiStat(self, kern, X, Z, Y, beta, Zp):
|
||||
|
||||
het_noise = beta.size > 1
|
||||
|
||||
assert beta.size == 1
|
||||
|
||||
trYYT = self.get_trYYT(Y)
|
||||
if self.Y_speedup and not het_noise:
|
||||
Y = self.get_YYTfactor(Y)
|
||||
|
||||
num_inducing = Z.shape[0]
|
||||
num_data, output_dim = Y.shape
|
||||
batchsize = num_data if self.batchsize is None else self.batchsize
|
||||
|
||||
psi2_full = np.zeros((num_inducing, num_inducing)) # MxM
|
||||
psi1Y_full = np.zeros((output_dim, num_inducing)) # DxM
|
||||
psi0_full = 0.
|
||||
YRY_full = 0.
|
||||
|
||||
for n_start in range(0, num_data, batchsize):
|
||||
n_end = min(batchsize+n_start, num_data)
|
||||
if batchsize == num_data:
|
||||
Y_slice = Y
|
||||
X_slice = X
|
||||
else:
|
||||
Y_slice = Y[n_start:n_end]
|
||||
X_slice = X[n_start:n_end]
|
||||
|
||||
if het_noise:
|
||||
b = beta[n_start]
|
||||
YRY_full += np.inner(Y_slice, Y_slice)*b
|
||||
else:
|
||||
b = beta
|
||||
|
||||
psi0 = kern._Kdiag(X_slice) #Kff^q
|
||||
psi1 = kern.K(X_slice, Z) #Kfu
|
||||
|
||||
indX = X_slice.values
|
||||
indX = np.int_(np.round(indX[:, -1]))
|
||||
|
||||
Zp = Zp.gamma.values
|
||||
# Extend Zp across columns
|
||||
indZ = Z.values
|
||||
indZ = np.int_(np.round(indZ[:, -1])) - Zp.shape[0]
|
||||
Zpq = Zp[:, indZ]
|
||||
|
||||
for d in np.unique(indX):
|
||||
indd = indX == d
|
||||
psi1d = psi1[indd, :]
|
||||
Zpd = Zp[d, :]
|
||||
Zp2 = Zpd[:, None]*Zpd[None, :] - np.diag(np.power(Zpd, 2)) + np.diag(Zpd)
|
||||
psi2_full += (np.dot(psi1d.T, psi1d)*Zp2[np.ix_(indZ, indZ)])*b #Zp2*Kufd*Kfud*beta
|
||||
|
||||
psi0_full += np.sum(psi0*Zp[indX, :])*b
|
||||
psi1Y_full += np.dot(Y_slice.T, psi1*Zpq[indX, :])*b
|
||||
|
||||
if not het_noise:
|
||||
YRY_full = trYYT*beta
|
||||
|
||||
if self.mpi_comm is not None:
|
||||
from mpi4py import MPI
|
||||
psi0_all = np.array(psi0_full)
|
||||
psi1Y_all = psi1Y_full.copy()
|
||||
psi2_all = psi2_full.copy()
|
||||
YRY_all = np.array(YRY_full)
|
||||
self.mpi_comm.Allreduce([psi0_full, MPI.DOUBLE], [psi0_all, MPI.DOUBLE])
|
||||
self.mpi_comm.Allreduce([psi1Y_full, MPI.DOUBLE], [psi1Y_all, MPI.DOUBLE])
|
||||
self.mpi_comm.Allreduce([psi2_full, MPI.DOUBLE], [psi2_all, MPI.DOUBLE])
|
||||
self.mpi_comm.Allreduce([YRY_full, MPI.DOUBLE], [YRY_all, MPI.DOUBLE])
|
||||
return psi0_all, psi1Y_all, psi2_all, YRY_all
|
||||
|
||||
return psi0_full, psi1Y_full, psi2_full, YRY_full
|
||||
|
||||
|
||||
    def inference_likelihood(self, kern, X, Z, likelihood, Y, Zp):
        """
        The first phase of inference:
        Compute: log-likelihood, dL_dKmm

        Cached intermediate results: Kmm, KmmInv
        """

        num_data, output_dim = Y.shape
        input_dim = Z.shape[0]
        if self.mpi_comm is not None:
            from mpi4py import MPI
            num_data_all = np.array(num_data, dtype=np.int32)
            self.mpi_comm.Allreduce([np.int32(num_data), MPI.INT], [num_data_all, MPI.INT])
            num_data = num_data_all

        # see whether we've got a different noise variance for each datum
        beta = 1./np.fmax(likelihood.variance, 1e-6)
        het_noise = beta.size > 1
        if het_noise:
            self.batchsize = 1

        psi0_full, psi1Y_full, psi2_full, YRY_full = self.gatherPsiStat(kern, X, Z, Y, beta, Zp)

        #======================================================================
        # Compute Common Components
        #======================================================================

        Kmm = kern.K(Z).copy()
        diag.add(Kmm, self.const_jitter)
        if not np.isfinite(Kmm).all():
            print(Kmm)
        Lm = jitchol(Kmm)
        LmInv = dtrtri(Lm)

        LmInvPsi2LmInvT = np.dot(LmInv, np.dot(psi2_full, LmInv.T))
        Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT
        LL = jitchol(Lambda)
        LLInv = dtrtri(LL)
        logdet_L = 2.*np.sum(np.log(np.diag(LL)))
        LmLLInv = np.dot(LLInv, LmInv)

        b = np.dot(psi1Y_full, LmLLInv.T)
        bbt = np.sum(np.square(b))
        v = np.dot(b, LmLLInv).T
        LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

        tmp = -np.dot(np.dot(LLInv.T, LLinvPsi1TYYTPsi1LLinvT + output_dim*np.eye(input_dim)), LLInv)
        dL_dpsi2R = .5*np.dot(np.dot(LmInv.T, tmp + output_dim*np.eye(input_dim)), LmInv)

        # Cache intermediate results
        self.midRes['dL_dpsi2R'] = dL_dpsi2R
        self.midRes['v'] = v

        #======================================================================
        # Compute log-likelihood
        #======================================================================
        if het_noise:
            logL_R = -np.sum(np.log(beta))
        else:
            logL_R = -num_data*np.log(beta)
        logL = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-np.trace(LmInvPsi2LmInvT))+YRY_full-bbt)*.5 - output_dim*logdet_L*.5

        #======================================================================
        # Compute dL_dKmm
        #======================================================================

        dL_dKmm = dL_dpsi2R - .5*output_dim*np.dot(np.dot(LmInv.T, LmInvPsi2LmInvT), LmInv)

        #======================================================================
        # Compute the posterior distribution of inducing points p(u|Y)
        #======================================================================

        if not self.Y_speedup or het_noise:
            wd_inv = backsub_both_sides(Lm, np.eye(input_dim) - backsub_both_sides(LL, np.identity(input_dim), transpose='left'), transpose='left')
            post = Posterior(woodbury_inv=wd_inv, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=Lm)
        else:
            post = None

        #======================================================================
        # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
        #======================================================================

        if not het_noise:
            dL_dthetaL = .5*(YRY_full*beta + beta*output_dim*psi0_full - num_data*output_dim*beta) - beta*(dL_dpsi2R*psi2_full).sum() - beta*(v.T*psi1Y_full).sum()
            self.midRes['dL_dthetaL'] = dL_dthetaL

        return logL, dL_dKmm, post
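
    # A minimal usage sketch (hypothetical driver code, not part of the GPy
    # API): phase 1 runs once per parameter update and caches dL_dpsi2R and v
    # in self.midRes; phase 2 is then looped until it signals the end of Y,
    # mirroring what update_gradients below does:
    #
    #   logL, dL_dKmm, post = inf.inference_likelihood(kern, X, Z, lik, Y, Zp)
    #   isEnd = False
    #   while not isEnd:
    #       isEnd, n_range, grad_dict = inf.inference_minibatch(kern, X, Z, lik, Y, Zp)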

    def inference_minibatch(self, kern, X, Z, likelihood, Y, Zp):
        """
        The second phase of inference: computing the derivatives over a minibatch of Y.
        Compute: dL_dpsi0, dL_dpsi1, dL_dpsi2, dL_dthetaL
        Returns a flag showing whether it reached the end of Y (isEnd).
        """

        num_data, output_dim = Y.shape

        # see whether we've got a different noise variance for each datum
        beta = 1./np.fmax(likelihood.variance, 1e-6)
        het_noise = beta.size > 1
        # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
        #self.YYTfactor = beta*self.get_YYTfactor(Y)
        if self.Y_speedup and not het_noise:
            YYT_factor = self.get_YYTfactor(Y)
        else:
            YYT_factor = Y

        n_start = self.batch_pos
        batchsize = num_data if self.batchsize is None else self.batchsize
        n_end = min(batchsize+n_start, num_data)
        if n_end == num_data:
            isEnd = True
            self.batch_pos = 0
        else:
            isEnd = False
            self.batch_pos = n_end

        if batchsize == num_data:
            Y_slice = YYT_factor
            X_slice = X
        else:
            Y_slice = YYT_factor[n_start:n_end]
            X_slice = X[n_start:n_end]

        psi0 = kern._Kdiag(X_slice) #Kffdiag
        psi1 = kern.K(X_slice, Z) #Kfu
        betapsi1 = np.einsum('n,nm->nm', beta, psi1)

        X_slice = X_slice.values
        Z = Z.values

        Zp = Zp.gamma.values
        indX = np.int_(X_slice[:, -1])
        indZ = np.int_(Z[:, -1]) - Zp.shape[0]

        betaY = beta*Y_slice

        #======================================================================
        # Load Intermediate Results
        #======================================================================

        dL_dpsi2R = self.midRes['dL_dpsi2R']
        v = self.midRes['v']

        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        dL_dpsi0 = -.5*output_dim*(beta * Zp[indX, :]) #XxQ #TODO: Check this gradient

        dL_dpsi1 = np.dot(betaY, v.T)
        dL_dEZp = psi1*dL_dpsi1
        dL_dpsi1 = Zp[np.ix_(indX, indZ)]*dL_dpsi1
        dL_dgamma = np.zeros(Zp.shape)
        for d in np.unique(indX):
            indd = indX == d
            betapsi1d = betapsi1[indd, :]
            psi1d = psi1[indd, :]
            Zpd = Zp[d, :]
            Zp2 = Zpd[:, None]*Zpd[None, :] - np.diag(np.power(Zpd, 2)) + np.diag(Zpd)
            dL_dpsi1[indd, :] += np.dot(betapsi1d, Zp2[np.ix_(indZ, indZ)] * dL_dpsi2R)*2.

            dL_EZp2 = dL_dpsi2R * (np.dot(psi1d.T, psi1d) * beta)*2. # Zpd*Kufd*Kfud*beta
            # The gradient of the likelihood w.r.t. gamma is computed here
            EZ = Zp[d, indZ]
            for q in range(Zp.shape[1]):
                EZt = EZ.copy()
                indq = indZ == q
                EZt[indq] = .5
                dL_dgamma[d, q] = np.sum(dL_dEZp[np.ix_(indd, indq)]) + np.sum(dL_EZp2[:, indq]*EZt[:, None]) -\
                    .5*beta*(np.sum(psi0[indd, q]))

        #======================================================================
        # Compute dL_dthetaL
        #======================================================================
        if isEnd:
            dL_dthetaL = self.midRes['dL_dthetaL']
        else:
            dL_dthetaL = 0.

        grad_dict = {'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL,
                     'dL_dgamma': dL_dgamma}

        return isEnd, (n_start, n_end), grad_dict
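
    # Worked example of the batch bookkeeping above (a plain trace of the
    # code, no extra API assumed): with num_data=25 and self.batchsize=10,
    # successive calls yield (n_start, n_end) = (0, 10), (10, 20), (20, 25);
    # the third call hits n_end == num_data, so it returns isEnd=True and
    # resets self.batch_pos to 0 for the next sweep through Y.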


def update_gradients(model, mpi_comm=None):
    if mpi_comm is None:
        Y = model.Y
        X = model.X
    else:
        Y = model.Y_local
        X = model.X[model.N_range[0]:model.N_range[1]]

    model._log_marginal_likelihood, dL_dKmm, model.posterior = model.inference_method.inference_likelihood(model.kern, X, model.Z, model.likelihood, Y, model.Zp)

    het_noise = model.likelihood.variance.size > 1

    if het_noise:
        dL_dthetaL = np.empty((model.Y.shape[0],))
    else:
        dL_dthetaL = np.float64(0.)

    kern_grad = model.kern.gradient.copy()
    kern_grad[:] = 0.
    model.Z.gradient = 0.
    gamma_gradient = model.Zp.gamma.copy()
    gamma_gradient[:] = 0.

    isEnd = False
    while not isEnd:
        isEnd, n_range, grad_dict = model.inference_method.inference_minibatch(model.kern, X, model.Z, model.likelihood, Y, model.Zp)

        if (n_range[1]-n_range[0]) == X.shape[0]:
            X_slice = X
        elif mpi_comm is None:
            X_slice = model.X[n_range[0]:n_range[1]]
        else:
            X_slice = model.X[model.N_range[0]+n_range[0]:model.N_range[0]+n_range[1]]

        # gradients w.r.t. kernel
        model.kern.update_gradients_diag(grad_dict['dL_dKdiag'], X_slice)
        kern_grad += model.kern.gradient

        model.kern.update_gradients_full(grad_dict['dL_dKnm'], X_slice, model.Z)
        kern_grad += model.kern.gradient

        # gradients w.r.t. Z
        model.Z.gradient += model.kern.gradients_X(grad_dict['dL_dKnm'].T, model.Z, X_slice)

        # gradients w.r.t. the posterior parameters of Zp
        gamma_gradient += grad_dict['dL_dgamma']

        if het_noise:
            dL_dthetaL[n_range[0]:n_range[1]] = grad_dict['dL_dthetaL']
        else:
            dL_dthetaL += grad_dict['dL_dthetaL']

    # Gather the gradients from multiple MPI nodes
    if mpi_comm is not None:
        from mpi4py import MPI
        if het_noise:
            raise NotImplementedError("het_noise not implemented!")
        kern_grad_all = kern_grad.copy()
        Z_grad_all = model.Z.gradient.copy()
        gamma_grad_all = gamma_gradient.copy()
        mpi_comm.Allreduce([kern_grad, MPI.DOUBLE], [kern_grad_all, MPI.DOUBLE])
        mpi_comm.Allreduce([model.Z.gradient, MPI.DOUBLE], [Z_grad_all, MPI.DOUBLE])
        mpi_comm.Allreduce([gamma_gradient, MPI.DOUBLE], [gamma_grad_all, MPI.DOUBLE])
        kern_grad = kern_grad_all
        model.Z.gradient = Z_grad_all
        gamma_gradient = gamma_grad_all

    # gradients w.r.t. kernel
    model.kern.update_gradients_full(dL_dKmm, model.Z, None)
    model.kern.gradient += kern_grad

    # gradients w.r.t. Z
    model.Z.gradient += model.kern.gradients_X(dL_dKmm, model.Z)

    # gradient w.r.t. gamma
    model.Zp.gamma.gradient = gamma_gradient

    # Update the log-likelihood with the KL divergence term
    KL_div = model.variational_prior.KL_divergence(model.Zp)
    # update the gradients for the KL divergence
    model.variational_prior.update_gradients_KL(model.Zp)

    model._log_marginal_likelihood += KL_div

    # dL_dthetaL
    model.likelihood.update_gradients(dL_dthetaL)
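
# A hedged sketch of driving this under MPI (standard mpi4py usage; the
# model is assumed to have set up Y_local and N_range for its rank, as the
# code above relies on):
#
#   from mpi4py import MPI
#   comm = MPI.COMM_WORLD    # launch with e.g. `mpiexec -n 4 python script.py`
#   update_gradients(model, mpi_comm=comm)
#
# Each rank computes gradients on its local slice of Y, and the Allreduce
# calls above sum them so every rank ends up with identical global gradients.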


class IBPPosterior(Parameterized):
    '''
    The IBP distribution for variational approximations.
    '''
    def __init__(self, binary_prob, tau=None, name='Sensitivity space', *a, **kw):
        """
        binary_prob : the probability of including a latent function over an output.
        """
        super(IBPPosterior, self).__init__(name=name, *a, **kw)
        self.gamma = Param("binary_prob", binary_prob, Logistic(1e-10, 1. - 1e-10))
        self.link_parameter(self.gamma)
        if tau is not None:
            assert tau.size == 2*self.gamma.shape[1]
            self.tau = Param("tau", tau, Logexp())
        else:
            self.tau = Param("tau", np.ones((2, self.gamma.shape[1])), Logexp())
        self.link_parameter(self.tau)

    def set_gradients(self, grad):
        self.gamma.gradient, self.tau.gradient = grad

    def __getitem__(self, s):
        pass
        # if isinstance(s, (int, slice, tuple, list, np.ndarray)):
        #     import copy
        #     n = self.__new__(self.__class__, self.name)
        #     dc = self.__dict__.copy()
        #     dc['binary_prob'] = self.binary_prob[s]
        #     dc['tau'] = self.tau
        #     dc['parameters'] = copy.copy(self.parameters)
        #     n.__dict__.update(dc)
        #     n.parameters[dc['binary_prob']._parent_index_] = dc['binary_prob']
        #     n.parameters[dc['tau']._parent_index_] = dc['tau']
        #     n._gradient_array_ = None
        #     oversize = self.size - self.gamma.size - self.tau.size
        #     n.size = n.gamma.size + n.tau.size + oversize
        #     return n
        # else:
        #     return super(IBPPosterior, self).__getitem__(s)
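
# A minimal construction sketch (the shapes are illustrative assumptions):
# for D=3 outputs and Q=2 latent forces, gamma is D x Q and tau defaults to
# a 2 x Q matrix of ones.
#
#   import numpy as np
#   post = IBPPosterior(np.full((3, 2), .5))
#   post.gamma.shape   # (3, 2)
#   post.tau.shape     # (2, 2)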

class IBPPrior(VariationalPrior):
    def __init__(self, rank, alpha=2., name='IBPPrior', **kw):
        super(IBPPrior, self).__init__(name=name, **kw)
        from paramz.transformations import __fixed__
        self.rank = rank
        self.alpha = Param('alpha', alpha, __fixed__)
        self.link_parameter(self.alpha)

    def KL_divergence(self, variational_posterior):
        from scipy.special import gamma, psi

        eta, tau = variational_posterior.gamma.values, variational_posterior.tau.values

        sum_eta = np.sum(eta, axis=0) #sum_d gamma(d,q)
        D_seta = eta.shape[0] - sum_eta
        ad = self.alpha/eta.shape[1]
        psitau1 = psi(tau[0, :])
        psitau2 = psi(tau[1, :])
        sumtau = np.sum(tau, axis=0)
        psitau = psi(sumtau)
        # E[log p(z)]
        part1 = np.sum(sum_eta*psitau1 + D_seta*psitau2 - eta.shape[0]*psitau)

        # E[log p(pi)]
        part1 += (ad - 1.)*np.sum(psitau1 - psitau) + eta.shape[1]*np.log(ad)

        # H(z)
        part2 = np.sum(-(1.-eta)*np.log(1.-eta) - eta*np.log(eta))
        # H(pi)
        part2 += np.sum(np.log(gamma(tau[0, :])*gamma(tau[1, :])/gamma(sumtau))-(tau[0, :]-1.)*psitau1-(tau[1, :]-1.)*psitau2\
                        + (sumtau-2.)*psitau)

        return part1+part2
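
    # In equations (reconstructed from the terms above, so read as a sketch):
    # with q(z_dq) = Bernoulli(eta_dq) and q(pi_q) = Beta(tau_1q, tau_2q),
    # this returns
    #   E_q[log p(Z|pi)] + E_q[log p(pi)] + H[q(Z)] + H[q(pi)],
    # i.e. the negative KL divergence -KL(q(Z, pi) || p(Z, pi)), which is why
    # update_gradients above adds it to the log marginal likelihood bound.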

    def update_gradients_KL(self, variational_posterior):
        eta, tau = variational_posterior.gamma.values, variational_posterior.tau.values

        from scipy.special import psi, polygamma
        dgamma = np.log(1. - eta) - np.log(eta) + psi(tau[0, :]) - psi(tau[1, :])
        variational_posterior.gamma.gradient += dgamma
        ad = self.alpha/self.rank
        sumeta = np.sum(eta, axis=0)
        sumtau = np.sum(tau, axis=0)
        common = (-eta.shape[0] - (ad - 1.) + (sumtau - 2.))*polygamma(1, sumtau)
        variational_posterior.tau.gradient[0, :] = (sumeta + ad - tau[0, :])*polygamma(1, tau[0, :]) + common
        variational_posterior.tau.gradient[1, :] = ((eta.shape[0] - sumeta) - (tau[1, :] - 1.))*polygamma(1, tau[1, :])\
                                                   + common
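
    # As a check of dgamma (a derivation sketch, no extra functionality):
    # differentiating the eta-dependent terms of KL_divergence,
    #   d/d_eta [ eta*psi(tau_1) + (1-eta)*psi(tau_2)
    #             - eta*log(eta) - (1-eta)*log(1-eta) ]
    #     = psi(tau_1) - psi(tau_2) + log(1-eta) - log(eta),
    # which is exactly the expression assigned to dgamma above.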


class IBPLFM(SparseGP_MPI):
    """
    Indian Buffet Process for Latent Force Models

    :param Y: observed data (np.ndarray) or GPy.likelihood
    :type Y: np.ndarray | GPy.likelihood instance
    :param X: input data (np.ndarray) [X:values, X:index], where the index refers to the number of the output
    :type X: np.ndarray
    :param input_dim: latent dimensionality
    :type input_dim: int
    :param rank: number of latent functions
    """
    def __init__(self, X, Y, input_dim=2, output_dim=1, rank=1, Gamma=None, num_inducing=10,
                 Z=None, kernel=None, inference_method=None, likelihood=None, name='IBP for LFM', alpha=2., beta=2., connM=None, tau=None, mpi_comm=None, normalizer=False, variational_prior=None, **kwargs):

        if kernel is None:
            kernel = kern.EQ_ODE2(input_dim, output_dim, rank)

        if Gamma is None:
            gamma = np.empty((output_dim, rank)) # the posterior probabilities of the binary variables in the variational approximation
            gamma[:] = 0.5 + 0.1 * np.random.randn(output_dim, rank)
            gamma[gamma > 1.-1e-9] = 1.-1e-9
            gamma[gamma < 1e-9] = 1e-9
        else:
            gamma = Gamma.copy()

        #TODO: create a vector of inducing points
        if Z is None:
            Z = np.random.permutation(X.copy())[:num_inducing]
        assert Z.shape[1] == X.shape[1]

        if likelihood is None:
            likelihood = Gaussian()

        if inference_method is None:
            inference_method = VarDTC_minibatch_IBPLFM(mpi_comm=mpi_comm)

        # Definition of the variational terms
        self.variational_prior = IBPPrior(rank=rank, alpha=alpha) if variational_prior is None else variational_prior
        self.Zp = IBPPosterior(gamma, tau=tau)

        super(IBPLFM, self).__init__(X, Y, Z, kernel, likelihood, variational_prior=self.variational_prior, inference_method=inference_method, name=name, mpi_comm=mpi_comm, normalizer=normalizer, **kwargs)
        self.link_parameter(self.Zp, index=0)

    def set_Zp_gradients(self, Zp, Zp_grad):
        """Set the gradients of the posterior distribution of Zp in its specific form."""
        Zp.gamma.gradient = Zp_grad

    def get_Zp_gradients(self, Zp):
        """Get the gradients of the posterior distribution of Zp in its specific form."""
        return Zp.gamma.gradient

    def _propogate_Zp_val(self):
        pass

    def parameters_changed(self):
        #super(IBPLFM, self).parameters_changed()
        if isinstance(self.inference_method, VarDTC_minibatch_IBPLFM):
            update_gradients(self, mpi_comm=self.mpi_comm)
            return

        # Add the KL divergence term
        self._log_marginal_likelihood += self.variational_prior.KL_divergence(self.Zp)
        #TODO: change the following according to this variational distribution
        #self.Zp.gamma.gradient = self.

        # update the gradients for the KL divergence
        self.variational_prior.update_gradients_KL(self.Zp)
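
# A hedged end-to-end sketch (all shapes and values are illustrative
# assumptions; per the docstring, the last column of X indexes which output
# each row belongs to):
#
#   import numpy as np
#   t = np.linspace(0., 1., 50)[:, None]
#   idx = np.random.randint(0, 3, (50, 1))   # 3 outputs
#   X = np.hstack([t, idx])
#   Y = np.random.randn(50, 1)
#   m = IBPLFM(X, Y, input_dim=1, output_dim=3, rank=2, num_inducing=10)
#   m.optimize(messages=True)
#   print(m.Zp.gamma)   # posterior inclusion probabilities (3 x 2)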


@ -24,9 +24,9 @@ class Test(unittest.TestCase):
        k = GPy.kern.RBF(1)
        m = GPy.models.BayesianGPLVM(self.Y, 1, kernel=k)
        mu, var = m.predict(m.X)
        X = m.X.copy()
        X = m.X
        Xnew = NormalPosterior(m.X.mean[:10].copy(), m.X.variance[:10].copy())
        m.set_XY(Xnew, m.Y[:10])
        m.set_XY(Xnew, m.Y[:10].copy())
        assert(m.checkgrad())
        m.set_XY(X, self.Y)
        mu2, var2 = m.predict(m.X)

@ -40,7 +40,7 @@ class Test(unittest.TestCase):
        mu, var = m.predict(m.X)
        X = m.X.copy()
        Xnew = X[:10].copy()
        m.set_XY(Xnew, m.Y[:10])
        m.set_XY(Xnew, m.Y[:10].copy())
        assert(m.checkgrad())
        m.set_XY(X, self.Y)
        mu2, var2 = m.predict(m.X)

@ -1,5 +1,5 @@
[bumpversion]
current_version = 1.0.7
current_version = 1.0.9
tag = False
commit = True