[huge merge] trying to merge old master and master

2026-05-07 11:02:38 +02:00 · 2014-11-21 16:17:03 +00:00 · 2014-11-21 16:17:03 +00:00 · 180650ec85
commit 180650ec85
parent 0f8dbba56d 4fd05439fc
308 changed files with 27071 additions and 24550 deletions
--- a/GPy/kern/_src/ODE_UY.py
+++ b/GPy/kern/_src/ODE_UY.py
@ -0,0 +1,282 @@
+# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from kern import Kern
+from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp
+import numpy as np
+from independent_outputs import index_to_slices
+
+class ODE_UY(Kern):
+    def __init__(self, input_dim, variance_U=3., variance_Y=1., lengthscale_U=1., lengthscale_Y=1., active_dims=None, name='ode_uy'):
+        """
+        Create the kernel and register its four parameters.
+
+        :param input_dim: number of input columns; must be 2 (enforced by the assert below)
+        :param variance_U: initial value for the ``variance_U`` parameter
+        :param variance_Y: initial value for the ``variance_Y`` parameter
+        :param lengthscale_U: initial value for the ``lengthscale_U`` parameter
+        :param lengthscale_Y: initial value for the ``lengthscale_Y`` parameter
+        :param active_dims: forwarded unchanged to ``Kern.__init__``
+        :param name: forwarded unchanged to ``Kern.__init__``
+        """
+        assert input_dim ==2, "only defined for 2 input dims"
+        super(ODE_UY, self).__init__(input_dim, active_dims, name)
+
+        # Every parameter is wrapped in a Logexp() transformation.
+        self.variance_Y = Param('variance_Y', variance_Y, Logexp())
+        # The U parameters must be seeded from their own arguments; they were
+        # previously seeded from the Y arguments, which made variance_U and
+        # lengthscale_U constructor arguments have no effect.
+        self.variance_U = Param('variance_U', variance_U, Logexp())
+        self.lengthscale_Y = Param('lengthscale_Y', lengthscale_Y, Logexp())
+        self.lengthscale_U = Param('lengthscale_U', lengthscale_U, Logexp())
+
+        self.link_parameters(self.variance_Y, self.variance_U, self.lengthscale_Y, self.lengthscale_U)
+
+    def K(self, X, X2=None):
+        """
+        Compute the covariance matrix between X and X2 (or X with itself).
+
+        The last column of each input is handed to ``index_to_slices`` to
+        group rows by output index; the remaining columns are used as the
+        input locations. Group 0 is treated as U and group 1 as Y:
+
+        * (0, 0) blocks use ``kuu``
+        * (0, 1) blocks use ``kuyp``/``kuyn`` depending on the sign of the distance
+        * (1, 1) blocks use ``kyy``
+        * every other combination uses ``kyup``/``kyun``
+
+        :param X: input array whose last column is the output index
+        :param X2: optional second input array with the same layout; defaults to X
+        :returns: array of shape (X.shape[0], X2.shape[0])
+        """
+        # model :   a * dy/dt + b * y = U
+        #lu=sqrt(3)/theta1  ly=1/theta2  theta2= a/b :thetay   sigma2=1/(2ab) :sigmay
+
+        X,slices = X[:,:-1],index_to_slices(X[:,-1])
+        if X2 is None:
+            X2,slices2 = X,slices
+            K = np.zeros((X.shape[0], X.shape[0]))
+        else:
+            X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
+            K = np.zeros((X.shape[0], X2.shape[0]))
+
+
+        #rdist = X[:,0][:,None] - X2[:,0][:,None].T
+        # NOTE(review): relies on broadcasting; presumably X and X2 have a single
+        # remaining column here so this yields all pairwise signed differences.
+        rdist = X - X2.T
+        ly=1/self.lengthscale_Y
+        lu=np.sqrt(3)/self.lengthscale_U
+        #iu=self.input_lengthU  #dimention of U
+        Vu=self.variance_U
+        Vy=self.variance_Y
+        #Vy=ly/2
+        #stop
+
+
+        # kernel for kuu  matern3/2
+        kuu = lambda dist:Vu * (1 + lu* np.abs(dist)) * np.exp(-lu * np.abs(dist))
+
+        # kernel for kyy
+        k1 = lambda dist:np.exp(-ly*np.abs(dist))*(2*lu+ly)/(lu+ly)**2
+        k2 = lambda dist:(np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
+        k3 = lambda dist:np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
+        kyy = lambda dist:Vu*Vy*(k1(dist) + k2(dist) + k3(dist))
+
+
+        # cross covariance function
+        kyu3 = lambda dist:np.exp(-lu*dist)/(lu+ly)*(1+lu*(dist+1/(lu+ly)))
+        #kyu3 = lambda dist: 0
+
+        k1cros = lambda dist:np.exp(ly*dist)/(lu-ly) * ( 1- np.exp( (lu-ly)*dist) + lu* ( dist*np.exp( (lu-ly)*dist ) + (1- np.exp( (lu-ly)*dist ) ) /(lu-ly)   )    )
+        #k1cros = lambda dist:0
+
+        k2cros = lambda dist:np.exp(ly*dist)*( 1/(lu+ly) + lu/(lu+ly)**2 )
+        #k2cros = lambda dist:0
+
+        # scale factor applied to every cross-covariance term
+        Vyu=np.sqrt(Vy*ly*2)
+
+        # cross covariance kuy
+        kuyp = lambda dist:Vu*Vyu*(kyu3(dist))       #t>0 kuy
+        kuyn = lambda dist:Vu*Vyu*(k1cros(dist)+k2cros(dist))      #t<0 kuy
+        # cross covariance kyu
+        kyup = lambda dist:Vu*Vyu*(k1cros(-dist)+k2cros(-dist))    #t>0 kyu
+        kyun = lambda dist:Vu*Vyu*(kyu3(-dist))       #t<0 kyu
+
+
+        # Fill K block by block; i and j are the output-index groups of the
+        # row and column slices respectively.
+        for i, s1 in enumerate(slices):
+            for j, s2 in enumerate(slices2):
+                for ss1 in s1:
+                    for ss2 in s2:
+                        if i==0 and j==0:
+                            K[ss1,ss2] = kuu(np.abs(rdist[ss1,ss2]))
+                        elif i==0 and j==1:
+                            #K[ss1,ss2]=  np.where(  rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[ss1,ss2]) )   )
+                            K[ss1,ss2]=  np.where(  rdist[ss1,ss2]>0 , kuyp(rdist[ss1,ss2]), kuyn(rdist[ss1,ss2] )   )
+                        elif i==1 and j==1:
+                            K[ss1,ss2] = kyy(np.abs(rdist[ss1,ss2]))
+                        else:
+                            # reached for (1, 0) and also for any group index above 1
+                            #K[ss1,ss2]= 0
+                            #K[ss1,ss2]= np.where(  rdist[ss1,ss2]>0 , kyup(np.abs(rdist[ss1,ss2])), kyun(np.abs(rdist[ss1,ss2]) )   )
+                            K[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , kyup(rdist[ss1,ss2]), kyun(rdist[ss1,ss2] )   )
+        return K
+
+
+
+    def Kdiag(self, X):
+        """
+        Compute the diagonal of the covariance matrix associated to X.
+
+        The last column of X is handed to ``index_to_slices`` to group rows by
+        output index. Rows in group 0 receive ``variance_U``; rows in group 1
+        receive ``variance_U * variance_Y * (k1 + k2 + k3)``, i.e. the kyy
+        terms of ``K`` evaluated at zero distance.
+
+        :param X: input array whose last column is the output index
+        :returns: 1-d array of length X.shape[0]
+        :raises ValueError: if a non-empty slice group has an index other than 0 or 1
+        """
+        ly = 1/self.lengthscale_Y
+        lu = np.sqrt(3)/self.lengthscale_U
+
+        Vu = self.variance_U
+        Vy = self.variance_Y
+
+        # kyy terms at dist == 0; k2 is kept in the same form as in K so the
+        # diagonal agrees with K(X, X) term by term.
+        k1 = (2*lu+ly)/(lu+ly)**2
+        k2 = (ly-2*lu + 2*lu-ly ) / (ly-lu)**2
+        k3 = 1/(lu+ly) + (lu)/(lu+ly)**2
+
+        # Local is named ``diag`` so it does not shadow the method name.
+        diag = np.zeros(X.shape[0])
+        for i, ss1 in enumerate(index_to_slices(X[:,-1])):
+            for s1 in ss1:
+                if i==0:
+                    diag[s1] += Vu
+                elif i==1:
+                    diag[s1] += Vu*Vy*(k1+k2+k3)
+                else:
+                    # Call form of raise is valid on both Python 2 and Python 3.
+                    raise ValueError("invalid input/output index")
+        return diag
+
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        """
+        Derivative of the covariance matrix with respect to the parameters.
+
+        Builds, block by block, the element-wise derivatives of K with respect
+        to ``variance_U`` (dkUdvar), ``variance_Y`` (dkYdvar), ``lu`` (dktheta1)
+        and ``ly`` (dktheta2), then stores ``np.sum(derivative * dL_dK)`` in the
+        ``.gradient`` attribute of each parameter. The lengthscale gradients
+        carry an extra chain-rule factor because ``lu = sqrt(3)/lengthscale_U``
+        and ``ly = 1/lengthscale_Y``.
+
+        :param dL_dK: array multiplied element-wise with each derivative matrix
+        :param X: input array whose last column is the output index
+        :param X2: optional second input array with the same layout; defaults to X
+        """
+        X,slices = X[:,:-1],index_to_slices(X[:,-1])
+        if X2 is None:
+            X2,slices2 = X,slices
+        else:
+            X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
+        #rdist = X[:,0][:,None] - X2[:,0][:,None].T
+
+        rdist = X - X2.T
+        ly=1/self.lengthscale_Y
+        lu=np.sqrt(3)/self.lengthscale_U
+
+        Vu=self.variance_U
+        Vy=self.variance_Y
+        Vyu = np.sqrt(Vy*ly*2)
+        # derivatives of Vyu = sqrt(2*Vy*ly) with respect to ly and Vy
+        dVdly = 0.5/np.sqrt(ly)*np.sqrt(2*Vy)
+        dVdVy = 0.5/np.sqrt(Vy)*np.sqrt(2*ly)
+
+        # derivative buffers share the shape of the distance matrix
+        rd=rdist.shape
+        dktheta1 = np.zeros(rd)
+        dktheta2 = np.zeros(rd)
+        dkUdvar = np.zeros(rd)
+        dkYdvar = np.zeros(rd)
+
+        # dk dtheta for UU
+        UUdtheta1 = lambda dist: np.exp(-lu* dist)*dist + (-dist)*np.exp(-lu* dist)*(1+lu*dist)
+        UUdtheta2 = lambda dist: 0
+        #UUdvar = lambda dist: (1 + lu*dist)*np.exp(-lu*dist)
+        UUdvar = lambda dist: (1 + lu* np.abs(dist)) * np.exp(-lu * np.abs(dist))
+
+        # dk dtheta for YY
+
+        dk1theta1 = lambda dist: np.exp(-ly*dist)*2*(-lu)/(lu+ly)**3
+
+        dk2theta1 = lambda dist: (1.0)*(
+            np.exp(-lu*dist)*dist*(-ly+2*lu-lu*ly*dist+dist*lu**2)*(ly-lu)**(-2) + np.exp(-lu*dist)*(-2+ly*dist-2*dist*lu)*(ly-lu)**(-2)
+            +np.exp(-dist*lu)*(ly-2*lu+ly*lu*dist-dist*lu**2)*2*(ly-lu)**(-3)
+            +np.exp(-dist*ly)*2*(ly-lu)**(-2)
+            +np.exp(-dist*ly)*2*(2*lu-ly)*(ly-lu)**(-3)
+            )
+
+        dk3theta1 = lambda dist: np.exp(-dist*lu)*(lu+ly)**(-2)*((2*lu+ly+dist*lu**2+lu*ly*dist)*(-dist-2/(lu+ly))+2+2*lu*dist+ly*dist)
+
+        #dktheta1 = lambda dist: self.variance_U*self.variance_Y*(dk1theta1+dk2theta1+dk3theta1)
+
+
+
+
+        dk1theta2 = lambda dist: np.exp(-ly*dist) * ((lu+ly)**(-2)) * (  (-dist)*(2*lu+ly)  +  1  +  (-2)*(2*lu+ly)/(lu+ly)  )
+
+        dk2theta2 =lambda dist:  1*(
+            np.exp(-dist*lu)*(ly-lu)**(-2) * ( 1+lu*dist+(-2)*(ly-2*lu+lu*ly*dist-dist*lu**2)*(ly-lu)**(-1) )
+            +np.exp(-dist*ly)*(ly-lu)**(-2) * ( (-dist)*(2*lu-ly) -1+(2*lu-ly)*(-2)*(ly-lu)**(-1) )
+            )
+
+        dk3theta2 = lambda dist: np.exp(-dist*lu) * (-3*lu-ly-dist*lu**2-lu*ly*dist)/(lu+ly)**3
+
+        #dktheta2 = lambda dist: self.variance_U*self.variance_Y*(dk1theta2 + dk2theta2 +dk3theta2)
+
+        # kyy kernel
+
+        k1 = lambda dist: np.exp(-ly*dist)*(2*lu+ly)/(lu+ly)**2
+        k2 = lambda dist: (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
+        k3 = lambda dist: np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
+        #dkdvar = k1+k2+k3
+
+
+
+        # cross covariance function
+        kyu3 = lambda dist:np.exp(-lu*dist)/(lu+ly)*(1+lu*(dist+1/(lu+ly)))
+
+        k1cros = lambda dist:np.exp(ly*dist)/(lu-ly) * ( 1- np.exp( (lu-ly)*dist) + lu* ( dist*np.exp( (lu-ly)*dist ) + (1- np.exp( (lu-ly)*dist ) ) /(lu-ly)   )    )
+
+        k2cros = lambda dist:np.exp(ly*dist)*( 1/(lu+ly) + lu/(lu+ly)**2 )
+        # cross covariance kuy
+        # (unlike in K, these are not pre-multiplied by Vu*Vyu; the scale
+        # factors are applied explicitly in the loop below)
+        kuyp = lambda dist:(kyu3(dist))       #t>0 kuy
+        kuyn = lambda dist:(k1cros(dist)+k2cros(dist))      #t<0 kuy
+        # cross covariance kyu
+        kyup = lambda dist:(k1cros(-dist)+k2cros(-dist))    #t>0 kyu
+        kyun = lambda dist:(kyu3(-dist))       #t<0 kyu
+
+        # dk dtheta for UY
+
+
+        dkyu3dtheta2 = lambda dist: np.exp(-lu*dist) * ( (-1)*(lu+ly)**(-2)*(1+lu*dist+lu*(lu+ly)**(-1)) + (lu+ly)**(-1)*(-lu)*(lu+ly)**(-2) )
+        dkyu3dtheta1 = lambda dist: np.exp(-lu*dist)*(lu+ly)**(-1)* ( (-dist)*(1+dist*lu+lu*(lu+ly)**(-1)) -\
+         (lu+ly)**(-1)*(1+dist*lu+lu*(lu+ly)**(-1)) +dist+(lu+ly)**(-1)-lu*(lu+ly)**(-2) )
+
+        dkcros2dtheta1 = lambda dist: np.exp(ly*dist)* ( -(ly+lu)**(-2) + (ly+lu)**(-2) + (-2)*lu*(lu+ly)**(-3)  )
+        dkcros2dtheta2 = lambda dist: np.exp(ly*dist)*dist* ( (ly+lu)**(-1) + lu*(lu+ly)**(-2) ) + \
+                                      np.exp(ly*dist)*( -(lu+ly)**(-2) + lu*(-2)*(lu+ly)**(-3)  )
+
+        dkcros1dtheta1 = lambda dist: np.exp(ly*dist)*(     -(lu-ly)**(-2)*(  1-np.exp((lu-ly)*dist) + lu*dist*np.exp((lu-ly)*dist)+ \
+          lu*(1-np.exp((lu-ly)*dist))/(lu-ly)  )  +  (lu-ly)**(-1)*(  -np.exp( (lu-ly)*dist )*dist + dist*np.exp( (lu-ly)*dist)+\
+          lu*dist**2*np.exp((lu-ly)*dist)+(1-np.exp((lu-ly)*dist))/(lu-ly) - lu*np.exp((lu-ly)*dist)*dist/(lu-ly) -\
+          lu*(1-np.exp((lu-ly)*dist))/(lu-ly)**2  )   )
+
+        dkcros1dtheta2 = lambda t: np.exp(ly*t)*t/(lu-ly)*( 1-np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t)+\
+            lu*(1-np.exp((lu-ly)*t))/(lu-ly)  )+\
+            np.exp(ly*t)/(lu-ly)**2* ( 1-np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t) + lu*( 1-np.exp((lu-ly)*t) )/(lu-ly)  )+\
+            np.exp(ly*t)/(lu-ly)*( np.exp((lu-ly)*t)*t -lu*t*t*np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t)/(lu-ly)+\
+            lu*( 1-np.exp((lu-ly)*t) )/(lu-ly)**2 )
+
+        dkuypdtheta1 = lambda dist:(dkyu3dtheta1(dist))       #t>0 kuy
+        dkuyndtheta1 = lambda dist:(dkcros1dtheta1(dist)+dkcros2dtheta1(dist))      #t<0 kuy
+        # cross covariance kyu
+        dkyupdtheta1 = lambda dist:(dkcros1dtheta1(-dist)+dkcros2dtheta1(-dist))    #t>0 kyu
+        dkyundtheta1 = lambda dist:(dkyu3dtheta1(-dist))       #t<0 kyu
+
+        dkuypdtheta2 = lambda dist:(dkyu3dtheta2(dist))       #t>0 kuy
+        dkuyndtheta2 = lambda dist:(dkcros1dtheta2(dist)+dkcros2dtheta2(dist))      #t<0 kuy
+        # cross covariance kyu
+        dkyupdtheta2 = lambda dist:(dkcros1dtheta2(-dist)+dkcros2dtheta2(-dist))    #t>0 kyu
+        dkyundtheta2 = lambda dist:(dkyu3dtheta2(-dist))       #t<0 kyu
+
+
+        # Fill the derivative matrices block by block, mirroring the block
+        # layout used in K (i, j are the row/column output-index groups).
+        for i, s1 in enumerate(slices):
+            for j, s2 in enumerate(slices2):
+                for ss1 in s1:
+                    for ss2 in s2:
+                        if i==0 and j==0:
+                            #target[ss1,ss2] = kuu(np.abs(rdist[ss1,ss2]))
+                            dktheta1[ss1,ss2] = Vu*UUdtheta1(np.abs(rdist[ss1,ss2]))
+                            dktheta2[ss1,ss2] = 0
+                            dkUdvar[ss1,ss2] = UUdvar(np.abs(rdist[ss1,ss2]))
+                            dkYdvar[ss1,ss2] = 0
+                        elif i==0 and j==1:
+                            ########target[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[s1[0],s2[0]]) )   )
+                            #np.where(  rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[s1[0],s2[0]]) )   )
+                            #dktheta1[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , self.variance_U*self.variance_Y*dkcrtheta1(np.abs(rdist[ss1,ss2])) ,self.variance_U*self.variance_Y*(dk1theta1(np.abs(rdist[ss1,ss2]))+dk2theta1(np.abs(rdist[ss1,ss2])))    )
+                            #dktheta2[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , self.variance_U*self.variance_Y*dkcrtheta2(np.abs(rdist[ss1,ss2])) ,self.variance_U*self.variance_Y*(dk1theta2(np.abs(rdist[ss1,ss2]))+dk2theta2(np.abs(rdist[ss1,ss2])))    )
+                            dktheta1[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , Vu*Vyu*dkuypdtheta1(rdist[ss1,ss2]),Vu*Vyu*dkuyndtheta1(rdist[ss1,ss2]) )
+                            dkUdvar[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , Vyu*kuyp(rdist[ss1,ss2]), Vyu* kuyn(rdist[ss1,ss2])  )
+                            dktheta2[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , Vu*Vyu*dkuypdtheta2(rdist[ss1,ss2])+Vu*dVdly*kuyp(rdist[ss1,ss2]),Vu*Vyu*dkuyndtheta2(rdist[ss1,ss2])+Vu*dVdly*kuyn(rdist[ss1,ss2]) )
+                            dkYdvar[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , Vu*dVdVy*kuyp(rdist[ss1,ss2]), Vu*dVdVy* kuyn(rdist[ss1,ss2])  )
+                        elif i==1 and j==1:
+                            #target[ss1,ss2] = kyy(np.abs(rdist[ss1,ss2]))
+                            dktheta1[ss1,ss2] = self.variance_U*self.variance_Y*(dk1theta1(np.abs(rdist[ss1,ss2]))+dk2theta1(np.abs(rdist[ss1,ss2]))+dk3theta1(np.abs(rdist[ss1,ss2])))
+                            dktheta2[ss1,ss2] = self.variance_U*self.variance_Y*(dk1theta2(np.abs(rdist[ss1,ss2])) + dk2theta2(np.abs(rdist[ss1,ss2])) +dk3theta2(np.abs(rdist[ss1,ss2])))
+                            dkUdvar[ss1,ss2] = self.variance_Y*(k1(np.abs(rdist[ss1,ss2]))+k2(np.abs(rdist[ss1,ss2]))+k3(np.abs(rdist[ss1,ss2])) )
+                            dkYdvar[ss1,ss2] = self.variance_U*(k1(np.abs(rdist[ss1,ss2]))+k2(np.abs(rdist[ss1,ss2]))+k3(np.abs(rdist[ss1,ss2])) )
+                        else:
+                            #######target[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , kyup(np.abs(rdist[ss1,ss2])), kyun(np.abs(rdist[s1[0],s2[0]]) )   )
+                            #dktheta1[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 ,self.variance_U*self.variance_Y*(dk1theta1(np.abs(rdist[ss1,ss2]))+dk2theta1(np.abs(rdist[ss1,ss2]))) , self.variance_U*self.variance_Y*dkcrtheta1(np.abs(rdist[ss1,ss2])) )
+                            #dktheta2[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 ,self.variance_U*self.variance_Y*(dk1theta2(np.abs(rdist[ss1,ss2]))+dk2theta2(np.abs(rdist[ss1,ss2]))) , self.variance_U*self.variance_Y*dkcrtheta2(np.abs(rdist[ss1,ss2])) )
+                            dktheta1[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , Vu*Vyu*dkyupdtheta1(rdist[ss1,ss2]),Vu*Vyu*dkyundtheta1(rdist[ss1,ss2])  )
+                            dkUdvar[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , Vyu*kyup(rdist[ss1,ss2]),Vyu*kyun(rdist[ss1,ss2]))
+                            dktheta2[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , Vu*Vyu*dkyupdtheta2(rdist[ss1,ss2])+Vu*dVdly*kyup(rdist[ss1,ss2]),Vu*Vyu*dkyundtheta2(rdist[ss1,ss2])+Vu*dVdly*kyun(rdist[ss1,ss2])  )
+                            dkYdvar[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , Vu*dVdVy*kyup(rdist[ss1,ss2]), Vu*dVdVy*kyun(rdist[ss1,ss2]))
+
+        #stop
+        self.variance_U.gradient = np.sum(dkUdvar * dL_dK)     # Vu
+
+        self.variance_Y.gradient = np.sum(dkYdvar * dL_dK)     # Vy
+
+        # chain rule: d lu / d lengthscale_U = -sqrt(3) * lengthscale_U**(-2)
+        self.lengthscale_U.gradient = np.sum(dktheta1*(-np.sqrt(3)*self.lengthscale_U**(-2))* dL_dK)     #lu
+
+        # chain rule: d ly / d lengthscale_Y = -lengthscale_Y**(-2)
+        self.lengthscale_Y.gradient = np.sum(dktheta2*(-self.lengthscale_Y**(-2)) * dL_dK)              #ly
+
--- a/GPy/kern/_src/ODE_UYC.py
+++ b/GPy/kern/_src/ODE_UYC.py
@ -0,0 +1,290 @@
+# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from kern import Kern
+from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp
+import numpy as np
+from independent_outputs import index_to_slices
+
+class ODE_UYC(Kern):
+    def __init__(self, input_dim, variance_U=3., variance_Y=1., lengthscale_U=1., lengthscale_Y=1., ubias =1. ,active_dims=None, name='ode_uyc'):
+        """
+        Create the kernel and register its five parameters.
+
+        :param input_dim: number of input columns; must be 2 (enforced by the assert below)
+        :param variance_U: initial value for the ``variance_U`` parameter
+        :param variance_Y: initial value for the ``variance_Y`` parameter
+        :param lengthscale_U: initial value for the ``lengthscale_U`` parameter
+        :param lengthscale_Y: initial value for the ``lengthscale_Y`` parameter
+        :param ubias: initial value for the ``ubias`` parameter
+        :param active_dims: forwarded unchanged to ``Kern.__init__``
+        :param name: forwarded unchanged to ``Kern.__init__``
+        """
+        assert input_dim ==2, "only defined for 2 input dims"
+        super(ODE_UYC, self).__init__(input_dim, active_dims, name)
+
+        # (attribute name, initial value) pairs, in creation order; each one
+        # becomes a Param wrapped in a Logexp() transformation.
+        initial_values = (
+            ('variance_Y', variance_Y),
+            ('variance_U', variance_U),
+            ('lengthscale_Y', lengthscale_Y),
+            ('lengthscale_U', lengthscale_U),
+            ('ubias', ubias),
+        )
+        for param_name, start_value in initial_values:
+            setattr(self, param_name, Param(param_name, start_value, Logexp()))
+
+        self.add_parameters(
+            self.variance_Y,
+            self.variance_U,
+            self.lengthscale_Y,
+            self.lengthscale_U,
+            self.ubias,
+        )
+
+    def K(self, X, X2=None):
+        """
+        Compute the covariance matrix between X and X2 (or X with itself).
+
+        The last column of each input is handed to ``index_to_slices`` to
+        group rows by output index; the remaining columns are used as the
+        input locations. Group 0 is treated as U and group 1 as Y:
+
+        * (0, 0) blocks use ``kuu``, which includes the additive ``ubias`` term
+        * (0, 1) blocks use ``kuyp``/``kuyn`` depending on the sign of the distance
+        * (1, 1) blocks use ``kyy``
+        * every other combination uses ``kyup``/``kyun``
+
+        :param X: input array whose last column is the output index
+        :param X2: optional second input array with the same layout; defaults to X
+        :returns: array of shape (X.shape[0], X2.shape[0])
+        """
+        # model :   a * dy/dt + b * y = U
+        #lu=sqrt(3)/theta1  ly=1/theta2  theta2= a/b :thetay   sigma2=1/(2ab) :sigmay
+
+        X,slices = X[:,:-1],index_to_slices(X[:,-1])
+        if X2 is None:
+            X2,slices2 = X,slices
+            K = np.zeros((X.shape[0], X.shape[0]))
+        else:
+            X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
+            K = np.zeros((X.shape[0], X2.shape[0]))
+
+        #stop
+        #rdist = X[:,0][:,None] - X2[:,0][:,None].T
+        # NOTE(review): relies on broadcasting; presumably X and X2 have a single
+        # remaining column here so this yields all pairwise signed differences.
+        rdist = X - X2.T
+        ly=1/self.lengthscale_Y
+        lu=np.sqrt(3)/self.lengthscale_U
+        #iu=self.input_lengthU  #dimention of U
+        Vu=self.variance_U
+        Vy=self.variance_Y
+        #Vy=ly/2
+        #stop
+
+
+        # kernel for kuu  matern3/2
+        kuu = lambda dist:Vu * (1 + lu* np.abs(dist)) * np.exp(-lu * np.abs(dist)) +self.ubias
+
+        # kernel for kyy
+        k1 = lambda dist:np.exp(-ly*np.abs(dist))*(2*lu+ly)/(lu+ly)**2
+        k2 = lambda dist:(np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
+        k3 = lambda dist:np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
+        kyy = lambda dist:Vu*Vy*(k1(dist) + k2(dist) + k3(dist))
+
+
+        # cross covariance function
+        kyu3 = lambda dist:np.exp(-lu*dist)/(lu+ly)*(1+lu*(dist+1/(lu+ly)))
+        #kyu3 = lambda dist: 0
+
+        k1cros = lambda dist:np.exp(ly*dist)/(lu-ly) * ( 1- np.exp( (lu-ly)*dist) + lu* ( dist*np.exp( (lu-ly)*dist ) + (1- np.exp( (lu-ly)*dist ) ) /(lu-ly)   )    )
+        #k1cros = lambda dist:0
+
+        k2cros = lambda dist:np.exp(ly*dist)*( 1/(lu+ly) + lu/(lu+ly)**2 )
+        #k2cros = lambda dist:0
+
+        # scale factor applied to every cross-covariance term
+        Vyu=np.sqrt(Vy*ly*2)
+
+        # cross covariance kuy
+        kuyp = lambda dist:Vu*Vyu*(kyu3(dist))       #t>0 kuy
+        kuyn = lambda dist:Vu*Vyu*(k1cros(dist)+k2cros(dist))      #t<0 kuy
+        # cross covariance kyu
+        kyup = lambda dist:Vu*Vyu*(k1cros(-dist)+k2cros(-dist))    #t>0 kyu
+        kyun = lambda dist:Vu*Vyu*(kyu3(-dist))       #t<0 kyu
+
+
+        # Fill K block by block; i and j are the output-index groups of the
+        # row and column slices respectively.
+        for i, s1 in enumerate(slices):
+            for j, s2 in enumerate(slices2):
+                for ss1 in s1:
+                    for ss2 in s2:
+                        if i==0 and j==0:
+                            K[ss1,ss2] = kuu(np.abs(rdist[ss1,ss2]))
+                        elif i==0 and j==1:
+                            #K[ss1,ss2]=  np.where(  rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[ss1,ss2]) )   )
+                            K[ss1,ss2]=  np.where(  rdist[ss1,ss2]>0 , kuyp(rdist[ss1,ss2]), kuyn(rdist[ss1,ss2] )   )
+                        elif i==1 and j==1:
+                            K[ss1,ss2] = kyy(np.abs(rdist[ss1,ss2]))
+                        else:
+                            # reached for (1, 0) and also for any group index above 1
+                            #K[ss1,ss2]= 0
+                            #K[ss1,ss2]= np.where(  rdist[ss1,ss2]>0 , kyup(np.abs(rdist[ss1,ss2])), kyun(np.abs(rdist[ss1,ss2]) )   )
+                            K[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , kyup(rdist[ss1,ss2]), kyun(rdist[ss1,ss2] )   )
+        return K
+
+
+
+    def Kdiag(self, X):
+        """
+        Compute the diagonal of the covariance matrix associated to X.
+
+        The last column of X is handed to ``index_to_slices`` to group rows by
+        output index. Rows in group 0 receive ``variance_U + ubias``; rows in
+        group 1 receive ``variance_U * variance_Y * (k1 + k2 + k3)``, i.e. the
+        kyy terms of ``K`` evaluated at zero distance.
+
+        :param X: input array whose last column is the output index
+        :returns: 1-d array of length X.shape[0]
+        :raises ValueError: if a non-empty slice group has an index other than 0 or 1
+        """
+        ly = 1/self.lengthscale_Y
+        lu = np.sqrt(3)/self.lengthscale_U
+
+        Vu = self.variance_U
+        Vy = self.variance_Y
+
+        # kyy terms at dist == 0; k2 is kept in the same form as in K so the
+        # diagonal agrees with K(X, X) term by term.
+        k1 = (2*lu+ly)/(lu+ly)**2
+        k2 = (ly-2*lu + 2*lu-ly ) / (ly-lu)**2
+        k3 = 1/(lu+ly) + (lu)/(lu+ly)**2
+
+        # Local is named ``diag`` so it does not shadow the method name.
+        diag = np.zeros(X.shape[0])
+        for i, ss1 in enumerate(index_to_slices(X[:,-1])):
+            for s1 in ss1:
+                if i==0:
+                    diag[s1] += Vu + self.ubias
+                elif i==1:
+                    diag[s1] += Vu*Vy*(k1+k2+k3)
+                else:
+                    # Call form of raise is valid on both Python 2 and Python 3.
+                    raise ValueError("invalid input/output index")
+        return diag
+
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        """
+        Derivative of the covariance matrix with respect to the parameters.
+
+        Builds, block by block, the element-wise derivatives of K with respect
+        to ``variance_U`` (dkUdvar), ``variance_Y`` (dkYdvar), ``lu`` (dktheta1),
+        ``ly`` (dktheta2) and ``ubias`` (dkdubias), then stores
+        ``np.sum(derivative * dL_dK)`` in the ``.gradient`` attribute of each
+        parameter. The lengthscale gradients carry an extra chain-rule factor
+        because ``lu = sqrt(3)/lengthscale_U`` and ``ly = 1/lengthscale_Y``.
+
+        :param dL_dK: array multiplied element-wise with each derivative matrix
+        :param X: input array whose last column is the output index
+        :param X2: optional second input array with the same layout; defaults to X
+        """
+        X,slices = X[:,:-1],index_to_slices(X[:,-1])
+        if X2 is None:
+            X2,slices2 = X,slices
+        else:
+            X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
+
+        rdist = X - X2.T
+        ly=1/self.lengthscale_Y
+        lu=np.sqrt(3)/self.lengthscale_U
+
+        Vu=self.variance_U
+        Vy=self.variance_Y
+        Vyu = np.sqrt(Vy*ly*2)
+        # derivatives of Vyu = sqrt(2*Vy*ly) with respect to ly and Vy
+        dVdly = 0.5/np.sqrt(ly)*np.sqrt(2*Vy)
+        dVdVy = 0.5/np.sqrt(Vy)*np.sqrt(2*ly)
+
+        # Derivative buffers must share the full shape of the distance matrix.
+        # They were previously allocated as square [rows, rows] arrays, which
+        # is the wrong shape whenever X2 has a different number of rows than X.
+        rd = rdist.shape
+        dktheta1 = np.zeros(rd)
+        dktheta2 = np.zeros(rd)
+        dkUdvar = np.zeros(rd)
+        dkYdvar = np.zeros(rd)
+
+        dkdubias = np.zeros(rd)
+
+        # dk dtheta for UU
+        UUdtheta1 = lambda dist: np.exp(-lu* dist)*dist + (-dist)*np.exp(-lu* dist)*(1+lu*dist)
+        UUdvar = lambda dist: (1 + lu* np.abs(dist)) * np.exp(-lu * np.abs(dist))
+
+        # dk dtheta for YY
+
+        dk1theta1 = lambda dist: np.exp(-ly*dist)*2*(-lu)/(lu+ly)**3
+
+        dk2theta1 = lambda dist: (1.0)*(
+            np.exp(-lu*dist)*dist*(-ly+2*lu-lu*ly*dist+dist*lu**2)*(ly-lu)**(-2) + np.exp(-lu*dist)*(-2+ly*dist-2*dist*lu)*(ly-lu)**(-2)
+            +np.exp(-dist*lu)*(ly-2*lu+ly*lu*dist-dist*lu**2)*2*(ly-lu)**(-3)
+            +np.exp(-dist*ly)*2*(ly-lu)**(-2)
+            +np.exp(-dist*ly)*2*(2*lu-ly)*(ly-lu)**(-3)
+            )
+
+        dk3theta1 = lambda dist: np.exp(-dist*lu)*(lu+ly)**(-2)*((2*lu+ly+dist*lu**2+lu*ly*dist)*(-dist-2/(lu+ly))+2+2*lu*dist+ly*dist)
+
+        dk1theta2 = lambda dist: np.exp(-ly*dist) * ((lu+ly)**(-2)) * (  (-dist)*(2*lu+ly)  +  1  +  (-2)*(2*lu+ly)/(lu+ly)  )
+
+        dk2theta2 =lambda dist:  1*(
+            np.exp(-dist*lu)*(ly-lu)**(-2) * ( 1+lu*dist+(-2)*(ly-2*lu+lu*ly*dist-dist*lu**2)*(ly-lu)**(-1) )
+            +np.exp(-dist*ly)*(ly-lu)**(-2) * ( (-dist)*(2*lu-ly) -1+(2*lu-ly)*(-2)*(ly-lu)**(-1) )
+            )
+
+        dk3theta2 = lambda dist: np.exp(-dist*lu) * (-3*lu-ly-dist*lu**2-lu*ly*dist)/(lu+ly)**3
+
+        # kyy kernel
+
+        k1 = lambda dist: np.exp(-ly*dist)*(2*lu+ly)/(lu+ly)**2
+        k2 = lambda dist: (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
+        k3 = lambda dist: np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
+
+        # cross covariance function
+        kyu3 = lambda dist:np.exp(-lu*dist)/(lu+ly)*(1+lu*(dist+1/(lu+ly)))
+
+        k1cros = lambda dist:np.exp(ly*dist)/(lu-ly) * ( 1- np.exp( (lu-ly)*dist) + lu* ( dist*np.exp( (lu-ly)*dist ) + (1- np.exp( (lu-ly)*dist ) ) /(lu-ly)   )    )
+
+        k2cros = lambda dist:np.exp(ly*dist)*( 1/(lu+ly) + lu/(lu+ly)**2 )
+        # cross covariance kuy
+        # (unlike in K, these are not pre-multiplied by Vu*Vyu; the scale
+        # factors are applied explicitly in the loop below)
+        kuyp = lambda dist:(kyu3(dist))       #t>0 kuy
+        kuyn = lambda dist:(k1cros(dist)+k2cros(dist))      #t<0 kuy
+        # cross covariance kyu
+        kyup = lambda dist:(k1cros(-dist)+k2cros(-dist))    #t>0 kyu
+        kyun = lambda dist:(kyu3(-dist))       #t<0 kyu
+
+        # dk dtheta for UY
+
+        dkyu3dtheta2 = lambda dist: np.exp(-lu*dist) * ( (-1)*(lu+ly)**(-2)*(1+lu*dist+lu*(lu+ly)**(-1)) + (lu+ly)**(-1)*(-lu)*(lu+ly)**(-2) )
+        dkyu3dtheta1 = lambda dist: np.exp(-lu*dist)*(lu+ly)**(-1)* ( (-dist)*(1+dist*lu+lu*(lu+ly)**(-1)) -\
+         (lu+ly)**(-1)*(1+dist*lu+lu*(lu+ly)**(-1)) +dist+(lu+ly)**(-1)-lu*(lu+ly)**(-2) )
+
+        dkcros2dtheta1 = lambda dist: np.exp(ly*dist)* ( -(ly+lu)**(-2) + (ly+lu)**(-2) + (-2)*lu*(lu+ly)**(-3)  )
+        dkcros2dtheta2 = lambda dist: np.exp(ly*dist)*dist* ( (ly+lu)**(-1) + lu*(lu+ly)**(-2) ) + \
+                                      np.exp(ly*dist)*( -(lu+ly)**(-2) + lu*(-2)*(lu+ly)**(-3)  )
+
+        dkcros1dtheta1 = lambda dist: np.exp(ly*dist)*(     -(lu-ly)**(-2)*(  1-np.exp((lu-ly)*dist) + lu*dist*np.exp((lu-ly)*dist)+ \
+          lu*(1-np.exp((lu-ly)*dist))/(lu-ly)  )  +  (lu-ly)**(-1)*(  -np.exp( (lu-ly)*dist )*dist + dist*np.exp( (lu-ly)*dist)+\
+          lu*dist**2*np.exp((lu-ly)*dist)+(1-np.exp((lu-ly)*dist))/(lu-ly) - lu*np.exp((lu-ly)*dist)*dist/(lu-ly) -\
+          lu*(1-np.exp((lu-ly)*dist))/(lu-ly)**2  )   )
+
+        dkcros1dtheta2 = lambda t: np.exp(ly*t)*t/(lu-ly)*( 1-np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t)+\
+            lu*(1-np.exp((lu-ly)*t))/(lu-ly)  )+\
+            np.exp(ly*t)/(lu-ly)**2* ( 1-np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t) + lu*( 1-np.exp((lu-ly)*t) )/(lu-ly)  )+\
+            np.exp(ly*t)/(lu-ly)*( np.exp((lu-ly)*t)*t -lu*t*t*np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t)/(lu-ly)+\
+            lu*( 1-np.exp((lu-ly)*t) )/(lu-ly)**2 )
+
+        dkuypdtheta1 = lambda dist:(dkyu3dtheta1(dist))       #t>0 kuy
+        dkuyndtheta1 = lambda dist:(dkcros1dtheta1(dist)+dkcros2dtheta1(dist))      #t<0 kuy
+        # cross covariance kyu
+        dkyupdtheta1 = lambda dist:(dkcros1dtheta1(-dist)+dkcros2dtheta1(-dist))    #t>0 kyu
+        dkyundtheta1 = lambda dist:(dkyu3dtheta1(-dist))       #t<0 kyu
+
+        dkuypdtheta2 = lambda dist:(dkyu3dtheta2(dist))       #t>0 kuy
+        dkuyndtheta2 = lambda dist:(dkcros1dtheta2(dist)+dkcros2dtheta2(dist))      #t<0 kuy
+        # cross covariance kyu
+        dkyupdtheta2 = lambda dist:(dkcros1dtheta2(-dist)+dkcros2dtheta2(-dist))    #t>0 kyu
+        dkyundtheta2 = lambda dist:(dkyu3dtheta2(-dist))       #t<0 kyu
+
+
+        # Fill the derivative matrices block by block, mirroring the block
+        # layout used in K (i, j are the row/column output-index groups).
+        for i, s1 in enumerate(slices):
+            for j, s2 in enumerate(slices2):
+                for ss1 in s1:
+                    for ss2 in s2:
+                        if i==0 and j==0:
+                            # UU block: only this block depends on ubias
+                            dktheta1[ss1,ss2] = Vu*UUdtheta1(np.abs(rdist[ss1,ss2]))
+                            dktheta2[ss1,ss2] = 0
+                            dkUdvar[ss1,ss2] = UUdvar(np.abs(rdist[ss1,ss2]))
+                            dkYdvar[ss1,ss2] = 0
+                            dkdubias[ss1,ss2] = 1
+                        elif i==0 and j==1:
+                            # UY block: branch on the sign of the distance
+                            dktheta1[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , Vu*Vyu*dkuypdtheta1(rdist[ss1,ss2]),Vu*Vyu*dkuyndtheta1(rdist[ss1,ss2]) )
+                            dkUdvar[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , Vyu*kuyp(rdist[ss1,ss2]), Vyu* kuyn(rdist[ss1,ss2])  )
+                            dktheta2[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , Vu*Vyu*dkuypdtheta2(rdist[ss1,ss2])+Vu*dVdly*kuyp(rdist[ss1,ss2]),Vu*Vyu*dkuyndtheta2(rdist[ss1,ss2])+Vu*dVdly*kuyn(rdist[ss1,ss2]) )
+                            dkYdvar[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , Vu*dVdVy*kuyp(rdist[ss1,ss2]), Vu*dVdVy* kuyn(rdist[ss1,ss2])  )
+                            dkdubias[ss1,ss2] = 0
+                        elif i==1 and j==1:
+                            # YY block
+                            dktheta1[ss1,ss2] = self.variance_U*self.variance_Y*(dk1theta1(np.abs(rdist[ss1,ss2]))+dk2theta1(np.abs(rdist[ss1,ss2]))+dk3theta1(np.abs(rdist[ss1,ss2])))
+                            dktheta2[ss1,ss2] = self.variance_U*self.variance_Y*(dk1theta2(np.abs(rdist[ss1,ss2])) + dk2theta2(np.abs(rdist[ss1,ss2])) +dk3theta2(np.abs(rdist[ss1,ss2])))
+                            dkUdvar[ss1,ss2] = self.variance_Y*(k1(np.abs(rdist[ss1,ss2]))+k2(np.abs(rdist[ss1,ss2]))+k3(np.abs(rdist[ss1,ss2])) )
+                            dkYdvar[ss1,ss2] = self.variance_U*(k1(np.abs(rdist[ss1,ss2]))+k2(np.abs(rdist[ss1,ss2]))+k3(np.abs(rdist[ss1,ss2])) )
+                            dkdubias[ss1,ss2] = 0
+                        else:
+                            # YU block (and any other group combination)
+                            dktheta1[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , Vu*Vyu*dkyupdtheta1(rdist[ss1,ss2]),Vu*Vyu*dkyundtheta1(rdist[ss1,ss2])  )
+                            dkUdvar[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , Vyu*kyup(rdist[ss1,ss2]),Vyu*kyun(rdist[ss1,ss2]))
+                            dktheta2[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , Vu*Vyu*dkyupdtheta2(rdist[ss1,ss2])+Vu*dVdly*kyup(rdist[ss1,ss2]),Vu*Vyu*dkyundtheta2(rdist[ss1,ss2])+Vu*dVdly*kyun(rdist[ss1,ss2])  )
+                            dkYdvar[ss1,ss2] = np.where(  rdist[ss1,ss2]>0 , Vu*dVdVy*kyup(rdist[ss1,ss2]), Vu*dVdVy*kyun(rdist[ss1,ss2]))
+                            dkdubias[ss1,ss2] = 0
+
+        self.variance_U.gradient = np.sum(dkUdvar * dL_dK)     # Vu
+
+        self.variance_Y.gradient = np.sum(dkYdvar * dL_dK)     # Vy
+
+        # chain rule: d lu / d lengthscale_U = -sqrt(3) * lengthscale_U**(-2)
+        self.lengthscale_U.gradient = np.sum(dktheta1*(-np.sqrt(3)*self.lengthscale_U**(-2))* dL_dK)     #lu
+
+        # chain rule: d ly / d lengthscale_Y = -lengthscale_Y**(-2)
+        self.lengthscale_Y.gradient = np.sum(dktheta2*(-self.lengthscale_Y**(-2)) * dL_dK)              #ly
+
+        self.ubias.gradient = np.sum(dkdubias * dL_dK)
+
--- a/GPy/kern/_src/ODE_st.py
+++ b/GPy/kern/_src/ODE_st.py
@ -0,0 +1,267 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+from kern import Kern
+from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp
+import numpy as np
+from independent_outputs import index_to_slices
+
+
+class ODE_st(Kern):
+    """
+    Two-output kernel resulting from the space-time model
+
+        -a d^2y/dx^2 + b dy/dt + c*y = U
+
+    where Y has a product of squared-exponential (RBF) kernels in time and space.
+
+    Input layout: column 0 is time, column 1 is space and the last column is
+    the output index. Index 0 selects the Kyy block (plain RBF) and index 1 the
+    Kuu block (the differential operator applied to both arguments of Kyy).
+
+    :param input_dim: the number of input dimensions, has to be equal to three
+    :type input_dim: int
+    :param a: coefficient of the second spatial derivative (enters the model as -a)
+    :type a: float
+    :param b: coefficient of the time derivative
+    :type b: float
+    :param c: coefficient of y
+    :type c: float
+    :param variance_Yx: spatial variance of Y
+    :type variance_Yx: float
+    :param variance_Yt: temporal variance of Y
+    :type variance_Yt: float
+    :param lengthscale_Yx: spatial lengthscale of Y (used as 1/(2*lengthscale_Yx))
+    :type lengthscale_Yx: float
+    :param lengthscale_Yt: temporal lengthscale of Y (used as 1/(2*lengthscale_Yt))
+    :type lengthscale_Yt: float
+    :rtype: kernel object
+
+    """
+
+    def __init__(self, input_dim, a=1.,b=1., c=1.,variance_Yx=3.,variance_Yt=1.5, lengthscale_Yx=1.5, lengthscale_Yt=1.5, active_dims=None, name='ode_st'):
+        assert input_dim ==3, "only defined for 3 input dims"
+        super(ODE_st, self).__init__(input_dim, active_dims, name)
+
+        self.variance_Yt = Param('variance_Yt', variance_Yt, Logexp())
+        self.variance_Yx = Param('variance_Yx', variance_Yx, Logexp())
+        self.lengthscale_Yt = Param('lengthscale_Yt', lengthscale_Yt, Logexp())
+        self.lengthscale_Yx = Param('lengthscale_Yx', lengthscale_Yx, Logexp())
+
+        self.a = Param('a', a, Logexp())
+        self.b = Param('b', b, Logexp())
+        self.c = Param('c', c, Logexp())
+
+        # link_parameters is the spelling the other kernels in this package use.
+        self.link_parameters(self.a, self.b, self.c, self.variance_Yt, self.variance_Yx, self.lengthscale_Yt, self.lengthscale_Yx)
+
+    def K(self, X, X2=None):
+        """
+        Compute the covariance matrix between X and X2.
+
+        :param X: inputs; last column is the output index
+        :param X2: optional second set of inputs with the same layout (defaults to X)
+        :returns: array of shape (X.shape[0], X2.shape[0])
+        """
+        X, slices = X[:, :-1], index_to_slices(X[:, -1])
+        if X2 is None:
+            X2, slices2 = X, slices
+        else:
+            X2, slices2 = X2[:, :-1], index_to_slices(X2[:, -1])
+        cov = np.zeros((X.shape[0], X2.shape[0]))
+
+        # signed time difference, and squared time / space differences
+        ttdist = X[:, 0][:, None] - X2[:, 0][None, :]
+        tdist = ttdist**2
+        xdist = (X[:, 1][:, None] - X2[:, 1][None, :])**2
+
+        vyt = self.variance_Yt
+        vyx = self.variance_Yx
+
+        lyt = 1./(2*self.lengthscale_Yt)
+        lyx = 1./(2*self.lengthscale_Yx)
+
+        a = self.a  # -a is used in the model (negative diffusion)
+        b = self.b
+        c = self.c
+
+        for i, s1 in enumerate(slices):
+            for j, s2 in enumerate(slices2):
+                for ss1 in s1:
+                    for ss2 in s2:
+                        dt = ttdist[ss1, ss2]
+                        dt2 = tdist[ss1, ss2]
+                        dx2 = xdist[ss1, ss2]
+
+                        kyy = vyt*vyx*np.exp(-lyt*dt2 - lyx*dx2)
+                        k1 = 2*lyt - 4*lyt**2*dt2
+                        k2 = 4*lyx**2*dx2 - 2*lyx
+                        k3 = 3*4*lyx**2 - 6*8*dx2*lyx**3 + 16*dx2**2*lyx**4
+                        k4 = 2*lyt*dt
+
+                        if i == 0 and j == 0:
+                            cov[ss1, ss2] = kyy
+                        elif i == 0 and j == 1:
+                            cov[ss1, ss2] = (-a*k2 + b*k4 + c)*kyy
+                        elif i == 1 and j == 1:
+                            cov[ss1, ss2] = (b**2*k1 - 2*a*c*k2 + a**2*k3 + c**2)*kyy
+                        else:
+                            # every remaining index pair uses the mirrored cross term
+                            cov[ss1, ss2] = (-a*k2 - b*k4 + c)*kyy
+
+        return cov
+
+    def Kdiag(self, X):
+        """
+        Compute the diagonal of the covariance matrix associated to X.
+
+        :raises ValueError: if the index column contains more than two distinct output indices
+        """
+        vyt = self.variance_Yt
+        vyx = self.variance_Yx
+
+        lyt = 1./(2*self.lengthscale_Yt)
+        lyx = 1./(2*self.lengthscale_Yx)
+
+        a = self.a
+        b = self.b
+        c = self.c
+
+        # zero-distance values of the operator terms, already scaled by the variances
+        k1 = (2*lyt)*vyt*vyx        # dk^2/dtdt'
+        k2 = (-2*lyx)*vyt*vyx       # dk^2/dx^2
+        k3 = (4*3*lyx**2)*vyt*vyx   # dk^4/dx^2dx'^2
+
+        Kdiag = np.zeros(X.shape[0])
+        slices = index_to_slices(X[:, -1])
+
+        for i, ss1 in enumerate(slices):
+            for s1 in ss1:
+                if i == 0:
+                    Kdiag[s1] += vyt*vyx
+                elif i == 1:
+                    Kdiag[s1] += b**2*k1 - 2*a*c*k2 + a**2*k3 + c**2*vyt*vyx
+                else:
+                    raise ValueError("invalid input/output index")
+
+        return Kdiag
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        """
+        Set the gradients of all parameters given dL_dK.
+
+        :param dL_dK: derivative of the objective with respect to K(X, X2)
+        :param X: inputs; last column is the output index
+        :param X2: optional second set of inputs (defaults to X)
+        """
+        X, slices = X[:, :-1], index_to_slices(X[:, -1])
+        if X2 is None:
+            X2, slices2 = X, slices
+        else:
+            X2, slices2 = X2[:, :-1], index_to_slices(X2[:, -1])
+
+        vyt = self.variance_Yt
+        vyx = self.variance_Yx
+
+        lyt = 1./(2*self.lengthscale_Yt)
+        lyx = 1./(2*self.lengthscale_Yx)
+
+        a = self.a
+        b = self.b
+        c = self.c
+
+        ttdist = X[:, 0][:, None] - X2[:, 0][None, :]
+        tdist = ttdist**2
+        xdist = (X[:, 1][:, None] - X2[:, 1][None, :])**2
+
+        # Buffers must match K(X, X2), which is rectangular when X2 has a
+        # different number of rows than X.
+        shape = tdist.shape
+        dka = np.zeros(shape)
+        dkb = np.zeros(shape)
+        dkc = np.zeros(shape)
+        dkYdvart = np.zeros(shape)
+        dkYdvarx = np.zeros(shape)
+        dkYdlent = np.zeros(shape)
+        dkYdlenx = np.zeros(shape)
+
+        for i, s1 in enumerate(slices):
+            for j, s2 in enumerate(slices2):
+                for ss1 in s1:
+                    for ss2 in s2:
+                        dt = ttdist[ss1, ss2]
+                        dt2 = tdist[ss1, ss2]
+                        dx2 = xdist[ss1, ss2]
+
+                        kyy = np.exp(-lyt*dt2 - lyx*dx2)
+                        dkyydlyt = kyy*(-dt2)
+                        dkyydlyx = kyy*(-dx2)
+
+                        k1 = 2*lyt - 4*lyt**2*dt2
+                        k2 = 4*lyx**2*dx2 - 2*lyx
+                        k3 = 3*4*lyx**2 - 6*8*dx2*lyx**3 + 16*dx2**2*lyx**4
+                        k4 = 2*lyt*dt
+
+                        dk1dlyt = 2. - 4*2.*lyt*dt2
+                        dk2dlyx = 4.*2.*lyx*dx2 - 2.
+                        dk3dlyx = 6.*4.*lyx - 18.*8*dx2*lyx**2 + 4*16*dx2**2*lyx**3
+                        dk4dlyt = 2*dt
+
+                        if i == 0 and j == 0:
+                            # Kyy does not depend on a, b, c: those buffers stay zero
+                            dkYdvart[ss1, ss2] = vyx*kyy
+                            dkYdvarx[ss1, ss2] = vyt*kyy
+                            dkYdlenx[ss1, ss2] = vyt*vyx*dkyydlyx
+                            dkYdlent[ss1, ss2] = vyt*vyx*dkyydlyt
+                        elif i == 0 and j == 1:
+                            op = -a*k2 + b*k4 + c
+                            dka[ss1, ss2] = -k2*vyt*vyx*kyy
+                            dkb[ss1, ss2] = k4*vyt*vyx*kyy
+                            dkc[ss1, ss2] = vyt*vyx*kyy
+                            dkYdvart[ss1, ss2] = op*vyx*kyy
+                            dkYdvarx[ss1, ss2] = op*vyt*kyy
+                            dkYdlent[ss1, ss2] = vyt*vyx*dkyydlyt*op + vyt*vyx*kyy*b*dk4dlyt
+                            dkYdlenx[ss1, ss2] = vyt*vyx*dkyydlyx*op + vyt*vyx*kyy*(-a*dk2dlyx)
+                        elif i == 1 and j == 1:
+                            op = b**2*k1 - 2*a*c*k2 + a**2*k3 + c**2
+                            dka[ss1, ss2] = (2*a*k3 - 2*c*k2)*vyt*vyx*kyy
+                            dkb[ss1, ss2] = 2*b*k1*vyt*vyx*kyy
+                            dkc[ss1, ss2] = (-2*a*k2 + 2*c)*vyt*vyx*kyy
+                            dkYdvart[ss1, ss2] = op*vyx*kyy
+                            dkYdvarx[ss1, ss2] = op*vyt*kyy
+                            dkYdlent[ss1, ss2] = vyt*vyx*dkyydlyt*op + vyx*vyt*kyy*b**2*dk1dlyt
+                            dkYdlenx[ss1, ss2] = vyt*vyx*dkyydlyx*op + vyx*vyt*kyy*(-2*a*c*dk2dlyx + a**2*dk3dlyx)
+                        else:
+                            op = -a*k2 - b*k4 + c
+                            dka[ss1, ss2] = -k2*vyt*vyx*kyy
+                            dkb[ss1, ss2] = -k4*vyt*vyx*kyy
+                            dkc[ss1, ss2] = vyt*vyx*kyy
+                            dkYdvart[ss1, ss2] = op*vyx*kyy
+                            dkYdvarx[ss1, ss2] = op*vyt*kyy
+                            dkYdlent[ss1, ss2] = vyt*vyx*dkyydlyt*op + vyt*vyx*kyy*(-1)*b*dk4dlyt
+                            dkYdlenx[ss1, ss2] = vyt*vyx*dkyydlyx*op + vyt*vyx*kyy*(-a*dk2dlyx)
+
+        self.a.gradient = np.sum(dka * dL_dK)
+        self.b.gradient = np.sum(dkb * dL_dK)
+        self.c.gradient = np.sum(dkc * dL_dK)
+
+        self.variance_Yt.gradient = np.sum(dkYdvart * dL_dK)
+        self.variance_Yx.gradient = np.sum(dkYdvarx * dL_dK)
+
+        # chain rule: lyt = 1/(2*lengthscale)  =>  dlyt/dlengthscale = -0.5*lengthscale**-2
+        self.lengthscale_Yt.gradient = np.sum(dkYdlent*(-0.5*self.lengthscale_Yt**(-2)) * dL_dK)
+        self.lengthscale_Yx.gradient = np.sum(dkYdlenx*(-0.5*self.lengthscale_Yx**(-2)) * dL_dK)
+
--- a/GPy/kern/_src/ODE_t.py
+++ b/GPy/kern/_src/ODE_t.py
@ -0,0 +1,165 @@
+from kern import Kern
+from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp
+import numpy as np
+from independent_outputs import index_to_slices
+
+
+class ODE_t(Kern):
+    """
+    Two-output kernel over a single time input plus an output-index column.
+
+    Block (0, 0) is a squared-exponential (RBF) kernel in time scaled by
+    ``variance_Yt``; the cross blocks and block (1, 1) multiply it by factors
+    in the time difference, and ``ubias`` is added to block (1, 1) only.
+    Presumably the temporal counterpart of ODE_st (dy/dt + y = U) - TODO confirm.
+
+    Input layout: column 0 is time, the last column is the output index.
+
+    :param input_dim: the number of input dimensions, has to be equal to two
+    :type input_dim: int
+    :param a: linked as a parameter but not used by K, Kdiag or the gradients
+    :type a: float
+    :param c: linked as a parameter but not used by K, Kdiag or the gradients
+    :type c: float
+    :param variance_Yt: variance of the RBF kernel
+    :type variance_Yt: float
+    :param lengthscale_Yt: lengthscale of the RBF kernel (used as 1/(2*lengthscale_Yt))
+    :type lengthscale_Yt: float
+    :param ubias: constant added to block (1, 1)
+    :type ubias: float
+    """
+
+    # NOTE(review): the default name 'ode_st' looks copied from ODE_st; kept
+    # unchanged so existing parameter paths keep working - confirm intent.
+    def __init__(self, input_dim, a=1., c=1.,variance_Yt=3., lengthscale_Yt=1.5,ubias =1., active_dims=None, name='ode_st'):
+        assert input_dim ==2, "only defined for 2 input dims"
+        super(ODE_t, self).__init__(input_dim, active_dims, name)
+
+        self.variance_Yt = Param('variance_Yt', variance_Yt, Logexp())
+        self.lengthscale_Yt = Param('lengthscale_Yt', lengthscale_Yt, Logexp())
+
+        self.a = Param('a', a, Logexp())
+        self.c = Param('c', c, Logexp())
+        self.ubias = Param('ubias', ubias, Logexp())
+        # link_parameters is the spelling the other kernels in this package use.
+        self.link_parameters(self.a, self.c, self.variance_Yt, self.lengthscale_Yt, self.ubias)
+
+    def K(self, X, X2=None):
+        """
+        Compute the covariance matrix between X and X2.
+
+        :param X: inputs; last column is the output index
+        :param X2: optional second set of inputs with the same layout (defaults to X)
+        :returns: array of shape (X.shape[0], X2.shape[0])
+        """
+        X, slices = X[:, :-1], index_to_slices(X[:, -1])
+        if X2 is None:
+            X2, slices2 = X, slices
+        else:
+            X2, slices2 = X2[:, :-1], index_to_slices(X2[:, -1])
+        cov = np.zeros((X.shape[0], X2.shape[0]))
+
+        # signed and squared time differences
+        ttdist = X[:, 0][:, None] - X2[:, 0][None, :]
+        tdist = ttdist**2
+
+        vyt = self.variance_Yt
+        lyt = 1./(2*self.lengthscale_Yt)
+
+        for i, s1 in enumerate(slices):
+            for j, s2 in enumerate(slices2):
+                for ss1 in s1:
+                    for ss2 in s2:
+                        dt = ttdist[ss1, ss2]
+                        dt2 = tdist[ss1, ss2]
+
+                        kyy = vyt*np.exp(-lyt*dt2)
+                        k1 = 2*lyt - 4*lyt**2*dt2
+                        k4 = 2*lyt*dt
+
+                        if i == 0 and j == 0:
+                            cov[ss1, ss2] = kyy
+                        elif i == 0 and j == 1:
+                            cov[ss1, ss2] = (k4 + 1)*kyy
+                        elif i == 1 and j == 1:
+                            cov[ss1, ss2] = (k1 + 1.)*kyy + self.ubias
+                        else:
+                            # every remaining index pair uses the mirrored cross term
+                            cov[ss1, ss2] = (-k4 + 1)*kyy
+        return cov
+
+    def Kdiag(self, X):
+        """
+        Compute the diagonal of the covariance matrix associated to X.
+
+        :raises ValueError: if the index column contains more than two distinct output indices
+        """
+        vyt = self.variance_Yt
+        lyt = 1./(2*self.lengthscale_Yt)
+
+        # zero-distance value of the k1 term, already scaled by the variance
+        k1 = (2*lyt)*vyt
+
+        Kdiag = np.zeros(X.shape[0])
+        slices = index_to_slices(X[:, -1])
+
+        for i, ss1 in enumerate(slices):
+            for s1 in ss1:
+                if i == 0:
+                    Kdiag[s1] += vyt
+                elif i == 1:
+                    Kdiag[s1] += k1 + vyt + self.ubias
+                else:
+                    raise ValueError("invalid input/output index")
+
+        return Kdiag
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        """
+        Set the gradients of variance_Yt, lengthscale_Yt and ubias given dL_dK.
+
+        The gradients of ``a`` and ``c`` are not written here; K does not use them.
+
+        :param dL_dK: derivative of the objective with respect to K(X, X2)
+        :param X: inputs; last column is the output index
+        :param X2: optional second set of inputs (defaults to X)
+        """
+        X, slices = X[:, :-1], index_to_slices(X[:, -1])
+        if X2 is None:
+            X2, slices2 = X, slices
+        else:
+            X2, slices2 = X2[:, :-1], index_to_slices(X2[:, -1])
+
+        vyt = self.variance_Yt
+        lyt = 1./(2*self.lengthscale_Yt)
+
+        ttdist = X[:, 0][:, None] - X2[:, 0][None, :]
+        tdist = ttdist**2
+
+        # Buffers must match K(X, X2), which is rectangular when X2 has a
+        # different number of rows than X.
+        shape = tdist.shape
+        dkYdvart = np.zeros(shape)
+        dkYdlent = np.zeros(shape)
+        dkdubias = np.zeros(shape)
+
+        for i, s1 in enumerate(slices):
+            for j, s2 in enumerate(slices2):
+                for ss1 in s1:
+                    for ss2 in s2:
+                        dt = ttdist[ss1, ss2]
+                        dt2 = tdist[ss1, ss2]
+
+                        kyy = np.exp(-lyt*dt2)
+                        dkyydlyt = kyy*(-dt2)
+
+                        k1 = 2*lyt - 4*lyt**2*dt2
+                        k4 = 2*lyt*dt
+                        dk1dlyt = 2. - 4*2.*lyt*dt2
+                        dk4dlyt = 2*dt
+
+                        if i == 0 and j == 0:
+                            dkYdvart[ss1, ss2] = kyy
+                            dkYdlent[ss1, ss2] = vyt*dkyydlyt
+                        elif i == 0 and j == 1:
+                            dkYdvart[ss1, ss2] = (k4 + 1)*kyy
+                            dkYdlent[ss1, ss2] = vyt*dkyydlyt*(k4 + 1.) + vyt*kyy*dk4dlyt
+                        elif i == 1 and j == 1:
+                            dkYdvart[ss1, ss2] = (k1 + 1.)*kyy
+                            dkYdlent[ss1, ss2] = vyt*dkyydlyt*(k1 + 1.) + vyt*kyy*dk1dlyt
+                            # ubias only enters block (1, 1)
+                            dkdubias[ss1, ss2] = 1
+                        else:
+                            dkYdvart[ss1, ss2] = (-k4 + 1)*kyy
+                            dkYdlent[ss1, ss2] = vyt*dkyydlyt*(-k4 + 1.) + vyt*kyy*(-dk4dlyt)
+
+        self.variance_Yt.gradient = np.sum(dkYdvart * dL_dK)
+
+        # chain rule: lyt = 1/(2*lengthscale)  =>  dlyt/dlengthscale = -0.5*lengthscale**-2
+        self.lengthscale_Yt.gradient = np.sum(dkYdlent*(-0.5*self.lengthscale_Yt**(-2)) * dL_dK)
+
+        self.ubias.gradient = np.sum(dkdubias * dL_dK)
--- a/GPy/kern/_src/init.py
+++ b/GPy/kern/_src/init.py
--- a/GPy/kern/_src/add.py
+++ b/GPy/kern/_src/add.py
@ -0,0 +1,188 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+import itertools
+from ...util.caching import Cache_this
+from kern import CombinationKernel
+
+class Add(CombinationKernel):
+    """
+    Add given list of kernels together.
+    Propagates gradients through to the parts.
+
+    This kernel will take over the active dims of its subkernels passed in.
+    """
+    def __init__(self, subkerns, name='add'):
+        """
+        :param subkerns: kernels to sum; nested Add kernels are flattened into their parts
+        :param name: name of the resulting kernel
+        """
+        # Build a fresh list instead of deleting/inserting into `subkerns`
+        # while walking it by index: once one nested Add expands into several
+        # parts, the remaining indices no longer line up, so later entries
+        # were deleted or kept by mistake.
+        flat = []
+        for kern in subkerns:
+            if isinstance(kern, Add):
+                parts = list(kern.parts)
+                for part in reversed(parts):
+                    kern.unlink_parameter(part)
+                flat.extend(parts)
+            else:
+                flat.append(kern)
+
+        super(Add, self).__init__(flat, name)
+
+    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    def K(self, X, X2=None, which_parts=None):
+        """
+        Add all kernels together.
+        If a list of parts (of this kernel!) `which_parts` is given, only
+        the parts of the list are taken to compute the covariance.
+        """
+        if which_parts is None:
+            which_parts = self.parts
+        elif not isinstance(which_parts, (list, tuple)):
+            # if only one part is given
+            which_parts = [which_parts]
+        return reduce(np.add, (p.K(X, X2) for p in which_parts))
+
+    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    def Kdiag(self, X, which_parts=None):
+        """Sum the diagonals of the selected parts (all parts by default)."""
+        if which_parts is None:
+            which_parts = self.parts
+        elif not isinstance(which_parts, (list, tuple)):
+            # if only one part is given
+            which_parts = [which_parts]
+        return reduce(np.add, (p.Kdiag(X) for p in which_parts))
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        """Forward dL_dK to every part that is not fixed."""
+        for p in self.parts:
+            if not p.is_fixed:
+                p.update_gradients_full(dL_dK, X, X2)
+
+    def update_gradients_diag(self, dL_dK, X):
+        """Forward the diagonal gradient to every part."""
+        for p in self.parts:
+            p.update_gradients_diag(dL_dK, X)
+
+    def gradients_X(self, dL_dK, X, X2=None):
+        """Compute the gradient of the objective function with respect to X.
+
+        :param dL_dK: An array of gradients of the objective function with respect to the covariance function.
+        :type dL_dK: np.ndarray (num_samples x num_inducing)
+        :param X: Observed data inputs
+        :type X: np.ndarray (num_samples x input_dim)
+        :param X2: Observed data inputs (optional, defaults to X)
+        :type X2: np.ndarray (num_inducing x input_dim)"""
+
+        target = np.zeros(X.shape)
+        for p in self.parts:
+            target += p.gradients_X(dL_dK, X, X2)
+        return target
+
+    def gradients_X_diag(self, dL_dKdiag, X):
+        """Sum the parts' gradients of the diagonal with respect to X."""
+        target = np.zeros(X.shape)
+        for p in self.parts:
+            target += p.gradients_X_diag(dL_dKdiag, X)
+        return target
+
+    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    def psi0(self, Z, variational_posterior):
+        """Sum of the parts' psi0 statistics."""
+        return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts))
+
+    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    def psi1(self, Z, variational_posterior):
+        """Sum of the parts' psi1 statistics."""
+        return reduce(np.add, (p.psi1(Z, variational_posterior) for p in self.parts))
+
+    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    def psi2(self, Z, variational_posterior):
+        """
+        Sum of the parts' psi2 statistics plus the cross terms between pairs of parts.
+
+        :raises NotImplementedError: for pairs other than White with anything,
+            Bias with RBF/Linear, or RBF/Linear with RBF/Linear
+        """
+        psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts))
+        # compute the "cross" terms
+        from static import White, Bias
+        from rbf import RBF
+        from linear import Linear
+
+        for p1, p2 in itertools.combinations(self.parts, 2):
+            # white doesn't combine with anything
+            if isinstance(p1, White) or isinstance(p2, White):
+                pass
+            # bias X (rbf | linear)
+            elif isinstance(p1, Bias) and isinstance(p2, (RBF, Linear)):
+                tmp = p2.psi1(Z, variational_posterior).sum(axis=0)
+                psi2 += p1.variance * (tmp[:,None]+tmp[None,:])
+            elif isinstance(p2, Bias) and isinstance(p1, (RBF, Linear)):
+                tmp = p1.psi1(Z, variational_posterior).sum(axis=0)
+                psi2 += p2.variance * (tmp[:,None]+tmp[None,:])
+            elif isinstance(p2, (RBF, Linear)) and isinstance(p1, (RBF, Linear)):
+                assert np.intersect1d(p1.active_dims, p2.active_dims).size == 0, "only non overlapping kernel dimensions allowed so far"
+                tmp1 = p1.psi1(Z, variational_posterior)
+                tmp2 = p2.psi1(Z, variational_posterior)
+                psi2 += np.einsum('nm,no->mo',tmp1,tmp2)+np.einsum('nm,no->mo',tmp2,tmp1)
+            else:
+                raise NotImplementedError("psi2 cannot be computed for this kernel")
+        return psi2
+
+    def _effective_dL_dpsi1(self, p1, parts, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        """
+        Return dL_dpsi1 for `p1` augmented by the psi2 cross terms with the other `parts`.
+
+        Extra terms appear because of the cross terms in psi2.
+        TODO: careful, not correct for overlapping active_dims.
+        """
+        from static import White, Bias
+        eff_dL_dpsi1 = dL_dpsi1.copy()
+        for p2 in parts:
+            if p2 is p1:
+                continue
+            if isinstance(p2, White):
+                continue
+            elif isinstance(p2, Bias):
+                eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2.
+            else:
+                eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2.
+        return eff_dL_dpsi1
+
+    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        """Update each part's parameter gradients from the psi-statistic gradients."""
+        for p1 in self.parts:
+            eff_dL_dpsi1 = self._effective_dL_dpsi1(p1, self.parts, dL_dpsi1, dL_dpsi2, Z, variational_posterior)
+            p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
+
+    def gradients_Z_expectations(self, dL_psi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        """Sum the parts' gradients with respect to Z; returns an array shaped like Z."""
+        target = np.zeros(Z.shape)
+        for p1 in self.parts:
+            eff_dL_dpsi1 = self._effective_dL_dpsi1(p1, self.parts, dL_dpsi1, dL_dpsi2, Z, variational_posterior)
+            target += p1.gradients_Z_expectations(dL_psi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
+        return target
+
+    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        """Sum the parts' gradients with respect to the variational posterior parameters."""
+        target_grads = [np.zeros(v.shape) for v in variational_posterior.parameters]
+        # NOTE(review): this iterates self.parameters while the sibling methods
+        # iterate self.parts; presumably the same sequence - confirm.
+        for p1 in self.parameters:
+            eff_dL_dpsi1 = self._effective_dL_dpsi1(p1, self.parameters, dL_dpsi1, dL_dpsi2, Z, variational_posterior)
+            grads = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
+            for i, grad in enumerate(grads):
+                # accumulate in place
+                np.add(target_grads[i], grad, target_grads[i])
+        return target_grads
+
+    def add(self, other):
+        """
+        Append `other` to this sum in place and return self.
+
+        If `other` is an Add, its parameters are unlinked from it and linked here.
+        """
+        if isinstance(other, Add):
+            other_params = other.parameters[:]
+            for p in other_params:
+                other.unlink_parameter(p)
+            self.link_parameters(*other_params)
+        else:
+            self.link_parameter(other)
+        self.input_dim, self.active_dims = self.get_input_dim_active_dims(self.parts)
+        return self
+
+    def input_sensitivity(self, summarize=True):
+        """
+        Return the summed input sensitivity of the parts, or, if `summarize`
+        is False, one row per part in an array of shape (len(parts), input_dim).
+        """
+        if summarize:
+            return reduce(np.add, [k.input_sensitivity(summarize) for k in self.parts])
+        else:
+            i_s = np.zeros((len(self.parts), self.input_dim))
+            for i, k in enumerate(self.parts):
+                i_s[i, Ellipsis] = k.input_sensitivity(summarize)
+            return i_s
--- a/GPy/kern/_src/brownian.py
+++ b/GPy/kern/_src/brownian.py
@ -0,0 +1,50 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from kern import Kern
+from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp
+import numpy as np
+
+class Brownian(Kern):
+    """
+    Brownian motion kernel, defined for a single input dimension.
+
+    Negative times are treated as a separate (backwards!) Brownian motion:
+    inputs of opposite sign have zero covariance.
+
+    :param input_dim: the number of input dimensions (must be 1)
+    :type input_dim: int
+    :param variance: scale of the covariance
+    :type variance: float
+    """
+    def __init__(self, input_dim=1, variance=1., active_dims=None, name='Brownian'):
+        assert input_dim==1, "Brownian motion in 1D only"
+        super(Brownian, self).__init__(input_dim, active_dims, name)
+
+        self.variance = Param('variance', variance, Logexp())
+        self.link_parameters(self.variance)
+
+    def _unit_covariance(self, X, X2):
+        """Covariance for unit variance: min(|x|, |x'|) where the signs agree, else 0."""
+        other = X if X2 is None else X2
+        same_side = np.sign(X) == np.sign(other.T)
+        smaller_magnitude = np.fmin(np.abs(X), np.abs(other.T))
+        return np.where(same_side, smaller_magnitude, 0.)
+
+    def K(self, X, X2=None):
+        """Covariance matrix between X and X2 (X2 defaults to X)."""
+        return self.variance*self._unit_covariance(X, X2)
+
+    def Kdiag(self, X):
+        """Diagonal of K(X, X): the variance times |x| for each input."""
+        magnitudes = np.abs(X.flatten())
+        return self.variance*magnitudes
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        """Set the variance gradient: the sum of dL_dK weighted by the unit-variance covariance."""
+        weighted = dL_dK * self._unit_covariance(X, X2)
+        self.variance.gradient = np.sum(weighted)
+
+    # NOTE: this class does not override update_gradients_diag or gradients_X;
+    # earlier implementations of both were left disabled in the source.
+
+
+
--- a/GPy/kern/_src/coregionalize.py
+++ b/GPy/kern/_src/coregionalize.py
@ -0,0 +1,174 @@
+# Copyright (c) 2012, James Hensman and Ricardo Andrade
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from kern import Kern
+import numpy as np
+from scipy import weave
+from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp
+from ...util.config import config # for assesing whether to use weave
+
+class Coregionalize(Kern):
+    r"""
+    Covariance function for intrinsic/linear coregionalization models
+
+    This covariance has the form:
+
+    .. math::
+
+       \mathbf{B} = \mathbf{W}\mathbf{W}^\top + \text{diag}(\kappa)
+
+    An intrinsic/linear coregionalization covariance function of the form:
+
+    .. math::
+
+       k_2(x, y)=\mathbf{B} k(x, y)
+
+    it is obtained as the tensor product between a covariance function
+    k(x, y) and B.
+
+    The inputs given to this kernel are cast to integers and used as
+    row/column indices into B.
+
+    :param output_dim: number of outputs to coregionalize
+    :type output_dim: int
+    :param rank: number of columns of the W matrix (only used to draw a random W when W is None)
+    :type rank: int
+    :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalization matrix B
+    :type W: numpy array of dimensionality (output_dim, rank)
+    :param kappa: a vector which allows the outputs to behave independently
+    :type kappa: numpy array of dimensionality  (output_dim, )
+
+    .. note: see coregionalization examples in GPy.examples.regression for some usage.
+    """
+    def __init__(self, input_dim, output_dim, rank=1, W=None, kappa=None, active_dims=None, name='coregion'):
+        super(Coregionalize, self).__init__(input_dim, active_dims, name=name)
+        self.output_dim = output_dim
+        self.rank = rank
+        if self.rank>output_dim:
+            print("Warning: Unusual choice of rank, it should normally be less than the output_dim.")
+        if W is None:
+            W = 0.5*np.random.randn(self.output_dim, self.rank)/np.sqrt(self.rank)
+        else:
+            assert W.shape==(self.output_dim, self.rank)
+        self.W = Param('W', W)
+        if kappa is None:
+            kappa = 0.5*np.ones(self.output_dim)
+        else:
+            assert kappa.shape==(self.output_dim, )
+        self.kappa = Param('kappa', kappa, Logexp())
+        self.link_parameters(self.W, self.kappa)
+
+    def parameters_changed(self):
+        """Rebuild the coregionalization matrix B from W and kappa."""
+        self.B = np.dot(self.W, self.W.T) + np.diag(self.kappa)
+
+    def K(self, X, X2=None):
+        """
+        Look up the entries of B addressed by the output indices in X (and X2).
+
+        Uses the weave implementation while the 'weave'/'working' config flag
+        is set; on any failure the flag is switched off and the numpy
+        implementation is used instead.
+        """
+        if config.getboolean('weave', 'working'):
+            try:
+                return self._K_weave(X, X2)
+            except:
+                # deliberately broad: best-effort fallback whatever the weave
+                # build raised
+                print("\n Weave compilation failed. Falling back to (slower) numpy implementation\n")
+                config.set('weave', 'working', 'False')
+                return self._K_numpy(X, X2)
+        else:
+            return self._K_numpy(X, X2)
+
+
+    def _K_numpy(self, X, X2=None):
+        """Compute the kernel with numpy fancy indexing into B."""
+        index = np.asarray(X, dtype=int)
+        if X2 is None:
+            return self.B[index,index.T]
+        else:
+            index2 = np.asarray(X2, dtype=int)
+            return self.B[index,index2.T]
+
+    def _K_weave(self, X, X2=None):
+        """compute the kernel function using scipy.weave"""
+        index = np.asarray(X, dtype=int)
+
+        if X2 is None:
+            target = np.empty((X.shape[0], X.shape[0]), dtype=np.float64)
+            # fills the diagonal and one triangle, mirroring into the other
+            code="""
+            for(int i=0;i<N; i++){
+              target[i+i*N] = B[index[i]+output_dim*index[i]];
+              for(int j=0; j<i; j++){
+                  target[j+i*N] = B[index[i]+output_dim*index[j]];
+                  target[i+j*N] = target[j+i*N];
+                }
+              }
+            """
+            N, B, output_dim = index.size, self.B, self.output_dim
+            weave.inline(code, ['target', 'index', 'N', 'B', 'output_dim'])
+        else:
+            index2 = np.asarray(X2, dtype=int)
+            target = np.empty((X.shape[0], X2.shape[0]), dtype=np.float64)
+            code="""
+            for(int i=0;i<num_inducing; i++){
+              for(int j=0; j<N; j++){
+                  target[i+j*num_inducing] = B[output_dim*index[j]+index2[i]];
+                }
+              }
+            """
+            N, num_inducing, B, output_dim = index.size, index2.size, self.B, self.output_dim
+            weave.inline(code, ['target', 'index', 'index2', 'N', 'num_inducing', 'B', 'output_dim'])
+        return target
+
+
+    def Kdiag(self, X):
+        """Diagonal of the kernel: the diagonal entry of B for each index in X."""
+        return np.diag(self.B)[np.asarray(X, dtype=int).flatten()]
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        """
+        Set the gradients of W and kappa given dL_dK.
+
+        dL_dK is first reduced to an (output_dim, output_dim) matrix by
+        summing the entries that address the same element of B; weave is
+        tried first (if enabled) with a numpy fallback.
+        """
+        index = np.asarray(X, dtype=int)
+        if X2 is None:
+            index2 = index
+        else:
+            index2 = np.asarray(X2, dtype=int)
+
+        #attempt to use weave for a nasty double indexing loop: fall back to numpy
+        if config.getboolean('weave', 'working'):
+            try:
+                dL_dK_small = self._gradient_reduce_weave(dL_dK, index, index2)
+            except:
+                # deliberately broad: best-effort fallback whatever the weave
+                # build raised
+                print("\n Weave compilation failed. Falling back to (slower) numpy implementation\n")
+                config.set('weave', 'working', 'False')
+                dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)
+        else:
+            dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)
+
+        # copy: np.diag may hand back a view of dL_dK_small, which must not
+        # pick up the symmetrisation below
+        dkappa = np.diag(dL_dK_small).copy()
+        # out-of-place so the sum never reads entries it has already overwritten
+        dL_dK_small = dL_dK_small + dL_dK_small.T
+        dW = (self.W[:, None, :]*dL_dK_small[:, :, None]).sum(0)
+
+        self.W.gradient = dW
+        self.kappa.gradient = dkappa
+
+    def _gradient_reduce_weave(self, dL_dK, index, index2):
+        """Sum dL_dK into an array shaped like B using scipy.weave."""
+        dL_dK_small = np.zeros_like(self.B)
+        code="""
+        for(int i=0; i<num_inducing; i++){
+          for(int j=0; j<N; j++){
+            dL_dK_small[index[j] + output_dim*index2[i]] += dL_dK[i+j*num_inducing];
+          }
+        }
+        """
+        N, num_inducing, output_dim = index.size, index2.size, self.output_dim
+        weave.inline(code, ['N', 'num_inducing', 'output_dim', 'dL_dK', 'dL_dK_small', 'index', 'index2'])
+        return dL_dK_small
+
+    def _gradient_reduce_numpy(self, dL_dK, index, index2):
+        """
+        Sum dL_dK into an array shaped like B using numpy masks.
+
+        Entry [j, i] receives the sum of dL_dK over rows whose index is i and
+        columns whose index2 is j (same layout as the weave version).
+        """
+        index, index2 = index[:,0], index2[:,0]
+        dL_dK_small = np.zeros_like(self.B)
+        for i in range(self.output_dim):
+            tmp1 = dL_dK[index==i]
+            for j in range(self.output_dim):
+                dL_dK_small[j,i] = tmp1[:,index2==j].sum()
+        return dL_dK_small
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        """Set the gradients of W and kappa given the gradient wrt the diagonal."""
+        index = np.asarray(X, dtype=int).flatten()
+        # total gradient landing on each diagonal entry of B
+        dL_dKdiag_small = np.array([dL_dKdiag[index==i].sum() for i in range(self.output_dim)])
+        self.W.gradient = 2.*self.W*dL_dKdiag_small[:, None]
+        self.kappa.gradient = dL_dKdiag_small
+
+    def gradients_X(self, dL_dK, X, X2=None):
+        """Return zeros shaped like X (no gradient wrt the index inputs)."""
+        return np.zeros(X.shape)
+
+    def gradients_X_diag(self, dL_dKdiag, X):
+        """Return zeros shaped like X (no gradient wrt the index inputs)."""
+        return np.zeros(X.shape)
+
--- a/GPy/kern/_src/independent_outputs.py
+++ b/GPy/kern/_src/independent_outputs.py
@ -0,0 +1,202 @@
+# Copyright (c) 2012, James Hensman
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+from kern import Kern, CombinationKernel
+import numpy as np
+import itertools
+
+def index_to_slices(index):
+    """
+    take a numpy array of integers (index) and return a nested list of slices such that the slices describe the start, stop points for each integer in the index.
+
+    The result has one entry per integer in 0..max(index); an integer that
+    never occurs gets an empty list. An empty index returns an empty list.
+    The switch-point detection relies on the index values being
+    non-negative.
+
+    e.g.
+    >>> index = np.asarray([0,0,0,1,1,1,2,2,2])
+    returns
+    >>> [[slice(0,3,None)],[slice(3,6,None)],[slice(6,9,None)]]
+
+    or, a more complicated example
+    >>> index = np.asarray([0,0,1,1,0,2,2,2,1,1])
+    returns
+    >>> [[slice(0,2,None),slice(4,5,None)],[slice(2,4,None),slice(8,10,None)],[slice(5,8,None)]]
+    """
+    if len(index)==0:
+        return []
+
+    #construct the return structure
+    ind = np.asarray(index, dtype=int)
+    ret = [[] for _ in range(ind.max()+1)]
+
+    #find the switchpoints: append a sentinel that differs from both the first
+    #and the last entry, so position 0 and the end of the array both register
+    #as changes when compared with the array rolled by one
+    ind_ = np.hstack((ind, ind[0]+ind[-1]+1))
+    switchpoints = np.nonzero(ind_ - np.roll(ind_, +1))[0]
+
+    #each consecutive pair of switchpoints delimits one run of equal values
+    for start, stop in zip(switchpoints[:-1], switchpoints[1:]):
+        ret[ind[start]].append(slice(int(start), int(stop)))
+    return ret
+
+class IndependentOutputs(CombinationKernel):
+    """
+    A kernel which can represent several independent functions.  this kernel
+    'switches off' parts of the matrix where the output indexes are different.
+
+    The index of the functions is given by the last column in the input X the
+    rest of the columns of X are passed to the underlying kernel for
+    computation (in blocks).
+
+    :param kernels: either a kernel, or list of kernels to work with. If it is
+    a list of kernels the indices in the index_dim, index the kernels you gave!
+    :param index_dim: the column of X holding the output index (default: last)
+    """
+    def __init__(self, kernels, index_dim=-1, name='independ'):
+        assert isinstance(index_dim, int), "IndependentOutputs kernel is only defined with one input dimension being the index"
+        if not isinstance(kernels, list):
+            self.single_kern = True
+            self.kern = kernels
+            kernels = [kernels]
+        else:
+            self.single_kern = False
+            self.kern = kernels
+        super(IndependentOutputs, self).__init__(kernels=kernels, extra_dims=[index_dim], name=name)
+        self.index_dim = index_dim
+
+    def _kerns(self):
+        """Kernels to pair with the per-output slices (one kernel repeated if single)."""
+        return itertools.repeat(self.kern) if self.single_kern else self.kern
+
+    def K(self,X ,X2=None):
+        """Block covariance: entries between different output indices stay zero."""
+        slices = index_to_slices(X[:,self.index_dim])
+        kerns = self._kerns()
+        if X2 is None:
+            target = np.zeros((X.shape[0], X.shape[0]))
+            for kern, slices_i in zip(kerns, slices):
+                for s, ss in itertools.product(slices_i, slices_i):
+                    target[s, ss] = kern.K(X[s,:], X[ss,:])
+        else:
+            slices2 = index_to_slices(X2[:,self.index_dim])
+            target = np.zeros((X.shape[0], X2.shape[0]))
+            for kern, slices_i, slices_j in zip(kerns, slices, slices2):
+                for s, s2 in itertools.product(slices_i, slices_j):
+                    target[s, s2] = kern.K(X[s,:], X2[s2,:])
+        return target
+
+    def Kdiag(self,X):
+        """Diagonal of the covariance, assembled slice by slice."""
+        slices = index_to_slices(X[:,self.index_dim])
+        target = np.zeros(X.shape[0])
+        for kern, slices_i in zip(self._kerns(), slices):
+            for s in slices_i:
+                np.copyto(target[s], kern.Kdiag(X[s]))
+        return target
+
+    def update_gradients_full(self,dL_dK,X,X2=None):
+        """
+        Accumulate the parameter gradients over all blocks and store them.
+
+        Each call to a kernel's update_gradients_full overwrites its
+        gradient, so the per-block gradients are summed in `target` and
+        written back once at the end.
+        """
+        slices = index_to_slices(X[:,self.index_dim])
+        kerns = self._kerns()
+        if self.single_kern:
+            target = np.zeros(self.kern.size)
+        else:
+            target = [np.zeros(kern.size) for kern, _ in zip(kerns, slices)]
+
+        def collate_grads(kern, i, dL, X, X2):
+            kern.update_gradients_full(dL,X,X2)
+            if self.single_kern:
+                target[:] += kern.gradient
+            else:
+                target[i][:] += kern.gradient
+
+        if X2 is None:
+            for i, (kern, slices_i) in enumerate(zip(kerns, slices)):
+                for s, ss in itertools.product(slices_i, slices_i):
+                    collate_grads(kern, i, dL_dK[s,ss], X[s], X[ss])
+        else:
+            slices2 = index_to_slices(X2[:,self.index_dim])
+            for i, (kern, slices_i, slices_j) in enumerate(zip(kerns, slices, slices2)):
+                for s2 in slices_j:
+                    for s in slices_i:
+                        collate_grads(kern, i, dL_dK[s,s2], X[s], X2[s2])
+
+        if self.single_kern:
+            # address the kernel explicitly rather than through a leaked loop variable
+            self.kern.gradient = target
+        else:
+            for i, (kern, _) in enumerate(zip(kerns, slices)):
+                kern.gradient[Ellipsis] = target[i]
+
+    def gradients_X(self,dL_dK, X, X2=None):
+        """Gradient of the objective wrt X, computed per output index."""
+        target = np.zeros(X.shape)
+        kerns = self._kerns()
+        # TODO: make use of index_to_slices
+        values = np.unique(X[:,self.index_dim])
+        slices = [X[:,self.index_dim]==i for i in values]
+        if X2 is None:
+            for kern, s in zip(kerns, slices):
+                # two-step indexing: dL_dK[s,s] with boolean masks would pick
+                # only the diagonal entries instead of the block
+                target[s] = kern.gradients_X(dL_dK[s, :][:, s],X[s],None)
+        else:
+            slices2 = [X2[:,self.index_dim]==i for i in values]
+            for kern, s, s2 in zip(kerns, slices, slices2):
+                target[s] = kern.gradients_X(dL_dK[s, :][:, s2],X[s],X2[s2])
+        return target
+
+    def gradients_X_diag(self, dL_dKdiag, X):
+        """Gradient of the objective wrt X using only the diagonal, per slice."""
+        slices = index_to_slices(X[:,self.index_dim])
+        target = np.zeros(X.shape)
+        for kern, slices_i in zip(self._kerns(), slices):
+            for s in slices_i:
+                target[s] = kern.gradients_X_diag(dL_dKdiag[s],X[s])
+        return target
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        """Accumulate the diagonal-only parameter gradients over all slices and store them."""
+        slices = index_to_slices(X[:,self.index_dim])
+        kerns = self._kerns()
+        if self.single_kern:
+            target = np.zeros(self.kern.size)
+        else:
+            target = [np.zeros(kern.size) for kern, _ in zip(kerns, slices)]
+
+        def collate_grads(kern, i, dL, X):
+            kern.update_gradients_diag(dL,X)
+            if self.single_kern:
+                target[:] += kern.gradient
+            else:
+                target[i][:] += kern.gradient
+
+        for i, (kern, slices_i) in enumerate(zip(kerns, slices)):
+            for s in slices_i:
+                collate_grads(kern, i, dL_dKdiag[s], X[s,:])
+
+        if self.single_kern:
+            # address the kernel explicitly rather than through a leaked loop variable
+            self.kern.gradient = target
+        else:
+            for i, (kern, _) in enumerate(zip(kerns, slices)):
+                kern.gradient[Ellipsis] = target[i]
+
+class Hierarchical(CombinationKernel):
+    """
+    A kernel which can represent a simple hierarchical model.
+
+    See Hensman et al 2013, "Hierarchical Bayesian modelling of gene expression time
+    series across irregularly sampled replicates and clusters"
+    http://www.biomedcentral.com/1471-2105/14/252
+
+    To construct this kernel, you must pass a list of kernels. The first kernel
+    is the 'base' kernel and is computed everywhere. Every additional kernel
+    adds another layer to the hierarchy, with a corresponding column of the
+    input matrix which indexes which function the data are in at that level.
+
+    For more, see the ipython notebook documentation on Hierarchical
+    covariances.
+    """
+    def __init__(self, kernels, name='hierarchy'):
+        assert all([k.input_dim==kernels[0].input_dim for k in kernels])
+        assert len(kernels) > 1
+        n_levels = len(kernels) -1
+        self.levels = n_levels
+        input_max = max([k.input_dim for k in kernels])
+        # one extra index column per level, placed after the kernel inputs
+        level_columns = range(input_max, input_max + n_levels)
+        super(Hierarchical, self).__init__(kernels=kernels, extra_dims = level_columns, name=name)
+
+    def K(self,X ,X2=None):
+        """Base covariance everywhere, plus each level's kernel on its index blocks."""
+        K = self.parts[0].K(X, X2) # compute 'base' kern everywhere
+        slices = [index_to_slices(X[:,i]) for i in self.extra_dims]
+        if X2 is None:
+            for k, slices_k in zip(self.parts[1:], slices):
+                for slices_i in slices_k:
+                    for s in slices_i:
+                        # add in place into the block of K
+                        np.add(K[s,s], k.K(X[s], None), K[s, s])
+        else:
+            slices2 = [index_to_slices(X2[:,i]) for i in self.extra_dims]
+            for k, slices_k1, slices_k2 in zip(self.parts[1:], slices, slices2):
+                for slices_i, slices_j in zip(slices_k1, slices_k2):
+                    for s, ss in zip(slices_i, slices_j):
+                        np.add(K[s,ss], k.K(X[s], X2[ss]), K[s, ss])
+        return K
+
+    def Kdiag(self,X):
+        """Diagonal of the covariance, taken from the full matrix."""
+        return np.diag(self.K(X))
+
+    def gradients_X(self, dL_dK, X, X2=None):
+        """Not implemented for this kernel."""
+        raise NotImplementedError
+
+    def update_gradients_full(self,dL_dK,X,X2=None):
+        """Set the gradients of all parts; only implemented for X2 is None."""
+        slices = [index_to_slices(X[:,i]) for i in self.extra_dims]
+        if X2 is not None:
+            raise NotImplementedError
+        self.parts[0].update_gradients_full(dL_dK, X, None)
+        for k, slices_k in zip(self.parts[1:], slices):
+            # each update overwrites k.gradient, so sum the blocks here
+            target = np.zeros(k.size)
+            for slices_i in slices_k:
+                for s in slices_i:
+                    k.update_gradients_full(dL_dK[s,s], X[s], None)
+                    target += k.gradient
+            k.gradient[:] = target
+
+
--- a/GPy/kern/_src/kern.py
+++ b/GPy/kern/_src/kern.py
@ -0,0 +1,280 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import sys
+import numpy as np
+from ...core.parameterization.parameterized import Parameterized
+from kernel_slice_operations import KernCallsViaSlicerMeta
+from ...util.caching import Cache_this
+from GPy.core.parameterization.observable_array import ObsAr
+
+
+
+class Kern(Parameterized):
+    """
+    Base class for kernels (covariance functions).
+
+    The computational methods declared here (K, Kdiag, the psi statistics
+    and the gradient methods) only raise NotImplementedError and are meant
+    to be overridden by subclasses.
+    """
+    #===========================================================================
+    # This adds input slice support. The rather ugly code for slicing can be
+    # found in kernel_slice_operations
+    __metaclass__ = KernCallsViaSlicerMeta
+    #===========================================================================
+    _support_GPU=False
+    def __init__(self, input_dim, active_dims, name, useGPU=False, *a, **kw):
+        """
+        The base class for a kernel: a positive definite function
+        which forms a covariance function (kernel).
+
+        input_dim:
+
+            is the number of dimensions to work on. Make sure to give the
+            tight dimensionality of inputs. It is converted with int() and
+            must equal the number of entries in active_dims.
+
+        active_dims:
+
+            is the active_dimensions of inputs X we will work on.
+            Only positive integers are allowed in active_dims!
+            If active_dims is None, it defaults to np.arange(input_dim),
+            i.e. the first input_dim columns of X.
+
+        :param int input_dim: the number of input dimensions to the function
+        :param array-like|None active_dims: list of indices on which dimensions this kernel works on, or None for the first input_dim dimensions
+        :param str name: name handed to the Parameterized base class
+        :param bool useGPU: request GPU use; only honoured if the class sets _support_GPU
+
+        Do not instantiate.
+        """
+        super(Kern, self).__init__(name=name, *a, **kw)
+        self.input_dim = int(input_dim)
+
+        if active_dims is None:
+            active_dims = np.arange(input_dim)
+
+        self.active_dims = np.atleast_1d(active_dims).astype(int)
+
+        assert self.active_dims.size == self.input_dim, "input_dim={} does not match len(active_dim)={}, active_dims={}".format(self.input_dim, self.active_dims.size, self.active_dims)
+
+        # read by the slicing wrappers in kernel_slice_operations
+        self._sliced_X = 0
+        self.useGPU = self._support_GPU and useGPU
+        # one-element boolean array, so the flag can be updated in place
+        self._return_psi2_n_flag = ObsAr(np.zeros(1)).astype(bool)
+
+    @property
+    def return_psi2_n(self):
+        """
+        Flag whether to pass back psi2 as NxMxM or MxM, by summing out N.
+        """
+        return self._return_psi2_n_flag[0]
+    @return_psi2_n.setter
+    def return_psi2_n(self, val):
+        # set the flag on every Kern visited by traverse
+        def visit(self):
+            if isinstance(self, Kern):
+                self._return_psi2_n_flag[0]=val
+        self.traverse(visit)
+
+    @Cache_this(limit=20)
+    def _slice_X(self, X):
+        """Return the columns of X selected by active_dims."""
+        return X[:, self.active_dims]
+
+    def K(self, X, X2):
+        """
+        Compute the kernel function.
+
+        :param X: the first set of inputs to the kernel
+        :param X2: (optional) the second set of arguments to the kernel. If X2
+                   is None, this is passed through to the 'part' object, which
+                   handles this as X2 == X.
+        """
+        raise NotImplementedError
+    def Kdiag(self, X):
+        """Compute the diagonal of the kernel matrix for X."""
+        raise NotImplementedError
+    # psi0, psi1, psi2: presumably the expectations of the kernel under the
+    # variational posterior (see update_gradients_expectations) -- confirm
+    def psi0(self, Z, variational_posterior):
+        raise NotImplementedError
+    def psi1(self, Z, variational_posterior):
+        raise NotImplementedError
+    def psi2(self, Z, variational_posterior):
+        raise NotImplementedError
+    def gradients_X(self, dL_dK, X, X2):
+        raise NotImplementedError
+    def gradients_X_diag(self, dL_dKdiag, X):
+        raise NotImplementedError
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        """ update the gradients of all parameters when using only the diagonal elements of the covariance matrix"""
+        raise NotImplementedError
+
+    def update_gradients_full(self, dL_dK, X, X2):
+        """Set the gradients of all parameters when doing full (N) inference."""
+        raise NotImplementedError
+
+    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        """
+        Set the gradients of all parameters when doing inference with
+        uncertain inputs, using expectations of the kernel.
+
+        The essential maths is
+
+        dL_d{theta_i} = dL_dpsi0 * dpsi0_d{theta_i} +
+                        dL_dpsi1 * dpsi1_d{theta_i} +
+                        dL_dpsi2 * dpsi2_d{theta_i}
+        """
+        raise NotImplementedError
+
+    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        """
+        Returns the derivative of the objective wrt Z, using the chain rule
+        through the expectation variables.
+        """
+        raise NotImplementedError
+
+    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        """
+        Compute the gradients wrt the parameters of the variational
+        distribution q(X), chain-ruling via the expectations of the kernel
+        """
+        raise NotImplementedError
+
+    def plot(self, x=None, fignum=None, ax=None, title=None, plot_limits=None, resolution=None, **mpl_kwargs):
+        """
+        plot this kernel.
+
+        :param x: the value to use for the other kernel argument (kernels are a function of two variables!)
+        :param fignum: figure number of the plot
+        :param ax: matplotlib axis to plot on
+        :param title: the matplotlib title
+        :param plot_limits: the range over which to plot the kernel
+        :param resolution: the resolution of the lines used in plotting
+        :param mpl_kwargs: valid keyword arguments to pass through to matplotlib (e.g. lw=7)
+        """
+        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
+        from ...plotting.matplot_dep import kernel_plots
+        kernel_plots.plot(self, x, fignum, ax, title, plot_limits, resolution, **mpl_kwargs)
+
+    def plot_ARD(self, *args, **kw):
+        """
+        See :class:`~GPy.plotting.matplot_dep.kernel_plots`
+        """
+        import sys
+        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
+        from ...plotting.matplot_dep import kernel_plots
+        return kernel_plots.plot_ARD(self,*args,**kw)
+
+    def input_sensitivity(self, summarize=True):
+        """
+        Returns the sensitivity for each dimension of this kernel.
+
+        This base implementation returns zeros of length input_dim and does
+        not use `summarize`.
+        """
+        return np.zeros(self.input_dim)
+
+    def __add__(self, other):
+        """ Overloading of the '+' operator. for more control, see self.add """
+        return self.add(other)
+
+    def __iadd__(self, other):
+        """ Overloading of the '+=' operator; returns a new kernel from self.add """
+        return self.add(other)
+
+    def add(self, other, name='add'):
+        """
+        Add another kernel to this one.
+
+        :param other: the other kernel to be added
+        :type other: GPy.kern
+        :param name: name given to the resulting Add kernel
+
+        """
+        assert isinstance(other, Kern), "only kernels can be added to kernels..."
+        from add import Add
+        return Add([self, other], name=name)
+
+    def __mul__(self, other):
+        """ Here we overload the '*' operator. See self.prod for more information"""
+        return self.prod(other)
+
+    def __imul__(self, other):
+        """ Here we overload the '*=' operator. See self.prod for more information"""
+        return self.prod(other)
+
+    def __pow__(self, other):
+        """
+        Shortcut for tensor `prod`.
+
+        Both kernels must have active_dims equal to range(input_dim). Note
+        that `other.active_dims` is shifted in place by self.input_dim.
+        """
+        assert np.all(self.active_dims == range(self.input_dim)), "Can only use kernels, which have their input_dims defined from 0"
+        assert np.all(other.active_dims == range(other.input_dim)), "Can only use kernels, which have their input_dims defined from 0"
+        other.active_dims += self.input_dim
+        return self.prod(other)
+
+    def prod(self, other, name='mul'):
+        """
+        Multiply two kernels.
+
+        :param other: the other kernel to be multiplied
+        :type other: GPy.kern
+        :param name: name given to the resulting Prod kernel
+
+        """
+        assert isinstance(other, Kern), "only kernels can be multiplied to kernels..."
+        from prod import Prod
+        #kernels = []
+        #if isinstance(self, Prod): kernels.extend(self.parameters)
+        #else: kernels.append(self)
+        #if isinstance(other, Prod): kernels.extend(other.parameters)
+        #else: kernels.append(other)
+        return Prod([self, other], name)
+
+    def _check_input_dim(self, X):
+        """Assert that X has exactly input_dim columns."""
+        assert X.shape[1] == self.input_dim, "{} did not specify active_dims and X has wrong shape: X_dim={}, whereas input_dim={}".format(self.name, X.shape[1], self.input_dim)
+
+    def _check_active_dims(self, X):
+        """Assert that X has at least as many columns as there are active_dims."""
+        assert X.shape[1] >= len(self.active_dims), "At least {} dimensional X needed, X.shape={!s}".format(len(self.active_dims), X.shape)
+
+
+class CombinationKernel(Kern):
+    """
+    Abstract super class for combination kernels.
+    A combination kernel combines (a list of) kernels and works on those.
+    Examples are the Hierarchical, Add and Prod kernels.
+    """
+    def __init__(self, kernels, name, extra_dims=()):
+        """
+        Abstract super class for combination kernels.
+        A combination kernel combines (a list of) kernels and works on those.
+        Examples are the Hierarchical, Add and Prod kernels.
+
+        :param list kernels: List of kernels to combine (can be only one element)
+        :param str name: name of the combination kernel
+        :param array-like extra_dims: if needed extra dimensions for the combination kernel to work on
+        """
+        assert all([isinstance(k, Kern) for k in kernels])
+        extra_dims = np.array(extra_dims, dtype=int)
+        input_dim, active_dims = self.get_input_dim_active_dims(kernels, extra_dims)
+        # initialize the kernel with the full input_dim
+        super(CombinationKernel, self).__init__(input_dim, active_dims, name)
+        self.extra_dims = extra_dims
+        self.link_parameters(*kernels)
+
+    @property
+    def parts(self):
+        """The combined kernels (the linked parameters of this object)."""
+        return self.parameters
+
+    def get_input_dim_active_dims(self, kernels, extra_dims = None):
+        """
+        Work out the input dimensionality spanned by the combined kernels.
+
+        input_dim is one more than the largest active dimension of any
+        kernel, plus the number of extra_dims (if given); active_dims is
+        np.arange(input_dim).
+
+        :returns: tuple (input_dim, active_dims)
+        """
+        input_dim = max(k.active_dims.max() for k in kernels) + 1
+
+        if extra_dims is not None:
+            input_dim += extra_dims.size
+
+        active_dims = np.arange(input_dim)
+        return input_dim, active_dims
+
+    def input_sensitivity(self, summarize=True):
+        """
+        If summarize is true, we want to get the summarized view of the sensitivities,
+        otherwise put everything into an array with shape (#kernels, input_dim)
+        in the order of appearance of the kernels in the parameterized object.
+
+        Not implemented here: subclasses have to override it.
+        """
+        raise NotImplementedError("Choose the kernel you want to get the sensitivity for. You need to override the default behaviour for getting the input sensitivity to be able to get the input sensitivity. For sum kernel it is the sum of all sensitivities, TODO: product kernel? Other kernels?, also TODO: shall we return all the sensitivities here in the combination kernel? So we can combine them however we want? This could lead to just plot all the sensitivities here...")
+
+    def _check_active_dims(self, X):
+        # no check at the combination level
+        return
+
+    def _check_input_dim(self, X):
+        # As combination kernels cannot always know, what their inner kernels have as input dims, the check will be done inside them, respectively
+        return
--- a/GPy/kern/_src/kernel_slice_operations.py
+++ b/GPy/kern/_src/kernel_slice_operations.py
@ -0,0 +1,143 @@
+'''
+Created on 11 Mar 2014
+
+@author: maxz
+'''
+from ...core.parameterization.parameterized import ParametersChangedMeta
+import numpy as np
+from functools import wraps
+
+def put_clean(dct, name, func):
+    """
+    Replace ``dct[name]`` by ``func(dct[name])``, keeping the untouched original
+    under the key ``'_clean_<name>'``. Does nothing when ``name`` is not in ``dct``.
+    """
+    if name not in dct:
+        return
+    original = dct[name]
+    dct['_clean_{}'.format(name)] = original
+    dct[name] = func(original)
+
+class KernCallsViaSlicerMeta(ParametersChangedMeta):
+    """
+    Metaclass that wraps the kernel entry points found in a class body with the
+    matching slicing decorator. The undecorated function stays available under
+    ``_clean_<name>`` (see ``put_clean``).
+    """
+    def __new__(cls, name, bases, dct):
+        # (method name, decorator) pairs, applied in this order
+        wrappers = (
+            ('K', _slice_K),
+            ('Kdiag', _slice_Kdiag),
+            ('update_gradients_full', _slice_update_gradients_full),
+            ('update_gradients_diag', _slice_update_gradients_diag),
+            ('gradients_X', _slice_gradients_X),
+            ('gradients_X_diag', _slice_gradients_X_diag),
+            ('psi0', _slice_psi),
+            ('psi1', _slice_psi),
+            ('psi2', _slice_psi),
+            ('update_gradients_expectations', _slice_update_gradients_expectations),
+            ('gradients_Z_expectations', _slice_gradients_Z_expectations),
+            ('gradients_qX_expectations', _slice_gradients_qX_expectations),
+        )
+        for method_name, wrapper in wrappers:
+            put_clean(dct, method_name, wrapper)
+        return super(KernCallsViaSlicerMeta, cls).__new__(cls, name, bases, dct)
+
+class _Slice_wrap(object):
+    """
+    Context manager preparing the inputs of a kernel call.
+
+    On the outermost call (``k._sliced_X == 0``) of a kernel that has
+    ``active_dims``, X and X2 are cut down with ``k._slice_X``; on nested calls
+    they are passed through unchanged. ``k._sliced_X`` counts the nesting depth.
+    """
+    def __init__(self, k, X, X2=None):
+        self.k = k
+        self.shape = X.shape
+        assert X.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X.shape={!s}".format(X.shape)
+        if X2 is not None:
+            assert X2.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X2.shape={!s}".format(X2.shape)
+        kern = self.k
+        if (kern.active_dims is not None) and (kern._sliced_X == 0):
+            # outermost call: restrict the inputs to the kernel's active dims
+            kern._check_active_dims(X)
+            self.X = kern._slice_X(X)
+            self.X2 = None if X2 is None else kern._slice_X(X2)
+            self.ret = True
+        else:
+            # nested call (or no active_dims): inputs are used as given
+            kern._check_input_dim(X)
+            self.X, self.X2 = X, X2
+            self.ret = False
+
+    def __enter__(self):
+        self.k._sliced_X += 1
+        return self
+
+    def __exit__(self, *a):
+        self.k._sliced_X -= 1
+
+    def handle_return_array(self, return_val):
+        """Scatter a sliced result back into a zero array shaped like the original X."""
+        if not self.ret:
+            return return_val
+        full = np.zeros(self.shape)
+        full[:, self.k.active_dims] = return_val
+        return full
+
+def _slice_K(f):
+    """Decorate ``K`` so that ``f`` receives the inputs prepared by ``_Slice_wrap``."""
+    @wraps(f)
+    def wrap(self, X, X2 = None, *a, **kw):
+        with _Slice_wrap(self, X, X2) as sliced:
+            return f(self, sliced.X, sliced.X2, *a, **kw)
+    return wrap
+
+def _slice_Kdiag(f):
+    """Decorate ``Kdiag`` so that ``f`` receives the X prepared by ``_Slice_wrap``."""
+    @wraps(f)
+    def wrap(self, X, *a, **kw):
+        with _Slice_wrap(self, X, None) as sliced:
+            return f(self, sliced.X, *a, **kw)
+    return wrap
+
+def _slice_update_gradients_full(f):
+    """Decorate ``update_gradients_full``; X and X2 go through ``_Slice_wrap`` first."""
+    @wraps(f)
+    def wrap(self, dL_dK, X, X2=None):
+        with _Slice_wrap(self, X, X2) as sliced:
+            return f(self, dL_dK, sliced.X, sliced.X2)
+    return wrap
+
+def _slice_update_gradients_diag(f):
+    """Decorate ``update_gradients_diag``; X goes through ``_Slice_wrap`` first."""
+    @wraps(f)
+    def wrap(self, dL_dKdiag, X):
+        with _Slice_wrap(self, X, None) as sliced:
+            return f(self, dL_dKdiag, sliced.X)
+    return wrap
+
+def _slice_gradients_X(f):
+    """
+    Decorate ``gradients_X``: inputs go through ``_Slice_wrap`` and the result
+    is passed to ``handle_return_array`` before being returned.
+    """
+    @wraps(f)
+    def wrap(self, dL_dK, X, X2=None):
+        with _Slice_wrap(self, X, X2) as sliced:
+            grad = f(self, dL_dK, sliced.X, sliced.X2)
+            return sliced.handle_return_array(grad)
+    return wrap
+
+def _slice_gradients_X_diag(f):
+    """
+    Decorate ``gradients_X_diag``: X goes through ``_Slice_wrap`` and the result
+    is passed to ``handle_return_array`` before being returned.
+    """
+    @wraps(f)
+    def wrap(self, dL_dKdiag, X):
+        with _Slice_wrap(self, X, None) as sliced:
+            grad = f(self, dL_dKdiag, sliced.X)
+            return sliced.handle_return_array(grad)
+    return wrap
+
+def _slice_psi(f):
+    """
+    Decorate a psi-statistic method: Z takes the X slot and the variational
+    posterior the X2 slot of ``_Slice_wrap``.
+    """
+    @wraps(f)
+    def wrap(self, Z, variational_posterior):
+        with _Slice_wrap(self, Z, variational_posterior) as sliced:
+            return f(self, sliced.X, sliced.X2)
+    return wrap
+
+def _slice_update_gradients_expectations(f):
+    """
+    Decorate ``update_gradients_expectations``: Z takes the X slot and the
+    variational posterior the X2 slot of ``_Slice_wrap``.
+    """
+    @wraps(f)
+    def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        with _Slice_wrap(self, Z, variational_posterior) as sliced:
+            return f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, sliced.X, sliced.X2)
+    return wrap
+
+def _slice_gradients_Z_expectations(f):
+    """
+    Decorate ``gradients_Z_expectations``: Z and the variational posterior go
+    through ``_Slice_wrap`` and the result through ``handle_return_array``.
+    """
+    @wraps(f)
+    def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        with _Slice_wrap(self, Z, variational_posterior) as sliced:
+            grad = f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, sliced.X, sliced.X2)
+            return sliced.handle_return_array(grad)
+    return wrap
+
+def _slice_gradients_qX_expectations(f):
+    """
+    Decorate ``gradients_qX_expectations``.
+
+    Here the variational posterior takes the X slot of ``_Slice_wrap`` (so its
+    shape is the one restored by ``handle_return_array``) and Z the X2 slot.
+    The first two entries of the result are passed through
+    ``handle_return_array``; any further entries are returned untouched.
+    """
+    @wraps(f)
+    def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        with _Slice_wrap(self, variational_posterior, Z) as sliced:
+            grads = list(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, sliced.X2, sliced.X))
+            first, second = grads[0], grads[1]
+            grads[0] = sliced.handle_return_array(first)
+            grads[1] = sliced.handle_return_array(second)
+        return grads
+    return wrap
--- a/GPy/kern/_src/linear.py
+++ b/GPy/kern/_src/linear.py
@ -0,0 +1,177 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+from kern import Kern
+from ...util.linalg import tdot
+from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp
+from ...util.caching import Cache_this
+from ...util.config import *
+from .psi_comp import PSICOMP_Linear
+
+class Linear(Kern):
+    """
+    Linear kernel
+
+    .. math::
+
+       k(x,y) = \sum_{i=1}^{D} \sigma^2_i x_i y_i
+
+    where D is ``input_dim``.
+
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
+    :param variances: the vector of variances :math:`\sigma^2_i`
+    :type variances: array or list of the appropriate size (or float if there
+                     is only one variance parameter)
+    :param ARD: Auto Relevance Determination. If False, the kernel has only one
+                variance parameter \sigma^2, otherwise there is one variance
+                parameter per dimension.
+    :type ARD: Boolean
+    :rtype: kernel object
+
+    """
+
+    def __init__(self, input_dim, variances=None, ARD=False, active_dims=None, name='linear'):
+        super(Linear, self).__init__(input_dim, active_dims, name)
+        self.ARD = ARD
+        # one variance per input dimension with ARD, a single shared one without
+        if ARD:
+            n_variances = self.input_dim
+            size_error = "bad number of variances, need one ARD variance per input_dim"
+        else:
+            n_variances = 1
+            size_error = "Only one variance needed for non-ARD kernel"
+
+        if variances is None:
+            variances = np.ones(n_variances)
+        else:
+            variances = np.asarray(variances)
+            assert variances.size == n_variances, size_error
+
+        self.variances = Param('variances', variances, Logexp())
+        self.link_parameter(self.variances)
+        self.psicomp = PSICOMP_Linear()
+
+    @Cache_this(limit=2)
+    def K(self, X, X2=None):
+        """Covariance between X and X2 (between X and itself when X2 is None)."""
+        if not self.ARD:
+            return self._dot_product(X, X2) * self.variances
+        # ARD: scale every input column by the square root of its variance
+        root_var = np.sqrt(self.variances)
+        if X2 is None:
+            return tdot(X*root_var)
+        return np.dot(X*root_var, (X2*root_var).T)
+
+    @Cache_this(limit=1, ignore_args=(0,))
+    def _dot_product(self, X, X2=None):
+        """Plain inner products X X2^T (X X^T when X2 is None)."""
+        if X2 is None:
+            return tdot(X)
+        return np.dot(X, X2.T)
+
+    def Kdiag(self, X):
+        """Diagonal of K(X, X)."""
+        weighted_squares = self.variances * np.square(X)
+        return np.sum(weighted_squares, -1)
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        """Store dL/dvariances, given dL/dK, in ``self.variances.gradient``."""
+        if not self.ARD:
+            self.variances.gradient = np.sum(self._dot_product(X, X2) * dL_dK)
+            return
+        other = X if X2 is None else X2
+        self.variances.gradient = np.einsum('ij,iq,jq->q', dL_dK, X, other)
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        """Store dL/dvariances, given dL/dKdiag, in ``self.variances.gradient``."""
+        contributions = dL_dKdiag[:, None] * X ** 2
+        if self.ARD:
+            gradient = contributions.sum(0)
+        else:
+            gradient = np.atleast_1d(contributions.sum())
+        self.variances.gradient = gradient
+
+
+    def gradients_X(self, dL_dK, X, X2=None):
+        """Gradient of the objective with respect to X, given dL/dK."""
+        if X2 is not None:
+            return np.einsum('jq,q,ij->iq', X2, self.variances, dL_dK)
+        # X2 is None: X enters K twice, hence the factor of two
+        return np.einsum('jq,q,ij->iq', X, 2*self.variances, dL_dK)
+
+    def gradients_X_diag(self, dL_dKdiag, X):
+        """Gradient of the objective with respect to X, given dL/dKdiag."""
+        return 2.*self.variances*dL_dKdiag[:,None]*X
+
+    def input_sensitivity(self, summarize=True):
+        """Variances broadcast to length ``input_dim``; ``summarize`` is not used."""
+        return np.ones(self.input_dim) * self.variances
+
+    #---------------------------------------#
+    #             PSI statistics            #
+    #---------------------------------------#
+    # All psi quantities are delegated to ``self.psicomp``; the methods below
+    # only pick the relevant entry of what it returns.
+
+    def psi0(self, Z, variational_posterior):
+        psis = self.psicomp.psicomputations(self.variances, Z, variational_posterior)
+        return psis[0]
+
+    def psi1(self, Z, variational_posterior):
+        psis = self.psicomp.psicomputations(self.variances, Z, variational_posterior)
+        return psis[1]
+
+    def psi2(self, Z, variational_posterior):
+        psis = self.psicomp.psicomputations(self.variances, Z, variational_posterior)
+        return psis[2]
+
+    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        derivatives = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)
+        dL_dvar = derivatives[0]
+        # without ARD the single variance collects the sum over all entries
+        self.variances.gradient = dL_dvar if self.ARD else dL_dvar.sum()
+
+    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        derivatives = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)
+        return derivatives[1]
+
+    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        derivatives = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)
+        return derivatives[2:]
+
+class LinearFull(Kern):
+    """
+    Linear kernel with a full matrix between the inputs:
+
+    k(x, y) = x^T (W W^T + diag(kappa)) y
+
+    :param input_dim: the number of input dimensions
+    :param rank: number of columns of W
+    :param W: array of shape (input_dim, rank); all ones when None
+    :param kappa: array of shape (input_dim,); all ones when None
+    """
+    def __init__(self, input_dim, rank, W=None, kappa=None, active_dims=None, name='linear_full'):
+        super(LinearFull, self).__init__(input_dim, active_dims, name)
+        W = np.ones((input_dim, rank)) if W is None else W
+        kappa = np.ones(input_dim) if kappa is None else kappa
+        assert W.shape == (input_dim, rank)
+        assert kappa.shape == (input_dim,)
+
+        self.W = Param('W', W)
+        self.kappa = Param('kappa', kappa, Logexp())
+        self.link_parameters(self.W, self.kappa)
+
+    def K(self, X, X2=None):
+        other = X if X2 is None else X2
+        P = np.dot(self.W, self.W.T) + np.diag(self.kappa)
+        return np.einsum('ij,jk,lk->il', X, P, other)
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        other = X if X2 is None else X2
+        self.kappa.gradient = np.einsum('ij,ik,kj->j', X, dL_dK, other)
+        # W appears on both sides of W W^T, hence two contributions
+        grad_W = np.einsum('ij,kl,ik,lm->jm', X, other, dL_dK, self.W)
+        grad_W += np.einsum('ij,kl,ik,jm->lm', X, other, dL_dK, self.W)
+        self.W.gradient = grad_W
+
+    def Kdiag(self, X):
+        P = np.dot(self.W, self.W.T) + np.diag(self.kappa)
+        return np.einsum('ij,jk,ik->i', X, P, X)
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        squared_inputs = np.square(X)
+        self.kappa.gradient = np.einsum('ij,i->j', squared_inputs, dL_dKdiag)
+        self.W.gradient = 2.*np.einsum('ij,ik,jl,i->kl', X, X, self.W, dL_dKdiag)
+
+    def gradients_X(self, dL_dK, X, X2=None):
+        P = np.dot(self.W, self.W.T) + np.diag(self.kappa)
+        if X2 is not None:
+            return np.einsum('ij,jk,kl->il', dL_dK, X2, P)
+        # X2 is None: X enters K twice, hence the factor of two
+        return 2.*np.einsum('ij,jk,kl->il', dL_dK, X, P)
+
+    def gradients_X_diag(self, dL_dKdiag, X):
+        P = np.dot(self.W, self.W.T) + np.diag(self.kappa)
+        return 2.*np.einsum('jk,i,ij->ik', P, dL_dKdiag, X)
+
+
--- a/GPy/kern/_src/mlp.py
+++ b/GPy/kern/_src/mlp.py
@ -0,0 +1,129 @@
+# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from kern import Kern
+from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp
+import numpy as np
+# 4/tau with tau = 2*pi, i.e. 2/pi: the constant in front of the arcsine in the MLP kernel
+four_over_tau = 2./np.pi
+
+class MLP(Kern):
+    """
+
+    Multi layer perceptron kernel (also known as arc sine kernel or neural network kernel)
+
+    .. math::
+
+          k(x,y) = \\sigma^{2}\\frac{2}{\\pi }  \\text{asin} \\left ( \\frac{ \\sigma_w^2 x^\\top y+\\sigma_b^2}{\\sqrt{\\sigma_w^2x^\\top x + \\sigma_b^2 + 1}\\sqrt{\\sigma_w^2 y^\\top y + \\sigma_b^2 +1}} \\right )
+
+
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
+    :param variance: the variance :math:`\sigma^2`
+    :type variance: float
+    :param weight_variance: the variance of the prior over input weights in the neural network :math:`\sigma^2_w`
+    :type weight_variance: float
+    :param bias_variance: the variance of the prior over bias parameters :math:`\sigma^2_b`
+    :type bias_variance: float
+    :rtype: Kernpart object
+
+    Note: the constructor takes no ARD argument; the gradient code below sums
+    the weight variance gradient into a single number.
+
+    """
+
+    def __init__(self, input_dim, variance=1., weight_variance=1., bias_variance=100., active_dims=None, name='mlp'):
+        super(MLP, self).__init__(input_dim, active_dims, name)
+        self.variance = Param('variance', variance, Logexp())
+        self.weight_variance = Param('weight_variance', weight_variance, Logexp())
+        self.bias_variance = Param('bias_variance', bias_variance, Logexp())
+        self.link_parameters(self.variance, self.weight_variance, self.bias_variance)
+
+
+    def K(self, X, X2=None):
+        """Compute the covariance matrix between X and X2 (X and itself when X2 is None)."""
+        self._K_computations(X, X2)
+        return self.variance*self._K_dvar
+
+    def Kdiag(self, X):
+        """Compute the diagonal of the covariance matrix for X."""
+        self._K_diag_computations(X)
+        return self.variance*self._K_diag_dvar
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        """Derivative of the covariance with respect to the parameters.
+
+        The results are stored in the ``gradient`` attribute of ``variance``,
+        ``weight_variance`` and ``bias_variance``; nothing is returned.
+        """
+        self._K_computations(X, X2)
+        self.variance.gradient = np.sum(self._K_dvar*dL_dK)
+
+        denom3 = self._K_denom**3
+        # derivative of variance*four_over_tau*arcsin(arg) w.r.t. arg, times dL_dK
+        base = four_over_tau*self.variance/np.sqrt(1-self._K_asin_arg*self._K_asin_arg)
+        base_cov_grad = base*dL_dK
+
+        if X2 is None:
+            # squared norms of the rows of X
+            vec = np.diag(self._K_inner_prod)
+            self.weight_variance.gradient = ((self._K_inner_prod/self._K_denom
+                           -.5*self._K_numer/denom3
+                           *(np.outer((self.weight_variance*vec+self.bias_variance+1.), vec)
+                             +np.outer(vec,(self.weight_variance*vec+self.bias_variance+1.))))*base_cov_grad).sum()
+            self.bias_variance.gradient = ((1./self._K_denom
+                           -.5*self._K_numer/denom3
+                           *((vec[None, :]+vec[:, None])*self.weight_variance
+                           +2.*self.bias_variance + 2.))*base_cov_grad).sum()
+        else:
+            vec1 = (X*X).sum(1)
+            vec2 = (X2*X2).sum(1)
+            self.weight_variance.gradient = ((self._K_inner_prod/self._K_denom
+                           -.5*self._K_numer/denom3
+                           *(np.outer((self.weight_variance*vec1+self.bias_variance+1.), vec2) + np.outer(vec1, self.weight_variance*vec2 + self.bias_variance+1.)))*base_cov_grad).sum()
+            self.bias_variance.gradient = ((1./self._K_denom
+                           -.5*self._K_numer/denom3
+                           *((vec1[:, None]+vec2[None, :])*self.weight_variance
+                             + 2*self.bias_variance + 2.))*base_cov_grad).sum()
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        """Not implemented yet.
+
+        The signature matches the slicing wrapper, which calls this method as
+        ``f(self, dL_dKdiag, X)``; with the former ``(self, X)`` signature that
+        call failed with a TypeError before reaching the raise below.
+
+        :raises NotImplementedError: always
+        """
+        raise NotImplementedError("TODO")
+
+
+    def gradients_X(self, dL_dK, X, X2=None):
+        """Derivative of the covariance matrix with respect to X"""
+        self._K_computations(X, X2)
+        arg = self._K_asin_arg
+        numer = self._K_numer
+        denom = self._K_denom
+        denom3 = denom*denom*denom
+        if X2 is not None:
+            vec2 = (X2*X2).sum(1)*self.weight_variance+self.bias_variance + 1.
+            return four_over_tau*self.weight_variance*self.variance*((X2[None, :, :]/denom[:, :, None] - vec2[None, :, None]*X[:, None, :]*(numer/denom3)[:, :, None])*(dL_dK/np.sqrt(1-arg*arg))[:, :, None]).sum(1)
+        else:
+            # X enters both arguments of the kernel here, hence the extra factor 2
+            vec = (X*X).sum(1)*self.weight_variance+self.bias_variance + 1.
+            return 2*four_over_tau*self.weight_variance*self.variance*((X[None, :, :]/denom[:, :, None] - vec[None, :, None]*X[:, None, :]*(numer/denom3)[:, :, None])*(dL_dK/np.sqrt(1-arg*arg))[:, :, None]).sum(1)
+
+    def gradients_X_diag(self, dL_dKdiag, X):
+        """Gradient of diagonal of covariance with respect to X"""
+        self._K_diag_computations(X)
+        arg = self._K_diag_asin_arg
+        denom = self._K_diag_denom
+        # arg = numer/(numer+1), so (1 - arg)/denom equals 1/denom**2 = d(arg)/d(numer)
+        return four_over_tau*2.*self.weight_variance*self.variance*X*(1./denom*(1. - arg)*dL_dKdiag/(np.sqrt(1-arg*arg)))[:, None]
+
+
+    def _K_computations(self, X, X2):
+        """Pre-computations for the covariance matrix (used for computing the covariance and its gradients).
+
+        Stores ``_K_inner_prod``, ``_K_numer``, ``_K_denom``, ``_K_asin_arg``
+        and ``_K_dvar`` on the instance.
+        """
+        if X2 is None:
+            self._K_inner_prod = np.dot(X,X.T)
+            self._K_numer = self._K_inner_prod*self.weight_variance + self.bias_variance
+            vec = np.diag(self._K_numer) + 1.
+            self._K_denom = np.sqrt(np.outer(vec,vec))
+        else:
+            self._K_inner_prod = np.dot(X,X2.T)
+            self._K_numer = self._K_inner_prod*self.weight_variance + self.bias_variance
+            vec1 = (X*X).sum(1)*self.weight_variance + self.bias_variance + 1.
+            vec2 = (X2*X2).sum(1)*self.weight_variance + self.bias_variance + 1.
+            self._K_denom = np.sqrt(np.outer(vec1,vec2))
+        self._K_asin_arg = self._K_numer/self._K_denom
+        self._K_dvar = four_over_tau*np.arcsin(self._K_asin_arg)
+
+    def _K_diag_computations(self, X):
+        """Pre-computations concerning the diagonal terms (used for computation of diagonal and its gradients).
+
+        Stores ``_K_diag_numer``, ``_K_diag_denom``, ``_K_diag_asin_arg`` and
+        ``_K_diag_dvar`` on the instance.
+        """
+        self._K_diag_numer = (X*X).sum(1)*self.weight_variance + self.bias_variance
+        self._K_diag_denom = self._K_diag_numer+1.
+        self._K_diag_asin_arg = self._K_diag_numer/self._K_diag_denom
+        self._K_diag_dvar = four_over_tau*np.arcsin(self._K_diag_asin_arg)
--- a/GPy/kern/_src/periodic.py
+++ b/GPy/kern/_src/periodic.py
@ -0,0 +1,405 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+from kern import Kern
+from ...util.linalg import mdot
+from ...util.decorators import silence_errors
+from ...core.parameterization.param import Param
+from ...core.parameterization.transformations import Logexp
+
+class Periodic(Kern):
+    # Common base of the periodic kernels in this file: it stores the parameters
+    # and provides the cosine-basis helpers. K() reads basis_alpha, basis_omega,
+    # basis_phi and Gi, none of which is assigned in this class; they have to be
+    # provided by subclasses.
+    def __init__(self, input_dim, variance, lengthscale, period, n_freq, lower, upper, active_dims, name):
+        """
+        :type input_dim: int
+        :param variance: the variance of the Matern kernel
+        :type variance: float
+        :param lengthscale: the lengthscale of the Matern kernel
+        :type lengthscale: np.ndarray of size (input_dim,)
+        :param period: the period
+        :type period: float
+        :param n_freq: the number of frequencies considered for the periodic subspace
+        :type n_freq: int
+        :param lower: lower limit used in the integral computations
+        :param upper: upper limit used in the integral computations
+        :rtype: kernel object
+        """
+
+        assert input_dim==1, "Periodic kernels are only defined for input_dim=1"
+        super(Periodic, self).__init__(input_dim, active_dims, name)
+        self.input_dim = input_dim
+        self.lower,self.upper = lower, upper
+        self.n_freq = n_freq
+        # two basis functions per frequency
+        self.n_basis = 2*n_freq
+        self.variance = Param('variance', np.float64(variance), Logexp())
+        self.lengthscale = Param('lengthscale', np.float64(lengthscale), Logexp())
+        self.period = Param('period', np.float64(period), Logexp())
+        self.link_parameters(self.variance, self.lengthscale, self.period)
+
+    def _cos(self, alpha, omega, phase):
+        # Returns the function x -> alpha*cos(omega*x + phase).
+        def f(x):
+            return alpha*np.cos(omega*x + phase)
+        return f
+
+    @silence_errors
+    def _cos_factorization(self, alpha, omega, phase):
+        # Row-wise: r1 = sum(alpha*cos(phase)), r2 = sum(alpha*sin(phase)).
+        # Returns the amplitude r = sqrt(r1^2 + r2^2), the first column of omega
+        # and the phase psi. Presumably this merges, per row, a sum of cosines
+        # sharing one frequency into a single cosine -- TODO confirm.
+        r1 = np.sum(alpha*np.cos(phase),axis=1)[:,None]
+        r2 = np.sum(alpha*np.sin(phase),axis=1)[:,None]
+        r =  np.sqrt(r1**2 + r2**2)
+        # arctan(r2/r1), shifted by pi where r1 < 0; where r1 == 0 arcsin(r2) is used instead
+        psi = np.where(r1 != 0, (np.arctan(r2/r1) + (r1<0.)*np.pi),np.arcsin(r2))
+        return r,omega[:,0:1], psi
+
+    @silence_errors
+    def _int_computation(self,r1,omega1,phi1,r2,omega2,phi2):
+        # Gint1 is the general expression; it contains 1/(omega1-omega2), which is
+        # not finite for equal frequencies. Gint2 replaces that term by
+        # cos(phi1-phi2)*(upper-lower) and is used wherever Gint1 is NaN.
+        # NOTE(review): looks like the integral over [lower, upper] of the
+        # product of two cosines -- confirm.
+        Gint1 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + 1./(omega1-omega2.T)*( np.sin((omega1-omega2.T)*self.upper+phi1-phi2.T) - np.sin((omega1-omega2.T)*self.lower+phi1-phi2.T) )
+        Gint2 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) +  np.cos(phi1-phi2.T)*(self.upper-self.lower)
+        Gint = np.dot(r1,r2.T)/2 * np.where(np.isnan(Gint1),Gint2,Gint1)
+        return Gint
+
+    def K(self, X, X2=None):
+        # K = F(X) Gi F(X2)^T, with F the basis functions evaluated at the inputs
+        FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
+        if X2 is None:
+            FX2 = FX
+        else:
+            FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
+        return mdot(FX,self.Gi,FX2.T)
+
+    def Kdiag(self,X):
+        # Builds the full K(X) and keeps only its diagonal.
+        return np.diag(self.K(X))
+
+
+
+
+class PeriodicExponential(Periodic):
+    """
+    Kernel of the periodic subspace (up to a given frequency) of a exponential
+    (Matern 1/2) RKHS.
+
+    Only defined for input_dim=1.
+    """
+
+    def __init__(self, input_dim=1, variance=1., lengthscale=1., period=2.*np.pi, n_freq=10, lower=0., upper=4*np.pi, active_dims=None, name='periodic_exponential'):
+        super(PeriodicExponential, self).__init__(input_dim, variance, lengthscale, period, n_freq, lower, upper, active_dims, name)
+
+    def parameters_changed(self):
+        # Rebuilds everything derived from the parameters: the coefficients a and b,
+        # the basis (alpha, omega, phi), the Gram matrix G and its inverse Gi.
+        # Presumably invoked by the parameterization framework after a parameter
+        # update -- TODO confirm.
+        self.a = [1./self.lengthscale, 1.]
+        self.b = [1]
+
+        self.basis_alpha = np.ones((self.n_basis,))
+        # each frequency 2*pi*k/period, k = 1..n_freq, appears twice in a row
+        self.basis_omega = (2*np.pi*np.arange(1,self.n_freq+1)/self.period).repeat(2)
+        self.basis_phi =   np.zeros(self.n_freq * 2)
+        # every other basis function gets phase -pi/2, the rest phase 0
+        self.basis_phi[::2] = -np.pi/2
+
+        self.G = self.Gram_matrix()
+        self.Gi = np.linalg.inv(self.G)
+
+    def Gram_matrix(self):
+        # Returns lengthscale/(2*variance) * Gint + 1/variance * Flower Flower^T,
+        # where Flower holds the basis functions evaluated at self.lower.
+        La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega))
+        Lo = np.column_stack((self.basis_omega,self.basis_omega))
+        Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2))
+        r,omega,phi =  self._cos_factorization(La,Lo,Lp)
+        Gint = self._int_computation( r,omega,phi, r,omega,phi)
+        Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
+        return(self.lengthscale/(2*self.variance) * Gint + 1./self.variance*np.dot(Flower,Flower.T))
+
+    @silence_errors
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        """Set the gradients of variance, lengthscale and period, given dL_dK.
+
+        The derivatives of K with respect to each parameter are multiplied
+        elementwise with dL_dK and summed; nothing is returned.
+        """
+        if X2 is None: X2 = X
+        FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
+        FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
+
+        # same quantities as in Gram_matrix
+        La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega))
+        Lo = np.column_stack((self.basis_omega,self.basis_omega))
+        Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2))
+        r,omega,phi =  self._cos_factorization(La,Lo,Lp)
+        Gint = self._int_computation( r,omega,phi, r,omega,phi)
+
+        Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
+
+        #dK_dvar
+        dK_dvar = 1./self.variance*mdot(FX,self.Gi,FX2.T)
+
+        #dK_dlen
+        # derivative of self.a with respect to the lengthscale
+        da_dlen = [-1./self.lengthscale**2,0.]
+        dLa_dlen =  np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)),da_dlen[1]*self.basis_omega))
+        r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
+        dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
+        dGint_dlen = dGint_dlen + dGint_dlen.T
+        dG_dlen = 1./2*Gint + self.lengthscale/2*dGint_dlen
+        # derivative of F Gi F2^T through Gi: -F Gi dG Gi F2^T
+        dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX2.T)
+
+        #dK_dper
+        dFX_dper  = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
+        dFX2_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X2,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X2)
+
+        dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period))
+        dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi))
+        r1,omega1,phi1 =  self._cos_factorization(dLa_dper,Lo,dLp_dper)
+
+        # IPPprim1 contains 1/(omega-omega1.T); where that yields NaN or inf the
+        # alternative IPPprim2 is selected below.
+        # NOTE(review): IPP presumably stands for integration by parts -- confirm.
+        IPPprim1 =  self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2)  +  1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
+        IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2)  +  1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
+        # SIMPLIFY!!!       IPPprim1 = (self.upper - self.lower)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2)  +  1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
+        IPPprim2 =  self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2)  + self.upper*np.cos(phi-phi1.T))
+        IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2)  + self.lower*np.cos(phi-phi1.T))
+        IPPprim = np.where(np.logical_or(np.isnan(IPPprim1), np.isinf(IPPprim1)), IPPprim2, IPPprim1)
+
+
+        # same pattern as above: IPPint2 is used wherever IPPint1 is NaN
+        IPPint1 =  1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi)  +  1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
+        IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi)  +  1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
+        IPPint2 =  1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi)  + 1./2*self.upper**2*np.cos(phi-phi1.T)
+        IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi)  + 1./2*self.lower**2*np.cos(phi-phi1.T)
+        #IPPint2[0,0] = (self.upper**2 - self.lower**2)*np.cos(phi[0,0])*np.cos(phi1[0,0])
+        IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
+
+        # NOTE(review): the doubled parentheses below do not create a tuple, so
+        # np.column_stack receives a bare 1-D array; the results are transposed
+        # two lines further down -- confirm the intended shapes.
+        dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period))
+        dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2))
+        r2,omega2,phi2 = dLa_dper2.T,Lo[:,0:1],dLp_dper2.T
+
+        dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) + self._int_computation(r2,omega2,phi2, r,omega,phi)
+        dGint_dper = dGint_dper + dGint_dper.T
+
+        dFlower_dper  = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
+
+        dG_dper = 1./self.variance*(self.lengthscale/2*dGint_dper + self.b[0]*(np.dot(dFlower_dper,Flower.T)+np.dot(Flower,dFlower_dper.T)))
+
+        # product rule over the three period-dependent factors F(X), Gi and F(X2)
+        dK_dper = mdot(dFX_dper,self.Gi,FX2.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX2.T) + mdot(FX,self.Gi,dFX2_dper.T)
+
+        self.variance.gradient = np.sum(dK_dvar*dL_dK)
+        self.lengthscale.gradient = np.sum(dK_dlen*dL_dK)
+        self.period.gradient = np.sum(dK_dper*dL_dK)
+
+
+
+class PeriodicMatern32(Periodic):
+    """
+    Kernel of the periodic subspace (up to a given frequency) of a Matern 3/2 RKHS. Only defined for input_dim=1.
+
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
+    :param variance: the variance of the Matern kernel
+    :type variance: float
+    :param lengthscale: the lengthscale of the Matern kernel
+    :type lengthscale: np.ndarray of size (input_dim,)
+    :param period: the period
+    :type period: float
+    :param n_freq: the number of frequencies considered for the periodic subspace
+    :type n_freq: int
+    :rtype: kernel object
+
+    """
+
+    def __init__(self, input_dim=1, variance=1., lengthscale=1., period=2.*np.pi, n_freq=10, lower=0., upper=4*np.pi, active_dims=None, name='periodic_Matern32'):
+        super(PeriodicMatern32, self).__init__(input_dim, variance, lengthscale, period, n_freq, lower, upper, active_dims, name)
+    def parameters_changed(self):
+        self.a = [3./self.lengthscale**2, 2*np.sqrt(3)/self.lengthscale, 1.]
+        self.b = [1,self.lengthscale**2/3]
+
+        self.basis_alpha = np.ones((self.n_basis,))
+        self.basis_omega = (2*np.pi*np.arange(1,self.n_freq+1)/self.period).repeat(2)
+        self.basis_phi =   np.zeros(self.n_freq * 2)
+        self.basis_phi[::2] = -np.pi/2
+
+        self.G = self.Gram_matrix()
+        self.Gi = np.linalg.inv(self.G)
+
+    def Gram_matrix(self):
+        La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega,self.a[2]*self.basis_omega**2))
+        Lo = np.column_stack((self.basis_omega,self.basis_omega,self.basis_omega))
+        Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2,self.basis_phi+np.pi))
+        r,omega,phi =  self._cos_factorization(La,Lo,Lp)
+        Gint = self._int_computation( r,omega,phi, r,omega,phi)
+
+        Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
+        F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
+        return(self.lengthscale**3/(12*np.sqrt(3)*self.variance) * Gint + 1./self.variance*np.dot(Flower,Flower.T) + self.lengthscale**2/(3.*self.variance)*np.dot(F1lower,F1lower.T))
+
+
+    @silence_errors
+    def update_gradients_full(self,dL_dK,X,X2):
+        """derivative of the covariance matrix with respect to the parameters (shape is num_data x num_inducing x num_params)"""
+        if X2 is None: X2 = X
+        FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
+        FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
+
+        La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega,self.a[2]*self.basis_omega**2))
+        Lo = np.column_stack((self.basis_omega,self.basis_omega,self.basis_omega))
+        Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2,self.basis_phi+np.pi))
+        r,omega,phi =  self._cos_factorization(La,Lo,Lp)
+        Gint = self._int_computation( r,omega,phi, r,omega,phi)
+
+        Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
+        F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
+
+        #dK_dvar
+        dK_dvar = 1./self.variance*mdot(FX,self.Gi,FX2.T)
+
+        #dK_dlen
+        da_dlen = [-6/self.lengthscale**3,-2*np.sqrt(3)/self.lengthscale**2,0.]
+        db_dlen = [0.,2*self.lengthscale/3.]
+        dLa_dlen =  np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)),da_dlen[1]*self.basis_omega,da_dlen[2]*self.basis_omega**2))
+        r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
+        dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
+        dGint_dlen = dGint_dlen + dGint_dlen.T
+        dG_dlen = self.lengthscale**2/(4*np.sqrt(3))*Gint + self.lengthscale**3/(12*np.sqrt(3))*dGint_dlen + db_dlen[0]*np.dot(Flower,Flower.T) + db_dlen[1]*np.dot(F1lower,F1lower.T)
+        dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX2.T)
+
+        #dK_dper
+        dFX_dper  = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
+        dFX2_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X2,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X2)
+
+        dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period, -self.a[2]*self.basis_omega**3/self.period))
+        dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi,self.basis_phi+np.pi*3/2))
+        r1,omega1,phi1 =  self._cos_factorization(dLa_dper,Lo,dLp_dper)
+
+        IPPprim1 =  self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2)  +  1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
+        IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2)  +  1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
+        IPPprim2 =  self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2)  + self.upper*np.cos(phi-phi1.T))
+        IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2)  + self.lower*np.cos(phi-phi1.T))
+        IPPprim = np.where(np.isnan(IPPprim1),IPPprim2,IPPprim1)
+
+        IPPint1 =  1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi)  +  1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
+        IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi)  +  1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
+        IPPint2 =  1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi)  + 1./2*self.upper**2*np.cos(phi-phi1.T)
+        IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi)  + 1./2*self.lower**2*np.cos(phi-phi1.T)
+        IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
+
+        dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period, -2*self.a[2]*self.basis_omega**2/self.period))
+        dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi))
+        r2,omega2,phi2 =  self._cos_factorization(dLa_dper2,Lo[:,0:2],dLp_dper2)
+
+        dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) +  self._int_computation(r2,omega2,phi2, r,omega,phi)
+        dGint_dper = dGint_dper + dGint_dper.T
+
+        dFlower_dper  = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
+        dF1lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower)+self._cos(-self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
+
+        dG_dper = 1./self.variance*(self.lengthscale**3/(12*np.sqrt(3))*dGint_dper + self.b[0]*(np.dot(dFlower_dper,Flower.T)+np.dot(Flower,dFlower_dper.T)) + self.b[1]*(np.dot(dF1lower_dper,F1lower.T)+np.dot(F1lower,dF1lower_dper.T)))
+
+        dK_dper = mdot(dFX_dper,self.Gi,FX2.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX2.T) + mdot(FX,self.Gi,dFX2_dper.T)
+
+        self.variance.gradient = np.sum(dK_dvar*dL_dK)
+        self.lengthscale.gradient = np.sum(dK_dlen*dL_dK)
+        self.period.gradient = np.sum(dK_dper*dL_dK)
+
+
+
+class PeriodicMatern52(Periodic):
+    """
+    Kernel of the periodic subspace (up to a given frequency) of a Matern 5/2 RKHS. Only defined for input_dim=1.
+
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
+    :param variance: the variance of the Matern kernel
+    :type variance: float
+    :param lengthscale: the lengthscale of the Matern kernel
+    :type lengthscale: np.ndarray of size (input_dim,)
+    :param period: the period
+    :type period: float
+    :param n_freq: the number of frequencies considered for the periodic subspace
+    :type n_freq: int
+    :rtype: kernel object
+
+    """
+
+    def __init__(self, input_dim=1, variance=1., lengthscale=1., period=2.*np.pi, n_freq=10, lower=0., upper=4*np.pi, active_dims=None, name='periodic_Matern52'):
+        super(PeriodicMatern52, self).__init__(input_dim, variance, lengthscale, period, n_freq, lower, upper, active_dims, name)
+
+    def parameters_changed(self):
+        self.a = [5*np.sqrt(5)/self.lengthscale**3, 15./self.lengthscale**2,3*np.sqrt(5)/self.lengthscale, 1.]
+        self.b  = [9./8, 9*self.lengthscale**4/200., 3*self.lengthscale**2/5., 3*self.lengthscale**2/(5*8.), 3*self.lengthscale**2/(5*8.)]
+
+        self.basis_alpha = np.ones((2*self.n_freq,))
+        self.basis_omega = (2*np.pi*np.arange(1,self.n_freq+1)/self.period).repeat(2)
+        self.basis_phi =   np.zeros(self.n_freq * 2)
+        self.basis_phi[::2] = -np.pi/2
+
+        self.G = self.Gram_matrix()
+        self.Gi = np.linalg.inv(self.G)
+
+    def Gram_matrix(self):
+        La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)), self.a[1]*self.basis_omega, self.a[2]*self.basis_omega**2, self.a[3]*self.basis_omega**3))
+        Lo = np.column_stack((self.basis_omega, self.basis_omega, self.basis_omega, self.basis_omega))
+        Lp = np.column_stack((self.basis_phi, self.basis_phi+np.pi/2, self.basis_phi+np.pi, self.basis_phi+np.pi*3/2))
+        r,omega,phi =  self._cos_factorization(La,Lo,Lp)
+        Gint = self._int_computation( r,omega,phi, r,omega,phi)
+
+        Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
+        F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
+        F2lower = np.array(self._cos(self.basis_alpha*self.basis_omega**2,self.basis_omega,self.basis_phi+np.pi)(self.lower))[:,None]
+        lower_terms = self.b[0]*np.dot(Flower,Flower.T) + self.b[1]*np.dot(F2lower,F2lower.T) + self.b[2]*np.dot(F1lower,F1lower.T) + self.b[3]*np.dot(F2lower,Flower.T) + self.b[4]*np.dot(Flower,F2lower.T)
+        return(3*self.lengthscale**5/(400*np.sqrt(5)*self.variance) * Gint + 1./self.variance*lower_terms)
+
+    @silence_errors
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        if X2 is None: X2 = X
+        FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
+        FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
+
+        La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)), self.a[1]*self.basis_omega, self.a[2]*self.basis_omega**2, self.a[3]*self.basis_omega**3))
+        Lo = np.column_stack((self.basis_omega, self.basis_omega, self.basis_omega, self.basis_omega))
+        Lp = np.column_stack((self.basis_phi, self.basis_phi+np.pi/2, self.basis_phi+np.pi, self.basis_phi+np.pi*3/2))
+        r,omega,phi =  self._cos_factorization(La,Lo,Lp)
+        Gint = self._int_computation( r,omega,phi, r,omega,phi)
+
+        Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
+        F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
+        F2lower = np.array(self._cos(self.basis_alpha*self.basis_omega**2,self.basis_omega,self.basis_phi+np.pi)(self.lower))[:,None]
+
+        #dK_dvar
+        dK_dvar = 1./self.variance*mdot(FX,self.Gi,FX2.T)
+
+        #dK_dlen
+        da_dlen = [-3*self.a[0]/self.lengthscale, -2*self.a[1]/self.lengthscale, -self.a[2]/self.lengthscale, 0.]
+        db_dlen = [0., 4*self.b[1]/self.lengthscale, 2*self.b[2]/self.lengthscale, 2*self.b[3]/self.lengthscale, 2*self.b[4]/self.lengthscale]
+        dLa_dlen =  np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)), da_dlen[1]*self.basis_omega, da_dlen[2]*self.basis_omega**2, da_dlen[3]*self.basis_omega**3))
+        r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
+        dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
+        dGint_dlen = dGint_dlen + dGint_dlen.T
+        dlower_terms_dlen = db_dlen[0]*np.dot(Flower,Flower.T) + db_dlen[1]*np.dot(F2lower,F2lower.T) + db_dlen[2]*np.dot(F1lower,F1lower.T) + db_dlen[3]*np.dot(F2lower,Flower.T) + db_dlen[4]*np.dot(Flower,F2lower.T)
+        dG_dlen = 15*self.lengthscale**4/(400*np.sqrt(5))*Gint + 3*self.lengthscale**5/(400*np.sqrt(5))*dGint_dlen + dlower_terms_dlen
+        dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX2.T)
+
+        #dK_dper
+        dFX_dper  = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
+        dFX2_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X2,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X2)
+
+        dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period, -self.a[2]*self.basis_omega**3/self.period, -self.a[3]*self.basis_omega**4/self.period))
+        dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi,self.basis_phi+np.pi*3/2,self.basis_phi))
+        r1,omega1,phi1 =  self._cos_factorization(dLa_dper,Lo,dLp_dper)
+
+        IPPprim1 =  self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2)  +  1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
+        IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2)  +  1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
+        IPPprim2 =  self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2)  + self.upper*np.cos(phi-phi1.T))
+        IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2)  + self.lower*np.cos(phi-phi1.T))
+        IPPprim = np.where(np.isnan(IPPprim1),IPPprim2,IPPprim1)
+
+        IPPint1 =  1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi)  +  1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
+        IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi)  +  1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
+        IPPint2 =  1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi)  + 1./2*self.upper**2*np.cos(phi-phi1.T)
+        IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi)  + 1./2*self.lower**2*np.cos(phi-phi1.T)
+        IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
+
+        dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period, -2*self.a[2]*self.basis_omega**2/self.period, -3*self.a[3]*self.basis_omega**3/self.period))
+        dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2, self.basis_phi+np.pi, self.basis_phi+np.pi*3/2))
+        r2,omega2,phi2 =  self._cos_factorization(dLa_dper2,Lo[:,0:2],dLp_dper2)
+
+        dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) +  self._int_computation(r2,omega2,phi2, r,omega,phi)
+        dGint_dper = dGint_dper + dGint_dper.T
+
+        dFlower_dper  = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
+        dF1lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower)+self._cos(-self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
+        dF2lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**3/self.period,self.basis_omega,self.basis_phi+np.pi*3/2)(self.lower) + self._cos(-2*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower))[:,None]
+
+        dlower_terms_dper  = self.b[0] * (np.dot(dFlower_dper,Flower.T) + np.dot(Flower.T,dFlower_dper))
+        dlower_terms_dper += self.b[1] * (np.dot(dF2lower_dper,F2lower.T) + np.dot(F2lower,dF2lower_dper.T)) - 4*self.b[1]/self.period*np.dot(F2lower,F2lower.T)
+        dlower_terms_dper += self.b[2] * (np.dot(dF1lower_dper,F1lower.T) + np.dot(F1lower,dF1lower_dper.T)) - 2*self.b[2]/self.period*np.dot(F1lower,F1lower.T)
+        dlower_terms_dper += self.b[3] * (np.dot(dF2lower_dper,Flower.T) + np.dot(F2lower,dFlower_dper.T)) - 2*self.b[3]/self.period*np.dot(F2lower,Flower.T)
+        dlower_terms_dper += self.b[4] * (np.dot(dFlower_dper,F2lower.T) + np.dot(Flower,dF2lower_dper.T)) - 2*self.b[4]/self.period*np.dot(Flower,F2lower.T)
+
+        dG_dper = 1./self.variance*(3*self.lengthscale**5/(400*np.sqrt(5))*dGint_dper + 0.5*dlower_terms_dper)
+        dK_dper = mdot(dFX_dper,self.Gi,FX2.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX2.T) + mdot(FX,self.Gi,dFX2_dper.T)
+
+        self.variance.gradient = np.sum(dK_dvar*dL_dK)
+        self.lengthscale.gradient = np.sum(dK_dlen*dL_dK)
+        self.period.gradient = np.sum(dK_dper*dL_dK)
+
--- a/GPy/kern/_src/poly.py
+++ b/GPy/kern/_src/poly.py
@ -0,0 +1,41 @@
+# Copyright (c) 2014, James Hensman
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from kern import Kern
+from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp
+class Poly(Kern):
+    """
+    Polynomial kernel
+    """
+
+    def __init__(self, input_dim, variance=1., order=3., active_dims=None, name='poly'):
+        super(Poly, self).__init__(input_dim, active_dims, name)
+        self.variance = Param('variance', variance, Logexp())
+        self.link_parameter(self.variance)
+        self.order=order
+
+    def K(self, X, X2=None):
+        return (self._dot_product(X, X2) + 1.)**self.order * self.variance
+
+    def _dot_product(self, X, X2=None):
+        if X2 is None:
+            return np.dot(X, X.T)
+        else:
+            return np.dot(X, X2.T)
+
+    def Kdiag(self, X):
+        return self.variance*(np.square(X).sum(1) + 1.)**self.order
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        self.variance.gradient = np.sum(dL_dK * (self._dot_product(X, X2) + 1.)**self.order)
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        raise NotImplementedError
+
+    def gradients_X(self, dL_dK, X, X2=None):
+        raise NotImplementedError
+
+    def gradients_X_diag(self, dL_dKdiag, X):
+        raise NotImplementedError
--- a/GPy/kern/_src/prod.py
+++ b/GPy/kern/_src/prod.py
@ -0,0 +1,66 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from kern import CombinationKernel
+from ...util.caching import Cache_this
+import itertools
+
+class Prod(CombinationKernel):
+    """
+    Computes the product of 2 kernels
+
+    :param k1, k2: the kernels to multiply
+    :type k1, k2: Kern
+    :param tensor: The kernels are either multiply as functions defined on the same input space (default) or on the product of the input spaces
+    :type tensor: Boolean
+    :rtype: kernel object
+
+    """
+    def __init__(self, kernels, name='mul'):
+        for i, kern in enumerate(kernels[:]):
+            if isinstance(kern, Prod):
+                del kernels[i]
+                for part in kern.parts[::-1]:
+                    kern.unlink_parameter(part)
+                    kernels.insert(i, part)
+        super(Prod, self).__init__(kernels, name)
+
+    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    def K(self, X, X2=None, which_parts=None):
+        if which_parts is None:
+            which_parts = self.parts
+        elif not isinstance(which_parts, (list, tuple)):
+            # if only one part is given
+            which_parts = [which_parts]
+        return reduce(np.multiply, (p.K(X, X2) for p in which_parts))
+
+    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    def Kdiag(self, X, which_parts=None):
+        if which_parts is None:
+            which_parts = self.parts
+        return reduce(np.multiply, (p.Kdiag(X) for p in which_parts))
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        k = self.K(X,X2)*dL_dK
+        for p in self.parts:
+            p.update_gradients_full(k/p.K(X,X2),X,X2)
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        k = self.Kdiag(X)*dL_dKdiag
+        for p in self.parts:
+            p.update_gradients_diag(k/p.Kdiag(X),X)
+
+    def gradients_X(self, dL_dK, X, X2=None):
+        target = np.zeros(X.shape)
+        k = self.K(X,X2)*dL_dK
+        for p in self.parts:
+            target += p.gradients_X(k/p.K(X,X2),X,X2)
+        return target
+
+    def gradients_X_diag(self, dL_dKdiag, X):
+        target = np.zeros(X.shape)
+        k = self.Kdiag(X)*dL_dKdiag
+        for p in self.parts:
+            target += p.gradients_X_diag(k/p.Kdiag(X),X)
+        return target
--- a/GPy/kern/_src/psi_comp/init.py
+++ b/GPy/kern/_src/psi_comp/init.py
@ -0,0 +1,55 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from ....core.parameterization.parameter_core import Pickleable
+from GPy.util.caching import Cache_this
+from ....core.parameterization import variational
+import rbf_psi_comp
+import ssrbf_psi_comp
+import sslinear_psi_comp
+import linear_psi_comp
+
+class PSICOMP_RBF(Pickleable):
+    @Cache_this(limit=2, ignore_args=(0,))
+    def psicomputations(self, variance, lengthscale, Z, variational_posterior):
+        if isinstance(variational_posterior, variational.NormalPosterior):
+            return rbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior)
+        elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
+            return ssrbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior)
+        else:
+            raise ValueError, "unknown distriubtion received for psi-statistics"
+
+    @Cache_this(limit=2, ignore_args=(0,1,2,3))
+    def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
+        if isinstance(variational_posterior, variational.NormalPosterior):
+            return rbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior)
+        elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
+            return ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior)
+        else:
+            raise ValueError, "unknown distriubtion received for psi-statistics"
+
+    def _setup_observers(self):
+        pass
+
+class PSICOMP_Linear(Pickleable):
+
+    @Cache_this(limit=2, ignore_args=(0,))
+    def psicomputations(self, variance, Z, variational_posterior):
+        if isinstance(variational_posterior, variational.NormalPosterior):
+            return linear_psi_comp.psicomputations(variance, Z, variational_posterior)
+        elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
+            return sslinear_psi_comp.psicomputations(variance, Z, variational_posterior)
+        else:
+            raise ValueError, "unknown distriubtion received for psi-statistics"
+
+    @Cache_this(limit=2, ignore_args=(0,1,2,3))
+    def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior):
+        if isinstance(variational_posterior, variational.NormalPosterior):
+            return linear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior)
+        elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
+            return sslinear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior)
+        else:
+            raise ValueError, "unknown distriubtion received for psi-statistics"
+
+    def _setup_observers(self):
+        pass
--- a/GPy/kern/_src/psi_comp/linear_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/linear_psi_comp.py
@ -0,0 +1,77 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+"""
+The package for the Psi statistics computation of the linear kernel for Bayesian GPLVM
+"""
+
+import numpy as np
+from ....util.linalg import tdot
+
+def psicomputations(variance, Z, variational_posterior):
+    """
+    Compute psi-statistics for ss-linear kernel
+    """
+    # here are the "statistics" for psi0, psi1 and psi2
+    # Produced intermediate results:
+    # psi0    N
+    # psi1    NxM
+    # psi2    MxM
+    mu = variational_posterior.mean
+    S = variational_posterior.variance
+
+    psi0 = (variance*(np.square(mu)+S)).sum(axis=1)
+    psi1 = np.dot(mu,(variance*Z).T)
+    psi2 = np.dot(S.sum(axis=0)*np.square(variance)*Z,Z.T)+ tdot(psi1.T)
+
+    return psi0, psi1, psi2
+
+def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior):
+    mu = variational_posterior.mean
+    S = variational_posterior.variance
+
+    dL_dvar, dL_dmu, dL_dS, dL_dZ = _psi2computations(dL_dpsi2, variance, Z, mu, S)
+
+    # Compute for psi0 and psi1
+    mu2S = np.square(mu)+S
+    dL_dpsi0_var = dL_dpsi0[:,None]*variance[None,:]
+    dL_dpsi1_mu = np.dot(dL_dpsi1.T,mu)
+    dL_dvar += (dL_dpsi0[:,None]*mu2S).sum(axis=0)+ (dL_dpsi1_mu*Z).sum(axis=0)
+    dL_dmu += 2.*dL_dpsi0_var*mu+np.dot(dL_dpsi1,Z)*variance
+    dL_dS += dL_dpsi0_var
+    dL_dZ += dL_dpsi1_mu*variance
+    
+    return dL_dvar, dL_dZ, dL_dmu, dL_dS
+
+def _psi2computations(dL_dpsi2, variance, Z, mu, S):
+    """
+    Gradients of the objective through psi2 of the linear kernel.
+
+    Z - MxQ
+    mu - NxQ
+    S - NxQ
+
+    Returns the tuple (dL_dvar, dL_dmu, dL_dS, dL_dZ).
+    """
+    # here are the derivatives through psi2
+    # Produced intermediate results:
+    # _psi2_dvariance      Q
+    # _psi2_dZ             MxQ
+    # _psi2_dmu            NxQ
+    # _psi2_dS             NxQ
+    
+    variance2 = np.square(variance)
+    # same expression as psi1 in psicomputations
+    common_sum = np.dot(mu,(variance*Z).T)
+    Z_expect = (np.dot(dL_dpsi2,Z)*Z).sum(axis=0)
+    # dL_dpsi2 plus its transpose, used by all the remaining terms
+    dL_dpsi2T = dL_dpsi2+dL_dpsi2.T
+    common_expect = np.dot(common_sum,np.dot(dL_dpsi2T,Z))
+    Z2_expect = np.inner(common_sum,dL_dpsi2T)
+    Z1_expect = np.dot(dL_dpsi2T,Z)
+
+    dL_dvar = 2.*S.sum(axis=0)*variance*Z_expect+(common_expect*mu).sum(axis=0)
+            
+    dL_dmu = common_expect*variance
+    
+    # the same row (Z_expect*variance2) is broadcast into every row of dL_dS
+    dL_dS = np.empty(S.shape)
+    dL_dS[:] = Z_expect*variance2
+    
+    dL_dZ = variance2*S.sum(axis=0)*Z1_expect+np.dot(Z2_expect.T,variance*mu)
+
+    return dL_dvar, dL_dmu, dL_dS, dL_dZ
--- a/GPy/kern/_src/psi_comp/rbf_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/rbf_psi_comp.py
@ -0,0 +1,161 @@
+"""
+The module for psi-statistics for RBF kernel
+"""
+
+import numpy as np
+from GPy.util.caching import Cacher
+
+def psicomputations(variance, lengthscale, Z, variational_posterior):
+    """
+    Z - MxQ
+    mu - NxQ
+    S - NxQ
+    gamma - NxQ
+    """
+    # here are the "statistics" for psi0, psi1 and psi2
+    # Produced intermediate results:
+    # _psi1                NxM
+    mu = variational_posterior.mean
+    S = variational_posterior.variance
+
+    psi0 = np.empty(mu.shape[0])
+    psi0[:] = variance
+    psi1 = _psi1computations(variance, lengthscale, Z, mu, S)
+    psi2 = _psi2computations(variance, lengthscale, Z, mu, S).sum(axis=0)
+    return psi0, psi1, psi2
+
+def __psi1computations(variance, lengthscale, Z, mu, S):
+    """
+    Compute psi1 of the RBF kernel.
+
+    Z - MxQ
+    mu - NxQ
+    S - NxQ
+
+    Returns _psi1 (NxM).
+    """
+    # here are the "statistics" for psi1
+    # Produced intermediate results:
+    # _psi1                NxM
+
+    lengthscale2 = np.square(lengthscale)
+
+    # psi1
+    # work in log space: log-normaliser summed over the last axis ...
+    _psi1_logdenom = np.log(S/lengthscale2+1.).sum(axis=-1) # N
+    # ... plus the squared distances (mu - Z)**2 weighted by 1/(S + lengthscale**2), all times -1/2
+    _psi1_log = (_psi1_logdenom[:,None]+np.einsum('nmq,nq->nm',np.square(mu[:,None,:]-Z[None,:,:]),1./(S+lengthscale2)))/(-2.)
+    _psi1 = variance*np.exp(_psi1_log)
+
+    return _psi1
+
+def __psi2computations(variance, lengthscale, Z, mu, S):
+    """
+    Compute the per-datapoint psi2 of the RBF kernel.
+
+    Z - MxQ
+    mu - NxQ
+    S - NxQ
+
+    Returns _psi2 with one MxM slice per datapoint (NxMxM); the caller sums
+    over the first axis to obtain the MxM psi2.
+    """
+    # here are the "statistics" for psi2
+    # Produced intermediate results:
+    # _psi2                NxMxM
+
+    lengthscale2 = np.square(lengthscale)
+
+    # the exponent is assembled from three parts and exponentiated once at the end
+    _psi2_logdenom = np.log(2.*S/lengthscale2+1.).sum(axis=-1)/(-2.) # N
+    _psi2_exp1 = (np.square(Z[:,None,:]-Z[None,:,:])/lengthscale2).sum(axis=-1)/(-4.) #MxM
+    # midpoints of every pair of rows of Z
+    Z_hat = (Z[:,None,:]+Z[None,:,:])/2. #MxMxQ
+    denom = 1./(2.*S+lengthscale2)
+    # expansion of -(mu - Z_hat)**2 * denom summed over q: the mu**2, cross and Z_hat**2 terms
+    _psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+2.*np.einsum('nq,moq,nq->nmo',mu,Z_hat,denom)-np.einsum('moq,nq->nmo',np.square(Z_hat),denom)
+    _psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2)
+
+
+    return _psi2
+
+def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
+    ARD = (len(lengthscale)!=1)
+
+    dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance)
+    dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance)
+
+    dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
+
+    dL_dlengscale = dl_psi1 + dl_psi2
+    if not ARD:
+        dL_dlengscale = dL_dlengscale.sum()
+
+    dL_dmu = dmu_psi1 + dmu_psi2
+    dL_dS = dS_psi1 + dS_psi2
+    dL_dZ = dZ_psi1 + dZ_psi2
+
+    return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS
+
+def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S):
+    """
+    Gradients of the objective through psi1 of the RBF kernel.
+
+    dL_dpsi1 - NxM
+    Z - MxQ
+    mu - NxQ
+    S - NxQ
+
+    Returns the tuple (_dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS).
+    """
+    # here are the "statistics" for psi1
+    # Produced intermediate results: dL_dparams w.r.t. psi1
+    # _dL_dvariance     1
+    # _dL_dlengthscale  Q
+    # _dL_dZ            MxQ
+    # _dL_dmu           NxQ
+    # _dL_dS            NxQ
+
+    lengthscale2 = np.square(lengthscale)
+
+    # psi1 itself is needed because every derivative below is proportional to it
+    _psi1 = _psi1computations(variance, lengthscale, Z, mu, S)
+    Lpsi1 = dL_dpsi1*_psi1
+    Zmu = Z[None,:,:]-mu[:,None,:] # NxMxQ
+    denom = 1./(S+lengthscale2)
+    Zmu2_denom = np.square(Zmu)*denom[:,None,:] #NxMxQ
+    # psi1 is linear in variance
+    _dL_dvar = Lpsi1.sum()/variance
+    _dL_dmu = np.einsum('nm,nmq,nq->nq',Lpsi1,Zmu,denom)
+    _dL_dS = np.einsum('nm,nmq,nq->nq',Lpsi1,(Zmu2_denom-1.),denom)/2.
+    # same summand as _dL_dmu with opposite sign, summed over n instead of m
+    _dL_dZ = -np.einsum('nm,nmq,nq->mq',Lpsi1,Zmu,denom)
+    _dL_dl = np.einsum('nm,nmq,nq->q',Lpsi1,(Zmu2_denom+(S/lengthscale2)[:,None,:]),denom*lengthscale)
+
+    return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
+
+def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
+    """
+    Gradients of the objective through psi2 of the RBF kernel.
+
+    Z - MxQ
+    mu - NxQ
+    S - NxQ
+    dL_dpsi2 - MxM
+
+    Returns the tuple (_dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS).
+    """
+    # here are the "statistics" for psi2
+    # Produced the derivatives w.r.t. psi2:
+    # _dL_dvariance      1
+    # _dL_dlengthscale   Q
+    # _dL_dZ             MxQ
+    # _dL_dmu            NxQ
+    # _dL_dS             NxQ
+
+    lengthscale2 = np.square(lengthscale)
+    denom = 1./(2*S+lengthscale2)
+    denom2 = np.square(denom)
+
+    _psi2 = _psi2computations(variance, lengthscale, Z, mu, S) # NxMxM
+    Lpsi2 = dL_dpsi2*_psi2 # dL_dpsi2 is MxM, using broadcast to multiply N out
+    # contractions of Lpsi2 with Z that are shared by the gradients below
+    Lpsi2sum = np.einsum('nmo->n',Lpsi2) #N
+    Lpsi2Z = np.einsum('nmo,oq->nq',Lpsi2,Z) #NxQ
+    Lpsi2Z2 = np.einsum('nmo,oq,oq->nq',Lpsi2,Z,Z) #NxQ
+    Lpsi2Z2p = np.einsum('nmo,mq,oq->nq',Lpsi2,Z,Z) #NxQ
+    # NOTE(review): the Zhat names presumably refer to the pairwise midpoints of Z used in __psi2computations - confirm
+    Lpsi2Zhat = Lpsi2Z
+    Lpsi2Zhat2 = (Lpsi2Z2+Lpsi2Z2p)/2
+
+    # psi2 is proportional to variance**2, hence the factor 2/variance
+    _dL_dvar = Lpsi2sum.sum()*2/variance
+    _dL_dmu = (-2*denom) * (mu*Lpsi2sum[:,None]-Lpsi2Zhat)
+    _dL_dS = (2*np.square(denom))*(np.square(mu)*Lpsi2sum[:,None]-2*mu*Lpsi2Zhat+Lpsi2Zhat2) - denom*Lpsi2sum[:,None]
+    _dL_dZ = -np.einsum('nmo,oq->oq',Lpsi2,Z)/lengthscale2+np.einsum('nmo,oq->mq',Lpsi2,Z)/lengthscale2+ \
+             2*np.einsum('nmo,nq,nq->mq',Lpsi2,mu,denom) - np.einsum('nmo,nq,mq->mq',Lpsi2,denom,Z) - np.einsum('nmo,oq,nq->mq',Lpsi2,Z,denom)
+    _dL_dl = 2*lengthscale* ((S/lengthscale2*denom+np.square(mu*denom))*Lpsi2sum[:,None]+(Lpsi2Z2-Lpsi2Z2p)/(2*np.square(lengthscale2))-
+                             (2*mu*denom2)*Lpsi2Zhat+denom2*Lpsi2Zhat2).sum(axis=0)
+
+    return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
+
+# Cacher-wrapped versions of the raw psi1/psi2 routines; psicomputations,
+# _psi1compDer and _psi2compDer call these names rather than the raw functions.
+# NOTE(review): limit=1 presumably keeps only the most recent result - confirm against GPy.util.caching.Cacher
+_psi1computations = Cacher(__psi1computations, limit=1)
+_psi2computations = Cacher(__psi2computations, limit=1)
--- a/GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py
+++ b/GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py
@ -0,0 +1,411 @@
+"""
+The module for psi-statistics for RBF kernel
+"""
+
+import numpy as np
+from ....util.caching import Cache_this
+from . import PSICOMP_RBF
+from ....util import gpu_init
+
+try:
+    import pycuda.gpuarray as gpuarray
+    from pycuda.compiler import SourceModule
+    from ....util.linalg_gpu import sum_axis
+except:
+    pass    
+
+gpu_code = """
+    // define THREADNUM
+
+    #define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
+    #define IDX_NMM(n,m1,m2) ((m2*M+m1)*N+n)
+    #define IDX_NQ(n,q) (q*N+n)
+    #define IDX_NM(n,m) (m*N+n)
+    #define IDX_MQ(m,q) (q*M+m)
+    #define IDX_MM(m1,m2) (m2*M+m1)
+    #define IDX_NQB(n,q,b) ((b*Q+q)*N+n)
+    #define IDX_QB(q,b) (b*Q+q)
+
+    // Divide data evenly
+    __device__ void divide_data(int total_data, int psize, int pidx, int *start, int *end) {
+        int residue = (total_data)%psize;
+        if(pidx<residue) {
+            int size = total_data/psize+1;
+            *start = size*pidx;
+            *end = *start+size;
+        } else {
+            int size = total_data/psize;
+            *start = size*pidx+residue;
+            *end = *start+size;
+        }
+    }
+    
+    __device__ void reduce_sum(double* array, int array_size) {
+        int s;
+        if(array_size >= blockDim.x) {
+            for(int i=blockDim.x+threadIdx.x; i<array_size; i+= blockDim.x) {
+                array[threadIdx.x] += array[i];
+            }
+            array_size = blockDim.x;
+        }
+        __syncthreads();
+        for(int i=1; i<=array_size;i*=2) {s=i;}
+        if(threadIdx.x < array_size-s) {array[threadIdx.x] += array[s+threadIdx.x];}
+        __syncthreads();
+        for(s=s/2;s>=1;s=s/2) {
+            if(threadIdx.x < s) {array[threadIdx.x] += array[s+threadIdx.x];}
+            __syncthreads();
+        }
+    }
+
+    __global__ void compDenom(double *log_denom1, double *log_denom2, double *l, double *S, int N, int Q)
+    {
+        int n_start, n_end;
+        divide_data(N, gridDim.x, blockIdx.x, &n_start, &n_end);
+        
+        for(int i=n_start*Q+threadIdx.x; i<n_end*Q; i+=blockDim.x) {
+            int n=i/Q;
+            int q=i%Q;
+
+            double Snq = S[IDX_NQ(n,q)];
+            double lq = l[q]*l[q];
+            log_denom1[IDX_NQ(n,q)] = log(Snq/lq+1.);
+            log_denom2[IDX_NQ(n,q)] = log(2.*Snq/lq+1.);
+        }
+    }
+
+    __global__ void psi1computations(double *psi1, double *log_denom1, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
+    {
+        int m_start, m_end;
+        divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
+        
+        for(int m=m_start; m<m_end; m++) {
+            for(int n=threadIdx.x; n<N; n+= blockDim.x) {            
+                double log_psi1 = 0;
+                for(int q=0;q<Q;q++) {
+                    double muZ = mu[IDX_NQ(n,q)]-Z[IDX_MQ(m,q)];
+                    double Snq = S[IDX_NQ(n,q)];
+                    double lq = l[q]*l[q];
+                    log_psi1 += (muZ*muZ/(Snq+lq)+log_denom1[IDX_NQ(n,q)])/(-2.);
+                }
+                psi1[IDX_NM(n,m)] = var*exp(log_psi1);
+            }
+        }
+    }
+    
+    __global__ void psi2computations(double *psi2, double *psi2n, double *log_denom2, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
+    {
+        int psi2_idx_start, psi2_idx_end;
+        __shared__ double psi2_local[THREADNUM];
+        divide_data((M+1)*M/2, gridDim.x, blockIdx.x, &psi2_idx_start, &psi2_idx_end);
+        
+        for(int psi2_idx=psi2_idx_start; psi2_idx<psi2_idx_end; psi2_idx++) {
+            int m1 = int((sqrt(8.*psi2_idx+1.)-1.)/2.);
+            int m2 = psi2_idx - (m1+1)*m1/2;
+            
+            psi2_local[threadIdx.x] = 0;
+            for(int n=threadIdx.x;n<N;n+=blockDim.x) {
+                double log_psi2_n = 0;
+                for(int q=0;q<Q;q++) {
+                    double dZ = Z[IDX_MQ(m1,q)] - Z[IDX_MQ(m2,q)];
+                    double muZhat = mu[IDX_NQ(n,q)]- (Z[IDX_MQ(m1,q)]+Z[IDX_MQ(m2,q)])/2.;
+                    double Snq = S[IDX_NQ(n,q)];
+                    double lq = l[q]*l[q];
+                    log_psi2_n += dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) + log_denom2[IDX_NQ(n,q)]/(-2.);
+                }
+                double exp_psi2_n = exp(log_psi2_n);
+                psi2n[IDX_NMM(n,m1,m2)] = var*var*exp_psi2_n;
+                if(m1!=m2) { psi2n[IDX_NMM(n,m2,m1)] = var*var*exp_psi2_n;}
+                psi2_local[threadIdx.x] += exp_psi2_n;
+            }
+            __syncthreads();
+            reduce_sum(psi2_local, THREADNUM);
+            if(threadIdx.x==0) {
+                psi2[IDX_MM(m1,m2)] = var*var*psi2_local[0];
+                if(m1!=m2) { psi2[IDX_MM(m2,m1)] = var*var*psi2_local[0]; }
+            }
+            __syncthreads();
+        }
+    }
+    
+    __global__ void psi1compDer(double *dvar, double *dl, double *dZ, double *dmu, double *dS, double *dL_dpsi1, double *psi1, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
+    {
+        int m_start, m_end;
+        __shared__ double g_local[THREADNUM];
+        divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
+        int P = int(ceil(double(N)/THREADNUM));
+
+        double dvar_local = 0;
+        for(int q=0;q<Q;q++) {
+            double lq_sqrt = l[q];
+            double lq = lq_sqrt*lq_sqrt;
+            double dl_local = 0;
+            for(int p=0;p<P;p++) {
+                int n = p*THREADNUM + threadIdx.x;
+                double dmu_local = 0;
+                double dS_local = 0;
+                double Snq,mu_nq;
+                if(n<N) {Snq = S[IDX_NQ(n,q)]; mu_nq=mu[IDX_NQ(n,q)];}
+                for(int m=m_start; m<m_end; m++) {
+                    if(n<N) {
+                        double lpsi1 = psi1[IDX_NM(n,m)]*dL_dpsi1[IDX_NM(n,m)];
+                        if(q==0) {dvar_local += lpsi1;}
+                        
+                        double Zmu = Z[IDX_MQ(m,q)] - mu_nq;
+                        double denom = Snq+lq;
+                        double Zmu2_denom = Zmu*Zmu/denom;
+                        
+                        dmu_local += lpsi1*Zmu/denom;
+                        dS_local += lpsi1*(Zmu2_denom-1.)/denom;
+                        dl_local += lpsi1*(Zmu2_denom+Snq/lq)/denom;
+                        g_local[threadIdx.x] = -lpsi1*Zmu/denom;
+                    }
+                    __syncthreads();
+                    reduce_sum(g_local, p<P-1?THREADNUM:N-(P-1)*THREADNUM);
+                    if(threadIdx.x==0) {dZ[IDX_MQ(m,q)] += g_local[0];}
+                }
+                if(n<N) {
+                    dmu[IDX_NQB(n,q,blockIdx.x)] += dmu_local;
+                    dS[IDX_NQB(n,q,blockIdx.x)] += dS_local/2.;
+                }
+                __threadfence_block();
+            }
+            g_local[threadIdx.x] = dl_local*lq_sqrt;
+            __syncthreads();
+            reduce_sum(g_local, THREADNUM);
+            if(threadIdx.x==0) {dl[IDX_QB(q,blockIdx.x)] += g_local[0];}
+        }
+        g_local[threadIdx.x] = dvar_local;
+        __syncthreads();
+        reduce_sum(g_local, THREADNUM);
+        if(threadIdx.x==0) {dvar[blockIdx.x] += g_local[0]/var;}        
+    }
+    
+    __global__ void psi2compDer(double *dvar, double *dl, double *dZ, double *dmu, double *dS, double *dL_dpsi2, double *psi2n, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
+    {
+        int m_start, m_end;
+        __shared__ double g_local[THREADNUM];
+        divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
+        int P = int(ceil(double(N)/THREADNUM));
+
+        double dvar_local = 0;
+        for(int q=0;q<Q;q++) {
+            double lq_sqrt = l[q];
+            double lq = lq_sqrt*lq_sqrt;
+            double dl_local = 0;
+            for(int p=0;p<P;p++) {
+                int n = p*THREADNUM + threadIdx.x;
+                double dmu_local = 0;
+                double dS_local = 0;
+                double Snq,mu_nq;
+                if(n<N) {Snq = S[IDX_NQ(n,q)]; mu_nq=mu[IDX_NQ(n,q)];}
+                for(int m1=m_start; m1<m_end; m1++) {
+                    g_local[threadIdx.x] = 0;
+                    for(int m2=0;m2<M;m2++) {
+                        if(n<N) {
+                            double lpsi2 = psi2n[IDX_NMM(n,m1,m2)]*dL_dpsi2[IDX_MM(m1,m2)];
+                            if(q==0) {dvar_local += lpsi2;}
+                            
+                            double dZ = Z[IDX_MQ(m1,q)] - Z[IDX_MQ(m2,q)];
+                            double muZhat =  mu_nq - (Z[IDX_MQ(m1,q)] + Z[IDX_MQ(m2,q)])/2.;
+                            double denom = 2.*Snq+lq;
+                            double muZhat2_denom = muZhat*muZhat/denom;
+                            
+                            dmu_local += lpsi2*muZhat/denom;
+                            dS_local += lpsi2*(2.*muZhat2_denom-1.)/denom;
+                            dl_local += lpsi2*((Snq/lq+muZhat2_denom)/denom+dZ*dZ/(4.*lq*lq));
+                            g_local[threadIdx.x] += 2.*lpsi2*(muZhat/denom-dZ/(2*lq));
+                        }
+                    }
+                    __syncthreads();
+                    reduce_sum(g_local, p<P-1?THREADNUM:N-(P-1)*THREADNUM);
+                    if(threadIdx.x==0) {dZ[IDX_MQ(m1,q)] += g_local[0];}
+                }
+                if(n<N) {
+                    dmu[IDX_NQB(n,q,blockIdx.x)] += -2.*dmu_local;
+                    dS[IDX_NQB(n,q,blockIdx.x)] += dS_local;
+                }
+                __threadfence_block();
+            }
+            g_local[threadIdx.x] = dl_local*2.*lq_sqrt;
+            __syncthreads();
+            reduce_sum(g_local, THREADNUM);
+            if(threadIdx.x==0) {dl[IDX_QB(q,blockIdx.x)] += g_local[0];}
+        }
+        g_local[threadIdx.x] = dvar_local;
+        __syncthreads();
+        reduce_sum(g_local, THREADNUM);
+        if(threadIdx.x==0) {dvar[blockIdx.x] += g_local[0]*2/var;}
+    }
+    """ # CUDA C source compiled by PSICOMP_RBF_GPU.__init__, which prepends "#define THREADNUM <threadnum>". Kernels: compDenom, psi1computations, psi2computations, psi1compDer, psi2compDer; device helpers: divide_data, reduce_sum. The IDX_* macros make the first index vary fastest, matching the order='F' buffers allocated in _initGPUCache.
+
+class PSICOMP_RBF_GPU(PSICOMP_RBF):
+    # pycuda-backed psi statistics and their gradients for the RBF kernel; device buffers live in self.gpuCache and are reused across calls.
+    def __init__(self, threadnum=128, blocknum=15, GPU_direct=False): # threadnum/blocknum: block and grid sizes used for every kernel launch below
+        self.GPU_direct = GPU_direct # True: psi1/psi2 are returned, and dL_dpsi0/1/2 are consumed, as GPU arrays (.get()/.gpudata are used on them)
+        self.gpuCache = None # filled on first use by _initGPUCache
+        
+        self.threadnum = threadnum
+        self.blocknum = blocknum
+        module = SourceModule("#define THREADNUM "+str(self.threadnum)+"\n"+gpu_code) # THREADNUM sizes the kernels' __shared__ arrays
+        self.g_psi1computations = module.get_function('psi1computations')
+        self.g_psi1computations.prepare('PPdPPPPiii') # format codes follow the kernel signature: P pointer, d double, i int
+        self.g_psi2computations = module.get_function('psi2computations')
+        self.g_psi2computations.prepare('PPPdPPPPiii')
+        self.g_psi1compDer = module.get_function('psi1compDer')
+        self.g_psi1compDer.prepare('PPPPPPPdPPPPiii')
+        self.g_psi2compDer = module.get_function('psi2compDer')
+        self.g_psi2compDer.prepare('PPPPPPPdPPPPiii') # same layout as psi1compDer: five outputs, dL_dpsi, psi, var, four inputs, N, M, Q
+        self.g_compDenom = module.get_function('compDenom')
+        self.g_compDenom.prepare('PPPPii')
+        
+    def __deepcopy__(self, memo): # a deep copy is a fresh instance built from the same settings (kernels recompiled, gpuCache empty), not a copy of device buffers
+        s = PSICOMP_RBF_GPU(threadnum=self.threadnum, blocknum=self.blocknum, GPU_direct=self.GPU_direct)
+        memo[id(self)] = s # register the replacement in the deepcopy memo
+        return s
+    
+    def _initGPUCache(self, N, M, Q): # allocate every device buffer on first use; on later calls only assert that N, M, Q are unchanged
+        if self.gpuCache == None: # NOTE(review): 'is None' is the idiomatic test; behaviour is the same for a None/dict value
+            self.gpuCache = {
+                             'l_gpu'                :gpuarray.empty((Q,),np.float64,order='F'),
+                             'Z_gpu'                :gpuarray.empty((M,Q),np.float64,order='F'),
+                             'mu_gpu'               :gpuarray.empty((N,Q),np.float64,order='F'),
+                             'S_gpu'                :gpuarray.empty((N,Q),np.float64,order='F'),
+                             'psi1_gpu'             :gpuarray.empty((N,M),np.float64,order='F'),
+                             'psi2_gpu'             :gpuarray.empty((M,M),np.float64,order='F'),
+                             'psi2n_gpu'            :gpuarray.empty((N,M,M),np.float64,order='F'),
+                             'dL_dpsi1_gpu'         :gpuarray.empty((N,M),np.float64,order='F'),
+                             'dL_dpsi2_gpu'         :gpuarray.empty((M,M),np.float64,order='F'),
+                             'log_denom1_gpu'       :gpuarray.empty((N,Q),np.float64,order='F'),
+                             'log_denom2_gpu'       :gpuarray.empty((N,Q),np.float64,order='F'),
+                             # per-block partial derivatives written by the *compDer kernels (dvar/dl/dmu/dS carry a blocknum axis; dZ does not)
+                             'dvar_gpu'             :gpuarray.empty((self.blocknum,),np.float64, order='F'),
+                             'dl_gpu'               :gpuarray.empty((Q,self.blocknum),np.float64, order='F'),
+                             'dZ_gpu'               :gpuarray.empty((M,Q),np.float64, order='F'),
+                             'dmu_gpu'              :gpuarray.empty((N,Q,self.blocknum),np.float64, order='F'),
+                             'dS_gpu'               :gpuarray.empty((N,Q,self.blocknum),np.float64, order='F'),
+                             # final gradients, filled by sum_axis in psiDerivativecomputations
+                             'grad_l_gpu'               :gpuarray.empty((Q,),np.float64, order='F'),
+                             'grad_mu_gpu'              :gpuarray.empty((N,Q,),np.float64, order='F'),
+                             'grad_S_gpu'               :gpuarray.empty((N,Q,),np.float64, order='F'),
+                             }
+        else:
+            assert N==self.gpuCache['mu_gpu'].shape[0] # buffers are never resized, so the problem size must not change
+            assert M==self.gpuCache['Z_gpu'].shape[0]
+            assert Q==self.gpuCache['l_gpu'].shape[0]
+    
+    def sync_params(self, lengthscale, Z, mu, S): # upload lengthscale, Z, mu, S and refresh the log-denominator buffers
+        if len(lengthscale)==1: # one shared lengthscale: replicate it across all Q entries of l_gpu
+            self.gpuCache['l_gpu'].fill(lengthscale)
+        else:
+            self.gpuCache['l_gpu'].set(np.asfortranarray(lengthscale))
+        self.gpuCache['Z_gpu'].set(np.asfortranarray(Z)) # Fortran order matches the kernels' IDX_* indexing
+        self.gpuCache['mu_gpu'].set(np.asfortranarray(mu))
+        self.gpuCache['S_gpu'].set(np.asfortranarray(S))
+        N,Q = self.gpuCache['S_gpu'].shape
+        # t=self.g_compDenom(self.gpuCache['log_denom1_gpu'],self.gpuCache['log_denom2_gpu'],self.gpuCache['l_gpu'],self.gpuCache['S_gpu'], np.int32(N), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
+        # print 'g_compDenom '+str(t)
+        self.g_compDenom.prepared_call((self.blocknum,1),(self.threadnum,1,1), self.gpuCache['log_denom1_gpu'].gpudata,self.gpuCache['log_denom2_gpu'].gpudata,self.gpuCache['l_gpu'].gpudata,self.gpuCache['S_gpu'].gpudata, np.int32(N), np.int32(Q)) # compDenom writes log(S/l^2+1) and log(2*S/l^2+1)
+        
+    def reset_derivative(self): # zero all derivative buffers; the *compDer kernels accumulate into them with +=
+        self.gpuCache['dvar_gpu'].fill(0.)
+        self.gpuCache['dl_gpu'].fill(0.)
+        self.gpuCache['dZ_gpu'].fill(0.)
+        self.gpuCache['dmu_gpu'].fill(0.)
+        self.gpuCache['dS_gpu'].fill(0.)
+        self.gpuCache['grad_l_gpu'].fill(0.)
+        self.gpuCache['grad_mu_gpu'].fill(0.)
+        self.gpuCache['grad_S_gpu'].fill(0.)
+    
+    def get_dimensions(self, Z, variational_posterior): # returns (N, M, Q) = (rows of the posterior mean, rows of Z, columns of Z)
+        return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1]
+
+    @Cache_this(limit=1, ignore_args=(0,)) # argument 0 (self) is excluded from the cache key
+    def psicomputations(self, variance, lengthscale, Z, variational_posterior):
+        """
+        Z - MxQ
+        mu - NxQ
+        S - NxQ
+        """
+        N,M,Q = self.get_dimensions(Z, variational_posterior)
+        self._initGPUCache(N,M,Q)
+        self.sync_params(lengthscale, Z, variational_posterior.mean, variational_posterior.variance) # mu and S come from the variational posterior
+        
+        psi1_gpu = self.gpuCache['psi1_gpu']
+        psi2_gpu = self.gpuCache['psi2_gpu']
+        psi2n_gpu = self.gpuCache['psi2n_gpu'] # per-data-point psi2 terms, kept on the device for psi2compDer
+        l_gpu = self.gpuCache['l_gpu']
+        Z_gpu = self.gpuCache['Z_gpu']
+        mu_gpu = self.gpuCache['mu_gpu']
+        S_gpu = self.gpuCache['S_gpu']
+        log_denom1_gpu = self.gpuCache['log_denom1_gpu']
+        log_denom2_gpu = self.gpuCache['log_denom2_gpu']
+
+        psi0 = np.empty((N,))
+        psi0[:] = variance # psi0 is the constant vector 'variance', built on the host
+        self.g_psi1computations.prepared_call((self.blocknum,1),(self.threadnum,1,1),psi1_gpu.gpudata, log_denom1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
+        self.g_psi2computations.prepared_call((self.blocknum,1),(self.threadnum,1,1),psi2_gpu.gpudata, psi2n_gpu.gpudata, log_denom2_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q)) # fills both psi2_gpu (MxM) and psi2n_gpu (NxMxM)
+        # t = self.g_psi1computations(psi1_gpu, log_denom1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
+        # print 'g_psi1computations '+str(t)
+        # t = self.g_psi2computations(psi2_gpu, psi2n_gpu, log_denom2_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
+        # print 'g_psi2computations '+str(t)
+         
+        if self.GPU_direct: # hand back the device arrays themselves (psi0 is a host array either way)
+            return psi0, psi1_gpu, psi2_gpu
+        else:
+            return psi0, psi1_gpu.get(), psi2_gpu.get() # .get() copies the results to host arrays
+
+    @Cache_this(limit=1, ignore_args=(0,1,2,3)) # NOTE(review): dL_dpsi0/1/2 (args 1-3) are in ignore_args -- confirm in util.caching that a change in them alone still triggers recomputation
+    def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): # returns (dL_dvar, dL_dlengthscale, dL_dZ, dL_dmu, dL_dS)
+        ARD = (len(lengthscale)!=1) # several lengthscales => one gradient entry per input dimension
+        
+        N,M,Q = self.get_dimensions(Z, variational_posterior)
+        psi1_gpu = self.gpuCache['psi1_gpu'] # read straight from the cache: assumes psicomputations already ran for these parameters
+        psi2n_gpu = self.gpuCache['psi2n_gpu']
+        l_gpu = self.gpuCache['l_gpu']
+        Z_gpu = self.gpuCache['Z_gpu']
+        mu_gpu = self.gpuCache['mu_gpu']
+        S_gpu = self.gpuCache['S_gpu']
+        dvar_gpu = self.gpuCache['dvar_gpu']
+        dl_gpu = self.gpuCache['dl_gpu']
+        dZ_gpu = self.gpuCache['dZ_gpu']
+        dmu_gpu = self.gpuCache['dmu_gpu']
+        dS_gpu = self.gpuCache['dS_gpu']
+        grad_l_gpu = self.gpuCache['grad_l_gpu']
+        grad_mu_gpu = self.gpuCache['grad_mu_gpu']
+        grad_S_gpu = self.gpuCache['grad_S_gpu']
+        
+        if self.GPU_direct: # inputs are already device arrays: use them in place
+            dL_dpsi1_gpu = dL_dpsi1
+            dL_dpsi2_gpu = dL_dpsi2
+            dL_dpsi0_sum = dL_dpsi0.get().sum() #gpuarray.sum(dL_dpsi0).get()
+        else:
+            dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu']
+            dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu']
+            dL_dpsi1_gpu.set(np.asfortranarray(dL_dpsi1)) # upload host arrays in Fortran order
+            dL_dpsi2_gpu.set(np.asfortranarray(dL_dpsi2))
+            dL_dpsi0_sum = dL_dpsi0.sum()
+
+        self.reset_derivative() # both kernels below accumulate into the same buffers
+        # t=self.g_psi1compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi1_gpu,psi1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
+        # print 'g_psi1compDer '+str(t)
+        # t=self.g_psi2compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi2_gpu,psi2n_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
+        # print 'g_psi2compDer '+str(t)
+        self.g_psi1compDer.prepared_call((self.blocknum,1),(self.threadnum,1,1),dvar_gpu.gpudata,dl_gpu.gpudata,dZ_gpu.gpudata,dmu_gpu.gpudata,dS_gpu.gpudata,dL_dpsi1_gpu.gpudata,psi1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
+        self.g_psi2compDer.prepared_call((self.blocknum,1),(self.threadnum,1,1),dvar_gpu.gpudata,dl_gpu.gpudata,dZ_gpu.gpudata,dmu_gpu.gpudata,dS_gpu.gpudata,dL_dpsi2_gpu.gpudata,psi2n_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
+
+        dL_dvar = dL_dpsi0_sum + dvar_gpu.get().sum()#gpuarray.sum(dvar_gpu).get()  (psi0 == variance, so its share is sum(dL_dpsi0); the rest is the per-block partials)
+        sum_axis(grad_mu_gpu,dmu_gpu,N*Q,self.blocknum) # presumably sums dmu_gpu over its blocknum axis into grad_mu_gpu -- confirm in util.linalg_gpu
+        dL_dmu = grad_mu_gpu.get()
+        sum_axis(grad_S_gpu,dS_gpu,N*Q,self.blocknum)
+        dL_dS = grad_S_gpu.get()
+        dL_dZ = dZ_gpu.get() # dZ has no block axis, so no reduction step is needed
+        if ARD:
+            sum_axis(grad_l_gpu,dl_gpu,Q,self.blocknum)
+            dL_dlengscale = grad_l_gpu.get()
+        else:
+            dL_dlengscale = dl_gpu.get().sum() #gpuarray.sum(dl_gpu).get()  (single lengthscale: collapse Q and blocks into one scalar)
+            
+        return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS
+    
+
--- a/GPy/kern/_src/psi_comp/sslinear_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/sslinear_psi_comp.py
@ -0,0 +1,92 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+"""
+The package for the Psi statistics computation of the linear kernel for SSGPLVM
+"""
+
+from ....util.linalg import tdot
+
+import numpy as np
+
+def psicomputations(variance, Z, variational_posterior): # returns (psi0, psi1, psi2)
+    """
+    Compute psi-statistics for ss-linear kernel
+    """
+    # Returned statistics; shapes below assume mu, S, gamma are NxQ and Z is MxQ
+    # (the convention documented on _psi2computations in this module):
+    # psi0    N
+    # psi1    NxM
+    # psi2    MxM
+    mu = variational_posterior.mean
+    S = variational_posterior.variance
+    gamma = variational_posterior.binary_prob # third posterior quantity, read from the binary_prob attribute
+
+    psi0 = (gamma*(np.square(mu)+S)*variance).sum(axis=-1) # sum over the last (Q) axis
+    psi1 = np.inner(variance*gamma*mu,Z) # inner product over Q of (variance*gamma*mu) with Z
+    psi2 = np.inner(np.square(variance)*(gamma*((1-gamma)*np.square(mu)+S)).sum(axis=0)*Z,Z)+tdot(psi1.T) # tdot(psi1.T) is presumably psi1.T.dot(psi1) -- confirm in util.linalg
+
+    return psi0, psi1, psi2
+
+def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior): # gradients through psi0, psi1 and psi2 combined
+    mu = variational_posterior.mean
+    S = variational_posterior.variance
+    gamma = variational_posterior.binary_prob
+
+    dL_dvar, dL_dgamma, dL_dmu, dL_dS, dL_dZ = _psi2computations(dL_dpsi2, variance, Z, mu, S, gamma) # start from the psi2 contributions, then add psi0/psi1 in place
+
+    # Add the psi0 term (first einsum on each line) and the psi1 term (second einsum, where present)
+    mu2S = np.square(mu)+S
+    dL_dvar += np.einsum('n,nq,nq->q',dL_dpsi0,gamma,mu2S) + np.einsum('nm,nq,mq,nq->q',dL_dpsi1,gamma,Z,mu) # one entry per input dimension q
+    dL_dgamma += np.einsum('n,q,nq->nq',dL_dpsi0,variance,mu2S) + np.einsum('nm,q,mq,nq->nq',dL_dpsi1,variance,Z,mu)
+    dL_dmu += np.einsum('n,nq,q,nq->nq',dL_dpsi0,gamma,2.*variance,mu) + np.einsum('nm,nq,q,mq->nq',dL_dpsi1,gamma,variance,Z)
+    dL_dS += np.einsum('n,nq,q->nq',dL_dpsi0,gamma,variance) # only psi0 contributes here; psi1 above does not involve S
+    dL_dZ +=  np.einsum('nm,nq,q,nq->mq',dL_dpsi1,gamma, variance,mu) # only psi1 contributes here; psi0 above does not involve Z
+    
+    return dL_dvar, dL_dZ, dL_dmu, dL_dS, dL_dgamma # note: this order differs from the order _psi2computations returns
+
+def _psi2computations(dL_dpsi2, variance, Z, mu, S, gamma): # psi2 part of the gradients; called by psiDerivativecomputations
+    """
+    Z - MxQ
+    mu - NxQ
+    S - NxQ
+    gamma - NxQ
+    """
+    # Gradients of the objective through psi2 only
+    # Returned, in this order:
+    # dL_dvar      Q
+    # dL_dgamma    NxQ
+    # dL_dmu       NxQ
+    # dL_dS        NxQ
+    # dL_dZ        MxQ
+    
+    mu2 = np.square(mu)
+    gamma2 = np.square(gamma)
+    variance2 = np.square(variance)
+    mu2S = mu2+S # NxQ
+    gvm = np.einsum('nq,nq,q->nq',gamma,mu,variance) # elementwise gamma*mu*variance, NxQ
+    common_sum = np.einsum('nq,mq->nm',gvm,Z) # NxM; same value as the single-einsum form kept in the next line
+#     common_sum = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu) # NxM
+    Z_expect = np.einsum('mo,mq,oq->q',dL_dpsi2,Z,Z) # Q: Z^T dL_dpsi2 Z evaluated per input dimension
+    dL_dpsi2T = dL_dpsi2+dL_dpsi2.T # symmetrised dL_dpsi2
+    tmp = np.einsum('mo,oq->mq',dL_dpsi2T,Z) # MxQ
+    common_expect = np.einsum('mq,nm->nq',tmp,common_sum) # NxQ; two-step form of the einsum kept in the next line
+#     common_expect = np.einsum('mo,mq,no->nq',dL_dpsi2+dL_dpsi2.T,Z,common_sum)
+    Z2_expect = np.einsum('om,nm->no',dL_dpsi2T,common_sum) # NxM
+    Z1_expect = np.einsum('om,mq->oq',dL_dpsi2T,Z) # MxQ
+    
+    dL_dvar = np.einsum('nq,q,q->q',2.*(gamma*mu2S-gamma2*mu2),variance,Z_expect)+\
+        np.einsum('nq,nq,nq->q',common_expect,gamma,mu) # both terms are summed over n
+        
+    dL_dgamma = np.einsum('q,q,nq->nq',Z_expect,variance2,(mu2S-2.*gamma*mu2))+\
+        np.einsum('nq,q,nq->nq',common_expect,variance,mu)
+    
+    dL_dmu = np.einsum('q,q,nq,nq->nq',Z_expect,variance2,mu,2.*(gamma-gamma2))+\
+            np.einsum('nq,nq,q->nq',common_expect,gamma,variance)
+                    
+    dL_dS = np.einsum('q,nq,q->nq',Z_expect,gamma,variance2) # single term: no common_expect part is added for S
+    
+#     dL_dZ = 2.*(np.einsum('om,nq,q,mq,nq->oq',dL_dpsi2,gamma,variance2,Z,(mu2S-gamma*mu2))+np.einsum('om,nq,q,nq,nm->oq',dL_dpsi2,gamma,variance,mu,common_sum))
+    dL_dZ = Z1_expect*np.einsum('nq,q,nq->q',gamma,variance2,(mu2S-gamma*mu2))+np.einsum('nq,q,nq,nm->mq',gamma,variance,mu,Z2_expect) # factored form of the commented-out expression above; uses the symmetrised dL_dpsi2T
+
+    return dL_dvar, dL_dgamma, dL_dmu, dL_dS, dL_dZ
--- a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py
@ -0,0 +1,394 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+"""
+The package for the psi statistics computation
+"""
+
+import numpy as np
+
+try:
+    from scipy import weave
+     
+    def _psicomputations(variance, lengthscale, Z, variational_posterior): # weave (inline C) implementation; returns (psi0, psi1, psi2, psi2n)
+        """
+        Z - MxQ
+        mu - NxQ
+        S - NxQ
+        gamma - NxQ
+        """
+        # psi0 (N), psi1 (NxM) and psi2n (NxMxM) are allocated here;
+        # psi1 and psi2n are filled in place by the C loop, psi2 (MxM) is psi2n summed over n.
+        # The C code combines a "gamma" branch and a "1-gamma" branch per (n,q) in log space.
+        mu = variational_posterior.mean
+        S = variational_posterior.variance
+         
+        N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
+        l2 = np.square(lengthscale) # squared lengthscale, indexed per q in the C code
+        log_denom1 = np.log(S/l2+1) # used by the psi1 terms
+        log_denom2 = np.log(2*S/l2+1) # used by the psi2 terms
+        log_gamma,log_gamma1 = variational_posterior.gamma_log_prob() # presumably log(gamma) and log(1-gamma) -- confirm on the posterior class
+        variance = float(variance) # plain Python float for the C code
+        psi0 = np.empty(N)
+        psi0[:] = variance # psi0 is the constant vector 'variance'
+        psi1 = np.empty((N,M))
+        psi2n = np.empty((N,M,M)) # per-data-point psi2 terms
+         
+        from ....util.misc import param_to_array
+        S = param_to_array(S) # presumably converts parameter objects to plain ndarrays for weave -- confirm in util.misc
+        mu = param_to_array(mu)
+        Z = param_to_array(Z)
+         
+        support_code = """
+        #include <math.h>
+        """
+        code = """
+        for(int n=0; n<N; n++) {
+            for(int m1=0;m1<M;m1++) {
+                double log_psi1=0;
+                for(int m2=0;m2<=m1;m2++) {
+                    double log_psi2_n=0;
+                    for(int q=0;q<Q;q++) {
+                        double Snq = S(n,q);
+                        double lq = l2(q);
+                        double Zm1q = Z(m1,q);
+                        double Zm2q = Z(m2,q);
+                         
+                        if(m2==0) {
+                            // Compute Psi_1
+                            double muZ = mu(n,q)-Z(m1,q);
+                             
+                            double psi1_exp1 = log_gamma(n,q) - (muZ*muZ/(Snq+lq) +log_denom1(n,q))/2.;
+                            double psi1_exp2 = log_gamma1(n,q) -Zm1q*Zm1q/(2.*lq);
+                            log_psi1 += (psi1_exp1>psi1_exp2)?psi1_exp1+log1p(exp(psi1_exp2-psi1_exp1)):psi1_exp2+log1p(exp(psi1_exp1-psi1_exp2));
+                        }
+                        // Compute Psi_2
+                        double muZhat = mu(n,q) - (Zm1q+Zm2q)/2.;
+                        double Z2 = Zm1q*Zm1q+ Zm2q*Zm2q;
+                        double dZ = Zm1q - Zm2q;
+                         
+                        double psi2_exp1 = dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2(n,q)/2. + log_gamma(n,q);
+                        double psi2_exp2 = log_gamma1(n,q) - Z2/(2.*lq);
+                        log_psi2_n += (psi2_exp1>psi2_exp2)?psi2_exp1+log1p(exp(psi2_exp2-psi2_exp1)):psi2_exp2+log1p(exp(psi2_exp1-psi2_exp2));                    
+                    }
+                    double exp_psi2_n = exp(log_psi2_n);
+                    psi2n(n,m1,m2) = variance*variance*exp_psi2_n;
+                    if(m1!=m2) { psi2n(n,m2,m1) = variance*variance*exp_psi2_n;}
+                }
+                psi1(n,m1) = variance*exp(log_psi1);
+            }
+        }
+        """ # the C loop visits only m2<=m1 and mirrors each psi2n entry; the psi1 terms are accumulated on the m2==0 pass
+        weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz) # arg_names are looked up by name in this scope; psi1 and psi2n are written in place
+     
+        psi2 = psi2n.sum(axis=0) # collapse the data axis: NxMxM -> MxM
+        return psi0,psi1,psi2,psi2n
+     
+    from GPy.util.caching import Cacher # absolute import here, unlike the relative ....util imports used elsewhere in this file
+    psicomputations = Cacher(_psicomputations, limit=1) # public entry point is the Cacher-wrapped function; presumably limit=1 keeps only the latest result -- confirm in util.caching
+     
+    def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): # returns (dvar, dl, dZ, dmu, dS, dgamma), accumulated over psi0, psi1 and psi2
+        ARD = (len(lengthscale)!=1) # several lengthscales => dl stays a length-Q vector
+         
+        _,psi1,_,psi2n = psicomputations(variance, lengthscale, Z, variational_posterior) # forward pass (via the Cacher wrapper) supplies psi1 and the per-data-point psi2n
+     
+        mu = variational_posterior.mean
+        S = variational_posterior.variance
+        N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
+        l2 = np.square(lengthscale)
+        log_denom1 = np.log(S/l2+1)
+        log_denom2 = np.log(2*S/l2+1)
+        log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
+        gamma, gamma1 = variational_posterior.gamma_probabilities() # presumably gamma and 1-gamma -- confirm on the posterior class
+        variance = float(variance) # plain Python float for the C code
+     
+        dvar = np.zeros(1) # output accumulators, all filled in place by the C loop below
+        dmu = np.zeros((N,Q))
+        dS = np.zeros((N,Q))
+        dgamma = np.zeros((N,Q))
+        dl = np.zeros(Q)
+        dZ = np.zeros((M,Q))
+        dvar += np.sum(dL_dpsi0) # psi0 is the constant vector 'variance' (see _psicomputations), so its contribution is sum(dL_dpsi0)
+         
+        from ....util.misc import param_to_array
+        S = param_to_array(S) # presumably converts parameter objects to plain ndarrays for weave -- confirm in util.misc
+        mu = param_to_array(mu)
+        Z = param_to_array(Z)
+         
+        support_code = """
+        #include <math.h>
+        """
+        code = """
+        for(int n=0; n<N; n++) {
+            for(int m1=0;m1<M;m1++) {
+                double log_psi1=0;
+                for(int m2=0;m2<M;m2++) {
+                    double log_psi2_n=0;
+                    for(int q=0;q<Q;q++) {
+                        double Snq = S(n,q);
+                        double lq = l2(q);
+                        double Zm1q = Z(m1,q);
+                        double Zm2q = Z(m2,q);
+                        double gnq = gamma(n,q);
+                        double g1nq = gamma1(n,q);
+                        double mu_nq = mu(n,q);
+                         
+                        if(m2==0) {
+                            // Compute Psi_1                        
+                            double lpsi1 = psi1(n,m1)*dL_dpsi1(n,m1);
+                            if(q==0) {dvar(0) += lpsi1/variance;}
+                             
+                            double Zmu = Zm1q - mu_nq;
+                            double denom = Snq+lq;
+                            double Zmu2_denom = Zmu*Zmu/denom;
+                             
+                            double exp1 = log_gamma(n,q)-(Zmu*Zmu/(Snq+lq)+log_denom1(n,q))/(2.);
+                            double exp2 = log_gamma1(n,q)-Zm1q*Zm1q/(2.*lq);
+                            double d_exp1,d_exp2;
+                            if(exp1>exp2) {
+                                d_exp1 = 1.;
+                                d_exp2 = exp(exp2-exp1);
+                            } else {
+                                d_exp1 = exp(exp1-exp2);
+                                d_exp2 = 1.;
+                            }
+                            double exp_sum = d_exp1+d_exp2;
+                             
+                            dmu(n,q) += lpsi1*Zmu*d_exp1/(denom*exp_sum);
+                            dS(n,q) += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum)/2.;
+                            dgamma(n,q) += lpsi1*(d_exp1*g1nq-d_exp2*gnq)/exp_sum;
+                            dl(q) += lpsi1*((Zmu2_denom+Snq/lq)/denom*d_exp1+Zm1q*Zm1q/(lq*lq)*d_exp2)/(2.*exp_sum);
+                            dZ(m1,q) += lpsi1*(-Zmu/denom*d_exp1-Zm1q/lq*d_exp2)/exp_sum;
+                        }
+                        // Compute Psi_2
+                        double lpsi2 = psi2n(n,m1,m2)*dL_dpsi2(m1,m2);
+                        if(q==0) {dvar(0) += lpsi2*2/variance;}
+                         
+                        double dZm1m2 = Zm1q - Zm2q;
+                        double Z2 = Zm1q*Zm1q+Zm2q*Zm2q;
+                        double muZhat =  mu_nq - (Zm1q + Zm2q)/2.;
+                        double denom = 2.*Snq+lq;
+                        double muZhat2_denom = muZhat*muZhat/denom;
+                         
+                        double exp1 = dZm1m2*dZm1m2/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2(n,q)/2. + log_gamma(n,q);
+                        double exp2 = log_gamma1(n,q) - Z2/(2.*lq);
+                        double d_exp1,d_exp2;
+                        if(exp1>exp2) {
+                            d_exp1 = 1.;
+                            d_exp2 = exp(exp2-exp1);
+                        } else {
+                            d_exp1 = exp(exp1-exp2);
+                            d_exp2 = 1.;
+                        }
+                        double exp_sum = d_exp1+d_exp2;
+                         
+                        dmu(n,q) += -2.*lpsi2*muZhat/denom*d_exp1/exp_sum;
+                        dS(n,q) += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum;
+                        dgamma(n,q) += lpsi2*(d_exp1*g1nq-d_exp2*gnq)/exp_sum;
+                        dl(q) += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZm1m2*dZm1m2/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum;
+                        dZ(m1,q) += 2.*lpsi2*((muZhat/denom-dZm1m2/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum;                   
+                    }
+                }
+            }
+        }
+        """ # unlike the forward pass, this loop visits every (m1,m2) pair; the psi1 terms are added on the m2==0 pass; d_exp1/d_exp2 are the two branch weights rescaled by the larger exponent
+        weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','gamma1','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz) # accumulates into dvar, dl, dmu, dS, dgamma, dZ in place
+     
+        dl *= 2.*lengthscale # presumably the chain-rule factor from d/d(l^2), as accumulated in C, to d/dl -- confirm against the derivation
+        if not ARD:
+            dl = dl.sum() # single shared lengthscale: collapse to one scalar
+         
+        return dvar, dl, dZ, dmu, dS, dgamma
+
+except:
+
+    def psicomputations(variance, lengthscale, Z, variational_posterior):
+        """
+        Z - MxQ
+        mu - NxQ
+        S - NxQ
+        gamma - NxQ
+        """
+        # here are the "statistics" for psi0, psi1 and psi2
+        # Produced intermediate results:
+        # _psi1                NxM
+        mu = variational_posterior.mean
+        S = variational_posterior.variance
+        gamma = variational_posterior.binary_prob
+         
+        psi0 = np.empty(mu.shape[0])
+        psi0[:] = variance
+        psi1 = _psi1computations(variance, lengthscale, Z, mu, S, gamma)
+        psi2 = _psi2computations(variance, lengthscale, Z, mu, S, gamma)
+        return psi0, psi1, psi2
+    
+    def _psi1computations(variance, lengthscale, Z, mu, S, gamma):
+        """
+        Z - MxQ
+        mu - NxQ
+        S - NxQ
+        gamma - NxQ
+        """
+        # here are the "statistics" for psi1
+        # Produced intermediate results:
+        # _psi1                NxM
+    
+        lengthscale2 = np.square(lengthscale)
+    
+        # psi1
+        _psi1_denom = S[:, None, :] / lengthscale2 + 1.  # Nx1xQ
+        _psi1_denom_sqrt = np.sqrt(_psi1_denom) #Nx1xQ
+        _psi1_dist = Z[None, :, :] - mu[:, None, :]  # NxMxQ
+        _psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom) # NxMxQ
+        _psi1_common = gamma[:,None,:] / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #Nx1xQ
+        _psi1_exponent1 = np.log(gamma[:,None,:]) - (_psi1_dist_sq + np.log(_psi1_denom))/2. # NxMxQ
+        _psi1_exponent2 = np.log(1.-gamma[:,None,:]) - (np.square(Z[None,:,:])/lengthscale2)/2. # NxMxQ
+        _psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2)
+        _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ
+        _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM
+        _psi1 = variance * np.exp(_psi1_exp_sum) # NxM
+    
+        return _psi1
+    
+    def _psi2computations(variance, lengthscale, Z, mu, S, gamma):
+        """
+        Z - MxQ
+        mu - NxQ
+        S - NxQ
+        gamma - NxQ
+        """
+        # here are the "statistics" for psi2
+        # Produced intermediate results:
+        # _psi2                MxM
+        
+        lengthscale2 = np.square(lengthscale)
+        
+        _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
+        _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
+        _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q
+        _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ
+    
+        # psi2
+        _psi2_denom = 2.*S[:, None, None, :] / lengthscale2 + 1. # Nx1x1xQ
+        _psi2_denom_sqrt = np.sqrt(_psi2_denom)
+        _psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q
+        _psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom)
+        _psi2_common = gamma[:,None,None,:]/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # Nx1x1xQ
+        _psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom)+np.log(gamma[:,None,None,:]) #N,M,M,Q
+        _psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ
+        _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2)
+        _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max))
+        _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM
+        _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM
+    
+        return _psi2
+    
+    def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
+        ARD = (len(lengthscale)!=1)
+         
+        dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1, dgamma_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+        dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2, dgamma_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+     
+        dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
+         
+        dL_dlengscale = dl_psi1 + dl_psi2
+        if not ARD:
+            dL_dlengscale = dL_dlengscale.sum()
+     
+        dL_dgamma = dgamma_psi1 + dgamma_psi2
+        dL_dmu = dmu_psi1 + dmu_psi2
+        dL_dS = dS_psi1 + dS_psi2
+        dL_dZ = dZ_psi1 + dZ_psi2
+         
+        return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma
+    
+    def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, gamma):
+        """
+        Gradients of the objective through psi1 (numpy implementation).
+
+        dL_dpsi1 - NxM
+        Z - MxQ
+        mu - NxQ
+        S - NxQ
+        gamma - NxQ
+
+        Returns the tuple
+        (_dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma);
+        note that the Z gradient comes before the mu/S/gamma gradients.
+        """
+        # here are the "statistics" for psi1
+        # Produced intermediate results: dL_dparams w.r.t. psi1
+        # _dL_dvariance     1
+        # _dL_dlengthscale  Q
+        # _dL_dZ            MxQ
+        # _dL_dgamma        NxQ
+        # _dL_dmu           NxQ
+        # _dL_dS            NxQ
+        
+        lengthscale2 = np.square(lengthscale)
+    
+        # psi1 is recomputed here (same expressions as _psi1computations)
+        # because its intermediate terms are needed for the gradients.
+        _psi1_denom = S / lengthscale2 + 1.  # NxQ
+        _psi1_denom_sqrt = np.sqrt(_psi1_denom) #NxQ
+        _psi1_dist = Z[None, :, :] - mu[:, None, :]  # NxMxQ
+        _psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom[:,None,:]) # NxMxQ
+        # factor shared by the mu, S, Z and lengthscale gradients of the gamma-weighted term
+        _psi1_common = gamma / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #NxQ
+        # exponent1: log of the gamma-weighted term; exponent2: log of the (1-gamma)-weighted term
+        _psi1_exponent1 = np.log(gamma[:,None,:]) -0.5 * (_psi1_dist_sq + np.log(_psi1_denom[:, None,:])) # NxMxQ
+        _psi1_exponent2 = np.log(1.-gamma[:,None,:]) -0.5 * (np.square(Z[None,:,:])/lengthscale2) # NxMxQ
+        # log(exp(exponent1)+exp(exponent2)), with the larger exponent factored out first
+        _psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2)
+        _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ
+        _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM
+        _psi1_exp_dist_sq = np.exp(-0.5*_psi1_dist_sq) # NxMxQ
+        _psi1_exp_Z = np.exp(-0.5*np.square(Z[None,:,:])/lengthscale2) # 1xMxQ
+        # variance times the product of the per-dimension factors with the q-th one left out
+        # (the q-th exponent is subtracted from the sum over all dimensions)
+        _psi1_q = variance * np.exp(_psi1_exp_sum[:,:,None] - _psi1_exponent) # NxMxQ
+        _psi1 = variance * np.exp(_psi1_exp_sum) # NxM
+        # psi1 is proportional to variance, so its derivative w.r.t. variance is psi1/variance
+        _dL_dvariance = np.einsum('nm,nm->',dL_dpsi1, _psi1)/variance # 1
+        # each einsum below contracts dL_dpsi1 with _psi1_q and the derivative of the
+        # q-th factor, summing over the indices absent from the output subscripts
+        _dL_dgamma = np.einsum('nm,nmq,nmq->nq',dL_dpsi1, _psi1_q, (_psi1_exp_dist_sq/_psi1_denom_sqrt[:,None,:]-_psi1_exp_Z)) # NxQ
+        _dL_dmu = np.einsum('nm, nmq, nmq, nmq, nq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_dist,_psi1_common)  # NxQ
+        _dL_dS = np.einsum('nm,nmq,nmq,nq,nmq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_common,(_psi1_dist_sq-1.))/2.  # NxQ
+        _dL_dZ = np.einsum('nm,nmq,nmq->mq',dL_dpsi1,_psi1_q, (- _psi1_common[:,None,:] * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z))
+        # one entry per input dimension (Q); any reduction to a single
+        # lengthscale is left to the caller
+        _dL_dlengthscale = lengthscale* np.einsum('nm,nmq,nmq->q',dL_dpsi1,_psi1_q,(_psi1_common[:,None,:]*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + (1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z))
+    
+        return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma 
+    
+    def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, gamma):
+        """
+        Gradients of the objective through psi2 (numpy implementation).
+
+        Z - MxQ
+        mu - NxQ
+        S - NxQ
+        gamma - NxQ
+        dL_dpsi2 - MxM
+
+        Returns the tuple
+        (_dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma);
+        note that the Z gradient comes before the mu/S/gamma gradients.
+        """
+        # here are the "statistics" for psi2
+        # Produced the derivatives w.r.t. psi2:
+        # _dL_dvariance      1
+        # _dL_dlengthscale   Q
+        # _dL_dZ             MxQ
+        # _dL_dgamma         NxQ
+        # _dL_dmu            NxQ
+        # _dL_dS             NxQ
+        
+        lengthscale2 = np.square(lengthscale)
+        
+        # pairwise midpoints and half-differences of the rows of Z
+        _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
+        _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
+        _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q
+        _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ
+    
+        # psi2 is recomputed here (same expressions as _psi2computations)
+        # because its intermediate terms are needed for the gradients.
+        _psi2_denom = 2.*S / lengthscale2 + 1. # NxQ
+        _psi2_denom_sqrt = np.sqrt(_psi2_denom)
+        _psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q
+        _psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom[:,None,None,:])
+        # factor shared by the mu, S, Z and lengthscale gradients of the gamma-weighted term
+        _psi2_common = gamma/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # NxQ
+        # exponent1: log of the gamma-weighted term; exponent2: log of the (1-gamma)-weighted term
+        _psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom[:,None,None,:])+np.log(gamma[:,None,None,:]) #N,M,M,Q
+        _psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ
+        # log(exp(exponent1)+exp(exponent2)), with the larger exponent factored out first
+        _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2)
+        _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max))
+        _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxMxM
+        # variance^2 times the product of the per-dimension factors with the q-th one left out
+        _psi2_q = variance*variance * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ 
+        _psi2_exp_dist_sq = np.exp(-_psi2_Zdist_sq -_psi2_mudist_sq) # NxMxMxQ
+        _psi2_exp_Z = np.exp(-0.5*_psi2_Z_sq_sum) # MxMxQ
+        _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM
+        # psi2 is proportional to variance^2, hence the factor 2/variance
+        _dL_dvariance = np.einsum('mo,mo->',dL_dpsi2,_psi2)*2./variance
+        # each einsum below contracts dL_dpsi2 (indices m,o) with _psi2_q and the derivative
+        # of the q-th factor, summing over the indices absent from the output subscripts
+        _dL_dgamma = np.einsum('mo,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,(_psi2_exp_dist_sq/_psi2_denom_sqrt[:,None,None,:] - _psi2_exp_Z))
+        _dL_dmu = -2.*np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,_psi2_common,_psi2_mudist,_psi2_exp_dist_sq)
+        _dL_dS = np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q, _psi2_common, (2.*_psi2_mudist_sq-1.), _psi2_exp_dist_sq)
+        # only the first inducing index (m) is kept in the output; the leading factor 2.
+        # presumably accounts for the second index via symmetry of dL_dpsi2 -- TODO confirm
+        _dL_dZ = 2.*np.einsum('mo,nmoq,nmoq->mq',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(-_psi2_Zdist*_psi2_denom[:,None,None,:]+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z))
+        # one entry per input dimension (Q); any reduction to a single
+        # lengthscale is left to the caller
+        _dL_dlengthscale = 2.*lengthscale* np.einsum('mo,nmoq,nmoq->q',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z))
+    
+        return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma
--- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py
+++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py
@ -0,0 +1,474 @@
+
+"""
+The module for psi-statistics for RBF kernel for Spike-and-Slab GPLVM
+"""
+
+import numpy as np
+from ....util.caching import Cache_this
+from . import PSICOMP_RBF
+from ....util import gpu_init
+
+try:
+    import pycuda.gpuarray as gpuarray
+    from pycuda.compiler import SourceModule
+    from ....util.linalg_gpu import sum_axis
+except:
+    pass    
+
+# CUDA C source for the spike-and-slab RBF psi statistics.  It is compiled at
+# runtime by PSICOMP_SSRBF_GPU.__init__, which prepends "#define THREADNUM <n>"
+# (THREADNUM sizes the __shared__ scratch buffers declared in the kernels).
+#
+# The IDX_* macros index flat arrays with the first index varying fastest,
+# matching the order='F' gpuarrays allocated in _initGPUCache.
+#
+# Contents:
+#   divide_data      - device helper: split a number of items evenly into pieces
+#   reduce_sum       - device helper: sum an array within a block, result in array[0]
+#   compDenom        - precomputes log(S/l^2+1), log(2S/l^2+1), log(gamma), log(1-gamma)
+#   psi1computations - fills psi1
+#   psi2computations - fills psi2n (per-data-point terms) and psi2 (their sum over n)
+#   psi1compDer      - accumulates the gradients through psi1
+#   psi2compDer      - accumulates the gradients through psi2
+#
+# The *compDer kernels accumulate into their outputs with "+=", so those
+# buffers have to be zeroed first (see reset_derivative); dvar, dl, dmu, dS and
+# dgamma hold one slice per block (indexed by blockIdx.x).
+gpu_code = """
+    // define THREADNUM
+
+    #define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
+    #define IDX_NMM(n,m1,m2) ((m2*M+m1)*N+n)
+    #define IDX_NQ(n,q) (q*N+n)
+    #define IDX_NM(n,m) (m*N+n)
+    #define IDX_MQ(m,q) (q*M+m)
+    #define IDX_MM(m1,m2) (m2*M+m1)
+    #define IDX_NQB(n,q,b) ((b*Q+q)*N+n)
+    #define IDX_QB(q,b) (b*Q+q)
+
+    // Divide data evenly
+    __device__ void divide_data(int total_data, int psize, int pidx, int *start, int *end) {
+        int residue = (total_data)%psize;
+        if(pidx<residue) {
+            int size = total_data/psize+1;
+            *start = size*pidx;
+            *end = *start+size;
+        } else {
+            int size = total_data/psize;
+            *start = size*pidx+residue;
+            *end = *start+size;
+        }
+    }
+    
+    __device__ void reduce_sum(double* array, int array_size) {
+        int s;
+        if(array_size >= blockDim.x) {
+            for(int i=blockDim.x+threadIdx.x; i<array_size; i+= blockDim.x) {
+                array[threadIdx.x] += array[i];
+            }
+            array_size = blockDim.x;
+        }
+        __syncthreads();
+        for(int i=1; i<=array_size;i*=2) {s=i;}
+        if(threadIdx.x < array_size-s) {array[threadIdx.x] += array[s+threadIdx.x];}
+        __syncthreads();
+        for(s=s/2;s>=1;s=s/2) {
+            if(threadIdx.x < s) {array[threadIdx.x] += array[s+threadIdx.x];}
+            __syncthreads();
+        }
+    }
+
+    __global__ void compDenom(double *log_denom1, double *log_denom2, double *log_gamma, double*log_gamma1, double *gamma, double *l, double *S, int N, int Q)
+    {
+        int n_start, n_end;
+        divide_data(N, gridDim.x, blockIdx.x, &n_start, &n_end);
+        
+        for(int i=n_start*Q+threadIdx.x; i<n_end*Q; i+=blockDim.x) {
+            int n=i/Q;
+            int q=i%Q;
+
+            double Snq = S[IDX_NQ(n,q)];
+            double lq = l[q]*l[q];
+            double gnq = gamma[IDX_NQ(n,q)];
+            log_denom1[IDX_NQ(n,q)] = log(Snq/lq+1.);
+            log_denom2[IDX_NQ(n,q)] = log(2.*Snq/lq+1.);
+            log_gamma[IDX_NQ(n,q)] = log(gnq);
+            log_gamma1[IDX_NQ(n,q)] = log(1.-gnq);
+        }
+    }
+
+    __global__ void psi1computations(double *psi1, double *log_denom1, double *log_gamma, double*log_gamma1, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
+    {
+        int m_start, m_end;
+        divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
+        
+        for(int m=m_start; m<m_end; m++) {
+            for(int n=threadIdx.x; n<N; n+= blockDim.x) {            
+                double log_psi1 = 0;
+                for(int q=0;q<Q;q++) {
+                    double Zmq = Z[IDX_MQ(m,q)];
+                    double muZ = mu[IDX_NQ(n,q)]-Zmq;
+                    double Snq = S[IDX_NQ(n,q)];
+                    double lq = l[q]*l[q];
+                    double exp1 = log_gamma[IDX_NQ(n,q)]-(muZ*muZ/(Snq+lq)+log_denom1[IDX_NQ(n,q)])/(2.);
+                    double exp2 = log_gamma1[IDX_NQ(n,q)]-Zmq*Zmq/(2.*lq);
+                    log_psi1 += (exp1>exp2)?exp1+log1p(exp(exp2-exp1)):exp2+log1p(exp(exp1-exp2));
+                }
+                psi1[IDX_NM(n,m)] = var*exp(log_psi1);
+            }
+        }
+    }
+    
+    __global__ void psi2computations(double *psi2, double *psi2n, double *log_denom2, double *log_gamma, double*log_gamma1, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
+    {
+        int psi2_idx_start, psi2_idx_end;
+        __shared__ double psi2_local[THREADNUM];
+        divide_data((M+1)*M/2, gridDim.x, blockIdx.x, &psi2_idx_start, &psi2_idx_end);
+        
+        for(int psi2_idx=psi2_idx_start; psi2_idx<psi2_idx_end; psi2_idx++) {
+            int m1 = int((sqrt(8.*psi2_idx+1.)-1.)/2.);
+            int m2 = psi2_idx - (m1+1)*m1/2;
+            
+            psi2_local[threadIdx.x] = 0;
+            for(int n=threadIdx.x;n<N;n+=blockDim.x) {
+                double log_psi2_n = 0;
+                for(int q=0;q<Q;q++) {
+                    double Zm1q = Z[IDX_MQ(m1,q)];
+                    double Zm2q = Z[IDX_MQ(m2,q)];
+                    double dZ = Zm1q - Zm2q;
+                    double muZhat = mu[IDX_NQ(n,q)]- (Zm1q+Zm2q)/2.;
+                    double Z2 = Zm1q*Zm1q+Zm2q*Zm2q;
+                    double Snq = S[IDX_NQ(n,q)];
+                    double lq = l[q]*l[q];
+                    double exp1 = dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2[IDX_NQ(n,q)]/2. + log_gamma[IDX_NQ(n,q)];
+                    double exp2 = log_gamma1[IDX_NQ(n,q)] - Z2/(2.*lq);
+                    log_psi2_n += (exp1>exp2)?exp1+log1p(exp(exp2-exp1)):exp2+log1p(exp(exp1-exp2));
+                }
+                double exp_psi2_n = exp(log_psi2_n);
+                psi2n[IDX_NMM(n,m1,m2)] = var*var*exp_psi2_n;
+                if(m1!=m2) { psi2n[IDX_NMM(n,m2,m1)] = var*var*exp_psi2_n;}
+                psi2_local[threadIdx.x] += exp_psi2_n;
+            }
+            __syncthreads();
+            reduce_sum(psi2_local, THREADNUM);
+            if(threadIdx.x==0) {
+                psi2[IDX_MM(m1,m2)] = var*var*psi2_local[0];
+                if(m1!=m2) { psi2[IDX_MM(m2,m1)] = var*var*psi2_local[0]; }
+            }
+            __syncthreads();
+        }
+    }
+    
+    __global__ void psi1compDer(double *dvar, double *dl, double *dZ, double *dmu, double *dS, double *dgamma, double *dL_dpsi1, double *psi1, double *log_denom1, double *log_gamma, double*log_gamma1, double var, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q)
+    {
+        int m_start, m_end;
+        __shared__ double g_local[THREADNUM];
+        divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
+        int P = int(ceil(double(N)/THREADNUM));
+
+        double dvar_local = 0;
+        for(int q=0;q<Q;q++) {
+            double lq_sqrt = l[q];
+            double lq = lq_sqrt*lq_sqrt;
+            double dl_local = 0;
+            for(int p=0;p<P;p++) {
+                int n = p*THREADNUM + threadIdx.x;
+                double dmu_local = 0;
+                double dS_local = 0;
+                double dgamma_local = 0;
+                double Snq,mu_nq,gnq,log_gnq,log_gnq1,log_de;
+                if(n<N) {Snq = S[IDX_NQ(n,q)]; mu_nq=mu[IDX_NQ(n,q)]; gnq = gamma[IDX_NQ(n,q)];
+                        log_gnq = log_gamma[IDX_NQ(n,q)]; log_gnq1 = log_gamma1[IDX_NQ(n,q)];
+                        log_de = log_denom1[IDX_NQ(n,q)];}
+                for(int m=m_start; m<m_end; m++) {
+                    if(n<N) {
+                        double lpsi1 = psi1[IDX_NM(n,m)]*dL_dpsi1[IDX_NM(n,m)];
+                        if(q==0) {dvar_local += lpsi1;}
+                        
+                        double Zmq = Z[IDX_MQ(m,q)];
+                        double Zmu = Zmq - mu_nq;
+                        double denom = Snq+lq;
+                        double Zmu2_denom = Zmu*Zmu/denom;
+                        
+                        double exp1 = log_gnq-(Zmu*Zmu/(Snq+lq)+log_de)/(2.);
+                        double exp2 = log_gnq1-Zmq*Zmq/(2.*lq);
+                        double d_exp1,d_exp2;
+                        if(exp1>exp2) {
+                            d_exp1 = 1.;
+                            d_exp2 = exp(exp2-exp1);
+                        } else {
+                            d_exp1 = exp(exp1-exp2);
+                            d_exp2 = 1.;
+                        }
+                        double exp_sum = d_exp1+d_exp2;
+                        
+                        dmu_local += lpsi1*Zmu*d_exp1/(denom*exp_sum);
+                        dS_local += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum);
+                        dgamma_local += lpsi1*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
+                        dl_local += lpsi1*((Zmu2_denom+Snq/lq)/denom*d_exp1+Zmq*Zmq/(lq*lq)*d_exp2)/(2.*exp_sum);
+                        g_local[threadIdx.x] = lpsi1*(-Zmu/denom*d_exp1-Zmq/lq*d_exp2)/exp_sum;
+                    }
+                    __syncthreads();
+                    reduce_sum(g_local, p<P-1?THREADNUM:N-(P-1)*THREADNUM);
+                    if(threadIdx.x==0) {dZ[IDX_MQ(m,q)] += g_local[0];}
+                }
+                if(n<N) {
+                    dmu[IDX_NQB(n,q,blockIdx.x)] += dmu_local;
+                    dS[IDX_NQB(n,q,blockIdx.x)] += dS_local/2.;
+                    dgamma[IDX_NQB(n,q,blockIdx.x)] += dgamma_local;
+                }
+                __threadfence_block();
+            }
+            g_local[threadIdx.x] = dl_local*2.*lq_sqrt;
+            __syncthreads();
+            reduce_sum(g_local, THREADNUM);
+            if(threadIdx.x==0) {dl[IDX_QB(q,blockIdx.x)] += g_local[0];}
+        }
+        g_local[threadIdx.x] = dvar_local;
+        __syncthreads();
+        reduce_sum(g_local, THREADNUM);
+        if(threadIdx.x==0) {dvar[blockIdx.x] += g_local[0]/var;}
+    }
+    
+    __global__ void psi2compDer(double *dvar, double *dl, double *dZ, double *dmu, double *dS, double *dgamma, double *dL_dpsi2, double *psi2n, double *log_denom2, double *log_gamma, double*log_gamma1, double var, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q)
+    {
+        int m_start, m_end;
+        __shared__ double g_local[THREADNUM];
+        divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
+        int P = int(ceil(double(N)/THREADNUM));
+
+        double dvar_local = 0;
+        for(int q=0;q<Q;q++) {
+            double lq_sqrt = l[q];
+            double lq = lq_sqrt*lq_sqrt;
+            double dl_local = 0;
+            for(int p=0;p<P;p++) {
+                int n = p*THREADNUM + threadIdx.x;
+                double dmu_local = 0;
+                double dS_local = 0;
+                double dgamma_local = 0;
+                double Snq,mu_nq,gnq,log_gnq,log_gnq1,log_de;
+                if(n<N) {Snq = S[IDX_NQ(n,q)]; mu_nq=mu[IDX_NQ(n,q)]; gnq = gamma[IDX_NQ(n,q)];
+                        log_gnq = log_gamma[IDX_NQ(n,q)]; log_gnq1 = log_gamma1[IDX_NQ(n,q)];
+                        log_de = log_denom2[IDX_NQ(n,q)];}
+                for(int m1=m_start; m1<m_end; m1++) {
+                    g_local[threadIdx.x] = 0;
+                    for(int m2=0;m2<M;m2++) {
+                        if(n<N) {
+                            double lpsi2 = psi2n[IDX_NMM(n,m1,m2)]*dL_dpsi2[IDX_MM(m1,m2)];
+                            if(q==0) {dvar_local += lpsi2;}
+                            
+                            double Zm1q = Z[IDX_MQ(m1,q)];
+                            double Zm2q = Z[IDX_MQ(m2,q)];
+                            double dZ = Zm1q - Zm2q;
+                            double Z2 = Zm1q*Zm1q+Zm2q*Zm2q;
+                            double muZhat =  mu_nq - (Zm1q + Zm2q)/2.;
+                            double denom = 2.*Snq+lq;
+                            double muZhat2_denom = muZhat*muZhat/denom;
+                            
+                            double exp1 = dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_de/2. + log_gnq;
+                            double exp2 = log_gnq1 - Z2/(2.*lq);
+                            double d_exp1,d_exp2;
+                            if(exp1>exp2) {
+                                d_exp1 = 1.;
+                                d_exp2 = exp(exp2-exp1);
+                            } else {
+                                d_exp1 = exp(exp1-exp2);
+                                d_exp2 = 1.;
+                            }
+                            double exp_sum = d_exp1+d_exp2;
+                            
+                            dmu_local += lpsi2*muZhat/denom*d_exp1/exp_sum;
+                            dS_local += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum;
+                            dgamma_local += lpsi2*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
+                            dl_local += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZ*dZ/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum;
+                            g_local[threadIdx.x] += 2.*lpsi2*((muZhat/denom-dZ/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum;
+                        }
+                    }
+                    __syncthreads();
+                    reduce_sum(g_local, p<P-1?THREADNUM:N-(P-1)*THREADNUM);
+                    if(threadIdx.x==0) {dZ[IDX_MQ(m1,q)] += g_local[0];}
+                }
+                if(n<N) {
+                    dmu[IDX_NQB(n,q,blockIdx.x)] += -2.*dmu_local;
+                    dS[IDX_NQB(n,q,blockIdx.x)] += dS_local;
+                    dgamma[IDX_NQB(n,q,blockIdx.x)] += dgamma_local;
+                }
+                __threadfence_block();
+            }
+            g_local[threadIdx.x] = dl_local*2.*lq_sqrt;
+            __syncthreads();
+            reduce_sum(g_local, THREADNUM);
+            if(threadIdx.x==0) {dl[IDX_QB(q,blockIdx.x)] += g_local[0];}
+        }
+        g_local[threadIdx.x] = dvar_local;
+        __syncthreads();
+        reduce_sum(g_local, THREADNUM);
+        if(threadIdx.x==0) {dvar[blockIdx.x] += g_local[0]*2/var;}
+    }
+    """
+
+class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
+
+    def __init__(self, threadnum=128, blocknum=15, GPU_direct=False):
+        self.GPU_direct = GPU_direct
+        self.gpuCache = None
+        
+        self.threadnum = threadnum
+        self.blocknum = blocknum
+        module = SourceModule("#define THREADNUM "+str(self.threadnum)+"\n"+gpu_code)
+        self.g_psi1computations = module.get_function('psi1computations')
+        self.g_psi1computations.prepare('PPPPdPPPPiii')
+        self.g_psi2computations = module.get_function('psi2computations')
+        self.g_psi2computations.prepare('PPPPPdPPPPiii')
+        self.g_psi1compDer = module.get_function('psi1compDer')
+        self.g_psi1compDer.prepare('PPPPPPPPPPPdPPPPPiii')
+        self.g_psi2compDer = module.get_function('psi2compDer')
+        self.g_psi2compDer.prepare('PPPPPPPPPPPdPPPPPiii')
+        self.g_compDenom = module.get_function('compDenom')
+        self.g_compDenom.prepare('PPPPPPPii')
+
+    def __deepcopy__(self, memo):
+        s = PSICOMP_SSRBF_GPU(threadnum=self.threadnum, blocknum=self.blocknum, GPU_direct=self.GPU_direct)
+        memo[id(self)] = s 
+        return s
+
+    def _initGPUCache(self, N, M, Q):            
+        if self.gpuCache == None:
+            self.gpuCache = {
+                             'l_gpu'                :gpuarray.empty((Q,),np.float64,order='F'),
+                             'Z_gpu'                :gpuarray.empty((M,Q),np.float64,order='F'),
+                             'mu_gpu'               :gpuarray.empty((N,Q),np.float64,order='F'),
+                             'S_gpu'                :gpuarray.empty((N,Q),np.float64,order='F'),
+                             'gamma_gpu'            :gpuarray.empty((N,Q),np.float64,order='F'),
+                             'psi1_gpu'             :gpuarray.empty((N,M),np.float64,order='F'),
+                             'psi2_gpu'             :gpuarray.empty((M,M),np.float64,order='F'),
+                             'psi2n_gpu'            :gpuarray.empty((N,M,M),np.float64,order='F'),
+                             'dL_dpsi1_gpu'         :gpuarray.empty((N,M),np.float64,order='F'),
+                             'dL_dpsi2_gpu'         :gpuarray.empty((M,M),np.float64,order='F'),
+                             'log_denom1_gpu'       :gpuarray.empty((N,Q),np.float64,order='F'),
+                             'log_denom2_gpu'       :gpuarray.empty((N,Q),np.float64,order='F'),
+                             'log_gamma_gpu'        :gpuarray.empty((N,Q),np.float64,order='F'),
+                             'log_gamma1_gpu'       :gpuarray.empty((N,Q),np.float64,order='F'),
+                             # derivatives
+                             'dvar_gpu'             :gpuarray.empty((self.blocknum,),np.float64, order='F'),
+                             'dl_gpu'               :gpuarray.empty((Q,self.blocknum),np.float64, order='F'),
+                             'dZ_gpu'               :gpuarray.empty((M,Q),np.float64, order='F'),
+                             'dmu_gpu'              :gpuarray.empty((N,Q,self.blocknum),np.float64, order='F'),
+                             'dS_gpu'               :gpuarray.empty((N,Q,self.blocknum),np.float64, order='F'),
+                             'dgamma_gpu'           :gpuarray.empty((N,Q,self.blocknum),np.float64, order='F'),
+                             # grad
+                             'grad_l_gpu'               :gpuarray.empty((Q,),np.float64, order='F'),
+                             'grad_mu_gpu'              :gpuarray.empty((N,Q,),np.float64, order='F'),
+                             'grad_S_gpu'               :gpuarray.empty((N,Q,),np.float64, order='F'),
+                             'grad_gamma_gpu'           :gpuarray.empty((N,Q,),np.float64, order='F'),
+                             }
+        else:
+            assert N==self.gpuCache['mu_gpu'].shape[0]
+            assert M==self.gpuCache['Z_gpu'].shape[0]
+            assert Q==self.gpuCache['l_gpu'].shape[0]
+    
+    def sync_params(self, lengthscale, Z, mu, S, gamma):
+        if len(lengthscale)==1:
+            self.gpuCache['l_gpu'].fill(lengthscale)
+        else:
+            self.gpuCache['l_gpu'].set(np.asfortranarray(lengthscale))
+        self.gpuCache['Z_gpu'].set(np.asfortranarray(Z))
+        self.gpuCache['mu_gpu'].set(np.asfortranarray(mu))
+        self.gpuCache['S_gpu'].set(np.asfortranarray(S))
+        self.gpuCache['gamma_gpu'].set(np.asfortranarray(gamma))
+        N,Q = self.gpuCache['S_gpu'].shape
+        self.g_compDenom.prepared_call((self.blocknum,1),(self.threadnum,1,1), self.gpuCache['log_denom1_gpu'].gpudata,self.gpuCache['log_denom2_gpu'].gpudata,self.gpuCache['log_gamma_gpu'].gpudata,self.gpuCache['log_gamma1_gpu'].gpudata,self.gpuCache['gamma_gpu'].gpudata,self.gpuCache['l_gpu'].gpudata,self.gpuCache['S_gpu'].gpudata, np.int32(N), np.int32(Q))
+        
+    def reset_derivative(self):
+        self.gpuCache['dvar_gpu'].fill(0.)
+        self.gpuCache['dl_gpu'].fill(0.)
+        self.gpuCache['dZ_gpu'].fill(0.)
+        self.gpuCache['dmu_gpu'].fill(0.)
+        self.gpuCache['dS_gpu'].fill(0.)
+        self.gpuCache['dgamma_gpu'].fill(0.)
+        self.gpuCache['grad_l_gpu'].fill(0.)
+        self.gpuCache['grad_mu_gpu'].fill(0.)
+        self.gpuCache['grad_S_gpu'].fill(0.)
+        self.gpuCache['grad_gamma_gpu'].fill(0.)
+    
+    def get_dimensions(self, Z, variational_posterior):
+        return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1]
+
+    @Cache_this(limit=1, ignore_args=(0,))
+    def psicomputations(self, variance, lengthscale, Z, variational_posterior):
+        """
+        Compute the psi statistics psi0, psi1 and psi2 using the GPU kernels.
+
+        Shapes as stated by the original author:
+        Z - MxQ
+        mu - NxQ
+        S - NxQ
+
+        :param variance: kernel variance; fills psi0 and is passed to the kernels as float64
+        :param lengthscale: kernel lengthscale(s), uploaded to the GPU by sync_params
+        :param Z: inducing inputs
+        :param variational_posterior: object providing .mean, .variance and .binary_prob
+        :returns: (psi0, psi1, psi2). psi0 is a host array of length N; psi1 and psi2
+                  are returned as GPU arrays when self.GPU_direct is true and as
+                  host copies (via .get()) otherwise.
+        """
+        N,M,Q = self.get_dimensions(Z, variational_posterior)
+        # make sure the GPU buffers exist for this problem size, then upload the inputs
+        self._initGPUCache(N,M,Q)
+        self.sync_params(lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+        
+        psi1_gpu = self.gpuCache['psi1_gpu']
+        psi2_gpu = self.gpuCache['psi2_gpu']
+        psi2n_gpu = self.gpuCache['psi2n_gpu']
+        l_gpu = self.gpuCache['l_gpu']
+        Z_gpu = self.gpuCache['Z_gpu']
+        mu_gpu = self.gpuCache['mu_gpu']
+        S_gpu = self.gpuCache['S_gpu']
+        log_denom1_gpu = self.gpuCache['log_denom1_gpu']
+        log_denom2_gpu = self.gpuCache['log_denom2_gpu']
+        log_gamma_gpu = self.gpuCache['log_gamma_gpu']
+        log_gamma1_gpu = self.gpuCache['log_gamma1_gpu']
+
+        # psi0 is computed on the host: a length-N vector filled with the variance
+        psi0 = np.empty((N,))
+        psi0[:] = variance
+        # the kernels write their results into psi1_gpu and psi2_gpu / psi2n_gpu
+        self.g_psi1computations.prepared_call((self.blocknum,1),(self.threadnum,1,1),psi1_gpu.gpudata, log_denom1_gpu.gpudata, log_gamma_gpu.gpudata, log_gamma1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
+        self.g_psi2computations.prepared_call((self.blocknum,1),(self.threadnum,1,1),psi2_gpu.gpudata, psi2n_gpu.gpudata, log_denom2_gpu.gpudata, log_gamma_gpu.gpudata, log_gamma1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
+        
+        if self.GPU_direct:
+            # hand back the GPU arrays themselves, no device-to-host copy
+            return psi0, psi1_gpu, psi2_gpu
+        else:
+            return psi0, psi1_gpu.get(), psi2_gpu.get()
+
+    @Cache_this(limit=1, ignore_args=(0,1,2,3))
+    def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
+        """
+        Compute the gradients of the objective through the psi statistics on the GPU.
+
+        :param dL_dpsi0: derivative of the objective wrt psi0
+        :param dL_dpsi1: derivative of the objective wrt psi1
+        :param dL_dpsi2: derivative of the objective wrt psi2
+        :param variance: kernel variance, passed to the kernels as float64
+        :param lengthscale: kernel lengthscale(s); only its length is used here (ARD or not)
+        :param Z: inducing inputs (only used for the dimensions)
+        :param variational_posterior: variational posterior (only used for the dimensions)
+        :returns: (dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma)
+
+        NOTE(review): this method reads psi1_gpu, psi2n_gpu, the log_* buffers and the
+        uploaded parameters straight from self.gpuCache without calling sync_params,
+        so it appears to rely on psicomputations having been run first with the same
+        inputs -- confirm with callers.
+        """
+        # more than one lengthscale means one gradient entry per input dimension
+        ARD = (len(lengthscale)!=1)
+        
+        N,M,Q = self.get_dimensions(Z, variational_posterior)
+        psi1_gpu = self.gpuCache['psi1_gpu']
+        psi2n_gpu = self.gpuCache['psi2n_gpu']
+        l_gpu = self.gpuCache['l_gpu']
+        Z_gpu = self.gpuCache['Z_gpu']
+        mu_gpu = self.gpuCache['mu_gpu']
+        S_gpu = self.gpuCache['S_gpu']
+        gamma_gpu = self.gpuCache['gamma_gpu']
+        dvar_gpu = self.gpuCache['dvar_gpu']
+        dl_gpu = self.gpuCache['dl_gpu']
+        dZ_gpu = self.gpuCache['dZ_gpu']
+        dmu_gpu = self.gpuCache['dmu_gpu']
+        dS_gpu = self.gpuCache['dS_gpu']
+        dgamma_gpu = self.gpuCache['dgamma_gpu']
+        grad_l_gpu = self.gpuCache['grad_l_gpu']
+        grad_mu_gpu = self.gpuCache['grad_mu_gpu']
+        grad_S_gpu = self.gpuCache['grad_S_gpu']
+        grad_gamma_gpu = self.gpuCache['grad_gamma_gpu']
+        log_denom1_gpu = self.gpuCache['log_denom1_gpu']
+        log_denom2_gpu = self.gpuCache['log_denom2_gpu']
+        log_gamma_gpu = self.gpuCache['log_gamma_gpu']
+        log_gamma1_gpu = self.gpuCache['log_gamma1_gpu']
+        
+        if self.GPU_direct:
+            # the incoming derivatives are used as GPU arrays directly
+            dL_dpsi1_gpu = dL_dpsi1
+            dL_dpsi2_gpu = dL_dpsi2
+            dL_dpsi0_sum = gpuarray.sum(dL_dpsi0).get()
+        else:
+            # host arrays: upload them into the cached GPU buffers
+            dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu']
+            dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu']
+            dL_dpsi1_gpu.set(np.asfortranarray(dL_dpsi1))
+            dL_dpsi2_gpu.set(np.asfortranarray(dL_dpsi2))
+            dL_dpsi0_sum = dL_dpsi0.sum()
+
+        # zero all derivative buffers before the two kernels write into them
+        self.reset_derivative()
+        # t=self.g_psi1compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi1_gpu,psi1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
+        # print 'g_psi1compDer '+str(t)
+        # t=self.g_psi2compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi2_gpu,psi2n_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
+        # print 'g_psi2compDer '+str(t)
+        self.g_psi1compDer.prepared_call((self.blocknum,1),(self.threadnum,1,1),dvar_gpu.gpudata,dl_gpu.gpudata,dZ_gpu.gpudata,dmu_gpu.gpudata,dS_gpu.gpudata,dgamma_gpu.gpudata,dL_dpsi1_gpu.gpudata,psi1_gpu.gpudata, log_denom1_gpu.gpudata, log_gamma_gpu.gpudata, log_gamma1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata,gamma_gpu.gpudata,np.int32(N), np.int32(M), np.int32(Q))
+        self.g_psi2compDer.prepared_call((self.blocknum,1),(self.threadnum,1,1),dvar_gpu.gpudata,dl_gpu.gpudata,dZ_gpu.gpudata,dmu_gpu.gpudata,dS_gpu.gpudata,dgamma_gpu.gpudata,dL_dpsi2_gpu.gpudata,psi2n_gpu.gpudata, log_denom2_gpu.gpudata, log_gamma_gpu.gpudata, log_gamma1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata,gamma_gpu.gpudata,np.int32(N), np.int32(M), np.int32(Q))
+
+        # variance gradient: psi0 contribution plus the sum of everything the kernels wrote to dvar_gpu
+        dL_dvar = dL_dpsi0_sum + gpuarray.sum(dvar_gpu).get()
+        # presumably sum_axis reduces the d*_gpu partial results into the grad_*_gpu buffers -- TODO confirm
+        sum_axis(grad_mu_gpu,dmu_gpu,N*Q,self.blocknum)
+        dL_dmu = grad_mu_gpu.get()
+        sum_axis(grad_S_gpu,dS_gpu,N*Q,self.blocknum)
+        dL_dS = grad_S_gpu.get()
+        sum_axis(grad_gamma_gpu,dgamma_gpu,N*Q,self.blocknum)
+        dL_dgamma = grad_gamma_gpu.get()
+        dL_dZ = dZ_gpu.get()
+        if ARD:
+            sum_axis(grad_l_gpu,dl_gpu,Q,self.blocknum)
+            dL_dlengscale = grad_l_gpu.get()
+        else:
+            # single shared lengthscale: collapse everything into one number
+            dL_dlengscale = gpuarray.sum(dl_gpu).get()
+            
+        return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma
+    
+
--- a/GPy/kern/_src/rbf.py
+++ b/GPy/kern/_src/rbf.py
@ -0,0 +1,71 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+from stationary import Stationary
+from psi_comp import PSICOMP_RBF
+from psi_comp.rbf_psi_gpucomp import PSICOMP_RBF_GPU
+from ...util.config import *
+
+class RBF(Stationary):
+    """
+    Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel:
+
+    .. math::
+
+       k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg)
+
+    """
+    _support_GPU = True
+    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='rbf', useGPU=False):
+        super(RBF, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name, useGPU=useGPU)
+        self.psicomp = PSICOMP_RBF()
+        if self.useGPU:
+            self.psicomp = PSICOMP_RBF_GPU()
+        else:
+            self.psicomp = PSICOMP_RBF()
+
+    def K_of_r(self, r):
+        return self.variance * np.exp(-0.5 * r**2)
+
+    def dK_dr(self, r):
+        return -r*self.K_of_r(r)
+
+    def __getstate__(self):
+        dc = super(RBF, self).__getstate__()
+        if self.useGPU:
+            dc['psicomp'] = PSICOMP_RBF()
+        return dc
+
+    def __setstate__(self, state):
+        return super(RBF, self).__setstate__(state)
+
+    def spectrum(self, omega):
+        assert self.input_dim == 1 #TODO: higher dim spectra?
+        return self.variance*np.sqrt(2*np.pi)*self.lengthscale*np.exp(-self.lengthscale*2*omega**2/2)
+
+    #---------------------------------------#
+    #             PSI statistics            #
+    #---------------------------------------#
+
+    def psi0(self, Z, variational_posterior):
+        return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[0]
+
+    def psi1(self, Z, variational_posterior):
+        return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[1]
+
+    def psi2(self, Z, variational_posterior):
+        return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[2]
+
+    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        dL_dvar, dL_dlengscale = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[:2]
+        self.variance.gradient = dL_dvar
+        self.lengthscale.gradient = dL_dlengscale
+
+    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[2]
+
+    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[3:]
+
--- a/GPy/kern/_src/splitKern.py
+++ b/GPy/kern/_src/splitKern.py
@ -0,0 +1,204 @@
+"""
+A new kernel
+"""
+
+import numpy as np
+from kern import Kern,CombinationKernel
+from .independent_outputs import index_to_slices
+import itertools
+
+class DiffGenomeKern(Kern):
+
+    def __init__(self, kernel, idx_p, Xp, index_dim=-1, name='DiffGenomeKern'):
+        self.idx_p = idx_p
+        self.index_dim=index_dim
+        self.kern = SplitKern(kernel,Xp, index_dim=index_dim)
+        super(DiffGenomeKern, self).__init__(input_dim=kernel.input_dim+1, active_dims=None, name=name)
+        self.add_parameter(self.kern)
+    
+    def K(self, X, X2=None):
+        assert X2==None
+        K = self.kern.K(X,X2)
+        
+        if self.idx_p<=0 or self.idx_p>X.shape[0]/2:
+            return K
+        
+        slices = index_to_slices(X[:,self.index_dim])
+        idx_start = slices[1][0].start
+        idx_end = idx_start+self.idx_p
+        K_c = K[idx_start:idx_end,idx_start:idx_end].copy()
+        K[idx_start:idx_end,:] = K[:self.idx_p,:]
+        K[:,idx_start:idx_end] = K[:,:self.idx_p]
+        K[idx_start:idx_end,idx_start:idx_end] = K_c
+        
+        return K
+    
+    def Kdiag(self,X):
+        Kdiag = self.kern.Kdiag(X)
+
+        if self.idx_p<=0 or self.idx_p>X.shape[0]/2:
+            return Kdiag
+
+        slices = index_to_slices(X[:,self.index_dim])
+        idx_start = slices[1][0].start
+        idx_end = idx_start+self.idx_p
+        Kdiag[idx_start:idx_end] = Kdiag[:self.idx_p]
+        
+        return Kdiag
+    
+    def update_gradients_full(self,dL_dK,X,X2=None):
+        assert X2==None
+        if self.idx_p<=0 or self.idx_p>X.shape[0]/2:
+            self.kern.update_gradients_full(dL_dK, X)
+            return
+        
+        slices = index_to_slices(X[:,self.index_dim])
+        idx_start = slices[1][0].start
+        idx_end = idx_start+self.idx_p
+        
+        self.kern.update_gradients_full(dL_dK[idx_start:idx_end,:], X[:self.idx_p],X)
+        grad_p1 = self.kern.gradient.copy()
+        self.kern.update_gradients_full(dL_dK[:,idx_start:idx_end], X, X[:self.idx_p])
+        grad_p2 = self.kern.gradient.copy()
+        self.kern.update_gradients_full(dL_dK[idx_start:idx_end,idx_start:idx_end], X[:self.idx_p],X[idx_start:idx_end])
+        grad_p3 = self.kern.gradient.copy()
+        self.kern.update_gradients_full(dL_dK[idx_start:idx_end,idx_start:idx_end], X[idx_start:idx_end], X[:self.idx_p])
+        grad_p4 = self.kern.gradient.copy()
+
+        self.kern.update_gradients_full(dL_dK[idx_start:idx_end,:], X[idx_start:idx_end],X)
+        grad_n1 = self.kern.gradient.copy()
+        self.kern.update_gradients_full(dL_dK[:,idx_start:idx_end], X, X[idx_start:idx_end])
+        grad_n2 = self.kern.gradient.copy()
+        self.kern.update_gradients_full(dL_dK[idx_start:idx_end,idx_start:idx_end], X[idx_start:idx_end], X[idx_start:idx_end])
+        grad_n3 = self.kern.gradient.copy()
+
+        self.kern.update_gradients_full(dL_dK, X)
+        self.kern.gradient += grad_p1+grad_p2-grad_p3-grad_p4-grad_n1-grad_n2+2*grad_n3
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        pass
+
+class SplitKern(CombinationKernel):
+
+    def __init__(self, kernel, Xp, index_dim=-1, name='SplitKern'):
+        assert isinstance(index_dim, int), "The index dimension must be an integer!"
+        self.kern = kernel
+        self.kern_cross = SplitKern_cross(kernel,Xp)
+        super(SplitKern, self).__init__(kernels=[self.kern, self.kern_cross], extra_dims=[index_dim], name=name)
+        self.index_dim = index_dim
+
+    def K(self,X ,X2=None):
+        slices = index_to_slices(X[:,self.index_dim])
+        assert len(slices)<=2, 'The Split kernel only support two different indices'
+        if X2 is None:
+            target = np.zeros((X.shape[0], X.shape[0]))
+            # diagonal blocks
+            [[target.__setitem__((s,ss), self.kern.K(X[s,:], X[ss,:])) for s,ss in itertools.product(slices_i, slices_i)] for slices_i in slices]
+            if len(slices)>1:
+                # cross blocks
+                [target.__setitem__((s,ss), self.kern_cross.K(X[s,:], X[ss,:])) for s,ss in itertools.product(slices[0], slices[1])]
+                # cross blocks
+                [target.__setitem__((s,ss), self.kern_cross.K(X[s,:], X[ss,:])) for s,ss in itertools.product(slices[1], slices[0])]
+        else:
+            slices2 = index_to_slices(X2[:,self.index_dim])
+            assert len(slices2)<=2, 'The Split kernel only support two different indices'
+            target = np.zeros((X.shape[0], X2.shape[0]))
+            # diagonal blocks
+            [[target.__setitem__((s,s2), self.kern.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices[i], slices2[i])] for i in xrange(min(len(slices),len(slices2)))]
+            if len(slices)>1:
+                [target.__setitem__((s,s2), self.kern_cross.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices[1], slices2[0])]
+            if len(slices2)>1:
+                [target.__setitem__((s,s2), self.kern_cross.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices[0], slices2[1])]                
+        return target
+
+    def Kdiag(self,X):
+        return self.kern.Kdiag(X)
+
+    def update_gradients_full(self,dL_dK,X,X2=None):
+        slices = index_to_slices(X[:,self.index_dim])
+        target = np.zeros(self.kern.size)
+
+        def collate_grads(dL, X, X2, cross=False):
+            if cross:
+                self.kern_cross.update_gradients_full(dL,X,X2)
+                target[:] += self.kern_cross.kern.gradient
+            else:
+                self.kern.update_gradients_full(dL,X,X2)
+                target[:] += self.kern.gradient
+    
+        if X2 is None:
+            assert dL_dK.shape==(X.shape[0],X.shape[0])
+            [[collate_grads(dL_dK[s,ss], X[s], X[ss]) for s,ss in itertools.product(slices_i, slices_i)] for slices_i in slices]
+            if len(slices)>1:
+                [collate_grads(dL_dK[s,ss], X[s], X[ss], True) for s,ss in itertools.product(slices[0], slices[1])]
+                [collate_grads(dL_dK[s,ss], X[s], X[ss], True) for s,ss in itertools.product(slices[1], slices[0])]
+        else:
+            assert dL_dK.shape==(X.shape[0],X2.shape[0])
+            slices2 = index_to_slices(X2[:,self.index_dim])
+            [[collate_grads(dL_dK[s,s2],X[s],X2[s2]) for s,s2 in itertools.product(slices[i], slices2[i])] for i in xrange(min(len(slices),len(slices2)))]
+            if len(slices)>1:
+                [collate_grads(dL_dK[s,s2], X[s], X2[s2], True) for s,s2 in itertools.product(slices[1], slices2[0])]
+            if len(slices2)>1:
+                [collate_grads(dL_dK[s,s2], X[s], X2[s2], True) for s,s2 in itertools.product(slices[0], slices2[1])]
+        self.kern.gradient = target
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        self.kern.update_gradients_diag(self, dL_dKdiag, X)
+
+class SplitKern_cross(Kern):
+
+    def __init__(self, kernel, Xp, name='SplitKern_cross'):
+        assert isinstance(kernel, Kern)
+        self.kern = kernel
+        if not isinstance(Xp,np.ndarray):
+            Xp = np.array([[Xp]])
+        self.Xp = Xp
+        super(SplitKern_cross, self).__init__(input_dim=kernel.input_dim, active_dims=None, name=name)
+        
+    def K(self, X, X2=None):
+        if X2 is None:
+            return np.dot(self.kern.K(X,self.Xp),self.kern.K(self.Xp,X))/self.kern.K(self.Xp,self.Xp)
+        else:
+            return np.dot(self.kern.K(X,self.Xp),self.kern.K(self.Xp,X2))/self.kern.K(self.Xp,self.Xp)
+        
+    def Kdiag(self, X):
+        return np.inner(self.kern.K(X,self.Xp),self.kern.K(self.Xp,X).T)/self.kern.K(self.Xp,self.Xp)
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        if X2 is None:
+            X2 = X
+                        
+        k1 = self.kern.K(X,self.Xp)
+        k2 = self.kern.K(self.Xp,X2)
+        k3 = self.kern.K(self.Xp,self.Xp)
+        dL_dk1 = np.einsum('ij,j->i',dL_dK,k2[0])/k3[0,0]
+        dL_dk2 = np.einsum('ij,i->j',dL_dK,k1[:,0])/k3[0,0]
+        dL_dk3 = np.einsum('ij,ij->',dL_dK,-np.dot(k1,k2)/(k3[0,0]*k3[0,0]))
+
+        self.kern.update_gradients_full(dL_dk1[:,None],X,self.Xp)
+        grad = self.kern.gradient.copy()
+        self.kern.update_gradients_full(dL_dk2[None,:],self.Xp,X2)
+        grad += self.kern.gradient.copy()
+        self.kern.update_gradients_full(np.array([[dL_dk3]]),self.Xp,self.Xp)
+        grad += self.kern.gradient.copy()
+        
+        self.kern.gradient = grad
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        k1 = self.kern.K(X,self.Xp)
+        k2 = self.kern.K(self.Xp,X)
+        k3 = self.kern.K(self.Xp,self.Xp)
+        dL_dk1 = dL_dKdiag*k2[0]/k3
+        dL_dk2 = dL_dKdiag*k1[:,0]/k3
+        dL_dk3 = -dL_dKdiag*(k1[:,0]*k2[0]).sum()/(k3*k3)
+        
+        self.kern.update_gradients_full(dL_dk1[:,None],X,self.Xp)
+        grad1 = self.kern.gradient.copy()
+        self.kern.update_gradients_full(dL_dk2[None,:],self.Xp,X)
+        grad2 = self.kern.gradient.copy()
+        self.kern.update_gradients_full(np.array([[dL_dk3]]),self.Xp,self.Xp)
+        grad3 = self.kern.gradient.copy()
+        
+        self.kern.gradient = grad1+grad2+grad3
+        
+
--- a/GPy/kern/_src/static.py
+++ b/GPy/kern/_src/static.py
@ -0,0 +1,122 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+from kern import Kern
+import numpy as np
+from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp
+
+class Static(Kern):
+    def __init__(self, input_dim, variance, active_dims, name):
+        super(Static, self).__init__(input_dim, active_dims, name)
+        self.variance = Param('variance', variance, Logexp())
+        self.link_parameters(self.variance)
+
+    def Kdiag(self, X):
+        ret = np.empty((X.shape[0],), dtype=np.float64)
+        ret[:] = self.variance
+        return ret
+
+    def gradients_X(self, dL_dK, X, X2=None):
+        return np.zeros(X.shape)
+
+    def gradients_X_diag(self, dL_dKdiag, X):
+        return np.zeros(X.shape)
+
+    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        return np.zeros(Z.shape)
+
+    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        return np.zeros(variational_posterior.shape), np.zeros(variational_posterior.shape)
+
+    def psi0(self, Z, variational_posterior):
+        return self.Kdiag(variational_posterior.mean)
+
+    def psi1(self, Z, variational_posterior):
+        return self.K(variational_posterior.mean, Z)
+
+    def psi2(self, Z, variational_posterior):
+        K = self.K(variational_posterior.mean, Z)
+        return np.einsum('ij,ik->jk',K,K) #K[:,:,None]*K[:,None,:] # NB. more efficient implementations on inherriting classes
+
+    def input_sensitivity(self, summarize=True):
+        if summarize:
+            return super(Static, self).input_sensitivity(summarize=summarize)
+        else:
+            return np.ones(self.input_dim) * self.variance
+
+class White(Static):
+    def __init__(self, input_dim, variance=1., active_dims=None, name='white'):
+        super(White, self).__init__(input_dim, variance, active_dims, name)
+
+    def K(self, X, X2=None):
+        if X2 is None:
+            return np.eye(X.shape[0])*self.variance
+        else:
+            return np.zeros((X.shape[0], X2.shape[0]))
+
+    def psi2(self, Z, variational_posterior):
+        return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        self.variance.gradient = np.trace(dL_dK)
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        self.variance.gradient = dL_dKdiag.sum()
+
+    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        self.variance.gradient = dL_dpsi0.sum()
+
+class Bias(Static):
+    def __init__(self, input_dim, variance=1., active_dims=None, name='bias'):
+        super(Bias, self).__init__(input_dim, variance, active_dims, name)
+
+    def K(self, X, X2=None):
+        shape = (X.shape[0], X.shape[0] if X2 is None else X2.shape[0])
+        ret = np.empty(shape, dtype=np.float64)
+        ret[:] = self.variance
+        return ret
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        self.variance.gradient = dL_dK.sum()
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        self.variance.gradient = dL_dKdiag.sum()
+
+    def psi2(self, Z, variational_posterior):
+        ret = np.empty((Z.shape[0], Z.shape[0]), dtype=np.float64)
+        ret[:] = self.variance*self.variance*variational_posterior.shape[0]
+        return ret
+
+    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        self.variance.gradient = dL_dpsi0.sum() + dL_dpsi1.sum() + 2.*self.variance*dL_dpsi2.sum()*variational_posterior.shape[0]
+
+class Fixed(Static):
+    def __init__(self, input_dim, covariance_matrix, variance=1., active_dims=None, name='fixed'):
+        """
+        :param input_dim: the number of input dimensions
+        :type input_dim: int
+        :param variance: the variance of the kernel
+        :type variance: float
+        """
+        super(Fixed, self).__init__(input_dim, variance, active_dims, name)
+        self.fixed_K = covariance_matrix
+    def K(self, X, X2):
+        return self.variance * self.fixed_K
+
+    def Kdiag(self, X):
+        return self.variance * self.fixed_K.diag()
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        self.variance.gradient = np.einsum('ij,ij', dL_dK, self.fixed_K)
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        self.variance.gradient = np.einsum('i,i', dL_dKdiag, self.fixed_K)
+
+    def psi2(self, Z, variational_posterior):
+        return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)
+
+    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        self.variance.gradient = dL_dpsi0.sum()
+
--- a/GPy/kern/_src/stationary.py
+++ b/GPy/kern/_src/stationary.py
@ -0,0 +1,484 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+from kern import Kern
+from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp
+from ...util.linalg import tdot
+from ... import util
+import numpy as np
+from scipy import integrate, weave
+from ...util.config import config # for assesing whether to use weave
+from ...util.caching import Cache_this
+
+class Stationary(Kern):
+    """
+    Stationary kernels (covariance functions).
+
+    Stationary covariance fucntion depend only on r, where r is defined as
+
+      r = \sqrt{ \sum_{q=1}^Q (x_q - x'_q)^2 }
+
+    The covariance function k(x, x' can then be written k(r).
+
+    In this implementation, r is scaled by the lengthscales parameter(s):
+
+      r = \sqrt{ \sum_{q=1}^Q \frac{(x_q - x'_q)^2}{\ell_q^2} }.
+
+    By default, there's only one lengthscale: seaprate lengthscales for each
+    dimension can be enables by setting ARD=True.
+
+    To implement a stationary covariance function using this class, one need
+    only define the covariance function k(r), and it derivative.
+
+      ...
+      def K_of_r(self, r):
+          return foo
+      def dK_dr(self, r):
+          return bar
+
+    The lengthscale(s) and variance parameters are added to the structure automatically.
+
+    """
+
+    def __init__(self, input_dim, variance, lengthscale, ARD, active_dims, name, useGPU=False):
+        super(Stationary, self).__init__(input_dim, active_dims, name,useGPU=useGPU)
+        self.ARD = ARD
+        if not ARD:
+            if lengthscale is None:
+                lengthscale = np.ones(1)
+            else:
+                lengthscale = np.asarray(lengthscale)
+                assert lengthscale.size == 1, "Only 1 lengthscale needed for non-ARD kernel"
+        else:
+            if lengthscale is not None:
+                lengthscale = np.asarray(lengthscale)
+                assert lengthscale.size in [1, input_dim], "Bad number of lengthscales"
+                if lengthscale.size != input_dim:
+                    lengthscale = np.ones(input_dim)*lengthscale
+            else:
+                lengthscale = np.ones(self.input_dim)
+        self.lengthscale = Param('lengthscale', lengthscale, Logexp())
+        self.variance = Param('variance', variance, Logexp())
+        assert self.variance.size==1
+        self.link_parameters(self.variance, self.lengthscale)
+
+    def K_of_r(self, r):
+        raise NotImplementedError, "implement the covariance function as a fn of r to use this class"
+
+    def dK_dr(self, r):
+        raise NotImplementedError, "implement derivative of the covariance function wrt r to use this class"
+
+    @Cache_this(limit=5, ignore_args=())
+    def K(self, X, X2=None):
+        """
+        Kernel function applied on inputs X and X2.
+        In the stationary case there is an inner function depending on the
+        distances from X to X2, called r.
+
+        K(X, X2) = K_of_r((X-X2)**2)
+        """
+        r = self._scaled_dist(X, X2)
+        return self.K_of_r(r)
+
+    @Cache_this(limit=3, ignore_args=())
+    def dK_dr_via_X(self, X, X2):
+        #a convenience function, so we can cache dK_dr
+        return self.dK_dr(self._scaled_dist(X, X2))
+
+    def _unscaled_dist(self, X, X2=None):
+        """
+        Compute the Euclidean distance between each row of X and X2, or between
+        each pair of rows of X if X2 is None.
+        """
+        #X, = self._slice_X(X)
+        if X2 is None:
+            Xsq = np.sum(np.square(X),1)
+            r2 = -2.*tdot(X) + (Xsq[:,None] + Xsq[None,:])
+            util.diag.view(r2)[:,]= 0. # force diagnoal to be zero: sometime numerically a little negative
+            r2 = np.clip(r2, 0, np.inf)
+            return np.sqrt(r2)
+        else:
+            #X2, = self._slice_X(X2)
+            X1sq = np.sum(np.square(X),1)
+            X2sq = np.sum(np.square(X2),1)
+            r2 = -2.*np.dot(X, X2.T) + X1sq[:,None] + X2sq[None,:]
+            r2 = np.clip(r2, 0, np.inf)
+            return np.sqrt(r2)
+
+    @Cache_this(limit=5, ignore_args=())
+    def _scaled_dist(self, X, X2=None):
+        """
+        Efficiently compute the scaled distance, r.
+
+        r = \sqrt( \sum_{q=1}^Q (x_q - x'q)^2/l_q^2 )
+
+        Note that if thre is only one lengthscale, l comes outside the sum. In
+        this case we compute the unscaled distance first (in a separate
+        function for caching) and divide by lengthscale afterwards
+
+        """
+        if self.ARD:
+            if X2 is not None:
+                X2 = X2 / self.lengthscale
+            return self._unscaled_dist(X/self.lengthscale, X2)
+        else:
+            return self._unscaled_dist(X, X2)/self.lengthscale
+
+    def Kdiag(self, X):
+        ret = np.empty(X.shape[0])
+        ret[:] = self.variance
+        return ret
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        """
+        Given the derivative of the objective with respect to the diagonal of
+        the covariance matrix, compute the derivative wrt the parameters of
+        this kernel and stor in the <parameter>.gradient field.
+
+        See also update_gradients_full
+        """
+        self.variance.gradient = np.sum(dL_dKdiag)
+        self.lengthscale.gradient = 0.
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        """
+        Given the derivative of the objective wrt the covariance matrix
+        (dL_dK), compute the gradient wrt the parameters of this kernel,
+        and store in the parameters object as e.g. self.variance.gradient
+        """
+        self.variance.gradient = np.einsum('ij,ij,i', self.K(X, X2), dL_dK, 1./self.variance)
+
+        #now the lengthscale gradient(s)
+        dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
+        if self.ARD:
+            #rinv = self._inv_dis# this is rather high memory? Should we loop instead?t(X, X2)
+            #d =  X[:, None, :] - X2[None, :, :]
+            #x_xl3 = np.square(d)
+            #self.lengthscale.gradient = -((dL_dr*rinv)[:,:,None]*x_xl3).sum(0).sum(0)/self.lengthscale**3
+            tmp = dL_dr*self._inv_dist(X, X2)
+            if X2 is None: X2 = X
+            
+
+            if config.getboolean('weave', 'working'):
+                try:
+                    self.lengthscale.gradient = self.weave_lengthscale_grads(tmp, X, X2)
+                except:
+                    print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
+                    config.set('weave', 'working', 'False')
+                    self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in xrange(self.input_dim)])
+            else:
+                self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in xrange(self.input_dim)])
+        else:
+            r = self._scaled_dist(X, X2)
+            self.lengthscale.gradient = -np.sum(dL_dr*r)/self.lengthscale
+
+
+    def _inv_dist(self, X, X2=None):
+        """
+        Compute the elementwise inverse of the distance matrix, expecpt on the
+        diagonal, where we return zero (the distance on the diagonal is zero).
+        This term appears in derviatives.
+        """
+        dist = self._scaled_dist(X, X2).copy()
+        return 1./np.where(dist != 0., dist, np.inf)
+
+    def weave_lengthscale_grads(self, tmp, X, X2):
+        """Use scipy.weave to compute derivatives wrt the lengthscales"""
+        N,M = tmp.shape
+        Q = X.shape[1]
+        if hasattr(X, 'values'):X = X.values
+        if hasattr(X2, 'values'):X2 = X2.values
+        grads = np.zeros(self.input_dim)
+        code = """
+        double gradq;
+        for(int q=0; q<Q; q++){
+          gradq = 0;
+          for(int n=0; n<N; n++){
+            for(int m=0; m<M; m++){
+              gradq += tmp(n,m)*(X(n,q)-X2(m,q))*(X(n,q)-X2(m,q));
+            }
+          }
+          grads(q) = gradq;
+        }
+        """
+        weave.inline(code, ['tmp', 'X', 'X2', 'grads', 'N', 'M', 'Q'], type_converters=weave.converters.blitz, support_code="#include <math.h>")
+        return -grads/self.lengthscale**3
+
+    def gradients_X(self, dL_dK, X, X2=None):
+        """
+        Given the derivative of the objective wrt K (dL_dK), compute the derivative wrt X
+        """
+        if config.getboolean('weave', 'working'):
+            try:
+                return self.gradients_X_weave(dL_dK, X, X2)
+            except:
+                print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
+                config.set('weave', 'working', 'False')
+                return self.gradients_X_(dL_dK, X, X2)
+        else:
+            return self.gradients_X_(dL_dK, X, X2)
+
+    def gradients_X_(self, dL_dK, X, X2=None):
+        invdist = self._inv_dist(X, X2)
+        dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
+        tmp = invdist*dL_dr
+        if X2 is None:
+            tmp = tmp + tmp.T
+            X2 = X
+
+        #The high-memory numpy way:
+        #d =  X[:, None, :] - X2[None, :, :]
+        #ret = np.sum(tmp[:,:,None]*d,1)/self.lengthscale**2
+
+        #the lower memory way with a loop
+        ret = np.empty(X.shape, dtype=np.float64)
+        for q in xrange(self.input_dim):
+            np.sum(tmp*(X[:,q][:,None]-X2[:,q][None,:]), axis=1, out=ret[:,q])
+        ret /= self.lengthscale**2
+
+        return ret
+
+    def gradients_X_weave(self, dL_dK, X, X2=None):
+        """
+        Weave (inline C with OpenMP) implementation of the derivative of the
+        objective wrt X.
+
+        The C loop computes ret[n,d] = sum_m tmp[n,m] * (X[n,d] - X2[m,d]);
+        the result is divided by lengthscale**2 on return.
+
+        :param dL_dK: derivative of the objective wrt the covariance matrix
+        :param X: first set of inputs
+        :param X2: optional second set of inputs; when None, X is used for
+                   both arguments and tmp is symmetrised
+        :returns: array with the shape of X
+        """
+        invdist = self._inv_dist(X, X2)
+        dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
+        tmp = invdist*dL_dr
+        if X2 is None:
+            tmp = tmp + tmp.T
+            X2 = X
+
+        # The names used inside the C code (ret, N, D, M, tmp, X, X2) are bound
+        # from the Python locals of the same name in the weave.inline call below.
+        code = """
+        int n,m,d;
+        double retnd;
+        #pragma omp parallel for private(n,d, retnd, m)
+        for(d=0;d<D;d++){
+          for(n=0;n<N;n++){
+            retnd = 0.0;
+            for(m=0;m<M;m++){
+              retnd += tmp(n,m)*(X(n,d)-X2(m,d));
+            }
+            ret(n,d) = retnd;
+          }
+        }
+ 
+        """
+        if hasattr(X, 'values'):X = X.values #remove the GPy wrapping to make passing into weave safe
+        if hasattr(X2, 'values'):X2 = X2.values
+        ret = np.zeros(X.shape)
+        N,D = X.shape
+        # N is rebound here from tmp's first dimension; M is tmp's second dimension.
+        N,M = tmp.shape
+        from scipy import weave
+        support_code = """
+        #include <omp.h>
+        #include <stdio.h>
+        """
+        # Compiler/linker settings for building the inline code with OpenMP.
+        weave_options = {'headers'           : ['<omp.h>'],
+                         'extra_compile_args': ['-fopenmp -O3'], # -march=native'],
+                         'extra_link_args'   : ['-lgomp']}
+        weave.inline(code, ['ret', 'N', 'D', 'M', 'tmp', 'X', 'X2'], type_converters=weave.converters.blitz, support_code=support_code, **weave_options)
+        return ret/self.lengthscale**2
+    
+    def gradients_X_diag(self, dL_dKdiag, X):
+        return np.zeros(X.shape)
+
+    def input_sensitivity(self, summarize=True):
+        return np.ones(self.input_dim)/self.lengthscale**2
+
+class Exponential(Stationary):
+    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Exponential'):
+        super(Exponential, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
+
+    def K_of_r(self, r):
+        return self.variance * np.exp(-0.5 * r)
+
+    def dK_dr(self, r):
+        return -0.5*self.K_of_r(r)
+
+
+class OU(Stationary):
+    """
+    OU kernel:
+
+    .. math::
+
+       k(r) = \\sigma^2 \exp(- r) \\ \\ \\ \\  \\text{ where  } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} }
+
+    """
+
+    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='OU'):
+        super(OU, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
+
+    def K_of_r(self, r):
+        return self.variance * np.exp(-r)
+
+    def dK_dr(self,r):
+        return -1.*self.variance*np.exp(-r)
+
+
+class Matern32(Stationary):
+    """
+    Matern 3/2 kernel:
+
+    .. math::
+
+       k(r) = \\sigma^2 (1 + \\sqrt{3} r) \exp(- \sqrt{3} r) \\ \\ \\ \\  \\text{ where  } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} }
+
+    """
+
+    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Mat32'):
+        super(Matern32, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
+
+    def K_of_r(self, r):
+        return self.variance * (1. + np.sqrt(3.) * r) * np.exp(-np.sqrt(3.) * r)
+
+    def dK_dr(self,r):
+        return -3.*self.variance*r*np.exp(-np.sqrt(3.)*r)
+
+    def Gram_matrix(self, F, F1, F2, lower, upper):
+        """
+        Return the Gram matrix of the vector of functions F with respect to the
+        RKHS norm. The use of this function is limited to input_dim=1.
+
+        :param F: vector of functions
+        :type F: np.array
+        :param F1: vector of derivatives of F
+        :type F1: np.array
+        :param F2: vector of second derivatives of F
+        :type F2: np.array
+        :param lower,upper: boundaries of the input domain
+        :type lower,upper: floats
+        """
+        assert self.input_dim == 1
+        def L(x, i):
+            return(3. / self.lengthscale ** 2 * F[i](x) + 2 * np.sqrt(3) / self.lengthscale * F1[i](x) + F2[i](x))
+        n = F.shape[0]
+        G = np.zeros((n, n))
+        for i in range(n):
+            for j in range(i, n):
+                G[i, j] = G[j, i] = integrate.quad(lambda x : L(x, i) * L(x, j), lower, upper)[0]
+        Flower = np.array([f(lower) for f in F])[:, None]
+        F1lower = np.array([f(lower) for f in F1])[:, None]
+        return(self.lengthscale ** 3 / (12.*np.sqrt(3) * self.variance) * G + 1. / self.variance * np.dot(Flower, Flower.T) + self.lengthscale ** 2 / (3.*self.variance) * np.dot(F1lower, F1lower.T))
+
+
+class Matern52(Stationary):
+    """
+    Matern 5/2 kernel:
+
+    .. math::
+
+       k(r) = \sigma^2 (1 + \sqrt{5} r + \\frac53 r^2) \exp(- \sqrt{5} r)
+       """
+    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Mat52'):
+        super(Matern52, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
+
+    def K_of_r(self, r):
+        return self.variance*(1+np.sqrt(5.)*r+5./3*r**2)*np.exp(-np.sqrt(5.)*r)
+
+    def dK_dr(self, r):
+        return self.variance*(10./3*r -5.*r -5.*np.sqrt(5.)/3*r**2)*np.exp(-np.sqrt(5.)*r)
+
+    def Gram_matrix(self, F, F1, F2, F3, lower, upper):
+        """
+        Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to input_dim=1.
+
+        :param F: vector of functions
+        :type F: np.array
+        :param F1: vector of derivatives of F
+        :type F1: np.array
+        :param F2: vector of second derivatives of F
+        :type F2: np.array
+        :param F3: vector of third derivatives of F
+        :type F3: np.array
+        :param lower,upper: boundaries of the input domain
+        :type lower,upper: floats
+        """
+        assert self.input_dim == 1
+        def L(x,i):
+            return(5*np.sqrt(5)/self.lengthscale**3*F[i](x) + 15./self.lengthscale**2*F1[i](x)+ 3*np.sqrt(5)/self.lengthscale*F2[i](x) + F3[i](x))
+        n = F.shape[0]
+        G = np.zeros((n,n))
+        for i in range(n):
+            for j in range(i,n):
+                G[i,j] = G[j,i] = integrate.quad(lambda x : L(x,i)*L(x,j),lower,upper)[0]
+        G_coef = 3.*self.lengthscale**5/(400*np.sqrt(5))
+        Flower = np.array([f(lower) for f in F])[:,None]
+        F1lower = np.array([f(lower) for f in F1])[:,None]
+        F2lower = np.array([f(lower) for f in F2])[:,None]
+        orig = 9./8*np.dot(Flower,Flower.T) + 9.*self.lengthscale**4/200*np.dot(F2lower,F2lower.T)
+        orig2 = 3./5*self.lengthscale**2 * ( np.dot(F1lower,F1lower.T) + 1./8*np.dot(Flower,F2lower.T) + 1./8*np.dot(F2lower,Flower.T))
+        return(1./self.variance* (G_coef*G + orig + orig2))
+
+
+class ExpQuad(Stationary):
+    """
+    The Exponentiated quadratic covariance function.
+
+    .. math::
+
+       k(r) = \sigma^2 (1 + \sqrt{5} r + \\frac53 r^2) \exp(- \sqrt{5} r)
+
+    notes::
+     - Yes, this is exactly the same as the RBF covariance function, but the
+       RBF implementation also has some features for doing variational kernels
+       (the psi-statistics).
+
+    """
+    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='ExpQuad'):
+        super(ExpQuad, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
+
+    def K_of_r(self, r):
+        return self.variance * np.exp(-0.5 * r**2)
+
+    def dK_dr(self, r):
+        return -r*self.K_of_r(r)
+
+class Cosine(Stationary):
+    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Cosine'):
+        super(Cosine, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
+
+    def K_of_r(self, r):
+        return self.variance * np.cos(r)
+
+    def dK_dr(self, r):
+        return -self.variance * np.sin(r)
+
+
+class RatQuad(Stationary):
+    """
+    Rational Quadratic Kernel
+
+    .. math::
+
+       k(r) = \sigma^2 \\bigg( 1 + \\frac{r^2}{2} \\bigg)^{- \\alpha}
+
+    """
+
+
+    def __init__(self, input_dim, variance=1., lengthscale=None, power=2., ARD=False, active_dims=None, name='RatQuad'):
+        super(RatQuad, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
+        self.power = Param('power', power, Logexp())
+        self.link_parameters(self.power)
+
+    def K_of_r(self, r):
+        r2 = np.power(r, 2.)
+        return self.variance*np.power(1. + r2/2., -self.power)
+
+    def dK_dr(self, r):
+        r2 = np.power(r, 2.)
+        return -self.variance*self.power*r*np.power(1. + r2/2., - self.power - 1.)
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        super(RatQuad, self).update_gradients_full(dL_dK, X, X2)
+        r = self._scaled_dist(X, X2)
+        r2 = np.power(r, 2.)
+        dK_dpow = -self.variance * np.power(2., self.power) * np.power(r2 + 2., -self.power) * np.log(0.5*(r2+2.))
+        grad = np.sum(dL_dK*dK_dpow)
+        self.power.gradient = grad
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        super(RatQuad, self).update_gradients_diag(dL_dKdiag, X)
+        self.power.gradient = 0.
+
+
--- a/GPy/kern/_src/symbolic.py
+++ b/GPy/kern/_src/symbolic.py
@ -0,0 +1,75 @@
+# Check Matthew Rocklin's blog post.
+import sympy as sym
+import numpy as np
+from kern import Kern
+from ...core.symbolic import Symbolic_core
+
+
+class Symbolic(Kern, Symbolic_core):
+    """
+    """
+    def __init__(self, input_dim, k=None, output_dim=1, name='symbolic', parameters=None, active_dims=None, operators=None, func_modules=[]):
+
+        if k is None:
+            raise ValueError, "You must provide an argument for the covariance function."
+
+        Kern.__init__(self, input_dim, active_dims, name=name)
+        kdiag = k
+        self.cacheable = ['X', 'Z']
+        Symbolic_core.__init__(self, {'k':k,'kdiag':kdiag}, cacheable=self.cacheable, derivatives = ['X', 'theta'], parameters=parameters, func_modules=func_modules)        
+        self.output_dim = output_dim
+
+    def __add__(self,other):
+        return spkern(self._sym_k+other._sym_k)
+
+    def _set_expressions(self, expressions):
+        """This method is overwritten because we need to modify kdiag by substituting z for x. We do this by calling the parent expression method to extract variables from expressions, then subsitute the z variables that are present with x."""
+        Symbolic_core._set_expressions(self, expressions)
+        Symbolic_core._set_variables(self, self.cacheable)
+        # Substitute z with x to obtain kdiag.
+        for x, z in zip(self.variables['X'], self.variables['Z']):
+            expressions['kdiag'] = expressions['kdiag'].subs(z, x)
+        Symbolic_core._set_expressions(self, expressions)
+            
+        
+    def K(self,X,X2=None):
+        if X2 is None:
+            return self.eval_function('k', X=X, Z=X)
+        else:
+            return self.eval_function('k', X=X, Z=X2)
+
+
+    def Kdiag(self,X):
+        d = self.eval_function('kdiag', X=X)
+        if not d.shape[0] == X.shape[0]:
+            d = np.tile(d, (X.shape[0], 1))
+        return d
+
+
+    def gradients_X(self, dL_dK, X, X2=None):
+        #if self._X is None or X.base is not self._X.base or X2 is not None:
+        g = self.eval_gradients_X('k', dL_dK, X=X, Z=X2)
+        if X2 is None:
+            g *= 2
+        return g
+
+    def gradients_X_diag(self, dL_dK, X):
+        return self.eval_gradients_X('kdiag', dL_dK, X=X)
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        # Need to extract parameters to local variables first
+        if X2 is None:
+            # need to double this inside ...
+            gradients = self.eval_update_gradients('k', dL_dK, X=X)
+        else:
+            gradients = self.eval_update_gradients('k', dL_dK, X=X, Z=X2)
+
+        for name, val in gradients:
+            setattr(getattr(self, name), 'gradient', val)
+
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        gradients = self.eval_update_gradients('kdiag', dL_dKdiag, X)
+        for name, val in gradients:
+            setattr(getattr(self, name), 'gradient', val)
+
--- a/GPy/kern/_src/sympy_helpers.cpp
+++ b/GPy/kern/_src/sympy_helpers.cpp
@ -0,0 +1,196 @@
+#include "Python.h"
+#include <math.h>
+#include <float.h>
+#include <stdlib.h>
+#include <iostream>
+#include <stdexcept>
+double DiracDelta(double x){
+  // TODO: this doesn't seem to be a dirac delta ... should return infinity. Neil
+  // Returns 1.0 when x lies strictly inside (-1e-6, 1e-6), otherwise 0.0.
+  const double tol = 0.000001;
+  const bool near_zero = (x > -tol) && (x < tol);
+  return near_zero ? 1.0 : 0.0;
+}
+double DiracDelta(double x,int foo){
+  // Two-argument overload: both arguments are ignored and the result is always 0.
+  (void)x;
+  (void)foo;
+  return 0.0;
+}
+
+double sinc(double x){
+  // sinc(x) = sin(x)/x, with the value 1 substituted at x == 0.
+  return (x==0) ? 1.0 : sin(x)/x;
+}
+
+double sinc_grad(double x){
+  // Derivative of sinc: (x*cos(x) - sin(x))/x^2, with the value 0 substituted at x == 0.
+  if (x!=0)
+    return (x*cos(x) - sin(x))/(x*x);
+  return 0.0;
+}
+double erfcx(double x){
+  // Based on code by Soren Hauberg 2010 for Octave.
+  // compute the scaled complementary error function, erfcx(x) = exp(x*x)*erfc(x).
+  //return erfc(x)*exp(x*x);
+
+  // Below xneg the term 2*exp(x*x) no longer fits in a double.
+  double xneg=-sqrt(log(DBL_MAX/2));
+  // Above xmax the result is reported as 0.
+  double xmax = 1/(sqrt(M_PI)*DBL_MIN);
+  xmax = DBL_MAX<xmax ? DBL_MAX : xmax;
+
+  // Polynomial approximation evaluated at |x|; negative arguments are
+  // handled afterwards through erfcx(x) = 2*exp(x*x) - erfcx(-x).
+  double t = 3.97886080735226 / (fabs(x) + 3.97886080735226);
+  double u = t-0.5;
+  double y = (((((((((u * 0.00127109764952614092 + 1.19314022838340944e-4) * u
+                     - 0.003963850973605135)   * u - 8.70779635317295828e-4) * u
+                   + 0.00773672528313526668) * u + 0.00383335126264887303) * u
+                 - 0.0127223813782122755)  * u - 0.0133823644533460069)  * u
+               + 0.0161315329733252248)  * u + 0.0390976845588484035)  * u + 0.00249367200053503304;
+  y = ((((((((((((y * u - 0.0838864557023001992) * u -
+                 0.119463959964325415) * u + 0.0166207924969367356) * u +
+               0.357524274449531043) * u + 0.805276408752910567)  * u +
+             1.18902982909273333)  * u + 1.37040217682338167)   * u +
+           1.31314653831023098)  * u + 1.07925515155856677)   * u +
+         0.774368199119538609) * u + 0.490165080585318424)  * u +
+       0.275374741597376782) * t;
+
+  if (x<xneg)
+    // exp(x*x)*erfc(x) is strictly positive, so the overflow limit is +inf
+    // (the previous -INFINITY had the wrong sign).
+    return INFINITY;
+  else if (x<0)
+    return 2.0*exp(x*x)-y;
+  else if (x>xmax)
+    return 0.0;
+  else
+    return y;
+}
+
+double ln_diff_erf(double x0, double x1){
+  // stably compute the log of difference between two erfs, log(erf(x0) - erf(x1)).
+  // Requires x1 <= x0. If x1 > x0 a Python RuntimeError is set and the int 1 is thrown;
+  // if x0 == x1 a Python RuntimeWarning is issued and -INFINITY is returned.
+  if (x1>x0){
+    PyErr_SetString(PyExc_RuntimeError,"second argument must be smaller than or equal to first in ln_diff_erf");
+    throw 1;
+  }
+  if (x0==x1){
+    PyErr_WarnEx(PyExc_RuntimeWarning,"divide by zero encountered in log", 1);
+    return -INFINITY;
+  }
+  // From here on x1 < x0, so of the two sign tests below only x0>0 && x1<0 can hold.
+  else if(x0<0 && x1>0 || x0>0 && x1<0) //x0 and x1 have opposite signs
+    return log(erf(x0)-erf(x1));
+  // Same-sign cases are rewritten in terms of erfcx, with the exponential factored out of the log.
+  else if(x0>0) //x0 positive, x1 non-negative
+    return log(erfcx(x1)-erfcx(x0)*exp(x1*x1- x0*x0))-x1*x1; 
+  else //x0 and x1 non-positive
+    return log(erfcx(-x0)-erfcx(-x1)*exp(x0*x0 - x1*x1))-x0*x0;
+}
+// TODO: For all these computations of h things are very inefficient at the moment. Need to recode sympykern to allow the precomputations to take place and all the gradients to be computed in one function. Not sure of best way forward for that yet. Neil
+double h(double t, double tprime, double d_i, double d_j, double l){
+  // Compute the h function for the sim covariance.
+  // Both terms are assembled in log space (via ln_diff_erf) and exponentiated once.
+  double half_l_di = 0.5*l*d_i;
+  double arg_1 = half_l_di + tprime/l;
+  double arg_2 = half_l_di - (t-tprime)/l;
+  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
+  arg_2 = half_l_di - t/l;
+  // sign_val is the sign of t/l (0 when t/l is exactly 0); it multiplies both terms.
+  double sign_val = 1.0;
+  if(t/l==0)
+    sign_val = 0.0;
+  else if (t/l < 0)
+    sign_val = -1.0;
+  // NOTE(review): arg_2 already holds this value from the assignment above.
+  arg_2 = half_l_di - t/l;
+  double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
+  // if either ln_part_1 or ln_part_2 are -inf, don't bother computing rest of that term.
+  double part_1 = 0.0;
+  if(isfinite(ln_part_1))
+    part_1 = sign_val*exp(half_l_di*half_l_di - d_i*(t-tprime) + ln_part_1 - log(d_i + d_j));
+  double part_2 = 0.0;
+  if(isfinite(ln_part_2))
+    part_2 = sign_val*exp(half_l_di*half_l_di - d_i*t - d_j*tprime + ln_part_2 - log(d_i + d_j));
+  return part_1 - part_2;
+}
+
+
+double dh_dd_i(double t, double tprime, double d_i, double d_j, double l){
+  // compute gradient of h function with respect to d_i.
+  // h itself and both log-difference terms are recomputed here.
+  double diff_t = (t-tprime);
+  double l2 = l*l;
+  double hv = h(t, tprime, d_i, d_j, l);
+  double half_l_di = 0.5*l*d_i;
+  double arg_1 = half_l_di + tprime/l;
+  double arg_2 = half_l_di - (t-tprime)/l;
+  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
+  // NOTE(review): arg_1 and arg_2 are reassigned here but not read again below.
+  arg_1 = half_l_di;
+  arg_2 = half_l_di - t/l;
+  // sign_val is the sign of t/l (0 when t/l is exactly 0).
+  double sign_val = 1.0;
+  if(t/l==0)
+    sign_val = 0.0;
+  else if (t/l < 0)
+    sign_val = -1.0;
+  double ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l);
+  double base = (0.5*d_i*l2*(d_i+d_j)-1)*hv;
+  // Terms whose log part is not finite are skipped.
+  if(isfinite(ln_part_1))
+    base -= diff_t*sign_val*exp(half_l_di*half_l_di
+				-d_i*diff_t
+				+ln_part_1);
+  if(isfinite(ln_part_2))
+    base += t*sign_val*exp(half_l_di*half_l_di
+			   -d_i*t-d_j*tprime
+			   +ln_part_2);
+  base += l/sqrt(M_PI)*(-exp(-diff_t*diff_t/l2)
+			+exp(-tprime*tprime/l2-d_i*t)
+			+exp(-t*t/l2-d_j*tprime)
+			-exp(-(d_i*t + d_j*tprime)));
+  return base/(d_i+d_j);
+
+}
+
+double dh_dd_j(double t, double tprime, double d_i, double d_j, double l){
+  // compute gradient of h function with respect to d_j.
+  double half_l_di = 0.5*l*d_i;
+  double hv = h(t, tprime, d_i, d_j, l);
+  // sign_val is the sign of t/l (0 when t/l is exactly 0).
+  double sign_val = 1.0;
+  if(t/l==0)
+    sign_val = 0.0;
+  else if (t/l < 0)
+    sign_val = -1.0;
+  double ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l);
+  double base = -hv;
+  // The exponential term is skipped when its log part is not finite.
+  if(isfinite(ln_part_2))
+    base += tprime*sign_val*exp(half_l_di*half_l_di-(d_i*t+d_j*tprime)+ln_part_2);
+  return base/(d_i+d_j);
+}
+
+double dh_dl(double t, double tprime, double d_i, double d_j, double l){
+  // compute gradient of h function with respect to lengthscale for sim covariance
+  // TODO a lot of energy wasted recomputing things here, need to do this in a shared way somehow ... perhaps needs rewrite of sympykern.
+  double half_l_di = 0.5*l*d_i;
+  double arg_1 = half_l_di + tprime/l;
+  double arg_2 = half_l_di - (t-tprime)/l;
+  // NOTE(review): ln_part_1 and ln_part_2 are not used in the returned expression,
+  // but the ln_diff_erf calls still throw when their second argument exceeds the first.
+  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
+  arg_2 = half_l_di - t/l;
+  double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
+  double diff_t = t - tprime;
+  double l2 = l*l;
+  double hv = h(t, tprime, d_i, d_j, l);
+  return 0.5*d_i*d_i*l*hv + 2/(sqrt(M_PI)*(d_i+d_j))*((-diff_t/l2-d_i/2)*exp(-diff_t*diff_t/l2)+(-tprime/l2+d_i/2)*exp(-tprime*tprime/l2-d_i*t)-(-t/l2-d_i/2)*exp(-t*t/l2-d_j*tprime)-d_i/2*exp(-(d_i*t+d_j*tprime)));
+}
+
+double dh_dt(double t, double tprime, double d_i, double d_j, double l){
+  // compute gradient of h function with respect to t.
+  // Unlike h, the log parts are exponentiated here without an isfinite check or sign factor.
+  double diff_t = t - tprime;
+  double half_l_di = 0.5*l*d_i;
+  double arg_1 = half_l_di + tprime/l;
+  double arg_2 = half_l_di - diff_t/l;
+  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
+  arg_2 = half_l_di - t/l;
+  double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
+  
+  return (d_i*exp(ln_part_2-d_i*t - d_j*tprime) - d_i*exp(ln_part_1-d_i*diff_t) + 2*exp(-d_i*diff_t - pow(half_l_di - diff_t/l, 2))/(sqrt(M_PI)*l) - 2*exp(-d_i*t - d_j*tprime - pow(half_l_di - t/l,2))/(sqrt(M_PI)*l))*exp(half_l_di*half_l_di)/(d_i + d_j);
+}
+
+double dh_dtprime(double t, double tprime, double d_i, double d_j, double l){
+  // compute gradient of h function with respect to tprime.
+  // Unlike h, the log parts are exponentiated here without an isfinite check or sign factor.
+  double diff_t = t - tprime;
+  double half_l_di = 0.5*l*d_i;
+  double arg_1 = half_l_di + tprime/l;
+  double arg_2 = half_l_di - diff_t/l;
+  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
+  arg_2 = half_l_di - t/l;
+  double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
+
+  return (d_i*exp(ln_part_1-d_i*diff_t) + d_j*exp(ln_part_2-d_i*t - d_j*tprime) + (-2*exp(-pow(half_l_di - diff_t/l,2)) + 2*exp(-pow(half_l_di + tprime/l,2)))*exp(-d_i*diff_t)/(sqrt(M_PI)*l))*exp(half_l_di*half_l_di)/(d_i + d_j);
+}
--- a/GPy/kern/_src/sympy_helpers.h
+++ b/GPy/kern/_src/sympy_helpers.h
@ -0,0 +1,16 @@
+#ifndef SYMPY_HELPERS_H
+#define SYMPY_HELPERS_H
+
+#include <math.h>
+
+// Indicator-style stand-ins for the Dirac delta.
+double DiracDelta(double x);
+double DiracDelta(double x, int foo);
+
+// sinc(x) = sin(x)/x and its derivative.
+double sinc(double x);
+double sinc_grad(double x);
+
+// Scaled complementary error function and log(erf(x0) - erf(x1)).
+double erfcx(double x);
+double ln_diff_erf(double x0, double x1);
+
+// h function for the sim covariance and its partial derivatives.
+double h(double t, double tprime, double d_i, double d_j, double l);
+double dh_dl(double t, double tprime, double d_i, double d_j, double l);
+double dh_dd_i(double t, double tprime, double d_i, double d_j, double l);
+double dh_dd_j(double t, double tprime, double d_i, double d_j, double l);
+double dh_dt(double t, double tprime, double d_i, double d_j, double l);
+double dh_dtprime(double t, double tprime, double d_i, double d_j, double l);
+
+#endif // SYMPY_HELPERS_H
--- a/GPy/kern/_src/todo/ODE_1.py
+++ b/GPy/kern/_src/todo/ODE_1.py
@ -0,0 +1,165 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+from kernpart import Kernpart
+import numpy as np
+
+class ODE_1(Kernpart):
+    """
+    kernel resulting from a first order ODE with OU driving GP
+
+    :param input_dim: the number of input dimension, has to be equal to one
+    :type input_dim: int
+    :param varianceU: variance of the driving GP
+    :type varianceU: float
+    :param lengthscaleU: lengthscale of the driving GP  (sqrt(3)/lengthscaleU)
+    :type lengthscaleU: float
+    :param varianceY: 'variance' of the transfer function
+    :type varianceY: float
+    :param lengthscaleY: 'lengthscale' of the transfer function (1/lengthscaleY)
+    :type lengthscaleY: float
+    :rtype: kernel object
+
+    """
+    def __init__(self, input_dim=1, varianceU=1., varianceY=1., lengthscaleU=None, lengthscaleY=None):
+        assert input_dim==1, "Only defined for input_dim = 1"
+        self.input_dim = input_dim
+        self.num_params = 4
+        self.name = 'ODE_1'
+        # Each lengthscale defaults to 1 and must hold exactly one value.
+        if lengthscaleU is not None:
+            lengthscaleU = np.asarray(lengthscaleU)
+            assert lengthscaleU.size == 1, "lengthscaleU should be one dimensional"
+        else:
+            lengthscaleU = np.ones(1)
+        if lengthscaleY is not None:
+            lengthscaleY = np.asarray(lengthscaleY)
+            assert lengthscaleY.size == 1, "lengthscaleY should be one dimensional"
+        else:
+            lengthscaleY = np.ones(1)
+            #lengthscaleY = 0.5
+        # Parameter order: varianceU, varianceY, lengthscaleU, lengthscaleY.
+        self._set_params(np.hstack((varianceU, varianceY, lengthscaleU,lengthscaleY)))
+
+    def _get_params(self):
+        """return the value of the parameters."""
+        return np.hstack((self.varianceU,self.varianceY, self.lengthscaleU,self.lengthscaleY))
+
+    def _set_params(self, x):
+        """set the value of the parameters."""
+        assert x.size == self.num_params
+        self.varianceU = x[0]
+        self.varianceY = x[1]
+        self.lengthscaleU = x[2]
+        self.lengthscaleY = x[3]
+
+    def _get_param_names(self):
+        """return parameter names."""
+        return ['varianceU','varianceY', 'lengthscaleU', 'lengthscaleY']
+
+
+    def K(self, X, X2, target):
+        """Compute the covariance matrix between X and X2.
+
+        The result is added in place to ``target``; nothing is returned.
+        """
+        if X2 is None: X2 = X
+       # i1 = X[:,1]
+       # i2 = X2[:,1]
+       # X = X[:,0].reshape(-1,1)
+       # X2 = X2[:,0].reshape(-1,1)
+        dist = np.abs(X - X2.T)
+        
+        # Rates used by the formulas below, derived from the two lengthscales.
+        ly=1/self.lengthscaleY
+        lu=np.sqrt(3)/self.lengthscaleU
+        #ly=self.lengthscaleY
+        #lu=self.lengthscaleU
+
+        # NOTE(review): k2 divides by (ly-lu)**2, which is zero when ly == lu.
+        k1 = np.exp(-ly*dist)*(2*lu+ly)/(lu+ly)**2
+        k2 = (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2 
+        k3 = np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
+
+        np.add(self.varianceU*self.varianceY*(k1+k2+k3), target, target)
+
+    def Kdiag(self, X, target):
+        """Compute the diagonal of the covariance matrix associated to X.
+
+        These are the K formulas at zero distance; X itself is not read and
+        the value is added in place to ``target``.
+        """
+        ly=1/self.lengthscaleY
+        lu=np.sqrt(3)/self.lengthscaleU
+        #ly=self.lengthscaleY
+        #lu=self.lengthscaleU
+        
+        k1 = (2*lu+ly)/(lu+ly)**2
+        # The numerator of k2 cancels algebraically to zero.
+        k2 = (ly-2*lu + 2*lu-ly ) / (ly-lu)**2 
+        k3 = 1/(lu+ly) + (lu)/(lu+ly)**2 
+
+        np.add(self.varianceU*self.varianceY*(k1+k2+k3), target, target)
+
+    def _param_grad_helper(self, dL_dK, X, X2, target):
+        """derivative of the covariance matrix with respect to the parameters.
+
+        Each derivative is weighted by dL_dK, summed, and accumulated into
+        ``target`` in the order varianceU, varianceY, lengthscaleU, lengthscaleY.
+        """
+        if X2 is None: X2 = X
+        dist = np.abs(X - X2.T)
+
+        ly=1/self.lengthscaleY
+        lu=np.sqrt(3)/self.lengthscaleU
+        #ly=self.lengthscaleY
+        #lu=self.lengthscaleU
+
+        # The dk*theta1 terms are scaled by d(lu)/d(lengthscaleU) when target[2] is accumulated below.
+        dk1theta1 = np.exp(-ly*dist)*2*(-lu)/(lu+ly)**3
+        #c=np.sqrt(3)
+        #t1=c/lu
+        #t2=1/ly
+        #dk1theta1=np.exp(-dist*ly)*t2*( (2*c*t2+2*t1)/(c*t2+t1)**2 -2*(2*c*t2*t1+t1**2)/(c*t2+t1)**3   )
+        
+        dk2theta1 = 1*( 
+            np.exp(-lu*dist)*dist*(-ly+2*lu-lu*ly*dist+dist*lu**2)*(ly-lu)**(-2) + np.exp(-lu*dist)*(-2+ly*dist-2*dist*lu)*(ly-lu)**(-2) 
+            +np.exp(-dist*lu)*(ly-2*lu+ly*lu*dist-dist*lu**2)*2*(ly-lu)**(-3) 
+            +np.exp(-dist*ly)*2*(ly-lu)**(-2)
+            +np.exp(-dist*ly)*2*(2*lu-ly)*(ly-lu)**(-3)
+            )
+      
+        dk3theta1 = np.exp(-dist*lu)*(lu+ly)**(-2)*((2*lu+ly+dist*lu**2+lu*ly*dist)*(-dist-2/(lu+ly))+2+2*lu*dist+ly*dist)
+
+        dktheta1 = self.varianceU*self.varianceY*(dk1theta1+dk2theta1+dk3theta1)
+
+
+
+
+        # The dk*theta2 terms are scaled by d(ly)/d(lengthscaleY) when target[3] is accumulated below.
+        dk1theta2 = np.exp(-ly*dist) * ((lu+ly)**(-2)) * (  (-dist)*(2*lu+ly)  +  1  +  (-2)*(2*lu+ly)/(lu+ly)  )
+
+        dk2theta2 = 1*(
+            np.exp(-dist*lu)*(ly-lu)**(-2) * ( 1+lu*dist+(-2)*(ly-2*lu+lu*ly*dist-dist*lu**2)*(ly-lu)**(-1) )
+            +np.exp(-dist*ly)*(ly-lu)**(-2) * ( (-dist)*(2*lu-ly) -1+(2*lu-ly)*(-2)*(ly-lu)**(-1) )
+            )
+
+        dk3theta2 = np.exp(-dist*lu) * (-3*lu-ly-dist*lu**2-lu*ly*dist)/(lu+ly)**3
+
+        dktheta2 = self.varianceU*self.varianceY*(dk1theta2 + dk2theta2 +dk3theta2)
+
+
+
+        # Same k1 + k2 + k3 as in K, reused for both variance gradients.
+        k1 = np.exp(-ly*dist)*(2*lu+ly)/(lu+ly)**2
+        k2 = (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2 
+        k3 = np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
+        dkdvar = k1+k2+k3
+        
+        #target[0] dk dvarU
+        #target[1] dk dvarY
+        #target[2] dk d theta1
+        #target[3] dk d theta2 
+        target[0] += np.sum(self.varianceY*dkdvar * dL_dK)
+        target[1] += np.sum(self.varianceU*dkdvar * dL_dK)
+        target[2] += np.sum(dktheta1*(-np.sqrt(3)*self.lengthscaleU**(-2)) * dL_dK)
+        target[3] += np.sum(dktheta2*(-self.lengthscaleY**(-2)) * dL_dK)
+
+    # def dKdiag_dtheta(self, dL_dKdiag, X, target):
+    #     """derivative of the diagonal of the covariance matrix with respect to the parameters."""
+    #     # NB: derivative of diagonal elements wrt lengthscale is 0
+    #     target[0] += np.sum(dL_dKdiag)
+
+    # def dK_dX(self, dL_dK, X, X2, target):
+    #     """derivative of the covariance matrix with respect to X."""
+    #     if X2 is None: X2 = X
+    #     dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None]
+    #     ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
+    #     dK_dX = -np.transpose(self.variance * np.exp(-dist) * ddist_dX, (1, 0, 2))
+    #     target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
+
+    # def dKdiag_dX(self, dL_dKdiag, X, target):
+    #     pass
--- a/GPy/kern/_src/todo/eq_ode1.py
+++ b/GPy/kern/_src/todo/eq_ode1.py
@ -0,0 +1,556 @@
+# Copyright (c) 2013, GPy Authors, see AUTHORS.txt
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from kernpart import Kernpart
+import numpy as np
+from GPy.util.linalg import mdot, pdinv
+from GPy.util.ln_diff_erfs import ln_diff_erfs
+import pdb
+from scipy import weave
+
+class Eq_ode1(Kernpart):
+    """
+    Covariance function for first order differential equation driven by an exponentiated quadratic covariance.
+
+    The outputs of this kernel have the form
+
+    .. math::
+
+       \frac{\text{d}y_j}{\text{d}t} = \sum_{i=1}^R w_{j,i} f_i(t-\delta_j) +\sqrt{\kappa_j}g_j(t) - d_jy_j(t)
+
+    where :math:`R` is the rank of the system, :math:`w_{j,i}` is the sensitivity of the :math:`j`th output to the :math:`i`th latent function, :math:`d_j` is the decay rate of the :math:`j`th output and :math:`f_i(t)` and :math:`g_i(t)` are independent latent Gaussian processes governed by an exponentiated quadratic covariance.
+    
+    :param output_dim: number of outputs driven by latent function.
+    :type output_dim: int
+    :param W: sensitivities of each output to the latent driving function. 
+    :type W: ndarray (output_dim x rank).
+    :param rank: If rank is greater than 1 then there are assumed to be a total of rank latent forces independently driving the system, each with identical covariance.
+    :type rank: int
+    :param decay: decay rates for the first order system. 
+    :type decay: array of length output_dim.
+    :param delay: delay between latent force and output response.
+    :type delay: array of length output_dim.
+    :param kappa: diagonal term that allows each latent output to have an independent component to the response.
+    :type kappa: array of length output_dim.
+    
+    .. Note: see first order differential equation examples in GPy.examples.regression for some usage.
+    """
+    def __init__(self,output_dim, W=None, rank=1, kappa=None, lengthscale=1.0,  decay=None, delay=None):
+        self.rank = rank
+        self.input_dim = 1
+        self.name = 'eq_ode1'
+        self.output_dim = output_dim
+        self.lengthscale = lengthscale
+        self.num_params = self.output_dim*self.rank + 1 + (self.output_dim - 1)
+        if kappa is not None:
+            self.num_params+=self.output_dim
+        if delay is not None:
+            assert delay.shape==(self.output_dim-1,)
+            self.num_params+=self.output_dim-1
+        self.rank = rank
+        if W is None:
+            self.W = 0.5*np.random.randn(self.output_dim,self.rank)/np.sqrt(self.rank)
+        else:
+            assert W.shape==(self.output_dim,self.rank)
+            self.W = W
+        if decay is None:
+            self.decay = np.ones(self.output_dim-1)
+        if kappa is not None:
+            assert kappa.shape==(self.output_dim,)
+        self.kappa = kappa
+
+        self.delay = delay
+        self.is_normalized = True
+        self.is_stationary = False
+        self.gaussian_initial = False
+        self._set_params(self._get_params())
+        
+    def _get_params(self):
+        param_list = [self.W.flatten()]
+        if self.kappa is not None:
+            param_list.append(self.kappa)
+        param_list.append(self.decay)
+        if self.delay is not None:
+            param_list.append(self.delay)
+        param_list.append(self.lengthscale)
+        return np.hstack(param_list)
+
+    def _set_params(self,x):
+        assert x.size == self.num_params
+        end = self.output_dim*self.rank
+        self.W = x[:end].reshape(self.output_dim,self.rank)
+        start = end
+        self.B = np.dot(self.W,self.W.T)
+        if self.kappa is not None:
+            end+=self.output_dim
+            self.kappa = x[start:end]
+            self.B += np.diag(self.kappa)
+            start=end
+        end+=self.output_dim-1
+        self.decay = x[start:end]
+        start=end
+        if self.delay is not None:
+            end+=self.output_dim-1
+            self.delay = x[start:end]
+            start=end
+        end+=1
+        self.lengthscale = x[start]
+        self.sigma = np.sqrt(2)*self.lengthscale
+
+
+    def _get_param_names(self):
+        param_names = sum([['W%i_%i'%(i,j) for j in range(self.rank)] for i in range(self.output_dim)],[])
+        if self.kappa is not None:
+            param_names += ['kappa_%i'%i for i in range(self.output_dim)]
+        param_names += ['decay_%i'%i for i in range(1,self.output_dim)]
+        if self.delay is not None:
+            param_names += ['delay_%i'%i for i in 1+range(1,self.output_dim)]
+        param_names+= ['lengthscale'] 
+        return param_names
+
+    def K(self,X,X2,target):
+        
+        if X.shape[1] > 2:
+            raise ValueError('Input matrix for ode1 covariance should have at most two columns, one containing times, the other output indices')
+
+        self._K_computations(X, X2)
+        target += self._scale*self._K_dvar
+
+        if self.gaussian_initial:
+            # Add covariance associated with initial condition.
+            t1_mat = self._t[self._rorder, None]
+            t2_mat = self._t2[None, self._rorder2]
+            target+=self.initial_variance * np.exp(- self.decay * (t1_mat + t2_mat))
+
+    def Kdiag(self,index,target):
+        #target += np.diag(self.B)[np.asarray(index,dtype=np.int).flatten()]
+        pass
+    
+    def _param_grad_helper(self,dL_dK,X,X2,target):
+        
+        # First extract times and indices.
+        self._extract_t_indices(X, X2, dL_dK=dL_dK)
+        self._dK_ode_dtheta(target)
+        
+
+    def _dK_ode_dtheta(self, target):
+        """Do all the computations for the ode parts of the covariance function."""
+        t_ode = self._t[self._index>0]
+        dL_dK_ode = self._dL_dK[self._index>0, :]
+        index_ode = self._index[self._index>0]-1
+        if self._t2 is None:
+            if t_ode.size==0:
+                return        
+            t2_ode = t_ode
+            dL_dK_ode = dL_dK_ode[:, self._index>0]
+            index2_ode = index_ode
+        else:
+            t2_ode = self._t2[self._index2>0]
+            dL_dK_ode = dL_dK_ode[:, self._index2>0]
+            if t_ode.size==0 or t2_ode.size==0:
+                return
+            index2_ode = self._index2[self._index2>0]-1
+
+        h1 = self._compute_H(t_ode, index_ode, t2_ode, index2_ode, stationary=self.is_stationary, update_derivatives=True)
+        #self._dK_ddelay = self._dh_ddelay
+        self._dK_dsigma = self._dh_dsigma
+
+        if self._t2 is None:
+            h2 = h1
+        else:
+            h2 = self._compute_H(t2_ode, index2_ode, t_ode, index_ode, stationary=self.is_stationary, update_derivatives=True)
+
+        #self._dK_ddelay += self._dh_ddelay.T
+        self._dK_dsigma += self._dh_dsigma.T
+        # C1 = self.sensitivity
+        # C2 = self.sensitivity
+
+        # K = 0.5 * (h1 + h2.T)
+        # var2 = C1*C2
+        # if self.is_normalized:
+        #     dk_dD1 = (sum(sum(dL_dK.*dh1_dD1)) + sum(sum(dL_dK.*dh2_dD1.T)))*0.5*var2
+        #     dk_dD2 = (sum(sum(dL_dK.*dh1_dD2)) + sum(sum(dL_dK.*dh2_dD2.T)))*0.5*var2
+        #     dk_dsigma = 0.5 * var2 * sum(sum(dL_dK.*dK_dsigma))
+        #     dk_dC1 = C2 * sum(sum(dL_dK.*K))
+        #     dk_dC2 = C1 * sum(sum(dL_dK.*K))
+        # else:
+        #     K = np.sqrt(np.pi) * K
+        #     dk_dD1 = (sum(sum(dL_dK.*dh1_dD1)) + * sum(sum(dL_dK.*K))
+        #     dk_dC2 = self.sigma * C1 * sum(sum(dL_dK.*K))
+
+
+        # dk_dSim1Variance = dk_dC1
+        # Last element is the length scale.
+        (dL_dK_ode[:, :, None]*self._dh_ddelay[:, None, :]).sum(2)
+
+        target[-1] += (dL_dK_ode*self._dK_dsigma/np.sqrt(2)).sum()
+
+
+        # # only pass the gradient with respect to the inverse width to one
+        # # of the gradient vectors ... otherwise it is counted twice.
+        # g1 = real([dk_dD1 dk_dinvWidth dk_dSim1Variance])
+        # g2 = real([dk_dD2 0 dk_dSim2Variance])
+        # return g1, g2"""
+
+    def dKdiag_dtheta(self,dL_dKdiag,index,target):
+        pass
+
+    def gradients_X(self,dL_dK,X,X2,target):
+        pass
+
+    def _extract_t_indices(self, X, X2=None, dL_dK=None):
+        """Extract times and output indices from the input matrix X. Times are ordered according to their index for convenience of computation, this ordering is stored in self._order and self.order2. These orderings are then mapped back to the original ordering (in X) using self._rorder and self._rorder2. """
+
+        # TODO: some fast checking here to see if this needs recomputing?
+        self._t = X[:, 0]
+        if not X.shape[1] == 2:
+            raise ValueError('Input matrix for ode1 covariance should have two columns, one containing times, the other output indices')
+        self._index = np.asarray(X[:, 1],dtype=np.int)
+        # Sort indices so that outputs are in blocks for computational
+        # convenience.
+        self._order = self._index.argsort()
+        self._index = self._index[self._order]
+        self._t = self._t[self._order]
+        self._rorder = self._order.argsort() # rorder is for reversing the order
+        
+        if X2 is None:
+            self._t2 = None
+            self._index2 = None
+            self._order2 = self._order
+            self._rorder2 = self._rorder
+        else:
+            if not X2.shape[1] == 2:
+                raise ValueError('Input matrix for ode1 covariance should have two columns, one containing times, the other output indices')
+            self._t2 = X2[:, 0]
+            self._index2 = np.asarray(X2[:, 1],dtype=np.int)
+            self._order2 = self._index2.argsort()
+            self._index2 = self._index2[self._order2]
+            self._t2 = self._t2[self._order2]
+            self._rorder2 = self._order2.argsort() # rorder2 is for reversing order
+
+        if dL_dK is not None:
+            self._dL_dK = dL_dK[self._order, :]
+            self._dL_dK = self._dL_dK[:, self._order2]
+            
+    def _K_computations(self, X, X2):
+        """Perform main body of computations for the ode1 covariance function."""
+        # First extract times and indices.
+        self._extract_t_indices(X, X2)
+
+        self._K_compute_eq()
+        self._K_compute_ode_eq()
+        if X2 is None:
+            self._K_eq_ode = self._K_ode_eq.T
+        else:
+            self._K_compute_ode_eq(transpose=True)
+        self._K_compute_ode()
+
+        if X2 is None:
+            self._K_dvar = np.zeros((self._t.shape[0], self._t.shape[0]))
+        else:
+            self._K_dvar = np.zeros((self._t.shape[0], self._t2.shape[0]))
+
+        # Reorder values of blocks for placing back into _K_dvar.
+        self._K_dvar = np.vstack((np.hstack((self._K_eq, self._K_eq_ode)),
+                                                   np.hstack((self._K_ode_eq, self._K_ode))))
+        self._K_dvar = self._K_dvar[self._rorder, :]
+        self._K_dvar = self._K_dvar[:, self._rorder2]
+        
+        
+        if X2 is None:
+            # Matrix giving scales of each output
+            self._scale = np.zeros((self._t.size, self._t.size))
+            code="""
+            for(int i=0;i<N; i++){
+              scale_mat[i+i*N] = B[index[i]+output_dim*(index[i])];
+              for(int j=0; j<i; j++){
+                  scale_mat[j+i*N] = B[index[i]+output_dim*index[j]];
+                  scale_mat[i+j*N] = scale_mat[j+i*N];
+                }
+              }
+            """
+            scale_mat, B, index = self._scale, self.B, self._index
+            N, output_dim = self._t.size, self.output_dim
+            weave.inline(code,['index',
+                               'scale_mat', 'B',
+                               'N', 'output_dim'])
+        else:
+            self._scale = np.zeros((self._t.size, self._t2.size))
+            code = """
+            for(int i=0; i<N; i++){
+              for(int j=0; j<N2; j++){
+                scale_mat[i+j*N] = B[index[i]+output_dim*index2[j]];
+              }
+            }
+            """
+            scale_mat, B, index, index2 = self._scale, self.B, self._index, self._index2
+            N, N2, output_dim = self._t.size, self._t2.size, self.output_dim
+            weave.inline(code, ['index', 'index2',
+                                'scale_mat', 'B',
+                                'N', 'N2', 'output_dim'])
+
+
+
+    def _K_compute_eq(self):
+        """Compute covariance for latent covariance."""
+        t_eq = self._t[self._index==0]
+        if self._t2 is None:
+            if t_eq.size==0:
+                self._K_eq = np.zeros((0, 0))
+                return
+            self._dist2 = np.square(t_eq[:, None] - t_eq[None, :])
+        else:
+            t2_eq = self._t2[self._index2==0]
+            if t_eq.size==0 or t2_eq.size==0:
+                self._K_eq = np.zeros((t_eq.size, t2_eq.size))
+                return
+            self._dist2 = np.square(t_eq[:, None] - t2_eq[None, :])
+        
+        self._K_eq = np.exp(-self._dist2/(2*self.lengthscale*self.lengthscale))
+        if self.is_normalized:
+            self._K_eq/=(np.sqrt(2*np.pi)*self.lengthscale)
+
+    def _K_compute_ode_eq(self, transpose=False):
+        """Compute the cross covariances between latent exponentiated quadratic and observed ordinary differential equations.
+
+        :param transpose: if set to false the exponentiated quadratic is on the rows of the matrix and is computed according to self._t, if set to true it is on the columns and is computed according to self._t2 (default=False).
+        :type transpose: bool"""
+
+        if self._t2 is not None:
+            if transpose:
+                t_eq = self._t[self._index==0]
+                t_ode = self._t2[self._index2>0]
+                index_ode = self._index2[self._index2>0]-1
+            else:
+                t_eq = self._t2[self._index2==0]
+                t_ode = self._t[self._index>0]
+                index_ode = self._index[self._index>0]-1
+        else:
+            t_eq = self._t[self._index==0]
+            t_ode = self._t[self._index>0]
+            index_ode = self._index[self._index>0]-1
+
+        if t_ode.size==0 or t_eq.size==0:
+            if transpose:
+                self._K_eq_ode = np.zeros((t_eq.shape[0], t_ode.shape[0]))
+            else:
+                self._K_ode_eq = np.zeros((t_ode.shape[0], t_eq.shape[0]))
+            return
+
+        t_ode_mat = t_ode[:, None]
+        t_eq_mat = t_eq[None, :]
+        if self.delay is not None:
+            t_ode_mat -= self.delay[index_ode, None]
+        diff_t = (t_ode_mat - t_eq_mat)
+
+        inv_sigma_diff_t = 1./self.sigma*diff_t
+        decay_vals = self.decay[index_ode][:, None]
+        half_sigma_d_i = 0.5*self.sigma*decay_vals
+
+        if self.is_stationary:
+            ln_part, signs = ln_diff_erfs(inf, half_sigma_d_i - inv_sigma_diff_t, return_sign=True)
+        else:
+            ln_part, signs = ln_diff_erfs(half_sigma_d_i + t_eq_mat/self.sigma, half_sigma_d_i - inv_sigma_diff_t, return_sign=True)
+        sK = signs*np.exp(half_sigma_d_i*half_sigma_d_i - decay_vals*diff_t + ln_part)
+
+        sK *= 0.5
+
+        if not self.is_normalized:
+            sK *= np.sqrt(np.pi)*self.sigma
+
+
+        if transpose:
+            self._K_eq_ode = sK.T
+        else:
+            self._K_ode_eq = sK
+        
+    def _K_compute_ode(self):
+        # Compute covariances between outputs of the ODE models.
+
+        t_ode = self._t[self._index>0]
+        index_ode = self._index[self._index>0]-1
+        if self._t2 is None:
+            if t_ode.size==0:
+                self._K_ode = np.zeros((0, 0))
+                return        
+            t2_ode = t_ode
+            index2_ode = index_ode
+        else:
+            t2_ode = self._t2[self._index2>0]
+            if t_ode.size==0 or t2_ode.size==0:
+                self._K_ode = np.zeros((t_ode.size, t2_ode.size))
+                return
+            index2_ode = self._index2[self._index2>0]-1
+        
+        # When index is identical
+        h = self._compute_H(t_ode, index_ode, t2_ode, index2_ode, stationary=self.is_stationary)
+
+        if self._t2 is None:
+            self._K_ode = 0.5 * (h + h.T)
+        else:
+            h2 = self._compute_H(t2_ode, index2_ode, t_ode, index_ode, stationary=self.is_stationary)                
+            self._K_ode = 0.5 * (h + h2.T)
+
+        if not self.is_normalized:
+            self._K_ode *= np.sqrt(np.pi)*self.sigma
+    def _compute_diag_H(self, t, index, update_derivatives=False, stationary=False):
+        """Helper function for computing H for the diagonal only.
+
+        Not implemented: the body below is a disabled draft kept inside a
+        string literal, followed by ``pass``, so the method returns None.
+
+        :param t: time input.
+        :type t: array
+        :param index: output indices
+        :type index: array of int.
+        :param update_derivatives: whether or not to update the derivative portions (default False).
+        :type update_derivatives: bool
+        :param stationary: whether to compute the stationary version of the covariance (default False).
+        :type stationary: bool"""
+
+        # The string below is never executed. It refers to names (delta_i,
+        # D_j, simComputeH, log) that are not defined in this method.
+        """if delta_i~=delta_j:
+            [h, dh_dD_i, dh_dD_j, dh_dsigma] = np.diag(simComputeH(t, index, t, index, update_derivatives=True, stationary=self.is_stationary))
+        else:
+            Decay = self.decay[index]
+            if self.delay is not None:
+                t = t - self.delay[index]
+            
+            t_squared = t*t
+            half_sigma_decay = 0.5*self.sigma*Decay
+            [ln_part_1, sign1] = ln_diff_erfs(half_sigma_decay + t/self.sigma,
+                                              half_sigma_decay)
+    
+            [ln_part_2, sign2] = ln_diff_erfs(half_sigma_decay,
+                                              half_sigma_decay - t/self.sigma)
+            
+            h = (sign1*np.exp(half_sigma_decay*half_sigma_decay
+                             + ln_part_1
+                             - log(Decay + D_j)) 
+                 - sign2*np.exp(half_sigma_decay*half_sigma_decay
+                                - (Decay + D_j)*t
+                                + ln_part_2 
+                                - log(Decay + D_j)))
+    
+            sigma2 = self.sigma*self.sigma
+
+        if update_derivatives:
+        
+            dh_dD_i = ((0.5*Decay*sigma2*(Decay + D_j)-1)*h 
+                       + t*sign2*np.exp(
+                half_sigma_decay*half_sigma_decay-(Decay+D_j)*t + ln_part_2
+                )
+                       + self.sigma/np.sqrt(np.pi)*
+                       (-1 + np.exp(-t_squared/sigma2-Decay*t)
+                        + np.exp(-t_squared/sigma2-D_j*t)
+                        - np.exp(-(Decay + D_j)*t)))
+        
+            dh_dD_i = (dh_dD_i/(Decay+D_j)).real
+        
+        
+        
+            dh_dD_j = (t*sign2*np.exp(
+                half_sigma_decay*half_sigma_decay-(Decay + D_j)*t+ln_part_2
+                )
+                       -h)
+            dh_dD_j = (dh_dD_j/(Decay + D_j)).real
+
+            dh_dsigma = 0.5*Decay*Decay*self.sigma*h \
+                        + 2/(np.sqrt(np.pi)*(Decay+D_j))\
+                        *((-Decay/2) \
+                          + (-t/sigma2+Decay/2)*np.exp(-t_squared/sigma2 - Decay*t) \
+                          - (-t/sigma2-Decay/2)*np.exp(-t_squared/sigma2 - D_j*t) \
+                          - Decay/2*np.exp(-(Decay+D_j)*t))"""
+        pass
+    
+    def _compute_H(self, t, index, t2, index2, update_derivatives=False, stationary=False):
+        """Helper function for computing part of the ode1 covariance function.
+
+        :param t: first time input.
+        :type t: array
+        :param index: Indices of first output.
+        :type index: array of int
+        :param t2: second time input.
+        :type t2: array
+        :param index2: Indices of second output.
+        :type index2: array of int
+        :param update_derivatives: whether to update derivatives (default is False)
+        :return h : result of this subcomponent of the kernel for the given values.
+        :rtype: ndarray
+"""
+
+        if stationary:
+            raise NotImplementedError, "Error, stationary version of this covariance not yet implemented."
+        # Vector of decays and delays associated with each output.
+        Decay = self.decay[index]
+        Decay2 = self.decay[index2]
+        t_mat = t[:, None]
+        t2_mat = t2[None, :]
+        if self.delay is not None:
+            Delay = self.delay[index]
+            Delay2 = self.delay[index2]
+            t_mat-=Delay[:, None]
+            t2_mat-=Delay2[None, :]
+
+        diff_t = (t_mat - t2_mat)
+        inv_sigma_diff_t = 1./self.sigma*diff_t
+        half_sigma_decay_i = 0.5*self.sigma*Decay[:, None]
+
+        ln_part_1, sign1 = ln_diff_erfs(half_sigma_decay_i + t2_mat/self.sigma, 
+                                        half_sigma_decay_i - inv_sigma_diff_t,
+                                        return_sign=True)
+        ln_part_2, sign2 = ln_diff_erfs(half_sigma_decay_i,
+                                        half_sigma_decay_i - t_mat/self.sigma,
+                                        return_sign=True)
+
+        h = sign1*np.exp(half_sigma_decay_i
+                         *half_sigma_decay_i
+                         -Decay[:, None]*diff_t+ln_part_1
+                         -np.log(Decay[:, None] + Decay2[None, :]))
+        h -= sign2*np.exp(half_sigma_decay_i*half_sigma_decay_i
+                          -Decay[:, None]*t_mat-Decay2[None, :]*t2_mat+ln_part_2
+                          -np.log(Decay[:, None] + Decay2[None, :]))
+
+        if update_derivatives:
+            sigma2 = self.sigma*self.sigma
+            # Update ith decay gradient
+
+            dh_ddecay = ((0.5*Decay[:, None]*sigma2*(Decay[:, None] + Decay2[None, :])-1)*h
+                         + (-diff_t*sign1*np.exp(
+                half_sigma_decay_i*half_sigma_decay_i-Decay[:, None]*diff_t+ln_part_1
+                )
+                            +t_mat*sign2*np.exp(
+                half_sigma_decay_i*half_sigma_decay_i-Decay[:, None]*t_mat
+                - Decay2*t2_mat+ln_part_2))
+                         +self.sigma/np.sqrt(np.pi)*(
+                -np.exp(
+                -diff_t*diff_t/sigma2
+                )+np.exp(
+                -t2_mat*t2_mat/sigma2-Decay[:, None]*t_mat
+                )+np.exp(
+                -t_mat*t_mat/sigma2-Decay2[None, :]*t2_mat
+                )-np.exp(
+                -(Decay[:, None]*t_mat + Decay2[None, :]*t2_mat)
+                )
+                ))
+            self._dh_ddecay = (dh_ddecay/(Decay[:, None]+Decay2[None, :])).real
+            
+            # Update jth decay gradient
+            dh_ddecay2 = (t2_mat*sign2
+                         *np.exp(
+                half_sigma_decay_i*half_sigma_decay_i
+                -(Decay[:, None]*t_mat + Decay2[None, :]*t2_mat)
+                +ln_part_2
+                )
+                         -h)
+            self._dh_ddecay2 = (dh_ddecay/(Decay[:, None] + Decay2[None, :])).real
+            
+            # Update sigma gradient
+            self._dh_dsigma = (half_sigma_decay_i*Decay[:, None]*h
+                               + 2/(np.sqrt(np.pi)
+                                    *(Decay[:, None]+Decay2[None, :]))
+                               *((-diff_t/sigma2-Decay[:, None]/2)
+                                 *np.exp(-diff_t*diff_t/sigma2)
+                                 + (-t2_mat/sigma2+Decay[:, None]/2)
+                                 *np.exp(-t2_mat*t2_mat/sigma2-Decay[:, None]*t_mat) 
+                                 - (-t_mat/sigma2-Decay[:, None]/2) 
+                                 *np.exp(-t_mat*t_mat/sigma2-Decay2[None, :]*t2_mat) 
+                                 - Decay[:, None]/2
+                                 *np.exp(-(Decay[:, None]*t_mat+Decay2[None, :]*t2_mat))))
+                
+        return h
--- a/GPy/kern/_src/todo/finite_dimensional.py
+++ b/GPy/kern/_src/todo/finite_dimensional.py
@ -0,0 +1,74 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+from kernpart import Kernpart
+import numpy as np
+from ...util.linalg import pdinv,mdot
+
+class FiniteDimensional(Kernpart):
+    def __init__(self, input_dim, F, G, variance=1., weights=None):
+        """
+        Argumnents
+        ----------
+        input_dim: int - the number of input dimensions
+        F: np.array of functions with shape (n,) - the n basis functions
+        G: np.array with shape (n,n) - the Gram matrix associated to F
+        weights : np.ndarray with shape (n,)
+        """
+        self.input_dim = input_dim
+        self.F = F
+        self.G = G
+        self.G_1 ,L,Li,logdet = pdinv(G)
+        self.n = F.shape[0]
+        if weights is not None:
+            assert weights.shape==(self.n,)
+        else:
+            weights = np.ones(self.n)
+        self.num_params = self.n + 1
+        self.name = 'finite_dim'
+        self._set_params(np.hstack((variance,weights)))
+
+    def _get_params(self):
+        return np.hstack((self.variance,self.weights))
+    def _set_params(self,x):
+        assert x.size == (self.num_params)
+        self.variance = x[0]
+        self.weights = x[1:]
+    def _get_param_names(self):
+        if self.n==1:
+            return ['variance','weight']
+        else:
+            return ['variance']+['weight_%i'%i for i in range(self.weights.size)]
+
+    def K(self,X,X2,target):
+        if X2 is None: X2 = X
+        FX = np.column_stack([f(X) for f in self.F])
+        FX2 = np.column_stack([f(X2) for f in self.F])
+        product = self.variance * mdot(FX,np.diag(np.sqrt(self.weights)),self.G_1,np.diag(np.sqrt(self.weights)),FX2.T)
+        np.add(product,target,target)
+    def Kdiag(self,X,target):
+        product = np.diag(self.K(X, X))
+        np.add(target,product,target)
+    def _param_grad_helper(self,X,X2,target):
+        """Return shape is NxMx(Ntheta)"""
+        if X2 is None: X2 = X
+        FX = np.column_stack([f(X) for f in self.F])
+        FX2 = np.column_stack([f(X2) for f in self.F])
+        DER = np.zeros((self.n,self.n,self.n))
+        for i in range(self.n):
+            DER[i,i,i] = np.sqrt(self.weights[i])
+        dw = self.variance * mdot(FX,DER,self.G_1,np.diag(np.sqrt(self.weights)),FX2.T)
+        dv = mdot(FX,np.diag(np.sqrt(self.weights)),self.G_1,np.diag(np.sqrt(self.weights)),FX2.T)
+        np.add(target[:,:,0],np.transpose(dv,(0,2,1)), target[:,:,0])
+        np.add(target[:,:,1:],np.transpose(dw,(0,2,1)), target[:,:,1:])
+    def dKdiag_dtheta(self,X,target):
+        np.add(target[:,0],1.,target[:,0])
+
+
+
+
+
+
+
+
--- a/GPy/kern/_src/todo/fixed.py
+++ b/GPy/kern/_src/todo/fixed.py
@ -0,0 +1,41 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from kernpart import Kernpart
+import numpy as np
+
+class Fixed(Kernpart):
+    def __init__(self, input_dim, K, variance=1.):
+        """
+        :param input_dim: the number of input dimensions
+        :type input_dim: int
+        :param variance: the variance of the kernel
+        :type variance: float
+        """
+        self.input_dim = input_dim
+        self.fixed_K = K
+        self.num_params = 1
+        self.name = 'fixed'
+        self._set_params(np.array([variance]).flatten())
+
+    def _get_params(self):
+        return self.variance
+
+    def _set_params(self, x):
+        assert x.shape == (1,)
+        self.variance = x
+
+    def _get_param_names(self):
+        return ['variance']
+
+    def K(self, X, X2, target):
+        target += self.variance * self.fixed_K
+
+    def _param_grad_helper(self, partial, X, X2, target):
+        target += (partial * self.fixed_K).sum()
+
+    def gradients_X(self, partial, X, X2, target):
+        pass
+
+    def dKdiag_dX(self, partial, X, target):
+        pass
--- a/GPy/kern/_src/todo/gibbs.py
+++ b/GPy/kern/_src/todo/gibbs.py
@ -0,0 +1,154 @@
+# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from kernpart import Kernpart
+import numpy as np
+from ...util.linalg import tdot
+from ...core.mapping import Mapping
+import GPy
+
+class Gibbs(Kernpart):
+    """
+    Gibbs non-stationary covariance function. 
+
+    .. math::
+       
+       r = sqrt((x_i - x_j)'*(x_i - x_j))
+       
+       k(x_i, x_j) = \sigma^2*Z*exp(-r^2/(l(x)*l(x) + l(x')*l(x')))
+
+       Z = (2*l(x)*l(x')/(l(x)*l(x) + l(x')*l(x')))^{q/2}
+
+       where :math:`l(x)` is a function giving the length scale as a function of space and :math:`q` is the dimensionality of the input space.
+       This is the non stationary kernel proposed by Mark Gibbs in his 1997
+        thesis. It is similar to an RBF but has a length scale that varies
+        with input location. This leads to an additional term in front of
+        the kernel.
+
+        The parameters are :math:`\sigma^2`, the process variance, and
+        the parameters of l(x) which is a function that can be
+        specified by the user, by default a multi-layer perceptron is
+        used.
+
+        :param input_dim: the number of input dimensions
+        :type input_dim: int 
+        :param variance: the variance :math:`\sigma^2`
+        :type variance: float
+        :param mapping: the mapping that gives the lengthscale across the input space (by default GPy.mappings.MLP is used with 20 hidden nodes).
+        :type mapping: GPy.core.Mapping
+        :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter \sigma^2_w), otherwise there is one weight variance parameter per dimension.
+        :type ARD: Boolean
+        :rtype: Kernpart object
+
+    See Mark Gibbs's thesis for more details: Gibbs,
+    M. N. (1997). Bayesian Gaussian Processes for Regression and
+    Classification. PhD thesis, Department of Physics, University of
+    Cambridge. Or also see Page 93 of Gaussian Processes for Machine
+    Learning by Rasmussen and Williams. Although note that we do not
+    constrain the lengthscale to be positive by default. This allows
+    anticorrelation to occur. The positive constraint can be included
+    by the user manually.
+
+    """
+
+    def __init__(self, input_dim, variance=1., mapping=None, ARD=False):
+        self.input_dim = input_dim
+        self.ARD = ARD
+        if not mapping:
+            mapping = GPy.mappings.MLP(output_dim=1, hidden_dim=20, input_dim=input_dim)
+        if not ARD:
+            self.num_params=1+mapping.num_params
+        else:
+            raise NotImplementedError
+
+        self.mapping = mapping
+        self.name='gibbs'
+        self._set_params(np.hstack((variance, self.mapping._get_params())))
+
+    def _get_params(self):
+        return np.hstack((self.variance, self.mapping._get_params()))
+
+    def _set_params(self, x):
+        assert x.size == (self.num_params)
+        self.variance = x[0]
+        self.mapping._set_params(x[1:])
+
+    def _get_param_names(self):
+        return ['variance'] + self.mapping._get_param_names()
+
+    def K(self, X, X2, target):
+        """Return covariance between X and X2."""
+        self._K_computations(X, X2)
+        target += self.variance*self._K_dvar
+
+    def Kdiag(self, X, target):
+        """Compute the diagonal of the covariance matrix for X."""
+        np.add(target, self.variance, target)
+
+    def _param_grad_helper(self, dL_dK, X, X2, target):
+        """Derivative of the covariance with respect to the parameters."""
+        self._K_computations(X, X2)
+        self._dK_computations(dL_dK)
+        if X2==None:
+            gmapping = self.mapping.df_dtheta(2*self._dL_dl[:, None], X)
+        else:
+            gmapping = self.mapping.df_dtheta(self._dL_dl[:, None], X)
+            gmapping += self.mapping.df_dtheta(self._dL_dl_two[:, None], X2)
+
+        target+= np.hstack([(dL_dK*self._K_dvar).sum(), gmapping])
+
+    def gradients_X(self, dL_dK, X, X2, target):
+        """Derivative of the covariance matrix with respect to X."""
+        # First account for gradients arising from presence of X in exponent.
+        self._K_computations(X, X2)
+        if X2 is None:
+            _K_dist = 2*(X[:, None, :] - X[None, :, :])
+        else:
+            _K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_co
+        gradients_X = (-2.*self.variance)*np.transpose((self._K_dvar/self._w2)[:, :, None]*_K_dist, (1, 0, 2))
+        target += np.sum(gradients_X*dL_dK.T[:, :, None], 0)
+        # Now account for gradients arising from presence of X in lengthscale.
+        self._dK_computations(dL_dK)
+        if X2 is None:
+            target += 2.*self.mapping.df_dX(self._dL_dl[:, None], X)
+        else:
+            target += self.mapping.df_dX(self._dL_dl[:, None], X)
+    
+    def dKdiag_dX(self, dL_dKdiag, X, target):
+        """Gradient of diagonal of covariance with respect to X."""
+        pass
+
+    def dKdiag_dtheta(self, dL_dKdiag, X, target):
+        """Gradient of diagonal of covariance with respect to parameters."""
+        target[0] += np.sum(dL_dKdiag)
+
+
+    
+    def _K_computations(self, X, X2=None):
+        """Pre-computations for the covariance function (used both when computing the covariance and its gradients). Here self._dK_dvar and self._K_dist2 are updated."""
+        self._lengthscales=self.mapping.f(X)
+        self._lengthscales2=np.square(self._lengthscales)
+        if X2==None:
+            self._lengthscales_two = self._lengthscales
+            self._lengthscales_two2 = self._lengthscales2
+            Xsquare = np.square(X).sum(1)
+            self._K_dist2 = -2.*tdot(X) + Xsquare[:, None] + Xsquare[None, :]
+        else:
+            self._lengthscales_two = self.mapping.f(X2)
+            self._lengthscales_two2 = np.square(self._lengthscales_two)
+            self._K_dist2 = -2.*np.dot(X, X2.T) + np.square(X).sum(1)[:, None] + np.square(X2).sum(1)[None, :]
+        self._w2 = self._lengthscales2 + self._lengthscales_two2.T
+        prod_length = self._lengthscales*self._lengthscales_two.T
+        self._K_exponential = np.exp(-self._K_dist2/self._w2)
+        self._K_dvar = np.sign(prod_length)*(2*np.abs(prod_length)/self._w2)**(self.input_dim/2.)*np.exp(-self._K_dist2/self._w2)
+
+    def _dK_computations(self, dL_dK):
+        """Pre-computations for the gradients of the covaraince function. Here the gradient of the covariance with respect to all the individual lengthscales is computed.
+        :param dL_dK: the gradient of the objective with respect to the covariance function.
+        :type dL_dK: ndarray"""
+        
+        self._dL_dl = (dL_dK*self.variance*self._K_dvar*(self.input_dim/2.*(self._lengthscales_two.T**4 - self._lengthscales**4) + 2*self._lengthscales2*self._K_dist2)/(self._w2*self._w2*self._lengthscales)).sum(1)
+        if self._lengthscales_two is self._lengthscales:
+            self._dL_dl_two = None
+        else:
+            self._dL_dl_two = (dL_dK*self.variance*self._K_dvar*(self.input_dim/2.*(self._lengthscales**4 - self._lengthscales_two.T**4 ) + 2*self._lengthscales_two2.T*self._K_dist2)/(self._w2*self._w2*self._lengthscales_two.T)).sum(0)
--- a/GPy/kern/_src/todo/hetero.py
+++ b/GPy/kern/_src/todo/hetero.py
@ -0,0 +1,103 @@
+# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from kernpart import Kernpart
+import numpy as np
+from ...util.linalg import tdot
+from ...core.mapping import Mapping
+import GPy
+
+class Hetero(Kernpart):
+    """
+    TODO: Need to constrain the function outputs
+    positive (still thinking of best way of doing this!!! Yes, intend to use
+    transformations, but what's the *best* way). Currently just squaring output.
+
+    Heteroschedastic noise which depends on input location. See, for example,
+    this paper by Goldberg et al.
+
+    .. math::
+
+       k(x_i, x_j) = \delta_{i,j} \sigma^2(x_i)
+
+       where :math:`\sigma^2(x)` is a function giving the variance  as a function of input space and :math:`\delta_{i,j}` is the Kronecker delta function.
+
+    The parameters are the parameters of \sigma^2(x) which is a
+    function that can be specified by the user, by default an
+    multi-layer peceptron is used.
+
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
+    :param mapping: the mapping that gives the lengthscale across the input space (by default GPy.mappings.MLP is used with 20 hidden nodes).
+    :type mapping: GPy.core.Mapping
+    :rtype: Kernpart object
+
+    See this paper:
+
+    Goldberg, P. W.  Williams, C. K. I. and Bishop,
+    C. M. (1998) Regression with Input-dependent Noise: a Gaussian
+    Process Treatment In Advances in Neural Information Processing
+    Systems, Volume 10, pp.  493-499. MIT Press
+
+    for a Gaussian process treatment of this problem.
+
+    """
+
+    def __init__(self, input_dim, mapping=None, transform=None):
+        self.input_dim = input_dim
+        if not mapping:
+            mapping = GPy.mappings.MLP(output_dim=1, hidden_dim=20, input_dim=input_dim)
+        if not transform:
+            transform = GPy.core.transformations.logexp()
+
+        self.transform = transform
+        self.mapping = mapping
+        self.name='hetero'
+        self.num_params=self.mapping.num_params
+        self._set_params(self.mapping._get_params())
+
+    def _get_params(self):
+        return self.mapping._get_params()
+
+    def _set_params(self, x):
+        assert x.size == (self.num_params)
+        self.mapping._set_params(x)
+
+    def _get_param_names(self):
+        return self.mapping._get_param_names()
+
+    def K(self, X, X2, target):
+        """Return covariance between X and X2."""
+        if (X2 is None) or (X2 is X):
+            target[np.diag_indices_from(target)] += self._Kdiag(X)
+
+    def Kdiag(self, X, target):
+        """Compute the diagonal of the covariance matrix for X."""
+        target+=self._Kdiag(X)
+
+    def _Kdiag(self, X):
+        """Helper function for computing the diagonal elements of the covariance."""
+        return self.mapping.f(X).flatten()**2
+
+    def _param_grad_helper(self, dL_dK, X, X2, target):
+        """Derivative of the covariance with respect to the parameters."""
+        if (X2 is None) or (X2 is X):
+            dL_dKdiag = dL_dK.flat[::dL_dK.shape[0]+1]
+            self.dKdiag_dtheta(dL_dKdiag, X, target)
+
+    def dKdiag_dtheta(self, dL_dKdiag, X, target):
+        """Gradient of diagonal of covariance with respect to parameters."""
+        target += 2.*self.mapping.df_dtheta(dL_dKdiag[:, None]*self.mapping.f(X), X)
+
+    def gradients_X(self, dL_dK, X, X2, target):
+        """Derivative of the covariance matrix with respect to X."""
+        if X2==None or X2 is X:
+            dL_dKdiag = dL_dK.flat[::dL_dK.shape[0]+1]
+            self.dKdiag_dX(dL_dKdiag, X, target)
+
+    def dKdiag_dX(self, dL_dKdiag, X, target):
+        """Gradient of diagonal of covariance with respect to X."""
+        target += 2.*self.mapping.df_dX(dL_dKdiag[:, None], X)*self.mapping.f(X)
+
+
+
--- a/GPy/kern/_src/todo/odekern1.c
+++ b/GPy/kern/_src/todo/odekern1.c
@ -0,0 +1,38 @@
+#include <math.h> 
+
+ /*
+  * Covariance of the input process U between times t1 and t2:
+  *
+  *     k_uu = sig1 * (1 + theta1*d) * exp(-theta1*d),   d = |t2 - t1|
+  *
+  * theta2 and sig2 are accepted so that the signature matches k_yy, but
+  * they are not used here.
+  *
+  * The distance is the absolute time difference; the former expression
+  * sqrt(t2*t2 - t1*t1) was NaN whenever |t1| > |t2| and was not symmetric
+  * in its arguments.
+  */
+ double k_uu(double t1, double t2, double theta1, double theta2, double sig1, double sig2)
+ {
+  double kern=0;
+  double dist=0;
+
+  (void)theta2;
+  (void)sig2;
+
+  dist = fabs(t2-t1);
+
+  kern = sig1*(1+theta1*dist)*exp(-theta1*dist);
+
+ return kern;
+ }
+
+
+
+ /*
+  * Covariance of the output process Y between times t1 and t2, with
+  * d = |t2 - t1|:
+  *
+  *     k_yy = sig1*sig2 * ( exp(-theta1*d) * (theta2 - 2*theta1
+  *                              + theta1*theta2*d - theta1*theta1*d)
+  *                          + exp(-d) ) / (theta2 - theta1)^2
+  *
+  * The result is not finite when theta1 == theta2 (division by zero).
+  * NOTE(review): the second exponential does not involve theta2 -- confirm
+  * against the derivation that exp(-d) is intended.
+  *
+  * The distance is the absolute time difference; the former expression
+  * sqrt(t2*t2 - t1*t1) was NaN whenever |t1| > |t2| and was not symmetric
+  * in its arguments.
+  */
+ double k_yy(double t1, double t2, double theta1, double theta2, double sig1, double sig2)
+ {
+  double kern=0;
+  double dist=0;
+
+  dist = fabs(t2-t1);
+
+  kern = sig1*sig2 * (  exp(-theta1*dist)*(theta2-2*theta1+theta1*theta2*dist-theta1*theta1*dist) +
+  	exp(-dist)  ) / ((theta2-theta1)*(theta2-theta1));
+
+  return kern;
+ }
+
+
+
+
+
+
+	
+
+
+
--- a/GPy/kern/_src/todo/poly.py
+++ b/GPy/kern/_src/todo/poly.py
@ -0,0 +1,138 @@
+# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from kernpart import Kernpart
+import numpy as np
+# The constant 2/pi (equivalently 4/tau with tau = 2*pi). It is not referenced
+# by the POLY class defined in this module.
+four_over_tau = 2./np.pi
+
+class POLY(Kernpart):
+    """
+
+    Polynomial kernel parameter initialisation.  Included for completeness, but generally not recommended, is the polynomial kernel:
+
+    .. math::
+        k(x, y) = \sigma^2\*(\sigma_w^2 x'y+\sigma_b^b)^d
+
+    The kernel parameters are :math:`\sigma^2` (variance), :math:`\sigma^2_w`
+    (weight_variance), :math:`\sigma^2_b` (bias_variance) and d
+    (degree). Only gradients of the first three are provided for
+    kernel optimisation, it is assumed that polynomial degree would
+    be set by hand.
+
+    The kernel is not recommended as it is badly behaved when the
+    :math:`\sigma^2_w\*x'\*y + \sigma^2_b` has a magnitude greater than one. For completeness
+    there is an automatic relevance determination version of this
+    kernel provided (NOTE YET IMPLEMENTED!).
+    :param input_dim: the number of input dimensions
+    :type input_dim: int 
+    :param variance: the variance :math:`\sigma^2`
+    :type variance: float
+    :param weight_variance: the vector of the variances of the prior over input weights in the neural network :math:`\sigma^2_w`
+    :type weight_variance: array or list of the appropriate size (or float if there is only one weight variance parameter)
+    :param bias_variance: the variance of the prior over bias parameters :math:`\sigma^2_b`
+    :param degree: the degree of the polynomial.
+    :type degree: int
+    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter :math:`\sigma^2_w`), otherwise there is one weight variance parameter per dimension.
+    :type ARD: Boolean
+    :rtype: Kernpart object
+
+    """
+
+    def __init__(self, input_dim, variance=1., weight_variance=None, bias_variance=1., degree=2, ARD=False):
+        self.input_dim = input_dim
+        self.ARD = ARD
+        if not ARD:
+            self.num_params=3
+            if weight_variance is not None:
+                weight_variance = np.asarray(weight_variance)
+                assert weight_variance.size == 1, "Only one weight variance needed for non-ARD kernel"
+            else:
+                weight_variance = 1.*np.ones(1)
+        else:
+            self.num_params = self.input_dim + 2
+            if weight_variance is not None:
+                weight_variance = np.asarray(weight_variance)
+                assert weight_variance.size == self.input_dim, "bad number of weight variances"
+            else:
+                weight_variance = np.ones(self.input_dim)
+            raise NotImplementedError
+        self.degree=degree
+        self.name='poly_deg' + str(self.degree)
+        self._set_params(np.hstack((variance, weight_variance.flatten(), bias_variance)))
+
+    def _get_params(self):
+        return np.hstack((self.variance, self.weight_variance.flatten(), self.bias_variance))
+
+    def _set_params(self, x):
+        assert x.size == (self.num_params)
+        self.variance = x[0]
+        self.weight_variance = x[1:-1]
+        self.weight_std = np.sqrt(self.weight_variance)
+        self.bias_variance = x[-1]
+
+    def _get_param_names(self):
+        if self.num_params == 3:
+            return ['variance', 'weight_variance', 'bias_variance']
+        else:
+            return ['variance'] + ['weight_variance_%i' % i for i in range(self.lengthscale.size)] + ['bias_variance']
+
+    def K(self, X, X2, target):
+        """Return covariance between X and X2."""
+        self._K_computations(X, X2)
+        target += self.variance*self._K_dvar
+
+    def Kdiag(self, X, target):
+        """Compute the diagonal of the covariance matrix for X."""
+        self._K_diag_computations(X)
+        target+= self.variance*self._K_diag_dvar
+
+    def _param_grad_helper(self, dL_dK, X, X2, target):
+        """Derivative of the covariance with respect to the parameters."""
+        self._K_computations(X, X2)
+        base = self.variance*self.degree*self._K_poly_arg**(self.degree-1)
+        base_cov_grad = base*dL_dK
+
+
+            
+        target[0] += np.sum(self._K_dvar*dL_dK)
+        target[1] += (self._K_inner_prod*base_cov_grad).sum()
+        target[2] += base_cov_grad.sum()
+
+
+    def gradients_X(self, dL_dK, X, X2, target):
+        """Derivative of the covariance matrix with respect to X"""
+        self._K_computations(X, X2)
+        arg = self._K_poly_arg
+        if X2 is None:
+            target += 2*self.weight_variance*self.degree*self.variance*(((X[None,:, :])) *(arg**(self.degree-1))[:, :, None]*dL_dK[:, :, None]).sum(1)
+        else:
+            target += self.weight_variance*self.degree*self.variance*(((X2[None,:, :])) *(arg**(self.degree-1))[:, :, None]*dL_dK[:, :, None]).sum(1)
+            
+    def dKdiag_dX(self, dL_dKdiag, X, target):
+        """Gradient of diagonal of covariance with respect to X"""
+        self._K_diag_computations(X)
+        arg = self._K_diag_poly_arg
+        target += 2.*self.weight_variance*self.degree*self.variance*X*dL_dKdiag[:, None]*(arg**(self.degree-1))[:, None]
+    
+    
+    def _K_computations(self, X, X2):
+        if self.ARD:
+            pass
+        else:
+            if X2 is None:
+                self._K_inner_prod = np.dot(X,X.T)
+            else:
+                self._K_inner_prod = np.dot(X,X2.T)
+            self._K_poly_arg = self._K_inner_prod*self.weight_variance + self.bias_variance
+        self._K_dvar = self._K_poly_arg**self.degree
+
+    def _K_diag_computations(self, X):
+        if self.ARD:
+            pass
+        else:
+            self._K_diag_poly_arg = (X*X).sum(1)*self.weight_variance + self.bias_variance
+        self._K_diag_dvar = self._K_diag_poly_arg**self.degree
+
+  
+
+
--- a/GPy/kern/_src/todo/rbf_inv.py
+++ b/GPy/kern/_src/todo/rbf_inv.py
@ -0,0 +1,336 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+from rbf import RBF
+import numpy as np
+from scipy import weave
+from ...util.linalg import tdot
+from ...core.parameterization import Param
+
+class RBFInv(RBF):
+    """
+    Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel. It only
+    differs from RBF in that here the parametrization is wrt the inverse lengthscale:
+
+    .. math::
+
+       k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \ \ \ \ \  \\text{ where  } r^2 = \sum_{i=1}^d \\frac{ (x_i-x^\prime_i)^2}{\ell_i^2}
+
+    where \ell_i is the lengthscale, \sigma^2 the variance and d the dimensionality of the input.
+
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
+    :param variance: the variance of the kernel
+    :type variance: float
+    :param lengthscale: the vector of lengthscale of the kernel
+    :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
+    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
+    :type ARD: Boolean
+    :rtype: kernel object
+
+    .. Note: this object implements both the ARD and 'spherical' version of the function
+    """
+
+    def __init__(self, input_dim, variance=1., inv_lengthscale=None, ARD=False, name='inverse rbf'):
+        """Set up an RBF kernel parameterised by its inverse lengthscale.
+
+        :param input_dim: the number of input dimensions
+        :param variance: the variance of the kernel
+        :param inv_lengthscale: the inverse lengthscale(s): one value when ``ARD``
+            is False, ``input_dim`` values otherwise; defaults to ones
+        :param ARD: if True, use one inverse lengthscale per input dimension
+        :param name: the name passed on to the parent constructor
+        """
+        #self.input_dim = input_dim
+        #self.name = 'rbf_inv'
+        # The parent class works with lengthscales, so hand it the reciprocal.
+        if inv_lengthscale is not None: lengthscale = 1./np.array(inv_lengthscale)
+        else: lengthscale = None
+        super(RBFInv, self).__init__(input_dim, variance=variance, lengthscale=lengthscale, ARD=ARD, name=name)
+        self.ARD = ARD
+        if not ARD:
+            self.num_params = 2
+            if inv_lengthscale is not None:
+                inv_lengthscale = np.asarray(inv_lengthscale)
+                assert inv_lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
+            else:
+                inv_lengthscale = np.ones(1)
+        else:
+            self.num_params = self.input_dim + 1
+            if inv_lengthscale is not None:
+                inv_lengthscale = np.asarray(inv_lengthscale)
+                assert inv_lengthscale.size == self.input_dim, "bad number of lengthscales"
+            else:
+                inv_lengthscale = np.ones(self.input_dim)
+        
+        # Replace the parent's lengthscale parameter by the inverse lengthscale,
+        # which is registered under the parameter name 'sensitivity'.
+        self.variance = Param('variance', variance)
+        self.inv_lengthscale = Param('sensitivity', inv_lengthscale)
+        self.inv_lengthscale.add_observer(self, self.update_inv_lengthscale)
+        self.remove_parameter(self.lengthscale)
+        self.add_parameters(self.variance, self.inv_lengthscale)
+        #self._set_params(np.hstack((variance, inv_lengthscale.flatten())))
+
+        # initialize cache
+        self._Z, self._mu, self._S = np.empty(shape=(3, 1))
+        self._X, self._X2, self._params = np.empty(shape=(3, 1))
+
+        # a set of optional args to pass to weave
+        self.weave_options = {'headers'           : ['<omp.h>'],
+                         'extra_compile_args': ['-fopenmp -O3'], # -march=native'],
+                         'extra_link_args'   : ['-lgomp']}
+
+
+
+#     def _get_params(self):
+#         return np.hstack((self.variance, self.inv_lengthscale))
+
+    def update_inv_lengthscale(self, il):
+        """Refresh the quantities derived from ``self.inv_lengthscale``.
+
+        Registered in ``__init__`` as an observer of the inverse-lengthscale
+        parameter. The argument ``il`` is not used; the values are read from
+        ``self.inv_lengthscale``.
+        """
+        self.inv_lengthscale2 = np.square(self.inv_lengthscale)
+        # TODO: We can rewrite everything with inv_lengthscale and never need to do the below
+        self.lengthscale = 1. / self.inv_lengthscale
+        self.lengthscale2 = np.square(self.lengthscale)
+
+    #def _set_params(self, x):
+    def parameters_changed(self):
+        """Invalidate the cached inputs so that the next computation starts afresh.
+
+        The cache keys are overwritten with freshly allocated ``np.empty``
+        arrays, as is done in ``__init__``.
+        """
+        #assert x.size == (self.num_params)
+        #self.variance = x[0]
+        #self.inv_lengthscale = x[1:]
+        # reset cached results
+        self._X, self._X2, self._params = np.empty(shape=(3, 1))
+        self._Z, self._mu, self._S = np.empty(shape=(3, 1)) # cached versions of Z,mu,S
+
+#     def _get_param_names(self):
+#         if self.num_params == 2:
+#             return ['variance', 'inv_lengthscale']
+#         else:
+#             return ['variance'] + ['inv_lengthscale%i' % i for i in range(self.inv_lengthscale.size)]
+
+    # TODO: Rewrite computations so that lengthscale is not needed (but only inv. lengthscale)
+    def _param_grad_helper(self, dL_dK, X, X2, target):
+        """Accumulate the gradient with respect to the kernel parameters into ``target``.
+
+        ``target[0]`` receives the variance gradient and ``target[1:]`` the
+        gradient(s) for the inverse lengthscale(s). In the ARD case the
+        per-dimension sums are computed by inline C code run through weave.
+
+        :param dL_dK: gradient of the objective with respect to the covariance matrix
+        :param X: first set of inputs
+        :param X2: second set of inputs, or None for the symmetric case
+        :param target: array that is updated in place
+        """
+        self._K_computations(X, X2)
+        target[0] += np.sum(self._K_dvar * dL_dK)
+        if self.ARD:
+            dvardLdK = self._K_dvar * dL_dK
+            var_len3 = self.variance / np.power(self.lengthscale, 3)
+            len2 = self.lengthscale2
+            # NOTE(review): the factor (-len2) applied in the C code presumably
+            # converts a lengthscale gradient into an inverse-lengthscale one -- confirm.
+            if X2 is None:
+                # save computation for the symmetrical case
+                # (the C loop below only visits j < i, hence the symmetrised weights)
+                dvardLdK = dvardLdK + dvardLdK.T
+                code = """
+                int q,i,j;
+                double tmp;
+                for(q=0; q<input_dim; q++){
+                  tmp = 0;
+                  for(i=0; i<num_data; i++){
+                    for(j=0; j<i; j++){
+                      tmp += (X(i,q)-X(j,q))*(X(i,q)-X(j,q))*dvardLdK(i,j);
+                    }
+                  }
+                  target(q+1) += var_len3(q)*tmp*(-len2(q));
+                }
+                """
+                num_data, num_inducing, input_dim = X.shape[0], X.shape[0], self.input_dim
+                weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'X2', 'target', 'dvardLdK', 'var_len3', 'len2'], type_converters=weave.converters.blitz, **self.weave_options)
+            else:
+                code = """
+                int q,i,j;
+                double tmp;
+                for(q=0; q<input_dim; q++){
+                  tmp = 0;
+                  for(i=0; i<num_data; i++){
+                    for(j=0; j<num_inducing; j++){
+                      tmp += (X(i,q)-X2(j,q))*(X(i,q)-X2(j,q))*dvardLdK(i,j);
+                    }
+                  }
+                  target(q+1) += var_len3(q)*tmp*(-len2(q));
+                }
+                """
+                num_data, num_inducing, input_dim = X.shape[0], X2.shape[0], self.input_dim
+                # [np.add(target[1+q:2+q],var_len3[q]*np.sum(dvardLdK*np.square(X[:,q][:,None]-X2[:,q][None,:])),target[1+q:2+q]) for q in range(self.input_dim)]
+                weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'X2', 'target', 'dvardLdK', 'var_len3', 'len2'], type_converters=weave.converters.blitz, **self.weave_options)
+        else:
+            # Isotropic case: a single inverse lengthscale, computed with numpy only.
+            target[1] += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dK) * (-self.lengthscale2)
+
+    def gradients_X(self, dL_dK, X, X2, target):
+        self._K_computations(X, X2)
+        if X2 is None:            
+            _K_dist = 2*(X[:, None, :] - X[None, :, :])
+        else:
+            _K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
+        gradients_X = (-self.variance * self.inv_lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2))
+        target += np.sum(gradients_X * dL_dK.T[:, :, None], 0)
+
+    def dKdiag_dX(self, dL_dKdiag, X, target):
+        """Gradient of the covariance diagonal with respect to X: leaves ``target`` untouched."""
+        pass
+
+
+    #---------------------------------------#
+    #             PSI statistics            #
+    #---------------------------------------#
+
+    # def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target):
+    #     self._psi_computations(Z, mu, S)
+    #     denom_deriv = S[:, None, :] / (self.lengthscale ** 3 + self.lengthscale * S[:, None, :])
+    #     d_length = self._psi1[:, :, None] * (self.lengthscale * np.square(self._psi1_dist / (self.lengthscale2 + S[:, None, :])) + denom_deriv)
+    #     target[0] += np.sum(dL_dpsi1 * self._psi1 / self.variance)
+    #     dpsi1_dlength = d_length * dL_dpsi1[:, :, None]
+    #     if not self.ARD:
+    #         target[1] += dpsi1_dlength.sum()*(-self.lengthscale2)
+    #     else:
+    #         target[1:] += dpsi1_dlength.sum(0).sum(0)*(-self.lengthscale2)
+    #     #target[1:] = target[1:]*(-self.lengthscale2)
+
+    def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target):
+        """Accumulate the gradient of psi1 with respect to the parameters into ``target``.
+
+        ``target[0]`` receives the variance gradient; ``target[1]`` (or
+        ``target[1:]`` in the ARD case) the inverse-lengthscale gradient(s).
+        """
+        self._psi_computations(Z, mu, S)
+        tmp = 1 + S[:, None, :] * self.inv_lengthscale2
+        # d_inv_length_old = -self._psi1[:, :, None] * ((self._psi1_dist_sq - 1.) / (self.lengthscale * self._psi1_denom) + self.inv_lengthscale) / self.inv_lengthscale2
+        d_length = -(self._psi1[:, :, None] * ((np.square(self._psi1_dist) * self.inv_lengthscale) / (tmp ** 2) + (S[:, None, :] * self.inv_lengthscale) / (tmp)))
+        # d_inv_length = -self._psi1[:, :, None] * ((self._psi1_dist_sq - 1.) / self._psi1_denom + self.lengthscale)
+        # psi1 is proportional to the variance, so its derivative is psi1 / variance.
+        target[0] += np.sum(dL_dpsi1 * self._psi1 / self.variance)
+        dpsi1_dlength = d_length * dL_dpsi1[:, :, None]
+        if not self.ARD:
+            target[1] += dpsi1_dlength.sum() # *(-self.lengthscale2)
+        else:
+            target[1:] += dpsi1_dlength.sum(0).sum(0) # *(-self.lengthscale2)
+        # target[1:] = target[1:]*(-self.lengthscale2)
+
+    def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target):
+        self._psi_computations(Z, mu, S)
+        dpsi1_dZ = -self._psi1[:, :, None] * ((self.inv_lengthscale2 * self._psi1_dist) / self._psi1_denom)
+        target += np.sum(dL_dpsi1[:, :, None] * dpsi1_dZ, 0)
+
+    def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S):
+        self._psi_computations(Z, mu, S)
+        tmp = (self._psi1[:, :, None] * self.inv_lengthscale2) / self._psi1_denom
+        target_mu += np.sum(dL_dpsi1[:, :, None] * tmp * self._psi1_dist, 1)
+        target_S += np.sum(dL_dpsi1[:, :, None] * 0.5 * tmp * (self._psi1_dist_sq - 1), 1)
+
+    def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target):
+        """Accumulate the gradient of psi2 with respect to the parameters into ``target``.
+
+        ``target[0]`` receives the variance gradient; ``target[1]`` (or
+        ``target[1:]`` in the ARD case) the inverse-lengthscale gradient(s).
+        Shape N,num_inducing,num_inducing,Ntheta
+        """
+        self._psi_computations(Z, mu, S)
+        # psi2 is proportional to variance**2, hence the factor 2 / variance.
+        d_var = 2.*self._psi2 / self.variance
+        # d_length = 2.*self._psi2[:, :, :, None] * (self._psi2_Zdist_sq * self._psi2_denom + self._psi2_mudist_sq + S[:, None, None, :] / self.lengthscale2) / (self.lengthscale * self._psi2_denom)
+        d_length = -2.*self._psi2[:, :, :, None] * (self._psi2_Zdist_sq * self._psi2_denom + self._psi2_mudist_sq + S[:, None, None, :] * self.inv_lengthscale2) / (self.inv_lengthscale * self._psi2_denom)
+        target[0] += np.sum(dL_dpsi2 * d_var)
+        dpsi2_dlength = d_length * dL_dpsi2[:, :, :, None]
+        if not self.ARD:
+            target[1] += dpsi2_dlength.sum() # *(-self.lengthscale2)
+        else:
+            target[1:] += dpsi2_dlength.sum(0).sum(0).sum(0) # *(-self.lengthscale2)
+        # target[1:] = target[1:]*(-self.lengthscale2)
+
+    def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target):
+        self._psi_computations(Z, mu, S)
+        term1 = self._psi2_Zdist * self.inv_lengthscale2 # num_inducing, num_inducing, input_dim
+        term2 = (self._psi2_mudist * self.inv_lengthscale2) / self._psi2_denom # N, num_inducing, num_inducing, input_dim
+        dZ = self._psi2[:, :, :, None] * (term1[None] + term2)
+        target += (dL_dpsi2[:, :, :, None] * dZ).sum(0).sum(0)
+
+    def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
+        """Think N,num_inducing,num_inducing,input_dim """
+        self._psi_computations(Z, mu, S)
+        tmp = (self.inv_lengthscale2 * self._psi2[:, :, :, None]) / self._psi2_denom
+        target_mu += -2.*(dL_dpsi2[:, :, :, None] * tmp * self._psi2_mudist).sum(1).sum(1)
+        target_S += (dL_dpsi2[:, :, :, None] * tmp * (2.*self._psi2_mudist_sq - 1)).sum(1).sum(1)
+
+    #---------------------------------------#
+    #            Precomputations            #
+    #---------------------------------------#
+
+    def _K_computations(self, X, X2):
+        """Compute and cache ``self._K_dist2`` and ``self._K_dvar`` for the given inputs.
+
+        ``_K_dist2`` holds the squared distances between the inputs after
+        scaling them by the inverse lengthscale, and ``_K_dvar`` is
+        ``exp(-0.5 * _K_dist2)``. Nothing is recomputed when ``X``, ``X2`` and
+        the parameter vector all equal the cached copies.
+
+        NOTE(review): ``self._get_params`` is not defined in this class (it is
+        commented out above) -- presumably inherited; confirm.
+        """
+        if not (np.array_equal(X, self._X) and np.array_equal(X2, self._X2) and np.array_equal(self._params , self._get_params())):
+            # Keep copies so that later in-place changes to the inputs are detected.
+            self._X = X.copy()
+            self._params = self._get_params().copy()
+            if X2 is None:
+                self._X2 = None
+                X = X * self.inv_lengthscale
+                Xsquare = np.sum(np.square(X), 1)
+                # |x - y|^2 = |x|^2 + |y|^2 - 2 x.y
+                self._K_dist2 = -2.*tdot(X) + (Xsquare[:, None] + Xsquare[None, :])
+            else:
+                self._X2 = X2.copy()
+                X = X * self.inv_lengthscale
+                X2 = X2 * self.inv_lengthscale
+                self._K_dist2 = -2.*np.dot(X, X2.T) + (np.sum(np.square(X), 1)[:, None] + np.sum(np.square(X2), 1)[None, :])
+            self._K_dvar = np.exp(-0.5 * self._K_dist2)
+
+    def _psi_computations(self, Z, mu, S):
+        """Compute and cache the psi1 and psi2 statistics and their intermediates.
+
+        The Z-only quantities are refreshed when ``Z`` differs from the cached
+        value; everything else is refreshed when any of ``Z``, ``mu`` or ``S``
+        differs. The results are stored in the ``self._psi1*`` and
+        ``self._psi2*`` attributes.
+
+        NOTE(review): the cache keeps references to ``Z``, ``mu`` and ``S``
+        (no copies), so in-place modification of these arrays by the caller
+        would presumably go undetected -- confirm.
+        """
+        # here are the "statistics" for psi1 and psi2
+        if not np.array_equal(Z, self._Z):
+            # Z has changed, compute Z specific stuff
+            self._psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
+            self._psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
+            self._psi2_Zdist_sq = np.square(self._psi2_Zdist * self.inv_lengthscale) # M,M,Q
+
+        if not (np.array_equal(Z, self._Z) and np.array_equal(mu, self._mu) and np.array_equal(S, self._S)):
+            # something's changed. recompute EVERYTHING
+
+            # psi1
+            self._psi1_denom = S[:, None, :] * self.inv_lengthscale2 + 1.
+            self._psi1_dist = Z[None, :, :] - mu[:, None, :]
+            self._psi1_dist_sq = (np.square(self._psi1_dist) * self.inv_lengthscale2) / self._psi1_denom
+            self._psi1_exponent = -0.5 * np.sum(self._psi1_dist_sq + np.log(self._psi1_denom), -1)
+            self._psi1 = self.variance * np.exp(self._psi1_exponent)
+
+            # psi2
+            self._psi2_denom = 2.*S[:, None, None, :] * self.inv_lengthscale2 + 1. # N,M,M,Q
+            # weave_psi2 reads self._psi2_denom and self._psi2_Zdist_sq, so both must be set first.
+            self._psi2_mudist, self._psi2_mudist_sq, self._psi2_exponent, _ = self.weave_psi2(mu, self._psi2_Zhat)
+            # self._psi2_mudist = mu[:,None,None,:]-self._psi2_Zhat #N,M,M,Q
+            # self._psi2_mudist_sq = np.square(self._psi2_mudist)/(self.lengthscale2*self._psi2_denom)
+            # self._psi2_exponent = np.sum(-self._psi2_Zdist_sq -self._psi2_mudist_sq -0.5*np.log(self._psi2_denom),-1) #N,M,M,Q
+            self._psi2 = np.square(self.variance) * np.exp(self._psi2_exponent) # N,M,M,Q
+
+            # store matrices for caching
+            self._Z, self._mu, self._S = Z, mu, S
+
+    def weave_psi2(self, mu, Zhat):
+        """Compute the psi2 intermediates with inline C code run through weave.
+
+        Reads ``self._psi2_Zdist_sq`` and ``self._psi2_denom``, which must have
+        been set beforehand.
+
+        :param mu: array with ``N`` rows and ``input_dim`` columns
+        :param Zhat: array indexed as ``Zhat(m, mm, q)`` by the C code
+        :returns: the tuple ``(mudist, mudist_sq, psi2_exponent, psi2)``. The
+            C code never writes to ``psi2`` (those lines are commented out),
+            so it is returned as allocated by ``np.empty``.
+        """
+        N, input_dim = mu.shape
+        num_inducing = Zhat.shape[0]
+
+        mudist = np.empty((N, num_inducing, num_inducing, input_dim))
+        mudist_sq = np.empty((N, num_inducing, num_inducing, input_dim))
+        # Accumulated with += in the C code, hence initialised to zero.
+        psi2_exponent = np.zeros((N, num_inducing, num_inducing))
+        psi2 = np.empty((N, num_inducing, num_inducing))
+
+        psi2_Zdist_sq = self._psi2_Zdist_sq
+        _psi2_denom = self._psi2_denom.squeeze().reshape(N, self.input_dim)
+        half_log_psi2_denom = 0.5 * np.log(self._psi2_denom).squeeze().reshape(N, self.input_dim)
+        # Only referenced by commented-out lines of the C code below.
+        variance_sq = float(np.square(self.variance))
+        if self.ARD:
+            inv_lengthscale2 = self.inv_lengthscale2
+        else:
+            # The C code indexes per dimension, so repeat the single value.
+            inv_lengthscale2 = np.ones(input_dim) * self.inv_lengthscale2
+        code = """
+        double tmp;
+
+        #pragma omp parallel for private(tmp)
+        for (int n=0; n<N; n++){
+            for (int m=0; m<num_inducing; m++){
+               for (int mm=0; mm<(m+1); mm++){
+                   for (int q=0; q<input_dim; q++){
+                       //compute mudist
+                       tmp = mu(n,q) - Zhat(m,mm,q);
+                       mudist(n,m,mm,q) = tmp;
+                       mudist(n,mm,m,q) = tmp;
+
+                       //now mudist_sq
+                       tmp = tmp*tmp*inv_lengthscale2(q)/_psi2_denom(n,q);
+                       mudist_sq(n,m,mm,q) = tmp;
+                       mudist_sq(n,mm,m,q) = tmp;
+
+                       //now psi2_exponent
+                       tmp = -psi2_Zdist_sq(m,mm,q) - tmp - half_log_psi2_denom(n,q);
+                       psi2_exponent(n,mm,m) += tmp;
+                       if (m !=mm){
+                           psi2_exponent(n,m,mm) += tmp;
+                       }
+                   //psi2 would be computed like this, but np is faster
+                   //tmp = variance_sq*exp(psi2_exponent(n,m,mm));
+                   //psi2(n,m,mm) = tmp;
+                   //psi2(n,mm,m) = tmp;
+                   }
+                }
+            }
+        }
+
+        """
+
+        support_code = """
+        #include <omp.h>
+        #include <math.h>
+        """
+        weave.inline(code, support_code=support_code, libraries=['gomp'],
+                     arg_names=['N', 'num_inducing', 'input_dim', 'mu', 'Zhat', 'mudist_sq', 'mudist', 'inv_lengthscale2', '_psi2_denom', 'psi2_Zdist_sq', 'psi2_exponent', 'half_log_psi2_denom', 'psi2', 'variance_sq'],
+                     type_converters=weave.converters.blitz, **self.weave_options)
+
+        return mudist, mudist_sq, psi2_exponent, psi2
--- a/GPy/kern/_src/todo/spline.py
+++ b/GPy/kern/_src/todo/spline.py
@ -0,0 +1,61 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+from kernpart import Kernpart
+import numpy as np
+from ...core.parameterization import Param
+
+def theta(x):
+    """Heaviside step function"""
+    return np.where(x>=0.,1.,0.)
+
+class Spline(Kernpart):
+    """
+    Spline kernel
+
+    :param input_dim: the number of input dimensions (fixed to 1 right now TODO)
+    :type input_dim: int
+    :param variance: the variance of the kernel
+    :type variance: float
+
+    """
+
+    def __init__(self,input_dim,variance=1.,lengthscale=1.):
+        self.input_dim = input_dim
+        assert self.input_dim==1
+        self.num_params = 1
+        self.name = 'spline'
+        self.variance = Param('variance', variance)
+        self.lengthscale = Param('lengthscale', lengthscale)
+        self.add_parameters(self.variance, self.lengthscale)
+        
+#     def _get_params(self):
+#         return self.variance
+# 
+#     def _set_params(self,x):
+#         self.variance = x
+# 
+#     def _get_param_names(self):
+#         return ['variance']
+
+    def K(self,X,X2,target):
+        assert np.all(X>0), "Spline covariance is for +ve domain only. TODO: symmetrise"
+        assert np.all(X2>0), "Spline covariance is for +ve domain only. TODO: symmetrise"
+        t = X
+        s = X2.T
+        s_t = s-t # broadcasted subtraction
+        target += self.variance*(0.5*(t*s**2) - s**3/6. + (s_t)**3*theta(s_t)/6.)
+
+    def Kdiag(self,X,target):
+        target += self.variance*X.flatten()**3/3.
+
+    def _param_grad_helper(self,X,X2,target):
+        target += 0.5*(t*s**2) - s**3/6. + (s_t)**3*theta(s_t)/6.
+
+    def dKdiag_dtheta(self,X,target):
+        target += X.flatten()**3/3.
+
+    def dKdiag_dX(self,X,target):
+        target += self.variance*X**2
+
--- a/GPy/kern/_src/todo/symmetric.py
+++ b/GPy/kern/_src/todo/symmetric.py
@ -0,0 +1,81 @@
+# Copyright (c) 2012 James Hensman
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from kernpart import Kernpart
+import numpy as np
+
+class Symmetric(Kernpart):
+    """
+    Symmetrical kernels
+
+    :param k: the kernel to symmetrify
+    :type k: Kernpart
+    :param transform: the transform to use in symmetrification (allows symmetry on specified axes)
+    :type transform: A numpy array (input_dim x input_dim) specifiying the transform
+    :rtype: Kernpart
+
+    """
+    def __init__(self,k,transform=None):
+        if transform is None:
+            transform = np.eye(k.input_dim)*-1.
+        assert transform.shape == (k.input_dim, k.input_dim)
+        self.transform = transform
+        self.input_dim = k.input_dim
+        self.num_params = k.num_params
+        self.name = k.name + '_symm'
+        self.k = k
+        self.add_parameter(k)
+        #self._set_params(k._get_params())
+
+    def K(self,X,X2,target):
+        """Compute the covariance matrix between X and X2."""
+        AX = np.dot(X,self.transform)
+        if X2 is None:
+            X2 = X
+            AX2 = AX
+        else:
+            AX2 = np.dot(X2, self.transform)
+        self.k.K(X,X2,target)
+        self.k.K(AX,X2,target)
+        self.k.K(X,AX2,target)
+        self.k.K(AX,AX2,target)
+
+    def _param_grad_helper(self,dL_dK,X,X2,target):
+        """derivative of the covariance matrix with respect to the parameters."""
+        AX = np.dot(X,self.transform)
+        if X2 is None:
+            X2 = X
+            AX2 = AX
+        else:
+            AX2 = np.dot(X2, self.transform)
+        self.k._param_grad_helper(dL_dK,X,X2,target)
+        self.k._param_grad_helper(dL_dK,AX,X2,target)
+        self.k._param_grad_helper(dL_dK,X,AX2,target)
+        self.k._param_grad_helper(dL_dK,AX,AX2,target)
+
+
+    def gradients_X(self,dL_dK,X,X2,target):
+        """derivative of the covariance matrix with respect to X."""
+        AX = np.dot(X,self.transform)
+        if X2 is None:
+            X2 = X
+            ZX2 = AX
+        else:
+            AX2 = np.dot(X2, self.transform)
+        self.k.gradients_X(dL_dK, X, X2, target)
+        self.k.gradients_X(dL_dK, AX, X2, target)
+        self.k.gradients_X(dL_dK, X, AX2, target)
+        self.k.gradients_X(dL_dK, AX ,AX2, target)
+
+    def Kdiag(self,X,target):
+        """Compute the diagonal of the covariance matrix associated to X."""
+        foo = np.zeros((X.shape[0],X.shape[0]))
+        self.K(X,X,foo)
+        target += np.diag(foo)
+
+    def dKdiag_dX(self,dL_dKdiag,X,target):
+        raise NotImplementedError
+
+    def dKdiag_dtheta(self,dL_dKdiag,X,target):
+        """Compute the diagonal of the covariance matrix associated to X."""
+        raise NotImplementedError
--- a/GPy/kern/_src/trunclinear.py
+++ b/GPy/kern/_src/trunclinear.py
@ -0,0 +1,204 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+from kern import Kern
+from ...core.parameterization import Param
+from ...core.parameterization.transformations import Logexp
+from ...util.caching import Cache_this
+from ...util.config import *
+
+class TruncLinear(Kern):
+    r"""
+    Truncated Linear kernel
+
+    .. math::
+
+       k(x,y) = \sum_{i=1}^{input\_dim} \sigma^2_i \max(0, (x_i - \delta_i)(y_i - \delta_i))
+
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
+    :param variances: the vector of variances :math:`\sigma^2_i`
+    :type variances: array or list of the appropriate size (or float if there
+                     is only one variance parameter)
+    :param delta: the offsets :math:`\delta_i` subtracted from the inputs
+                  before the product is taken; zeros when omitted
+    :type delta: array or list laid out like variances (or float)
+    :param ARD: Auto Relevance Determination. If False, the kernel has only one
+                variance parameter \sigma^2, otherwise there is one variance
+                parameter per dimension.
+    :type ARD: Boolean
+    :param active_dims: forwarded to the Kern constructor
+    :param name: forwarded to the Kern constructor
+    :rtype: kernel object
+
+    """
+
+    def __init__(self, input_dim, variances=None, delta=None, ARD=False, active_dims=None, name='linear'):
+        super(TruncLinear, self).__init__(input_dim, active_dims, name)
+        self.ARD = ARD
+        # One value per input dimension with ARD, a single shared value otherwise.
+        num_values = self.input_dim if ARD else 1
+
+        if variances is None:
+            variances = np.ones(num_values)
+        else:
+            variances = np.asarray(variances)
+            if ARD:
+                assert variances.size == self.input_dim, "bad number of variances, need one ARD variance per input_dim"
+            else:
+                assert variances.size == 1, "Only one variance needed for non-ARD kernel"
+
+        # delta is resolved independently of variances: a missing delta no
+        # longer turns into np.asarray(None), and a supplied delta is no
+        # longer discarded when variances is left at its default.
+        if delta is None:
+            delta = np.zeros(num_values)
+        else:
+            delta = np.asarray(delta)
+
+        self.variances = Param('variances', variances, Logexp())
+        self.delta = Param('delta', delta)
+        self.add_parameter(self.variances)
+        self.add_parameter(self.delta)
+
+    @Cache_this(limit=2)
+    def K(self, X, X2=None):
+        """Return the covariance matrix between X and X2 (X with itself if X2 is None)."""
+        XX = self.variances*self._product(X, X2)
+        return XX.sum(axis=-1)
+
+    @Cache_this(limit=2)
+    def _product(self, X, X2=None):
+        """Return the per-dimension truncated products, indexed (n, m, q).
+
+        Entry (n, m, q) is max(0, (X[n,q]-delta)*(X2[m,q]-delta)).
+        """
+        if X2 is None:
+            X2 = X
+        XX = np.einsum('nq,mq->nmq',X-self.delta,X2-self.delta)
+        # Truncation: negative products contribute nothing.
+        XX[XX<0] = 0
+        return XX
+
+    def Kdiag(self, X):
+        """Return the diagonal of K(X, X); a square is never negative, so no truncation is needed."""
+        return (self.variances*np.square(X-self.delta)).sum(axis=-1)
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        """Store the gradients of variances and delta given dL_dK for the full covariance."""
+        dK_dvar = self._product(X, X2)
+        if X2 is None:
+            X2=X
+        # d/d(delta) of (x-delta)(y-delta) is 2*delta-x-y, kept only where the
+        # product survived the truncation.
+        dK_ddelta = self.variances*(2*self.delta-X[:,None,:]-X2[None,:,:])*(dK_dvar>0)
+        if self.ARD:
+            self.variances.gradient[:] = np.einsum('nmq,nm->q',dK_dvar,dL_dK)
+            self.delta.gradient[:] = np.einsum('nmq,nm->q',dK_ddelta,dL_dK)
+        else:
+            self.variances.gradient[:] = np.einsum('nmq,nm->',dK_dvar,dL_dK)
+            self.delta.gradient[:] = np.einsum('nmq,nm->',dK_ddelta,dL_dK)
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        """Store the gradients of variances and delta given dL_dKdiag for the diagonal."""
+        if self.ARD:
+            self.variances.gradient[:] = np.einsum('nq,n->q',np.square(X-self.delta),dL_dKdiag)
+            self.delta.gradient[:] = np.einsum('nq,n->q',2*self.variances*(self.delta-X),dL_dKdiag)
+        else:
+            self.variances.gradient[:] = np.einsum('nq,n->',np.square(X-self.delta),dL_dKdiag)
+            self.delta.gradient[:] = np.einsum('nq,n->',2*self.variances*(self.delta-X),dL_dKdiag)
+
+    def gradients_X(self, dL_dK, X, X2=None):
+        """Return the gradient with respect to X given dL_dK, one row per row of X."""
+        XX = self._product(X, X2)
+        if X2 is None:
+            Xp = (self.variances*(X-self.delta))*(XX>0)
+            # X appears as both arguments of the kernel, so both the row and
+            # the column contributions are summed.
+            return np.einsum('nmq,nm->nq',Xp,dL_dK)+np.einsum('mnq,nm->mq',Xp,dL_dK)
+        Xp = (self.variances*(X2-self.delta))*(XX>0)
+        return np.einsum('nmq,nm->nq',Xp,dL_dK)
+
+    def gradients_X_diag(self, dL_dKdiag, X):
+        """Return the gradient of the diagonal with respect to X given dL_dKdiag."""
+        return 2.*self.variances*dL_dKdiag[:,None]*(X-self.delta)
+
+    def input_sensitivity(self):
+        """Return the variances expanded to one entry per input dimension."""
+        return np.ones(self.input_dim) * self.variances
+
+class TruncLinear_inf(Kern):
+    r"""
+    Truncated Linear kernel in closed form over an interval
+
+    With :math:`(a, b)` taken from ``interval``:
+
+    .. math::
+
+       k(x,y) = \sum_{i=1}^{input\_dim} \sigma^2_i \left( \frac{|x_i-y_i|^3}{6} + x_i y_i (b-a) - (x_i+y_i) \frac{b^2-a^2}{2} + \frac{b^3-a^3}{3} \right)
+
+    NOTE(review): this looks like the TruncLinear kernel integrated over its
+    offset delta on [a, b] (the two agree when the inputs lie inside the
+    interval) -- confirm.
+
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
+    :param interval: the pair (a, b); only ``interval[0]`` and ``interval[1]`` are read
+    :type interval: indexable pair of numbers
+    :param variances: the vector of variances :math:`\sigma^2_i`
+    :type variances: array or list of the appropriate size (or float if there
+                     is only one variance parameter)
+    :param ARD: Auto Relevance Determination. If False, the kernel has only one
+                variance parameter \sigma^2, otherwise there is one variance
+                parameter per dimension.
+    :type ARD: Boolean
+    :param active_dims: forwarded to the Kern constructor
+    :param name: forwarded to the Kern constructor
+    :rtype: kernel object
+
+    """
+
+    def __init__(self, input_dim, interval, variances=None, ARD=False, active_dims=None, name='linear'):
+        super(TruncLinear_inf, self).__init__(input_dim, active_dims, name)
+        self.interval = interval
+        self.ARD = ARD
+        if not ARD:
+            if variances is not None:
+                variances = np.asarray(variances)
+                assert variances.size == 1, "Only one variance needed for non-ARD kernel"
+            else:
+                variances = np.ones(1)
+        else:
+            if variances is not None:
+                variances = np.asarray(variances)
+                assert variances.size == self.input_dim, "bad number of variances, need one ARD variance per input_dim"
+            else:
+                variances = np.ones(self.input_dim)
+
+        self.variances = Param('variances', variances, Logexp())
+        self.add_parameter(self.variances)
+
+    def _interval_terms(self):
+        """Return (b-a, b^2-a^2, (b^3-a^3)/3) for interval = (a, b).
+
+        The float divisor keeps the last term exact for integer bounds under
+        Python 2 integer division.
+        """
+        lower, upper = self.interval[0], self.interval[1]
+        return upper-lower, upper*upper-lower*lower, (upper**3-lower**3)/3.
+
+    def _diag_product(self, X):
+        """Return the per-dimension kernel terms for x == y, indexed (n, q)."""
+        width, sq_diff, cube_third = self._interval_terms()
+        return np.square(X)*width-X*sq_diff+cube_third
+
+#     @Cache_this(limit=2)
+    def K(self, X, X2=None):
+        """Return the covariance matrix between X and X2 (X with itself if X2 is None)."""
+        tmp = self._product(X, X2)
+        return (self.variances*tmp).sum(axis=-1)
+
+#     @Cache_this(limit=2)
+    def _product(self, X, X2=None):
+        """Return the per-dimension kernel terms before variance scaling, indexed (n, m, q)."""
+        if X2 is None:
+            X2 = X
+        width, sq_diff, cube_third = self._interval_terms()
+        X_X2 = X[:,None,:]-X2[None,:,:]
+        tmp = np.abs(X_X2**3)/6.+np.einsum('nq,mq->nmq',X,X2)*width \
+              -(X[:,None,:]+X2[None,:,:])*sq_diff/2.+cube_third
+        return tmp
+
+    def Kdiag(self, X):
+        """Return the diagonal of K(X, X)."""
+        return (self.variances*self._diag_product(X)).sum(axis=-1)
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        """Store the gradient of variances given dL_dK for the full covariance."""
+        dK_dvar = self._product(X, X2)
+        if self.ARD:
+            self.variances.gradient[:] = np.einsum('nmq,nm->q',dK_dvar,dL_dK)
+        else:
+            self.variances.gradient[:] = np.einsum('nmq,nm->',dK_dvar,dL_dK)
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        """Store the gradient of variances given dL_dKdiag for the diagonal."""
+        tmp = self._diag_product(X)
+        if self.ARD:
+            self.variances.gradient[:] = np.einsum('nq,n->q',tmp,dL_dKdiag)
+        else:
+            self.variances.gradient[:] = np.einsum('nq,n->',tmp,dL_dKdiag)
+
+    def gradients_X(self, dL_dK, X, X2=None):
+        """Return the gradient with respect to X given dL_dK, one row per row of X.
+
+        Uses the derivative of the closed form in ``_product`` with respect
+        to its first argument:
+        (x-y)|x-y|/2 + y(b-a) - (b^2-a^2)/2.
+        """
+        symmetric = X2 is None
+        if symmetric:
+            X2 = X
+        width, sq_diff, _ = self._interval_terms()
+        X_X2 = X[:,None,:]-X2[None,:,:]
+        dK_dX = self.variances*(X_X2*np.abs(X_X2)/2.+X2[None,:,:]*width-sq_diff/2.)
+        grad = np.einsum('nmq,nm->nq',dK_dX,dL_dK)
+        if symmetric:
+            # X is also the second argument; the kernel is symmetric, so the
+            # same derivative is paired with the transposed dL_dK.
+            grad = grad+np.einsum('nmq,mn->nq',dK_dX,dL_dK)
+        return grad
+
+    def gradients_X_diag(self, dL_dKdiag, X):
+        """Return the gradient of the diagonal with respect to X given dL_dKdiag.
+
+        Differentiates the expression used by ``Kdiag``:
+        d/dx [x^2(b-a) - x(b^2-a^2)] = 2x(b-a) - (b^2-a^2).
+        """
+        width, sq_diff, _ = self._interval_terms()
+        return self.variances*dL_dKdiag[:,None]*(2.*X*width-sq_diff)
+
+    def input_sensitivity(self):
+        """Return the variances expanded to one entry per input dimension."""
+        return np.ones(self.input_dim) * self.variances
+
+