mirror of https://github.com/SheffieldML/GPy.git
synced 2026-05-06 10:32:39 +02:00

commit 60fd1c55dc: Merge changes for model.py and optimization.py on comments.
12 changed files with 336 additions and 195 deletions
@@ -18,6 +18,7 @@ class model(parameterised):
         self.optimization_runs = []
         self.sampling_runs = []
         self.set_param(self.get_param())
+        self.preferred_optimizer = 'tnc'
     def get_param(self):
         raise NotImplementedError, "this needs to be implemented to utilise the model class"
     def set_param(self,x):

@@ -161,15 +162,18 @@ class model(parameterised):
        else:
            self.expand_param(initial_parameters)

-    def optimize(self, optimizer = 'tnc', start = None, **kwargs):
+    def optimize(self, optimizer=None, start=None, **kwargs):
        """
        Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
        kwargs are passed to the optimizer. They can be:

        :max_f_eval: maximum number of function evaluations
        :messages: whether to display during optimisation
+        :param optimizer: which optimizer to use (defaults to self.preferred_optimizer)
+        :type optimizer: string TODO: valid strings?
        """
+        if optimizer is None:
+            optimizer = self.preferred_optimizer

        def f(x):
            self.expand_param(x)
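A minimal usage sketch of the behaviour added above (my own example, not part of the commit; the model class, kernel constructors and constraint pattern are borrowed from the demos later in this diff): with the new default, optimize() falls back to the model's preferred_optimizer.

import numpy as np
import GPy

X = np.linspace(0, 1, 50)[:, None]
Y = np.sin(6 * X) + 0.05 * np.random.randn(50, 1)
kernel = GPy.kern.rbf(1, 1.0, 0.2) + GPy.kern.white(1, 0.001)
m = GPy.models.GP_regression(X, Y, kernel)
m.constrain_positive('(variance|lengthscale|precision)')
m.optimize()       # no optimizer given: falls back to m.preferred_optimizer ('tnc')
m.optimize('tnc')  # equivalent, naming the optimizer explicitly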
@@ -9,6 +9,7 @@ np.random.seed(1)
print "sparse GPLVM with RBF kernel"

N = 100
+M = 4
Q = 1
D = 2
#generate GPLVM-like data

@@ -17,7 +18,7 @@ k = GPy.kern.rbf(Q, 1.0, 2.0) + GPy.kern.white(Q, 0.00001)
K = k.K(X)
Y = np.random.multivariate_normal(np.zeros(N),K,D).T

-m = GPy.models.sparse_GPLVM(Y, Q, M = 10)
+m = GPy.models.sparse_GPLVM(Y, Q, M=M)
m.constrain_positive('(rbf|bias|noise)')
m.constrain_bounded('white', 1e-3, 0.1)
# m.plot()
@@ -12,6 +12,7 @@ import GPy
np.random.seed(2)
pb.ion()
N = 500
+M = 5

######################################
## 1 dimensional example

@@ -26,7 +27,7 @@ noise = GPy.kern.white(1)
kernel = rbf + noise

# create simple GP model
-m1 = GPy.models.sparse_GP_regression(X,Y,kernel, M = 10)
+m1 = GPy.models.sparse_GP_regression(X, Y, kernel, M=M)

# contrain all parameters to be positive
m1.constrain_positive('(variance|lengthscale|precision)')
@@ -23,8 +23,7 @@ class Optimizer():
    :rtype: optimizer object.

    """
-    def __init__(self, x_init, f_fp, f, fp , messages = False, max_f_eval = 1e4, ftol = None, gtol = None, xtol = None):
+    def __init__(self, x_init, f_fp, f, fp , messages=False, max_f_eval=1e4, ftol=None, gtol=None, xtol=None):
        self.opt_name = None
        self.f_fp = f_fp
        self.f = f
@@ -39,11 +39,13 @@ class Matern32(kernpart):
    def get_param(self):
        """return the value of the parameters."""
        return np.hstack((self.variance,self.lengthscales))

    def set_param(self,x):
        """set the value of the parameters."""
        assert x.size==(self.D+1)
        self.variance = x[0]
        self.lengthscales = x[1:]

    def get_param_names(self):
        """return parameter names."""
        if self.D==1:

@@ -56,10 +58,37 @@ class Matern32(kernpart):
        if X2 is None: X2 = X
        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
        np.add(self.variance*(1+np.sqrt(3.)*dist)*np.exp(-np.sqrt(3.)*dist), target,target)

    def Kdiag(self,X,target):
        """Compute the diagonal of the covariance matrix associated to X."""
        np.add(target,self.variance,target)

+    def dK_dtheta(self,partial,X,X2,target):
+        """derivative of the covariance matrix with respect to the parameters."""
+        if X2 is None: X2 = X
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
+        dvar = (1+np.sqrt(3.)*dist)*np.exp(-np.sqrt(3.)*dist)
+        invdist = 1./np.where(dist!=0.,dist,np.inf)
+        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3
+        dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
+        target[0] += np.sum(dvar*partial)
+        target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
+
+    def dKdiag_dtheta(self,partial,X,target):
+        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
+        target[0] += np.sum(partial)
+
+    def dK_dX(self,X,X2,target):
+        """derivative of the covariance matrix with respect to X."""
+        if X2 is None: X2 = X
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None]
+        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf)
+        dK_dX += - np.transpose(3*self.variance*dist*np.exp(-np.sqrt(3)*dist)*ddist_dX,(1,0,2))
+        target += np.sum(dK_dX*partial.T[:,:,None],0)
+
+    def dKdiag_dX(self,X,target):
+        pass

    def Gram_matrix(self,F,F1,F2,lower,upper):
        """
        Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to D=1.

@@ -87,25 +116,3 @@ class Matern32(kernpart):
        #return(G)
        return(self.lengthscales**3/(12.*np.sqrt(3)*self.variance) * G + 1./self.variance*np.dot(Flower,Flower.T) + self.lengthscales**2/(3.*self.variance)*np.dot(F1lower,F1lower.T))

-    def dK_dtheta(self,X,X2,target):
-        """derivative of the cross-covariance matrix with respect to the parameters (shape is NxMxNparam)"""
-        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
-        dvar = (1+np.sqrt(3.)*dist)*np.exp(-np.sqrt(3.)*dist)
-        invdist = 1./np.where(dist!=0.,dist,np.inf)
-        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3
-        dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
-        np.add(target[:,:,0],dvar, target[:,:,0])
-        np.add(target[:,:,1:],dl, target[:,:,1:])
-    def dKdiag_dtheta(self,X,target):
-        """derivative of the diagonal of the covariance matrix with respect to the parameters (shape is NxNparam)"""
-        np.add(target[:,0],1.,target[:,0])
-    def dK_dX(self,X,X2,target):
-        """derivative of the covariance matrix with respect to X (*! shape is NxMxD !*)."""
-        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None]
-        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf)
-        target += - np.transpose(3*self.variance*dist*np.exp(-np.sqrt(3)*dist)*ddist_dX,(1,0,2))
-    def dKdiag_dX(self,X,target):
-        pass
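The change running through all of these kernel files is a new gradient convention: dK_dtheta and its companions now take a partial array (dL/dK, supplied by the calling model) and accumulate one scalar per kernel parameter, instead of filling an N x M x Ntheta tensor. A small self-contained illustration of that contraction (my own sketch, not code from the diff):

import numpy as np

N, M = 4, 3
partial = np.random.randn(N, M)      # dL/dK, supplied by the calling model
dK_dvar = np.random.randn(N, M)      # dK/dvariance, one entry per element of K
dK_dlen = np.random.randn(N, M, 2)   # dK/dlengthscale for a 2-d ARD kernel
target = np.zeros(3)                 # one slot per kernel parameter

target[0] += np.sum(dK_dvar * partial)                        # chain rule: sum_ij dL/dK_ij * dK_ij/dvariance
target[1:] += (dK_dlen * partial[:, :, None]).sum(0).sum(0)   # same contraction, one entry per lengthscale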
@@ -33,33 +33,61 @@ class Matern52(kernpart):
        self.Nparam = self.D + 1
        self.name = 'Mat52'
        self.set_param(np.hstack((variance,lengthscales)))
-        self._Z, self._mu, self._S = np.empty(shape=(3,1)) # cached versions of Z,mu,S

    def get_param(self):
        """return the value of the parameters."""
        return np.hstack((self.variance,self.lengthscales))

    def set_param(self,x):
        """set the value of the parameters."""
        assert x.size==(self.D+1)
        self.variance = x[0]
        self.lengthscales = x[1:]
-        self.lengthscales2 = np.square(self.lengthscales)
-        self._Z, self._mu, self._S = np.empty(shape=(3,1)) # cached versions of Z,mu,S

    def get_param_names(self):
        """return parameter names."""
        if self.D==1:
            return ['variance','lengthscale']
        else:
            return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscales.size)]

    def K(self,X,X2,target):
        """Compute the covariance matrix between X and X2."""
        if X2 is None: X2 = X
        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
        np.add(self.variance*(1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist), target,target)

    def Kdiag(self,X,target):
        """Compute the diagonal of the covariance matrix associated to X."""
        np.add(target,self.variance,target)

+    def dK_dtheta(self,partial,X,X2,target):
+        """derivative of the covariance matrix with respect to the parameters."""
+        if X2 is None: X2 = X
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
+        invdist = 1./np.where(dist!=0.,dist,np.inf)
+        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3
+        dvar = (1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist)
+        dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
+        target[0] += np.sum(dvar*partial)
+        target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
+
+    def dKdiag_dtheta(self,X,target):
+        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
+        target[0] += np.sum(partial)
+
+    def dK_dX(self,X,X2,target):
+        """derivative of the covariance matrix with respect to X."""
+        if X2 is None: X2 = X
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None]
+        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf)
+        dK_dX += - np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2))
+        target += np.sum(dK_dX*partial.T[:,:,None],0)
+
+    def dKdiag_dX(self,X,target):
+        pass

    def Gram_matrix(self,F,F1,F2,F3,lower,upper):
        """
        Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to D=1.

@@ -91,26 +119,5 @@ class Matern52(kernpart):
        orig2 = 3./5*self.lengthscales**2 * ( np.dot(F1lower,F1lower.T) + 1./8*np.dot(Flower,F2lower.T) + 1./8*np.dot(F2lower,Flower.T))
        return(1./self.variance* (G_coef*G + orig + orig2))

-    def dK_dtheta(self,X,X2,target):
-        """derivative of the cross-covariance matrix with respect to the parameters (shape is NxMxNparam)"""
-        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
-        invdist = 1./np.where(dist!=0.,dist,np.inf)
-        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3
-        dvar = (1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist)
-        dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
-        np.add(target[:,:,0],dvar, target[:,:,0])
-        np.add(target[:,:,1:],dl, target[:,:,1:])
-    def dKdiag_dtheta(self,X,target):
-        """derivative of the diagonal of the covariance matrix with respect to the parameters (shape is NxNparam)"""
-        np.add(target[:,0],1.,target[:,0])
-    def dK_dX(self,X,X2,target):
-        """derivative of the covariance matrix with respect to X (*! shape is NxMxD !*)."""
-        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None]
-        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf)
-        target += - np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2))
-    def dKdiag_dX(self,X,target):
-        pass
@@ -62,7 +62,7 @@ class exponential(kernpart):
        np.add(target,self.variance,target)

    def dK_dtheta(self,partial,X,X2,target):
-        """derivative of the cross-covariance matrix with respect to the parameters (shape is NxMxNparam)"""
+        """derivative of the covariance matrix with respect to the parameters."""
        if X2 is None: X2 = X
        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
        invdist = 1./np.where(dist!=0.,dist,np.inf)

@@ -73,12 +73,12 @@ class exponential(kernpart):
        target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)

    def dKdiag_dtheta(self,partial,X,target):
-        """derivative of the diagonal of the covariance matrix with respect to the parameters (shape is NxNparam)"""
+        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
        #NB: derivative of diagonal elements wrt lengthscale is 0
        target[0] += np.sum(partial)

    def dK_dX(self,X,X2,target):
-        """derivative of the covariance matrix with respect to X (*! shape is NxMxD !*)."""
+        """derivative of the covariance matrix with respect to X."""
        if X2 is None: X2 = X
        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None]
        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf)
@@ -21,6 +21,7 @@ class linear(kernpart):
        self.Nparam = 1
        self.name = 'linear'
        self.set_param(variance)
+        self._Xcache, self._X2cache = np.empty(shape=(2,))

    def get_param(self):
        return self.variance

@@ -32,7 +33,8 @@ class linear(kernpart):
        return ['variance']

    def K(self,X,X2,target):
-        target += self.variance * np.dot(X, X2.T)
+        self._K_computations(X, X2)
+        target += self.variance * self._dot_product

    def Kdiag(self,X,target):
        np.add(target,np.sum(self.variance*np.square(X),-1),target)

@@ -42,7 +44,9 @@ class linear(kernpart):
        Computes the derivatives wrt theta
        Return shape is NxMx(Ntheta)
        """
-        product = np.dot(X, X2.T)
+        self._K_computations(X, X2)
+        product = self._dot_product
+        # product = np.dot(X, X2.T)
        target += np.sum(product*partial)

    def dK_dX(self,partial,X,X2,target):

@@ -51,6 +55,20 @@ class linear(kernpart):
    def dKdiag_dtheta(self,partial,X,target):
        target += np.sum(partial*np.square(X).sum(1))

+    def _K_computations(self,X,X2):
+        # (Nicolo) changed the logic here. If X2 is None, we want to cache
+        # (X,X). In practice X2 should always be passed.
+        if X2 is None:
+            X2 = X
+        if not (np.all(X==self._Xcache) and np.all(X2==self._X2cache)):
+            self._Xcache = X
+            self._X2cache = X2
+            self._dot_product = np.dot(X,X2.T)
+        else:
+            # print "Cache hit!"
+            pass # TODO: insert debug message here (logging framework)

    # def psi0(self,Z,mu,S,target):
    #     expected = np.square(mu) + S
    #     np.add(target,np.sum(self.variance*expected),target)
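The _K_computations helper added above memoizes np.dot(X, X2.T) between calls. A standalone sketch of the same caching pattern (an illustration under assumed names, not code from the repository), written so the comparison also copes with inputs whose shapes change:

import numpy as np

class DotCache(object):
    """Recompute np.dot(X, X2.T) only when the inputs actually change."""
    def __init__(self):
        self._Xcache = np.empty((0,))
        self._X2cache = np.empty((0,))
        self._dot_product = None

    def dot(self, X, X2=None):
        if X2 is None:
            X2 = X
        changed = (X.shape != self._Xcache.shape or X2.shape != self._X2cache.shape
                   or not (np.all(X == self._Xcache) and np.all(X2 == self._X2cache)))
        if changed:
            # cache copies of the inputs and the product for the next call
            self._Xcache, self._X2cache = X.copy(), X2.copy()
            self._dot_product = np.dot(X, X2.T)
        return self._dot_product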
@@ -30,6 +30,7 @@ class rbf_ARD(kernpart):

    def get_param(self):
        return np.hstack((self.variance,self.lengthscales))

    def set_param(self,x):
        assert x.size==(self.D+1)
        self.variance = x[0]

@@ -37,61 +38,73 @@ class rbf_ARD(kernpart):
        self.lengthscales2 = np.square(self.lengthscales)
        #reset cached results
        self._Z, self._mu, self._S = np.empty(shape=(3,1)) # cached versions of Z,mu,S

    def get_param_names(self):
        if self.D==1:
            return ['variance','lengthscale']
        else:
            return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscales.size)]

    def K(self,X,X2,target):
        self._K_computations(X,X2)
        np.add(self.variance*self._K_dvar, target,target)

    def Kdiag(self,X,target):
        np.add(target,self.variance,target)

-    def dK_dtheta(self,X,X2,target):
-        """Return shape is NxMx(Ntheta)"""
+    def dK_dtheta(self,partial,X,X2,target):
        self._K_computations(X,X2)
        dl = self._K_dvar[:,:,None]*self.variance*self._K_dist2/self.lengthscales
-        np.add(target[:,:,0],self._K_dvar, target[:,:,0])
-        np.add(target[:,:,1:],dl, target[:,:,1:])
+        target[0] += np.sum(self._K_dvar*partial)
+        target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)

    def dKdiag_dtheta(self,X,target):
-        np.add(target[:,0],1.,target[:,0])
+        target[0] += np.sum(partial)

    def dK_dX(self,X,X2,target):
        self._K_computations(X,X2)
        dZ = self.variance*self._K_dvar[:,:,None]*self._K_dist/self.lengthscales2
-        np.add(target,-dZ.transpose(1,0,2),target)
+        dK_dX = -dZ.transpose(1,0,2)
+        target += np.sum(dK_dX*partial.T[:,:,None],0)

+    def dKdiag_dX(self,partial,X,target):
+        pass

    def psi0(self,Z,mu,S,target):
-        np.add(target, self.variance, target)
+        target += self.variance

-    def dpsi0_dtheta(self,Z,mu,S,target):
-        target[:,0] += 1.
+    def dpsi0_dtheta(self,partial,Z,mu,S,target):
+        target[0] += 1.

    def dpsi0_dmuS(self,Z,mu,S,target_mu,target_S):
        pass

    def psi1(self,Z,mu,S,target):
-        """Think N,M,Q """
        self._psi_computations(Z,mu,S)
        np.add(target, self._psi1,target)

-    def dpsi1_dtheta(self,Z,mu,S,target):
-        """N,Q,(Ntheta)"""
+    def dpsi1_dtheta(self,partial,Z,mu,S,target):
        self._psi_computations(Z,mu,S)
        denom_deriv = S[:,None,:]/(self.lengthscales**3+self.lengthscales*S[:,None,:])
        d_length = self._psi1[:,:,None]*(self.lengthscales*np.square(self._psi1_dist/(self.lengthscales2+S[:,None,:])) + denom_deriv)
-        target[:,:,0] += self._psi1/self.variance
-        target[:,:,1:] += d_length
+        target[0] += np.sum(partial*self._psi1/self.variance)
+        target[1:] += (d_length*partial[:,:,None]).sum(0).sum(0)

-    def dpsi1_dZ(self,Z,mu,S,target):
+    def dpsi1_dZ(self,partial,Z,mu,S,target):
        self._psi_computations(Z,mu,S)
        np.add(target,-self._psi1[:,:,None]*self._psi1_dist/self.lengthscales2/self._psi1_denom,target)
+        target += np.sum(partial[:,:,None]*-self._psi1[:,:,None]*self._psi1_dist/self.lengthscales2/self._psi1_denom,0)

-    def dpsi1_dmuS(self,Z,mu,S,target_mu,target_S):
+    def dpsi1_dmuS(self,partial,Z,mu,S,target_mu,target_S):
        """return shapes are N,M,Q"""
        self._psi_computations(Z,mu,S)
        tmp = self._psi1[:,:,None]/self.lengthscales2/self._psi1_denom
-        np.add(target_mu,tmp*self._psi1_dist,target_mu)
-        np.add(target_S, 0.5*tmp*(self._psi1_dist_sq-1), target_S)
+        target_mu += np.sum(partial*tmp*self._psi1_dist,1)
+        target_S += np.sum(partial*0.5*tmp*(self._psi1_dist_sq-1),1)

    def psi2(self,Z,mu,S,target):
-        """shape N,M,M"""
        self._psi_computations(Z,mu,S)
-        np.add(target, self._psi2,target)
+        target += self._psi2.sum(0) #TODO: psi2 should be NxMxM (for het. noise)

    def dpsi2_dtheta(self,Z,mu,S,target):
        """Shape N,M,M,Ntheta"""

@@ -99,21 +112,21 @@ class rbf_ARD(kernpart):
        d_var = np.sum(2.*self._psi2/self.variance,0)
        d_length = self._psi2[:,:,:,None]*(0.5*self._psi2_Zdist_sq*self._psi2_denom + 2.*self._psi2_mudist_sq + 2.*S[:,None,None,:]/self.lengthscales2)/(self.lengthscales*self._psi2_denom)
        d_length = d_length.sum(0)
-        target[:,:,0] += d_var
-        target[:,:,1:] += d_length
+        target[0] += np.sum(partial*d_var)
+        target[1:] += (d_length*partial[:,:,None]).sum(0).sum(0)

    def dpsi2_dZ(self,Z,mu,S,target):
        """Returns shape N,M,M,Q"""
        self._psi_computations(Z,mu,S)
        dZ = self._psi2[:,:,:,None]/self.lengthscales2*(-0.5*self._psi2_Zdist + self._psi2_mudist/self._psi2_denom)
-        target += dZ
+        target += np.sum(partial[None,:,:,None]*dZ,0).sum(1)

    def dpsi2_dmuS(self,Z,mu,S,target_mu,target_S):
        """Think N,M,M,Q """
        self._psi_computations(Z,mu,S)
        tmp = self._psi2[:,:,:,None]/self.lengthscales2/self._psi2_denom
-        np.add(target_mu, -tmp*(2.*self._psi2_mudist),target_mu) #N,M,M,Q
-        np.add(target_S, tmp*(2.*self._psi2_mudist_sq-1), target_S) #N,M,M,Q
+        target_mu += (partial*-tmp*2.*self._psi2_mudist).sum(1).sum(1)
+        target_S += (partial*tmp*(2.*self._psi2_mudist_sq-1)).sum(1).sum(1)

    def _K_computations(self,X,X2):
        if not (np.all(X==self._X) and np.all(X2==self._X2)):
@@ -10,6 +10,10 @@ from .. import kern
from ..inference.likelihoods import likelihood
from GP_regression import GP_regression

+#Still TODO:
+# make use of slices properly (kernel can now do this)
+# enable heteroscedastic noise (kernel will need to compute psi2 as a (NxMxM) array)

class sparse_GP_regression(GP_regression):
    """
    Variational sparse GP model (Regression)

@@ -22,6 +26,8 @@ class sparse_GP_regression(GP_regression):
    :type kernel: a GPy kernel
    :param Z: inducing inputs (optional, see note)
    :type Z: np.ndarray (M x Q) | None
+    :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
+    :type X_uncertainty: np.ndarray (N x Q) | None
    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
    :param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
    :type M: int

@@ -31,7 +37,7 @@ class sparse_GP_regression(GP_regression):
    :type normalize_(X|Y): bool
    """

-    def __init__(self,X,Y,kernel=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False):
+    def __init__(self,X,Y,kernel=None, X_uncertainty=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False):
        self.beta = beta
        if Z is None:
            self.Z = np.random.permutation(X.copy())[:M]

@@ -40,10 +46,20 @@ class sparse_GP_regression(GP_regression):
            assert Z.shape[1]==X.shape[1]
            self.Z = Z
            self.M = Z.shape[1]
+        if X_uncertainty is None:
+            self.has_uncertain_inputs=False
+        else:
+            assert X_uncertainty.shape==X.shape
+            self.has_uncertain_inputs=False
+            self.X_uncertainty = X_uncertainty

-        GP_regression.__init__(self, X, Y, kernel = kernel, normalize_X = normalize_X, normalize_Y = normalize_Y)
+        GP_regression.__init__(self, X, Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y)
        self.trYYT = np.sum(np.square(self.Y))

+        #normalise X uncertainty also
+        if self.has_uncertain_inputs:
+            self.X_uncertainty /= np.square(self._Xstd)

    def set_param(self, p):
        self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
        self.beta = p[self.M*self.Q]

@@ -54,18 +70,23 @@ class sparse_GP_regression(GP_regression):

    def _compute_kernel_matrices(self):
        # kernel computations, using BGPLVM notation
-        #TODO: the following can be switched out in the case of uncertain inputs (or the BGPLVM!)
        #TODO: slices for psi statistics (easy enough)

        self.Kmm = self.kern.K(self.Z)
-        self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum()
-        self.psi1 = self.kern.K(self.Z,self.X)
-        self.psi2 = np.dot(self.psi1,self.psi1.T)
+        if self.has_uncertain_inputs:
+            self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum()
+            self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
+            self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
+        else:
+            self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum()
+            self.psi1 = self.kern.K(self.Z,self.X)
+            self.psi2 = np.dot(self.psi1,self.psi1.T)

    def _computations(self):
        # TODO find routine to multiply triangular matrices
-        self.psi1Y = np.dot(self.psi1, self.Y)
-        self.psi1YYpsi1 = np.dot(self.psi1Y, self.psi1Y.T)
+        self.V = self.beta*self.Y
+        self.psi1V = np.dot(self.psi1, self.V)
+        self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
        self.Lm = jitchol(self.Kmm)
        self.Lmi = chol_inv(self.Lm)
        self.Kmmi = np.dot(self.Lmi.T, self.Lmi)

@@ -77,25 +98,19 @@ class sparse_GP_regression(GP_regression):
        self.LLambdai = np.dot(self.LBi, self.Lmi)
        self.trace_K = self.psi0 - np.trace(self.A)
        self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi)
-        self.C = mdot(self.LLambdai, self.psi1Y)
-        self.G = mdot(self.LBL_inv, self.psi1YYpsi1, self.LBL_inv.T)
+        self.C = mdot(self.LLambdai, self.psi1V)
+        self.G = mdot(self.LBL_inv, self.psi1VVpsi1, self.LBL_inv.T)

-        # Computes dL_dpsi
+        # Compute dL_dpsi
        self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N)
-        dC_dpsi1 = (self.LLambdai.T[:,:, None, None] * self.Y) # this is sane.
-        tmp = (dC_dpsi1*self.C[None,:,None,:]).sum(1).sum(-1)
-        self.dL_dpsi1 = self.beta2 * tmp
-        self.dL_dpsi2 = (- 0.5 * self.D * self.beta * (self.LBL_inv - self.Kmmi)
-            - self.beta**3 * 0.5 * self.G)
+        dC_dpsi1 = (self.LLambdai.T[:,:, None, None] * self.V)
+        self.dL_dpsi1 = (dC_dpsi1*self.C[None,:,None,:]).sum(1).sum(-1)
+        self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G)

-        # Computes dL_dKmm TODO: nicer precomputations
-        tmp = self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi)
-        self.dL_dKmm = -self.beta * self.D * 0.5 * mdot(self.Lmi.T, self.A, self.Lmi) # dB
-        self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - tmp - tmp.T + self.Kmmi) # dC
-        tmp = (mdot(self.LBL_inv, self.psi1YYpsi1, self.Kmmi)
-            - self.beta*mdot(self.G, self.psi2, self.Kmmi))
-        self.dL_dKmm += -0.5*self.beta2*(tmp + tmp.T - self.G) # dE
+        # Compute dL_dKmm
+        self.dL_dKmm = -0.5 * self.beta * self.D * mdot(self.Lmi.T, self.A, self.Lmi) # dB
+        self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi) + self.Kmmi) # dC
+        self.dL_dKmm += np.dot(np.dot(self.G,self.beta*self.psi2) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE

    def get_param(self):
        return np.hstack([self.Z.flatten(),self.beta,self.kern.extract_param()])

@@ -104,65 +119,83 @@ class sparse_GP_regression(GP_regression):
        return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern.extract_param_names()

    def log_likelihood(self):
+        """
+        Compute the (lower bound on the) log marginal likelihood
+        """
        A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta))
        B = -0.5*self.beta*self.D*self.trace_K
        C = -self.D * np.sum(np.log(np.diag(self.LB)))
        D = -0.5*self.beta*self.trYYT
-        E = +0.5*self.beta2*np.sum(self.psi1YYpsi1 * self.LBL_inv)
+        E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv)

        return A+B+C+D+E

    def dL_dbeta(self):
-        """ compute the gradient of the log likelihood wrt beta.
-        TODO: suport heteroscedatic noise"""
+        """
+        Compute the gradient of the log likelihood wrt beta.
+        TODO: support heteroscedastic noise
+        """

        dA_dbeta = 0.5 * self.N*self.D/self.beta
        dB_dbeta = - 0.5 * self.D * self.trace_K
        dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)
        dD_dbeta = - 0.5 * self.trYYT
-        tmp = mdot(self.LBi.T, self.LLambdai, self.psi1Y)
-        dE_dbeta = (self.beta * np.sum(np.square(self.C)) - 0.5 * self.beta2
-            * np.sum(self.A * np.dot(tmp, tmp.T)))
+        tmp = mdot(self.LBi.T, self.LLambdai, self.psi1V)
+        dE_dbeta = np.sum(np.square(self.C))/self.beta - 0.5 * np.sum(self.A * np.dot(tmp, tmp.T))

        return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta)

    def dL_dtheta(self):
-        #re-cast computations in psi2 back to psi1:
-        dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
+        """
+        Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel
+        """
        dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z)
-        dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X)
-        dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X)
+        if self.has_uncertain_inputs:
+            dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty)
+            dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
+            dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape
+        else:
+            #re-cast computations in psi2 back to psi1:
+            dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
+            dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X)
+            dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X)

        return dL_dtheta

    def dL_dZ(self):
-        #re-cast computations in psi2 back to psi1:
-        dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
+        """
+        The derivative of the bound wrt the inducing inputs Z
+        """
        dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z,)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ
-        dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X)
+        if self.has_uncertain_inputs:
+            dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
+            dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty)
+        else:
+            #re-cast computations in psi2 back to psi1:
+            dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
+            dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X)
        return dL_dZ

    def log_likelihood_gradients(self):
        return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()])

-    def _raw_predict(self,Xnew,slices):
+    def _raw_predict(self, Xnew, slices):
        """Internal helper function for making predictions, does not account for normalisation"""

        Kx = self.kern.K(self.Z, Xnew)
        Kxx = self.kern.K(Xnew)

-        mu = self.beta * mdot(Kx.T, self.LBL_inv, self.psi1Y)
+        mu = mdot(Kx.T, self.LBL_inv, self.psi1V)
        var = Kxx - mdot(Kx.T, (self.Kmmi - self.LBL_inv), Kx) + np.eye(Xnew.shape[0])/self.beta # TODO: This beta doesn't belong here in the EP case.
        return mu,var

-    def plot(self,*args,**kwargs):
+    def plot(self, *args, **kwargs):
        """
        Plot the fitted model: just call the GP_regression plot function and then add inducing inputs
        """
        GP_regression.plot(self,*args,**kwargs)
        if self.Q==1:
            pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12)
+            if self.has_uncertain_inputs:
+                pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten()))
        if self.Q==2:
            pb.plot(self.Z[:,0],self.Z[:,1],'wo')
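Taken together, these changes fold the old uncertain_input_GP_regression model (deleted below) into sparse_GP_regression through the new X_uncertainty argument: when it is given, the kernel's psi0/psi1/psi2 expectations replace Kdiag and K in the bound. A hypothetical usage sketch follows (my own example, reusing constructs from the demos in this diff; the 0.01 input variance is invented). Note that the constructor hunk above sets has_uncertain_inputs to False in the else branch where True appears intended, so the sketch assumes the flag behaves as named.

import numpy as np
import GPy

N, Q = 200, 1
X = np.random.rand(N, Q)
k = GPy.kern.rbf(Q, 1.0, 0.2) + GPy.kern.white(Q, 0.001)
Y = np.random.multivariate_normal(np.zeros(N), k.K(X), 1).T
X_var = 0.01 * np.ones((N, Q))  # Gaussian variance on each input (invented value)

m = GPy.models.sparse_GP_regression(X, Y, k, X_uncertainty=X_var, M=10)
m.constrain_positive('(variance|lengthscale|precision)')
m.optimize()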
@@ -1,70 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import numpy as np
-import pylab as pb
-from ..util.linalg import mdot, jitchol, chol_inv, pdinv
-from ..util.plot import gpplot
-from .. import kern
-from ..inference.likelihoods import likelihood
-from sparse_GP_regression import sparse_GP_regression
-
-class uncertain_input_GP_regression(sparse_GP_regression):
-    """
-    Variational sparse GP model (Regression) with uncertainty on the inputs
-
-    :param X: inputs
-    :type X: np.ndarray (N x Q)
-    :param X_uncertainty: uncertainty on X (Gaussian variances, assumed isotrpic)
-    :type X_uncertainty: np.ndarray (N x Q)
-    :param Y: observed data
-    :type Y: np.ndarray of observations (N x D)
-    :param kernel : the kernel/covariance function. See link kernels
-    :type kernel: a GPy kernel
-    :param Z: inducing inputs (optional, see note)
-    :type Z: np.ndarray (M x Q) | None
-    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
-    :param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
-    :type M: int
-    :param beta: noise precision. TODO> ignore beta if doing EP
-    :type beta: float
-    :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
-    :type normalize_(X|Y): bool
-    """
-
-    def __init__(self,X,Y,X_uncertainty,kernel=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False):
-        self.X_uncertainty = X_uncertainty
-        sparse_GP_regression.__init__(self, X, Y, kernel = kernel, beta = beta, normalize_X = normalize_X, normalize_Y = normalize_Y)
-        self.trYYT = np.sum(np.square(self.Y))
-
-    def _compute_kernel_matrices(self):
-        # kernel computations, using BGPLVM notation
-        #TODO: slices for psi statistics (easy enough)
-        self.Kmm = self.kern.K(self.Z)
-        self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum()
-        self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
-        self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
-
-    def dL_dtheta(self):
-        #re-cast computations in psi2 back to psi1:
-        dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z)
-        dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty)
-        dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
-        dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape
-        return dL_dtheta
-
-    def dL_dZ(self):
-        dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z,)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ
-        dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
-        dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty)
-        return dL_dZ
-
-    def plot(self,*args,**kwargs):
-        """
-        Plot the fitted model: just call the sparse GP_regression plot function and then add
-        markers to represent uncertainty on the inputs
-        """
-        sparse_GP_regression.plot(self,*args,**kwargs)
-        if self.Q==1:
-            pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten()))
GPy/models/uncollapsed_sparse_GP.py (new file, 128 lines)

@@ -0,0 +1,128 @@
+# Copyright (c) 2012 James Hensman
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+import pylab as pb
+from ..util.linalg import mdot, jitchol, chol_inv, pdinv
+from ..util.plot import gpplot
+from .. import kern
+from ..inference.likelihoods import likelihood
+from sparse_GP_regression import sparse_GP_regression
+
+class uncollapsed_sparse_GP(sparse_GP_regression):
+    """
+    Variational sparse GP model (Regression), where the approximating distribution q(u) is represented explicitly
+
+    :param X: inputs
+    :type X: np.ndarray (N x Q)
+    :param Y: observed data
+    :type Y: np.ndarray of observations (N x D)
+    :param q_u: canonical parameters of the distribution squashed into a 1D array
+    :type q_u: np.ndarray
+    :param kernel : the kernel/covariance function. See link kernels
+    :type kernel: a GPy kernel
+    :param Z: inducing inputs (optional, see note)
+    :type Z: np.ndarray (M x Q) | None
+    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
+    :param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
+    :type M: int
+    :param beta: noise precision. TODO> ignore beta if doing EP
+    :type beta: float
+    :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
+    :type normalize_(X|Y): bool
+    """
+
+    def __init__(self, X, Y, q_u=None, *args, **kwargs)
+        D = Y.shape[1]
+        if q_u is None:
+            if Z is None:
+                M = Z.shape[0]
+            else:
+                M=M
+            self.set_vb_param(np.hstack((np.ones(M*D)),np.eye(M).flatten()))
+        sparse_GP_regression.__init__(self, X, Y, *args, **kwargs)
+
+    def _computations(self):
+        self.V = self.beta*self.Y
+        self.psi1V = np.dot(self.psi1, self.V)
+        self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
+        self.Lm = jitchol(self.Kmm)
+        self.Lmi = chol_inv(self.Lm)
+        self.Kmmi = np.dot(self.Lmi.T, self.Lmi)
+        self.A = mdot(self.Lmi, self.psi2, self.Lmi.T)
+        self.B = np.eye(self.M) + self.beta * self.A
+        self.Lambda = mdot(self.Lmi.T,self.B,sel.Lmi)
+
+        # Compute dL_dpsi
+        self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N)
+        self.dL_dpsi1 =
+        self.dL_dpsi2 =
+
+        # Compute dL_dKmm
+        self.dL_dKmm =
+        self.dL_dKmm +=
+        self.dL_dKmm +=
+
+    def log_likelihood(self):
+        """
+        Compute the (lower bound on the) log marginal likelihood
+        """
+        A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta))
+        B = -0.5*self.beta*self.D*self.trace_K
+        C = -self.D *(self.Kmm_hld +0.5*np.sum(self.Lambda * self.mmT_S) + self.M/2.)
+        E = -0.5*self.beta*self.trYYT
+        F = np.sum(np.dot(self.V.T,self.projected_mean))
+        return A+B+C+D+E+F
+
+    def dL_dbeta(self):
+        """
+        Compute the gradient of the log likelihood wrt beta.
+        TODO: support heteroscedastic noise
+        """
+        dA_dbeta = 0.5 * self.N*self.D/self.beta
+        dB_dbeta = - 0.5 * self.D * self.trace_K
+        dC_dbeta = - 0.5 * self.D * #TODO
+        dD_dbeta = - 0.5 * self.trYYT
+
+        return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta)
+
+    def _raw_predict(self, Xnew, slices):
+        """Internal helper function for making predictions, does not account for normalisation"""
+
+        #TODO
+        return mu,var
+
+    def set_vb_param(self,vb_param):
+        """set the distribution q(u) from the canonical parameters"""
+        self.q_u_prec = -2.*vb_param[self.M*self.D:].reshape(self.M,self.M)
+        self.q_u_prec_L = jitchol(self.q_u_prec)
+        self.q_u_cov_L = chol_inv(self.q_u_prec_L)
+        self.q_u_cov = np.dot(self.q_u_cov_L,self.q_u_cov_L.T)
+        self.q_u_mean = -2.*np.dot(self.q_u_cov,vb_param[:self.M*self.D].reshape(self.M,self.D))
+
+        self.q_u_expectation = (self.q_u_mean, np.dot(self.q_u_mean,self.q_u_mean.T)+self.q_u_cov)
+
+        self.q_u_canonical = (np.dot(self.q_u_prec, self.q_u_mean),-0.5*self.q_u_prec)
+        #TODO: computations now?
+
+    def get_vb_param(self):
+        """
+        Return the canonical parameters of the distribution q(u)
+        """
+        return np.hstack([e.flatten() for e in self.q_u_canonical])
+
+    def vb_grad_natgrad(self):
+        """
+        Compute the gradients of the lower bound wrt the canonical and
+        Expectation parameters of u.
+
+        Note that the natural gradient in either is given by the gradient in the other (See Hensman et al 2012 Fast Variational inference in the conjugate exponential Family)
+        """
+        foobar #TODO
+
+    def plot(self, *args, **kwargs):
+        """
+        add the distribution q(u) to the plot from sparse_GP_regression
+        """
+        sparse_GP_regression.plot(self,*args,**kwargs)
+        #TODO: plot the q(u) dist.