From 1c60e50feddb8656f4be779347dd6e5a808c2038 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Sun, 2 Dec 2012 12:32:20 +0000 Subject: [PATCH 01/12] I think the gradients bug in the sparse GP model is due to Kmm being unstable to invert. REducing M in some of the examples really helps --- GPy/examples/sparse_GPLVM_demo.py | 3 ++- GPy/examples/sparse_GP_regression_demo.py | 3 ++- GPy/models/sparse_GP_regression.py | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/GPy/examples/sparse_GPLVM_demo.py b/GPy/examples/sparse_GPLVM_demo.py index af125d6e..6ca6c941 100644 --- a/GPy/examples/sparse_GPLVM_demo.py +++ b/GPy/examples/sparse_GPLVM_demo.py @@ -9,6 +9,7 @@ np.random.seed(1) print "sparse GPLVM with RBF kernel" N = 100 +M = 4 Q = 1 D = 2 #generate GPLVM-like data @@ -17,7 +18,7 @@ k = GPy.kern.rbf(Q, 1.0, 2.0) + GPy.kern.white(Q, 0.00001) K = k.K(X) Y = np.random.multivariate_normal(np.zeros(N),K,D).T -m = GPy.models.sparse_GPLVM(Y, Q, M = 10) +m = GPy.models.sparse_GPLVM(Y, Q, M=M) m.constrain_positive('(rbf|bias|noise)') m.constrain_bounded('white', 1e-3, 0.1) # m.plot() diff --git a/GPy/examples/sparse_GP_regression_demo.py b/GPy/examples/sparse_GP_regression_demo.py index 13ff47e0..6424b8f2 100644 --- a/GPy/examples/sparse_GP_regression_demo.py +++ b/GPy/examples/sparse_GP_regression_demo.py @@ -12,6 +12,7 @@ import GPy np.random.seed(2) pb.ion() N = 500 +M = 5 ###################################### ## 1 dimensional example @@ -26,7 +27,7 @@ noise = GPy.kern.white(1) kernel = rbf + noise # create simple GP model -m1 = GPy.models.sparse_GP_regression(X,Y,kernel, M = 10) +m1 = GPy.models.sparse_GP_regression(X, Y, kernel, M=M) # contrain all parameters to be positive m1.constrain_positive('(variance|lengthscale|precision)') diff --git a/GPy/models/sparse_GP_regression.py b/GPy/models/sparse_GP_regression.py index a221ad31..4116cca2 100644 --- a/GPy/models/sparse_GP_regression.py +++ b/GPy/models/sparse_GP_regression.py @@ -90,9 +90,9 @@ 
class sparse_GP_regression(GP_regression): # Computes dL_dKmm TODO: nicer precomputations + self.dL_dKmm = -0.5 * self.beta * self.D * mdot(self.Lmi.T, self.A, self.Lmi) # dB tmp = self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi) - self.dL_dKmm = -self.beta * self.D * 0.5 * mdot(self.Lmi.T, self.A, self.Lmi) # dB - self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - tmp - tmp.T + self.Kmmi) # dC + self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - tmp - tmp.T + self.Kmmi) # dC #TODO: is tmp PD? save some computations here tmp = (mdot(self.LBL_inv, self.psi1YYpsi1, self.Kmmi) - self.beta*mdot(self.G, self.psi2, self.Kmmi)) self.dL_dKmm += -0.5*self.beta2*(tmp + tmp.T - self.G) # dE From 365d069ae87fdcc14fd5d39873ef2f8609e86359 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Sun, 2 Dec 2012 12:35:54 +0000 Subject: [PATCH 02/12] saved a little computation by exploiting the symmetry of a matrix --- GPy/models/sparse_GP_regression.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/GPy/models/sparse_GP_regression.py b/GPy/models/sparse_GP_regression.py index 4116cca2..3b1790a2 100644 --- a/GPy/models/sparse_GP_regression.py +++ b/GPy/models/sparse_GP_regression.py @@ -91,8 +91,7 @@ class sparse_GP_regression(GP_regression): # Computes dL_dKmm TODO: nicer precomputations self.dL_dKmm = -0.5 * self.beta * self.D * mdot(self.Lmi.T, self.A, self.Lmi) # dB - tmp = self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi) - self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - tmp - tmp.T + self.Kmmi) # dC #TODO: is tmp PD? 
save some computations here + self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi) + self.Kmmi) # dC tmp = (mdot(self.LBL_inv, self.psi1YYpsi1, self.Kmmi) - self.beta*mdot(self.G, self.psi2, self.Kmmi)) self.dL_dKmm += -0.5*self.beta2*(tmp + tmp.T - self.G) # dE From 1d8104bcc202101d2e24c20dbbf708eee721fd23 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Sun, 2 Dec 2012 13:40:19 +0000 Subject: [PATCH 03/12] general tidying in sparse_GP_regression Have also made a small ampount of headway in enabling heteroscedatic noise. --- GPy/models/sparse_GP_regression.py | 58 ++++++++++++++++-------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/GPy/models/sparse_GP_regression.py b/GPy/models/sparse_GP_regression.py index 3b1790a2..78a3f9a8 100644 --- a/GPy/models/sparse_GP_regression.py +++ b/GPy/models/sparse_GP_regression.py @@ -10,6 +10,10 @@ from .. import kern from ..inference.likelihoods import likelihood from GP_regression import GP_regression +#Still TODO: +# make use of slices properly (kernel can now do this) +# enable heteroscedatic noise (kernel will need to compute psi2 as a (NxMxM) array) + class sparse_GP_regression(GP_regression): """ Variational sparse GP model (Regression) @@ -41,7 +45,7 @@ class sparse_GP_regression(GP_regression): self.Z = Z self.M = Z.shape[1] - GP_regression.__init__(self, X, Y, kernel = kernel, normalize_X = normalize_X, normalize_Y = normalize_Y) + GP_regression.__init__(self, X, Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y) self.trYYT = np.sum(np.square(self.Y)) def set_param(self, p): @@ -64,8 +68,9 @@ class sparse_GP_regression(GP_regression): def _computations(self): # TODO find routine to multiply triangular matrices - self.psi1Y = np.dot(self.psi1, self.Y) - self.psi1YYpsi1 = np.dot(self.psi1Y, self.psi1Y.T) + self.V = self.beta*self.Y + self.psi1V = np.dot(self.psi1, self.V) + self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T) 
self.Lm = jitchol(self.Kmm) self.Lmi = chol_inv(self.Lm) self.Kmmi = np.dot(self.Lmi.T, self.Lmi) @@ -77,24 +82,19 @@ class sparse_GP_regression(GP_regression): self.LLambdai = np.dot(self.LBi, self.Lmi) self.trace_K = self.psi0 - np.trace(self.A) self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi) - self.C = mdot(self.LLambdai, self.psi1Y) - self.G = mdot(self.LBL_inv, self.psi1YYpsi1, self.LBL_inv.T) + self.C = mdot(self.LLambdai, self.psi1V) + self.G = mdot(self.LBL_inv, self.psi1VVpsi1, self.LBL_inv.T) - # Computes dL_dpsi + # Compute dL_dpsi self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N) - dC_dpsi1 = (self.LLambdai.T[:,:, None, None] * self.Y) # this is sane. - tmp = (dC_dpsi1*self.C[None,:,None,:]).sum(1).sum(-1) - self.dL_dpsi1 = self.beta2 * tmp - self.dL_dpsi2 = (- 0.5 * self.D * self.beta * (self.LBL_inv - self.Kmmi) - - self.beta**3 * 0.5 * self.G) - - # Computes dL_dKmm TODO: nicer precomputations + dC_dpsi1 = (self.LLambdai.T[:,:, None, None] * self.V) + self.dL_dpsi1 = (dC_dpsi1*self.C[None,:,None,:]).sum(1).sum(-1) + self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G) + # Compute dL_dKmm self.dL_dKmm = -0.5 * self.beta * self.D * mdot(self.Lmi.T, self.A, self.Lmi) # dB self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi) + self.Kmmi) # dC - tmp = (mdot(self.LBL_inv, self.psi1YYpsi1, self.Kmmi) - - self.beta*mdot(self.G, self.psi2, self.Kmmi)) - self.dL_dKmm += -0.5*self.beta2*(tmp + tmp.T - self.G) # dE + self.dL_dKmm += np.dot(np.dot(self.G,self.beta*self.psi2) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE def get_param(self): return np.hstack([self.Z.flatten(),self.beta,self.kern.extract_param()]) @@ -103,29 +103,35 @@ class sparse_GP_regression(GP_regression): return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern.extract_param_names() def 
log_likelihood(self): + """ + Compute the (lower bound on the) log marginal likelihood + """ A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta)) B = -0.5*self.beta*self.D*self.trace_K C = -self.D * np.sum(np.log(np.diag(self.LB))) D = -0.5*self.beta*self.trYYT - E = +0.5*self.beta2*np.sum(self.psi1YYpsi1 * self.LBL_inv) - + E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv) return A+B+C+D+E def dL_dbeta(self): - """ compute the gradient of the log likelihood wrt beta. - TODO: suport heteroscedatic noise""" + """ + Compute the gradient of the log likelihood wrt beta. + TODO: suport heteroscedatic noise + """ dA_dbeta = 0.5 * self.N*self.D/self.beta dB_dbeta = - 0.5 * self.D * self.trace_K dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A) dD_dbeta = - 0.5 * self.trYYT - tmp = mdot(self.LBi.T, self.LLambdai, self.psi1Y) - dE_dbeta = (self.beta * np.sum(np.square(self.C)) - 0.5 * self.beta2 - * np.sum(self.A * np.dot(tmp, tmp.T))) + tmp = mdot(self.LBi.T, self.LLambdai, self.psi1V) + dE_dbeta = np.sum(np.square(self.C))/self.beta - 0.5 * np.sum(self.A * np.dot(tmp, tmp.T)) return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta) def dL_dtheta(self): + """ + Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel + """ #re-cast computations in psi2 back to psi1: dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1) @@ -146,17 +152,17 @@ class sparse_GP_regression(GP_regression): def log_likelihood_gradients(self): return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()]) - def _raw_predict(self,Xnew,slices): + def _raw_predict(self, Xnew, slices): """Internal helper function for making predictions, does not account for normalisation""" Kx = self.kern.K(self.Z, Xnew) Kxx = self.kern.K(Xnew) - mu = self.beta * mdot(Kx.T, self.LBL_inv, self.psi1Y) + mu = mdot(Kx.T, self.LBL_inv, self.psi1V) var = Kxx - mdot(Kx.T, (self.Kmmi - self.LBL_inv), Kx) + np.eye(Xnew.shape[0])/self.beta # 
TODO: This beta doesn't belong here in the EP case. return mu,var - def plot(self,*args,**kwargs): + def plot(self, *args, **kwargs): """ Plot the fitted model: just call the GP_regression plot function and then add inducing inputs """ From c4761bedf5fa74bbae254ab831f71c0340846f8a Mon Sep 17 00:00:00 2001 From: James Hensman Date: Sun, 2 Dec 2012 15:12:14 +0000 Subject: [PATCH 04/12] minor commenting --- GPy/models/sparse_GP_regression.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/GPy/models/sparse_GP_regression.py b/GPy/models/sparse_GP_regression.py index 78a3f9a8..a9bb0206 100644 --- a/GPy/models/sparse_GP_regression.py +++ b/GPy/models/sparse_GP_regression.py @@ -142,6 +142,9 @@ class sparse_GP_regression(GP_regression): return dL_dtheta def dL_dZ(self): + """ + The derivative of the bound wrt the inducing inputs Z + """ #re-cast computations in psi2 back to psi1: dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1) From a5b90fa541d42ae89065d586f794212766aecb29 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Sun, 2 Dec 2012 15:50:16 +0000 Subject: [PATCH 05/12] Added a skeleton of the uncollapsed sparse GP --- GPy/models/uncollapsed_sparse_GP.py | 129 ++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 GPy/models/uncollapsed_sparse_GP.py diff --git a/GPy/models/uncollapsed_sparse_GP.py b/GPy/models/uncollapsed_sparse_GP.py new file mode 100644 index 00000000..e28e0340 --- /dev/null +++ b/GPy/models/uncollapsed_sparse_GP.py @@ -0,0 +1,129 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +import pylab as pb +from ..util.linalg import mdot, jitchol, chol_inv, pdinv +from ..util.plot import gpplot +from .. 
import kern +from ..inference.likelihoods import likelihood +from sparse_GP_regression import sparse_GP_regression + +class uncollapsed_sparse_GP(sparse_GP_regression): + """ + Variational sparse GP model (Regression), where the approximating distribution q(u) is represented explicitly + + :param X: inputs + :type X: np.ndarray (N x Q) + :param Y: observed data + :type Y: np.ndarray of observations (N x D) + :param kernel : the kernel/covariance function. See link kernels + :type kernel: a GPy kernel + :param Z: inducing inputs (optional, see note) + :type Z: np.ndarray (M x Q) | None + :param Zslices: slices for the inducing inputs (see slicing TODO: link) + :param M : Number of inducing points (optional, default 10. Ignored if Z is not None) + :type M: int + :param beta: noise precision. TODO> ignore beta if doing EP + :type beta: float + :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) + :type normalize_(X|Y): bool + """ + + def __init__(self, X, Y, q_u=None, kernel=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False): + D = Y.shape[1] + if q_u is None: + if Z is None: + self.q_u_mean = np.zeros((M,D)) + self.q_u_cov = np.eye(M) #note: each output dist will have the same covariance... 
+ self.q_u_prec = np.eye(M) + else: + self.q_u_mean = np.zeros((Z.shape[0],D)) + self.q_u_cov = np.eye(Z.shape[0]) + self.q_u_prec = np.eye(Z.shape[0]) + + + sparse_GP_regression.__init__(self, X, Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y) + + def _computations(self): + self.V = self.beta*self.Y + self.psi1V = np.dot(self.psi1, self.V) + self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T) + self.Lm = jitchol(self.Kmm) + self.Lmi = chol_inv(self.Lm) + self.Kmmi = np.dot(self.Lmi.T, self.Lmi) + self.A = mdot(self.Lmi, self.psi2, self.Lmi.T) + self.B = np.eye(self.M) + self.beta * self.A + self.Lambda = mdot(self.Lmi.T,self.B,sel.Lmi) + + # Compute dL_dpsi + self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N) + dC_dpsi1 = + self.dL_dpsi1 = + self.dL_dpsi2 = + + # Compute dL_dKmm + self.dL_dKmm = + self.dL_dKmm += + self.dL_dKmm += + + def log_likelihood(self): + """ + Compute the (lower bound on the) log marginal likelihood + """ + A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta)) + B = -0.5*self.beta*self.D*self.trace_K + C = -self.D *(self.Kmm_hld +0.5*np.sum(self.Lambda * self.mmT_S) + self.M/2.) + E = -0.5*self.beta*self.trYYT + F = np.sum(np.dot(self.V.T,self.projected_mean)) + return A+B+C+D+E+F + + def dL_dbeta(self): + """ + Compute the gradient of the log likelihood wrt beta. 
+ TODO: suport heteroscedatic noise + """ + dA_dbeta = 0.5 * self.N*self.D/self.beta + dB_dbeta = - 0.5 * self.D * self.trace_K + dC_dbeta = - 0.5 * self.D * #TODO + dD_dbeta = - 0.5 * self.trYYT + + return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta) + + def _raw_predict(self, Xnew, slices): + """Internal helper function for making predictions, does not account for normalisation""" + + #TODO + return mu,var + + def set_vb_param(self,vb_param): + """set the distribution q(u) from the canonical parameters""" + self.q_u_prec = -2.*vb_param[self.M*self.D:].reshape(self.M,self.M) + self.q_u_prec_L = jitchol(self.q_u_prec) + self.q_u_cov_L = chol_inv(self.q_u_prec_L) + self.q_u_cov = np.dot(self.q_u_cov_L,self.q_u_cov_L.T) + self.q_u_mean = -2.*np.dot(self.q_u_cov,vb_param[:self.M*self.D].reshape(self.M,self.D)) + + #TODO: computations now? + + def get_vb_param(self): + """ + Return the canonical paramters of the distribution q(u) + """ + return np.hstack((-0.5*np.dot(self.q_u_prec, self.q_u_mean).flatten()-0.5*self.q_u_prec.flatten())) + + def vb_grad_natgrad(self): + """ + Compute the gradients of the lower bound wrt the canonical and + Expectation parameters of u. + + Note that the natural gradient in either is given by the gradient in the other (See Hensman et al 2012 Fast Variational inference in the conjugate exponential Family) + """ + foobar #TODO + + def plot(self, *args, **kwargs): + """ + add the distribution q(u) to the plot from sparse_GP_regression + """ + sparse_GP_regression.plot(self,*args,**kwargs) + #TODO: plot the q(u) dist. From 3edd867ece4fa503d537e5488581718538429b8f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 3 Dec 2012 10:26:28 +0000 Subject: [PATCH 06/12] GPy: Some rewriting for the exponential and Matern kernels. They now pass the unit test. 
--- GPy/kern/Matern32.py | 51 +++++++++++++++++++++----------------- GPy/kern/Matern52.py | 55 +++++++++++++++++++++++------------------ GPy/kern/exponential.py | 6 ++--- 3 files changed, 63 insertions(+), 49 deletions(-) diff --git a/GPy/kern/Matern32.py b/GPy/kern/Matern32.py index c37bd5c0..8223b37a 100644 --- a/GPy/kern/Matern32.py +++ b/GPy/kern/Matern32.py @@ -39,11 +39,13 @@ class Matern32(kernpart): def get_param(self): """return the value of the parameters.""" return np.hstack((self.variance,self.lengthscales)) + def set_param(self,x): """set the value of the parameters.""" assert x.size==(self.D+1) self.variance = x[0] self.lengthscales = x[1:] + def get_param_names(self): """return parameter names.""" if self.D==1: @@ -56,10 +58,37 @@ class Matern32(kernpart): if X2 is None: X2 = X dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1)) np.add(self.variance*(1+np.sqrt(3.)*dist)*np.exp(-np.sqrt(3.)*dist), target,target) + def Kdiag(self,X,target): """Compute the diagonal of the covariance matrix associated to X.""" np.add(target,self.variance,target) + def dK_dtheta(self,partial,X,X2,target): + """derivative of the covariance matrix with respect to the parameters.""" + if X2 is None: X2 = X + dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1)) + dvar = (1+np.sqrt(3.)*dist)*np.exp(-np.sqrt(3.)*dist) + invdist = 1./np.where(dist!=0.,dist,np.inf) + dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3 + dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis] + target[0] += np.sum(dvar*partial) + target[1:] += (dl*partial[:,:,None]).sum(0).sum(0) + + def dKdiag_dtheta(self,partial,X,target): + """derivative of the diagonal of the covariance matrix with respect to the parameters.""" + target[0] += np.sum(partial) + + def dK_dX(self,X,X2,target): + """derivative of the covariance matrix with respect to X.""" + if X2 is None: X2 = X + dist = 
np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None] + ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf) + dK_dX += - np.transpose(3*self.variance*dist*np.exp(-np.sqrt(3)*dist)*ddist_dX,(1,0,2)) + target += np.sum(dK_dX*partial.T[:,:,None],0) + + def dKdiag_dX(self,X,target): + pass + def Gram_matrix(self,F,F1,F2,lower,upper): """ Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to D=1. @@ -87,25 +116,3 @@ class Matern32(kernpart): #return(G) return(self.lengthscales**3/(12.*np.sqrt(3)*self.variance) * G + 1./self.variance*np.dot(Flower,Flower.T) + self.lengthscales**2/(3.*self.variance)*np.dot(F1lower,F1lower.T)) - def dK_dtheta(self,X,X2,target): - """derivative of the cross-covariance matrix with respect to the parameters (shape is NxMxNparam)""" - if X2 is None: X2 = X - dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1)) - dvar = (1+np.sqrt(3.)*dist)*np.exp(-np.sqrt(3.)*dist) - invdist = 1./np.where(dist!=0.,dist,np.inf) - dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3 - dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis] - np.add(target[:,:,0],dvar, target[:,:,0]) - np.add(target[:,:,1:],dl, target[:,:,1:]) - def dKdiag_dtheta(self,X,target): - """derivative of the diagonal of the covariance matrix with respect to the parameters (shape is NxNparam)""" - np.add(target[:,0],1.,target[:,0]) - def dK_dX(self,X,X2,target): - """derivative of the covariance matrix with respect to X (*! 
shape is NxMxD !*).""" - if X2 is None: X2 = X - dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None] - ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf) - target += - np.transpose(3*self.variance*dist*np.exp(-np.sqrt(3)*dist)*ddist_dX,(1,0,2)) - def dKdiag_dX(self,X,target): - pass - diff --git a/GPy/kern/Matern52.py b/GPy/kern/Matern52.py index 4af65a89..65059a5b 100644 --- a/GPy/kern/Matern52.py +++ b/GPy/kern/Matern52.py @@ -33,33 +33,61 @@ class Matern52(kernpart): self.Nparam = self.D + 1 self.name = 'Mat52' self.set_param(np.hstack((variance,lengthscales))) - self._Z, self._mu, self._S = np.empty(shape=(3,1)) # cached versions of Z,mu,S + def get_param(self): """return the value of the parameters.""" return np.hstack((self.variance,self.lengthscales)) + def set_param(self,x): """set the value of the parameters.""" assert x.size==(self.D+1) self.variance = x[0] self.lengthscales = x[1:] - self.lengthscales2 = np.square(self.lengthscales) - self._Z, self._mu, self._S = np.empty(shape=(3,1)) # cached versions of Z,mu,S + def get_param_names(self): """return parameter names.""" if self.D==1: return ['variance','lengthscale'] else: return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscales.size)] + def K(self,X,X2,target): """Compute the covariance matrix between X and X2.""" if X2 is None: X2 = X dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1)) np.add(self.variance*(1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist), target,target) + def Kdiag(self,X,target): """Compute the diagonal of the covariance matrix associated to X.""" np.add(target,self.variance,target) + def dK_dtheta(self,partial,X,X2,target): + """derivative of the covariance matrix with respect to the parameters.""" + if X2 is None: X2 = X + dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1)) + invdist = 
1./np.where(dist!=0.,dist,np.inf) + dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3 + dvar = (1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist) + dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis] + target[0] += np.sum(dvar*partial) + target[1:] += (dl*partial[:,:,None]).sum(0).sum(0) + + def dKdiag_dtheta(self,X,target): + """derivative of the diagonal of the covariance matrix with respect to the parameters.""" + target[0] += np.sum(partial) + + def dK_dX(self,X,X2,target): + """derivative of the covariance matrix with respect to X.""" + if X2 is None: X2 = X + dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None] + ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf) + dK_dX += - np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2)) + target += np.sum(dK_dX*partial.T[:,:,None],0) + + def dKdiag_dX(self,X,target): + pass + def Gram_matrix(self,F,F1,F2,F3,lower,upper): """ Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to D=1. 
@@ -91,26 +119,5 @@ class Matern52(kernpart): orig2 = 3./5*self.lengthscales**2 * ( np.dot(F1lower,F1lower.T) + 1./8*np.dot(Flower,F2lower.T) + 1./8*np.dot(F2lower,Flower.T)) return(1./self.variance* (G_coef*G + orig + orig2)) - def dK_dtheta(self,X,X2,target): - """derivative of the cross-covariance matrix with respect to the parameters (shape is NxMxNparam)""" - if X2 is None: X2 = X - dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1)) - invdist = 1./np.where(dist!=0.,dist,np.inf) - dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3 - dvar = (1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist) - dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis] - np.add(target[:,:,0],dvar, target[:,:,0]) - np.add(target[:,:,1:],dl, target[:,:,1:]) - def dKdiag_dtheta(self,X,target): - """derivative of the diagonal of the covariance matrix with respect to the parameters (shape is NxNparam)""" - np.add(target[:,0],1.,target[:,0]) - def dK_dX(self,X,X2,target): - """derivative of the covariance matrix with respect to X (*! 
shape is NxMxD !*).""" - if X2 is None: X2 = X - dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None] - ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf) - target += - np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2)) - def dKdiag_dX(self,X,target): - pass diff --git a/GPy/kern/exponential.py b/GPy/kern/exponential.py index 402ebd82..ba97881e 100644 --- a/GPy/kern/exponential.py +++ b/GPy/kern/exponential.py @@ -62,7 +62,7 @@ class exponential(kernpart): np.add(target,self.variance,target) def dK_dtheta(self,partial,X,X2,target): - """derivative of the cross-covariance matrix with respect to the parameters (shape is NxMxNparam)""" + """derivative of the covariance matrix with respect to the parameters.""" if X2 is None: X2 = X dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1)) invdist = 1./np.where(dist!=0.,dist,np.inf) @@ -73,12 +73,12 @@ class exponential(kernpart): target[1:] += (dl*partial[:,:,None]).sum(0).sum(0) def dKdiag_dtheta(self,partial,X,target): - """derivative of the diagonal of the covariance matrix with respect to the parameters (shape is NxNparam)""" + """derivative of the diagonal of the covariance matrix with respect to the parameters.""" #NB: derivative of diagonal elements wrt lengthscale is 0 target[0] += np.sum(partial) def dK_dX(self,X,X2,target): - """derivative of the covariance matrix with respect to X (*! 
shape is NxMxD !*).""" + """derivative of the covariance matrix with respect to X.""" if X2 is None: X2 = X dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None] ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf) From 049de98d3b510357f6f391c23f8be34eea159c5c Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 3 Dec 2012 11:26:10 +0000 Subject: [PATCH 07/12] rbf_ARD now in the updated format for the computation of the derivatives (included for the psi-statistics, but not tested) --- GPy/kern/rbf_ARD.py | 63 +++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/GPy/kern/rbf_ARD.py b/GPy/kern/rbf_ARD.py index f1e5f36a..1f90bb0a 100644 --- a/GPy/kern/rbf_ARD.py +++ b/GPy/kern/rbf_ARD.py @@ -30,6 +30,7 @@ class rbf_ARD(kernpart): def get_param(self): return np.hstack((self.variance,self.lengthscales)) + def set_param(self,x): assert x.size==(self.D+1) self.variance = x[0] @@ -37,61 +38,73 @@ class rbf_ARD(kernpart): self.lengthscales2 = np.square(self.lengthscales) #reset cached results self._Z, self._mu, self._S = np.empty(shape=(3,1)) # cached versions of Z,mu,S + def get_param_names(self): if self.D==1: return ['variance','lengthscale'] else: return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscales.size)] + def K(self,X,X2,target): self._K_computations(X,X2) np.add(self.variance*self._K_dvar, target,target) + def Kdiag(self,X,target): np.add(target,self.variance,target) - def dK_dtheta(self,X,X2,target): - """Return shape is NxMx(Ntheta)""" + + def dK_dtheta(self,partial,X,X2,target): self._K_computations(X,X2) dl = self._K_dvar[:,:,None]*self.variance*self._K_dist2/self.lengthscales - np.add(target[:,:,0],self._K_dvar, target[:,:,0]) - np.add(target[:,:,1:],dl, target[:,:,1:]) + target[0] += np.sum(self._K_dvar*partial) + target[1:] += (dl*partial[:,:,None]).sum(0).sum(0) + def dKdiag_dtheta(self,X,target): - 
np.add(target[:,0],1.,target[:,0]) + target[0] += np.sum(partial) + def dK_dX(self,X,X2,target): self._K_computations(X,X2) dZ = self.variance*self._K_dvar[:,:,None]*self._K_dist/self.lengthscales2 - np.add(target,-dZ.transpose(1,0,2),target) + dK_dX = -dZ.transpose(1,0,2) + target += np.sum(dK_dX*partial.T[:,:,None],0) + + def dKdiag_dX(self,partial,X,target): + pass + def psi0(self,Z,mu,S,target): - np.add(target, self.variance, target) - def dpsi0_dtheta(self,Z,mu,S,target): - target[:,0] += 1. + target += self.variance + + def dpsi0_dtheta(self,partial,Z,mu,S,target): + target[0] += 1. + def dpsi0_dmuS(self,Z,mu,S,target_mu,target_S): pass + def psi1(self,Z,mu,S,target): - """Think N,M,Q """ self._psi_computations(Z,mu,S) np.add(target, self._psi1,target) - def dpsi1_dtheta(self,Z,mu,S,target): - """N,Q,(Ntheta)""" + def dpsi1_dtheta(self,partial,Z,mu,S,target): self._psi_computations(Z,mu,S) denom_deriv = S[:,None,:]/(self.lengthscales**3+self.lengthscales*S[:,None,:]) d_length = self._psi1[:,:,None]*(self.lengthscales*np.square(self._psi1_dist/(self.lengthscales2+S[:,None,:])) + denom_deriv) - target[:,:,0] += self._psi1/self.variance - target[:,:,1:] += d_length - def dpsi1_dZ(self,Z,mu,S,target): + target[0] += np.sum(partial*self._psi1/self.variance) + target[1:] += (d_length*partial[:,:,None]).sum(0).sum(0) + + def dpsi1_dZ(self,partial,Z,mu,S,target): self._psi_computations(Z,mu,S) np.add(target,-self._psi1[:,:,None]*self._psi1_dist/self.lengthscales2/self._psi1_denom,target) + target += np.sum(partial[:,:,None]*-self._psi1[:,:,None]*self._psi1_dist/self.lengthscales2/self._psi1_denom,0) - def dpsi1_dmuS(self,Z,mu,S,target_mu,target_S): + def dpsi1_dmuS(self,partial,Z,mu,S,target_mu,target_S): """return shapes are N,M,Q""" self._psi_computations(Z,mu,S) tmp = self._psi1[:,:,None]/self.lengthscales2/self._psi1_denom - np.add(target_mu,tmp*self._psi1_dist,target_mu) - np.add(target_S, 0.5*tmp*(self._psi1_dist_sq-1), target_S) + target_mu += 
np.sum(partial*tmp*self._psi1_dist,1) + target_S += np.sum(partial*0.5*tmp*(self._psi1_dist_sq-1),1) def psi2(self,Z,mu,S,target): - """shape N,M,M""" self._psi_computations(Z,mu,S) - np.add(target, self._psi2,target) + target += self._psi2.sum(0) #TODO: psi2 should be NxMxM (for het. noise) def dpsi2_dtheta(self,Z,mu,S,target): """Shape N,M,M,Ntheta""" @@ -99,21 +112,21 @@ class rbf_ARD(kernpart): d_var = np.sum(2.*self._psi2/self.variance,0) d_length = self._psi2[:,:,:,None]*(0.5*self._psi2_Zdist_sq*self._psi2_denom + 2.*self._psi2_mudist_sq + 2.*S[:,None,None,:]/self.lengthscales2)/(self.lengthscales*self._psi2_denom) d_length = d_length.sum(0) - target[:,:,0] += d_var - target[:,:,1:] += d_length + target[0] += np.sum(partial*d_var) + target[1:] += (d_length*partial[:,:,None]).sum(0).sum(0) def dpsi2_dZ(self,Z,mu,S,target): """Returns shape N,M,M,Q""" self._psi_computations(Z,mu,S) dZ = self._psi2[:,:,:,None]/self.lengthscales2*(-0.5*self._psi2_Zdist + self._psi2_mudist/self._psi2_denom) - target += dZ + target += np.sum(partial[None,:,:,None]*dZ,0).sum(1) def dpsi2_dmuS(self,Z,mu,S,target_mu,target_S): """Think N,M,M,Q """ self._psi_computations(Z,mu,S) tmp = self._psi2[:,:,:,None]/self.lengthscales2/self._psi2_denom - np.add(target_mu, -tmp*(2.*self._psi2_mudist),target_mu) #N,M,M,Q - np.add(target_S, tmp*(2.*self._psi2_mudist_sq-1), target_S) #N,M,M,Q + target_mu += (partial*-tmp*2.*self._psi2_mudist).sum(1).sum(1) + target_S += (partial*tmp*(2.*self._psi2_mudist_sq-1)).sum(1).sum(1) def _K_computations(self,X,X2): if not (np.all(X==self._X) and np.all(X2==self._X2)): From 5f22bd00e65dbf86df6751b82f1a7d325f29ef01 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Mon, 3 Dec 2012 18:35:05 +0000 Subject: [PATCH 08/12] made uncertain inputs a simple swith in the sparse GP class. 
This simplifies the inherritance structure --- GPy/models/sparse_GP_regression.py | 47 ++++++++++---- GPy/models/uncertain_input_GP_regression.py | 70 --------------------- 2 files changed, 34 insertions(+), 83 deletions(-) delete mode 100644 GPy/models/uncertain_input_GP_regression.py diff --git a/GPy/models/sparse_GP_regression.py b/GPy/models/sparse_GP_regression.py index a9bb0206..92280bc8 100644 --- a/GPy/models/sparse_GP_regression.py +++ b/GPy/models/sparse_GP_regression.py @@ -26,6 +26,8 @@ class sparse_GP_regression(GP_regression): :type kernel: a GPy kernel :param Z: inducing inputs (optional, see note) :type Z: np.ndarray (M x Q) | None + :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance) + :type X_uncertainty: np.ndarray (N x Q) | None :param Zslices: slices for the inducing inputs (see slicing TODO: link) :param M : Number of inducing points (optional, default 10. Ignored if Z is not None) :type M: int @@ -44,6 +46,12 @@ class sparse_GP_regression(GP_regression): assert Z.shape[1]==X.shape[1] self.Z = Z self.M = Z.shape[1] + if X_uncertainty is None: + self.has_uncertain_inputs=False + else: + assert X_uncertainty.shape==X.shape + self.has_uncertain_inputs=False + self.X_uncertainty = X_uncertainty GP_regression.__init__(self, X, Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y) self.trYYT = np.sum(np.square(self.Y)) @@ -58,13 +66,17 @@ class sparse_GP_regression(GP_regression): def _compute_kernel_matrices(self): # kernel computations, using BGPLVM notation - #TODO: the following can be switched out in the case of uncertain inputs (or the BGPLVM!) 
#TODO: slices for psi statistics (easy enough) self.Kmm = self.kern.K(self.Z) - self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum() - self.psi1 = self.kern.K(self.Z,self.X) - self.psi2 = np.dot(self.psi1,self.psi1.T) + if self.has_uncertain_inputs: + self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum() + self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T + self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty) + else: + self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum() + self.psi1 = self.kern.K(self.Z,self.X) + self.psi2 = np.dot(self.psi1,self.psi1.T) def _computations(self): # TODO find routine to multiply triangular matrices @@ -132,12 +144,16 @@ class sparse_GP_regression(GP_regression): """ Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel """ - #re-cast computations in psi2 back to psi1: - dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1) - dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z) - dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X) - dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X) + if self.has_uncertain_inputs: + dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty) + dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty) + dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape + else: + #re-cast computations in psi2 back to psi1: + dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1) + dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X) + dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X) return dL_dtheta @@ -145,11 +161,14 @@ class sparse_GP_regression(GP_regression): """ The derivative of the bound wrt the inducing inputs Z """ - #re-cast computations in psi2 back to psi1: - dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1) - 
dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z,)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ - dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X) + if self.has_uncertain_inputs: + dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty) + dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) + else: + #re-cast computations in psi2 back to psi1: + dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1) + dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X) return dL_dZ def log_likelihood_gradients(self): @@ -172,5 +191,7 @@ class sparse_GP_regression(GP_regression): GP_regression.plot(self,*args,**kwargs) if self.Q==1: pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12) + if self.has_uncertain_inputs: + pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten())) if self.Q==2: pb.plot(self.Z[:,0],self.Z[:,1],'wo') diff --git a/GPy/models/uncertain_input_GP_regression.py b/GPy/models/uncertain_input_GP_regression.py deleted file mode 100644 index 66724b07..00000000 --- a/GPy/models/uncertain_input_GP_regression.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). -# Licensed under the BSD 3-clause license (see LICENSE.txt) - -import numpy as np -import pylab as pb -from ..util.linalg import mdot, jitchol, chol_inv, pdinv -from ..util.plot import gpplot -from .. import kern -from ..inference.likelihoods import likelihood -from sparse_GP_regression import sparse_GP_regression - -class uncertain_input_GP_regression(sparse_GP_regression): - """ - Variational sparse GP model (Regression) with uncertainty on the inputs - - :param X: inputs - :type X: np.ndarray (N x Q) - :param X_uncertainty: uncertainty on X (Gaussian variances, assumed isotrpic) - :type X_uncertainty: np.ndarray (N x Q) - :param Y: observed data - :type Y: np.ndarray of observations (N x D) - :param kernel : the kernel/covariance function. 
See link kernels - :type kernel: a GPy kernel - :param Z: inducing inputs (optional, see note) - :type Z: np.ndarray (M x Q) | None - :param Zslices: slices for the inducing inputs (see slicing TODO: link) - :param M : Number of inducing points (optional, default 10. Ignored if Z is not None) - :type M: int - :param beta: noise precision. TODO> ignore beta if doing EP - :type beta: float - :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) - :type normalize_(X|Y): bool - """ - - def __init__(self,X,Y,X_uncertainty,kernel=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False): - self.X_uncertainty = X_uncertainty - sparse_GP_regression.__init__(self, X, Y, kernel = kernel, beta = beta, normalize_X = normalize_X, normalize_Y = normalize_Y) - self.trYYT = np.sum(np.square(self.Y)) - - def _compute_kernel_matrices(self): - # kernel computations, using BGPLVM notation - #TODO: slices for psi statistics (easy enough) - self.Kmm = self.kern.K(self.Z) - self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum() - self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T - self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty) - - def dL_dtheta(self): - #re-cast computations in psi2 back to psi1: - dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z) - dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty) - dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty) - dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape - return dL_dtheta - - def dL_dZ(self): - dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z,)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ - dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty) - dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, 
self.X_uncertainty) - return dL_dZ - - def plot(self,*args,**kwargs): - """ - Plot the fitted model: just call the sparse GP_regression plot function and then add - markers to represent uncertainty on the inputs - """ - sparse_GP_regression.plot(self,*args,**kwargs) - if self.Q==1: - pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten())) - From 1a9872c18797e48a428861476e7bebbac6bffa6e Mon Sep 17 00:00:00 2001 From: James Hensman Date: Mon, 3 Dec 2012 10:53:37 -0800 Subject: [PATCH 09/12] Some tidying in the uncollapsed GP --- GPy/models/uncollapsed_sparse_GP.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/GPy/models/uncollapsed_sparse_GP.py b/GPy/models/uncollapsed_sparse_GP.py index e28e0340..b07b1134 100644 --- a/GPy/models/uncollapsed_sparse_GP.py +++ b/GPy/models/uncollapsed_sparse_GP.py @@ -17,6 +17,8 @@ class uncollapsed_sparse_GP(sparse_GP_regression): :type X: np.ndarray (N x Q) :param Y: observed data :type Y: np.ndarray of observations (N x D) + :param q_u: canonical parameters of the distribution squashed into a 1D array + :type q_u: np.ndarray :param kernel : the kernel/covariance function. See link kernels :type kernel: a GPy kernel :param Z: inducing inputs (optional, see note) :type Z: np.ndarray (M x Q) | None :param Zslices: slices for the inducing inputs (see slicing TODO: link) :param M : Number of inducing points (optional, default 10. Ignored if Z is not None) :type M: int :param beta: noise precision. TODO> ignore beta if doing EP :type beta: float :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) :type normalize_(X|Y): bool """ - def __init__(self, X, Y, q_u=None, kernel=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False): + def __init__(self, X, Y, q_u=None, *args, **kwargs): D = Y.shape[1] if q_u is None: if Z is None: - self.q_u_mean = np.zeros((M,D)) - self.q_u_cov = np.eye(M) #note: each output dist will have the same covariance...
- self.q_u_prec = np.eye(M) + M=M else: - self.q_u_mean = np.zeros((Z.shape[0],D)) - self.q_u_cov = np.eye(Z.shape[0]) - self.q_u_prec = np.eye(Z.shape[0]) - - - sparse_GP_regression.__init__(self, X, Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y) + M = Z.shape[0] + self.set_vb_param(np.hstack((np.ones(M*D),np.eye(M).flatten()))) + sparse_GP_regression.__init__(self, X, Y, *args, **kwargs) def _computations(self): self.V = self.beta*self.Y @@ -58,7 +55,6 @@ class uncollapsed_sparse_GP(sparse_GP_regression): # Compute dL_dpsi self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N) - dC_dpsi1 = self.dL_dpsi1 = self.dL_dpsi2 = @@ -104,13 +100,16 @@ class uncollapsed_sparse_GP(sparse_GP_regression): self.q_u_cov = np.dot(self.q_u_cov_L,self.q_u_cov_L.T) self.q_u_mean = -2.*np.dot(self.q_u_cov,vb_param[:self.M*self.D].reshape(self.M,self.D)) + self.q_u_expectation = (self.q_u_mean, np.dot(self.q_u_mean,self.q_u_mean.T)+self.q_u_cov) + + self.q_u_canonical = (np.dot(self.q_u_prec, self.q_u_mean),-0.5*self.q_u_prec) #TODO: computations now?
def get_vb_param(self): """ - Return the canonical paramters of the distribution q(u) + Return the canonical parameters of the distribution q(u) """ - return np.hstack((-0.5*np.dot(self.q_u_prec, self.q_u_mean).flatten()-0.5*self.q_u_prec.flatten())) + return np.hstack([e.flatten() for e in self.q_u_canonical]) def vb_grad_natgrad(self): """ From 4dcf2b85ced1df4fe7c9536a2c903790c619361e Mon Sep 17 00:00:00 2001 From: James Hensman Date: Tue, 4 Dec 2012 13:10:17 -0800 Subject: [PATCH 10/12] models can now specify a preferred optimiser (defaults to tnc) --- GPy/core/model.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/GPy/core/model.py b/GPy/core/model.py index ab4f8246..a9b1f0d3 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -18,6 +18,7 @@ class model(parameterised): self.optimization_runs = [] self.sampling_runs = [] self.set_param(self.get_param()) + self.preferred_optimizer = 'tnc' def get_param(self): raise NotImplementedError, "this needs to be implemented to utilise the model class" def set_param(self,x): @@ -161,15 +162,19 @@ class model(parameterised): else: self.expand_param(initial_parameters) - def optimize(self, optimizer = 'tnc', start = None, **kwargs): + def optimize(self, optimizer=None, start=None, **kwargs): """ Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors. kwargs are passed to the optimizer. They can be: :max_f_eval: maximum number of function evaluations - :messages: whether to display during optimisatio + :messages: whether to display during optimisation + :param optimizer: which optimizer to use (defaults to self.preferred_optimizer) + :type optimizer: string TODO: valid strings?
""" + if optimizer is None: + optimizer = self.preferred_optimizer def f(x): self.expand_param(x) From c003b2f34d90e1e2304b2721d1287318a88fab63 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Wed, 5 Dec 2012 09:10:27 -0800 Subject: [PATCH 11/12] Minor edits --- GPy/inference/optimization.py | 2 +- GPy/models/sparse_GP_regression.py | 6 +++++- GPy/models/uncollapsed_sparse_GP.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/GPy/inference/optimization.py b/GPy/inference/optimization.py index 5f9a7a73..bad1e041 100644 --- a/GPy/inference/optimization.py +++ b/GPy/inference/optimization.py @@ -9,7 +9,7 @@ import pylab as pb import datetime as dt class Optimizer(): - def __init__(self, x_init, f_fp, f, fp , messages = False, max_f_eval = 1e4, ftol = None, gtol = None, xtol = None): + def __init__(self, x_init, f_fp, f, fp , messages=False, max_f_eval=1e4, ftol=None, gtol=None, xtol=None): """ Superclass for all the optimizers. diff --git a/GPy/models/sparse_GP_regression.py b/GPy/models/sparse_GP_regression.py index 92280bc8..38aeef08 100644 --- a/GPy/models/sparse_GP_regression.py +++ b/GPy/models/sparse_GP_regression.py @@ -37,7 +37,7 @@ class sparse_GP_regression(GP_regression): :type normalize_(X|Y): bool """ - def __init__(self,X,Y,kernel=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False): + def __init__(self,X,Y,kernel=None, X_uncertainty=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False): self.beta = beta if Z is None: self.Z = np.random.permutation(X.copy())[:M] @@ -56,6 +56,10 @@ class sparse_GP_regression(GP_regression): GP_regression.__init__(self, X, Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y) self.trYYT = np.sum(np.square(self.Y)) + #normalise X uncertainty also + if self.has_uncertain_inputs: + self.X_uncertainty /= np.square(self._Xstd) + def set_param(self, p): self.Z = p[:self.M*self.Q].reshape(self.M, self.Q) self.beta = p[self.M*self.Q] diff --git 
a/GPy/models/uncollapsed_sparse_GP.py b/GPy/models/uncollapsed_sparse_GP.py index b07b1134..b5d4b054 100644 --- a/GPy/models/uncollapsed_sparse_GP.py +++ b/GPy/models/uncollapsed_sparse_GP.py @@ -1,4 +1,4 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Copyright (c) 2012 James Hensman # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np From 0ee11a2077820515dc1bc024b44547dc0f5e6b23 Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Wed, 5 Dec 2012 19:19:15 -0800 Subject: [PATCH 12/12] Added datasets.py back in and minor changes. --- GPy/__init__.py | 2 +- GPy/core/model.py | 2 +- GPy/inference/optimization.py | 35 +++--- GPy/kern/rbf.py | 14 ++- GPy/kern/spline.py | 2 +- GPy/models/GP_EP.py | 3 + GPy/models/GP_regression.py | 2 +- GPy/util/__init__.py | 1 + GPy/util/datasets.py | 206 ++++++++++++++++++++++++++++++++++ 9 files changed, 244 insertions(+), 23 deletions(-) create mode 100644 GPy/util/datasets.py diff --git a/GPy/__init__.py b/GPy/__init__.py index 9e370489..876e2ca6 100644 --- a/GPy/__init__.py +++ b/GPy/__init__.py @@ -6,5 +6,5 @@ import kern import models import inference import util -#import examples +import examples from core import priors diff --git a/GPy/core/model.py b/GPy/core/model.py index ab4f8246..3937a9cc 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -167,7 +167,7 @@ class model(parameterised): kwargs are passed to the optimizer. They can be: :max_f_eval: maximum number of function evaluations - :messages: whether to display during optimisatio + :messages: whether to display during optimisation """ diff --git a/GPy/inference/optimization.py b/GPy/inference/optimization.py index 5f9a7a73..7c575a94 100644 --- a/GPy/inference/optimization.py +++ b/GPy/inference/optimization.py @@ -9,20 +9,22 @@ import pylab as pb import datetime as dt class Optimizer(): + """ + Superclass for all the optimizers. 
+ + :param x_init: initial set of parameters + :param f_fp: function that returns the function AND the gradients at the same time + :param f: function to optimize + :param fp: gradients + :param messages: print messages from the optimizer? + :type messages: (True | False) + :param max_f_eval: maximum number of function evaluations + + :rtype: optimizer object. + + """ def __init__(self, x_init, f_fp, f, fp , messages = False, max_f_eval = 1e4, ftol = None, gtol = None, xtol = None): - """ - Superclass for all the optimizers. - Arguments: - - x_init: initial set of parameters - f_fp: function that returns the function AND the gradients at the same time - f: function to optimize - fp: gradients - messages: print messages from the optimizer? (True | False) - max_f_eval: maximum number of function evaluations - - """ self.opt_name = None self.f_fp = f_fp self.f = f @@ -47,7 +49,7 @@ class Optimizer(): self.time = str(end-start) def opt(self): - raise NotImplementedError, "this needs to be implemented to utilise the optimizer class" + raise NotImplementedError, "this needs to be implemented to use the optimizer class" def plot(self): if self.trace == None: @@ -136,8 +138,7 @@ class opt_simplex(Optimizer): def opt(self): """ - The simplex optimizer does not require gradients, which - is great during development. Otherwise it's a bit slow. + The simplex optimizer does not require gradients. 
""" statuses = ['Converged', 'Maximum number of function evaluations made','Maximum number of iterations reached'] @@ -164,11 +165,11 @@ class opt_simplex(Optimizer): # class opt_rasm(Optimizer): # def __init__(self, *args, **kwargs): # Optimizer.__init__(self, *args, **kwargs) -# self.opt_name = "Rasmussen's SCG" +# self.opt_name = "Rasmussen's Conjugate Gradient" # def opt(self): # """ -# Run Rasmussen's SCG optimizer +# Run Rasmussen's Conjugate Gradient optimizer # """ # assert self.f_fp != None, "Rasmussen's minimizer requires f_fp" diff --git a/GPy/kern/rbf.py b/GPy/kern/rbf.py index c300ae7c..9e2bb509 100644 --- a/GPy/kern/rbf.py +++ b/GPy/kern/rbf.py @@ -8,7 +8,13 @@ import hashlib class rbf(kernpart): """ - Radial Basis Function kernel, aka squared-exponential or Gaussian kernel. + Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel. + + .. math:: + + k(r) = \sigma^2 \exp(- \frac{r^2}{2\ell}) \qquad \qquad \\text{ where } r = \sqrt{\frac{\sum_{i=1}^d (x_i-x^\prime_i)^2}{\ell^2}} + + where \ell is the lengthscale, \alpha the smoothness, \sigma^2 the variance and d the dimensionality of the input. :param D: the number of input dimensions :type D: int @@ -44,6 +50,8 @@ class rbf(kernpart): return ['variance','lengthscale'] def K(self,X,X2,target): + if X2 is None: + X2 = X self._K_computations(X,X2) np.add(self.variance*self._K_dvar, target,target) @@ -78,7 +86,9 @@ class rbf(kernpart): self._K_dist2 = (-2.*XXT + np.diag(XXT)[:,np.newaxis] + np.diag(XXT)[np.newaxis,:])/self.lengthscale2 else: self._K_dist2 = (-2.*XXT + np.sum(np.square(X),1)[:,None] + np.sum(np.square(X2),1)[None,:])/self.lengthscale2 - self._K_exponent = -0.5*self._K_dist2 + # TODO Remove comments if this is fine. + # Commented out by Neil as doesn't seem to be used elsewhere. 
+ #self._K_exponent = -0.5*self._K_dist2 self._K_dvar = np.exp(-0.5*self._K_dist2) def psi0(self,Z,mu,S,target): diff --git a/GPy/kern/spline.py b/GPy/kern/spline.py index 9b10bb6c..44033eea 100644 --- a/GPy/kern/spline.py +++ b/GPy/kern/spline.py @@ -6,7 +6,7 @@ from kernpart import kernpart import numpy as np import hashlib def theta(x): - """Heavisdie step function""" + """Heaviside step function""" return np.where(x>=0.,1.,0.) class spline(kernpart): diff --git a/GPy/models/GP_EP.py b/GPy/models/GP_EP.py index eb85b331..5dc721a4 100644 --- a/GPy/models/GP_EP.py +++ b/GPy/models/GP_EP.py @@ -26,6 +26,9 @@ class GP_EP(model): :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.] (list) :rtype: GPy model class. """ + if kernel is None: + kernel = kern.rbf(X.shape[1]) + kern.bias(X.shape[1]) + kern.white(X.shape[1]) + assert isinstance(kernel,kern.kern), 'kernel is not a kern instance' self.likelihood = likelihood self.Y = self.likelihood.Y diff --git a/GPy/models/GP_regression.py b/GPy/models/GP_regression.py index c766f017..0433b987 100644 --- a/GPy/models/GP_regression.py +++ b/GPy/models/GP_regression.py @@ -29,7 +29,7 @@ class GP_regression(model): def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None): if kernel is None: - kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1]) + kern.bias(X.shape[1]) + kernel = kern.rbf(X.shape[1]) + kern.bias(X.shape[1]) + kern.white(X.shape[1]) # parse arguments self.Xslices = Xslices diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py index 0e8144c4..3c28cde3 100644 --- a/GPy/util/__init__.py +++ b/GPy/util/__init__.py @@ -9,3 +9,4 @@ import squashers import Tango import misc import warping_functions +import datasets diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py new file mode 100644 index 00000000..3379ccd3 --- /dev/null +++ b/GPy/util/datasets.py @@ -0,0 +1,206 @@ +import os +import posix +import pylab as pb +import numpy as np +import GPy +import 
scipy.sparse +import scipy.io +data_path = os.path.join(os.path.dirname(__file__),'datasets') +default_seed =10000 + +# Some general utilities. +def sample_class(f): + p = 1./(1.+np.exp(-f)) + c = np.random.binomial(1,p) + c = np.where(c,1,-1) + return c + +def della_gatta_TRP63_gene_expression(gene_number=None): + matData = scipy.io.loadmat(os.path.join(data_path, 'DellaGattadata.mat')) + X = np.double(matData['timepoints']) + if gene_number == None: + Y = matData['exprs_tp53_RMA'] + else: + Y = matData['exprs_tp53_RMA'][:, gene_number] + if len(Y.shape) == 1: + Y = Y[:, None] + return {'X': X, 'Y': Y, 'info': "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA."} + + +# The data sets +def oil(): + fid = open(os.path.join(data_path, 'oil', 'DataTrn.txt')) + X = np.fromfile(fid, sep='\t').reshape((-1, 12)) + fid.close() + fid = open(os.path.join(data_path, 'oil', 'DataTrnLbls.txt')) + Y = np.fromfile(fid, sep='\t').reshape((-1, 3))*2.-1. + fid.close() + return {'X': X, 'Y': Y, 'info': "The oil data from Bishop and James (1993)."} + +def oil_100(seed=default_seed): + np.random.seed(seed=seed) + data = oil() + indices = np.random.permutation(1000) + indices = indices[0:100] + X = data['X'][indices, :] + Y = data['Y'][indices, :] + return {'X': X, 'Y': Y, 'info': "Subsample of the oil data extracting 100 values randomly without replacement."} + +def pumadyn(seed=default_seed): + # Data is variance 1, no need to normalise. 
+ data = np.loadtxt(os.path.join(data_path, 'pumadyn-32nm/Dataset.data.gz')) + indices = np.random.permutation(data.shape[0]) + indicesTrain = indices[0:7168] + indicesTest = indices[7168:-1] + indicesTrain.sort(axis=0) + indicesTest.sort(axis=0) + X = data[indicesTrain, 0:-2] + Y = data[indicesTrain, -1][:, None] + Xtest = data[indicesTest, 0:-2] + Ytest = data[indicesTest, -1][:, None] + return {'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'info': "The puma robot arm data with 32 inputs. This data is the non linear case with medium noise (pumadyn-32nm). For training 7,168 examples are sampled without replacement."} + + + +def silhouette(): + # Ankur Agarwal and Bill Trigg's silhoutte data. + matData = scipy.io.loadmat(os.path.join(data_path, 'mocap', 'ankur', 'ankurDataPoseSilhouette.mat')) + inMean = np.mean(matData['Y']) + inScales = np.sqrt(np.var(matData['Y'])) + X = matData['Y'] - inMean + X = X/inScales + Xtest = matData['Y_test'] - inMean + Xtest = Xtest/inScales + Y = matData['Z'] + Ytest = matData['Z_test'] + return {'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'info': "Artificial silhouette simulation data developed from Agarwal and Triggs (2004)."} + +def swiss_roll_1000(): + matData = scipy.io.loadmat(os.path.join(data_path, 'swiss_roll_data')) + Y = matData['X_data'][:, 0:1000].transpose() + return {'Y': Y, 'info': "Subsample of the swiss roll data extracting only the first 1000 values."} + +def swiss_roll(): + matData = scipy.io.loadmat(os.path.join(data_path, 'swiss_roll_data.mat')) + Y = matData['X_data'][:, 0:3000].transpose() + return {'Y': Y, 'info': "The first 3,000 points from the swiss roll data of Tennenbaum, de Silva and Langford (2001)."} + +def toy_rbf_1d(seed=default_seed): + np.random.seed(seed=seed) + numIn = 1 + N = 500 + X = np.random.uniform(low=-1.0, high=1.0, size=(N, numIn)) + X.sort(axis=0) + rbf = GPy.kern.rbf(numIn, variance=1., lengthscale=0.25) + white = GPy.kern.white(numIn, variance=1e-2) + kernel = rbf + white + K 
= kernel.K(X) + y = np.reshape(np.random.multivariate_normal(np.zeros(N), K), (N,1)) + return {'X':X, 'Y':y, 'info': "Samples 500 values of a function from an RBF covariance with very small noise for inputs uniformly distributed between -1 and 1."} + +def toy_rbf_1d_50(seed=default_seed): + np.random.seed(seed=seed) + data = toy_rbf_1d() + indices = np.random.permutation(data['X'].shape[0]) + indices = indices[0:50] + indices.sort(axis=0) + X = data['X'][indices, :] + Y = data['Y'][indices, :] + return {'X': X, 'Y': Y, 'info': "Subsamples the toy_rbf_sample with 50 values randomly taken from the original sample."} + + +def toy_linear_1d_classification(seed=default_seed): + np.random.seed(seed=seed) + x1 = np.random.normal(-3,5,20) + x2 = np.random.normal(3,5,20) + X = (np.r_[x1,x2])[:,None] + return {'X': X, 'Y': sample_class(2.*X), 'F': 2.*X} + +def rogers_girolami_olympics(): + olympic_data = scipy.io.loadmat('/home/neil/public_html/olympics.mat')['male100'] + X = olympic_data[:, 0][:, None] + Y= olympic_data[:, 1][:, None] + return {'X': X, 'Y': Y, 'info': "Olympic sprint times for 100 m men from 1896 until 2008. 
Example is from Rogers and Girolami's First Course in Machine Learning."} +# def movielens_small(partNo=1,seed=default_seed): +# np.random.seed(seed=seed) + +# fileName = os.path.join(data_path, 'movielens', 'small', 'u' + str(partNo) + '.base') +# fid = open(fileName) +# uTrain = np.fromfile(fid, sep='\t', dtype=np.int16).reshape((-1, 4)) +# fid.close() +# maxVals = np.amax(uTrain, axis=0) +# numUsers = maxVals[0] +# numFilms = maxVals[1] +# numRatings = uTrain.shape[0] + +# Y = scipy.sparse.lil_matrix((numFilms, numUsers), dtype=np.int8) +# for i in range(numUsers): +# ind = pb.mlab.find(uTrain[:, 0]==i+1) +# Y[uTrain[ind, 1]-1, i] = uTrain[ind, 2] + +# fileName = os.path.join(data_path, 'movielens', 'small', 'u' + str(partNo) + '.test') +# fid = open(fileName) +# uTest = np.fromfile(fid, sep='\t', dtype=np.int16).reshape((-1, 4)) +# fid.close() +# numTestRatings = uTest.shape[0] + +# Ytest = scipy.sparse.lil_matrix((numFilms, numUsers), dtype=np.int8) +# for i in range(numUsers): +# ind = pb.mlab.find(uTest[:, 0]==i+1) +# Ytest[uTest[ind, 1]-1, i] = uTest[ind, 2] + +# lbls = np.empty((1,1)) +# lblstest = np.empty((1,1)) +# return {'Y':Y, 'lbls':lbls, 'Ytest':Ytest, 'lblstest':lblstest} + + + + +def crescent_data(num_data=200,seed=default_seed): + """Data set formed from a mixture of four Gaussians. In each class two of the Gaussians are elongated at right angles to each other and offset to form an approximation to the crescent data that is popular in semi-supervised learning as a toy problem. + :param num_data_part: number of data to be sampled (default is 200). + :type num_data: int + :param seed: random seed to be used for data generation. 
+ :type seed: int""" + np.random.seed(seed=seed) + sqrt2 = np.sqrt(2) + # Rotation matrix + R = np.array([[sqrt2/2, -sqrt2/2], [sqrt2/2, sqrt2/2]]) + # Scaling matrices + scales = [] + scales.append(np.array([[3, 0], [0, 1]])) + scales.append(np.array([[3, 0], [0, 1]])) + scales.append([[1, 0], [0, 3]]) + scales.append([[1, 0], [0, 3]]) + means = [] + means.append(np.array([4, 4])) + means.append(np.array([0, 4])) + means.append(np.array([-4, -4])) + means.append(np.array([0, -4])) + + Xparts = [] + num_data_part = [] + num_data_total = 0 + for i in range(0, 4): + num_data_part.append(round(((i+1)*num_data)/4.)) + num_data_part[i] -= num_data_total + print num_data_part[i] + part = np.random.normal(size=(num_data_part[i], 2)) + part = np.dot(np.dot(part, scales[i]), R) + means[i] + Xparts.append(part) + num_data_total += num_data_part[i] + X = np.vstack((Xparts[0], Xparts[1], Xparts[2], Xparts[3])) + + + Y = np.vstack((np.ones((num_data_part[0]+num_data_part[1], 1)), -np.ones((num_data_part[2]+num_data_part[3], 1)))) + return {'X':X, 'Y':Y, 'info': "Two separate classes of data formed approximately in the shape of two crescents."} + + +def creep_data(): + all_data = np.loadtxt(os.path.join(data_path, 'creep', 'taka')) + y = all_data[:, 1:2].copy() + features = [0] + features.extend(range(2, 31)) + X = all_data[:,features].copy() + + return {'X': X, 'y' : y}