From 6da0feb7fcc50e2527e2453f7478115c42055401 Mon Sep 17 00:00:00 2001
From: Nicolo Fusi <nicolo.fusi@gmail.com>
Date: Wed, 13 Mar 2013 14:24:10 +0000
Subject: [PATCH 01/21] added fixed effect kernel

---
 GPy/kern/__init__.py        |  2 +-
 GPy/kern/constructors.py    | 14 +++++++++++++
 GPy/kern/fixed.py           | 42 +++++++++++++++++++++++++++++++++++++
 GPy/testing/kernel_tests.py | 11 ++++++++++
 4 files changed, 68 insertions(+), 1 deletion(-)
 create mode 100644 GPy/kern/fixed.py

diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py
index 6852384c..5d8a7d15 100644
--- a/GPy/kern/__init__.py
+++ b/GPy/kern/__init__.py
@@ -2,5 +2,5 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 
-from constructors import rbf, Matern32, Matern52, exponential, linear, white, bias, finite_dimensional, spline, Brownian, rbf_sympy, sympykern, periodic_exponential, periodic_Matern32, periodic_Matern52, prod, prod_orthogonal, symmetric, coregionalise, rational_quadratic
+from constructors import rbf, Matern32, Matern52, exponential, linear, white, bias, finite_dimensional, spline, Brownian, rbf_sympy, sympykern, periodic_exponential, periodic_Matern32, periodic_Matern52, prod, prod_orthogonal, symmetric, coregionalise, rational_quadratic, fixed
 from kern import kern
diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py
index 983674b0..ec34242b 100644
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@@ -12,6 +12,7 @@ from exponential import exponential as exponentialpart
 from Matern32 import Matern32 as Matern32part
 from Matern52 import Matern52 as Matern52part
 from bias import bias as biaspart
+from fixed import fixed as fixedpart
 from finite_dimensional import finite_dimensional as finite_dimensionalpart
 from spline import spline as splinepart
 from Brownian import Brownian as Brownianpart
@@ -296,3 +297,16 @@ def rational_quadratic(D,variance=1., lengthscale=1., power=1.):
     """
     part = rational_quadraticpart(D,variance, lengthscale, power)
     return kern(D, [part])
+
+def fixed(D, K, variance=1.):
+    """
+     Construct a fixed effect kernel.
+
+     Arguments
+     ---------
+     D (int), obligatory
+     K (np.array), obligatory
+     variance (float)
+    """
+    part = fixedpart(D, K, variance)
+    return kern(D, [part])
diff --git a/GPy/kern/fixed.py b/GPy/kern/fixed.py
new file mode 100644
index 00000000..8732ec8f
--- /dev/null
+++ b/GPy/kern/fixed.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from kernpart import kernpart
+import numpy as np
+import hashlib
+
+class fixed(kernpart):
+    def __init__(self,D,K,variance=1.):
+        """
+        :param D: the number of input dimensions
+        :type D: int
+        :param variance: the variance of the kernel
+        :type variance: float
+        """
+        self.D = D
+        self.fixed_K = K
+        self.Nparam = 1
+        self.name = 'fixed'
+        self._set_params(np.array([variance]).flatten())
+
+    def _get_params(self):
+        return self.variance
+
+    def _set_params(self,x):
+        assert x.shape==(1,)
+        self.variance = x
+
+    def _get_param_names(self):
+        return ['variance']
+
+    def K(self,X,X2,target):
+        target += self.variance * self.fixed_K
+
+    def dK_dtheta(self,partial,X,X2,target):
+        target += (partial * self.fixed_K).sum()
+
+    def dK_dX(self, partial,X, X2, target):
+        pass
+
+    def dKdiag_dX(self,partial,X,target):
+        pass
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index f1762db8..0f6d8772 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -15,6 +15,17 @@ class KernelTests(unittest.TestCase):
         m = GPy.models.GP_regression(X,Y,K)
         self.assertTrue(m.checkgrad())
 
+    def test_fixedkernel(self):
+        """
+        Fixed effect kernel test
+        """
+        X = np.random.rand(30, 4)
+        K = np.dot(X, X.T)
+        kernel = GPy.kern.fixed(4, K)
+        Y = np.ones((30,1))
+        m = GPy.models.GP_regression(X,Y,kernel=kernel)
+        self.assertTrue(m.checkgrad())
+
     def test_coregionalisation(self):
         X1 = np.random.rand(50,1)*8
         X2 = np.random.rand(30,1)*5

From 0c0c7b000c3eae21a48d91fd70ab8a1f697fc22e Mon Sep 17 00:00:00 2001
From: Ricardo Andrade <acq11ra@sheffield.ac.uk>
Date: Tue, 19 Mar 2013 19:08:25 +0000
Subject: [PATCH 02/21] Generalized FITC is back

---
 GPy/models/__init__.py         |   1 +
 GPy/models/generalized_FITC.py | 203 +++++++++++++++++++++++++++++++++
 2 files changed, 204 insertions(+)
 create mode 100644 GPy/models/generalized_FITC.py

diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py
index 22aa803c..f442dc67 100644
--- a/GPy/models/__init__.py
+++ b/GPy/models/__init__.py
@@ -11,3 +11,4 @@ from warped_GP import warpedGP
 from sparse_GPLVM import sparse_GPLVM
 from uncollapsed_sparse_GP import uncollapsed_sparse_GP
 from Bayesian_GPLVM import Bayesian_GPLVM
+from generalized_FITC import generalized_FITC
diff --git a/GPy/models/generalized_FITC.py b/GPy/models/generalized_FITC.py
new file mode 100644
index 00000000..739059a0
--- /dev/null
+++ b/GPy/models/generalized_FITC.py
@@ -0,0 +1,203 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+import pylab as pb
+from ..util.linalg import mdot, jitchol, chol_inv, pdinv, trace_dot
+from ..util.plot import gpplot
+from .. import kern
+from scipy import stats, linalg
+from sparse_GP import sparse_GP
+
+class generalized_FITC(sparse_GP):
+    """
+    Naish-Guzman, A. and Holden, S. (2008) implemantation of EP with FITC.
+
+    :param X: inputs
+    :type X: np.ndarray (N x Q)
+    :param likelihood: a likelihood instance, containing the observed data
+    :type likelihood: GPy.likelihood.(Gaussian | EP)
+    :param kernel : the kernel/covariance function. See link kernels
+    :type kernel: a GPy kernel
+    :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
+    :type X_uncertainty: np.ndarray (N x Q) | None
+    :param Z: inducing inputs (optional, see note)
+    :type Z: np.ndarray (M x Q) | None
+    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
+    :param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
+    :type M: int
+    :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
+    :type normalize_(X|Y): bool
+    """
+
+    def __init__(self, X, likelihood, kernel, Z, X_uncertainty=None, Xslices=None,Zslices=None, normalize_X=False):
+
+        if type(Z) == int:
+            #FIXME
+            self.M = Z
+            self.Z = (np.random.random_sample(self.D*self.M)*(self.X.max()-self.X.min())+self.X.min()).reshape(self.M,-1)
+        elif type(Z) == np.ndarray:
+            self.Z = Z
+            self.M = self.Z.shape[0]
+
+        self._precision = likelihood.precision
+
+        sparse_GP.__init__(self, X, likelihood, kernel=kernel, Z=self.Z, X_uncertainty=None, Xslices=None,Zslices=None, normalize_X=False)
+
+    def _set_params(self, p):
+        self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
+        self.kern._set_params(p[self.Z.size:self.Z.size+self.kern.Nparam])
+        self.likelihood._set_params(p[self.Z.size+self.kern.Nparam:])
+        self._compute_kernel_matrices()
+        self._computations()
+        self._FITC_computations()
+
+    def update_likelihood_approximation(self):
+        """
+        Approximates a non-gaussian likelihood using Expectation Propagation
+
+        For a Gaussian (or direct: TODO) likelihood, no iteration is required:
+        this function does nothing
+        """
+        if self.has_uncertain_inputs:
+            raise NotImplementedError, "FITC approximation not implemented for uncertain inputs"
+        else:
+            self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0)
+            self._precision = self.likelihood.precision # Save the true precision
+            self.likelihood.precision = self.likelihood.precision/(1. + self.likelihood.precision*self.Diag0[:,None]) # Add the diagonal element of the FITC approximation
+            self._set_params(self._get_params()) # update the GP
+
+
+    def _FITC_computations(self):
+        """
+        FITC approximation doesn't have the correction term in the log-likelihood bound,
+        but adds a diagonal term to the covariance matrix.
+        This function:
+            - computes the diagonal term
+            - eliminates the extra terms computed in the sparse_GP approximation
+            - computes the likelihood gradients wrt the true precision.
+        """
+        if self.likelihood.is_heteroscedastic:
+            # Compute generalized FITC's diagonal term of the covariance
+            self.Qnn = mdot(self.psi1.T,self.Kmmi,self.psi1)
+            self.Diag0 = self.psi0 - np.diag(self.Qnn)
+
+            self.Diag = self.Diag0/(1.+ self.Diag0 * self._precision.flatten())
+            self.P = (self.Diag / self.Diag0)[:,None] * self.psi1.T
+            self.RPT0 = np.dot(self.Lmi,self.psi1)
+            self.L = np.linalg.cholesky(np.eye(self.M) + np.dot(self.RPT0,(1./self.Diag0 - self.Diag/(self.Diag0**2))[:,None]*self.RPT0.T))
+            self.R,info = linalg.flapack.dtrtrs(self.L,self.Lmi,lower=1)
+            self.RPT = np.dot(self.R,self.P.T)
+            self.Sigma = np.diag(self.Diag) + np.dot(self.RPT.T,self.RPT)
+            self.w = self.Diag * self.likelihood.v_tilde
+            self.gamma = np.dot(self.R.T, np.dot(self.RPT,self.likelihood.v_tilde))
+            self.mu = self.w + np.dot(self.P,self.gamma)
+            self.mu_tilde = (self.likelihood.v_tilde/self.likelihood.tau_tilde)[:,None]
+
+            # Remove extra term from dL_dpsi1
+            self.dL_dpsi1 += -mdot(self.Kmmi,self.psi1*self.likelihood.precision.flatten().reshape(1,self.N)) #dB
+        else:
+            # Remove extra term from dL_dpsi1
+            self.dL_dpsi1 += -mdot(self.Kmmi,self.psi1*self.likelihood.precision) #dB
+
+
+        sf = self.scale_factor
+        sf2 = sf**2
+
+        # Remove extra term from dL_dKmm
+        self.dL_dKmm += 0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi)*sf2 # dB
+        self.dL_dpsi0 = None
+
+        #the partial derivative vector for the likelihood
+        if self.likelihood.Nparams == 0:
+            #save computation here.
+            self.partial_for_likelihood = None
+        elif self.likelihood.is_heteroscedastic:
+            raise NotImplementedError, "heteroscedatic derivates not implemented"
+        else:
+            #likelihood is not heterscedatic
+            self.partial_for_likelihood =   - 0.5 * self.N*self.D*self._precision + 0.5 * np.sum(np.square(self.likelihood.Y))*self._precision**2
+            self.partial_for_likelihood += 0.5 * self.D * trace_dot(self.Bi,self.A)*self._precision
+            self.partial_for_likelihood += self._precision*(0.5*trace_dot(self.psi2_beta_scaled,self.E*sf2) - np.trace(self.Cpsi1VVpsi1))
+
+        #TODO partial derivative vector for the likelihood not implemented
+        #NOTE the true precison is now '_precison' not 'precision'
+
+    def dL_dtheta(self):
+        """
+        Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel
+        """
+        dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z)
+        if self.has_uncertain_inputs:
+            raise NotImplementedError, "heteroscedatic derivates not implemented"
+        else:
+            #NOTE in sparse_GP this would include the gradient wrt psi0
+            dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1,self.Z,self.X)
+        return dL_dtheta
+
+
+    def log_likelihood(self):
+        """ Compute the (lower bound on the) log marginal likelihood """
+        sf2 = self.scale_factor**2
+        if self.likelihood.is_heteroscedastic:
+            A = -0.5*self.N*self.D*np.log(2.*np.pi) +0.5*np.sum(np.log(self.likelihood.precision)) -0.5*np.sum(self.V*self.likelihood.Y)
+        else:
+            A = -0.5*self.N*self.D*(np.log(2.*np.pi) + np.log(self.likelihood._variance)) -0.5*self.likelihood.precision*self.likelihood.trYYT
+        C = -0.5*self.D * (self.B_logdet + self.M*np.log(sf2))
+        D = 0.5*np.trace(self.Cpsi1VVpsi1)
+        return A+C+D
+
+    def _raw_predict(self, Xnew, slices, full_cov=True):
+        if self.likelihood.is_heteroscedastic:
+            """
+            Make a prediction for the vsGP model
+
+            Arguments
+            ---------
+            X : Input prediction data - Nx1 numpy array (floats)
+            """
+            Kx = self.kern.K(self.Z, Xnew)
+            if full_cov:
+                Kxx = self.kern.K(Xnew)
+            else:
+                Kxx = self.kern.K(Xnew)#FIXME
+                #raise NotImplementedError
+                #Kxx = self.kern.Kdiag(Xnew)
+
+            # q(u|f) = N(u| R0i*mu_u*f, R0i*C*R0i.T)
+
+            # Ci = I + (RPT0)Di(RPT0).T
+            # C = I - [RPT0] * (D+[RPT0].T*[RPT0])^-1*[RPT0].T
+            #   = I - [RPT0] * (D + self.Qnn)^-1 * [RPT0].T
+            #   = I - [RPT0] * (U*U.T)^-1 * [RPT0].T
+            #   = I - V.T * V
+            U = np.linalg.cholesky(np.diag(self.Diag0) + self.Qnn)
+            V,info = linalg.flapack.dtrtrs(U,self.RPT0.T,lower=1)
+            C = np.eye(self.M) - np.dot(V.T,V)
+            mu_u = np.dot(C,self.RPT0)*(1./self.Diag0[None,:])
+            #self.C = C
+            #self.RPT0 = np.dot(self.R0,self.Knm.T) P0.T
+            #self.mu_u = mu_u
+            #self.U = U
+            # q(u|y) = N(u| R0i*mu_H,R0i*Sigma_H*R0i.T)
+            mu_H = np.dot(mu_u,self.mu)
+            self.mu_H = mu_H
+            Sigma_H = C + np.dot(mu_u,np.dot(self.Sigma,mu_u.T))
+            # q(f_star|y) = N(f_star|mu_star,sigma2_star)
+            KR0T = np.dot(Kx.T,self.Lmi.T)
+            mu_star = np.dot(KR0T,mu_H)
+            sigma2_star = Kxx + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.M),KR0T.T))
+            vdiag = np.diag(sigma2_star)
+            return mu_star[:,None],vdiag[:,None]
+        else:
+            """Internal helper function for making predictions, does not account for normalization"""
+            Kx = self.kern.K(self.Z, Xnew)
+            mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V)
+            if full_cov:
+                Kxx = self.kern.K(Xnew)
+                var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) #NOTE this won't work for plotting
+            else:
+                Kxx = self.kern.Kdiag(Xnew)
+                var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0)
+
+            return mu,var[:,None]

From a375cb9a1b72b9ce8b7a58a52ffc51373db6eb52 Mon Sep 17 00:00:00 2001
From: Ricardo Andrade <acq11ra@sheffield.ac.uk>
Date: Fri, 22 Mar 2013 16:04:51 +0000
Subject: [PATCH 03/21] generalized fitc + examples

---
 GPy/examples/fitc_class_01.py  | 53 +++++++++++++++++++++
 GPy/examples/fitc_class_02.py  | 66 ++++++++++++++++++++++++++
 GPy/examples/fitc_regr_01.py   | 53 +++++++++++++++++++++
 GPy/models/generalized_FITC.py | 84 ++++++++++++++++++----------------
 4 files changed, 216 insertions(+), 40 deletions(-)
 create mode 100644 GPy/examples/fitc_class_01.py
 create mode 100644 GPy/examples/fitc_class_02.py
 create mode 100644 GPy/examples/fitc_regr_01.py

diff --git a/GPy/examples/fitc_class_01.py b/GPy/examples/fitc_class_01.py
new file mode 100644
index 00000000..413d0c14
--- /dev/null
+++ b/GPy/examples/fitc_class_01.py
@@ -0,0 +1,53 @@
+import pylab as pb
+import numpy as np
+import GPy
+pb.ion()
+pb.close('all')
+
+"""
+Simple 1D classification example
+:param seed : seed value for data generation (default is 4).
+:type seed: int
+"""
+seed=10000
+
+data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
+X = data['X']
+Y = data['Y'][:, 0:1]
+Y[Y == -1] = 0
+
+
+
+#X = np.vstack((np.random.uniform(0,10,(10,1)),np.random.uniform(7,17,(10,1)),np.random.uniform(15,25,(10,1))))
+#Y = np.vstack((np.zeros((10,1)),np.ones((10,1)),np.zeros((10,1))))
+
+# Kernel object
+kernel = GPy.kern.rbf(1) + GPy.kern.white(1)
+
+# Likelihood object
+distribution = GPy.likelihoods.likelihood_functions.probit()
+likelihood = GPy.likelihoods.EP(Y,distribution)
+
+Z = np.random.uniform(X.min(),X.max(),(10,1))
+#Z = np.array([0,20])[:,None]
+print Z
+
+# Model definition
+m = GPy.models.generalized_FITC(X,likelihood=likelihood,kernel=kernel,Z=Z,normalize_X=False)
+m.set('len',2.)
+
+m.ensure_default_constraints()
+# Optimize
+#m.constrain_fixed('iip')
+m.update_likelihood_approximation()
+print m.checkgrad(verbose=1)
+# Parameters optimization:
+m.optimize()
+#m.EPEM() #FIXME
+
+# Plot
+pb.subplot(211)
+m.plot_f()
+pb.subplot(212)
+m.plot()
+print(m)
diff --git a/GPy/examples/fitc_class_02.py b/GPy/examples/fitc_class_02.py
new file mode 100644
index 00000000..cd8d6700
--- /dev/null
+++ b/GPy/examples/fitc_class_02.py
@@ -0,0 +1,66 @@
+import pylab as pb
+import numpy as np
+import GPy
+pb.close('all')
+
+seed=10000
+"""Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
+
+:param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
+:param seed : seed value for data generation.
+:type seed: int
+:param inducing : number of inducing variables (only used for 'FITC' or 'DTC').
+:type inducing: int
+"""
+
+data = GPy.util.datasets.crescent_data(seed=seed)
+
+# Kernel object
+kernel = GPy.kern.rbf(data['X'].shape[1]) + GPy.kern.white(data['X'].shape[1])
+
+# Likelihood object
+distribution = GPy.likelihoods.likelihood_functions.probit()
+likelihood = GPy.likelihoods.EP(data['Y'],distribution)
+
+sample = np.random.randint(0,data['X'].shape[0],10)
+Z = data['X'][sample,:]
+# create sparse GP EP model
+#m = GPy.models.sparse_GP(data['X'],likelihood=likelihood,kernel=kernel,Z=Z)
+m = GPy.models.generalized_FITC(data['X'],likelihood=likelihood,kernel=kernel,Z=Z)
+m.ensure_default_constraints()
+m.set('len',10.)
+
+m.update_likelihood_approximation()
+
+# optimize
+m.optimize()
+print(m)
+
+# plot
+m.plot()
+fitc = m
+
+pb.figure()
+# Kernel object
+kernel = GPy.kern.rbf(data['X'].shape[1]) + GPy.kern.white(data['X'].shape[1])
+
+# Likelihood object
+distribution = GPy.likelihoods.likelihood_functions.probit()
+likelihood = GPy.likelihoods.EP(data['Y'],distribution)
+
+sample = np.random.randint(0,data['X'].shape[0],10)
+Z = data['X'][sample,:]
+# create sparse GP EP model
+m = GPy.models.sparse_GP(data['X'],likelihood=likelihood,kernel=kernel,Z=Z)
+m.ensure_default_constraints()
+m.set('len',10.)
+
+m.update_likelihood_approximation()
+
+# optimize
+m.optimize()
+print(m)
+
+# plot
+m.plot()
+variational = m
diff --git a/GPy/examples/fitc_regr_01.py b/GPy/examples/fitc_regr_01.py
new file mode 100644
index 00000000..815e31a7
--- /dev/null
+++ b/GPy/examples/fitc_regr_01.py
@@ -0,0 +1,53 @@
+import pylab as pb
+import numpy as np
+import GPy
+pb.ion()
+pb.close('all')
+
+N = 400
+M = 10
+# sample inputs and outputs
+X = np.random.uniform(-3.,3.,(N,1))
+Y = np.sin(X)+np.random.randn(N,1)*0.05
+
+"""Run a 1D example of a sparse GP regression."""
+"""
+rbf =  GPy.kern.rbf(1)
+noise = GPy.kern.white(1)
+kernel = rbf + noise
+Z = np.random.uniform(-3,3,(M,1))
+likelihood = GPy.likelihoods.Gaussian(Y)
+m = GPy.models.sparse_GP(X, likelihood, kernel, Z)
+m.scale_factor=10000
+m.constrain_positive('(variance|lengthscale|precision)')
+m.checkgrad(verbose=1)
+m.optimize('tnc', messages = 1)
+pb.figure()
+m.plot()
+
+variational = m
+"""
+
+# construct kernel
+rbf =  GPy.kern.rbf(1)
+noise = GPy.kern.white(1)
+kernel = rbf + noise
+#Z = np.random.uniform(-3,3,(M,1))
+Z = variational.Z
+likelihood = GPy.likelihoods.Gaussian(Y)
+# create simple GP model
+m = GPy.models.generalized_FITC(X, likelihood, kernel, Z=Z)
+m.constrain_positive('(variance|lengthscale|precision)')
+#m.constrain_fixed('iip')
+m.checkgrad(verbose=1)
+m.optimize('tnc', messages = 1)
+#pb.figure()
+#m.plot()
+"""
+Xnew = X.copy().flatten()
+Xnew.sort()
+Xnew = Xnew[:,None]
+mean,var,low,up = m.predict(Xnew)
+GPy.util.plot.gpplot(Xnew,mean,low,up)
+fitc = m
+"""
diff --git a/GPy/models/generalized_FITC.py b/GPy/models/generalized_FITC.py
index 739059a0..505f442a 100644
--- a/GPy/models/generalized_FITC.py
+++ b/GPy/models/generalized_FITC.py
@@ -32,14 +32,8 @@ class generalized_FITC(sparse_GP):
 
     def __init__(self, X, likelihood, kernel, Z, X_uncertainty=None, Xslices=None,Zslices=None, normalize_X=False):
 
-        if type(Z) == int:
-            #FIXME
-            self.M = Z
-            self.Z = (np.random.random_sample(self.D*self.M)*(self.X.max()-self.X.min())+self.X.min()).reshape(self.M,-1)
-        elif type(Z) == np.ndarray:
-            self.Z = Z
-            self.M = self.Z.shape[0]
-
+        self.Z = Z
+        self.M = self.Z.shape[0]
         self._precision = likelihood.precision
 
         sparse_GP.__init__(self, X, likelihood, kernel=kernel, Z=self.Z, X_uncertainty=None, Xslices=None,Zslices=None, normalize_X=False)
@@ -64,25 +58,26 @@ class generalized_FITC(sparse_GP):
         else:
             self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0)
             self._precision = self.likelihood.precision # Save the true precision
-            self.likelihood.precision = self.likelihood.precision/(1. + self.likelihood.precision*self.Diag0[:,None]) # Add the diagonal element of the FITC approximation
+            self.likelihood.precision = self._precision/(1. + self._precision*self.Diag0[:,None]) # Add the diagonal element of the FITC approximation
             self._set_params(self._get_params()) # update the GP
 
-
     def _FITC_computations(self):
         """
         FITC approximation doesn't have the correction term in the log-likelihood bound,
-        but adds a diagonal term to the covariance matrix.
+        but adds a diagonal term to the covariance matrix: diag(Knn - Qnn).
         This function:
-            - computes the diagonal term
-            - eliminates the extra terms computed in the sparse_GP approximation
+            - computes the FITC diagonal term
+            - removes the extra terms computed in the sparse_GP approximation
             - computes the likelihood gradients wrt the true precision.
         """
+        #NOTE the true precison is now '_precison' not 'precision'
         if self.likelihood.is_heteroscedastic:
+
             # Compute generalized FITC's diagonal term of the covariance
             self.Qnn = mdot(self.psi1.T,self.Kmmi,self.psi1)
             self.Diag0 = self.psi0 - np.diag(self.Qnn)
-
             self.Diag = self.Diag0/(1.+ self.Diag0 * self._precision.flatten())
+
             self.P = (self.Diag / self.Diag0)[:,None] * self.psi1.T
             self.RPT0 = np.dot(self.Lmi,self.psi1)
             self.L = np.linalg.cholesky(np.eye(self.M) + np.dot(self.RPT0,(1./self.Diag0 - self.Diag/(self.Diag0**2))[:,None]*self.RPT0.T))
@@ -92,14 +87,13 @@ class generalized_FITC(sparse_GP):
             self.w = self.Diag * self.likelihood.v_tilde
             self.gamma = np.dot(self.R.T, np.dot(self.RPT,self.likelihood.v_tilde))
             self.mu = self.w + np.dot(self.P,self.gamma)
-            self.mu_tilde = (self.likelihood.v_tilde/self.likelihood.tau_tilde)[:,None]
 
             # Remove extra term from dL_dpsi1
             self.dL_dpsi1 += -mdot(self.Kmmi,self.psi1*self.likelihood.precision.flatten().reshape(1,self.N)) #dB
         else:
+            raise NotImplementedError, "homoscedastic fitc not implemented"
             # Remove extra term from dL_dpsi1
-            self.dL_dpsi1 += -mdot(self.Kmmi,self.psi1*self.likelihood.precision) #dB
-
+            #self.dL_dpsi1 += -mdot(self.Kmmi,self.psi1*self.likelihood.precision) #dB
 
         sf = self.scale_factor
         sf2 = sf**2
@@ -110,18 +104,16 @@ class generalized_FITC(sparse_GP):
 
         #the partial derivative vector for the likelihood
         if self.likelihood.Nparams == 0:
-            #save computation here.
             self.partial_for_likelihood = None
         elif self.likelihood.is_heteroscedastic:
-            raise NotImplementedError, "heteroscedatic derivates not implemented"
+            raise NotImplementedError, "heteroscedastic derivates not implemented"
         else:
+            raise NotImplementedError, "homoscedastic derivatives not implemented"
             #likelihood is not heterscedatic
-            self.partial_for_likelihood =   - 0.5 * self.N*self.D*self._precision + 0.5 * np.sum(np.square(self.likelihood.Y))*self._precision**2
-            self.partial_for_likelihood += 0.5 * self.D * trace_dot(self.Bi,self.A)*self._precision
-            self.partial_for_likelihood += self._precision*(0.5*trace_dot(self.psi2_beta_scaled,self.E*sf2) - np.trace(self.Cpsi1VVpsi1))
-
+            #self.partial_for_likelihood =   - 0.5 * self.N*self.D*self.likelihood.precision + 0.5 * np.sum(np.square(self.likelihood.Y))*self.likelihood.precision**2
+            #self.partial_for_likelihood += 0.5 * self.D * trace_dot(self.Bi,self.A)*self.likelihood.precision
+            #self.partial_for_likelihood += self.likelihood.precision*(0.5*trace_dot(self.psi2_beta_scaled,self.E*sf2) - np.trace(self.Cpsi1VVpsi1))
         #TODO partial derivative vector for the likelihood not implemented
-        #NOTE the true precison is now '_precison' not 'precision'
 
     def dL_dtheta(self):
         """
@@ -147,23 +139,15 @@ class generalized_FITC(sparse_GP):
         D = 0.5*np.trace(self.Cpsi1VVpsi1)
         return A+C+D
 
-    def _raw_predict(self, Xnew, slices, full_cov=True):
+    def _raw_predict(self, Xnew, slices, full_cov=False):
         if self.likelihood.is_heteroscedastic:
             """
-            Make a prediction for the vsGP model
+            Make a prediction for the generalized FITC model
 
             Arguments
             ---------
             X : Input prediction data - Nx1 numpy array (floats)
             """
-            Kx = self.kern.K(self.Z, Xnew)
-            if full_cov:
-                Kxx = self.kern.K(Xnew)
-            else:
-                Kxx = self.kern.K(Xnew)#FIXME
-                #raise NotImplementedError
-                #Kxx = self.kern.Kdiag(Xnew)
-
             # q(u|f) = N(u| R0i*mu_u*f, R0i*C*R0i.T)
 
             # Ci = I + (RPT0)Di(RPT0).T
@@ -184,13 +168,19 @@ class generalized_FITC(sparse_GP):
             self.mu_H = mu_H
             Sigma_H = C + np.dot(mu_u,np.dot(self.Sigma,mu_u.T))
             # q(f_star|y) = N(f_star|mu_star,sigma2_star)
+            Kx = self.kern.K(self.Z, Xnew)
             KR0T = np.dot(Kx.T,self.Lmi.T)
             mu_star = np.dot(KR0T,mu_H)
-            sigma2_star = Kxx + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.M),KR0T.T))
-            vdiag = np.diag(sigma2_star)
-            return mu_star[:,None],vdiag[:,None]
-        else:
-            """Internal helper function for making predictions, does not account for normalization"""
+            if full_cov:
+                Kxx = self.kern.K(Xnew)
+                var = Kxx + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.M),KR0T.T))
+            else:
+                Kxx = self.kern.Kdiag(Xnew)
+                Kxx_ = self.kern.K(Xnew)
+                var_ = Kxx_ + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.M),KR0T.T))
+                var = (Kxx + np.sum(KR0T.T*np.dot(Sigma_H - np.eye(self.M),KR0T.T),0))[:,None]
+            return mu_star[:,None],var
+            """
             Kx = self.kern.K(self.Z, Xnew)
             mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V)
             if full_cov:
@@ -199,5 +189,19 @@ class generalized_FITC(sparse_GP):
             else:
                 Kxx = self.kern.Kdiag(Xnew)
                 var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0)
-
+            a = kjk
             return mu,var[:,None]
+            """
+        else:
+            raise NotImplementedError, "homoscedastic fitc not implemented"
+            """
+            Kx = self.kern.K(self.Z, Xnew)
+            mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V)
+            if full_cov:
+                Kxx = self.kern.K(Xnew)
+                var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) #NOTE this won't work for plotting
+            else:
+                Kxx = self.kern.Kdiag(Xnew)
+                var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0)
+            return mu,var[:,None]
+            """

From 35a099f4edfa1605e2b02d943438dba1b6454413 Mon Sep 17 00:00:00 2001
From: Ricardo Andrade <acq11ra@sheffield.ac.uk>
Date: Fri, 22 Mar 2013 19:03:46 +0000
Subject: [PATCH 04/21] minor changes

---
 GPy/examples/fitc_class_01.py |  4 ++--
 GPy/examples/fitc_class_02.py | 15 ++++++++-------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/GPy/examples/fitc_class_01.py b/GPy/examples/fitc_class_01.py
index 413d0c14..06c948b5 100644
--- a/GPy/examples/fitc_class_01.py
+++ b/GPy/examples/fitc_class_01.py
@@ -42,8 +42,8 @@ m.ensure_default_constraints()
 m.update_likelihood_approximation()
 print m.checkgrad(verbose=1)
 # Parameters optimization:
-m.optimize()
-#m.EPEM() #FIXME
+#m.optimize()
+m.pseudo_EM() #FIXME
 
 # Plot
 pb.subplot(211)
diff --git a/GPy/examples/fitc_class_02.py b/GPy/examples/fitc_class_02.py
index cd8d6700..71e10081 100644
--- a/GPy/examples/fitc_class_02.py
+++ b/GPy/examples/fitc_class_02.py
@@ -25,15 +25,15 @@ likelihood = GPy.likelihoods.EP(data['Y'],distribution)
 sample = np.random.randint(0,data['X'].shape[0],10)
 Z = data['X'][sample,:]
 # create sparse GP EP model
-#m = GPy.models.sparse_GP(data['X'],likelihood=likelihood,kernel=kernel,Z=Z)
 m = GPy.models.generalized_FITC(data['X'],likelihood=likelihood,kernel=kernel,Z=Z)
 m.ensure_default_constraints()
 m.set('len',10.)
 
-m.update_likelihood_approximation()
+#m.update_likelihood_approximation()
 
 # optimize
-m.optimize()
+#m.optimize()
+m.pseudo_EM()
 print(m)
 
 # plot
@@ -48,17 +48,18 @@ kernel = GPy.kern.rbf(data['X'].shape[1]) + GPy.kern.white(data['X'].shape[1])
 distribution = GPy.likelihoods.likelihood_functions.probit()
 likelihood = GPy.likelihoods.EP(data['Y'],distribution)
 
-sample = np.random.randint(0,data['X'].shape[0],10)
-Z = data['X'][sample,:]
+#sample = np.random.randint(0,data['X'].shape[0],10)
+#Z = data['X'][sample,:]
 # create sparse GP EP model
 m = GPy.models.sparse_GP(data['X'],likelihood=likelihood,kernel=kernel,Z=Z)
 m.ensure_default_constraints()
 m.set('len',10.)
 
-m.update_likelihood_approximation()
+#m.update_likelihood_approximation()
 
 # optimize
-m.optimize()
+#m.optimize()
+m.pseudo_EM()
 print(m)
 
 # plot

From c72d2b99d99e8d97d827def1d3e180fc6d022165 Mon Sep 17 00:00:00 2001
From: Ricardo Andrade <acq11ra@sheffield.ac.uk>
Date: Wed, 27 Mar 2013 13:52:51 +0000
Subject: [PATCH 05/21] print iteration number

---
 GPy/core/model.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index cc8ae2c4..0804f277 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -467,3 +467,5 @@ class model(parameterised):
                 if ll_change < epsilon:
                     stop = True
             iteration += 1
+            if stop:
+                print "%s iterations." %iteration

From adfa6de1d87baced85215985da9ed2ce671c2a93 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 27 Mar 2013 15:03:46 +0000
Subject: [PATCH 06/21] rbf now works in a more memory friendly fashion

---
 GPy/examples/regression.py |  2 +-
 GPy/kern/rbf.py            | 30 +++++++++++++++---------------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py
index f5d0d3b1..7de95d20 100644
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@@ -66,7 +66,7 @@ def silhouette():
 
     # optimize
     m.ensure_default_constraints()
-    m.optimize()
+    m.optimize(messages=True)
 
     print(m)
     return m
diff --git a/GPy/kern/rbf.py b/GPy/kern/rbf.py
index ae587202..25e18a41 100644
--- a/GPy/kern/rbf.py
+++ b/GPy/kern/rbf.py
@@ -85,12 +85,10 @@ class rbf(kernpart):
     def dK_dtheta(self,dL_dK,X,X2,target):
         self._K_computations(X,X2)
         target[0] += np.sum(self._K_dvar*dL_dK)
-        if self.ARD == True:
-            dl = self._K_dvar[:,:,None]*self.variance*self._K_dist2/self.lengthscale
-            target[1:] += (dl*dL_dK[:,:,None]).sum(0).sum(0)
+        if self.ARD:
+            [np.add(target[1+q:2+q],self.variance/self.lengthscale[q]**3*np.sum(self._K_dvar*dL_dK*np.square(X[:,q][:,None]-X2[:,q][None,:])),target[1+q:2+q]) for q in range(self.D)]
         else:
-            target[1] += np.sum(self._K_dvar*self.variance*(self._K_dist2.sum(-1))/self.lengthscale*dL_dK)
-        #np.sum(self._K_dvar*self.variance*self._K_dist2/self.lengthscale*dL_dK)
+            target[1] += np.sum(self._K_dvar*self.variance*self._K_dist2/self.lengthscale*dL_dK)
 
     def dKdiag_dtheta(self,dL_dKdiag,X,target):
         #NB: derivative of diagonal elements wrt lengthscale is 0
@@ -98,7 +96,7 @@ class rbf(kernpart):
 
     def dK_dX(self,dL_dK,X,X2,target):
         self._K_computations(X,X2)
-        _K_dist = X[:,None,:]-X2[None,:,:]
+        _K_dist = X[:,None,:]-X2[None,:,:] #don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
         dK_dX = np.transpose(-self.variance*self._K_dvar[:,:,np.newaxis]*_K_dist/self.lengthscale2,(1,0,2))
         target += np.sum(dK_dX*dL_dK.T[:,:,None],0)
 
@@ -183,16 +181,18 @@ class rbf(kernpart):
     #---------------------------------------#
 
     def _K_computations(self,X,X2):
-        if not (np.all(X==self._X) and np.all(X2==self._X2)):
-            self._X = X
-            self._X2 = X2
+        if not (np.all(X==self._X) and np.all(X2==self._X2) and np.all(self._params == self._get_params())):
+            self._X = X.copy()
+            self._X2 = X2.copy()
+            self._params == self._get_params().copy()
             if X2 is None: X2 = X
-            self._K_dist = X[:,None,:]-X2[None,:,:] # this can be computationally heavy
-            self._params = np.empty(shape=(1,0))    #ensure the next section gets called
-        if not np.all(self._params == self._get_params()):
-            self._params == self._get_params()
-            self._K_dist2 = np.square(self._K_dist/self.lengthscale)
-            self._K_dvar = np.exp(-0.5*self._K_dist2.sum(-1))
+            #never do this: self._K_dist = X[:,None,:]-X2[None,:,:] # this can be computationally heavy
+            #_K_dist = X[:,None,:]-X2[None,:,:]
+            #_K_dist2 = np.square(_K_dist/self.lengthscale)
+            X = X/self.lengthscale
+            X2 = X2/self.lengthscale
+            self._K_dist2 = (-2.*np.dot(X, X2.T) + np.sum(np.square(X),1)[:,None] + np.sum(np.square(X2),1)[None,:])
+            self._K_dvar = np.exp(-0.5*self._K_dist2)
 
     def _psi_computations(self,Z,mu,S):
         #here are the "statistics" for psi1 and psi2

From 67d3ba6700983f4adf557d7120ab73ee86d0c62f Mon Sep 17 00:00:00 2001
From: Ricardo Andrade <acq11ra@sheffield.ac.uk>
Date: Wed, 27 Mar 2013 15:05:50 +0000
Subject: [PATCH 07/21] small changes

---
 GPy/examples/fitc_class_01.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/GPy/examples/fitc_class_01.py b/GPy/examples/fitc_class_01.py
index 06c948b5..38c9da63 100644
--- a/GPy/examples/fitc_class_01.py
+++ b/GPy/examples/fitc_class_01.py
@@ -11,15 +11,15 @@ Simple 1D classification example
 """
 seed=10000
 
-data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
-X = data['X']
-Y = data['Y'][:, 0:1]
-Y[Y == -1] = 0
+#data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
+#X = data['X']
+#Y = data['Y'][:, 0:1]
+#Y[Y == -1] = 0
 
 
 
-#X = np.vstack((np.random.uniform(0,10,(10,1)),np.random.uniform(7,17,(10,1)),np.random.uniform(15,25,(10,1))))
-#Y = np.vstack((np.zeros((10,1)),np.ones((10,1)),np.zeros((10,1))))
+X = np.vstack((np.random.uniform(0,10,(10,1)),np.random.uniform(7,17,(10,1)),np.random.uniform(15,25,(10,1))))
+Y = np.vstack((np.zeros((10,1)),np.ones((10,1)),np.zeros((10,1))))
 
 # Kernel object
 kernel = GPy.kern.rbf(1) + GPy.kern.white(1)

From bf9e6cbd5f4680152727e908118418ca37a244db Mon Sep 17 00:00:00 2001
From: Ricardo Andrade <acq11ra@sheffield.ac.uk>
Date: Wed, 27 Mar 2013 15:06:21 +0000
Subject: [PATCH 08/21] Not needed

---
 GPy/examples/fitc_class_01.py | 53 ---------------------------
 GPy/examples/fitc_class_02.py | 67 -----------------------------------
 GPy/examples/fitc_regr_01.py  | 53 ---------------------------
 3 files changed, 173 deletions(-)
 delete mode 100644 GPy/examples/fitc_class_01.py
 delete mode 100644 GPy/examples/fitc_class_02.py
 delete mode 100644 GPy/examples/fitc_regr_01.py

diff --git a/GPy/examples/fitc_class_01.py b/GPy/examples/fitc_class_01.py
deleted file mode 100644
index 38c9da63..00000000
--- a/GPy/examples/fitc_class_01.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import pylab as pb
-import numpy as np
-import GPy
-pb.ion()
-pb.close('all')
-
-"""
-Simple 1D classification example
-:param seed : seed value for data generation (default is 4).
-:type seed: int
-"""
-seed=10000
-
-#data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
-#X = data['X']
-#Y = data['Y'][:, 0:1]
-#Y[Y == -1] = 0
-
-
-
-X = np.vstack((np.random.uniform(0,10,(10,1)),np.random.uniform(7,17,(10,1)),np.random.uniform(15,25,(10,1))))
-Y = np.vstack((np.zeros((10,1)),np.ones((10,1)),np.zeros((10,1))))
-
-# Kernel object
-kernel = GPy.kern.rbf(1) + GPy.kern.white(1)
-
-# Likelihood object
-distribution = GPy.likelihoods.likelihood_functions.probit()
-likelihood = GPy.likelihoods.EP(Y,distribution)
-
-Z = np.random.uniform(X.min(),X.max(),(10,1))
-#Z = np.array([0,20])[:,None]
-print Z
-
-# Model definition
-m = GPy.models.generalized_FITC(X,likelihood=likelihood,kernel=kernel,Z=Z,normalize_X=False)
-m.set('len',2.)
-
-m.ensure_default_constraints()
-# Optimize
-#m.constrain_fixed('iip')
-m.update_likelihood_approximation()
-print m.checkgrad(verbose=1)
-# Parameters optimization:
-#m.optimize()
-m.pseudo_EM() #FIXME
-
-# Plot
-pb.subplot(211)
-m.plot_f()
-pb.subplot(212)
-m.plot()
-print(m)
diff --git a/GPy/examples/fitc_class_02.py b/GPy/examples/fitc_class_02.py
deleted file mode 100644
index 71e10081..00000000
--- a/GPy/examples/fitc_class_02.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import pylab as pb
-import numpy as np
-import GPy
-pb.close('all')
-
-seed=10000
-"""Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
-
-:param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
-:param seed : seed value for data generation.
-:type seed: int
-:param inducing : number of inducing variables (only used for 'FITC' or 'DTC').
-:type inducing: int
-"""
-
-data = GPy.util.datasets.crescent_data(seed=seed)
-
-# Kernel object
-kernel = GPy.kern.rbf(data['X'].shape[1]) + GPy.kern.white(data['X'].shape[1])
-
-# Likelihood object
-distribution = GPy.likelihoods.likelihood_functions.probit()
-likelihood = GPy.likelihoods.EP(data['Y'],distribution)
-
-sample = np.random.randint(0,data['X'].shape[0],10)
-Z = data['X'][sample,:]
-# create sparse GP EP model
-m = GPy.models.generalized_FITC(data['X'],likelihood=likelihood,kernel=kernel,Z=Z)
-m.ensure_default_constraints()
-m.set('len',10.)
-
-#m.update_likelihood_approximation()
-
-# optimize
-#m.optimize()
-m.pseudo_EM()
-print(m)
-
-# plot
-m.plot()
-fitc = m
-
-pb.figure()
-# Kernel object
-kernel = GPy.kern.rbf(data['X'].shape[1]) + GPy.kern.white(data['X'].shape[1])
-
-# Likelihood object
-distribution = GPy.likelihoods.likelihood_functions.probit()
-likelihood = GPy.likelihoods.EP(data['Y'],distribution)
-
-#sample = np.random.randint(0,data['X'].shape[0],10)
-#Z = data['X'][sample,:]
-# create sparse GP EP model
-m = GPy.models.sparse_GP(data['X'],likelihood=likelihood,kernel=kernel,Z=Z)
-m.ensure_default_constraints()
-m.set('len',10.)
-
-#m.update_likelihood_approximation()
-
-# optimize
-#m.optimize()
-m.pseudo_EM()
-print(m)
-
-# plot
-m.plot()
-variational = m
diff --git a/GPy/examples/fitc_regr_01.py b/GPy/examples/fitc_regr_01.py
deleted file mode 100644
index 815e31a7..00000000
--- a/GPy/examples/fitc_regr_01.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import pylab as pb
-import numpy as np
-import GPy
-pb.ion()
-pb.close('all')
-
-N = 400
-M = 10
-# sample inputs and outputs
-X = np.random.uniform(-3.,3.,(N,1))
-Y = np.sin(X)+np.random.randn(N,1)*0.05
-
-"""Run a 1D example of a sparse GP regression."""
-"""
-rbf =  GPy.kern.rbf(1)
-noise = GPy.kern.white(1)
-kernel = rbf + noise
-Z = np.random.uniform(-3,3,(M,1))
-likelihood = GPy.likelihoods.Gaussian(Y)
-m = GPy.models.sparse_GP(X, likelihood, kernel, Z)
-m.scale_factor=10000
-m.constrain_positive('(variance|lengthscale|precision)')
-m.checkgrad(verbose=1)
-m.optimize('tnc', messages = 1)
-pb.figure()
-m.plot()
-
-variational = m
-"""
-
-# construct kernel
-rbf =  GPy.kern.rbf(1)
-noise = GPy.kern.white(1)
-kernel = rbf + noise
-#Z = np.random.uniform(-3,3,(M,1))
-Z = variational.Z
-likelihood = GPy.likelihoods.Gaussian(Y)
-# create simple GP model
-m = GPy.models.generalized_FITC(X, likelihood, kernel, Z=Z)
-m.constrain_positive('(variance|lengthscale|precision)')
-#m.constrain_fixed('iip')
-m.checkgrad(verbose=1)
-m.optimize('tnc', messages = 1)
-#pb.figure()
-#m.plot()
-"""
-Xnew = X.copy().flatten()
-Xnew.sort()
-Xnew = Xnew[:,None]
-mean,var,low,up = m.predict(Xnew)
-GPy.util.plot.gpplot(Xnew,mean,low,up)
-fitc = m
-"""

From 85810030c544cd06c97ef9712e34c01071759c5a Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 27 Mar 2013 15:08:25 +0000
Subject: [PATCH 09/21] small efficiency changes in rbf

---
 GPy/kern/rbf.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/GPy/kern/rbf.py b/GPy/kern/rbf.py
index 25e18a41..84f7d68d 100644
--- a/GPy/kern/rbf.py
+++ b/GPy/kern/rbf.py
@@ -86,9 +86,9 @@ class rbf(kernpart):
         self._K_computations(X,X2)
         target[0] += np.sum(self._K_dvar*dL_dK)
         if self.ARD:
-            [np.add(target[1+q:2+q],self.variance/self.lengthscale[q]**3*np.sum(self._K_dvar*dL_dK*np.square(X[:,q][:,None]-X2[:,q][None,:])),target[1+q:2+q]) for q in range(self.D)]
+            [np.add(target[1+q:2+q],(self.variance/self.lengthscale[q]**3)*np.sum(self._K_dvar*dL_dK*np.square(X[:,q][:,None]-X2[:,q][None,:])),target[1+q:2+q]) for q in range(self.D)]
         else:
-            target[1] += np.sum(self._K_dvar*self.variance*self._K_dist2/self.lengthscale*dL_dK)
+            target[1] += (self.variance/self.lengthscale)*np.sum(self._K_dvar*self._K_dist2*dL_dK)
 
     def dKdiag_dtheta(self,dL_dKdiag,X,target):
         #NB: derivative of diagonal elements wrt lengthscale is 0
@@ -97,7 +97,7 @@ class rbf(kernpart):
     def dK_dX(self,dL_dK,X,X2,target):
         self._K_computations(X,X2)
         _K_dist = X[:,None,:]-X2[None,:,:] #don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
-        dK_dX = np.transpose(-self.variance*self._K_dvar[:,:,np.newaxis]*_K_dist/self.lengthscale2,(1,0,2))
+        dK_dX = (-self.variance/self.lengthscale2)*np.transpose(self._K_dvar[:,:,np.newaxis]*_K_dist,(1,0,2))
         target += np.sum(dK_dX*dL_dK.T[:,:,None],0)
 
     def dKdiag_dX(self,dL_dKdiag,X,target):

From 0f168a16670430af60d9885f9498aec9f9da99c8 Mon Sep 17 00:00:00 2001
From: Neil Lawrence <neil@stefano.(none)>
Date: Mon, 1 Apr 2013 11:01:16 +0100
Subject: [PATCH 10/21] Added mocap.py for loading in motion capture data.

---
 GPy/util/mocap.py | 78 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 GPy/util/mocap.py

diff --git a/GPy/util/mocap.py b/GPy/util/mocap.py
new file mode 100644
index 00000000..7e4e9e11
--- /dev/null
+++ b/GPy/util/mocap.py
@@ -0,0 +1,78 @@
+import os
+import numpy as np
+
+def load_text_data(dataset, directory, centre=True):
+    """Load in a data set of marker points from the Ohio State University C3D motion capture files (http://accad.osu.edu/research/mocap/mocap_data.htm)."""
+
+    points, point_names = parse_text(os.path.join(directory, dataset + '.txt'))[0:2]
+    # Remove markers where there is a NaN
+    present_index = [i for i in range(points[0].shape[1]) if not (np.any(np.isnan(points[0][:, i])) or np.any(np.isnan(points[0][:, i])) or np.any(np.isnan(points[0][:, i])))]
+
+    # Concatanate the X, Y and Z markers together
+    Y = np.concatenate((points[0][:, present_index], points[1][:, present_index], points[2][:, present_index]), axis=1)
+    if centre:
+        Y = Y - Y.mean(axis=0)
+    Y = Y/400.
+    return Y
+
+def parse_text(file_name):
+    """Parse data from Ohio State University text mocap files (http://accad.osu.edu/research/mocap/mocap_data.htm)."""
+
+    # Read the header
+    fid = open(file_name, 'r')
+    point_names = np.array(fid.readline().split())[2:-1:3]
+    fid.close()
+    for i in range(len(point_names)):
+        point_names[i] = point_names[i][0:-3]
+
+    # Read the matrix data
+    S = np.loadtxt(file_name, skiprows=1)
+    field = np.uint(S[:, 0])
+    times = S[:, 1]
+    S = S[:, 2:]
+
+    # Set the -9999.99 markers to be not present
+    S[S==-9999.99] = np.NaN
+
+    # Store x, y and z in different arrays
+    points = []
+    points.append(S[:, 0:-1:3])
+    points.append(S[:, 1:-1:3])
+    points.append(S[:, 2:-1:3])
+
+    return points, point_names, times
+
+#def read_connections():
+
+
+#     fid = fopen(fileName);
+#     i = 1;
+#     rem = fgets(fid);	
+#     while(rem ~= -1)		
+#         [token, rem] = strtok(rem, ',');
+#         connections{i, 1} = fliplr(deblank(fliplr(deblank(token))));
+#         [token, rem] = strtok(rem, ',');
+#         connections{i, 2} = fliplr(deblank(fliplr(deblank(token))));
+#         i = i + 1;
+#         rem = fgets(fid);	
+#     end
+
+#     connect = zeros(length(pointNames));
+#     fclose(fid);
+#     for i = 1:size(connections, 1);
+#         for j = 1:length(pointNames)
+#             if strcmp(pointNames{j}, connections{i, 1}) | ...
+#                 strcmp(pointNames{j}, connections{i, 2})
+#       for k = 1:length(pointNames)
+#         if k == j
+#           break
+#         end
+#         if strcmp(pointNames{k}, connections{i, 1}) | ...
+#               strcmp(pointNames{k}, connections{i, 2})
+#           connect(j, k) = 1;
+#         end
+#       end
+#     end
+#   end
+# end
+# connect = sparse(connect);      

From 3fd0672092f58551ce48fb739709868d45deb49c Mon Sep 17 00:00:00 2001
From: Neil Lawrence <neil@stefano.(none)>
Date: Mon, 1 Apr 2013 20:20:01 +0100
Subject: [PATCH 11/21] Added base implementation of data visualization
 framework for use with GP-LVM.

---
 GPy/util/visualize.py | 159 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 159 insertions(+)
 create mode 100644 GPy/util/visualize.py

diff --git a/GPy/util/visualize.py b/GPy/util/visualize.py
new file mode 100644
index 00000000..eba89b8a
--- /dev/null
+++ b/GPy/util/visualize.py
@@ -0,0 +1,159 @@
+import matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d import Axes3D
+import GPy
+import numpy as np
+
+class lvm:
+    def __init__(self, model, data_visualize, ax):
+        self.cid = ax.figure.canvas.mpl_connect('button_press_event', self.on_click)
+        self.cid = ax.figure.canvas.mpl_connect('motion_notify_event', self.on_move)
+        self.data_visualize = data_visualize
+        self.model = model
+        self.ax = ax
+        self.called = False
+        self.move_on = False
+
+    def on_click(self, event):
+        print 'click', event.xdata, event.ydata
+        if event.inaxes!=self.ax: return
+        self.move_on = not self.move_on
+        print
+        if self.called:
+            self.xs.append(event.xdata)
+            self.ys.append(event.ydata)
+            self.line.set_data(self.xs, self.ys)
+            self.line.figure.canvas.draw()
+        else:
+            self.xs = [event.xdata]
+            self.ys = [event.ydata]
+            self.line, = ax.plot(event.xdata, event.ydata)
+        self.called = True
+    def on_move(self, event):
+        if event.inaxes!=self.ax: return
+        if self.called and self.move_on:
+            # Call modify code on move
+            #print 'move', event.xdata, event.ydata
+            latent_values = np.array((event.xdata, event.ydata))
+            y = self.model.predict(latent_values)[0]
+            self.data_visualize.modify(y)
+            #print 'y', y
+
+class data_show:
+    """The data show class is a base class which describes how to visualize a particular data set. For example, motion capture data can be plotted as a stick figure, or images are shown using imshow. This class enables latent to data visualizations for the GP-LVM."""
+
+    def __init__(self, vals, axis=None):
+        self.vals = vals
+        # If no axes are defined, create some.
+        if axis==None:
+            fig = plt.figure()
+            self.axis = fig.add_subplot(111)            
+        else:
+            self.axis = axis
+
+    def modify(self, vals):
+        raise NotImplementedError, "this needs to be implemented to use the data_show class"
+
+class vector_show(data_show):
+    """A base visualization class that just shows a data vector as a plot of vector elements alongside their indices."""
+    def __init__(self, vals, axis=None):
+        data_show.__init__(self, vals, axis)
+        self.vals = vals.T
+        self.handle = plt.plot(np.arange(0, len(vals))[:, None], self.vals)[0]
+
+    def modify(self, vals):
+        xdata, ydata = self.handle.get_data()
+        self.vals = vals.T
+        self.handle.set_data(xdata, self.vals)
+        self.axis.figure.canvas.draw()
+
+class image_show(data_show):
+    """Show a data vector as an image."""
+    def __init__(self, vals, axis=None, dimensions=(16,16), transpose=False, invert=False):
+        data_show.__init__(self, vals, axis)
+        self.dimensions = dimensions
+        self.fig_display = plt.figure()
+        self.set_image(vals)
+        self.handle = plt.imshow(self.vals)
+        self.transpose = transpose
+        self.invert = invert
+
+    def modify(self, vals):
+        self.set_image(vals)
+        self.handle.set_array(self.vals)
+        self.axis.figure.canvas.draw()
+
+    def set_image(self, vals):
+        self.vals = np.reshape(vals, self.dimensions)
+        if self.transpose:
+            self.vals = self.vals.T
+        if self.invert:
+            self.vals = -self.vals
+
+class stick_show(data_show):
+    """Show a three dimensional point cloud as a figure. Connect elements of the figure together using the matrix connect."""
+
+    def __init__(self, vals, axis=None, connect=None):
+        if axis==None:
+            fig = plt.figure()
+            axis = fig.add_subplot(111, projection='3d')
+        data_show.__init__(self, vals, axis)
+        self.vals = vals.reshape((3, vals.shape[1]/3)).T
+        self.x_lim = np.array([self.vals[:, 0].min(), self.vals[:, 0].max()])
+        self.y_lim = np.array([self.vals[:, 1].min(), self.vals[:, 1].max()])
+        self.z_lim = np.array([self.vals[:, 2].min(), self.vals[:, 2].max()])
+        self.points_handle = self.axis.scatter(self.vals[:, 0], self.vals[:, 1], self.vals[:, 2])
+        self.axis.set_xlim(self.x_lim)
+        self.axis.set_ylim(self.y_lim)
+        self.axis.set_zlim(self.z_lim)
+        self.axis.set_aspect(1)
+        self.axis.autoscale(enable=False)
+
+        self.connect = connect
+        if not self.connect==None:
+            x = []
+            y = []
+            z = []
+            self.I, self.J = np.nonzero(self.connect)
+            for i in range(len(self.I)):
+                x.append(self.vals[self.I[i], 0])
+                x.append(self.vals[self.J[i], 0])
+                x.append(np.NaN)
+                y.append(self.vals[self.I[i], 1])
+                y.append(self.vals[self.J[i], 1])
+                y.append(np.NaN)
+                z.append(self.vals[self.I[i], 2])
+                z.append(self.vals[self.J[i], 2])
+                z.append(np.NaN)
+            self.line_handle = self.axis.plot(np.array(x), np.array(y), np.array(z), 'b-')
+        self.axis.figure.canvas.draw()
+
+    def modify(self, vals):
+        self.points_handle.remove()
+        self.line_handle[0].remove()
+        self.vals = vals.reshape((3, vals.shape[1]/3)).T
+        self.points_handle = self.axis.scatter(self.vals[:, 0], self.vals[:, 1], self.vals[:, 2])
+        self.axis.set_xlim(self.x_lim)
+        self.axis.set_ylim(self.y_lim)
+        self.axis.set_zlim(self.z_lim)
+        self.line_handle = []
+        if not self.connect==None:
+            x = []
+            y = []
+            z = []
+            self.I, self.J = np.nonzero(self.connect)
+            for i in range(len(self.I)):
+                x.append(self.vals[self.I[i], 0])
+                x.append(self.vals[self.J[i], 0])
+                x.append(np.NaN)
+                y.append(self.vals[self.I[i], 1])
+                y.append(self.vals[self.J[i], 1])
+                y.append(np.NaN)
+                z.append(self.vals[self.I[i], 2])
+                z.append(self.vals[self.J[i], 2])
+                z.append(np.NaN)
+            self.line_handle = self.axis.plot(np.array(x), np.array(y), np.array(z), 'b-')
+
+        self.axis.figure.canvas.draw()
+        
+
+

From fce4dd7fdea42d24abb14396d02bf1bf475e15d7 Mon Sep 17 00:00:00 2001
From: Neil Lawrence <neil@stefano.(none)>
Date: Tue, 2 Apr 2013 02:20:53 +0200
Subject: [PATCH 12/21] Further edits on visualization code for faces example.

---
 GPy/core/model.py                        |  8 +--
 GPy/examples/dimensionality_reduction.py | 50 +++++++++++++++++
 GPy/examples/regression.py               |  6 +-
 GPy/models/GPLVM.py                      |  5 +-
 GPy/util/__init__.py                     |  2 +
 GPy/util/datasets.py                     | 45 ++++++++++-----
 GPy/util/linalg.py                       |  3 +-
 GPy/util/mocap.py                        | 70 +++++++++++-------------
 GPy/util/visualize.py                    | 42 ++++++++------
 9 files changed, 151 insertions(+), 80 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index 0804f277..9216aea6 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -23,13 +23,13 @@ class model(parameterised):
         self._set_params(self._get_params())
         self.preferred_optimizer = 'tnc'
     def _get_params(self):
-        raise NotImplementedError, "this needs to be implemented to utilise the model class"
+        raise NotImplementedError, "this needs to be implemented to use the model class"
     def _set_params(self,x):
-        raise NotImplementedError, "this needs to be implemented to utilise the model class"
+        raise NotImplementedError, "this needs to be implemented to use the model class"
     def log_likelihood(self):
-        raise NotImplementedError, "this needs to be implemented to utilise the model class"
+        raise NotImplementedError, "this needs to be implemented to use the model class"
     def _log_likelihood_gradients(self):
-        raise NotImplementedError, "this needs to be implemented to utilise the model class"
+        raise NotImplementedError, "this needs to be implemented to use the model class"
 
     def set_prior(self,which,what):
         """
diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index d7610acb..164d04ae 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -3,6 +3,8 @@
 
 import numpy as np
 import pylab as pb
+from matplotlib import pyplot as plt
+
 import GPy
 
 default_seed = np.random.seed(123344)
@@ -55,3 +57,51 @@ def GPLVM_oil_100():
     print(m)
     m.plot_latent(labels=data['Y'].argmax(axis=1))
     return m
+
+def oil_100():
+    data = GPy.util.datasets.oil_100()
+    m = GPy.models.GPLVM(data['X'], 2)
+
+    # optimize
+    m.ensure_default_constraints()
+    m.optimize(messages=1, max_iters=2)
+
+    # plot
+    print(m)
+    #m.plot_latent(labels=data['Y'].argmax(axis=1))
+    return m
+
+def brendan_faces():
+    data = GPy.util.datasets.brendan_faces()
+    Y = data['Y'][0:500, :]
+    m = GPy.models.GPLVM(Y, 2, init='rand')
+
+    # optimize
+    m.ensure_default_constraints()
+    m.optimize(messages=1, max_f_eval=40)
+
+    ax = m.plot_latent()
+    y = m.likelihood.Y[0,:]
+    data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, invert=False, scale=False)
+    lvm_visualizer = GPy.util.visualize.lvm(m, data_show, ax)
+    raw_input('Press enter to finish')
+    plt.close('all')
+
+    return m
+
+def stick():
+    data = GPy.util.datasets.stick()
+    m = GPy.models.GPLVM(data['Y'], 2, init='rand')
+    
+    # optimize
+    m.ensure_default_constraints()
+    m.optimize(messages=1, max_f_eval=10000)
+
+    ax = m.plot_latent()
+    y = m.likelihood.Y[0,:]
+    data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
+    lvm_visualizer = GPy.util.visualize.lvm(m, data_show, ax)
+    raw_input('Press enter to finish')
+    plt.close('all')
+
+    return m
diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py
index 7de95d20..1a35df2f 100644
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@@ -73,7 +73,7 @@ def silhouette():
 
 def coregionalisation_toy2():
     """
-    A simple demonstration of coregionalisation on two sinusoidal functions
+    A simple demonstration of coregionalisation on two sinusoidal functions.
     """
     X1 = np.random.rand(50,1)*8
     X2 = np.random.rand(30,1)*5
@@ -106,7 +106,7 @@ def coregionalisation_toy2():
 
 def coregionalisation_toy():
     """
-    A simple demonstration of coregionalisation on two sinusoidal functions
+    A simple demonstration of coregionalisation on two sinusoidal functions.
     """
     X1 = np.random.rand(50,1)*8
     X2 = np.random.rand(30,1)*5
@@ -139,7 +139,7 @@ def coregionalisation_toy():
 
 def coregionalisation_sparse():
     """
-    A simple demonstration of coregionalisation on two sinusoidal functions
+    A simple demonstration of coregionalisation on two sinusoidal functions using sparse approximations.
     """
     X1 = np.random.rand(500,1)*8
     X2 = np.random.rand(300,1)*5
diff --git a/GPy/models/GPLVM.py b/GPy/models/GPLVM.py
index 32594594..03e9b715 100644
--- a/GPy/models/GPLVM.py
+++ b/GPy/models/GPLVM.py
@@ -81,6 +81,7 @@ class GPLVM(GP):
                 k = [p for p in self.kern.parts if p.name in ['rbf','linear']]
                 if (not len(k)==1) or (not k[0].ARD):
                     raise ValueError, "cannot Atomatically determine which dimensions to plot, please pass 'which_indices'"
+                input_1, input_2 = self.lengthscale_order()
                 k = k[0]
                 if k.name=='rbf':
                     input_1, input_2 = np.argsort(k.lengthscale)[:2]
@@ -92,7 +93,7 @@ class GPLVM(GP):
 	Xtest_full = np.zeros((Xtest.shape[0], self.X.shape[1]))
 	Xtest_full[:, :2] = Xtest
 	mu, var, low, up = self.predict(Xtest_full)
-	var = var[:, :2]
+	var = var.mean(axis=1) # this was var[:, :2] edit by Neil
         pb.imshow(var.reshape(resolution,resolution).T[::-1,:],extent=[xmin[0],xmax[0],xmin[1],xmax[1]],cmap=pb.cm.binary,interpolation='bilinear')
 
 
@@ -122,4 +123,4 @@ class GPLVM(GP):
         pb.xlim(xmin[0],xmax[0])
         pb.ylim(xmin[1],xmax[1])
 
-        return input_1, input_2
+        return pb.gca() #input_1, input_2 temporary removal, to return axes.
diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py
index c91557d0..56dbd5b9 100644
--- a/GPy/util/__init__.py
+++ b/GPy/util/__init__.py
@@ -10,4 +10,6 @@ import Tango
 import misc
 import warping_functions
 import datasets
+import mocap
+import visualize
 import decorators
diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index ed808f1b..932690ec 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -15,12 +15,12 @@ def sample_class(f):
     return c
 
 def della_gatta_TRP63_gene_expression(gene_number=None):
-    matData = scipy.io.loadmat(os.path.join(data_path, 'DellaGattadata.mat'))
-    X = np.double(matData['timepoints'])
+    mat_data = scipy.io.loadmat(os.path.join(data_path, 'DellaGattadata.mat'))
+    X = np.double(mat_data['timepoints'])
     if gene_number == None:
-        Y = matData['exprs_tp53_RMA']
+        Y = mat_data['exprs_tp53_RMA']
     else:
-        Y = matData['exprs_tp53_RMA'][:, gene_number]
+        Y = mat_data['exprs_tp53_RMA'][:, gene_number]
         if len(Y.shape) == 1:
             Y = Y[:, None]
     return {'X': X, 'Y': Y, 'info': "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA."}
@@ -60,28 +60,42 @@ def pumadyn(seed=default_seed):
     return {'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'info': "The puma robot arm data with 32 inputs. This data is the non linear case with medium noise (pumadyn-32nm). For training 7,168 examples are sampled without replacement."}
 
 
+def brendan_faces():
+    mat_data = scipy.io.loadmat(os.path.join(data_path, 'frey_rawface.mat'))
+    Y = mat_data['ff'].T
+    return {'Y': Y, 'info': "Face data made available by Brendan Frey"}
+
+
+
 
 def silhouette():
     # Ankur Agarwal and Bill Trigg's silhoutte data.
-    matData = scipy.io.loadmat(os.path.join(data_path, 'mocap', 'ankur', 'ankurDataPoseSilhouette.mat'))
-    inMean = np.mean(matData['Y'])
-    inScales = np.sqrt(np.var(matData['Y']))
-    X = matData['Y'] - inMean
+    mat_data = scipy.io.loadmat(os.path.join(data_path, 'mocap', 'ankur', 'ankurDataPoseSilhouette.mat'))
+    inMean = np.mean(mat_data['Y'])
+    inScales = np.sqrt(np.var(mat_data['Y']))
+    X = mat_data['Y'] - inMean
     X = X/inScales
-    Xtest = matData['Y_test'] - inMean
+    Xtest = mat_data['Y_test'] - inMean
     Xtest = Xtest/inScales
-    Y = matData['Z']
-    Ytest = matData['Z_test']
+    Y = mat_data['Z']
+    Ytest = mat_data['Z_test']
     return {'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'info': "Artificial silhouette simulation data developed from Agarwal and Triggs (2004)."}
 
+def stick():
+    Y, connect = GPy.util.mocap.load_text_data('run1', data_path)
+    Y = Y[0:-1:4, :]
+    lbls = 'connect'
+    return {'Y': Y, 'connect' : connect, 'info': "Stick man data from Ohio."}
+
+
 def swiss_roll_1000():
-    matData = scipy.io.loadmat(os.path.join(data_path, 'swiss_roll_data'))
-    Y = matData['X_data'][:, 0:1000].transpose()
+    mat_data = scipy.io.loadmat(os.path.join(data_path, 'swiss_roll_data'))
+    Y = mat_data['X_data'][:, 0:1000].transpose()
     return {'Y': Y, 'info': "Subsample of the swiss roll data extracting only the first 1000 values."}
 
 def swiss_roll():
-    matData = scipy.io.loadmat(os.path.join(data_path, 'swiss_roll_data.mat'))
-    Y = matData['X_data'][:, 0:3000].transpose()
+    mat_data = scipy.io.loadmat(os.path.join(data_path, 'swiss_roll_data.mat'))
+    Y = mat_data['X_data'][:, 0:3000].transpose()
     return {'Y': Y, 'info': "The first 3,000 points from the swiss roll data of Tennenbaum, de Silva and Langford (2001)."}
 
 def toy_rbf_1d(seed=default_seed):
@@ -202,3 +216,4 @@ def creep_data():
     features.extend(range(2, 31))
     X = all_data[:,features].copy()
     return {'X': X, 'y' : y}
+
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index f21502a5..45644b41 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -145,7 +145,8 @@ def PCA(Y, Q):
     """
     if not np.allclose(Y.mean(axis=0), 0.0):
         print "Y is not zero mean, centering it locally (GPy.util.linalg.PCA)"
-        Y -= Y.mean(axis=0)
+        
+        Y -= Y.mean(axis=0) 
 
     Z = linalg.svd(Y, full_matrices = False)
     [X, W] = [Z[0][:,0:Q], np.dot(np.diag(Z[1]), Z[2]).T[:,0:Q]]
diff --git a/GPy/util/mocap.py b/GPy/util/mocap.py
index 7e4e9e11..e66a36b9 100644
--- a/GPy/util/mocap.py
+++ b/GPy/util/mocap.py
@@ -8,12 +8,17 @@ def load_text_data(dataset, directory, centre=True):
     # Remove markers where there is a NaN
     present_index = [i for i in range(points[0].shape[1]) if not (np.any(np.isnan(points[0][:, i])) or np.any(np.isnan(points[0][:, i])) or np.any(np.isnan(points[0][:, i])))]
 
+    point_names = point_names[present_index]
+    for i in range(3):
+        points[i] = points[i][:, present_index]
+        if centre:
+            points[i] = (points[i].T - points[i].mean(axis=1)).T 
+
     # Concatanate the X, Y and Z markers together
-    Y = np.concatenate((points[0][:, present_index], points[1][:, present_index], points[2][:, present_index]), axis=1)
-    if centre:
-        Y = Y - Y.mean(axis=0)
+    Y = np.concatenate((points[0], points[1], points[2]), axis=1)
     Y = Y/400.
-    return Y
+    connect = read_connections(os.path.join(directory, 'connections.txt'), point_names)
+    return Y, connect
 
 def parse_text(file_name):
     """Parse data from Ohio State University text mocap files (http://accad.osu.edu/research/mocap/mocap_data.htm)."""
@@ -23,7 +28,7 @@ def parse_text(file_name):
     point_names = np.array(fid.readline().split())[2:-1:3]
     fid.close()
     for i in range(len(point_names)):
-        point_names[i] = point_names[i][0:-3]
+        point_names[i] = point_names[i][0:-2]
 
     # Read the matrix data
     S = np.loadtxt(file_name, skiprows=1)
@@ -42,37 +47,28 @@ def parse_text(file_name):
 
     return points, point_names, times
 
-#def read_connections():
+def read_connections(file_name, point_names):
+    """Read a file detailing which markers should be connected to which for motion capture data."""
 
+    connections = []
+    fid = open(file_name, 'r')
+    line=fid.readline()
+    while(line):
+        connections.append(np.array(line.split(',')))
+        connections[-1][0] = connections[-1][0].strip()
+        connections[-1][1] = connections[-1][1].strip()
+        line = fid.readline()
+    connect = np.zeros((len(point_names), len(point_names)),dtype=bool)
+    for i in range(len(point_names)):
+        for j in range(len(point_names)):
+            for k in range(len(connections)):
+                if connections[k][0] == point_names[i] and connections[k][1] == point_names[j]:
+                    
+                    connect[i,j]=True
+                    connect[j,i]=True
+                    break
+    
+    return connect
 
-#     fid = fopen(fileName);
-#     i = 1;
-#     rem = fgets(fid);	
-#     while(rem ~= -1)		
-#         [token, rem] = strtok(rem, ',');
-#         connections{i, 1} = fliplr(deblank(fliplr(deblank(token))));
-#         [token, rem] = strtok(rem, ',');
-#         connections{i, 2} = fliplr(deblank(fliplr(deblank(token))));
-#         i = i + 1;
-#         rem = fgets(fid);	
-#     end
-
-#     connect = zeros(length(pointNames));
-#     fclose(fid);
-#     for i = 1:size(connections, 1);
-#         for j = 1:length(pointNames)
-#             if strcmp(pointNames{j}, connections{i, 1}) | ...
-#                 strcmp(pointNames{j}, connections{i, 2})
-#       for k = 1:length(pointNames)
-#         if k == j
-#           break
-#         end
-#         if strcmp(pointNames{k}, connections{i, 1}) | ...
-#               strcmp(pointNames{k}, connections{i, 2})
-#           connect(j, k) = 1;
-#         end
-#       end
-#     end
-#   end
-# end
-# connect = sparse(connect);      
+    
+  
diff --git a/GPy/util/visualize.py b/GPy/util/visualize.py
index eba89b8a..8a685ebb 100644
--- a/GPy/util/visualize.py
+++ b/GPy/util/visualize.py
@@ -4,18 +4,18 @@ import GPy
 import numpy as np
 
 class lvm:
-    def __init__(self, model, data_visualize, ax):
-        self.cid = ax.figure.canvas.mpl_connect('button_press_event', self.on_click)
-        self.cid = ax.figure.canvas.mpl_connect('motion_notify_event', self.on_move)
+    def __init__(self, model, data_visualize, latent_axis):
+        self.cid = latent_axis.figure.canvas.mpl_connect('button_press_event', self.on_click)
+        self.cid = latent_axis.figure.canvas.mpl_connect('motion_notify_event', self.on_move)
         self.data_visualize = data_visualize
         self.model = model
-        self.ax = ax
+        self.latent_axis = latent_axis
         self.called = False
         self.move_on = False
 
     def on_click(self, event):
         print 'click', event.xdata, event.ydata
-        if event.inaxes!=self.ax: return
+        if event.inaxes!=self.latent_axis: return
         self.move_on = not self.move_on
         print
         if self.called:
@@ -26,10 +26,10 @@ class lvm:
         else:
             self.xs = [event.xdata]
             self.ys = [event.ydata]
-            self.line, = ax.plot(event.xdata, event.ydata)
+            self.line, = self.latent_axis.plot(event.xdata, event.ydata)
         self.called = True
     def on_move(self, event):
-        if event.inaxes!=self.ax: return
+        if event.inaxes!=self.latent_axis: return
         if self.called and self.move_on:
             # Call modify code on move
             #print 'move', event.xdata, event.ydata
@@ -68,28 +68,34 @@ class vector_show(data_show):
 
 class image_show(data_show):
     """Show a data vector as an image."""
-    def __init__(self, vals, axis=None, dimensions=(16,16), transpose=False, invert=False):
+    def __init__(self, vals, axis=None, dimensions=(16,16), transpose=False, invert=False, scale=False):
         data_show.__init__(self, vals, axis)
         self.dimensions = dimensions
-        self.fig_display = plt.figure()
-        self.set_image(vals)
-        self.handle = plt.imshow(self.vals)
         self.transpose = transpose
         self.invert = invert
+        self.scale = scale
+        self.set_image(vals/255.)
+        self.handle = self.axis.imshow(self.vals, interpolation='nearest')
+        plt.show()
 
     def modify(self, vals):
-        self.set_image(vals)
+        self.set_image(vals/255.)
+        #self.handle.remove()
+        #self.handle = self.axis.imshow(self.vals)
         self.handle.set_array(self.vals)
-        self.axis.figure.canvas.draw()
-
+        #self.axis.figure.canvas.draw()
+        plt.show()
+        
     def set_image(self, vals):
-        self.vals = np.reshape(vals, self.dimensions)
+        self.vals = np.reshape(vals, self.dimensions, order='F')
         if self.transpose:
             self.vals = self.vals.T
-        if self.invert:
-            self.vals = -self.vals
+        if not self.scale:
+            self.vals = self.vals
+        #if self.invert:
+        #    self.vals = -self.vals
 
-class stick_show(data_show):
+class stick_show(data_show): 
     """Show a three dimensional point cloud as a figure. Connect elements of the figure together using the matrix connect."""
 
     def __init__(self, vals, axis=None, connect=None):

From 9422c603b1ca1cf854da58145ffbea982c243fee Mon Sep 17 00:00:00 2001
From: Neil Lawrence <neil@stefano.(none)>
Date: Tue, 2 Apr 2013 10:49:09 +0200
Subject: [PATCH 13/21] Minor modifications to visualization routines and
 examples.

---
 GPy/examples/dimensionality_reduction.py |  8 ++++----
 GPy/util/visualize.py                    | 23 +++++++++++------------
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index 164d04ae..e4308465 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -73,12 +73,12 @@ def oil_100():
 
 def brendan_faces():
     data = GPy.util.datasets.brendan_faces()
-    Y = data['Y'][0:500, :]
-    m = GPy.models.GPLVM(Y, 2, init='rand')
+    Y = data['Y'][0:-1:10, :]
+    m = GPy.models.GPLVM(data['Y'], 2)
 
     # optimize
     m.ensure_default_constraints()
-    m.optimize(messages=1, max_f_eval=40)
+    m.optimize(messages=1, max_f_eval=10000)
 
     ax = m.plot_latent()
     y = m.likelihood.Y[0,:]
@@ -91,7 +91,7 @@ def brendan_faces():
 
 def stick():
     data = GPy.util.datasets.stick()
-    m = GPy.models.GPLVM(data['Y'], 2, init='rand')
+    m = GPy.models.GPLVM(data['Y'], 2)
     
     # optimize
     m.ensure_default_constraints()
diff --git a/GPy/util/visualize.py b/GPy/util/visualize.py
index 8a685ebb..dde9cd32 100644
--- a/GPy/util/visualize.py
+++ b/GPy/util/visualize.py
@@ -14,19 +14,18 @@ class lvm:
         self.move_on = False
 
     def on_click(self, event):
-        print 'click', event.xdata, event.ydata
+        #print 'click', event.xdata, event.ydata
         if event.inaxes!=self.latent_axis: return
         self.move_on = not self.move_on
-        print
-        if self.called:
-            self.xs.append(event.xdata)
-            self.ys.append(event.ydata)
-            self.line.set_data(self.xs, self.ys)
-            self.line.figure.canvas.draw()
-        else:
-            self.xs = [event.xdata]
-            self.ys = [event.ydata]
-            self.line, = self.latent_axis.plot(event.xdata, event.ydata)
+        # if self.called:
+        #     self.xs.append(event.xdata)
+        #     self.ys.append(event.ydata)
+        #     self.line.set_data(self.xs, self.ys)
+        #     self.line.figure.canvas.draw()
+        # else:
+        #     self.xs = [event.xdata]
+        #     self.ys = [event.ydata]
+        #     self.line, = self.latent_axis.plot(event.xdata, event.ydata)
         self.called = True
     def on_move(self, event):
         if event.inaxes!=self.latent_axis: return
@@ -75,7 +74,7 @@ class image_show(data_show):
         self.invert = invert
         self.scale = scale
         self.set_image(vals/255.)
-        self.handle = self.axis.imshow(self.vals, interpolation='nearest')
+        self.handle = self.axis.imshow(self.vals, cmap=plt.cm.gray, interpolation='nearest')
         plt.show()
 
     def modify(self, vals):

From 97db2a5bd7e5c5339319e366cc34980c99b537e6 Mon Sep 17 00:00:00 2001
From: Neil Lawrence <neil@stefano.(none)>
Date: Tue, 2 Apr 2013 16:53:42 +0200
Subject: [PATCH 14/21] Edit to linalg.py PCA function to stop it changing data
 matrix.

---
 GPy/util/linalg.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index 45644b41..59f598f9 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -146,9 +146,9 @@ def PCA(Y, Q):
     if not np.allclose(Y.mean(axis=0), 0.0):
         print "Y is not zero mean, centering it locally (GPy.util.linalg.PCA)"
         
-        Y -= Y.mean(axis=0) 
+        #Y -= Y.mean(axis=0) 
 
-    Z = linalg.svd(Y, full_matrices = False)
+    Z = linalg.svd(Y-Y.mean(axis=0), full_matrices = False)
     [X, W] = [Z[0][:,0:Q], np.dot(np.diag(Z[1]), Z[2]).T[:,0:Q]]
     v = X.std(axis=0)
     X /= v;

From 97704d992803677befeb5cbebeb6b7918bf7cb33 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Fri, 5 Apr 2013 11:09:14 +0100
Subject: [PATCH 15/21] added the rbfcos kernel

ARD seems to work

dK_dX still todo
---
 GPy/kern/__init__.py     |   2 +-
 GPy/kern/constructors.py |   8 +++
 GPy/kern/rbfcos.py       | 117 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 126 insertions(+), 1 deletion(-)
 create mode 100644 GPy/kern/rbfcos.py

diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py
index 5d8a7d15..f062ee56 100644
--- a/GPy/kern/__init__.py
+++ b/GPy/kern/__init__.py
@@ -2,5 +2,5 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 
-from constructors import rbf, Matern32, Matern52, exponential, linear, white, bias, finite_dimensional, spline, Brownian, rbf_sympy, sympykern, periodic_exponential, periodic_Matern32, periodic_Matern52, prod, prod_orthogonal, symmetric, coregionalise, rational_quadratic, fixed
+from constructors import rbf, Matern32, Matern52, exponential, linear, white, bias, finite_dimensional, spline, Brownian, rbf_sympy, sympykern, periodic_exponential, periodic_Matern32, periodic_Matern52, prod, prod_orthogonal, symmetric, coregionalise, rational_quadratic, fixed, rbfcos
 from kern import kern
diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py
index 90b13600..6a968da4 100644
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@@ -24,6 +24,7 @@ from prod_orthogonal import prod_orthogonal as prod_orthogonalpart
 from symmetric import symmetric as symmetric_part
 from coregionalise import coregionalise as coregionalise_part
 from rational_quadratic import rational_quadratic as rational_quadraticpart
+from rbfcos import rbfcos as rbfcospart
 #TODO these s=constructors are not as clean as we'd like. Tidy the code up
 #using meta-classes to make the objects construct properly wthout them.
 
@@ -310,3 +311,10 @@ def fixed(D, K, variance=1.):
     """
     part = fixedpart(D, K, variance)
     return kern(D, [part])
+
+def rbfcos(D,variance=1.,frequencies=None,bandwidths=None,ARD=False):
+    """
+    construct a rbfcos kernel
+    """
+    part = rbfcospart(D,variance,frequencies,bandwidths,ARD)
+    return kern(D,[part])
diff --git a/GPy/kern/rbfcos.py b/GPy/kern/rbfcos.py
new file mode 100644
index 00000000..0a9182cc
--- /dev/null
+++ b/GPy/kern/rbfcos.py
@@ -0,0 +1,117 @@
+
+# Copyright (c) 2012, James Hensman and Andrew Gordon Wilson
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+from kernpart import kernpart
+import numpy as np
+import hashlib
+
+class rbfcos(kernpart):
+    def __init__(self,D,variance=1.,frequencies=None,bandwidths=None,ARD=False):
+        self.D = D
+        self.name = 'rbfcos'
+        if self.D>10:
+            print "Warning: the rbfcos kernel requires a lot of memory for high dimensional inputs"
+        self.ARD = ARD
+
+        #set the default frequencies and bandwidths, appropriate Nparam
+        if ARD:
+            self.Nparam = 2*self.D + 1
+            if frequencies is not None:
+                frequencies = np.asarray(frequencies)
+                assert frequencies.size == self.D, "bad number of frequencies"
+            else:
+                frequencies = np.ones(self.D)
+            if bandwidths is not None:
+                bandwidths = np.asarray(bandwidths)
+                assert bandwidths.size == self.D, "bad number of bandwidths"
+            else:
+                bandwidths = np.ones(self.D)
+        else:
+            self.Nparam = 3
+            if frequencies is not None:
+                frequencies = np.asarray(frequencies)
+                assert frequencies.size == 1, "Only one frequency needed for non-ARD kernel"
+            else:
+                frequencies = np.ones(1)
+
+            if bandwidths is not None:
+                bandwidths = np.asarray(bandwidths)
+                assert bandwidths.size == 1, "Only one bandwidth needed for non-ARD kernel"
+            else:
+                bandwidths = np.ones(1)
+
+        #initialise cache
+        self._X, self._X2, self._params = np.empty(shape=(3,1))
+
+        self._set_params(np.hstack((variance,frequencies.flatten(),bandwidths.flatten())))
+
+
+    def _get_params(self):
+        return np.hstack((self.variance,self.frequencies, self.bandwidths))
+
+    def _set_params(self,x):
+        assert x.size==(self.Nparam)
+        if self.ARD:
+            self.variance = x[0]
+            self.frequencies = x[1:1+self.D]
+            self.bandwidths = x[1+self.D:]
+        else:
+            self.variance, self.frequencies, self.bandwidths = x
+
+    def _get_param_names(self):
+        if self.Nparam == 3:
+            return ['variance','frequency','bandwidth']
+        else:
+            return ['variance']+['frequency_%i'%i for i in range(self.D)]+['bandwidth_%i'%i for i in range(self.D)]
+
+    def K(self,X,X2,target):
+        self._K_computations(X,X2)
+        target += self.variance*self._dvar
+
+    def Kdiag(self,X,target):
+        np.add(target,self.variance,target)
+
+    def dK_dtheta(self,dL_dK,X,X2,target):
+        target[0] += np.sum(dL_dK*self._dvar)
+        if self.ARD:
+            for q in xrange(self.D):
+                target[q+1] += -2.*np.pi*self.variance*np.sum(dL_dK*self._dvar*np.tan(2.*np.pi*self.dist[:,:,q]*self.frequencies[q])*self.dist[:,:,q])
+                target[q+1+self.D] += -2.*np.pi**2*self.variance*np.sum(dL_dK*self._dvar*self.dist2[:,:,q])
+        else:
+            target[1] += -2.*np.pi*self.variance*np.sum(dL_dK*self._dvar*np.sum(np.tan(2.*np.pi*self.dist*self.frequencies)*self.dist,-1))
+            target[2] += -2.*np.pi**2*self.variance*np.sum(dL_dK*self._dvar*self.dist2.sum(-1))
+
+
+    def dKdiag_dtheta(self,dL_dKdiag,X,target):
+        target[0] += np.sum(dL_dKdiag)
+
+    def dK_dX(self,dL_dK,X,X2,target):
+        #TODO!!!
+        raise NotImplementedError
+
+    def dKdiag_dX(self,dL_dKdiag,X,target):
+        pass
+
+    def _K_computations(self,X,X2):
+        if not (np.all(X==self._X) and np.all(X2==self._X2)):
+            if X2 is None: X2 = X
+            self._X = X.copy()
+            self._X2 = X2.copy()
+
+            #do the distances: this will be high memory for large D
+            #NB: we don't take the abs of the dist because cos is symmetric
+            self.dist = X[:,None,:] - X2[None,:,:]
+            self.dist2 = np.square(self.dist)
+
+            #ensure the next section is computed:
+            self._params = np.empty(self.Nparam)
+
+        if not np.all(self._params == self._get_params()):
+            self._params == self._get_params().copy()
+
+            self._rbf_part = np.exp(-2.*np.pi**2*np.sum(self.dist2*self.bandwidths,-1))
+            self._cos_part = np.prod(np.cos(2.*np.pi*self.dist*self.frequencies),-1)
+            self._dvar = self._rbf_part*self._cos_part
+

From 8e4d839b5d6f4211b1e766edbc1d052fbdd77ac9 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Fri, 5 Apr 2013 12:08:21 +0100
Subject: [PATCH 16/21] yak shaving

---
 GPy/kern/rbfcos.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/GPy/kern/rbfcos.py b/GPy/kern/rbfcos.py
index 0a9182cc..094b806b 100644
--- a/GPy/kern/rbfcos.py
+++ b/GPy/kern/rbfcos.py
@@ -5,7 +5,6 @@
 
 from kernpart import kernpart
 import numpy as np
-import hashlib
 
 class rbfcos(kernpart):
     def __init__(self,D,variance=1.,frequencies=None,bandwidths=None,ARD=False):
@@ -32,13 +31,13 @@ class rbfcos(kernpart):
             self.Nparam = 3
             if frequencies is not None:
                 frequencies = np.asarray(frequencies)
-                assert frequencies.size == 1, "Only one frequency needed for non-ARD kernel"
+                assert frequencies.size == 1, "Exactly one frequency needed for non-ARD kernel"
             else:
                 frequencies = np.ones(1)
 
             if bandwidths is not None:
                 bandwidths = np.asarray(bandwidths)
-                assert bandwidths.size == 1, "Only one bandwidth needed for non-ARD kernel"
+                assert bandwidths.size == 1, "Exactly one bandwidth needed for non-ARD kernel"
             else:
                 bandwidths = np.ones(1)
 
@@ -74,14 +73,15 @@ class rbfcos(kernpart):
         np.add(target,self.variance,target)
 
     def dK_dtheta(self,dL_dK,X,X2,target):
+        self._K_computations(X,X2)
         target[0] += np.sum(dL_dK*self._dvar)
         if self.ARD:
             for q in xrange(self.D):
-                target[q+1] += -2.*np.pi*self.variance*np.sum(dL_dK*self._dvar*np.tan(2.*np.pi*self.dist[:,:,q]*self.frequencies[q])*self.dist[:,:,q])
-                target[q+1+self.D] += -2.*np.pi**2*self.variance*np.sum(dL_dK*self._dvar*self.dist2[:,:,q])
+                target[q+1] += -2.*np.pi*self.variance*np.sum(dL_dK*self._dvar*np.tan(2.*np.pi*self._dist[:,:,q]*self.frequencies[q])*self._dist[:,:,q])
+                target[q+1+self.D] += -2.*np.pi**2*self.variance*np.sum(dL_dK*self._dvar*self._dist2[:,:,q])
         else:
-            target[1] += -2.*np.pi*self.variance*np.sum(dL_dK*self._dvar*np.sum(np.tan(2.*np.pi*self.dist*self.frequencies)*self.dist,-1))
-            target[2] += -2.*np.pi**2*self.variance*np.sum(dL_dK*self._dvar*self.dist2.sum(-1))
+            target[1] += -2.*np.pi*self.variance*np.sum(dL_dK*self._dvar*np.sum(np.tan(2.*np.pi*self._dist*self.frequencies)*self._dist,-1))
+            target[2] += -2.*np.pi**2*self.variance*np.sum(dL_dK*self._dvar*self._dist2.sum(-1))
 
 
     def dKdiag_dtheta(self,dL_dKdiag,X,target):
@@ -102,8 +102,8 @@ class rbfcos(kernpart):
 
             #do the distances: this will be high memory for large D
             #NB: we don't take the abs of the dist because cos is symmetric
-            self.dist = X[:,None,:] - X2[None,:,:]
-            self.dist2 = np.square(self.dist)
+            self._dist = X[:,None,:] - X2[None,:,:]
+            self._dist2 = np.square(self._dist)
 
             #ensure the next section is computed:
             self._params = np.empty(self.Nparam)
@@ -111,7 +111,7 @@ class rbfcos(kernpart):
         if not np.all(self._params == self._get_params()):
             self._params == self._get_params().copy()
 
-            self._rbf_part = np.exp(-2.*np.pi**2*np.sum(self.dist2*self.bandwidths,-1))
-            self._cos_part = np.prod(np.cos(2.*np.pi*self.dist*self.frequencies),-1)
+            self._rbf_part = np.exp(-2.*np.pi**2*np.sum(self._dist2*self.bandwidths,-1))
+            self._cos_part = np.prod(np.cos(2.*np.pi*self._dist*self.frequencies),-1)
             self._dvar = self._rbf_part*self._cos_part
 

From 22ad5d94e95fe5e464e7c3ffc206f8dc650f9e62 Mon Sep 17 00:00:00 2001
From: Ricardo Andrade <acq11ra@sheffield.ac.uk>
Date: Fri, 5 Apr 2013 17:27:44 +0100
Subject: [PATCH 17/21] Changes in FITC approximation computation

---
 GPy/likelihoods/EP.py          | 12 +++++++++---
 GPy/models/generalized_FITC.py | 26 ++++++++++----------------
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/GPy/likelihoods/EP.py b/GPy/likelihoods/EP.py
index b23eda9f..118b226a 100644
--- a/GPy/likelihoods/EP.py
+++ b/GPy/likelihoods/EP.py
@@ -302,10 +302,16 @@ class EP(likelihood):
                 mu = self.w + np.dot(P,self.gamma)
                 self.iterations += 1
             #Sigma recomptutation with Cholesky decompositon
-            Diag = Diag0/(1.+ Diag0 * self.tau_tilde)
-            P = (Diag / Diag0)[:,None] * P0
+            Iplus_Dprod_i = 1./(1.+ Diag0 * self.tau_tilde)
+            Diag = Diag0 * Iplus_Dprod_i
+            P = Iplus_Dprod_i[:,None] * P0
+
+            #Diag = Diag0/(1.+ Diag0 * self.tau_tilde)
+            #P = (Diag / Diag0)[:,None] * P0
             RPT0 = np.dot(R0,P0.T)
-            L = jitchol(np.eye(M) + np.dot(RPT0,(1./Diag0 - Diag/(Diag0**2))[:,None]*RPT0.T))
+            L = jitchol(np.eye(M) + np.dot(RPT0,((1. - Iplus_Dprod_i)/Diag0)[:,None]*RPT0.T))
+            #L = jitchol(np.eye(M) + np.dot(RPT0,(1./Diag0 - Iplus_Dprod_i/Diag0)[:,None]*RPT0.T))
+            #L = jitchol(np.eye(M) + np.dot(RPT0,(1./Diag0 - Diag/(Diag0**2))[:,None]*RPT0.T))
             R,info = linalg.lapack.flapack.dtrtrs(L,R0,lower=1)
             RPT = np.dot(R,P.T)
             Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
diff --git a/GPy/models/generalized_FITC.py b/GPy/models/generalized_FITC.py
index 505f442a..8c983b05 100644
--- a/GPy/models/generalized_FITC.py
+++ b/GPy/models/generalized_FITC.py
@@ -76,11 +76,17 @@ class generalized_FITC(sparse_GP):
             # Compute generalized FITC's diagonal term of the covariance
             self.Qnn = mdot(self.psi1.T,self.Kmmi,self.psi1)
             self.Diag0 = self.psi0 - np.diag(self.Qnn)
-            self.Diag = self.Diag0/(1.+ self.Diag0 * self._precision.flatten())
+            Iplus_Dprod_i = 1./(1.+ self.Diag0 * self._precision.flatten())
+            self.Diag = self.Diag0 * Iplus_Dprod_i
+            #self.Diag = self.Diag0/(1.+ self.Diag0 * self._precision.flatten())
 
-            self.P = (self.Diag / self.Diag0)[:,None] * self.psi1.T
+
+            self.P = Iplus_Dprod_i[:,None] * self.psi1.T
+            #self.P = (self.Diag / self.Diag0)[:,None] * self.psi1.T
             self.RPT0 = np.dot(self.Lmi,self.psi1)
-            self.L = np.linalg.cholesky(np.eye(self.M) + np.dot(self.RPT0,(1./self.Diag0 - self.Diag/(self.Diag0**2))[:,None]*self.RPT0.T))
+            self.L = np.linalg.cholesky(np.eye(self.M) + np.dot(self.RPT0,((1. - Iplus_Dprod_i)/self.Diag0)[:,None]*self.RPT0.T))
+            #self.L = np.linalg.cholesky(np.eye(self.M) + np.dot(self.RPT0,(1./self.Diag0 - Iplus_Dprod_i/self.Diag0)[:,None]*self.RPT0.T))
+            #self.L = np.linalg.cholesky(np.eye(self.M) + np.dot(self.RPT0,(1./self.Diag0 - self.Diag/(self.Diag0**2))[:,None]*self.RPT0.T))
             self.R,info = linalg.flapack.dtrtrs(self.L,self.Lmi,lower=1)
             self.RPT = np.dot(self.R,self.P.T)
             self.Sigma = np.diag(self.Diag) + np.dot(self.RPT.T,self.RPT)
@@ -89,7 +95,7 @@ class generalized_FITC(sparse_GP):
             self.mu = self.w + np.dot(self.P,self.gamma)
 
             # Remove extra term from dL_dpsi1
-            self.dL_dpsi1 += -mdot(self.Kmmi,self.psi1*self.likelihood.precision.flatten().reshape(1,self.N)) #dB
+            self.dL_dpsi1 -= mdot(self.Kmmi,self.psi1*self.likelihood.precision.flatten().reshape(1,self.N)) #dB
         else:
             raise NotImplementedError, "homoscedastic fitc not implemented"
             # Remove extra term from dL_dpsi1
@@ -180,18 +186,6 @@ class generalized_FITC(sparse_GP):
                 var_ = Kxx_ + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.M),KR0T.T))
                 var = (Kxx + np.sum(KR0T.T*np.dot(Sigma_H - np.eye(self.M),KR0T.T),0))[:,None]
             return mu_star[:,None],var
-            """
-            Kx = self.kern.K(self.Z, Xnew)
-            mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V)
-            if full_cov:
-                Kxx = self.kern.K(Xnew)
-                var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) #NOTE this won't work for plotting
-            else:
-                Kxx = self.kern.Kdiag(Xnew)
-                var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0)
-            a = kjk
-            return mu,var[:,None]
-            """
         else:
             raise NotImplementedError, "homoscedastic fitc not implemented"
             """

From bb734a6dd79ee8c4150b800fe61cf7249fa60460 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Tue, 9 Apr 2013 15:43:06 +0100
Subject: [PATCH 18/21] added simple BGPLVM_oil demo

---
 GPy/examples/dimensionality_reduction.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index e4308465..fed79417 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -42,16 +42,36 @@ def BGPLVM(seed = default_seed):
 
     return m
 
-def GPLVM_oil_100():
+def GPLVM_oil_100(optimize=True,M=15):
+    data = GPy.util.datasets.oil_100()
+
+    # create simple GP model
+    kernel = GPy.kern.rbf(6, ARD = True) + GPy.kern.bias(6)
+    m = GPy.models.Bayesian_GPLVM(data['X'], 6, kernel=kernel, M=M)
+    m.data_labels = data['Y'].argmax(axis=1)
+
+    # optimize
+    m.ensure_default_constraints()
+    if optimize:
+        m.optimize('scg',messages=1)
+
+    # plot
+    print(m)
+    m.plot_latent(labels=m.data_labels)
+    return m
+
+def BGPLVM_oil_100(optimize=True):
     data = GPy.util.datasets.oil_100()
 
     # create simple GP model
     kernel = GPy.kern.rbf(6, ARD = True) + GPy.kern.bias(6)
     m = GPy.models.GPLVM(data['X'], 6, kernel = kernel)
+    m.data_labels = data['Y'].argmax(axis=1)
 
     # optimize
     m.ensure_default_constraints()
-    m.optimize(messages=1)
+    if optimize:
+        m.optimize(messages=1)
 
     # plot
     print(m)

From fd0b172e81ceb3c63dc3ca7f5469b4f2cc8bc63f Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 10 Apr 2013 09:28:58 +0100
Subject: [PATCH 19/21] various work on BGPLVM oil demo

---
 GPy/core/model.py                        | 23 +++++++++++++++++++
 GPy/examples/dimensionality_reduction.py | 28 ++++++++++++++++--------
 GPy/models/Bayesian_GPLVM.py             | 20 ++++++++++++-----
 GPy/models/GPLVM.py                      | 24 +++++++++-----------
 4 files changed, 66 insertions(+), 29 deletions(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index 9216aea6..645f7228 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -423,6 +423,28 @@ class model(parameterised):
                 grad_string = "{0:^{c0}}|{1:^{c1}}|{2:^{c2}}|{3:^{c3}}|{4:^{c4}}".format(formatted_name,r,d,g, ng, c0 = cols[0]+9, c1 = cols[1], c2 = cols[2], c3 = cols[3], c4 = cols[4])
                 print grad_string
 
+    def input_sensitivity(self):
+        """
+        return an array describing the sesitivity of the model to each input
+
+        NB. Right now, we're basing this on the lengthscales (or variances) of the kernel.
+        TODO: proper sensitivity analysis
+        """
+
+        if not hasattr(self,'kern'):
+            raise ValueError, "this model has no kernel"
+
+        k = [p for p in self.kern.parts if p.name in ['rbf','linear']]
+        if (not len(k)==1) or (not k[0].ARD):
+            raise ValueError, "cannot determine sensitivity for this kernel"
+        k = k[0]
+
+        if k.name=='rbf':
+            return k.lengthscale
+        elif k.name=='linear':
+            return 1./k.variances
+
+
     def pseudo_EM(self,epsilon=.1,**kwargs):
         """
         TODO: Should this not bein the GP class?
@@ -469,3 +491,4 @@ class model(parameterised):
             iteration += 1
             if stop:
                 print "%s iterations." %iteration
+
diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index fed79417..b3509ec5 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -47,7 +47,7 @@ def GPLVM_oil_100(optimize=True,M=15):
 
     # create simple GP model
     kernel = GPy.kern.rbf(6, ARD = True) + GPy.kern.bias(6)
-    m = GPy.models.Bayesian_GPLVM(data['X'], 6, kernel=kernel, M=M)
+    m = GPy.models.GPLVM(data['X'], 6, kernel=kernel, M=M)
     m.data_labels = data['Y'].argmax(axis=1)
 
     # optimize
@@ -60,22 +60,32 @@ def GPLVM_oil_100(optimize=True,M=15):
     m.plot_latent(labels=m.data_labels)
     return m
 
-def BGPLVM_oil_100(optimize=True):
-    data = GPy.util.datasets.oil_100()
+def BGPLVM_oil(optimize=True,N=100,Q=10,M=15):
+    data = GPy.util.datasets.oil()
 
     # create simple GP model
-    kernel = GPy.kern.rbf(6, ARD = True) + GPy.kern.bias(6)
-    m = GPy.models.GPLVM(data['X'], 6, kernel = kernel)
-    m.data_labels = data['Y'].argmax(axis=1)
+    kernel = GPy.kern.rbf(Q, ARD = True) + GPy.kern.bias(Q) + GPy.kern.white(Q,0.001)
+    m = GPy.models.Bayesian_GPLVM(data['X'][:N], Q, kernel = kernel,M=M)
+    m.data_labels = data['Y'][:N].argmax(axis=1)
+
+    #initial conditions
 
     # optimize
-    m.ensure_default_constraints()
     if optimize:
-        m.optimize(messages=1)
+        m.constrain_fixed('noise',0.05)
+        m.ensure_default_constraints()
+        m.optimize('scg',messages=1)
+        m.unconstrain('noise')
+        m.constrain_positive('noise')
+        m.optimize('scg',messages=1)
+    else:
+        m.ensure_default_constraints()
 
     # plot
     print(m)
-    m.plot_latent(labels=data['Y'].argmax(axis=1))
+    m.plot_latent(labels=m.data_labels)
+    pb.figure()
+    pb.bar(np.arange(m.kern.D),1./m.input_sensitivity())
     return m
 
 def oil_100():
diff --git a/GPy/models/Bayesian_GPLVM.py b/GPy/models/Bayesian_GPLVM.py
index 8f9759c3..0f5758ba 100644
--- a/GPy/models/Bayesian_GPLVM.py
+++ b/GPy/models/Bayesian_GPLVM.py
@@ -22,12 +22,12 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
     :type init: 'PCA'|'random'
 
     """
-    def __init__(self, Y, Q, X = None, S = None, init='PCA', M=10, Z=None, kernel=None, **kwargs):
+    def __init__(self, Y, Q, X = None, X_uncertainty = None, init='PCA', M=10, Z=None, kernel=None, **kwargs):
         if X == None:
             X = self.initialise_latent(init, Q, Y)
 
-        if S is None:
-            S = np.ones_like(X) * 1e-2#
+        if X_uncertainty is None:
+            X_uncertainty = np.ones_like(X) * 0.5
 
         if Z is None:
             Z = np.random.permutation(X.copy())[:M]
@@ -37,7 +37,7 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
             kernel = kern.rbf(Q) + kern.white(Q)
 
 
-        sparse_GP.__init__(self, X, Gaussian(Y), kernel, Z=Z, X_uncertainty=S, **kwargs)
+        sparse_GP.__init__(self, X, Gaussian(Y), kernel, Z=Z, X_uncertainty=X_uncertainty, **kwargs)
 
     def _get_param_names(self):
         X_names = sum([['X_%i_%i'%(n,q) for q in range(self.Q)] for n in range(self.N)],[])
@@ -84,6 +84,14 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
     def _log_likelihood_gradients(self):
         return np.hstack((self.dL_dmuS().flatten(), sparse_GP._log_likelihood_gradients(self)))
 
-    def plot_latent(self, *args, **kwargs):
-        input_1, input_2 = GPLVM.plot_latent(self, *args, **kwargs)
+    def plot_latent(self, which_indices=None,*args, **kwargs):
+
+        if which_indices is None:
+            try:
+                input_1, input_2 = np.argsort(self.input_sensitivity())[:2]
+            except:
+                raise ValueError, "cannot Atomatically determine which dimensions to plot, please pass 'which_indices'"
+        else:
+            input_1, input_2 = which_indices
+        GPLVM.plot_latent(self, which_indices=[input_1, input_2],*args, **kwargs)
         pb.plot(self.Z[:, input_1], self.Z[:, input_2], '^w')
diff --git a/GPy/models/GPLVM.py b/GPy/models/GPLVM.py
index 03e9b715..cc4be70e 100644
--- a/GPy/models/GPLVM.py
+++ b/GPy/models/GPLVM.py
@@ -67,7 +67,7 @@ class GPLVM(GP):
         """
 
         util.plot.Tango.reset()
-        
+
         if labels is None:
             labels = np.ones(self.N)
         if which_indices is None:
@@ -77,23 +77,19 @@ class GPLVM(GP):
             if self.Q==2:
                 input_1, input_2 = 0,1
             else:
-                #try to find a linear of RBF kern in the kernel
-                k = [p for p in self.kern.parts if p.name in ['rbf','linear']]
-                if (not len(k)==1) or (not k[0].ARD):
+                try:
+                    input_1, input_2 = np.argsort(self.input_sensitivity())[:2]
+                except:
                     raise ValueError, "cannot Atomatically determine which dimensions to plot, please pass 'which_indices'"
-                input_1, input_2 = self.lengthscale_order()
-                k = k[0]
-                if k.name=='rbf':
-                    input_1, input_2 = np.argsort(k.lengthscale)[:2]
-                elif k.name=='linear':
-                    input_1, input_2 = np.argsort(k.variances)[::-1][:2]
+        else:
+            input_1, input_2 = which_indices
 
         #first, plot the output variance as a function of the latent space
         Xtest, xx,yy,xmin,xmax = util.plot.x_frame2D(self.X[:,[input_1, input_2]],resolution=resolution)
-	Xtest_full = np.zeros((Xtest.shape[0], self.X.shape[1]))
-	Xtest_full[:, :2] = Xtest
-	mu, var, low, up = self.predict(Xtest_full)
-	var = var.mean(axis=1) # this was var[:, :2] edit by Neil
+        Xtest_full = np.zeros((Xtest.shape[0], self.X.shape[1]))
+        Xtest_full[:, :2] = Xtest
+        mu, var, low, up = self.predict(Xtest_full)
+        var = var.mean(axis=1) # this was var[:, :2] edit by Neil
         pb.imshow(var.reshape(resolution,resolution).T[::-1,:],extent=[xmin[0],xmax[0],xmin[1],xmax[1]],cmap=pb.cm.binary,interpolation='bilinear')
 
 

From 45403105f9793a88f9e9003f1f953efff9eb10d0 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 10 Apr 2013 09:34:15 +0100
Subject: [PATCH 20/21] changed X_uncertainty for X_variance (in the code) for
 consistency with actual naming (in the printing)

---
 GPy/models/Bayesian_GPLVM.py   | 22 +++++++++++-----------
 GPy/models/GP.py               |  2 +-
 GPy/models/generalized_FITC.py |  8 ++++----
 GPy/models/sparse_GP.py        | 30 +++++++++++++++---------------
 4 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/GPy/models/Bayesian_GPLVM.py b/GPy/models/Bayesian_GPLVM.py
index 0f5758ba..f66fabde 100644
--- a/GPy/models/Bayesian_GPLVM.py
+++ b/GPy/models/Bayesian_GPLVM.py
@@ -22,12 +22,12 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
     :type init: 'PCA'|'random'
 
     """
-    def __init__(self, Y, Q, X = None, X_uncertainty = None, init='PCA', M=10, Z=None, kernel=None, **kwargs):
+    def __init__(self, Y, Q, X = None, X_variance = None, init='PCA', M=10, Z=None, kernel=None, **kwargs):
         if X == None:
             X = self.initialise_latent(init, Q, Y)
 
-        if X_uncertainty is None:
-            X_uncertainty = np.ones_like(X) * 0.5
+        if X_variance is None:
+            X_variance = np.ones_like(X) * 0.5
 
         if Z is None:
             Z = np.random.permutation(X.copy())[:M]
@@ -37,7 +37,7 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
             kernel = kern.rbf(Q) + kern.white(Q)
 
 
-        sparse_GP.__init__(self, X, Gaussian(Y), kernel, Z=Z, X_uncertainty=X_uncertainty, **kwargs)
+        sparse_GP.__init__(self, X, Gaussian(Y), kernel, Z=Z, X_variance=X_variance, **kwargs)
 
     def _get_param_names(self):
         X_names = sum([['X_%i_%i'%(n,q) for q in range(self.Q)] for n in range(self.N)],[])
@@ -54,28 +54,28 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
         ===============================================================
 
         """
-        return np.hstack((self.X.flatten(), self.X_uncertainty.flatten(), sparse_GP._get_params(self)))
+        return np.hstack((self.X.flatten(), self.X_variance.flatten(), sparse_GP._get_params(self)))
 
     def _set_params(self,x):
         N, Q = self.N, self.Q
         self.X = x[:self.X.size].reshape(N,Q).copy()
-        self.X_uncertainty = x[(N*Q):(2*N*Q)].reshape(N,Q).copy()
+        self.X_variance = x[(N*Q):(2*N*Q)].reshape(N,Q).copy()
         sparse_GP._set_params(self, x[(2*N*Q):])
 
     def dL_dmuS(self):
-        dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi1_dmuS(self.dL_dpsi1,self.Z,self.X,self.X_uncertainty)
-        dL_dmu_psi1, dL_dS_psi1 = self.kern.dpsi0_dmuS(self.dL_dpsi0,self.Z,self.X,self.X_uncertainty)
-        dL_dmu_psi2, dL_dS_psi2 = self.kern.dpsi2_dmuS(self.dL_dpsi2,self.Z,self.X,self.X_uncertainty)
+        dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi1_dmuS(self.dL_dpsi1,self.Z,self.X,self.X_variance)
+        dL_dmu_psi1, dL_dS_psi1 = self.kern.dpsi0_dmuS(self.dL_dpsi0,self.Z,self.X,self.X_variance)
+        dL_dmu_psi2, dL_dS_psi2 = self.kern.dpsi2_dmuS(self.dL_dpsi2,self.Z,self.X,self.X_variance)
         dL_dmu = dL_dmu_psi0 + dL_dmu_psi1 + dL_dmu_psi2
         dL_dS = dL_dS_psi0 + dL_dS_psi1 + dL_dS_psi2
 
-        dKL_dS = (1. - (1./self.X_uncertainty))*0.5
+        dKL_dS = (1. - (1./self.X_variance))*0.5
         dKL_dmu = self.X
         return np.hstack(((dL_dmu - dKL_dmu).flatten(), (dL_dS - dKL_dS).flatten()))
 
     def KL_divergence(self):
         var_mean = np.square(self.X).sum()
-        var_S = np.sum(self.X_uncertainty - np.log(self.X_uncertainty))
+        var_S = np.sum(self.X_variance - np.log(self.X_variance))
         return 0.5*(var_mean + var_S) - 0.5*self.Q*self.N
 
     def log_likelihood(self):
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index 53ba1183..cfda0cfe 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -269,7 +269,7 @@ class GP(model):
                 Zu = self.Z*self._Xstd + self._Xmean
                 pb.plot(Zu,Zu*0+pb.ylim()[0],'r|',mew=1.5,markersize=12)
                 if self.has_uncertain_inputs:
-                    pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten()))
+                    pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_variance.flatten()))
 
         elif self.X.shape[1]==2: #FIXME
             resolution = resolution or 50
diff --git a/GPy/models/generalized_FITC.py b/GPy/models/generalized_FITC.py
index 8c983b05..26875f64 100644
--- a/GPy/models/generalized_FITC.py
+++ b/GPy/models/generalized_FITC.py
@@ -19,8 +19,8 @@ class generalized_FITC(sparse_GP):
     :type likelihood: GPy.likelihood.(Gaussian | EP)
     :param kernel : the kernel/covariance function. See link kernels
     :type kernel: a GPy kernel
-    :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
-    :type X_uncertainty: np.ndarray (N x Q) | None
+    :param X_variance: The variance in the measurements of X (Gaussian variance)
+    :type X_variance: np.ndarray (N x Q) | None
     :param Z: inducing inputs (optional, see note)
     :type Z: np.ndarray (M x Q) | None
     :param Zslices: slices for the inducing inputs (see slicing TODO: link)
@@ -30,13 +30,13 @@ class generalized_FITC(sparse_GP):
     :type normalize_(X|Y): bool
     """
 
-    def __init__(self, X, likelihood, kernel, Z, X_uncertainty=None, Xslices=None,Zslices=None, normalize_X=False):
+    def __init__(self, X, likelihood, kernel, Z, X_variance=None, Xslices=None,Zslices=None, normalize_X=False):
 
         self.Z = Z
         self.M = self.Z.shape[0]
         self._precision = likelihood.precision
 
-        sparse_GP.__init__(self, X, likelihood, kernel=kernel, Z=self.Z, X_uncertainty=None, Xslices=None,Zslices=None, normalize_X=False)
+        sparse_GP.__init__(self, X, likelihood, kernel=kernel, Z=self.Z, X_variance=None, Xslices=None,Zslices=None, normalize_X=False)
 
     def _set_params(self, p):
         self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py
index 3d44ad6b..e19945e2 100644
--- a/GPy/models/sparse_GP.py
+++ b/GPy/models/sparse_GP.py
@@ -22,8 +22,8 @@ class sparse_GP(GP):
     :type likelihood: GPy.likelihood.(Gaussian | EP)
     :param kernel : the kernel/covariance function. See link kernels
     :type kernel: a GPy kernel
-    :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
-    :type X_uncertainty: np.ndarray (N x Q) | None
+    :param X_variance: The uncertainty in the measurements of X (Gaussian variance)
+    :type X_variance: np.ndarray (N x Q) | None
     :param Z: inducing inputs (optional, see note)
     :type Z: np.ndarray (M x Q) | None
     :param Zslices: slices for the inducing inputs (see slicing TODO: link)
@@ -33,7 +33,7 @@ class sparse_GP(GP):
     :type normalize_(X|Y): bool
     """
 
-    def __init__(self, X, likelihood, kernel, Z, X_uncertainty=None, Xslices=None,Zslices=None, normalize_X=False):
+    def __init__(self, X, likelihood, kernel, Z, X_variance=None, Xslices=None,Zslices=None, normalize_X=False):
         self.scale_factor = 100.0# a scaling factor to help keep the algorithm stable
 
         self.Z = Z
@@ -42,12 +42,12 @@ class sparse_GP(GP):
         self.M = Z.shape[0]
         self.likelihood = likelihood
 
-        if X_uncertainty is None:
+        if X_variance is None:
             self.has_uncertain_inputs=False
         else:
-            assert X_uncertainty.shape==X.shape
+            assert X_variance.shape==X.shape
             self.has_uncertain_inputs=True
-            self.X_uncertainty = X_uncertainty
+            self.X_variance = X_variance
 
         if not self.likelihood.is_heteroscedastic:
             self.likelihood.trYYT = np.trace(np.dot(self.likelihood.Y, self.likelihood.Y.T)) # TODO: something more elegant here?
@@ -56,16 +56,16 @@ class sparse_GP(GP):
 
         #normalize X uncertainty also
         if self.has_uncertain_inputs:
-            self.X_uncertainty /= np.square(self._Xstd)
+            self.X_variance /= np.square(self._Xstd)
 
 
     def _compute_kernel_matrices(self):
         # kernel computations, using BGPLVM notation
         self.Kmm = self.kern.K(self.Z)
         if self.has_uncertain_inputs:
-            self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty)
-            self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
-            self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
+            self.psi0 = self.kern.psi0(self.Z,self.X, self.X_variance)
+            self.psi1 = self.kern.psi1(self.Z,self.X, self.X_variance).T
+            self.psi2 = self.kern.psi2(self.Z,self.X, self.X_variance)
         else:
             self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices)
             self.psi1 = self.kern.K(self.Z,self.X)
@@ -210,9 +210,9 @@ class sparse_GP(GP):
         """
         dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z)
         if self.has_uncertain_inputs:
-            dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty)
-            dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
-            dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z,self.X, self.X_uncertainty)
+            dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_variance)
+            dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_variance)
+            dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z,self.X, self.X_variance)
         else:
             dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1,self.Z,self.X)
             dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X)
@@ -225,8 +225,8 @@ class sparse_GP(GP):
         """
         dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ
         if self.has_uncertain_inputs:
-            dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1,self.Z,self.X, self.X_uncertainty)
-            dL_dZ += 2.*self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # 'stripes'
+            dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1,self.Z,self.X, self.X_variance)
+            dL_dZ += 2.*self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_variance) # 'stripes'
         else:
             dL_dZ += self.kern.dK_dX(self.dL_dpsi1,self.Z,self.X)
         return dL_dZ

From eb1d8f211fdebe5ef44d9720dd023aaecd72c645 Mon Sep 17 00:00:00 2001
From: James Hensman <james.hensman@gmail.com>
Date: Wed, 10 Apr 2013 11:04:49 +0100
Subject: [PATCH 21/21] stability improvements in sparse_GP

---
 GPy/examples/dimensionality_reduction.py |  2 --
 GPy/inference/SCG.py                     |  7 +++++--
 GPy/models/sparse_GP.py                  | 16 +++++++++++-----
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index b3509ec5..61a4abd8 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -68,8 +68,6 @@ def BGPLVM_oil(optimize=True,N=100,Q=10,M=15):
     m = GPy.models.Bayesian_GPLVM(data['X'][:N], Q, kernel = kernel,M=M)
     m.data_labels = data['Y'][:N].argmax(axis=1)
 
-    #initial conditions
-
     # optimize
     if optimize:
         m.constrain_fixed('noise',0.05)
diff --git a/GPy/inference/SCG.py b/GPy/inference/SCG.py
index f5e7ab22..0e85f243 100644
--- a/GPy/inference/SCG.py
+++ b/GPy/inference/SCG.py
@@ -23,6 +23,7 @@
 
 
 import numpy as np
+import sys
 
 def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xtol=1e-6, ftol=1e-6):
     """
@@ -103,11 +104,13 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
 
         iteration += 1
         if display:
-            print 'Iteration:', iteration, ' Objective:', fnow, '  Scale:', beta
+            print 'Iteration:', iteration, ' Objective:', fnow, '  Scale:', beta, '\r',
+            sys.stdout.flush()
 
         if success:
             # Test for termination
-            if np.max(np.abs(alpha*d)) < xtol or np.max(np.abs(fnew-fold)) < ftol:
+            if (np.max(np.abs(alpha*d)) < xtol) or (np.abs(fnew-fold) < ftol):
+                status='converged'
                 return x, flog, function_eval, status
 
             else:
diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py
index e19945e2..88abf77d 100644
--- a/GPy/models/sparse_GP.py
+++ b/GPy/models/sparse_GP.py
@@ -7,6 +7,7 @@ from ..util.linalg import mdot, jitchol, chol_inv, pdinv, trace_dot
 from ..util.plot import gpplot
 from .. import kern
 from GP import GP
+from scipy import linalg
 
 #Still TODO:
 # make use of slices properly (kernel can now do this)
@@ -96,21 +97,26 @@ class sparse_GP(GP):
         self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
 
         self.V = (self.likelihood.precision/self.scale_factor)*self.likelihood.Y
-        self.A = mdot(self.Lmi, self.psi2_beta_scaled, self.Lmi.T)
+
+        #Compute A = L^-1 psi2 beta L^-T
+        tmp = linalg.lapack.flapack.dtrtrs(self.Lm,self.psi2_beta_scaled.T,lower=1)[0]
+        self.A = linalg.lapack.flapack.dtrtrs(self.Lm,np.asarray(tmp.T,order='F'),lower=1)[0]
+
         self.B = np.eye(self.M)/sf2 + self.A
 
         self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)
 
         self.psi1V = np.dot(self.psi1, self.V)
-        tmp = np.dot(self.Lmi.T, self.LBi.T)
-        self.C = np.dot(tmp,tmp.T)
+        #tmp = np.dot(self.Lmi.T, self.LBi.T)
+        tmp = linalg.lapack.clapack.dtrtrs(self.Lm.T,np.asarray(self.LBi.T,order='C'),lower=0)[0]
+        self.C = np.dot(tmp,tmp.T) #TODO: tmp is triangular. replace with dtrmm (blas) when available
         self.Cpsi1V = np.dot(self.C,self.psi1V)
         self.Cpsi1VVpsi1 = np.dot(self.Cpsi1V,self.psi1V.T)
-        self.E = np.dot(self.Cpsi1VVpsi1,self.C)/sf2
+        #self.E = np.dot(self.Cpsi1VVpsi1,self.C)/sf2
+        self.E = np.dot(self.Cpsi1V/sf,self.Cpsi1V.T/sf)
 
         # Compute dL_dpsi # FIXME: this is untested for the heterscedastic + uncertin inputs case
         self.dL_dpsi0 = - 0.5 * self.D * (self.likelihood.precision * np.ones([self.N,1])).flatten()
-        #self.dL_dpsi1 = mdot(self.V, self.psi1V.T,self.C).T
         self.dL_dpsi1 = np.dot(self.Cpsi1V,self.V.T)
         if self.likelihood.is_heteroscedastic:
             if self.has_uncertain_inputs: