From 0c0c7b000c3eae21a48d91fd70ab8a1f697fc22e Mon Sep 17 00:00:00 2001
From: Ricardo Andrade
Date: Tue, 19 Mar 2013 19:08:25 +0000
Subject: [PATCH 1/6] Generalized FITC is back

---
 GPy/models/__init__.py         |   1 +
 GPy/models/generalized_FITC.py | 203 +++++++++++++++++++++++++++++++++
 2 files changed, 204 insertions(+)
 create mode 100644 GPy/models/generalized_FITC.py

diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py
index 22aa803c..f442dc67 100644
--- a/GPy/models/__init__.py
+++ b/GPy/models/__init__.py
@@ -11,3 +11,4 @@ from warped_GP import warpedGP
 from sparse_GPLVM import sparse_GPLVM
 from uncollapsed_sparse_GP import uncollapsed_sparse_GP
 from Bayesian_GPLVM import Bayesian_GPLVM
+from generalized_FITC import generalized_FITC
diff --git a/GPy/models/generalized_FITC.py b/GPy/models/generalized_FITC.py
new file mode 100644
index 00000000..739059a0
--- /dev/null
+++ b/GPy/models/generalized_FITC.py
@@ -0,0 +1,203 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+import pylab as pb
+from ..util.linalg import mdot, jitchol, chol_inv, pdinv, trace_dot
+from ..util.plot import gpplot
+from .. import kern
+from scipy import stats, linalg
+from sparse_GP import sparse_GP
+
+class generalized_FITC(sparse_GP):
+    """
+    Implementation of EP with FITC, following Naish-Guzman, A. and Holden, S. (2008).
+
+    :param X: inputs
+    :type X: np.ndarray (N x Q)
+    :param likelihood: a likelihood instance, containing the observed data
+    :type likelihood: GPy.likelihood.(Gaussian | EP)
+    :param kernel: the kernel/covariance function, see the kern module
+    :type kernel: a GPy kernel
+    :param X_uncertainty: the uncertainty in the measurements of X (Gaussian variance)
+    :type X_uncertainty: np.ndarray (N x Q) | None
+    :param Z: inducing inputs (optional, see note)
+    :type Z: np.ndarray (M x Q) | None
+    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
+    :param M: number of inducing points (optional, default 10. Ignored if Z is not None)
+    :type M: int
+    :param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales)
+    :type normalize_(X|Y): bool
+    """
+
+    def __init__(self, X, likelihood, kernel, Z, X_uncertainty=None, Xslices=None,Zslices=None, normalize_X=False):
+
+        if type(Z) == int:
+            #FIXME
+            self.M = Z
+            self.Z = (np.random.random_sample(self.D*self.M)*(self.X.max()-self.X.min())+self.X.min()).reshape(self.M,-1)
+        elif type(Z) == np.ndarray:
+            self.Z = Z
+            self.M = self.Z.shape[0]
+
+        self._precision = likelihood.precision
+
+        sparse_GP.__init__(self, X, likelihood, kernel=kernel, Z=self.Z, X_uncertainty=None, Xslices=None,Zslices=None, normalize_X=False)
+
+    def _set_params(self, p):
+        self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
+        self.kern._set_params(p[self.Z.size:self.Z.size+self.kern.Nparam])
+        self.likelihood._set_params(p[self.Z.size+self.kern.Nparam:])
+        self._compute_kernel_matrices()
+        self._computations()
+        self._FITC_computations()
+
+    def update_likelihood_approximation(self):
+        """
+        Approximates a non-Gaussian likelihood using Expectation Propagation.
+
+        For a Gaussian (or direct: TODO) likelihood, no iteration is required:
+        this function does nothing.
+        """
+        if self.has_uncertain_inputs:
+            raise NotImplementedError, "FITC approximation not implemented for uncertain inputs"
+        else:
+            self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0)
+            self._precision = self.likelihood.precision # Save the true precision
+            self.likelihood.precision = self.likelihood.precision/(1. + self.likelihood.precision*self.Diag0[:,None]) # Add the diagonal element of the FITC approximation
+            self._set_params(self._get_params()) # update the GP
+
+
+    def _FITC_computations(self):
+        """
+        FITC approximation doesn't have the correction term in the log-likelihood bound,
+        but adds a diagonal term to the covariance matrix.
+        This function:
+         - computes the diagonal term
+         - eliminates the extra terms computed in the sparse_GP approximation
+         - computes the likelihood gradients wrt the true precision.
+        """
+        if self.likelihood.is_heteroscedastic:
+            # Compute generalized FITC's diagonal term of the covariance
+            self.Qnn = mdot(self.psi1.T,self.Kmmi,self.psi1)
+            self.Diag0 = self.psi0 - np.diag(self.Qnn)
+
+            self.Diag = self.Diag0/(1.+ self.Diag0 * self._precision.flatten())
+            self.P = (self.Diag / self.Diag0)[:,None] * self.psi1.T
+            self.RPT0 = np.dot(self.Lmi,self.psi1)
+            self.L = np.linalg.cholesky(np.eye(self.M) + np.dot(self.RPT0,(1./self.Diag0 - self.Diag/(self.Diag0**2))[:,None]*self.RPT0.T))
+            self.R,info = linalg.flapack.dtrtrs(self.L,self.Lmi,lower=1)
+            self.RPT = np.dot(self.R,self.P.T)
+            self.Sigma = np.diag(self.Diag) + np.dot(self.RPT.T,self.RPT)
+            self.w = self.Diag * self.likelihood.v_tilde
+            self.gamma = np.dot(self.R.T, np.dot(self.RPT,self.likelihood.v_tilde))
+            self.mu = self.w + np.dot(self.P,self.gamma)
+            self.mu_tilde = (self.likelihood.v_tilde/self.likelihood.tau_tilde)[:,None]
+
+            # Remove extra term from dL_dpsi1
+            self.dL_dpsi1 += -mdot(self.Kmmi,self.psi1*self.likelihood.precision.flatten().reshape(1,self.N)) #dB
+        else:
+            # Remove extra term from dL_dpsi1
+            self.dL_dpsi1 += -mdot(self.Kmmi,self.psi1*self.likelihood.precision) #dB
+
+
+        sf = self.scale_factor
+        sf2 = sf**2
+
+        # Remove extra term from dL_dKmm
+        self.dL_dKmm += 0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi)*sf2 # dB
+        self.dL_dpsi0 = None
+
+        #the partial derivative vector for the likelihood
+        if self.likelihood.Nparams == 0:
+            #save computation here.
+            self.partial_for_likelihood = None
+        elif self.likelihood.is_heteroscedastic:
+            raise NotImplementedError, "heteroscedatic derivates not implemented"
+        else:
+            #likelihood is not heteroscedastic
+            self.partial_for_likelihood = - 0.5 * self.N*self.D*self._precision + 0.5 * np.sum(np.square(self.likelihood.Y))*self._precision**2
+            self.partial_for_likelihood += 0.5 * self.D * trace_dot(self.Bi,self.A)*self._precision
+            self.partial_for_likelihood += self._precision*(0.5*trace_dot(self.psi2_beta_scaled,self.E*sf2) - np.trace(self.Cpsi1VVpsi1))
+
+        #TODO partial derivative vector for the likelihood not implemented
+        #NOTE the true precision is now '_precision' not 'precision'
+
+    def dL_dtheta(self):
+        """
+        Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel
+        """
+        dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z)
+        if self.has_uncertain_inputs:
+            raise NotImplementedError, "derivatives for uncertain inputs not implemented"
+        else:
+            #NOTE in sparse_GP this would include the gradient wrt psi0
+            dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1,self.Z,self.X)
+        return dL_dtheta
+
+
+    def log_likelihood(self):
+        """ Compute the (lower bound on the) log marginal likelihood """
+        sf2 = self.scale_factor**2
+        if self.likelihood.is_heteroscedastic:
+            A = -0.5*self.N*self.D*np.log(2.*np.pi) +0.5*np.sum(np.log(self.likelihood.precision)) -0.5*np.sum(self.V*self.likelihood.Y)
+        else:
+            A = -0.5*self.N*self.D*(np.log(2.*np.pi) + np.log(self.likelihood._variance)) -0.5*self.likelihood.precision*self.likelihood.trYYT
+        C = -0.5*self.D * (self.B_logdet + self.M*np.log(sf2))
+        D = 0.5*np.trace(self.Cpsi1VVpsi1)
+        return A+C+D
+
+    def _raw_predict(self, Xnew, slices, full_cov=True):
+        if self.likelihood.is_heteroscedastic:
+            """
+            Make a prediction for the vsGP model
+
+            Arguments
+            ---------
+            X : Input prediction data - Nx1 numpy array (floats)
+            """
+            Kx = self.kern.K(self.Z, Xnew)
+            if full_cov:
+                Kxx = self.kern.K(Xnew)
+            else:
+                Kxx = self.kern.K(Xnew)#FIXME
+                #raise NotImplementedError
+                #Kxx = self.kern.Kdiag(Xnew)
+
+            # q(u|f) = N(u| R0i*mu_u*f, R0i*C*R0i.T)
+
+            # Ci = I + (RPT0)Di(RPT0).T
+            # C = I - [RPT0] * (D+[RPT0].T*[RPT0])^-1*[RPT0].T
+            #   = I - [RPT0] * (D + self.Qnn)^-1 * [RPT0].T
+            #   = I - [RPT0] * (U*U.T)^-1 * [RPT0].T
+            #   = I - V.T * V
+            U = np.linalg.cholesky(np.diag(self.Diag0) + self.Qnn)
+            V,info = linalg.flapack.dtrtrs(U,self.RPT0.T,lower=1)
+            C = np.eye(self.M) - np.dot(V.T,V)
+            mu_u = np.dot(C,self.RPT0)*(1./self.Diag0[None,:])
+            #self.C = C
+            #self.RPT0 = np.dot(self.R0,self.Knm.T) P0.T
+            #self.mu_u = mu_u
+            #self.U = U
+            # q(u|y) = N(u| R0i*mu_H,R0i*Sigma_H*R0i.T)
+            mu_H = np.dot(mu_u,self.mu)
+            self.mu_H = mu_H
+            Sigma_H = C + np.dot(mu_u,np.dot(self.Sigma,mu_u.T))
+            # q(f_star|y) = N(f_star|mu_star,sigma2_star)
+            KR0T = np.dot(Kx.T,self.Lmi.T)
+            mu_star = np.dot(KR0T,mu_H)
+            sigma2_star = Kxx + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.M),KR0T.T))
+            vdiag = np.diag(sigma2_star)
+            return mu_star[:,None],vdiag[:,None]
+        else:
+            """Internal helper function for making predictions, does not account for normalization"""
+            Kx = self.kern.K(self.Z, Xnew)
+            mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V)
+            if full_cov:
+                Kxx = self.kern.K(Xnew)
+                var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) #NOTE this won't work for plotting
+            else:
+                Kxx = self.kern.Kdiag(Xnew)
+                var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0)
+
+            return mu,var[:,None]
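
Note on PATCH 1/6: the central quantity of _FITC_computations above is the diagonal correction Diag0 = diag(Knn - Qnn), with Qnn = Knm Kmm^-1 Kmn, which update_likelihood_approximation folds into the effective noise as precision/(1 + precision*Diag0). A minimal self-contained NumPy sketch of that computation (toy data and a hypothetical stand-in rbf helper, not the GPy API):

    import numpy as np

    def rbf(A, B, variance=1.0, lengthscale=1.0):
        # Hypothetical squared-exponential helper, standing in for GPy's kern.
        r2 = np.sum(A**2, 1)[:, None] + np.sum(B**2, 1)[None, :] - 2.*np.dot(A, B.T)
        return variance*np.exp(-0.5*r2/lengthscale**2)

    N, M = 50, 5
    X = np.random.uniform(-3., 3., (N, 1))    # inputs
    Z = np.random.uniform(-3., 3., (M, 1))    # inducing inputs

    Kmm = rbf(Z, Z) + 1e-8*np.eye(M)          # jittered, as jitchol would do
    psi1 = rbf(Z, X)                          # Kmn, 'psi1' in the code above
    psi0 = np.diag(rbf(X, X))                 # diag(Knn)

    Lm = np.linalg.cholesky(Kmm)
    RPT0 = np.linalg.solve(Lm, psi1)          # Lm^{-1} Kmn, 'RPT0' above
    Diag0 = psi0 - np.sum(RPT0*RPT0, 0)       # diag(Knn - Qnn), non-negative

    beta = 4.0                                # an assumed true noise precision
    beta_fitc = beta/(1. + beta*Diag0)        # effective per-point precision

This matches the model up to the psi-statistics naming: the class computes Qnn as mdot(psi1.T, Kmmi, psi1), whose diagonal equals the column sums of squares of RPT0 used here.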
From a375cb9a1b72b9ce8b7a58a52ffc51373db6eb52 Mon Sep 17 00:00:00 2001
From: Ricardo Andrade
Date: Fri, 22 Mar 2013 16:04:51 +0000
Subject: [PATCH 2/6] generalized fitc + examples

---
 GPy/examples/fitc_class_01.py  | 53 +++++++++++++++++++++
 GPy/examples/fitc_class_02.py  | 66 ++++++++++++++++++++++++++
 GPy/examples/fitc_regr_01.py   | 53 +++++++++++++++++++++
 GPy/models/generalized_FITC.py | 84 ++++++++++++++++++----------------
 4 files changed, 216 insertions(+), 40 deletions(-)
 create mode 100644 GPy/examples/fitc_class_01.py
 create mode 100644 GPy/examples/fitc_class_02.py
 create mode 100644 GPy/examples/fitc_regr_01.py

diff --git a/GPy/examples/fitc_class_01.py b/GPy/examples/fitc_class_01.py
new file mode 100644
index 00000000..413d0c14
--- /dev/null
+++ b/GPy/examples/fitc_class_01.py
@@ -0,0 +1,53 @@
+import pylab as pb
+import numpy as np
+import GPy
+pb.ion()
+pb.close('all')
+
+"""
+Simple 1D classification example
+:param seed : seed value for data generation (default is 4).
+:type seed: int
+"""
+seed=10000
+
+data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
+X = data['X']
+Y = data['Y'][:, 0:1]
+Y[Y == -1] = 0
+
+
+
+#X = np.vstack((np.random.uniform(0,10,(10,1)),np.random.uniform(7,17,(10,1)),np.random.uniform(15,25,(10,1))))
+#Y = np.vstack((np.zeros((10,1)),np.ones((10,1)),np.zeros((10,1))))
+
+# Kernel object
+kernel = GPy.kern.rbf(1) + GPy.kern.white(1)
+
+# Likelihood object
+distribution = GPy.likelihoods.likelihood_functions.probit()
+likelihood = GPy.likelihoods.EP(Y,distribution)
+
+Z = np.random.uniform(X.min(),X.max(),(10,1))
+#Z = np.array([0,20])[:,None]
+print Z
+
+# Model definition
+m = GPy.models.generalized_FITC(X,likelihood=likelihood,kernel=kernel,Z=Z,normalize_X=False)
+m.set('len',2.)
+
+m.ensure_default_constraints()
+# Optimize
+#m.constrain_fixed('iip')
+m.update_likelihood_approximation()
+print m.checkgrad(verbose=1)
+# Parameters optimization:
+m.optimize()
+#m.EPEM() #FIXME
+
+# Plot
+pb.subplot(211)
+m.plot_f()
+pb.subplot(212)
+m.plot()
+print(m)
diff --git a/GPy/examples/fitc_class_02.py b/GPy/examples/fitc_class_02.py
new file mode 100644
index 00000000..cd8d6700
--- /dev/null
+++ b/GPy/examples/fitc_class_02.py
@@ -0,0 +1,66 @@
+import pylab as pb
+import numpy as np
+import GPy
+pb.close('all')
+
+seed=10000
+"""Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
+
+:param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
+:param seed : seed value for data generation.
+:type seed: int
+:param inducing : number of inducing variables (only used for 'FITC' or 'DTC').
+:type inducing: int
+"""
+
+data = GPy.util.datasets.crescent_data(seed=seed)
+
+# Kernel object
+kernel = GPy.kern.rbf(data['X'].shape[1]) + GPy.kern.white(data['X'].shape[1])
+
+# Likelihood object
+distribution = GPy.likelihoods.likelihood_functions.probit()
+likelihood = GPy.likelihoods.EP(data['Y'],distribution)
+
+sample = np.random.randint(0,data['X'].shape[0],10)
+Z = data['X'][sample,:]
+# create sparse GP EP model
+#m = GPy.models.sparse_GP(data['X'],likelihood=likelihood,kernel=kernel,Z=Z)
+m = GPy.models.generalized_FITC(data['X'],likelihood=likelihood,kernel=kernel,Z=Z)
+m.ensure_default_constraints()
+m.set('len',10.)
+
+m.update_likelihood_approximation()
+
+# optimize
+m.optimize()
+print(m)
+
+# plot
+m.plot()
+fitc = m
+
+pb.figure()
+# Kernel object
+kernel = GPy.kern.rbf(data['X'].shape[1]) + GPy.kern.white(data['X'].shape[1])
+
+# Likelihood object
+distribution = GPy.likelihoods.likelihood_functions.probit()
+likelihood = GPy.likelihoods.EP(data['Y'],distribution)
+
+sample = np.random.randint(0,data['X'].shape[0],10)
+Z = data['X'][sample,:]
+# create sparse GP EP model
+m = GPy.models.sparse_GP(data['X'],likelihood=likelihood,kernel=kernel,Z=Z)
+m.ensure_default_constraints()
+m.set('len',10.)
+
+m.update_likelihood_approximation()
+
+# optimize
+m.optimize()
+print(m)
+
+# plot
+m.plot()
+variational = m
diff --git a/GPy/examples/fitc_regr_01.py b/GPy/examples/fitc_regr_01.py
new file mode 100644
index 00000000..815e31a7
--- /dev/null
+++ b/GPy/examples/fitc_regr_01.py
@@ -0,0 +1,53 @@
+import pylab as pb
+import numpy as np
+import GPy
+pb.ion()
+pb.close('all')
+
+N = 400
+M = 10
+# sample inputs and outputs
+X = np.random.uniform(-3.,3.,(N,1))
+Y = np.sin(X)+np.random.randn(N,1)*0.05
+
+"""Run a 1D example of a sparse GP regression."""
+"""
+rbf = GPy.kern.rbf(1)
+noise = GPy.kern.white(1)
+kernel = rbf + noise
+Z = np.random.uniform(-3,3,(M,1))
+likelihood = GPy.likelihoods.Gaussian(Y)
+m = GPy.models.sparse_GP(X, likelihood, kernel, Z)
+m.scale_factor=10000
+m.constrain_positive('(variance|lengthscale|precision)')
+m.checkgrad(verbose=1)
+m.optimize('tnc', messages = 1)
+pb.figure()
+m.plot()
+
+variational = m
+"""
+
+# construct kernel
+rbf = GPy.kern.rbf(1)
+noise = GPy.kern.white(1)
+kernel = rbf + noise
+#Z = np.random.uniform(-3,3,(M,1))
+Z = variational.Z
+likelihood = GPy.likelihoods.Gaussian(Y)
+# create simple GP model
+m = GPy.models.generalized_FITC(X, likelihood, kernel, Z=Z)
+m.constrain_positive('(variance|lengthscale|precision)')
+#m.constrain_fixed('iip')
+m.checkgrad(verbose=1)
+m.optimize('tnc', messages = 1)
+#pb.figure()
+#m.plot()
+"""
+Xnew = X.copy().flatten()
+Xnew.sort()
+Xnew = Xnew[:,None]
+mean,var,low,up = m.predict(Xnew)
+GPy.util.plot.gpplot(Xnew,mean,low,up)
+fitc = m
+"""
diff --git a/GPy/models/generalized_FITC.py b/GPy/models/generalized_FITC.py
index 739059a0..505f442a 100644
--- a/GPy/models/generalized_FITC.py
+++ b/GPy/models/generalized_FITC.py
@@ -32,14 +32,8 @@ class generalized_FITC(sparse_GP):
 
     def __init__(self, X, likelihood, kernel, Z, X_uncertainty=None, Xslices=None,Zslices=None, normalize_X=False):
 
-        if type(Z) == int:
-            #FIXME
-            self.M = Z
-            self.Z = (np.random.random_sample(self.D*self.M)*(self.X.max()-self.X.min())+self.X.min()).reshape(self.M,-1)
-        elif type(Z) == np.ndarray:
-            self.Z = Z
-            self.M = self.Z.shape[0]
-
+        self.Z = Z
+        self.M = self.Z.shape[0]
         self._precision = likelihood.precision
 
         sparse_GP.__init__(self, X, likelihood, kernel=kernel, Z=self.Z, X_uncertainty=None, Xslices=None,Zslices=None, normalize_X=False)
@@ -64,25 +58,26 @@ class generalized_FITC(sparse_GP):
         else:
             self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0)
             self._precision = self.likelihood.precision # Save the true precision
-            self.likelihood.precision = self.likelihood.precision/(1. + self.likelihood.precision*self.Diag0[:,None]) # Add the diagonal element of the FITC approximation
+            self.likelihood.precision = self._precision/(1. + self._precision*self.Diag0[:,None]) # Add the diagonal element of the FITC approximation
             self._set_params(self._get_params()) # update the GP
 
-
     def _FITC_computations(self):
         """
         FITC approximation doesn't have the correction term in the log-likelihood bound,
-        but adds a diagonal term to the covariance matrix.
+        but adds a diagonal term to the covariance matrix: diag(Knn - Qnn).
         This function:
-         - computes the diagonal term
-         - eliminates the extra terms computed in the sparse_GP approximation
+         - computes the FITC diagonal term
+         - removes the extra terms computed in the sparse_GP approximation
          - computes the likelihood gradients wrt the true precision.
         """
+        #NOTE the true precision is now '_precision' not 'precision'
        if self.likelihood.is_heteroscedastic:
+
             # Compute generalized FITC's diagonal term of the covariance
             self.Qnn = mdot(self.psi1.T,self.Kmmi,self.psi1)
             self.Diag0 = self.psi0 - np.diag(self.Qnn)
-
             self.Diag = self.Diag0/(1.+ self.Diag0 * self._precision.flatten())
+
             self.P = (self.Diag / self.Diag0)[:,None] * self.psi1.T
             self.RPT0 = np.dot(self.Lmi,self.psi1)
             self.L = np.linalg.cholesky(np.eye(self.M) + np.dot(self.RPT0,(1./self.Diag0 - self.Diag/(self.Diag0**2))[:,None]*self.RPT0.T))
@@ -92,14 +87,13 @@ class generalized_FITC(sparse_GP):
             self.w = self.Diag * self.likelihood.v_tilde
             self.gamma = np.dot(self.R.T, np.dot(self.RPT,self.likelihood.v_tilde))
             self.mu = self.w + np.dot(self.P,self.gamma)
-            self.mu_tilde = (self.likelihood.v_tilde/self.likelihood.tau_tilde)[:,None]
 
             # Remove extra term from dL_dpsi1
             self.dL_dpsi1 += -mdot(self.Kmmi,self.psi1*self.likelihood.precision.flatten().reshape(1,self.N)) #dB
         else:
+            raise NotImplementedError, "homoscedastic fitc not implemented"
             # Remove extra term from dL_dpsi1
-            self.dL_dpsi1 += -mdot(self.Kmmi,self.psi1*self.likelihood.precision) #dB
+            #self.dL_dpsi1 += -mdot(self.Kmmi,self.psi1*self.likelihood.precision) #dB
 
-
         sf = self.scale_factor
         sf2 = sf**2
@@ -110,18 +104,16 @@ class generalized_FITC(sparse_GP):
         #the partial derivative vector for the likelihood
         if self.likelihood.Nparams == 0:
-            #save computation here.
             self.partial_for_likelihood = None
         elif self.likelihood.is_heteroscedastic:
-            raise NotImplementedError, "heteroscedatic derivates not implemented"
+            raise NotImplementedError, "heteroscedastic derivatives not implemented"
         else:
+            raise NotImplementedError, "homoscedastic derivatives not implemented"
             #likelihood is not heteroscedastic
-            self.partial_for_likelihood = - 0.5 * self.N*self.D*self._precision + 0.5 * np.sum(np.square(self.likelihood.Y))*self._precision**2
-            self.partial_for_likelihood += 0.5 * self.D * trace_dot(self.Bi,self.A)*self._precision
-            self.partial_for_likelihood += self._precision*(0.5*trace_dot(self.psi2_beta_scaled,self.E*sf2) - np.trace(self.Cpsi1VVpsi1))
+            #self.partial_for_likelihood = - 0.5 * self.N*self.D*self.likelihood.precision + 0.5 * np.sum(np.square(self.likelihood.Y))*self.likelihood.precision**2
+            #self.partial_for_likelihood += 0.5 * self.D * trace_dot(self.Bi,self.A)*self.likelihood.precision
+            #self.partial_for_likelihood += self.likelihood.precision*(0.5*trace_dot(self.psi2_beta_scaled,self.E*sf2) - np.trace(self.Cpsi1VVpsi1))
 
         #TODO partial derivative vector for the likelihood not implemented
-        #NOTE the true precision is now '_precision' not 'precision'
 
     def dL_dtheta(self):
         """
@@ -147,23 +139,15 @@ class generalized_FITC(sparse_GP):
         D = 0.5*np.trace(self.Cpsi1VVpsi1)
         return A+C+D
 
-    def _raw_predict(self, Xnew, slices, full_cov=True):
+    def _raw_predict(self, Xnew, slices, full_cov=False):
         if self.likelihood.is_heteroscedastic:
             """
-            Make a prediction for the vsGP model
+            Make a prediction for the generalized FITC model
 
             Arguments
             ---------
             X : Input prediction data - Nx1 numpy array (floats)
             """
-            Kx = self.kern.K(self.Z, Xnew)
-            if full_cov:
-                Kxx = self.kern.K(Xnew)
-            else:
-                Kxx = self.kern.K(Xnew)#FIXME
-                #raise NotImplementedError
-                #Kxx = self.kern.Kdiag(Xnew)
-
             # q(u|f) = N(u| R0i*mu_u*f, R0i*C*R0i.T)
 
             # Ci = I + (RPT0)Di(RPT0).T
@@ -184,13 +168,19 @@ class generalized_FITC(sparse_GP):
             self.mu_H = mu_H
             Sigma_H = C + np.dot(mu_u,np.dot(self.Sigma,mu_u.T))
             # q(f_star|y) = N(f_star|mu_star,sigma2_star)
+            Kx = self.kern.K(self.Z, Xnew)
             KR0T = np.dot(Kx.T,self.Lmi.T)
             mu_star = np.dot(KR0T,mu_H)
-            sigma2_star = Kxx + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.M),KR0T.T))
-            vdiag = np.diag(sigma2_star)
-            return mu_star[:,None],vdiag[:,None]
-        else:
-            """Internal helper function for making predictions, does not account for normalization"""
+            if full_cov:
+                Kxx = self.kern.K(Xnew)
+                var = Kxx + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.M),KR0T.T))
+            else:
+                Kxx = self.kern.Kdiag(Xnew)
+                Kxx_ = self.kern.K(Xnew)
+                var_ = Kxx_ + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.M),KR0T.T))
+                var = (Kxx + np.sum(KR0T.T*np.dot(Sigma_H - np.eye(self.M),KR0T.T),0))[:,None]
+            return mu_star[:,None],var
+            """
             Kx = self.kern.K(self.Z, Xnew)
             mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V)
             if full_cov:
@@ -199,5 +189,19 @@ class generalized_FITC(sparse_GP):
             else:
                 Kxx = self.kern.Kdiag(Xnew)
                 var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0)
-
+            a = kjk
             return mu,var[:,None]
+            """
+        else:
+            raise NotImplementedError, "homoscedastic fitc not implemented"
+            """
+            Kx = self.kern.K(self.Z, Xnew)
+            mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V)
+            if full_cov:
+                Kxx = self.kern.K(Xnew)
+                var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) #NOTE this won't work for plotting
+            else:
+                Kxx = self.kern.Kdiag(Xnew)
+                var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0)
+            return mu,var[:,None]
+            """
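
Note on PATCH 2/6: the rewritten _raw_predict builds C through the Woodbury-style identity spelled out in its comments, (I + RPT0 D^-1 RPT0.T)^-1 = I - RPT0 (D + RPT0.T RPT0)^-1 RPT0.T, with D = diag(Diag0) and Qnn = RPT0.T RPT0, so a single Cholesky factor U of D + Qnn suffices. A small self-contained NumPy check of that identity on random toy matrices (independent of GPy):

    import numpy as np

    np.random.seed(0)
    M, N = 4, 7
    RPT0 = np.random.randn(M, N)              # toy stand-in for Lm^{-1} Kmn
    Diag0 = np.random.rand(N) + 0.1           # positive FITC diagonal

    # Direct route: invert Ci = I + RPT0 D^{-1} RPT0.T
    Ci = np.eye(M) + np.dot(RPT0/Diag0, RPT0.T)
    C_direct = np.linalg.inv(Ci)

    # Cholesky route used in _raw_predict: U U.T = D + Qnn, V = U^{-1} RPT0.T
    U = np.linalg.cholesky(np.diag(Diag0) + np.dot(RPT0.T, RPT0))
    V = np.linalg.solve(U, RPT0.T)            # dtrtrs(U, RPT0.T, lower=1) in the model
    C_chol = np.eye(M) - np.dot(V.T, V)

    assert np.allclose(C_direct, C_chol)

The factored form never inverts D explicitly, which matters when entries of Diag0 are small; that is the stability argument for routing the prediction through the Cholesky factor U.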
From 35a099f4edfa1605e2b02d943438dba1b6454413 Mon Sep 17 00:00:00 2001
From: Ricardo Andrade
Date: Fri, 22 Mar 2013 19:03:46 +0000
Subject: [PATCH 3/6] minor changes

---
 GPy/examples/fitc_class_01.py |  4 ++--
 GPy/examples/fitc_class_02.py | 15 ++++++++-------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/GPy/examples/fitc_class_01.py b/GPy/examples/fitc_class_01.py
index 413d0c14..06c948b5 100644
--- a/GPy/examples/fitc_class_01.py
+++ b/GPy/examples/fitc_class_01.py
@@ -42,8 +42,8 @@ m.ensure_default_constraints()
 m.update_likelihood_approximation()
 print m.checkgrad(verbose=1)
 # Parameters optimization:
-m.optimize()
-#m.EPEM() #FIXME
+#m.optimize()
+m.pseudo_EM() #FIXME
 
 # Plot
 pb.subplot(211)
diff --git a/GPy/examples/fitc_class_02.py b/GPy/examples/fitc_class_02.py
index cd8d6700..71e10081 100644
--- a/GPy/examples/fitc_class_02.py
+++ b/GPy/examples/fitc_class_02.py
@@ -25,15 +25,15 @@ likelihood = GPy.likelihoods.EP(data['Y'],distribution)
 sample = np.random.randint(0,data['X'].shape[0],10)
 Z = data['X'][sample,:]
 # create sparse GP EP model
-#m = GPy.models.sparse_GP(data['X'],likelihood=likelihood,kernel=kernel,Z=Z)
 m = GPy.models.generalized_FITC(data['X'],likelihood=likelihood,kernel=kernel,Z=Z)
 m.ensure_default_constraints()
 m.set('len',10.)
 
-m.update_likelihood_approximation()
+#m.update_likelihood_approximation()
 
 # optimize
-m.optimize()
+#m.optimize()
+m.pseudo_EM()
 print(m)
 
 # plot
@@ -48,17 +48,18 @@ kernel = GPy.kern.rbf(data['X'].shape[1]) + GPy.kern.white(data['X'].shape[1])
 distribution = GPy.likelihoods.likelihood_functions.probit()
 likelihood = GPy.likelihoods.EP(data['Y'],distribution)
 
-sample = np.random.randint(0,data['X'].shape[0],10)
-Z = data['X'][sample,:]
+#sample = np.random.randint(0,data['X'].shape[0],10)
+#Z = data['X'][sample,:]
 # create sparse GP EP model
 m = GPy.models.sparse_GP(data['X'],likelihood=likelihood,kernel=kernel,Z=Z)
 m.ensure_default_constraints()
 m.set('len',10.)
 
-m.update_likelihood_approximation()
+#m.update_likelihood_approximation()
 
 # optimize
-m.optimize()
+#m.optimize()
+m.pseudo_EM()
 print(m)
 
 # plot

From c72d2b99d99e8d97d827def1d3e180fc6d022165 Mon Sep 17 00:00:00 2001
From: Ricardo Andrade
Date: Wed, 27 Mar 2013 13:52:51 +0000
Subject: [PATCH 4/6] print iteration number

---
 GPy/core/model.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index cc8ae2c4..0804f277 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -467,3 +467,5 @@ class model(parameterised):
             if ll_change < epsilon:
                 stop = True
             iteration += 1
+        if stop:
+            print "%s iterations." %iteration
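
Note on PATCH 4/6: the hunk sits at the tail of the pseudo_EM loop in GPy/core/model.py. Only the three context lines and the two added lines are taken from the patch; the sketch below is a hedged reconstruction of the surrounding loop, and all other names and the E/M interpretation are assumptions, not shown in the diff:

    import numpy as np

    def pseudo_EM_sketch(model, epsilon=1e-6, max_iters=100):
        # Hedged reconstruction: only the convergence test and the final
        # print appear in PATCH 4/6; everything else here is assumed.
        ll_old = -np.inf
        iteration = 0
        stop = False
        while not stop and iteration < max_iters:
            model.update_likelihood_approximation()  # assumed E-like step (EP refit)
            model.optimize()                         # assumed M-like step (hyperparameters)
            ll_new = model.log_likelihood()
            ll_change = abs(ll_new - ll_old)
            ll_old = ll_new
            if ll_change < epsilon:
                stop = True
            iteration += 1
        if stop:
            print "%s iterations." % iteration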
From 67d3ba6700983f4adf557d7120ab73ee86d0c62f Mon Sep 17 00:00:00 2001
From: Ricardo Andrade
Date: Wed, 27 Mar 2013 15:05:50 +0000
Subject: [PATCH 5/6] small changes

---
 GPy/examples/fitc_class_01.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/GPy/examples/fitc_class_01.py b/GPy/examples/fitc_class_01.py
index 06c948b5..38c9da63 100644
--- a/GPy/examples/fitc_class_01.py
+++ b/GPy/examples/fitc_class_01.py
@@ -11,15 +11,15 @@ Simple 1D classification example
 """
 seed=10000
 
-data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
-X = data['X']
-Y = data['Y'][:, 0:1]
-Y[Y == -1] = 0
+#data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
+#X = data['X']
+#Y = data['Y'][:, 0:1]
+#Y[Y == -1] = 0
 
 
 
-#X = np.vstack((np.random.uniform(0,10,(10,1)),np.random.uniform(7,17,(10,1)),np.random.uniform(15,25,(10,1))))
-#Y = np.vstack((np.zeros((10,1)),np.ones((10,1)),np.zeros((10,1))))
+X = np.vstack((np.random.uniform(0,10,(10,1)),np.random.uniform(7,17,(10,1)),np.random.uniform(15,25,(10,1))))
+Y = np.vstack((np.zeros((10,1)),np.ones((10,1)),np.zeros((10,1))))
 
 # Kernel object
 kernel = GPy.kern.rbf(1) + GPy.kern.white(1)

From bf9e6cbd5f4680152727e908118418ca37a244db Mon Sep 17 00:00:00 2001
From: Ricardo Andrade
Date: Wed, 27 Mar 2013 15:06:21 +0000
Subject: [PATCH 6/6] Not needed

---
 GPy/examples/fitc_class_01.py | 53 ---------------------
 GPy/examples/fitc_class_02.py | 67 -----------------------------
 GPy/examples/fitc_regr_01.py  | 53 ---------------------
 3 files changed, 173 deletions(-)
 delete mode 100644 GPy/examples/fitc_class_01.py
 delete mode 100644 GPy/examples/fitc_class_02.py
 delete mode 100644 GPy/examples/fitc_regr_01.py

diff --git a/GPy/examples/fitc_class_01.py b/GPy/examples/fitc_class_01.py
deleted file mode 100644
index 38c9da63..00000000
--- a/GPy/examples/fitc_class_01.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import pylab as pb
-import numpy as np
-import GPy
-pb.ion()
-pb.close('all')
-
-"""
-Simple 1D classification example
-:param seed : seed value for data generation (default is 4).
-:type seed: int
-"""
-seed=10000
-
-#data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
-#X = data['X']
-#Y = data['Y'][:, 0:1]
-#Y[Y == -1] = 0
-
-
-
-X = np.vstack((np.random.uniform(0,10,(10,1)),np.random.uniform(7,17,(10,1)),np.random.uniform(15,25,(10,1))))
-Y = np.vstack((np.zeros((10,1)),np.ones((10,1)),np.zeros((10,1))))
-
-# Kernel object
-kernel = GPy.kern.rbf(1) + GPy.kern.white(1)
-
-# Likelihood object
-distribution = GPy.likelihoods.likelihood_functions.probit()
-likelihood = GPy.likelihoods.EP(Y,distribution)
-
-Z = np.random.uniform(X.min(),X.max(),(10,1))
-#Z = np.array([0,20])[:,None]
-print Z
-
-# Model definition
-m = GPy.models.generalized_FITC(X,likelihood=likelihood,kernel=kernel,Z=Z,normalize_X=False)
-m.set('len',2.)
-
-m.ensure_default_constraints()
-# Optimize
-#m.constrain_fixed('iip')
-m.update_likelihood_approximation()
-print m.checkgrad(verbose=1)
-# Parameters optimization:
-#m.optimize()
-m.pseudo_EM() #FIXME
-
-# Plot
-pb.subplot(211)
-m.plot_f()
-pb.subplot(212)
-m.plot()
-print(m)
diff --git a/GPy/examples/fitc_class_02.py b/GPy/examples/fitc_class_02.py
deleted file mode 100644
index 71e10081..00000000
--- a/GPy/examples/fitc_class_02.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import pylab as pb
-import numpy as np
-import GPy
-pb.close('all')
-
-seed=10000
-"""Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
-
-:param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
-:param seed : seed value for data generation.
-:type seed: int
-:param inducing : number of inducing variables (only used for 'FITC' or 'DTC').
-:type inducing: int
-"""
-
-data = GPy.util.datasets.crescent_data(seed=seed)
-
-# Kernel object
-kernel = GPy.kern.rbf(data['X'].shape[1]) + GPy.kern.white(data['X'].shape[1])
-
-# Likelihood object
-distribution = GPy.likelihoods.likelihood_functions.probit()
-likelihood = GPy.likelihoods.EP(data['Y'],distribution)
-
-sample = np.random.randint(0,data['X'].shape[0],10)
-Z = data['X'][sample,:]
-# create sparse GP EP model
-m = GPy.models.generalized_FITC(data['X'],likelihood=likelihood,kernel=kernel,Z=Z)
-m.ensure_default_constraints()
-m.set('len',10.)
-
-#m.update_likelihood_approximation()
-
-# optimize
-#m.optimize()
-m.pseudo_EM()
-print(m)
-
-# plot
-m.plot()
-fitc = m
-
-pb.figure()
-# Kernel object
-kernel = GPy.kern.rbf(data['X'].shape[1]) + GPy.kern.white(data['X'].shape[1])
-
-# Likelihood object
-distribution = GPy.likelihoods.likelihood_functions.probit()
-likelihood = GPy.likelihoods.EP(data['Y'],distribution)
-
-#sample = np.random.randint(0,data['X'].shape[0],10)
-#Z = data['X'][sample,:]
-# create sparse GP EP model
-m = GPy.models.sparse_GP(data['X'],likelihood=likelihood,kernel=kernel,Z=Z)
-m.ensure_default_constraints()
-m.set('len',10.)
-
-#m.update_likelihood_approximation()
-
-# optimize
-#m.optimize()
-m.pseudo_EM()
-print(m)
-
-# plot
-m.plot()
-variational = m
diff --git a/GPy/examples/fitc_regr_01.py b/GPy/examples/fitc_regr_01.py
deleted file mode 100644
index 815e31a7..00000000
--- a/GPy/examples/fitc_regr_01.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import pylab as pb
-import numpy as np
-import GPy
-pb.ion()
-pb.close('all')
-
-N = 400
-M = 10
-# sample inputs and outputs
-X = np.random.uniform(-3.,3.,(N,1))
-Y = np.sin(X)+np.random.randn(N,1)*0.05
-
-"""Run a 1D example of a sparse GP regression."""
-"""
-rbf = GPy.kern.rbf(1)
-noise = GPy.kern.white(1)
-kernel = rbf + noise
-Z = np.random.uniform(-3,3,(M,1))
-likelihood = GPy.likelihoods.Gaussian(Y)
-m = GPy.models.sparse_GP(X, likelihood, kernel, Z)
-m.scale_factor=10000
-m.constrain_positive('(variance|lengthscale|precision)')
-m.checkgrad(verbose=1)
-m.optimize('tnc', messages = 1)
-pb.figure()
-m.plot()
-
-variational = m
-"""
-
-# construct kernel
-rbf = GPy.kern.rbf(1)
-noise = GPy.kern.white(1)
-kernel = rbf + noise
-#Z = np.random.uniform(-3,3,(M,1))
-Z = variational.Z
-likelihood = GPy.likelihoods.Gaussian(Y)
-# create simple GP model
-m = GPy.models.generalized_FITC(X, likelihood, kernel, Z=Z)
-m.constrain_positive('(variance|lengthscale|precision)')
-#m.constrain_fixed('iip')
-m.checkgrad(verbose=1)
-m.optimize('tnc', messages = 1)
-#pb.figure()
-#m.plot()
-"""
-Xnew = X.copy().flatten()
-Xnew.sort()
-Xnew = Xnew[:,None]
-mean,var,low,up = m.predict(Xnew)
-GPy.util.plot.gpplot(Xnew,mean,low,up)
-fitc = m
-"""
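
The example scripts are removed again in PATCH 6/6. For reference, the minimal classification usage they demonstrated is distilled below from the deleted fitc_class_01.py; it assumes the 2013-era GPy API used throughout this series:

    import numpy as np
    import GPy

    # Synthetic 1D inputs with a band of positive labels, as in fitc_class_01.py
    X = np.vstack((np.random.uniform(0,10,(10,1)),
                   np.random.uniform(7,17,(10,1)),
                   np.random.uniform(15,25,(10,1))))
    Y = np.vstack((np.zeros((10,1)), np.ones((10,1)), np.zeros((10,1))))

    kernel = GPy.kern.rbf(1) + GPy.kern.white(1)                  # covariance
    distribution = GPy.likelihoods.likelihood_functions.probit()  # probit link
    likelihood = GPy.likelihoods.EP(Y, distribution)              # EP likelihood
    Z = np.random.uniform(X.min(), X.max(), (10,1))               # inducing inputs

    m = GPy.models.generalized_FITC(X, likelihood=likelihood, kernel=kernel,
                                    Z=Z, normalize_X=False)
    m.ensure_default_constraints()
    m.set('len', 2.)
    m.update_likelihood_approximation()
    m.pseudo_EM()   # alternates EP refits with hyperparameter optimization
    print(m)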