Merge branch 'devel' of github.com:SheffieldML/GPy into devel

This commit is contained in:
Nicolò Fusi 2013-05-08 15:47:33 +02:00
commit dc1e747702
44 changed files with 4026 additions and 1575 deletions

View file

@ -9,6 +9,10 @@ from sparse_GP import sparse_GP
from GPy.util.linalg import pdinv
from ..likelihoods import Gaussian
from .. import kern
from numpy.linalg.linalg import LinAlgError
import itertools
from matplotlib.colors import colorConverter
from matplotlib.figure import SubplotParams
class Bayesian_GPLVM(sparse_GP, GPLVM):
"""
@ -22,12 +26,14 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
:type init: 'PCA'|'random'
:param oldpsave: number of old parameter settings to keep for recovery after numerical errors
:type oldpsave: int
:param _debug: whether to record parameters, gradients and likelihood terms for debug plotting
:type _debug: bool
"""
def __init__(self, Y, Q, X=None, X_variance=None, init='PCA', M=10, Z=None, kernel=None, **kwargs):
def __init__(self, Y, Q, X=None, X_variance=None, init='PCA', M=10,
Z=None, kernel=None, oldpsave=5, _debug=False,
**kwargs):
if X is None:
X = self.initialise_latent(init, Q, Y)
if X_variance is None:
X_variance = np.ones_like(X) * 0.5
X_variance = np.clip((np.ones_like(X) * 0.5) + .01 * np.random.randn(*X.shape), 0.001, 1)
if Z is None:
Z = np.random.permutation(X.copy())[:M]
@ -36,9 +42,31 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
if kernel is None:
kernel = kern.rbf(Q) + kern.white(Q)
self.oldpsave = oldpsave
self._oldps = []
self._debug = _debug
if self._debug:
self.f_call = 0
self._count = itertools.count()
self._savedklll = []
self._savedparams = []
self._savedgradients = []
self._savederrors = []
self._savedpsiKmm = []
sparse_GP.__init__(self, X, Gaussian(Y), kernel, Z=Z, X_variance=X_variance, **kwargs)
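# A hypothetical usage sketch (data shapes invented for illustration; the
# optimize call follows the generic model interface used elsewhere in this diff):
#   Y = np.random.randn(100, 12)       # 100 observations, 12 output dimensions
#   m = Bayesian_GPLVM(Y, Q=3, M=10)   # 3-d latent space, 10 inducing points
#   m.optimize(messages=1)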
@property
def oldps(self):
return self._oldps
@oldps.setter
def oldps(self, p):
if len(self._oldps) == (self.oldpsave + 1):
self._oldps.pop()
# if len(self._oldps) == 0 or not np.any([np.any(np.abs(p - op) > 1e-5) for op in self._oldps]):
self._oldps.insert(0, p.copy())
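# oldps keeps a bounded history (at most oldpsave entries, newest first) of
# parameter vectors, so _set_params can fall back to an earlier, numerically
# stable setting after a LinAlgError.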
def _get_param_names(self):
X_names = sum([['X_%i_%i' % (n, q) for q in range(self.Q)] for n in range(self.N)], [])
S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.Q)] for n in range(self.N)], [])
@ -54,17 +82,26 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
===============================================================
"""
return np.hstack((self.X.flatten(), self.X_variance.flatten(), sparse_GP._get_params(self)))
def _set_params(self, x):
N, Q = self.N, self.Q
self.X = x[:self.X.size].reshape(N, Q).copy()
self.X_variance = x[(N * Q):(2 * N * Q)].reshape(N, Q).copy()
sparse_GP._set_params(self, x[(2 * N * Q):])
x = np.hstack((self.X.flatten(), self.X_variance.flatten(), sparse_GP._get_params(self)))
return x
def _set_params(self, x, save_old=True, save_count=0):
try:
N, Q = self.N, self.Q
self.X = x[:self.X.size].reshape(N, Q).copy()
self.X_variance = x[(N * Q):(2 * N * Q)].reshape(N, Q).copy()
sparse_GP._set_params(self, x[(2 * N * Q):])
self.oldps = x
except (LinAlgError, FloatingPointError, ZeroDivisionError):
print "\rWARNING: Caught LinAlgError, continueing without setting "
if self._debug:
self._savederrors.append(self.f_call)
if save_count > 10:
raise
self._set_params(self.oldps[-1], save_old=False, save_count=save_count + 1)
def dKL_dmuS(self):
dKL_dS = (1. - (1. / self.X_variance)) * 0.5
dKL_dS = (1. - (1. / (self.X_variance))) * 0.5
dKL_dmu = self.X
return dKL_dmu, dKL_dS
@ -83,13 +120,40 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
return 0.5 * (var_mean + var_S) - 0.5 * self.Q * self.N
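# For reference: with q(X) = N(mu, S) (S diagonal) and prior p(X) = N(0, I),
# the term above is KL(q||p) = 0.5 * sum(mu**2 + S - log(S) - 1), whose
# gradients dKL/dmu = mu and dKL/dS = 0.5 * (1 - 1/S) are exactly what
# dKL_dmuS returns.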
def log_likelihood(self):
return sparse_GP.log_likelihood(self) - self.KL_divergence()
ll = sparse_GP.log_likelihood(self)
kl = self.KL_divergence()
# if ll < -2E4:
# ll = -2E4 + np.random.randn()
# if kl > 5E4:
# kl = 5E4 + np.random.randn()
if self._debug:
self.f_call = self._count.next()
if self.f_call % 1 == 0: # save every call; raise the modulus to thin the debug trace
self._savedklll.append([self.f_call, ll, kl])
self._savedparams.append([self.f_call, self._get_params()])
self._savedgradients.append([self.f_call, self._log_likelihood_gradients()])
self._savedpsiKmm.append([self.f_call, [self.Kmm, self.dL_dKmm]])
# print "\nkl:", kl, "ll:", ll
return ll - kl
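# ll - kl is the variational lower bound on the log marginal likelihood:
# log p(Y) >= E_q[log p(Y|X)] - KL(q(X)||p(X)), so maximising it trades data
# fit against keeping q(X) close to the prior.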
def _log_likelihood_gradients(self):
dKL_dmu, dKL_dS = self.dKL_dmuS()
dL_dmu, dL_dS = self.dL_dmuS()
# TODO: find way to make faster
dbound_dmuS = np.hstack(((dL_dmu - dKL_dmu).flatten(), (dL_dS - dKL_dS).flatten()))
d_dmu = (dL_dmu - dKL_dmu).flatten()
d_dS = (dL_dS - dKL_dS).flatten()
# TEST KL: ====================
# d_dmu = (dKL_dmu).flatten()
# d_dS = (dKL_dS).flatten()
# ========================
# TEST L: ====================
# d_dmu = (dL_dmu).flatten()
# d_dS = (dL_dS).flatten()
# ========================
dbound_dmuS = np.hstack((d_dmu, d_dS))
return np.hstack((dbound_dmuS.flatten(), sparse_GP._log_likelihood_gradients(self)))
def plot_latent(self, which_indices=None, *args, **kwargs):
@ -104,3 +168,288 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
ax = GPLVM.plot_latent(self, which_indices=[input_1, input_2], *args, **kwargs)
ax.plot(self.Z[:, input_1], self.Z[:, input_2], '^w')
return ax
def plot_X_1d(self, fig=None, axes=None, fig_num="MRD X 1d", colors=None):
"""
Plot the latent space X in 1D:
- if fig is given, create Q subplots in fig and plot into these
- if axes is given, plot the Q 1D latent dimensions of X into each axis
- if neither fig nor axes is given, create a figure named fig_num and plot into it
colors:
    colors for the Q latent space dimensions
"""
import pylab
if fig is None and axes is None:
fig = pylab.figure(num=fig_num, figsize=(8, min(12, (2 * self.X.shape[1]))))
if colors is None:
colors = pylab.gca()._get_lines.color_cycle
pylab.clf()
else:
colors = iter(colors)
plots = []
for i in range(self.X.shape[1]):
if axes is None:
ax = fig.add_subplot(self.X.shape[1], 1, i + 1)
else:
ax = axes[i]
ax.plot(self.X, c='k', alpha=.3)
plots.extend(ax.plot(self.X.T[i], c=colors.next(), label=r"$\mathbf{{X_{}}}$".format(i)))
ax.fill_between(np.arange(self.X.shape[0]),
self.X.T[i] - 2 * np.sqrt(self.X_variance.T[i]),
self.X.T[i] + 2 * np.sqrt(self.X_variance.T[i]),
facecolor=plots[-1].get_color(),
alpha=.3)
ax.legend(borderaxespad=0.)
if i < self.X.shape[1] - 1:
ax.set_xticklabels('')
pylab.draw()
fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
return fig
def _debug_filter_params(self, x):
start, end = 0, self.X.size
X = x[start:end].reshape(self.N, self.Q)
start, end = end, end + self.X_variance.size
X_v = x[start:end].reshape(self.N, self.Q)
start, end = end, end + (self.M * self.Q)
Z = x[start:end].reshape(self.M, self.Q)
start, end = end, end + self.Q
theta = x[start:]
return X, X_v, Z, theta
def _debug_get_axis(self, figs):
if figs[-1].axes:
ax1 = figs[-1].axes[0]
ax1.cla()
else:
ax1 = figs[-1].add_subplot(111)
return ax1
def _debug_plot(self):
assert self._debug, "must enable _debug to debug-plot"
import pylab
# from mpl_toolkits.mplot3d import Axes3D
figs = [pylab.figure('BGPLVM DEBUG', figsize=(12, 4))]
# fig.clf()
# log like
# splotshape = (6, 4)
# ax1 = pylab.subplot2grid(splotshape, (0, 0), 1, 4)
ax1 = self._debug_get_axis(figs)
ax1.text(.5, .5, "Optimization", alpha=.3, transform=ax1.transAxes,
ha='center', va='center')
kllls = np.array(self._savedklll)
LL, = ax1.plot(kllls[:, 0], kllls[:, 1] - kllls[:, 2], '-', label=r'$\log p(\mathbf{Y})$', mew=1.5)
KL, = ax1.plot(kllls[:, 0], kllls[:, 2], '-', label=r'$\mathcal{KL}(p||q)$', mew=1.5)
L, = ax1.plot(kllls[:, 0], kllls[:, 1], '-', label=r'$L$', mew=1.5) # \mathds{E}_{q(\mathbf{X})}[p(\mathbf{Y|X})\frac{p(\mathbf{X})}{q(\mathbf{X})}]
param_dict = dict(self._savedparams)
gradient_dict = dict(self._savedgradients)
kmm_dict = dict(self._savedpsiKmm)
iters = np.array(param_dict.keys())
self.showing = 0
# ax2 = pylab.subplot2grid(splotshape, (1, 0), 2, 4)
figs.append(pylab.figure("BGPLVM DEBUG X", figsize=(12, 4)))
ax2 = self._debug_get_axis(figs)
ax2.text(.5, .5, r"$\mathbf{X}$", alpha=.5, transform=ax2.transAxes,
ha='center', va='center')
figs[-1].canvas.draw()
figs[-1].tight_layout(rect=(0, 0, 1, .86))
# ax3 = pylab.subplot2grid(splotshape, (3, 0), 2, 4, sharex=ax2)
figs.append(pylab.figure("BGPLVM DEBUG S", figsize=(12, 4)))
ax3 = self._debug_get_axis(figs)
ax3.text(.5, .5, r"$\mathbf{S}$", alpha=.5, transform=ax3.transAxes,
ha='center', va='center')
figs[-1].canvas.draw()
figs[-1].tight_layout(rect=(0, 0, 1, .86))
# ax4 = pylab.subplot2grid(splotshape, (5, 0), 2, 2)
figs.append(pylab.figure("BGPLVM DEBUG Z", figsize=(6, 4)))
ax4 = self._debug_get_axis(figs)
ax4.text(.5, .5, r"$\mathbf{Z}$", alpha=.5, transform=ax4.transAxes,
ha='center', va='center')
figs[-1].canvas.draw()
figs[-1].tight_layout(rect=(0, 0, 1, .86))
# ax5 = pylab.subplot2grid(splotshape, (5, 2), 2, 2)
figs.append(pylab.figure("BGPLVM DEBUG theta", figsize=(6, 4)))
ax5 = self._debug_get_axis(figs)
ax5.text(.5, .5, r"${\theta}$", alpha=.5, transform=ax5.transAxes,
ha='center', va='center')
figs[-1].canvas.draw()
figs[-1].tight_layout(rect=(.15, 0, 1, .86))
figs.append(pylab.figure("BGPLVM DEBUG Kmm", figsize=(12, 6)))
fig = figs[-1]
ax6 = fig.add_subplot(121)
ax6.text(.5, .5, r"${\mathbf{K}_{mm}}$", color='magenta', alpha=.5, transform=ax6.transAxes,
ha='center', va='center')
ax7 = fig.add_subplot(122)
ax7.text(.5, .5, r"${\frac{dL}{dK_{mm}}}$", color='magenta', alpha=.5, transform=ax7.transAxes,
ha='center', va='center')
X, S, Z, theta = self._debug_filter_params(param_dict[self.showing])
Xg, Sg, Zg, thetag = self._debug_filter_params(gradient_dict[self.showing])
# Xg, Sg, Zg, thetag = -Xg, -Sg, -Zg, -thetag
quiver_units = 'xy'
quiver_scale = 1
quiver_scale_units = 'xy'
Xlatentplts = ax2.plot(X, ls="-", marker="x")
colors = colorConverter.to_rgba_array([p.get_color() for p in Xlatentplts], .4)
Ulatent = np.zeros_like(X)
xlatent = np.tile(np.arange(0, X.shape[0])[:, None], X.shape[1])
Xlatentgrads = ax2.quiver(xlatent, X, Ulatent, Xg, color=colors,
units=quiver_units, scale_units=quiver_scale_units,
scale=quiver_scale)
Slatentplts = ax3.plot(S, ls="-", marker="x")
Slatentgrads = ax3.quiver(xlatent, S, Ulatent, Sg, color=colors,
units=quiver_units, scale_units=quiver_scale_units,
scale=quiver_scale)
ax3.set_ylim(0, 1.)
xZ = np.tile(np.arange(0, Z.shape[0])[:, None], Z.shape[1])
UZ = np.zeros_like(Z)
Zplts = ax4.plot(Z, ls="-", marker="x")
Zgrads = ax4.quiver(xZ, Z, UZ, Zg, color=colors,
units=quiver_units, scale_units=quiver_scale_units,
scale=quiver_scale)
xtheta = np.arange(len(theta))
Utheta = np.zeros_like(theta)
thetaplts = ax5.bar(xtheta - .4, theta, color=colors)
thetagrads = ax5.quiver(xtheta, theta, Utheta, thetag, color=colors,
units=quiver_units, scale_units=quiver_scale_units,
scale=quiver_scale,
edgecolors=('k',), linewidths=[1])
pylab.setp(thetaplts, zorder=0)
pylab.setp(thetagrads, zorder=10)
ax5.set_xticks(np.arange(len(theta)))
ax5.set_xticklabels(self._get_param_names()[-len(theta):], rotation=17)
imkmm = ax6.imshow(kmm_dict[self.showing][0])
from mpl_toolkits.axes_grid1 import make_axes_locatable
divider = make_axes_locatable(ax6)
caxkmm = divider.append_axes("right", "5%", pad="1%")
cbarkmm = pylab.colorbar(imkmm, cax=caxkmm)
imkmmdl = ax7.imshow(kmm_dict[self.showing][1])
divider = make_axes_locatable(ax7)
caxkmmdl = divider.append_axes("right", "5%", pad="1%")
cbarkmmdl = pylab.colorbar(imkmmdl, cax=caxkmmdl)
# Qleg = ax1.legend(Xlatentplts, [r"$Q_{}$".format(i + 1) for i in range(self.Q)],
# loc=3, ncol=self.Q, bbox_to_anchor=(0, 1.15, 1, 1.15),
# borderaxespad=0, mode="expand")
ax2.legend(Xlatentplts, [r"$Q_{}$".format(i + 1) for i in range(self.Q)],
loc=3, ncol=self.Q, bbox_to_anchor=(0, 1.1, 1, 1.1),
borderaxespad=0, mode="expand")
ax3.legend(Xlatentplts, [r"$Q_{}$".format(i + 1) for i in range(self.Q)],
loc=3, ncol=self.Q, bbox_to_anchor=(0, 1.1, 1, 1.1),
borderaxespad=0, mode="expand")
ax4.legend(Xlatentplts, [r"$Q_{}$".format(i + 1) for i in range(self.Q)],
loc=3, ncol=self.Q, bbox_to_anchor=(0, 1.1, 1, 1.1),
borderaxespad=0, mode="expand")
ax5.legend(Xlatentplts, [r"$Q_{}$".format(i + 1) for i in range(self.Q)],
loc=3, ncol=self.Q, bbox_to_anchor=(0, 1.1, 1, 1.1),
borderaxespad=0, mode="expand")
Lleg = ax1.legend()
Lleg.draggable()
# ax1.add_artist(Qleg)
indicatorKL, = ax1.plot(kllls[self.showing, 0], kllls[self.showing, 2], 'o', c=KL.get_color())
indicatorLL, = ax1.plot(kllls[self.showing, 0], kllls[self.showing, 1] - kllls[self.showing, 2], 'o', c=LL.get_color())
indicatorL, = ax1.plot(kllls[self.showing, 0], kllls[self.showing, 1], 'o', c=L.get_color())
# for err in self._savederrors:
# if err < kllls.shape[0]:
# ax1.scatter(kllls[err, 0], kllls[err, 2], s=50, marker=(5, 2), c=KL.get_color())
# ax1.scatter(kllls[err, 0], kllls[err, 1] - kllls[err, 2], s=50, marker=(5, 2), c=LL.get_color())
# ax1.scatter(kllls[err, 0], kllls[err, 1], s=50, marker=(5, 2), c=L.get_color())
# try:
# for f in figs:
# f.canvas.draw()
# f.tight_layout(box=(0, .15, 1, .9))
# # pylab.draw()
# # pylab.tight_layout(box=(0, .1, 1, .9))
# except:
# pass
# parameter changes
# ax2 = pylab.subplot2grid((4, 1), (1, 0), 3, 1, projection='3d')
button_options = [0, 0] # [0]: clicked -- [1]: dragged
def update_plots(event):
if button_options[0] and not button_options[1]:
tmp = np.abs(iters - event.xdata)
closest_hit = iters[tmp == tmp.min()][0]
if closest_hit != self.showing:
self.showing = closest_hit
# print closest_hit, iters, event.xdata
indicatorLL.set_data(self.showing, kllls[self.showing, 1] - kllls[self.showing, 2])
indicatorKL.set_data(self.showing, kllls[self.showing, 2])
indicatorL.set_data(self.showing, kllls[self.showing, 1])
X, S, Z, theta = self._debug_filter_params(param_dict[self.showing])
Xg, Sg, Zg, thetag = self._debug_filter_params(gradient_dict[self.showing])
# Xg, Sg, Zg, thetag = -Xg, -Sg, -Zg, -thetag
for i, Xlatent in enumerate(Xlatentplts):
Xlatent.set_ydata(X[:, i])
Xlatentgrads.set_offsets(np.array([xlatent.ravel(), X.ravel()]).T)
Xlatentgrads.set_UVC(Ulatent, Xg)
for i, Slatent in enumerate(Slatentplts):
Slatent.set_ydata(S[:, i])
Slatentgrads.set_offsets(np.array([xlatent.ravel(), S.ravel()]).T)
Slatentgrads.set_UVC(Ulatent, Sg)
for i, Zlatent in enumerate(Zplts):
Zlatent.set_ydata(Z[:, i])
Zgrads.set_offsets(np.array([xZ.ravel(), Z.ravel()]).T)
Zgrads.set_UVC(UZ, Zg)
for p, t in zip(thetaplts, theta):
p.set_height(t)
thetagrads.set_offsets(np.array([xtheta.ravel(), theta.ravel()]).T)
thetagrads.set_UVC(Utheta, thetag)
imkmm.set_data(kmm_dict[self.showing][0])
imkmm.autoscale()
cbarkmm.update_normal(imkmm)
imkmmdl.set_data(kmm_dict[self.showing][1])
imkmmdl.autoscale()
cbarkmmdl.update_normal(imkmmdl)
ax2.relim()
# ax3.relim()
ax4.relim()
ax5.relim()
ax2.autoscale()
# ax3.autoscale()
ax4.autoscale()
ax5.autoscale()
[fig.canvas.draw() for fig in figs]
button_options[0] = 0
button_options[1] = 0
def onclick(event):
if event.inaxes is ax1 and event.button == 1:
button_options[0] = 1
def motion(event):
if button_options[0]:
button_options[1] = 1
cidr = figs[0].canvas.mpl_connect('button_release_event', update_plots)
cidp = figs[0].canvas.mpl_connect('button_press_event', onclick)
cidd = figs[0].canvas.mpl_connect('motion_notify_event', motion)
return ax1, ax2, ax3, ax4, ax5, ax6, ax7

View file

@ -6,8 +6,8 @@ import numpy as np
import pylab as pb
from .. import kern
from ..core import model
from ..util.linalg import pdinv,mdot
from ..util.plot import gpplot,x_frame1D,x_frame2D, Tango
from ..util.linalg import pdinv, mdot
from ..util.plot import gpplot, x_frame1D, x_frame2D, Tango
from ..likelihoods import EP
class GP(model):
@ -19,9 +19,6 @@ class GP(model):
:param likelihood: a GPy likelihood
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_X: False|True
:param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_Y: False|True
:param Xslices: how the X,Y data co-vary in the kernel (i.e. which "outputs" they correspond to). See (link:slicing)
:rtype: model object
:param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1
:param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.]
@ -30,33 +27,31 @@ class GP(model):
.. Note:: Multiple independent outputs are allowed using columns of Y
"""
def __init__(self, X, likelihood, kernel, normalize_X=False, Xslices=None):
def __init__(self, X, likelihood, kernel, normalize_X=False):
# parse arguments
self.Xslices = Xslices
self.X = X
assert len(self.X.shape)==2
assert len(self.X.shape) == 2
self.N, self.Q = self.X.shape
assert isinstance(kernel, kern.kern)
self.kern = kernel
#here's some simple normalization for the inputs
if normalize_X:
self._Xmean = X.mean(0)[None,:]
self._Xstd = X.std(0)[None,:]
self.X = (X.copy() - self._Xmean) / self._Xstd
if hasattr(self,'Z'):
self.Z = (self.Z - self._Xmean) / self._Xstd
else:
self._Xmean = np.zeros((1,self.X.shape[1]))
self._Xstd = np.ones((1,self.X.shape[1]))
self.likelihood = likelihood
#assert self.X.shape[0] == self.likelihood.Y.shape[0]
#self.N, self.D = self.likelihood.Y.shape
assert self.X.shape[0] == self.likelihood.data.shape[0]
self.N, self.D = self.likelihood.data.shape
# here's some simple normalization for the inputs
if normalize_X:
self._Xmean = X.mean(0)[None, :]
self._Xstd = X.std(0)[None, :]
self.X = (X.copy() - self._Xmean) / self._Xstd
if hasattr(self, 'Z'):
self.Z = (self.Z - self._Xmean) / self._Xstd
else:
self._Xmean = np.zeros((1, self.X.shape[1]))
self._Xstd = np.ones((1, self.X.shape[1]))
if not hasattr(self,'has_uncertain_inputs'):
self.has_uncertain_inputs = False
model.__init__(self)
def dL_dZ(self):
@ -65,24 +60,24 @@ class GP(model):
"""
return np.zeros_like(self.Z)
def _set_params(self,p):
self.kern._set_params_transformed(p[:self.kern.Nparam])
#self.likelihood._set_params(p[self.kern.Nparam:]) # test by Nicolas
self.likelihood._set_params(p[self.kern.Nparam_transformed():]) # test by Nicolas
def _set_params(self, p):
self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
# self.likelihood._set_params(p[self.kern.Nparam:]) # test by Nicolas
self.likelihood._set_params(p[self.kern.Nparam_transformed():]) # test by Nicolas
self.K = self.kern.K(self.X,slices1=self.Xslices,slices2=self.Xslices)
self.K = self.kern.K(self.X)
self.K += self.likelihood.covariance_matrix
self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
#the gradient of the likelihood wrt the covariance matrix
# the gradient of the likelihood wrt the covariance matrix
if self.likelihood.YYT is None:
alpha = np.dot(self.Ki,self.likelihood.Y)
self.dL_dK = 0.5*(np.dot(alpha,alpha.T)-self.D*self.Ki)
alpha = np.dot(self.Ki, self.likelihood.Y)
self.dL_dK = 0.5 * (np.dot(alpha, alpha.T) - self.D * self.Ki)
else:
tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki)
self.dL_dK = 0.5*(tmp - self.D*self.Ki)
self.dL_dK = 0.5 * (tmp - self.D * self.Ki)
def _get_params(self):
return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params()))
@ -94,20 +89,20 @@ class GP(model):
"""
Approximates a non-gaussian likelihood using Expectation Propagation
For a Gaussian (or direct: TODO) likelihood, no iteration is required:
For a Gaussian likelihood, no iteration is required:
this function does nothing
"""
self.likelihood.fit_full(self.kern.K(self.X))
self._set_params(self._get_params()) # update the GP
self._set_params(self._get_params()) # update the GP
def _model_fit_term(self):
"""
Computes the model fit using YYT if it's available
"""
if self.likelihood.YYT is None:
return -0.5*np.sum(np.square(np.dot(self.Li,self.likelihood.Y)))
return -0.5 * np.sum(np.square(np.dot(self.Li, self.likelihood.Y)))
else:
return -0.5*np.sum(np.multiply(self.Ki, self.likelihood.YYT))
return -0.5 * np.sum(np.multiply(self.Ki, self.likelihood.YYT))
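# Both branches agree: with K = L L^T, -0.5*sum((L^-1 Y)**2) equals
# -0.5*trace(K^-1 Y Y^T); the YYT form simply reuses a precomputed outer
# product when the likelihood provides one.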
def log_likelihood(self):
"""
@ -117,38 +112,40 @@ class GP(model):
model for a new variable Y* = v_tilde/tau_tilde, with a covariance
matrix K* = K + diag(1./tau_tilde) plus a normalization term.
"""
return -0.5*self.D*self.K_logdet + self._model_fit_term() + self.likelihood.Z
return -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
def _log_likelihood_gradients(self):
"""
The gradient of all parameters.
For the kernel parameters, use the chain rule via dL_dK
For the likelihood parameters, pass in alpha = K^-1 y
Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
"""
return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK,X=self.X,slices1=self.Xslices,slices2=self.Xslices), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
def _raw_predict(self,_Xnew,slices=None, full_cov=False):
def _raw_predict(self, _Xnew, which_parts='all', full_cov=False):
"""
Internal helper function for making predictions, does not account
for normalization or likelihood
#TODO: which_parts does nothing
"""
Kx = self.kern.K(self.X,_Xnew, slices1=self.Xslices,slices2=slices)
mu = np.dot(np.dot(Kx.T,self.Ki),self.likelihood.Y)
KiKx = np.dot(self.Ki,Kx)
Kx = self.kern.K(self.X, _Xnew,which_parts=which_parts)
mu = np.dot(np.dot(Kx.T, self.Ki), self.likelihood.Y)
KiKx = np.dot(self.Ki, Kx)
if full_cov:
Kxx = self.kern.K(_Xnew, slices1=slices,slices2=slices)
var = Kxx - np.dot(KiKx.T,Kx)
Kxx = self.kern.K(_Xnew, which_parts=which_parts)
var = Kxx - np.dot(KiKx.T, Kx)
else:
Kxx = self.kern.Kdiag(_Xnew, slices=slices)
var = Kxx - np.sum(np.multiply(KiKx,Kx),0)
var = var[:,None]
Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
var = Kxx - np.sum(np.multiply(KiKx, Kx), 0)
var = var[:, None]
return mu, var
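# In matrix form this is the standard GP conditional: mu* = Kx^T K^-1 y and
# var* = Kxx - Kx^T K^-1 Kx, where K = Kff + likelihood covariance as built
# in _set_params.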
def predict(self,Xnew, slices=None, full_cov=False):
def predict(self, Xnew, which_parts='all', full_cov=False):
"""
Predict the function(s) at the new point(s) Xnew.
@ -156,35 +153,30 @@ class GP(model):
---------
:param Xnew: The points at which to make a prediction
:type Xnew: np.ndarray, Nnew x self.Q
:param slices: specifies which outputs kernel(s) the Xnew correspond to (see below)
:type slices: (None, list of slice objects, list of ints)
:param which_parts: specifies which outputs kernel(s) to use in prediction
:type which_parts: ('all', list of bools)
:param full_cov: whether to return the full covariance matrix, or just the diagonal
:type full_cov: bool
:rtype: posterior mean, a Numpy array, Nnew x self.D
:rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
:rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.D
.. Note:: "slices" specifies how the the points X_new co-vary wich the training points.
- If None, the new points covary throigh every kernel part (default)
- If a list of slices, the i^th slice specifies which data are affected by the i^th kernel part
- If a list of booleans, specifying which kernel parts are active
If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew.
This is to allow for different normalizations of the output dimensions.
"""
#normalize X values
# normalize X values
Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
mu, var = self._raw_predict(Xnew, slices, full_cov)
mu, var = self._raw_predict(Xnew, which_parts, full_cov)
#now push through likelihood TODO
# now push through likelihood
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
return mean, var, _025pm, _975pm
def plot_f(self, samples=0, plot_limits=None, which_data='all', which_functions='all', resolution=None, full_cov=False):
def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False):
"""
Plot the GP's view of the world, where the data is normalized and the likelihood is Gaussian
@ -192,8 +184,8 @@ class GP(model):
:param which_data: which of the training data to plot (default all)
:type which_data: 'all' or a slice object to slice self.X, self.Y
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
:param which_functions: which of the kernel functions to plot (additively)
:type which_functions: list of bools
:param which_parts: which of the kernel functions to plot (additively)
:type which_parts: 'all', or list of bools
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
Plot the posterior of the GP.
@ -204,86 +196,86 @@ class GP(model):
Can plot only part of the data and part of the posterior functions using which_data and which_parts
Plot the data's view of the world, with non-normalized values and GP predictions passed through the likelihood
"""
if which_functions=='all':
which_functions = [True]*self.kern.Nparts
if which_data=='all':
if which_data == 'all':
which_data = slice(None)
if self.X.shape[1] == 1:
Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
if samples == 0:
m,v = self._raw_predict(Xnew, slices=which_functions)
gpplot(Xnew,m,m-2*np.sqrt(v),m+2*np.sqrt(v))
pb.plot(self.X[which_data],self.likelihood.Y[which_data],'kx',mew=1.5)
m, v = self._raw_predict(Xnew, which_parts=which_parts)
gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v))
pb.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
else:
m,v = self._raw_predict(Xnew, slices=which_functions,full_cov=True)
Ysim = np.random.multivariate_normal(m.flatten(),v,samples)
gpplot(Xnew,m,m-2*np.sqrt(np.diag(v)[:,None]),m+2*np.sqrt(np.diag(v))[:,None])
m, v = self._raw_predict(Xnew, which_parts=which_parts, full_cov=True)
Ysim = np.random.multivariate_normal(m.flatten(), v, samples)
gpplot(Xnew, m, m - 2 * np.sqrt(np.diag(v)[:, None]), m + 2 * np.sqrt(np.diag(v))[:, None])
for i in range(samples):
pb.plot(Xnew,Ysim[i,:],Tango.colorsHex['darkBlue'],linewidth=0.25)
pb.plot(self.X[which_data],self.likelihood.Y[which_data],'kx',mew=1.5)
pb.xlim(xmin,xmax)
ymin,ymax = min(np.append(self.likelihood.Y,m-2*np.sqrt(np.diag(v)[:,None]))), max(np.append(self.likelihood.Y,m+2*np.sqrt(np.diag(v)[:,None])))
ymin, ymax = ymin - 0.1*(ymax - ymin), ymax + 0.1*(ymax - ymin)
pb.ylim(ymin,ymax)
if hasattr(self,'Z'):
pb.plot(self.Z,self.Z*0+pb.ylim()[0],'r|',mew=1.5,markersize=12)
pb.plot(Xnew, Ysim[i, :], Tango.colorsHex['darkBlue'], linewidth=0.25)
pb.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
pb.xlim(xmin, xmax)
ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
pb.ylim(ymin, ymax)
if hasattr(self, 'Z'):
pb.plot(self.Z, self.Z * 0 + pb.ylim()[0], 'r|', mew=1.5, markersize=12)
elif self.X.shape[1] == 2:
resolution = resolution or 50
Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits,resolution)
m,v = self._raw_predict(Xnew, slices=which_functions)
m = m.reshape(resolution,resolution).T
pb.contour(xx,yy,m,vmin=m.min(),vmax=m.max(),cmap=pb.cm.jet)
pb.scatter(Xorig[:,0],Xorig[:,1],40,Yorig,linewidth=0,cmap=pb.cm.jet,vmin=m.min(), vmax=m.max())
pb.xlim(xmin[0],xmax[0])
pb.ylim(xmin[1],xmax[1])
Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits, resolution)
m, v = self._raw_predict(Xnew, which_parts=which_parts)
m = m.reshape(resolution, resolution).T
pb.contour(xx, yy, m, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
pb.scatter(self.X[:, 0], self.X[:, 1], 40, self.likelihood.Y.flatten(), linewidth=0, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max())
pb.xlim(xmin[0], xmax[0])
pb.ylim(xmin[1], xmax[1])
else:
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None,levels=20):
def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20):
"""
TODO: Docstrings!
:param levels: for 2D plotting, the number of contour levels to use
"""
# TODO include samples
if which_functions=='all':
which_functions = [True]*self.kern.Nparts
if which_data=='all':
if which_data == 'all':
which_data = slice(None)
if self.X.shape[1] == 1:
Xu = self.X * self._Xstd + self._Xmean #NOTE self.X are the normalized values now
Xu = self.X * self._Xstd + self._Xmean # NOTE self.X are the normalized values now
Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
m, var, lower, upper = self.predict(Xnew, slices=which_functions)
gpplot(Xnew,m, lower, upper)
pb.plot(Xu[which_data],self.likelihood.data[which_data],'kx',mew=1.5)
ymin,ymax = min(np.append(self.likelihood.data,lower)), max(np.append(self.likelihood.data,upper))
ymin, ymax = ymin - 0.1*(ymax - ymin), ymax + 0.1*(ymax - ymin)
pb.xlim(xmin,xmax)
pb.ylim(ymin,ymax)
if hasattr(self,'Z'):
Zu = self.Z*self._Xstd + self._Xmean
pb.plot(Zu,Zu*0+pb.ylim()[0],'r|',mew=1.5,markersize=12)
if self.has_uncertain_inputs:
pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_variance.flatten()))
m, var, lower, upper = self.predict(Xnew, which_parts=which_parts)
gpplot(Xnew, m, lower, upper)
pb.plot(Xu[which_data], self.likelihood.data[which_data], 'kx', mew=1.5)
if self.has_uncertain_inputs:
pb.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
elif self.X.shape[1]==2: #FIXME
ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
pb.xlim(xmin, xmax)
pb.ylim(ymin, ymax)
if hasattr(self, 'Z'):
Zu = self.Z * self._Xstd + self._Xmean
pb.plot(Zu, Zu * 0 + pb.ylim()[0], 'r|', mew=1.5, markersize=12)
# pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_variance.flatten()))
elif self.X.shape[1] == 2: # FIXME
resolution = resolution or 50
Xnew, xx, yy, xmin, xmax = x_frame2D(self.X, plot_limits,resolution)
x, y = np.linspace(xmin[0],xmax[0],resolution), np.linspace(xmin[1],xmax[1],resolution)
m, var, lower, upper = self.predict(Xnew, slices=which_functions)
m = m.reshape(resolution,resolution).T
pb.contour(x,y,m,levels,vmin=m.min(),vmax=m.max(),cmap=pb.cm.jet)
Xnew, xx, yy, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
m, var, lower, upper = self.predict(Xnew, which_parts=which_parts)
m = m.reshape(resolution, resolution).T
pb.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
Yf = self.likelihood.Y.flatten()
pb.scatter(self.X[:,0], self.X[:,1], 40, Yf, cmap=pb.cm.jet,vmin=m.min(),vmax=m.max(), linewidth=0.)
pb.xlim(xmin[0],xmax[0])
pb.ylim(xmin[1],xmax[1])
if hasattr(self,'Z'):
pb.plot(self.Z[:,0],self.Z[:,1],'wo')
pb.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
pb.xlim(xmin[0], xmax[0])
pb.ylim(xmin[1], xmax[1])
if hasattr(self, 'Z'):
pb.plot(self.Z[:, 0], self.Z[:, 1], 'wo')
else:
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"

View file

@ -1,4 +1,4 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
### Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
@ -24,12 +24,12 @@ class GPLVM(GP):
:type init: 'PCA'|'random'
"""
def __init__(self, Y, Q, init='PCA', X = None, kernel=None, **kwargs):
def __init__(self, Y, Q, init='PCA', X = None, kernel=None, normalize_Y=False, **kwargs):
if X is None:
X = self.initialise_latent(init, Q, Y)
if kernel is None:
kernel = kern.rbf(Q) + kern.bias(Q)
likelihood = Gaussian(Y)
likelihood = Gaussian(Y, normalize=normalize_Y)
GP.__init__(self, X, likelihood, kernel, **kwargs)
def initialise_latent(self, init, Q, Y):
@ -91,8 +91,8 @@ class GPLVM(GP):
Xtest_full[:, :2] = Xtest
mu, var, low, up = self.predict(Xtest_full)
var = var[:, :1]
ax.imshow(var.reshape(resolution, resolution).T[::-1, :],
extent=[xmin[0], xmax[0], xmin[1], xmax[1]], cmap=pb.cm.binary,interpolation='bilinear')
ax.imshow(var.reshape(resolution, resolution).T,
extent=[xmin[0], xmax[0], xmin[1], xmax[1]], cmap=pb.cm.binary,interpolation='bilinear',origin='lower')
for i,ul in enumerate(np.unique(labels)):
if type(ul) is np.string_:

View file

@ -11,26 +11,24 @@ class GP_regression(GP):
"""
Gaussian Process model for regression
This is a thin wrapper around the GP class, with a set of sensible defaults
This is a thin wrapper around the models.GP class, with a set of sensible defaults
:param X: input observations
:param Y: observed values
:param kernel: a GPy kernel, defaults to rbf+white
:param kernel: a GPy kernel, defaults to rbf
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_X: False|True
:param normalize_Y: whether to normalize the output data before computing (predictions will be in original scales)
:type normalize_Y: False|True
:param Xslices: how the X,Y data co-vary in the kernel (i.e. which "outputs" they correspond to). See (link:slicing)
:rtype: model object
.. Note:: Multiple independent outputs are allowed using columns of Y
"""
def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None):
def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False):
if kernel is None:
kernel = kern.rbf(X.shape[1])
likelihood = likelihoods.Gaussian(Y,normalize=normalize_Y)
GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X, Xslices=Xslices)
GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)

View file

@ -9,7 +9,6 @@ from sparse_GP_regression import sparse_GP_regression
from GPLVM import GPLVM
from warped_GP import warpedGP
from sparse_GPLVM import sparse_GPLVM
from uncollapsed_sparse_GP import uncollapsed_sparse_GP
from Bayesian_GPLVM import Bayesian_GPLVM
from mrd import MRD
from generalized_FITC import generalized_FITC

View file

@ -9,6 +9,12 @@ from .. import kern
from scipy import stats, linalg
from sparse_GP import sparse_GP
def backsub_both_sides(L,X):
""" Return L^-T * X * L^-1, assumuing X is symmetrical and L is lower cholesky"""
tmp,_ = linalg.lapack.flapack.dtrtrs(L,np.asfortranarray(X),lower=1,trans=1)
return linalg.lapack.flapack.dtrtrs(L,np.asfortranarray(tmp.T),lower=1,trans=1)[0].T
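# A quick sanity-check sketch (hypothetical, not part of the module): for any
# symmetric X and lower Cholesky factor L, the result should match a naive
# L^-T * X * L^-1:
#   A = np.random.randn(5, 5)
#   X = A + A.T
#   L = np.linalg.cholesky(np.dot(A, A.T) + 5. * np.eye(5))
#   Li = np.linalg.inv(L)
#   assert np.allclose(backsub_both_sides(L, X), np.dot(Li.T, np.dot(X, Li)))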
class generalized_FITC(sparse_GP):
"""
Naish-Guzman, A. and Holden, S. (2008) implementation of EP with FITC.
@ -23,20 +29,19 @@ class generalized_FITC(sparse_GP):
:type X_variance: np.ndarray (N x Q) | None
:param Z: inducing inputs (optional, see note)
:type Z: np.ndarray (M x Q) | None
:param Zslices: slices for the inducing inputs (see slicing TODO: link)
:param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
:type M: int
:param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
:type normalize_(X|Y): bool
"""
def __init__(self, X, likelihood, kernel, Z, X_variance=None, Xslices=None,Zslices=None, normalize_X=False):
def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False):
self.Z = Z
self.M = self.Z.shape[0]
self._precision = likelihood.precision
self.true_precision = likelihood.precision
sparse_GP.__init__(self, X, likelihood, kernel=kernel, Z=self.Z, X_variance=None, Xslices=None,Zslices=None, normalize_X=False)
sparse_GP.__init__(self, X, likelihood, kernel=kernel, Z=self.Z, X_variance=None, normalize_X=False)
def _set_params(self, p):
self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
@ -52,13 +57,16 @@ class generalized_FITC(sparse_GP):
For a Gaussian (or direct: TODO) likelihood, no iteration is required:
this function does nothing
Diag(Knn - Qnn) is added to the noise term to use the tools already implemented in sparse_GP.
The true precision is now 'true_precision' not 'precision'.
"""
if self.has_uncertain_inputs:
raise NotImplementedError, "FITC approximation not implemented for uncertain inputs"
else:
self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0)
self._precision = self.likelihood.precision # Save the true precision
self.likelihood.precision = self._precision/(1. + self._precision*self.Diag0[:,None]) # Add the diagonal element of the FITC approximation
self.true_precision = self.likelihood.precision # Save the true precision
self.likelihood.precision = self.true_precision/(1. + self.true_precision*self.Diag0[:,None]) # Add the diagonal element of the FITC approximation
self._set_params(self._get_params()) # update the GP
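# The adjusted precision folds in the FITC diagonal correction: adding
# Diag0 = diag(Knn - Qnn) to the noise variance gives
# 1/(1/beta + Diag0) = beta/(1 + beta*Diag0), which is the expression above.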
def _FITC_computations(self):
@ -70,23 +78,23 @@ class generalized_FITC(sparse_GP):
- removes the extra terms computed in the sparse_GP approximation
- computes the likelihood gradients wrt the true precision.
"""
#NOTE the true precision is now '_precision' not 'precision'
#NOTE the true precision is now 'true_precision' not 'precision'
if self.likelihood.is_heteroscedastic:
# Compute generalized FITC's diagonal term of the covariance
self.Qnn = mdot(self.psi1.T,self.Kmmi,self.psi1)
self.Lmi,info = linalg.lapack.flapack.dtrtrs(self.Lm,np.eye(self.M),lower=1)
Lmipsi1 = np.dot(self.Lmi,self.psi1)
self.Qnn = np.dot(Lmipsi1.T,Lmipsi1)
#self.Kmmi, Lm, Lmi, Kmm_logdet = pdinv(self.Kmm)
#self.Qnn = mdot(self.psi1.T,self.Kmmi,self.psi1)
self.Diag0 = self.psi0 - np.diag(self.Qnn)
Iplus_Dprod_i = 1./(1.+ self.Diag0 * self._precision.flatten())
Iplus_Dprod_i = 1./(1.+ self.Diag0 * self.true_precision.flatten())
self.Diag = self.Diag0 * Iplus_Dprod_i
#self.Diag = self.Diag0/(1.+ self.Diag0 * self._precision.flatten())
self.P = Iplus_Dprod_i[:,None] * self.psi1.T
#self.P = (self.Diag / self.Diag0)[:,None] * self.psi1.T
self.RPT0 = np.dot(self.Lmi,self.psi1)
self.L = np.linalg.cholesky(np.eye(self.M) + np.dot(self.RPT0,((1. - Iplus_Dprod_i)/self.Diag0)[:,None]*self.RPT0.T))
#self.L = np.linalg.cholesky(np.eye(self.M) + np.dot(self.RPT0,(1./self.Diag0 - Iplus_Dprod_i/self.Diag0)[:,None]*self.RPT0.T))
#self.L = np.linalg.cholesky(np.eye(self.M) + np.dot(self.RPT0,(1./self.Diag0 - self.Diag/(self.Diag0**2))[:,None]*self.RPT0.T))
self.R,info = linalg.flapack.dtrtrs(self.L,self.Lmi,lower=1)
self.RPT = np.dot(self.R,self.P.T)
self.Sigma = np.diag(self.Diag) + np.dot(self.RPT.T,self.RPT)
@ -95,7 +103,16 @@ class generalized_FITC(sparse_GP):
self.mu = self.w + np.dot(self.P,self.gamma)
# Remove extra term from dL_dpsi1
self.dL_dpsi1 -= mdot(self.Kmmi,self.psi1*self.likelihood.precision.flatten().reshape(1,self.N)) #dB
self.dL_dpsi1 -= mdot(self.Lmi.T,Lmipsi1*self.likelihood.precision.flatten().reshape(1,self.N))
#self.Kmmi, Lm, Lmi, Kmm_logdet = pdinv(self.Kmm)
#self.dL_dpsi1 -= mdot(self.Kmmi,self.psi1*self.likelihood.precision.flatten().reshape(1,self.N)) #dB
#self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)
else:
raise NotImplementedError, "homoscedastic fitc not implemented"
# Remove extra term from dL_dpsi1
@ -141,11 +158,14 @@ class generalized_FITC(sparse_GP):
A = -0.5*self.N*self.D*np.log(2.*np.pi) +0.5*np.sum(np.log(self.likelihood.precision)) -0.5*np.sum(self.V*self.likelihood.Y)
else:
A = -0.5*self.N*self.D*(np.log(2.*np.pi) + np.log(self.likelihood._variance)) -0.5*self.likelihood.precision*self.likelihood.trYYT
C = -0.5*self.D * (self.B_logdet + self.M*np.log(sf2))
D = 0.5*np.trace(self.Cpsi1VVpsi1)
C = -self.D * (np.sum(np.log(np.diag(self.LB))) + 0.5*self.M*np.log(sf2))
#C = -0.5*self.D * (self.B_logdet + self.M*np.log(sf2))
D = 0.5*np.sum(np.square(self._LBi_Lmi_psi1V))
#self.Cpsi1VVpsi1 = np.dot(self.Cpsi1V,self.psi1V.T)
#D_ = 0.5*np.trace(self.Cpsi1VVpsi1)
return A+C+D
def _raw_predict(self, Xnew, slices, full_cov=False):
def _raw_predict(self, Xnew, which_parts, full_cov=False):
if self.likelihood.is_heteroscedastic:
"""
Make a prediction for the generalized FITC model
@ -174,16 +194,16 @@ class generalized_FITC(sparse_GP):
self.mu_H = mu_H
Sigma_H = C + np.dot(mu_u,np.dot(self.Sigma,mu_u.T))
# q(f_star|y) = N(f_star|mu_star,sigma2_star)
Kx = self.kern.K(self.Z, Xnew)
Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
KR0T = np.dot(Kx.T,self.Lmi.T)
mu_star = np.dot(KR0T,mu_H)
if full_cov:
Kxx = self.kern.K(Xnew)
Kxx = self.kern.K(Xnew,which_parts=which_parts)
var = Kxx + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.M),KR0T.T))
else:
Kxx = self.kern.Kdiag(Xnew)
Kxx_ = self.kern.K(Xnew)
var_ = Kxx_ + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.M),KR0T.T))
Kxx = self.kern.Kdiag(Xnew,which_parts=which_parts)
Kxx_ = self.kern.K(Xnew,which_parts=which_parts) # TODO: RA, is this line needed?
var_ = Kxx_ + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.M),KR0T.T)) # TODO: RA, is this line needed?
var = (Kxx + np.sum(KR0T.T*np.dot(Sigma_H - np.eye(self.M),KR0T.T),0))[:,None]
return mu_star[:,None],var
else:

View file

@ -271,90 +271,52 @@ class MRD(model):
self.Z = Z
return Z
def plot_X_1d(self, colors=None):
fig = pylab.figure(num="MRD X 1d", figsize=(min(8, (3 * len(self.bgplvms))), min(12, (2 * self.X.shape[1]))))
fig.clf()
ax1 = fig.add_subplot(self.X.shape[1], 1, 1)
if colors is None:
colors = ax1._get_lines.color_cycle
ax1.plot(self.X, c='k', alpha=.3)
plots = ax1.plot(self.X.T[0], c=colors.next())
ax1.fill_between(numpy.arange(self.X.shape[0]),
self.X.T[0] - 2 * numpy.sqrt(self.gref.X_variance.T[0]),
self.X.T[0] + 2 * numpy.sqrt(self.gref.X_variance.T[0]),
facecolor=plots[-1].get_color(),
alpha=.3)
ax1.text(1, 1, r"$\mathbf{{X_{}}}".format(1),
horizontalalignment='right',
verticalalignment='top',
transform=ax1.transAxes)
for i in range(self.X.shape[1] - 1):
ax = fig.add_subplot(self.X.shape[1], 1, i + 2)
ax.plot(self.X, c='k', alpha=.3)
plots.extend(ax.plot(self.X.T[i + 1], c=colors.next()))
ax.fill_between(numpy.arange(self.X.shape[0]),
self.X.T[i + 1] - 2 * numpy.sqrt(self.gref.X_variance.T[i + 1]),
self.X.T[i + 1] + 2 * numpy.sqrt(self.gref.X_variance.T[i + 1]),
facecolor=plots[-1].get_color(),
alpha=.3)
if i < self.X.shape[1] - 2:
ax.set_xticklabels('')
ax1.set_xticklabels('')
# ax1.legend(plots, [r"$\mathbf{{X_{}}}$".format(i + 1) for i in range(self.X.shape[1])],
# bbox_to_anchor=(0., 1 + .01 * self.X.shape[1],
# 1., 1. + .01 * self.X.shape[1]), loc=3,
# ncol=self.X.shape[1], mode="expand", borderaxespad=0.)
def _handle_plotting(self, fig_num, axes, plotf):
if axes is None:
fig = pylab.figure(num=fig_num, figsize=(4 * len(self.bgplvms), 3 * len(self.bgplvms)))
for i, g in enumerate(self.bgplvms):
if axes is None:
ax = fig.add_subplot(1, len(self.bgplvms), i + 1)
else:
ax = axes[i]
plotf(i, g, ax)
pylab.draw()
fig.tight_layout(h_pad=.01, rect=(0, 0, 1, .95))
if axes is None:
fig.tight_layout()
return fig
else:
return pylab.gcf()
def plot_X(self, fig_num="MRD Predictions", axes=None):
fig = self._handle_plotting(fig_num, axes, lambda i, g, ax: ax.imshow(g.X))
return fig
def plot_X(self):
fig = pylab.figure("MRD X", figsize=(4 * len(self.bgplvms), 3))
fig.clf()
for i, g in enumerate(self.bgplvms):
ax = fig.add_subplot(1, len(self.bgplvms), i + 1)
ax.imshow(g.X)
pylab.draw()
fig.tight_layout()
def plot_predict(self, fig_num="MRD Predictions", axes=None):
fig = self._handle_plotting(fig_num, axes, lambda i, g, ax: ax.imshow(g.predict(g.X)[0]))
return fig
def plot_predict(self):
fig = pylab.figure("MRD Predictions", figsize=(4 * len(self.bgplvms), 3))
fig.clf()
for i, g in enumerate(self.bgplvms):
ax = fig.add_subplot(1, len(self.bgplvms), i + 1)
ax.imshow(g.predict(g.X)[0])
pylab.draw()
fig.tight_layout()
def plot_scales(self, fig_num="MRD Scales", axes=None, *args, **kwargs):
fig = self._handle_plotting(fig_num, axes, lambda i, g, ax: g.kern.plot_ARD(ax=ax, *args, **kwargs))
return fig
def plot_scales(self, *args, **kwargs):
fig = pylab.figure("MRD Scales", figsize=(4 * len(self.bgplvms), 3))
fig.clf()
for i, g in enumerate(self.bgplvms):
ax = fig.add_subplot(1, len(self.bgplvms), i + 1)
g.kern.plot_ARD(ax=ax, *args, **kwargs)
pylab.draw()
fig.tight_layout()
return fig
def plot_latent(self, *args, **kwargs):
fig = pylab.figure("MRD Latent Spaces", figsize=(4 * len(self.bgplvms), 3))
fig.clf()
for i, g in enumerate(self.bgplvms):
ax = fig.add_subplot(1, len(self.bgplvms), i + 1)
g.plot_latent(ax=ax, *args, **kwargs)
pylab.draw()
fig.tight_layout()
def plot_latent(self, fig_num="MRD Latent Spaces", axes=None, *args, **kwargs):
fig = self._handle_plotting(fig_num, axes, lambda i, g, ax: g.plot_latent(ax=ax, *args, **kwargs))
return fig
def _debug_plot(self):
self.plot_X()
self.plot_X_1d()
self.plot_latent()
self.plot_scales()
fig = pylab.figure("MRD DEBUG PLOT", figsize=(4 * len(self.bgplvms), 9))
fig.clf()
axes = [fig.add_subplot(3, len(self.bgplvms), i + 1) for i in range(len(self.bgplvms))]
self.plot_X(axes=axes)
axes = [fig.add_subplot(3, len(self.bgplvms), i + len(self.bgplvms) + 1) for i in range(len(self.bgplvms))]
self.plot_latent(axes=axes)
axes = [fig.add_subplot(3, len(self.bgplvms), i + 2 * len(self.bgplvms) + 1) for i in range(len(self.bgplvms))]
self.plot_scales(axes=axes)
pylab.draw()
fig.tight_layout()
def _debug_optimize(self, opt='scg', maxiters=500, itersteps=10):
def _debug_optimize(self, opt='scg', maxiters=5000, itersteps=10):
iters = 0
optstep = lambda: self.optimize(opt, messages=1, max_f_eval=itersteps)
self._debug_plot()

View file

@ -3,15 +3,16 @@
import numpy as np
import pylab as pb
from ..util.linalg import mdot, jitchol, chol_inv, pdinv, trace_dot
from ..util.linalg import mdot, jitchol, tdot, symmetrify
from ..util.plot import gpplot
from .. import kern
from GP import GP
from scipy import linalg
#Still TODO:
# make use of slices properly (kernel can now do this)
# enable heteroscedatic noise (kernel will need to compute psi2 as a (NxMxM) array)
def backsub_both_sides(L,X):
""" Return L^-T * X * L^-1, assumuing X is symmetrical and L is lower cholesky"""
tmp,_ = linalg.lapack.flapack.dtrtrs(L,np.asfortranarray(X),lower=1,trans=1)
return linalg.lapack.flapack.dtrtrs(L,np.asfortranarray(tmp.T),lower=1,trans=1)[0].T
class sparse_GP(GP):
"""
@ -27,19 +28,16 @@ class sparse_GP(GP):
:type X_variance: np.ndarray (N x Q) | None
:param Z: inducing inputs (optional, see note)
:type Z: np.ndarray (M x Q) | None
:param Zslices: slices for the inducing inputs (see slicing TODO: link)
:param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
:type M: int
:param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
:type normalize_(X|Y): bool
"""
def __init__(self, X, likelihood, kernel, Z, X_variance=None, Xslices=None,Zslices=None, normalize_X=False):
def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False):
self.scale_factor = 100.0 # a scaling factor to help keep the algorithm stable
self.auto_scale_factor = False
self.Z = Z
self.Zslices = Zslices
self.Xslices = Xslices
self.M = Z.shape[0]
self.likelihood = likelihood
@ -50,10 +48,7 @@ class sparse_GP(GP):
self.has_uncertain_inputs=True
self.X_variance = X_variance
if not self.likelihood.is_heteroscedastic:
self.likelihood.trYYT = np.trace(np.dot(self.likelihood.Y, self.likelihood.Y.T)) # TODO: something more elegant here?
GP.__init__(self, X, likelihood, kernel=kernel, normalize_X=normalize_X, Xslices=Xslices)
GP.__init__(self, X, likelihood, kernel=kernel, normalize_X=normalize_X)
#normalize X uncertainty also
if self.has_uncertain_inputs:
@ -68,87 +63,89 @@ class sparse_GP(GP):
self.psi1 = self.kern.psi1(self.Z,self.X, self.X_variance).T
self.psi2 = self.kern.psi2(self.Z,self.X, self.X_variance)
else:
self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices)
self.psi0 = self.kern.Kdiag(self.X)
self.psi1 = self.kern.K(self.Z,self.X)
self.psi2 = None
def _computations(self):
#TODO: find routine to multiply triangular matrices
#TODO: slices for psi statistics (easy enough)
sf = self.scale_factor
sf2 = sf**2
#The rather complex computations of psi2_beta_scaled
#factor Kmm
self.Lm = jitchol(self.Kmm)
#The rather complex computations of self.A
if self.likelihood.is_heteroscedastic:
assert self.likelihood.D == 1 #TODO: what if the likelihood is heteroscedastic and there are multiple independent outputs?
if self.has_uncertain_inputs:
self.psi2_beta_scaled = (self.psi2*(self.likelihood.precision.flatten().reshape(self.N,1,1)/sf2)).sum(0)
psi2_beta_scaled = (self.psi2*(self.likelihood.precision.flatten().reshape(self.N,1,1)/sf2)).sum(0)
evals, evecs = linalg.eigh(psi2_beta_scaled)
clipped_evals = np.clip(evals,0.,1e6) # TODO: make clipping configurable
if not np.allclose(evals, clipped_evals):
print "Warning: clipping posterior eigenvalues"
tmp = evecs*np.sqrt(clipped_evals)
tmp, _ = linalg.lapack.flapack.dtrtrs(self.Lm,np.asfortranarray(tmp),lower=1)
self.A = tdot(tmp)
else:
tmp = self.psi1*(np.sqrt(self.likelihood.precision.flatten().reshape(1,self.N))/sf)
self.psi2_beta_scaled = np.dot(tmp,tmp.T)
tmp, _ = linalg.lapack.flapack.dtrtrs(self.Lm,np.asfortranarray(tmp),lower=1)
self.A = tdot(tmp)
else:
if self.has_uncertain_inputs:
self.psi2_beta_scaled = (self.psi2*(self.likelihood.precision/sf2)).sum(0)
psi2_beta_scaled = (self.psi2*(self.likelihood.precision/sf2)).sum(0)
evals, evecs = linalg.eigh(psi2_beta_scaled)
clipped_evals = np.clip(evals,0.,1e6) # TODO: make clipping configurable
if not np.allclose(evals, clipped_evals):
print "Warning: clipping posterior eigenvalues"
tmp = evecs*np.sqrt(clipped_evals)
tmp, _ = linalg.lapack.flapack.dtrtrs(self.Lm,np.asfortranarray(tmp),lower=1)
self.A = tdot(tmp)
else:
tmp = self.psi1*(np.sqrt(self.likelihood.precision)/sf)
self.psi2_beta_scaled = np.dot(tmp,tmp.T)
tmp, _ = linalg.lapack.flapack.dtrtrs(self.Lm,np.asfortranarray(tmp),lower=1)
self.A = tdot(tmp)
self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
#factor B
self.B = np.eye(self.M)/sf2 + self.A
self.LB = jitchol(self.B)
self.V = (self.likelihood.precision/self.scale_factor)*self.likelihood.Y
#Compute A = L^-1 psi2 beta L^-T
#self. A = mdot(self.Lmi,self.psi2_beta_scaled,self.Lmi.T)
tmp = linalg.lapack.flapack.dtrtrs(self.Lm,self.psi2_beta_scaled.T,lower=1)[0]
self.A = linalg.lapack.flapack.dtrtrs(self.Lm,np.asarray(tmp.T,order='F'),lower=1)[0]
self.B = np.eye(self.M)/sf2 + self.A
self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)
self.psi1V = np.dot(self.psi1, self.V)
#tmp = np.dot(self.Lmi.T, self.LBi.T)
tmp = linalg.lapack.clapack.dtrtrs(self.Lm.T,np.asarray(self.LBi.T,order='C'),lower=0)[0]
self.C = np.dot(tmp,tmp.T) #TODO: tmp is triangular. replace with dtrmm (blas) when available
self.Cpsi1V = np.dot(self.C,self.psi1V)
self.Cpsi1VVpsi1 = np.dot(self.Cpsi1V,self.psi1V.T)
#self.E = np.dot(self.Cpsi1VVpsi1,self.C)/sf2
self.E = np.dot(self.Cpsi1V/sf,self.Cpsi1V.T/sf)
# Compute dL_dpsi # FIXME: this is untested for the heteroscedastic + uncertain inputs case
self.dL_dpsi0 = - 0.5 * self.D * (self.likelihood.precision * np.ones([self.N,1])).flatten()
self.dL_dpsi1 = np.dot(self.Cpsi1V,self.V.T)
if self.likelihood.is_heteroscedastic:
if self.has_uncertain_inputs:
self.dL_dpsi2 = 0.5 * self.likelihood.precision[:,None,None] * self.D * self.Kmmi[None,:,:] # dB
self.dL_dpsi2 += - 0.5 * self.likelihood.precision[:,None,None]/sf2 * self.D * self.C[None,:,:] # dC
self.dL_dpsi2 += - 0.5 * self.likelihood.precision[:,None,None]* self.E[None,:,:] # dD
else:
self.dL_dpsi1 += mdot(self.Kmmi,self.psi1*self.likelihood.precision.flatten().reshape(1,self.N)) #dB
self.dL_dpsi1 += -mdot(self.C,self.psi1*self.likelihood.precision.flatten().reshape(1,self.N)/sf2) #dC
self.dL_dpsi1 += -mdot(self.E,self.psi1*self.likelihood.precision.flatten().reshape(1,self.N)) #dD
self.dL_dpsi2 = None
else:
self.dL_dpsi2 = 0.5 * self.likelihood.precision * self.D * self.Kmmi # dB
self.dL_dpsi2 += - 0.5 * self.likelihood.precision/sf2 * self.D * self.C # dC
self.dL_dpsi2 += - 0.5 * self.likelihood.precision * self.E # dD
if self.has_uncertain_inputs:
#repeat for each of the N psi_2 matrices
self.dL_dpsi2 = np.repeat(self.dL_dpsi2[None,:,:],self.N,axis=0)
else:
self.dL_dpsi1 += 2.*np.dot(self.dL_dpsi2,self.psi1)
self.dL_dpsi2 = None
#back-substitute C into psi1V
tmp,info1 = linalg.lapack.flapack.dtrtrs(self.Lm,np.asfortranarray(self.psi1V),lower=1,trans=0)
self._LBi_Lmi_psi1V,_ = linalg.lapack.flapack.dtrtrs(self.LB,np.asfortranarray(tmp),lower=1,trans=0)
tmp,info2 = linalg.lapack.flapack.dpotrs(self.LB,tmp,lower=1)
self.Cpsi1V,info3 = linalg.lapack.flapack.dtrtrs(self.Lm,tmp,lower=1,trans=1)
# Compute dL_dKmm
#self.dL_dKmm_old = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi)*sf2 # dB
#self.dL_dKmm += -0.5 * self.D * (- self.C/sf2 - 2.*mdot(self.C, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC
#self.dL_dKmm += np.dot(np.dot(self.E*sf2, self.psi2_beta_scaled) - self.Cpsi1VVpsi1, self.Kmmi) + 0.5*self.E # dD
tmp = linalg.lapack.flapack.dtrtrs(self.Lm,np.asfortranarray(self.A),lower=1,trans=1)[0]
self.dL_dKmm = -0.5*self.D*sf2*linalg.lapack.flapack.dtrtrs(self.Lm,np.asfortranarray(tmp.T),lower=1,trans=1)[0] #dA
self.dL_dKmm += 0.5*(self.D*(self.C/sf2 -self.Kmmi) + self.E) + np.dot(np.dot(self.D*self.C + self.E*sf2,self.psi2_beta_scaled) - self.Cpsi1VVpsi1,self.Kmmi) # d(C+D)
tmp = tdot(self._LBi_Lmi_psi1V)
self.DBi_plus_BiPBi = backsub_both_sides(self.LB, self.D*np.eye(self.M) + tmp)
tmp = -0.5*self.DBi_plus_BiPBi/sf2
tmp += -0.5*self.B*sf2*self.D
tmp += self.D*np.eye(self.M)
self.dL_dKmm = backsub_both_sides(self.Lm,tmp)
# Compute dL_dpsi # FIXME: this is untested for the heteroscedastic + uncertain inputs case
self.dL_dpsi0 = - 0.5 * self.D * (self.likelihood.precision * np.ones([self.N,1])).flatten()
self.dL_dpsi1 = np.dot(self.Cpsi1V,self.V.T)
dL_dpsi2_beta = 0.5*backsub_both_sides(self.Lm,self.D*np.eye(self.M) - self.DBi_plus_BiPBi)
if self.likelihood.is_heteroscedastic:
if self.has_uncertain_inputs:
self.dL_dpsi2 = self.likelihood.precision[:,None,None]*dL_dpsi2_beta[None,:,:]
else:
self.dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta,self.psi1*self.likelihood.precision.reshape(1,self.N))
self.dL_dpsi2 = None
else:
dL_dpsi2 = self.likelihood.precision*dL_dpsi2_beta
if self.has_uncertain_inputs:
#repeat for each of the N psi_2 matrices
self.dL_dpsi2 = np.repeat(dL_dpsi2[None,:,:],self.N,axis=0)
else:
#subsume back into psi1 (==Kmn)
self.dL_dpsi1 += 2.*np.dot(dL_dpsi2,self.psi1)
self.dL_dpsi2 = None
#the partial derivative vector for the likelihood
if self.likelihood.Nparams ==0:
@ -156,16 +153,11 @@ class sparse_GP(GP):
self.partial_for_likelihood = None
elif self.likelihood.is_heteroscedastic:
raise NotImplementedError, "heteroscedatic derivates not implemented"
#self.partial_for_likelihood = - 0.5 * self.D*self.likelihood.precision + 0.5 * (self.likelihood.Y**2).sum(1)*self.likelihood.precision**2 #dA
#self.partial_for_likelihood += 0.5 * self.D * (self.psi0*self.likelihood.precision**2 - (self.psi2*self.Kmmi[None,:,:]*self.likelihood.precision[:,None,None]**2).sum(1).sum(1)/sf2) #dB
#self.partial_for_likelihood += 0.5 * self.D * np.sum(self.Bi*self.A)*self.likelihood.precision #dC
#self.partial_for_likelihood += -np.diag(np.dot((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) , self.psi1VVpsi1 ))*self.likelihood.precision #dD
else:
#likelihood is not heteroscedastic
self.partial_for_likelihood = - 0.5 * self.N*self.D*self.likelihood.precision + 0.5 * np.sum(np.square(self.likelihood.Y))*self.likelihood.precision**2
self.partial_for_likelihood = - 0.5 * self.N*self.D*self.likelihood.precision + 0.5 * self.likelihood.trYYT*self.likelihood.precision**2
self.partial_for_likelihood += 0.5 * self.D * (self.psi0.sum()*self.likelihood.precision**2 - np.trace(self.A)*self.likelihood.precision*sf2)
self.partial_for_likelihood += 0.5 * self.D * trace_dot(self.Bi,self.A)*self.likelihood.precision
self.partial_for_likelihood += self.likelihood.precision*(0.5*trace_dot(self.psi2_beta_scaled,self.E*sf2) - np.trace(self.Cpsi1VVpsi1))
self.partial_for_likelihood += self.likelihood.precision*(0.5*np.sum(self.A*self.DBi_plus_BiPBi) - np.sum(np.square(self._LBi_Lmi_psi1V)))
@ -178,8 +170,8 @@ class sparse_GP(GP):
else:
A = -0.5*self.N*self.D*(np.log(2.*np.pi) + np.log(self.likelihood._variance)) -0.5*self.likelihood.precision*self.likelihood.trYYT
B = -0.5*self.D*(np.sum(self.likelihood.precision*self.psi0) - np.trace(self.A)*sf2)
C = -0.5*self.D * (self.B_logdet + self.M*np.log(sf2))
D = 0.5*np.trace(self.Cpsi1VVpsi1)
C = -self.D * (np.sum(np.log(np.diag(self.LB))) + 0.5*self.M*np.log(sf2))
D = 0.5*np.sum(np.square(self._LBi_Lmi_psi1V))
return A+B+C+D
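# The C and D terms use Cholesky identities: log|B| = 2*sum(log(diag(LB))),
# so -0.5*self.D*B_logdet becomes -self.D*sum(log(diag(LB))); and
# trace(C psi1V psi1V^T) = ||LB^-1 Lm^-1 psi1V||_F^2, read off from the
# already-factored _LBi_Lmi_psi1V instead of forming the full product.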
def _set_params(self, p):
@ -187,13 +179,14 @@ class sparse_GP(GP):
self.kern._set_params(p[self.Z.size:self.Z.size+self.kern.Nparam])
self.likelihood._set_params(p[self.Z.size+self.kern.Nparam:])
self._compute_kernel_matrices()
if self.auto_scale_factor:
self.scale_factor = np.sqrt(self.psi2.sum(0).mean()*self.likelihood.precision)
#if self.auto_scale_factor:
# if self.likelihood.is_heteroscedastic:
# self.scale_factor = max(1,np.sqrt(self.psi2_beta_scaled.sum(0).mean()))
# else:
# self.scale_factor = np.sqrt(self.psi2.sum(0).mean()*self.likelihood.precision)
# self.scale_factor = np.sqrt(self.psi2.sum(0).mean()*self.likelihood.precision)
#if self.auto_scale_factor:
#if self.likelihood.is_heteroscedastic:
#self.scale_factor = max(100,np.sqrt(self.psi2_beta_scaled.sum(0).mean()))
#else:
#self.scale_factor = np.sqrt(self.psi2.sum(0).mean()*self.likelihood.precision)
self.scale_factor = 1.
self._computations()
def _get_params(self):
@ -239,24 +232,28 @@ class sparse_GP(GP):
"""
The derivative of the bound wrt the inducing inputs Z
"""
dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z)#factor of two because of vertical and horizontal 'stripes' in dKmm_dZ
dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm, self.Z) # factor of two because of vertical and horizontal 'stripes' in dKmm_dZ
if self.has_uncertain_inputs:
dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1,self.Z,self.X, self.X_variance)
dL_dZ += 2.*self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_variance) # 'stripes'
dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1, self.Z, self.X, self.X_variance)
dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2, self.Z, self.X, self.X_variance)
else:
dL_dZ += self.kern.dK_dX(self.dL_dpsi1,self.Z,self.X)
dL_dZ += self.kern.dK_dX(self.dL_dpsi1, self.Z, self.X)
return dL_dZ
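The factor of two on the Kmm term arises because Z enters K(Z, Z) through both arguments while dK_dX differentiates only the first. A finite-difference sketch of this (assuming kern.K and the one-sided kern.dK_dX with the signatures used in this file; illustrative only):

import numpy as np
import GPy

k = GPy.kern.rbf(1)
Z = np.random.randn(5, 1)
G = np.random.randn(5, 5); G = G + G.T          # symmetric, like dL_dKmm
eps = 1e-6
num = np.zeros_like(Z)
for i in range(5):
    d = np.zeros_like(Z); d[i, 0] = eps
    num[i, 0] = (np.sum(G * k.K(Z + d)) - np.sum(G * k.K(Z - d))) / (2. * eps)
assert np.allclose(num, 2. * k.dK_dX(G, Z), atol=1e-4)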
def _raw_predict(self, Xnew, slices, full_cov=False):
def _raw_predict(self, Xnew, which_parts='all', full_cov=False):
"""Internal helper function for making predictions, does not account for normalization"""
Kx = self.kern.K(self.Z, Xnew)
mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V)
Bi,_ = linalg.lapack.flapack.dpotri(self.LB,lower=0) # NOTE: this lower switch should be 1, but that doesn't work!
symmetrify(Bi)
Kmmi_LmiBLmi = backsub_both_sides(self.Lm,np.eye(self.M) - Bi)
Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
mu = np.dot(Kx.T, self.Cpsi1V/self.scale_factor)
if full_cov:
Kxx = self.kern.K(Xnew)
var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) #NOTE this won't work for plotting
Kxx = self.kern.K(Xnew,which_parts=which_parts)
var = Kxx - mdot(Kx.T, Kmmi_LmiBLmi, Kx) #NOTE this won't work for plotting
else:
Kxx = self.kern.Kdiag(Xnew)
var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0)
Kxx = self.kern.Kdiag(Xnew,which_parts=which_parts)
var = Kxx - np.sum(Kx*np.dot(Kmmi_LmiBLmi, Kx),0)
return mu,var[:,None]
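For reference on the dpotri/symmetrify pattern above: dpotri computes the inverse from a Cholesky factor but only fills one triangle of the result, hence the symmetrify call. A standalone sketch of the pattern using scipy's LAPACK wrapper (illustrative, not this file's exact code path):

import numpy as np
from scipy import linalg

A = np.array([[4., 1.], [1., 3.]])
L = linalg.cholesky(A, lower=True)            # A = L L.T
Ai, info = linalg.lapack.dpotri(L, lower=1)   # inverse from the factor, one triangle filled
Ai = np.tril(Ai) + np.tril(Ai, -1).T          # what symmetrify does, in effect
assert info == 0 and np.allclose(Ai, np.linalg.inv(A))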

View file

@ -13,7 +13,7 @@ class sparse_GP_regression(sparse_GP):
"""
Gaussian Process model for regression
This is a thin wrapper around the GP class, with a set of sensible defaults
This is a thin wrapper around the sparse_GP class, with a set of sensible defaults
:param X: input observations
:param Y: observed values
@ -22,25 +22,25 @@ class sparse_GP_regression(sparse_GP):
:type normalize_X: False|True
:param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_Y: False|True
:param Xslices: how the X,Y data co-vary in the kernel (i.e. which "outputs" they correspond to). See (link:slicing)
:rtype: model object
.. Note:: Multiple independent outputs are allowed using columns of Y
"""
def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None,Z=None, M=10):
#kern defaults to rbf
def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False, Z=None, M=10):
#kern defaults to rbf (plus white for stability)
if kernel is None:
kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1],1e-3)
#Z defaults to a subset of the data
if Z is None:
Z = np.random.permutation(X.copy())[:M]
i = np.random.permutation(X.shape[0])[:M]
Z = X[i].copy()
else:
assert Z.shape[1]==X.shape[1]
#likelihood defaults to Gaussian
likelihood = likelihoods.Gaussian(Y,normalize=normalize_Y)
sparse_GP.__init__(self, X, likelihood, kernel, Z, normalize_X=normalize_X, Xslices=Xslices)
sparse_GP.__init__(self, X, likelihood, kernel, Z, normalize_X=normalize_X)
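A minimal usage sketch for this wrapper (assuming it is exposed as GPy.models.sparse_GP_regression, as in this version of the library):

import numpy as np
import GPy

X = np.random.uniform(-3., 3., (50, 1))
Y = np.sin(X) + 0.05 * np.random.randn(50, 1)
m = GPy.models.sparse_GP_regression(X, Y, M=10)   # Z defaults to a random subset of X
m.optimize()
print m   # Python 2 print, matching the codebase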

View file

@ -1,151 +0,0 @@
# Copyright (c) 2012 James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from ..util.linalg import mdot, jitchol, chol_inv, pdinv
from .. import kern
from ..likelihoods import likelihood
from sparse_GP import sparse_GP
class uncollapsed_sparse_GP(sparse_GP):
"""
Variational sparse GP model (Regression), where the approximating distribution q(u) is represented explicitly
:param X: inputs
:type X: np.ndarray (N x Q)
:param likelihood: GPy likelihood class, containing observed data
:param q_u: canonical parameters of the distribution squashed into a 1D array
:type q_u: np.ndarray
:param kernel : the kernel/covariance function. See link kernels
:type kernel: a GPy kernel
:param Z: inducing inputs (optional, see note)
:type Z: np.ndarray (M x Q) | None
:param Zslices: slices for the inducing inputs (see slicing TODO: link)
:param normalize_X : whether to normalize the data before computing (predictions will be in original scales)
:type normalize_X: bool
"""
def __init__(self, X, likelihood, kernel, Z, q_u=None, **kwargs):
self.M = Z.shape[0]
if q_u is None:
q_u = np.hstack((np.random.randn(self.M*likelihood.D),-0.5*np.eye(self.M).flatten()))
self.likelihood = likelihood
self.set_vb_param(q_u)
sparse_GP.__init__(self, X, likelihood, kernel, Z, **kwargs)
def _computations(self):
# kernel computations, using BGPLVM notation
self.Kmm = self.kern.K(self.Z)
if self.has_uncertain_inputs:
raise NotImplementedError
else:
self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices)
self.psi1 = self.kern.K(self.Z,self.X)
if self.likelihood.is_heteroscedastic:
raise NotImplementedError
else:
tmp = self.psi1*(np.sqrt(self.likelihood.precision)/self.scale_factor)
self.psi2_beta_scaled = np.dot(tmp,tmp.T)
self.psi2 = self.psi1.T[:,:,None]*self.psi1.T[:,None,:]
self.V = self.likelihood.precision*self.Y
self.VmT = np.dot(self.V,self.q_u_expectation[0].T)
self.psi1V = np.dot(self.psi1, self.V)
self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
self.A = mdot(self.Lmi, self.beta*self.psi2, self.Lmi.T)
self.B = np.eye(self.M) + self.A
self.Lambda = mdot(self.Lmi.T,self.B,self.Lmi)
self.trace_K = self.psi0 - np.trace(self.A)/self.beta
self.projected_mean = mdot(self.psi1.T,self.Kmmi,self.q_u_expectation[0])
# Compute dL_dpsi
self.dL_dpsi0 = - 0.5 * self.likelihood.D * self.beta * np.ones(self.N)
self.dL_dpsi1 = np.dot(self.VmT,self.Kmmi).T # This is the correct term for E I think...
self.dL_dpsi2 = 0.5 * self.beta * self.likelihood.D * (self.Kmmi - mdot(self.Kmmi,self.q_u_expectation[1],self.Kmmi))
# Compute dL_dKmm
tmp = self.beta*mdot(self.psi2,self.Kmmi,self.q_u_expectation[1]) -np.dot(self.q_u_expectation[0],self.psi1V.T)
tmp += tmp.T
tmp += self.likelihood.D*(-self.beta*self.psi2 - self.Kmm + self.q_u_expectation[1])
self.dL_dKmm = 0.5*mdot(self.Kmmi,tmp,self.Kmmi)
#Compute the gradient of the log likelihood wrt noise variance
#TODO: support heteroscedastic noise
dbeta = 0.5 * self.N*self.likelihood.D/self.beta
dbeta += - 0.5 * self.likelihood.D * self.trace_K
dbeta += - 0.5 * self.likelihood.D * np.sum(self.q_u_expectation[1]*mdot(self.Kmmi,self.psi2,self.Kmmi))
dbeta += - 0.5 * self.trYYT
dbeta += np.sum(np.dot(self.Y.T,self.projected_mean))
self.partial_for_likelihood = -dbeta*self.likelihood.precision**2
def log_likelihood(self):
"""
Compute the (lower bound on the) log marginal likelihood
"""
A = -0.5*self.N*self.likelihood.D*(np.log(2.*np.pi) - np.log(self.beta))
B = -0.5*self.beta*self.likelihood.D*self.trace_K
C = -0.5*self.likelihood.D *(self.Kmm_logdet-self.q_u_logdet + np.sum(self.Lambda * self.q_u_expectation[1]) - self.M)
D = -0.5*self.beta*self.trYYT
E = np.sum(np.dot(self.V.T,self.projected_mean))
return A+B+C+D+E
def _raw_predict(self, Xnew, slices,full_cov=False):
"""Internal helper function for making predictions, does not account for normalization"""
Kx = self.kern.K(Xnew,self.Z)
mu = mdot(Kx,self.Kmmi,self.q_u_expectation[0])
tmp = self.Kmmi- mdot(self.Kmmi,self.q_u_cov,self.Kmmi)
if full_cov:
Kxx = self.kern.K(Xnew)
var = Kxx - mdot(Kx,tmp,Kx.T)
else:
Kxx = self.kern.Kdiag(Xnew)
var = (Kxx - np.sum(Kx*np.dot(Kx,tmp),1))[:,None]
return mu,var
def set_vb_param(self,vb_param):
"""set the distribution q(u) from the canonical parameters"""
self.q_u_prec = -2.*vb_param[-self.M**2:].reshape(self.M, self.M)
self.q_u_cov, q_u_Li, q_u_L, tmp = pdinv(self.q_u_prec)
self.q_u_logdet = -tmp
self.q_u_mean = np.dot(self.q_u_cov,vb_param[:self.M*self.likelihood.D].reshape(self.M,self.likelihood.D))
self.q_u_expectation = (self.q_u_mean, np.dot(self.q_u_mean,self.q_u_mean.T)+self.q_u_cov*self.likelihood.D)
self.q_u_canonical = (np.dot(self.q_u_prec, self.q_u_mean),-0.5*self.q_u_prec)
#TODO: computations now?
def get_vb_param(self):
"""
Return the canonical parameters of the distribution q(u)
"""
return np.hstack([e.flatten() for e in self.q_u_canonical])
def vb_grad_natgrad(self):
"""
Compute the gradients of the lower bound wrt the canonical and
expectation parameters of u.
Note that the natural gradient in either parameterization is given by the gradient in the other (see Hensman et al. 2012, "Fast Variational Inference in the Conjugate Exponential Family").
"""
dL_dmmT_S = -0.5*self.Lambda-self.q_u_canonical[1]
dL_dm = np.dot(self.Kmmi,self.psi1V) - np.dot(self.Lambda,self.q_u_mean)
#dL_dSim =
#dL_dmhSi =
return np.hstack((dL_dm.flatten(),dL_dmmT_S.flatten())) # natgrad only, grad TODO
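The claim in the docstring can be verified numerically. A standalone sketch (not GPy code) for a 1D Gaussian q = N(m, S), with canonical parameters theta = (m/S, -1/(2S)) and expectation parameters eta = (m, m**2 + S); the natural gradient in theta, F(theta)^-1 * grad_theta, matches the plain gradient in eta:

import numpy as np

def theta_to_ms(t):                   # canonical -> (mean, variance)
    S = -0.5 / t[1]
    return t[0] * S, S

def eta_of_theta(t):                  # canonical -> expectation parameters
    m, S = theta_to_ms(t)
    return np.array([m, m ** 2 + S])

def eta_to_ms(e):                     # expectation -> (mean, variance)
    return e[0], e[1] - e[0] ** 2

def L(m, S):                          # an arbitrary smooth objective
    return -0.5 * (m - 1.) ** 2 - 0.3 * S + np.log(S)

def num_grad(f, x, eps=1e-6):
    g = np.zeros_like(x)
    for i in range(x.size):
        d = np.zeros_like(x); d[i] = eps
        g[i] = (f(x + d) - f(x - d)) / (2. * eps)
    return g

theta = np.array([0.4, -0.8])         # any valid canonical parameters (t[1] < 0)
eta = eta_of_theta(theta)

g_theta = num_grad(lambda t: L(*theta_to_ms(t)), theta)
g_eta = num_grad(lambda e: L(*eta_to_ms(e)), eta)

# the Fisher matrix in theta is the (symmetric) Jacobian d(eta)/d(theta)
F = np.column_stack([(eta_of_theta(theta + d) - eta_of_theta(theta - d)) / (2. * 1e-6)
                     for d in np.eye(2) * 1e-6])

assert np.allclose(np.linalg.solve(F, g_theta), g_eta, atol=1e-4)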
def plot(self, *args, **kwargs):
"""
add the distribution q(u) to the plot from sparse_GP
"""
sparse_GP.plot(self,*args,**kwargs)
if self.Q==1:
pb.errorbar(self.Z[:,0],self.q_u_expectation[0][:,0],yerr=2.*np.sqrt(np.diag(self.q_u_cov)),fmt=None,ecolor='b')

View file

@ -14,7 +14,7 @@ from .. import likelihoods
from .. import kern
class warpedGP(GP):
def __init__(self, X, Y, kernel=None, warping_function = None, warping_terms = 3, normalize_X=False, normalize_Y=False, Xslices=None):
def __init__(self, X, Y, kernel=None, warping_function = None, warping_terms = 3, normalize_X=False, normalize_Y=False):
if kernel is None:
kernel = kern.rbf(X.shape[1])
@ -29,7 +29,7 @@ class warpedGP(GP):
self.predict_in_warped_space = False
likelihood = likelihoods.Gaussian(self.transform_data(), normalize=normalize_Y)
GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X, Xslices=Xslices)
GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
def _scale_data(self, Y):
self._Ymax = Y.max()