merged conflict in tutorial's tests

2026-06-11 15:15:15 +02:00 · 2013-06-05 17:31:07 +01:00 · 2013-06-05 17:31:07 +01:00 · a56072b159
commit a56072b159
parent 73a122362f efbf169a6a
75 changed files with 1964 additions and 1674 deletions
--- a/GPy/core/init.py
+++ b/GPy/core/init.py
@ -1,8 +1,9 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from GP import GP
-from sparse_GP import sparse_GP
 from model import *
 from parameterised import *
 import priors
+from GPy.core.gp import GP
+from GPy.core.sparse_gp import SparseGP
+from fitc import FITC
--- a/GPy/models/FITC.py
+++ b/GPy/models/FITC.py
@ -7,57 +7,62 @@ from ..util.linalg import mdot, jitchol, chol_inv, tdot, symmetrify,pdinv
 from ..util.plot import gpplot
 from .. import kern
 from scipy import stats, linalg
-from ..core import sparse_GP
+from sparse_gp import SparseGP

-def backsub_both_sides(L,X):
-    """ Return L^-T * X * L^-1, assumuing X is symmetrical and L is lower cholesky"""
-    tmp,_ = linalg.lapack.flapack.dtrtrs(L,np.asfortranarray(X),lower=1,trans=1)
-    return linalg.lapack.flapack.dtrtrs(L,np.asfortranarray(tmp.T),lower=1,trans=1)[0].T
+class FITC(SparseGP):
+    """
+    sparse FITC approximation

-class FITC(sparse_GP):
+    :param X: inputs
+    :type X: np.ndarray (num_data x Q)
+    :param likelihood: a likelihood instance, containing the observed data
+    :type likelihood: GPy.likelihood.(Gaussian | EP)
+    :param kernel : the kernel (covariance function). See link kernels
+    :type kernel: a GPy.kern.kern instance
+    :param Z: inducing inputs (optional, see note)
+    :type Z: np.ndarray (M x Q) | None
+    :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
+    :type normalize_(X|Y): bool
+    """

-    def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False):
-        super(FITC, self).__init__(X, likelihood, kernel, normalize_X=normalize_X)
+    def __init__(self, X, likelihood, kernel, Z, normalize_X=False):
+        SparseGP.__init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False)
+        assert self.output_dim == 1, "FITC model is not defined for handling multiple outputs"

    def update_likelihood_approximation(self):
        """
-        Approximates a non-gaussian likelihood using Expectation Propagation
+        Approximates a non-Gaussian likelihood using Expectation Propagation

-        For a Gaussian (or direct: TODO) likelihood, no iteration is required:
+        For a Gaussian likelihood, no iteration is required:
        this function does nothing
-
-        Diag(Knn - Qnn) is added to the noise term to use the tools already implemented in sparse_GP.
-        The true precison is now 'true_precision' not 'precision'.
        """
-        if self.has_uncertain_inputs:
-            raise NotImplementedError, "FITC approximation not implemented for uncertain inputs"
-        else:
-            self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0)
-            self._set_params(self._get_params()) # update the GP
+        self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0)
+        self._set_params(self._get_params()) # update the GP
+
+    def _compute_kernel_matrices(self):
+        # kernel computations, using BGPLVM notation
+        self.Kmm = self.kern.K(self.Z)
+        self.psi0 = self.kern.Kdiag(self.X)
+        self.psi1 = self.kern.K(self.Z, self.X)
+        self.psi2 = None

    def _computations(self):
-
        #factor Kmm
        self.Lm = jitchol(self.Kmm)
-        self.Lmi,info = linalg.lapack.flapack.dtrtrs(self.Lm,np.eye(self.M),lower=1)
+        self.Lmi,info = linalg.lapack.flapack.dtrtrs(self.Lm,np.eye(self.num_inducing),lower=1)
        Lmipsi1 = np.dot(self.Lmi,self.psi1)
        self.Qnn = np.dot(Lmipsi1.T,Lmipsi1).copy()
        self.Diag0 = self.psi0 - np.diag(self.Qnn)
-        self.beta_star = self.likelihood.precision/(1. + self.likelihood.precision*self.Diag0[:,None]) #Includes Diag0 in the precision
+        self.beta_star = self.likelihood.precision/(1. + self.likelihood.precision*self.Diag0[:,None]) #NOTE: beta_star contains Diag0 and the precision
        self.V_star = self.beta_star * self.likelihood.Y

        # The rather complex computations of self.A
-        if self.has_uncertain_inputs:
-                raise NotImplementedError
-        else:
-            if self.likelihood.is_heteroscedastic:
-                assert self.likelihood.D == 1
-            tmp = self.psi1 * (np.sqrt(self.beta_star.flatten().reshape(1, self.N)))
-            tmp, _ = linalg.lapack.flapack.dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1)
-            self.A = tdot(tmp)
+        tmp = self.psi1 * (np.sqrt(self.beta_star.flatten().reshape(1, self.num_data)))
+        tmp, _ = linalg.lapack.flapack.dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1)
+        self.A = tdot(tmp)

        # factor B
-        self.B = np.eye(self.M) + self.A
+        self.B = np.eye(self.num_inducing) + self.A
        self.LB = jitchol(self.B)
        self.LBi = chol_inv(self.LB)
        self.psi1V = np.dot(self.psi1, self.V_star)
@ -108,18 +113,12 @@ class FITC(sparse_GP):
        self._dpsi1_dX_jkj = 0
        self._dpsi1_dtheta_jkj = 0

-        for i,V_n,alpha_n,gamma_n,gamma_k in zip(range(self.N),self.V_star,alpha,gamma_2,gamma_3):
+        for i,V_n,alpha_n,gamma_n,gamma_k in zip(range(self.num_data),self.V_star,alpha,gamma_2,gamma_3):
            K_pp_K = np.dot(Kmmipsi1[:,i:(i+1)],Kmmipsi1[:,i:(i+1)].T)
-
-            #Diag_dpsi1 = Diag_dA_dpsi1: yT*beta_star*y + Diag_dC_dpsi1 +Diag_dD_dpsi1
            _dpsi1 = (-V_n**2 - alpha_n + 2.*gamma_k - gamma_n**2) * Kmmipsi1.T[i:(i+1),:]
-
-            #Diag_dKmm = Diag_dA_dKmm: yT*beta_star*y +Diag_dC_dKmm +Diag_dD_dKmm
            _dKmm = .5*(V_n**2 + alpha_n + gamma_n**2 - 2.*gamma_k) * K_pp_K #Diag_dD_dKmm
-
            self._dpsi1_dtheta += self.kern.dK_dtheta(_dpsi1,self.X[i:i+1,:],self.Z)
            self._dKmm_dtheta += self.kern.dK_dtheta(_dKmm,self.Z)
-
            self._dKmm_dX += 2.*self.kern.dK_dX(_dKmm ,self.Z)
            self._dpsi1_dX += self.kern.dK_dX(_dpsi1.T,self.Z,self.X[i:i+1,:])

@ -128,7 +127,7 @@ class FITC(sparse_GP):
            # save computation here.
            self.partial_for_likelihood = None
        elif self.likelihood.is_heteroscedastic:
-            raise NotImplementedError, "heteroscedatic derivates not implemented"
+            raise NotImplementedError, "heteroscedatic derivates not implemented."
        else:
            # likelihood is not heterscedatic
            dbstar_dnoise = self.likelihood.precision * (self.beta_star**2 * self.Diag0[:,None] - self.beta_star)
@ -138,14 +137,14 @@ class FITC(sparse_GP):
            aux_1 = self.likelihood.Y.T * np.dot(self._LBi_Lmi_psi1V.T,LBiLmipsi1)
            aux_2 = np.dot(LBiLmipsi1.T,self._LBi_Lmi_psi1V)

-            dA_dnoise = 0.5 * self.D * (dbstar_dnoise/self.beta_star).sum() - 0.5 * self.D * np.sum(self.likelihood.Y**2 * dbstar_dnoise)
+            dA_dnoise = 0.5 * self.input_dim * (dbstar_dnoise/self.beta_star).sum() - 0.5 * self.input_dim * np.sum(self.likelihood.Y**2 * dbstar_dnoise)
            dC_dnoise = -0.5 * np.sum(mdot(self.LBi.T,self.LBi,Lmi_psi1) *  Lmi_psi1 * dbstar_dnoise.T)
            dC_dnoise = -0.5 * np.sum(mdot(self.LBi.T,self.LBi,Lmi_psi1) *  Lmi_psi1 * dbstar_dnoise.T)

            dD_dnoise_1 =  mdot(self.V_star*LBiLmipsi1.T,LBiLmipsi1*dbstar_dnoise.T*self.likelihood.Y.T)
            alpha = mdot(LBiLmipsi1,self.V_star)
            alpha_ = mdot(LBiLmipsi1.T,alpha)
-            dD_dnoise_2 = -0.5 * self.D * np.sum(alpha_**2 * dbstar_dnoise )
+            dD_dnoise_2 = -0.5 * self.input_dim * np.sum(alpha_**2 * dbstar_dnoise )

            dD_dnoise_1 = mdot(self.V_star.T,self.psi1.T,self.Lmi.T,self.LBi.T,self.LBi,self.Lmi,self.psi1,dbstar_dnoise*self.likelihood.Y)
            dD_dnoise_2 = 0.5*mdot(self.V_star.T,self.psi1.T,Hi,self.psi1,dbstar_dnoise*self.psi1.T,Hi,self.psi1,self.V_star)
@ -155,8 +154,8 @@ class FITC(sparse_GP):

    def log_likelihood(self):
        """ Compute the (lower bound on the) log marginal likelihood """
-        A = -0.5 * self.N * self.D * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.beta_star)) - 0.5 * np.sum(self.V_star * self.likelihood.Y)
-        C = -self.D * (np.sum(np.log(np.diag(self.LB))))
+        A = -0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.beta_star)) - 0.5 * np.sum(self.V_star * self.likelihood.Y)
+        C = -self.output_dim * (np.sum(np.log(np.diag(self.LB))))
        D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V))
        return A + C + D

@ -165,35 +164,30 @@ class FITC(sparse_GP):
        return np.hstack((self.dL_dZ().flatten(), self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))

    def dL_dtheta(self):
-        if self.has_uncertain_inputs:
-            raise NotImplementedError, "FITC approximation not implemented for uncertain inputs"
-        else:
-            dL_dtheta = self.kern.dKdiag_dtheta(self._dL_dpsi0,self.X)
-            dL_dtheta += self.kern.dK_dtheta(self._dL_dpsi1,self.X,self.Z)
-            dL_dtheta += self.kern.dK_dtheta(self._dL_dKmm,X=self.Z)
-            dL_dtheta += self._dKmm_dtheta
-            dL_dtheta += self._dpsi1_dtheta
+        dL_dtheta = self.kern.dKdiag_dtheta(self._dL_dpsi0,self.X)
+        dL_dtheta += self.kern.dK_dtheta(self._dL_dpsi1,self.X,self.Z)
+        dL_dtheta += self.kern.dK_dtheta(self._dL_dKmm,X=self.Z)
+        dL_dtheta += self._dKmm_dtheta
+        dL_dtheta += self._dpsi1_dtheta
        return dL_dtheta

    def dL_dZ(self):
-        if self.has_uncertain_inputs:
-            raise NotImplementedError, "FITC approximation not implemented for uncertain inputs"
-        else:
-            dL_dZ = self.kern.dK_dX(self._dL_dpsi1.T,self.Z,self.X)
-            dL_dZ += 2. * self.kern.dK_dX(self._dL_dKmm,X=self.Z)
-            dL_dZ += self._dpsi1_dX
-            dL_dZ += self._dKmm_dX
+        dL_dZ = self.kern.dK_dX(self._dL_dpsi1.T,self.Z,self.X)
+        dL_dZ += 2. * self.kern.dK_dX(self._dL_dKmm,X=self.Z)
+        dL_dZ += self._dpsi1_dX
+        dL_dZ += self._dKmm_dX
        return dL_dZ

-    def _raw_predict(self, Xnew, which_parts, full_cov=False):
+    def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
+        assert X_variance_new is None, "FITC model is not defined for handling uncertain inputs."

        if self.likelihood.is_heteroscedastic:
            Iplus_Dprod_i = 1./(1.+ self.Diag0 * self.likelihood.precision.flatten())
            self.Diag = self.Diag0 * Iplus_Dprod_i
            self.P = Iplus_Dprod_i[:,None] * self.psi1.T
            self.RPT0 = np.dot(self.Lmi,self.psi1)
-            self.L = np.linalg.cholesky(np.eye(self.M) + np.dot(self.RPT0,((1. - Iplus_Dprod_i)/self.Diag0)[:,None]*self.RPT0.T))
-            self.R,info = linalg.flapack.dtrtrs(self.L,self.Lmi,lower=1)
+            self.L = np.linalg.cholesky(np.eye(self.num_inducing) + np.dot(self.RPT0,((1. - Iplus_Dprod_i)/self.Diag0)[:,None]*self.RPT0.T))
+            self.R,info = linalg.lapack.flapack.dtrtrs(self.L,self.Lmi,lower=1)
            self.RPT = np.dot(self.R,self.P.T)
            self.Sigma = np.diag(self.Diag) + np.dot(self.RPT.T,self.RPT)
            self.w = self.Diag * self.likelihood.v_tilde
@ -210,13 +204,13 @@ class FITC(sparse_GP):
            # q(u|f) = N(u| R0i*mu_u*f, R0i*C*R0i.T)

            # Ci = I + (RPT0)Di(RPT0).T
-            # C = I - [RPT0] * (D+[RPT0].T*[RPT0])^-1*[RPT0].T
-            #   = I - [RPT0] * (D + self.Qnn)^-1 * [RPT0].T
+            # C = I - [RPT0] * (D+[RPT0].T*[RPT0])^-1*[RPT0].T
+            #   = I - [RPT0] * (D + self.Qnn)^-1 * [RPT0].T
            #   = I - [RPT0] * (U*U.T)^-1 * [RPT0].T
            #   = I - V.T * V
            U = np.linalg.cholesky(np.diag(self.Diag0) + self.Qnn)
-            V,info = linalg.flapack.dtrtrs(U,self.RPT0.T,lower=1)
-            C = np.eye(self.M) - np.dot(V.T,V)
+            V,info = linalg.lapack.flapack.dtrtrs(U,self.RPT0.T,lower=1)
+            C = np.eye(self.num_inducing) - np.dot(V.T,V)
            mu_u = np.dot(C,self.RPT0)*(1./self.Diag0[None,:])
            #self.C = C
            #self.RPT0 = np.dot(self.R0,self.Knm.T) P0.T
@ -232,13 +226,13 @@ class FITC(sparse_GP):
            mu_star = np.dot(KR0T,mu_H)
            if full_cov:
                Kxx = self.kern.K(Xnew,which_parts=which_parts)
-                var = Kxx + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.M),KR0T.T))
+                var = Kxx + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.num_inducing),KR0T.T))
            else:
                Kxx = self.kern.Kdiag(Xnew,which_parts=which_parts)
-                var = (Kxx + np.sum(KR0T.T*np.dot(Sigma_H - np.eye(self.M),KR0T.T),0))[:,None]
+                var = (Kxx + np.sum(KR0T.T*np.dot(Sigma_H - np.eye(self.num_inducing),KR0T.T),0))[:,None]
            return mu_star[:,None],var
        else:
-            raise NotImplementedError, "homoscedastic fitc not implemented"
+            raise NotImplementedError, "Heteroscedastic case not implemented."
            """
            Kx = self.kern.K(self.Z, Xnew)
            mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V)
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@ -33,8 +33,8 @@ class GP(GPBase):
        self._set_params(self._get_params())

    def _set_params(self, p):
-        self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
-        self.likelihood._set_params(p[self.kern.Nparam_transformed():])
+        self.kern._set_params_transformed(p[:self.kern.num_params_transformed()])
+        self.likelihood._set_params(p[self.kern.num_params_transformed():])

        self.K = self.kern.K(self.X)
        self.K += self.likelihood.covariance_matrix
@ -46,12 +46,12 @@ class GP(GPBase):
            #alpha = np.dot(self.Ki, self.likelihood.Y)
            alpha,_ = linalg.lapack.flapack.dpotrs(self.L, self.likelihood.Y,lower=1)

-            self.dL_dK = 0.5 * (tdot(alpha) - self.D * self.Ki)
+            self.dL_dK = 0.5 * (tdot(alpha) - self.output_dim * self.Ki)
        else:
            #tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki)
            tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1)
            tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(tmp.T), lower=1)
-            self.dL_dK = 0.5 * (tmp - self.D * self.Ki)
+            self.dL_dK = 0.5 * (tmp - self.output_dim * self.Ki)

    def _get_params(self):
        return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params()))
@ -89,7 +89,7 @@ class GP(GPBase):
        model for a new variable Y* = v_tilde/tau_tilde, with a covariance
        matrix K* = K + diag(1./tau_tilde) plus a normalization term.
        """
-        return -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
+        return -0.5 * self.output_dim * self.K_logdet + self._model_fit_term() + self.likelihood.Z


    def _log_likelihood_gradients(self):
@ -117,7 +117,7 @@ class GP(GPBase):
            var = Kxx - np.sum(np.multiply(KiKx, Kx), 0)
            var = var[:, None]
        if stop:
-            debug_this
+            debug_this # @UndefinedVariable
        return mu, var

    def predict(self, Xnew, which_parts='all', full_cov=False):
@ -131,12 +131,12 @@ class GP(GPBase):
        :type which_parts: ('all', list of bools)
        :param full_cov: whether to return the folll covariance matrix, or just the diagonal
        :type full_cov: bool
-        :rtype: posterior mean,  a Numpy array, Nnew x self.D
+        :rtype: posterior mean,  a Numpy array, Nnew x self.output_dim
        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
-        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays,  Nnew x self.D
+        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays,  Nnew x self.output_dim


-           If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew.
+           If full_cov and self.output_dim > 1, the return shape of var is Nnew x Nnew x self.output_dim. If self.output_dim == 1, the return shape is Nnew x Nnew.
           This is to allow for different normalizations of the output dimensions.

        """
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@ -1,34 +1,34 @@
 import numpy as np
-import model
 from .. import kern
 from ..util.plot import gpplot, Tango, x_frame1D, x_frame2D
 import pylab as pb
+from GPy.core.model import Model

-class GPBase(model.model):
+class GPBase(Model):
    """
-    Gaussian Process model for holding shared behaviour between
+    Gaussian Process Model for holding shared behaviour between
    sprase_GP and GP models
    """

    def __init__(self, X, likelihood, kernel, normalize_X=False):
        self.X = X
        assert len(self.X.shape) == 2
-        self.N, self.input_dim = self.X.shape
+        self.num_data, self.input_dim = self.X.shape
        assert isinstance(kernel, kern.kern)
        self.kern = kernel
        self.likelihood = likelihood
        assert self.X.shape[0] == self.likelihood.data.shape[0]
-        self.N, self.D = self.likelihood.data.shape
+        self.num_data, self.output_dim = self.likelihood.data.shape

        if normalize_X:
            self._Xmean = X.mean(0)[None, :]
            self._Xstd = X.std(0)[None, :]
            self.X = (X.copy() - self._Xmean) / self._Xstd
        else:
-            self._Xmean = np.zeros((1,self.input_dim))
-            self._Xstd = np.ones((1,self.input_dim))
+            self._Xmean = np.zeros((1, self.input_dim))
+            self._Xstd = np.ones((1, self.input_dim))

-        model.model.__init__(self)
+        Model.__init__(self)

        # All leaf nodes should call self._set_params(self._get_params()) at
        # the end
@ -70,7 +70,7 @@ class GPBase(model.model):
            else:
                m, v = self._raw_predict(Xnew, which_parts=which_parts, full_cov=True)
                Ysim = np.random.multivariate_normal(m.flatten(), v, samples)
-                gpplot(Xnew, m, m - 2 * np.sqrt(np.diag(v)[:, None]), m + 2 * np.sqrt(np.diag(v))[:, None,], axes=ax)
+                gpplot(Xnew, m, m - 2 * np.sqrt(np.diag(v)[:, None]), m + 2 * np.sqrt(np.diag(v))[:, None, ], axes=ax)
                for i in range(samples):
                    ax.plot(Xnew, Ysim[i, :], Tango.colorsHex['darkBlue'], linewidth=0.25)
            ax.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
@ -84,8 +84,8 @@ class GPBase(model.model):
            Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits, resolution)
            m, v = self._raw_predict(Xnew, which_parts=which_parts)
            m = m.reshape(resolution, resolution).T
-            ax.contour(xx, yy, m, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
-            ax.scatter(self.X[:, 0], self.X[:, 1], 40, self.likelihood.Y, linewidth=0, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max())
+            ax.contour(xx, yy, m, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
+            ax.scatter(self.X[:, 0], self.X[:, 1], 40, self.likelihood.Y, linewidth=0, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max()) # @UndefinedVariable
            ax.set_xlim(xmin[0], xmax[0])
            ax.set_ylim(xmin[1], xmax[1])
        else:
@ -94,9 +94,9 @@ class GPBase(model.model):
    def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None):
        """
        TODO: Docstrings!
+        
        :param levels: for 2D plotting, the number of contour levels to use
        is ax is None, create a new figure
-
        """
        # TODO include samples
        if which_data == 'all':
@ -108,27 +108,27 @@ class GPBase(model.model):

        if self.X.shape[1] == 1:

-            Xu = self.X * self._Xstd + self._Xmean  # NOTE self.X are the normalized values now
+            Xu = self.X * self._Xstd + self._Xmean # NOTE self.X are the normalized values now

            Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
-            m, var, lower, upper = self.predict(Xnew, which_parts=which_parts)
+            m, _, lower, upper = self.predict(Xnew, which_parts=which_parts)
            for d in range(m.shape[1]):
-                gpplot(Xnew, m[:,d], lower[:,d], upper[:,d],axes=ax)
-                ax.plot(Xu[which_data], self.likelihood.data[which_data,d], 'kx', mew=1.5)
+                gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax)
+                ax.plot(Xu[which_data], self.likelihood.data[which_data, d], 'kx', mew=1.5)
            ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
            ax.set_xlim(xmin, xmax)
            ax.set_ylim(ymin, ymax)

-        elif self.X.shape[1] == 2:  # FIXME
+        elif self.X.shape[1] == 2: # FIXME
            resolution = resolution or 50
-            Xnew, xx, yy, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
+            Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
            x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
-            m, var, lower, upper = self.predict(Xnew, which_parts=which_parts)
+            m, _, lower, upper = self.predict(Xnew, which_parts=which_parts)
            m = m.reshape(resolution, resolution).T
-            ax.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
+            ax.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
            Yf = self.likelihood.Y.flatten()
-            ax.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
+            ax.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) # @UndefinedVariable
            ax.set_xlim(xmin[0], xmax[0])
            ax.set_ylim(xmin[1], xmax[1])

--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@ -6,37 +6,32 @@ from .. import likelihoods
 from ..inference import optimization
 from ..util.linalg import jitchol
 from GPy.util.misc import opt_wrapper
-from parameterised import parameterised
-from scipy import optimize
+from parameterised import Parameterised
 import multiprocessing as mp
 import numpy as np
-import priors
-import re
-import sys
-import pdb
 from GPy.core.domains import POSITIVE, REAL
 # import numdifftools as ndt

-class model(parameterised):
+class Model(Parameterised):
    def __init__(self):
-        parameterised.__init__(self)
+        Parameterised.__init__(self)
        self.priors = None
        self.optimization_runs = []
        self.sampling_runs = []
-        self.preferred_optimizer = 'tnc'
-        #self._set_params(self._get_params()) has been taken out as it should only be called on leaf nodes
+        self.preferred_optimizer = 'scg'
+        # self._set_params(self._get_params()) has been taken out as it should only be called on leaf nodes
    def _get_params(self):
-        raise NotImplementedError, "this needs to be implemented to use the model class"
+        raise NotImplementedError, "this needs to be implemented to use the Model class"
    def _set_params(self, x):
-        raise NotImplementedError, "this needs to be implemented to use the model class"
+        raise NotImplementedError, "this needs to be implemented to use the Model class"
    def log_likelihood(self):
-        raise NotImplementedError, "this needs to be implemented to use the model class"
+        raise NotImplementedError, "this needs to be implemented to use the Model class"
    def _log_likelihood_gradients(self):
-        raise NotImplementedError, "this needs to be implemented to use the model class"
+        raise NotImplementedError, "this needs to be implemented to use the Model class"

    def set_prior(self, regexp, what):
        """
-        Sets priors on the model parameters.
+        Sets priors on the Model parameters.

        Arguments
        ---------
@ -65,7 +60,7 @@ class model(parameterised):
        if len(tie_matches) > 1:
            raise ValueError, "cannot place Prior across multiple ties"
        elif len(tie_matches) == 1:
-            which = which[:1]  # just place a Prior object on the first parameter
+            which = which[:1] # just place a Prior object on the first parameter


        # check constraints are okay
@ -95,7 +90,7 @@ class model(parameterised):

    def get_gradient(self, name, return_names=False):
        """
-        Get model gradient(s) by name. The name is applied as a regular expression and all parameters that match that regular expression are returned.
+        Get Model gradient(s) by name. The name is applied as a regular expression and all parameters that match that regular expression are returned.
        """
        matches = self.grep_param_names(name)
        if len(matches):
@ -135,7 +130,7 @@ class model(parameterised):

    def randomize(self):
        """
-        Randomize the model.
+        Randomize the Model.
        Make this draw from the Prior if one exists, else draw from N(0,1)
        """
        # first take care of all parameters (from N(0,1))
@ -147,16 +142,16 @@ class model(parameterised):
        if self.priors is not None:
            [np.put(x, i, p.rvs(1)) for i, p in enumerate(self.priors) if not p is None]
        self._set_params(x)
-        self._set_params_transformed(self._get_params_transformed())  # makes sure all of the tied parameters get the same init (since there's only one prior object...)
+        self._set_params_transformed(self._get_params_transformed()) # makes sure all of the tied parameters get the same init (since there's only one prior object...)


-    def optimize_restarts(self, Nrestarts=10, robust=False, verbose=True, parallel=False, num_processes=None, **kwargs):
+    def optimize_restarts(self, num_restarts=10, robust=False, verbose=True, parallel=False, num_processes=None, **kwargs):
        """
-        Perform random restarts of the model, and set the model to the best
+        Perform random restarts of the Model, and set the Model to the best
        seen solution.

        If the robust flag is set, exceptions raised during optimizations will
-        be handled silently.  If _all_ runs fail, the model is reset to the
+        be handled silently.  If _all_ runs fail, the Model is reset to the
        existing parameter values.

        Notes
@ -179,19 +174,19 @@ class model(parameterised):
            try:
                jobs = []
                pool = mp.Pool(processes=num_processes)
-                for i in range(Nrestarts):
+                for i in range(num_restarts):
                    self.randomize()
                    job = pool.apply_async(opt_wrapper, args=(self,), kwds=kwargs)
                    jobs.append(job)

-                pool.close()  # signal that no more data coming in
-                pool.join()  # wait for all the tasks to complete
+                pool.close() # signal that no more data coming in
+                pool.join() # wait for all the tasks to complete
            except KeyboardInterrupt:
                print "Ctrl+c received, terminating and joining pool."
                pool.terminate()
                pool.join()

-        for i in range(Nrestarts):
+        for i in range(num_restarts):
            try:
                if not parallel:
                    self.randomize()
@ -200,10 +195,10 @@ class model(parameterised):
                    self.optimization_runs.append(jobs[i].get())

                if verbose:
-                    print("Optimization restart {0}/{1}, f = {2}".format(i + 1, Nrestarts, self.optimization_runs[-1].f_opt))
+                    print("Optimization restart {0}/{1}, f = {2}".format(i + 1, num_restarts, self.optimization_runs[-1].f_opt))
            except Exception as e:
                if robust:
-                    print("Warning - optimization restart {0}/{1} failed".format(i + 1, Nrestarts))
+                    print("Warning - optimization restart {0}/{1} failed".format(i + 1, num_restarts))
                else:
                    raise e

@ -218,20 +213,16 @@ class model(parameterised):
        Ensure that any variables which should clearly be positive have been constrained somehow.
        """
        positive_strings = ['variance', 'lengthscale', 'precision', 'kappa']
-        param_names = self._get_param_names()
+        # param_names = self._get_param_names()
        currently_constrained = self.all_constrained_indices()
        to_make_positive = []
        for s in positive_strings:
-            for i in self.grep_param_names(".*"+s):
+            for i in self.grep_param_names(".*" + s):
                if not (i in currently_constrained):
-                    #to_make_positive.append(re.escape(param_names[i]))
                    to_make_positive.append(i)
        if len(to_make_positive):
-            #self.constrain_positive('(' + '|'.join(to_make_positive) + ')')
            self.constrain_positive(np.asarray(to_make_positive))

-
-
    def objective_function(self, x):
        """
        The objective function passed to the optimizer. It combines the likelihood and the priors.
@ -244,18 +235,18 @@ class model(parameterised):
        Gets the gradients from the likelihood and the priors.
        """
        self._set_params_transformed(x)
-        obj_grads = - self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
+        obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
        return obj_grads

    def objective_and_gradients(self, x):
        self._set_params_transformed(x)
        obj_f = -self.log_likelihood() - self.log_prior()
-        obj_grads = - self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
+        obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
        return obj_f, obj_grads

    def optimize(self, optimizer=None, start=None, **kwargs):
        """
-        Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
+        Optimize the Model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
        kwargs are passed to the optimizer. They can be:

        :max_f_eval: maximum number of function evaluations
@ -278,7 +269,7 @@ class model(parameterised):

    def optimize_SGD(self, momentum=0.1, learning_rate=0.01, iterations=20, **kwargs):
        # assert self.Y.shape[1] > 1, "SGD only works with D > 1"
-        sgd = SGD.StochasticGD(self, iterations, learning_rate, momentum, **kwargs)
+        sgd = SGD.StochasticGD(self, iterations, learning_rate, momentum, **kwargs) # @UndefinedVariable
        sgd.run()
        self.optimization_runs.append(sgd)

@ -295,7 +286,7 @@ class model(parameterised):
            def f(x):
                self._set_params(x)
                return self.log_likelihood()
-            h = ndt.Hessian(f)
+            h = ndt.Hessian(f) # @UndefinedVariable
            A = -h(x)
            self._set_params(x)
        # check for almost zero components on the diagonal which screw up the cholesky
@ -304,7 +295,7 @@ class model(parameterised):
        return A

    def Laplace_evidence(self):
-        """Returns an estiamte of the model evidence based on the Laplace approximation.
+        """Returns an estimate of the Model evidence based on the Laplace approximation.
        Uses a numerical estimate of the hessian if none is available analytically"""
        A = self.Laplace_covariance()
        try:
@ -314,12 +305,12 @@ class model(parameterised):
        return 0.5 * self._get_params().size * np.log(2 * np.pi) + self.log_likelihood() - hld

    def __str__(self):
-        s = parameterised.__str__(self).split('\n')
+        s = Parameterised.__str__(self).split('\n')
        # add priors to the string
        if self.priors is not None:
            strs = [str(p) if p is not None else '' for p in self.priors]
        else:
-            strs = ['']*len(self._get_params())
+            strs = [''] * len(self._get_params())
        width = np.array(max([len(p) for p in strs] + [5])) + 4

        log_like = self.log_likelihood()
@ -340,7 +331,7 @@ class model(parameterised):

    def checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3):
        """
-        Check the gradient of the model by comparing to a numerical estimate.
+        Check the gradient of the Model by comparing to a numerical estimate.
        If the verbose flag is passed, invividual components are tested (and printed)

        :param verbose: If True, print a "full" checking of each parameter
@ -392,7 +383,11 @@ class model(parameterised):
            if target_param is None:
                param_list = range(len(x))
            else:
-                param_list = self.grep_param_names(target_param)
+                param_list = self.grep_param_names(target_param, transformed=True, search=True)
+                if not np.any(param_list):
+                    print "No free parameters to check"
+                    return
+

            for i in param_list:
                xx = x.copy()
@ -419,15 +414,15 @@ class model(parameterised):

    def input_sensitivity(self):
        """
-        return an array describing the sesitivity of the model to each input
+        return an array describing the sensitivity of the Model to each input

        NB. Right now, we're basing this on the lengthscales (or
        variances) of the kernel.  TODO: proper sensitivity analysis
-        where we integrate across the model inputs and evaluate the
-        effect on the variance of the model output.  """
+        where we integrate across the Model inputs and evaluate the
+        effect on the variance of the Model output.  """

        if not hasattr(self, 'kern'):
-            raise ValueError, "this model has no kernel"
+            raise ValueError, "this Model has no kernel"

        k = [p for p in self.kern.parts if p.name in ['rbf', 'linear']]
        if (not len(k) == 1) or (not k[0].ARD):
@ -474,8 +469,8 @@ class model(parameterised):
            ll_change = new_ll - last_ll

            if ll_change < 0:
-                self.likelihood = last_approximation  # restore previous likelihood approximation
-                self._set_params(last_params)  # restore model parameters
+                self.likelihood = last_approximation # restore previous likelihood approximation
+                self._set_params(last_params) # restore Model parameters
                print "Log-likelihood decrement: %s \nLast likelihood update discarded." % ll_change
                stop = True
            else:
--- a/GPy/core/parameterised.py
+++ b/GPy/core/parameterised.py
@ -6,12 +6,10 @@ import numpy as np
 import re
 import copy
 import cPickle
-import os
-from ..util.squashers import sigmoid
 import warnings
 import transformations

-class parameterised(object):
+class Parameterised(object):
    def __init__(self):
        """
        This is the base class for model and kernel. Mostly just handles tieing and constraining of parameters
@ -36,7 +34,7 @@ class parameterised(object):
        """
        Returns a **copy** of parameters in non transformed space

-        :see_also: :py:func:`GPy.core.parameterised.params_transformed`
+        :see_also: :py:func:`GPy.core.Parameterised.params_transformed`
        """
        return self._get_params()

@ -49,7 +47,7 @@ class parameterised(object):
        """
        Returns a **copy** of parameters in transformed space

-        :see_also: :py:func:`GPy.core.parameterised.params`
+        :see_also: :py:func:`GPy.core.Parameterised.params`
        """
        return self._get_params_transformed()

@ -85,7 +83,7 @@ class parameterised(object):
            else:
                return self._get_params()[matches]
        else:
-            raise AttributeError, "no parameter matches %s" % name
+            raise AttributeError, "no parameter matches %s" % regexp

    def __setitem__(self, name, val):
        """
@ -113,13 +111,13 @@ class parameterised(object):
        if hasattr(self, 'prior'):
            pass

-        self._set_params_transformed(self._get_params_transformed())  # sets tied parameters to single value
+        self._set_params_transformed(self._get_params_transformed()) # sets tied parameters to single value

    def untie_everything(self):
        """Unties all parameters by setting tied_indices to an empty list."""
        self.tied_indices = []

-    def grep_param_names(self, regexp):
+    def grep_param_names(self, regexp, transformed=False, search=False):
        """
        :param regexp: regular expression to select parameter names
        :type regexp: re | str | int
@ -129,15 +127,23 @@ class parameterised(object):
          Other objects are passed through - i.e. integers which weren't meant for grepping
        """

+        if transformed:
+            names = self._get_param_names_transformed()
+        else:
+            names = self._get_param_names()
+
        if type(regexp) in [str, np.string_, np.str]:
            regexp = re.compile(regexp)
-            return np.nonzero([regexp.match(name) for name in self._get_param_names()])[0]
        elif type(regexp) is re._pattern_type:
-            return np.nonzero([regexp.match(name) for name in self._get_param_names()])[0]
+            pass
        else:
            return regexp
+        if search:
+            return np.nonzero([regexp.search(name) for name in names])[0]
+        else:
+            return np.nonzero([regexp.match(name) for name in names])[0]

-    def Nparam_transformed(self):
+    def num_params_transformed(self):
        removed = 0
        for tie in self.tied_indices:
            removed += tie.size - 1
@ -151,18 +157,18 @@ class parameterised(object):
        """Unconstrain matching parameters.  does not untie parameters"""
        matches = self.grep_param_names(regexp)

-        #tranformed contraints:
+        # transformed constraints:
        for match in matches:
-            self.constrained_indices = [i[i<>match] for i in self.constrained_indices]
+            self.constrained_indices = [i[i <> match] for i in self.constrained_indices]

-        #remove empty constraints
-        tmp = zip(*[(i,t) for i,t in zip(self.constrained_indices,self.constraints) if len(i)])
+        # remove empty constraints
+        tmp = zip(*[(i, t) for i, t in zip(self.constrained_indices, self.constraints) if len(i)])
        if tmp:
-            self.constrained_indices, self.constraints = zip(*[(i,t) for i,t in zip(self.constrained_indices,self.constraints) if len(i)])
+            self.constrained_indices, self.constraints = zip(*[(i, t) for i, t in zip(self.constrained_indices, self.constraints) if len(i)])
            self.constrained_indices, self.constraints = list(self.constrained_indices), list(self.constraints)

        # fixed:
-        self.fixed_values = [np.delete(values, np.nonzero(np.sum(indices[:, None] == matches[None, :], 1))[0]) for indices,values in zip(self.fixed_indices,self.fixed_values)]
+        self.fixed_values = [np.delete(values, np.nonzero(np.sum(indices[:, None] == matches[None, :], 1))[0]) for indices, values in zip(self.fixed_indices, self.fixed_values)]
        self.fixed_indices = [np.delete(indices, np.nonzero(np.sum(indices[:, None] == matches[None, :], 1))[0]) for indices in self.fixed_indices]

        # remove empty elements
@ -181,7 +187,7 @@ class parameterised(object):
        """ Set positive constraints. """
        self.constrain(regexp, transformations.logexp())

-    def constrain_bounded(self, regexp,lower, upper):
+    def constrain_bounded(self, regexp, lower, upper):
        """ Set bounded constraints. """
        self.constrain(regexp, transformations.logistic(lower, upper))

@ -191,8 +197,8 @@ class parameterised(object):
        else:
            return np.empty(shape=(0,))

-    def constrain(self,regexp,transform):
-        assert isinstance(transform,transformations.transformation)
+    def constrain(self, regexp, transform):
+        assert isinstance(transform, transformations.transformation)

        matches = self.grep_param_names(regexp)
        overlap = set(matches).intersection(set(self.all_constrained_indices()))
@ -223,7 +229,6 @@ class parameterised(object):
        To fix multiple parameters to the same value, simply pass a regular expression which matches both parameter names, or pass both of the indexes
        """
        matches = self.grep_param_names(regexp)
-
        overlap = set(matches).intersection(set(self.all_constrained_indices()))
        if overlap:
            self.unconstrain(np.asarray(list(overlap)))
@ -244,7 +249,7 @@ class parameterised(object):
    def _get_params_transformed(self):
        """use self._get_params to get the 'true' parameters of the model, which are then tied, constrained and fixed"""
        x = self._get_params()
-        [np.put(x,i,t.finv(x[i])) for i,t in zip(self.constrained_indices,self.constraints)]
+        [np.put(x, i, t.finv(x[i])) for i, t in zip(self.constrained_indices, self.constraints)]

        to_remove = self.fixed_indices + [t[1:] for t in self.tied_indices]
        if len(to_remove):
@ -256,7 +261,7 @@ class parameterised(object):
        """ takes the vector x, which is then modified (by untying, reparameterising or inserting fixed values), and then call self._set_params"""
        self._set_params(self._untransform_params(x))

-    def _untransform_params(self,x):
+    def _untransform_params(self, x):
        """
        The transformation required for _set_params_transformed.

@ -283,9 +288,9 @@ class parameterised(object):
        [np.put(xx, i, v) for i, v in zip(self.fixed_indices, self.fixed_values)]
        [np.put(xx, i, v) for i, v in [(t[1:], xx[t[0]]) for t in self.tied_indices] ]

-        [np.put(xx,i,t.f(xx[i])) for i,t in zip(self.constrained_indices, self.constraints)]
-        if hasattr(self,'debug'):
-            stop
+        [np.put(xx, i, t.f(xx[i])) for i, t in zip(self.constrained_indices, self.constraints)]
+        if hasattr(self, 'debug'):
+            stop # @UndefinedVariable

        return xx

@ -309,7 +314,7 @@ class parameterised(object):
            remove = np.hstack((remove, np.hstack(self.fixed_indices)))

        # add markers to show that some variables are constrained
-        for i,t in zip(self.constrained_indices,self.constraints):
+        for i, t in zip(self.constrained_indices, self.constraints):
            for ii in i:
                n[ii] = n[ii] + t.__str__()

@ -326,10 +331,10 @@ class parameterised(object):
        if not N:
            return "This object has no free parameters."
        header = ['Name', 'Value', 'Constraints', 'Ties']
-        values = self._get_params()  # map(str,self._get_params())
+        values = self._get_params() # map(str,self._get_params())
        # sort out the constraints
        constraints = [''] * len(names)
-        for i,t in zip(self.constrained_indices,self.constraints):
+        for i, t in zip(self.constrained_indices, self.constraints):
            for ii in i:
                constraints[ii] = t.__str__()
        for i in self.fixed_indices:
@ -347,7 +352,7 @@ class parameterised(object):
        max_constraint = max([len(constraints[i]) for i in range(len(constraints))] + [len(header[2])])
        max_ties = max([len(ties[i]) for i in range(len(ties))] + [len(header[3])])
        cols = np.array([max_names, max_values, max_constraint, max_ties]) + 4
-        columns = cols.sum()
+        # columns = cols.sum()

        header_string = ["{h:^{col}}".format(h=header[i], col=cols[i]) for i in range(len(cols))]
        header_string = map(lambda x: '|'.join(x), [header_string])
--- a/GPy/core/priors.py
+++ b/GPy/core/priors.py
@ -99,9 +99,9 @@ class MultivariateGaussian:
        assert len(self.var.shape) == 2
        assert self.var.shape[0] == self.var.shape[1]
        assert self.var.shape[0] == self.mu.size
-        self.D = self.mu.size
+        self.input_dim = self.mu.size
        self.inv, self.hld = pdinv(self.var)
-        self.constant = -0.5 * self.D * np.log(2 * np.pi) - self.hld
+        self.constant = -0.5 * self.input_dim * np.log(2 * np.pi) - self.hld

    def summary(self):
        raise NotImplementedError
@ -121,7 +121,7 @@ class MultivariateGaussian:
        return np.random.multivariate_normal(self.mu, self.var, n)

    def plot(self):
-        if self.D == 2:
+        if self.input_dim == 2:
            rvs = self.rvs(200)
            pb.plot(rvs[:, 0], rvs[:, 1], 'kx', mew=1.5)
            xmin, xmax = pb.xlim()
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@ -8,22 +8,22 @@ from scipy import linalg
 from ..likelihoods import Gaussian
 from gp_base import GPBase

-class sparse_GP(GPBase):
+class SparseGP(GPBase):
    """
    Variational sparse GP model

    :param X: inputs
-    :type X: np.ndarray (N x input_dim)
+    :type X: np.ndarray (num_data x input_dim)
    :param likelihood: a likelihood instance, containing the observed data
    :type likelihood: GPy.likelihood.(Gaussian | EP | Laplace)
    :param kernel : the kernel (covariance function). See link kernels
    :type kernel: a GPy.kern.kern instance
    :param X_variance: The uncertainty in the measurements of X (Gaussian variance)
-    :type X_variance: np.ndarray (N x input_dim) | None
+    :type X_variance: np.ndarray (num_data x input_dim) | None
    :param Z: inducing inputs (optional, see note)
-    :type Z: np.ndarray (M x input_dim) | None
-    :param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
-    :type M: int
+    :type Z: np.ndarray (num_inducing x input_dim) | None
+    :param num_inducing : Number of inducing points (optional, default 10. Ignored if Z is not None)
+    :type num_inducing: int
    :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
    :type normalize_(X|Y): bool
    """
@ -32,7 +32,7 @@ class sparse_GP(GPBase):
        GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)

        self.Z = Z
-        self.M = Z.shape[0]
+        self.num_inducing = Z.shape[0]
        self.likelihood = likelihood

        if X_variance is None:
@ -69,7 +69,7 @@ class sparse_GP(GPBase):
        # The rather complex computations of self.A
        if self.has_uncertain_inputs:
            if self.likelihood.is_heteroscedastic:
-                psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.N, 1, 1))).sum(0)
+                psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.num_data, 1, 1))).sum(0)
            else:
                psi2_beta = self.psi2.sum(0) * self.likelihood.precision
            evals, evecs = linalg.eigh(psi2_beta)
@ -77,7 +77,7 @@ class sparse_GP(GPBase):
            tmp = evecs * np.sqrt(clipped_evals)
        else:
            if self.likelihood.is_heteroscedastic:
-                tmp = self.psi1 * (np.sqrt(self.likelihood.precision.flatten().reshape(1, self.N)))
+                tmp = self.psi1 * (np.sqrt(self.likelihood.precision.flatten().reshape(1, self.num_data)))
            else:
                tmp = self.psi1 * (np.sqrt(self.likelihood.precision))
        tmp, _ = linalg.lapack.flapack.dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1)
@ -85,7 +85,7 @@ class sparse_GP(GPBase):


        # factor B
-        self.B = np.eye(self.M) + self.A
+        self.B = np.eye(self.num_inducing) + self.A
        self.LB = jitchol(self.B)

        # TODO: make a switch for either first compute psi1V, or VV.T
@ -99,28 +99,28 @@ class sparse_GP(GPBase):

        # Compute dL_dKmm
        tmp = tdot(self._LBi_Lmi_psi1V)
-        self.DBi_plus_BiPBi = backsub_both_sides(self.LB, self.D * np.eye(self.M) + tmp)
+        self.DBi_plus_BiPBi = backsub_both_sides(self.LB, self.output_dim * np.eye(self.num_inducing) + tmp)
        tmp = -0.5 * self.DBi_plus_BiPBi
-        tmp += -0.5 * self.B * self.D
-        tmp += self.D * np.eye(self.M)
+        tmp += -0.5 * self.B * self.output_dim
+        tmp += self.output_dim * np.eye(self.num_inducing)
        self.dL_dKmm = backsub_both_sides(self.Lm, tmp)

        # Compute dL_dpsi # FIXME: this is untested for the heterscedastic + uncertain inputs case
-        self.dL_dpsi0 = -0.5 * self.D * (self.likelihood.precision * np.ones([self.N, 1])).flatten()
+        self.dL_dpsi0 = -0.5 * self.output_dim * (self.likelihood.precision * np.ones([self.num_data, 1])).flatten()
        self.dL_dpsi1 = np.dot(self.Cpsi1V, self.likelihood.V.T)
-        dL_dpsi2_beta = 0.5 * backsub_both_sides(self.Lm, self.D * np.eye(self.M) - self.DBi_plus_BiPBi)
+        dL_dpsi2_beta = 0.5 * backsub_both_sides(self.Lm, self.output_dim * np.eye(self.num_inducing) - self.DBi_plus_BiPBi)

        if self.likelihood.is_heteroscedastic:
            if self.has_uncertain_inputs:
                self.dL_dpsi2 = self.likelihood.precision.flatten()[:, None, None] * dL_dpsi2_beta[None, :, :]
            else:
-                self.dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, self.psi1 * self.likelihood.precision.reshape(1, self.N))
+                self.dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, self.psi1 * self.likelihood.precision.reshape(1, self.num_data))
                self.dL_dpsi2 = None
        else:
            dL_dpsi2 = self.likelihood.precision * dL_dpsi2_beta
            if self.has_uncertain_inputs:
                # repeat for each of the N psi_2 matrices
-                self.dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], self.N, axis=0)
+                self.dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], self.num_data, axis=0)
            else:
                # subsume back into psi1 (==Kmn)
                self.dL_dpsi1 += 2.*np.dot(dL_dpsi2, self.psi1)
@ -135,26 +135,26 @@ class sparse_GP(GPBase):
            raise NotImplementedError, "heteroscedatic derivates not implemented"
        else:
            # likelihood is not heterscedatic
-            self.partial_for_likelihood = -0.5 * self.N * self.D * self.likelihood.precision + 0.5 * self.likelihood.trYYT * self.likelihood.precision ** 2
-            self.partial_for_likelihood += 0.5 * self.D * (self.psi0.sum() * self.likelihood.precision ** 2 - np.trace(self.A) * self.likelihood.precision)
+            self.partial_for_likelihood = -0.5 * self.num_data * self.output_dim * self.likelihood.precision + 0.5 * self.likelihood.trYYT * self.likelihood.precision ** 2
+            self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0.sum() * self.likelihood.precision ** 2 - np.trace(self.A) * self.likelihood.precision)
            self.partial_for_likelihood += self.likelihood.precision * (0.5 * np.sum(self.A * self.DBi_plus_BiPBi) - np.sum(np.square(self._LBi_Lmi_psi1V)))

    def log_likelihood(self):
        """ Compute the (lower bound on the) log marginal likelihood """
        if self.likelihood.is_heteroscedastic:
-            A = -0.5 * self.N * self.D * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.likelihood.V * self.likelihood.Y)
-            B = -0.5 * self.D * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self.A))
+            A = -0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.likelihood.V * self.likelihood.Y)
+            B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self.A))
        else:
-            A = -0.5 * self.N * self.D * (np.log(2.*np.pi) - np.log(self.likelihood.precision)) - 0.5 * self.likelihood.precision * self.likelihood.trYYT
-            B = -0.5 * self.D * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self.A))
-        C = -self.D * (np.sum(np.log(np.diag(self.LB)))) # + 0.5 * self.M * np.log(sf2))
+            A = -0.5 * self.num_data * self.output_dim * (np.log(2.*np.pi) - np.log(self.likelihood.precision)) - 0.5 * self.likelihood.precision * self.likelihood.trYYT
+            B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self.A))
+        C = -self.output_dim * (np.sum(np.log(np.diag(self.LB)))) # + 0.5 * self.num_inducing * np.log(sf2))
        D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V))
        return A + B + C + D + self.likelihood.Z

    def _set_params(self, p):
-        self.Z = p[:self.M * self.input_dim].reshape(self.M, self.input_dim)
-        self.kern._set_params(p[self.Z.size:self.Z.size + self.kern.Nparam])
-        self.likelihood._set_params(p[self.Z.size + self.kern.Nparam:])
+        self.Z = p[:self.num_inducing * self.input_dim].reshape(self.num_inducing, self.input_dim)
+        self.kern._set_params(p[self.Z.size:self.Z.size + self.kern.num_params])
+        self.likelihood._set_params(p[self.Z.size + self.kern.num_params:])
        self._compute_kernel_matrices()
        self._computations()

@ -221,7 +221,7 @@ class sparse_GP(GPBase):

        Bi, _ = linalg.lapack.flapack.dpotri(self.LB, lower=0)  # WTH? this lower switch should be 1, but that doesn't work!
        symmetrify(Bi)
-        Kmmi_LmiBLmi = backsub_both_sides(self.Lm, np.eye(self.M) - Bi)
+        Kmmi_LmiBLmi = backsub_both_sides(self.Lm, np.eye(self.num_inducing) - Bi)

        if X_variance_new is None:
            Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
@ -259,12 +259,12 @@ class sparse_GP(GPBase):
        :type which_parts: ('all', list of bools)
        :param full_cov: whether to return the folll covariance matrix, or just the diagonal
        :type full_cov: bool
-        :rtype: posterior mean,  a Numpy array, Nnew x self.D
+        :rtype: posterior mean,  a Numpy array, Nnew x self.output_dim
        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
-        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays,  Nnew x self.D
+        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays,  Nnew x self.output_dim


-           If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew.
+           If full_cov and self.output_dim > 1, the return shape of var is Nnew x Nnew x self.output_dim. If self.output_dim == 1, the return shape is Nnew x Nnew.
           This is to allow for different normalizations of the output dimensions.

        """
--- a/GPy/examples/init.py
+++ b/GPy/examples/init.py
@ -4,5 +4,5 @@
 import classification
 import regression
 import dimensionality_reduction
-import non_gaussian
+import non_Gaussian
 import tutorials
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@ -24,7 +24,7 @@ def crescent_data(seed=default_seed): # FIXME
    Y = data['Y']
    Y[Y.flatten()==-1] = 0

-    m = GPy.models.GP_classification(data['X'], Y)
+    m = GPy.models.GPClassification(data['X'], Y)
    m.ensure_default_constraints()
    m.update_likelihood_approximation()
    m.optimize()
@ -41,7 +41,7 @@ def oil():
    Y[Y.flatten()==-1] = 0

    # Create GP model
-    m = GPy.models.GP_classification(data['X'], Y)
+    m = GPy.models.GPClassification(data['X'], Y)

    # Contrain all parameters to be positive
    m.constrain_positive('')
@ -66,7 +66,7 @@ def toy_linear_1d_classification(seed=default_seed):
    Y[Y.flatten() == -1] = 0

    # Model definition
-    m = GPy.models.GP_classification(data['X'], Y)
+    m = GPy.models.GPClassification(data['X'], Y)
    m.ensure_default_constraints()

    # Optimize
@ -95,7 +95,7 @@ def sparse_toy_linear_1d_classification(seed=default_seed):
    Y[Y.flatten() == -1] = 0

    # Model definition
-    m = GPy.models.sparse_GP_classification(data['X'], Y)
+    m = GPy.models.SparseGPClassification(data['X'], Y)
    m['.*len']= 2.

    m.ensure_default_constraints()
@ -114,7 +114,8 @@ def sparse_toy_linear_1d_classification(seed=default_seed):
    return m

 def sparse_crescent_data(inducing=10, seed=default_seed):
-    """Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
+    """
+    Run a Gaussian process classification with DTC approximation on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.

    :param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
    :param seed : seed value for data generation.
@ -127,7 +128,7 @@ def sparse_crescent_data(inducing=10, seed=default_seed):
    Y = data['Y']
    Y[Y.flatten()==-1]=0

-    m = GPy.models.sparse_GP_classification(data['X'], Y)
+    m = GPy.models.SparseGPClassification(data['X'], Y)
    m.ensure_default_constraints()
    m['.*len'] = 10.
    m.update_likelihood_approximation()
@ -135,3 +136,33 @@ def sparse_crescent_data(inducing=10, seed=default_seed):
    print(m)
    m.plot()
    return m
+
+def FITC_crescent_data(inducing=10, seed=default_seed):
+    """
+    Run a Gaussian process classification with FITC approximation on the crescent data. The demonstration uses EP to approximate the likelihood.
+
+    :param inducing : number of inducing variables. NOTE: currently unused, it is not forwarded to the model.
+    :type inducing: int
+    :param seed : seed value for data generation.
+    :type seed: int
+    :returns: the model after optimization
+    """
+
+    data = GPy.util.datasets.crescent_data(seed=seed)
+    Y = data['Y']
+    # relabel the -1 class as 0
+    Y[Y.flatten()==-1]=0
+
+    m = GPy.models.FITCClassification(data['X'], Y)
+    m.ensure_default_constraints()
+    m['.*len'] = 3.
+    m.update_likelihood_approximation()
+    m.optimize()
+    print(m)
+    m.plot()
+    return m
+
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@ -5,29 +5,28 @@ import numpy as np
 from matplotlib import pyplot as plt

 import GPy
-from GPy.models.Bayesian_GPLVM import Bayesian_GPLVM
-from GPy.util.datasets import swiss_roll_generated
 from GPy.core.transformations import logexp
+from GPy.models.bayesian_gplvm import BayesianGPLVM

 default_seed = np.random.seed(123344)

 def BGPLVM(seed=default_seed):
    N = 10
-    M = 3
+    num_inducing = 3
    Q = 2
    D = 4
    # generate GPLVM-like data
    X = np.random.rand(N, Q)
    k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
    K = k.K(X)
-    Y = np.random.multivariate_normal(np.zeros(N), K, D).T
+    Y = np.random.multivariate_normal(np.zeros(N), K, Q).T

    k = GPy.kern.rbf(Q, ARD=True) + GPy.kern.linear(Q, ARD=True) + GPy.kern.rbf(Q, ARD=True) + GPy.kern.white(Q)
    # k = GPy.kern.rbf(Q) + GPy.kern.rbf(Q) + GPy.kern.white(Q)
    # k = GPy.kern.rbf(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
    # k = GPy.kern.rbf(Q, ARD = False)  + GPy.kern.white(Q, 0.00001)

-    m = GPy.models.Bayesian_GPLVM(Y, Q, kernel=k, M=M)
+    m = GPy.models.BayesianGPLVM(Y, Q, kernel=k, num_inducing=num_inducing)
    m.constrain_positive('(rbf|bias|noise|white|S)')
    # m.constrain_fixed('S', 1)

@ -63,8 +62,8 @@ def GPLVM_oil_100(optimize=True):
    m.plot_latent(labels=m.data_labels)
    return m

-def swiss_roll(optimize=True, N=1000, M=15, Q=4, sigma=.2, plot=False):
-    from GPy.util.datasets import swiss_roll
+def swiss_roll(optimize=True, N=1000, num_inducing=15, Q=4, sigma=.2, plot=False):
+    from GPy.util.datasets import swiss_roll_generated
    from GPy.core.transformations import logexp_clipped

    data = swiss_roll_generated(N=N, sigma=sigma)
@ -101,24 +100,24 @@ def swiss_roll(optimize=True, N=1000, M=15, Q=4, sigma=.2, plot=False):
    S = (var * np.ones_like(X) + np.clip(np.random.randn(N, Q) * var ** 2,
                                         - (1 - var),
                                         (1 - var))) + .001
-    Z = np.random.permutation(X)[:M]
+    Z = np.random.permutation(X)[:num_inducing]

    kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2)) + GPy.kern.white(Q, np.exp(-2))

-    m = Bayesian_GPLVM(Y, Q, X=X, X_variance=S, M=M, Z=Z, kernel=kernel)
+    m = BayesianGPLVM(Y, Q, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel)
    m.data_colors = c
    m.data_t = t

-    m.constrain('variance|length', logexp_clipped())
-    m['lengthscale'] = 1. # X.var(0).max() / X.var(0)
-    m['noise'] = Y.var() / 100.
    m.ensure_default_constraints()
+    m['rbf_lengthscale'] = 1. # X.var(0).max() / X.var(0)
+    m['noise_variance'] = Y.var() / 100.
+    m['bias_variance'] = 0.05

    if optimize:
        m.optimize('scg', messages=1)
    return m

-def BGPLVM_oil(optimize=True, N=100, Q=5, M=25, max_f_eval=4e3, plot=False, **k):
+def BGPLVM_oil(optimize=True, N=100, Q=5, num_inducing=25, max_f_eval=4e3, plot=False, **k):
    np.random.seed(0)
    data = GPy.util.datasets.oil()
    from GPy.core.transformations import logexp_clipped
@ -129,7 +128,7 @@ def BGPLVM_oil(optimize=True, N=100, Q=5, M=25, max_f_eval=4e3, plot=False, **k)
    Yn = Y - Y.mean(0)
    Yn /= Yn.std(0)

-    m = GPy.models.Bayesian_GPLVM(Yn, Q, kernel=kernel, M=M, **k)
+    m = GPy.models.BayesianGPLVM(Yn, Q, kernel=kernel, num_inducing=num_inducing, **k)
    m.data_labels = data['Y'][:N].argmax(axis=1)

    # m.constrain('variance|leng', logexp_clipped())
@ -168,7 +167,7 @@ def oil_100():



-def _simulate_sincos(D1, D2, D3, N, M, Q, plot_sim=False):
+def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
    x = np.linspace(0, 4 * np.pi, N)[:, None]
    s1 = np.vectorize(lambda x: np.sin(x))
    s2 = np.vectorize(lambda x: np.cos(x))
@ -228,13 +227,13 @@ def bgplvm_simulation_matlab_compare():
    Y = sim_data['Y']
    S = sim_data['S']
    mu = sim_data['mu']
-    M, [_, Q] = 3, mu.shape
+    num_inducing, [_, Q] = 3, mu.shape

    from GPy.models import mrd
    from GPy import kern
    reload(mrd); reload(kern)
    k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2))
-    m = Bayesian_GPLVM(Y, Q, init="PCA", M=M, kernel=k,
+    m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k,
 #                        X=mu,
 #                        X_variance=S,
                       _debug=False)
@ -248,8 +247,8 @@ def bgplvm_simulation(optimize='scg',
                      plot=True,
                      max_f_eval=2e4):
 #     from GPy.core.transformations import logexp_clipped
-    D1, D2, D3, N, M, Q = 15, 8, 8, 100, 3, 5
-    slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, M, Q, plot)
+    D1, D2, D3, N, num_inducing, Q = 15, 8, 8, 100, 3, 5
+    slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot)

    from GPy.models import mrd
    from GPy import kern
@ -259,7 +258,7 @@ def bgplvm_simulation(optimize='scg',
    Y = Ylist[0]

    k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2)) # + kern.bias(Q)
-    m = Bayesian_GPLVM(Y, Q, init="PCA", M=M, kernel=k, _debug=True)
+    m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k, _debug=True)
    # m.constrain('variance|noise', logexp_clipped())
    m.ensure_default_constraints()
    m['noise'] = Y.var() / 100.
@ -276,8 +275,8 @@ def bgplvm_simulation(optimize='scg',
    return m

 def mrd_simulation(optimize=True, plot=True, plot_sim=True, **kw):
-    D1, D2, D3, N, M, Q = 150, 200, 400, 500, 3, 7
-    slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, M, Q, plot_sim)
+    D1, D2, D3, N, num_inducing, Q = 150, 200, 400, 500, 3, 7
+    slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)

    from GPy.models import mrd
    from GPy import kern
@ -285,7 +284,7 @@ def mrd_simulation(optimize=True, plot=True, plot_sim=True, **kw):
    reload(mrd); reload(kern)

    k = kern.linear(Q, [.05] * Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2))
-    m = mrd.MRD(Ylist, Q=Q, M=M, kernels=k, initx="", initz='permute', **kw)
+    m = mrd.MRD(Ylist, input_dim=Q, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw)

    for i, Y in enumerate(Ylist):
        m['{}_noise'.format(i + 1)] = Y.var() / 100.
@ -297,7 +296,7 @@ def mrd_simulation(optimize=True, plot=True, plot_sim=True, **kw):

    if optimize:
        print "Optimizing Model:"
-        m.optimize('scg', messages=1, max_iters=5e4, max_f_eval=5e4)
+        m.optimize('scg', messages=1, max_iters=5e4, max_f_eval=5e4, gtol=.05)
    if plot:
        m.plot_X_1d("MRD Latent Space 1D")
        m.plot_scales("MRD Scales")
@ -313,7 +312,7 @@ def brendan_faces():
    Yn /= Yn.std()

    m = GPy.models.GPLVM(Yn, Q)
-    # m = GPy.models.Bayesian_GPLVM(Yn, Q, M=100)
+    # m = GPy.models.BayesianGPLVM(Yn, Q, num_inducing=100)

    # optimize
    m.constrain('rbf|noise|white', GPy.core.transformations.logexp_clipped())
@ -377,16 +376,16 @@ def cmu_mocap(subject='35', motion=['01'], in_place=True):
 #     X /= X.std(axis=0)
 #
 #     Q = 10
-#     M = 30
+#     num_inducing = 30
 #
 #     kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q)
-#     m = GPy.models.Bayesian_GPLVM(X, Q, kernel=kernel, M=M)
+#     m = GPy.models.BayesianGPLVM(X, Q, kernel=kernel, num_inducing=num_inducing)
 #     # m.scale_factor = 100.0
 #     m.constrain_positive('(white|noise|bias|X_variance|rbf_variance|rbf_length)')
 #     from sklearn import cluster
-#     km = cluster.KMeans(M, verbose=10)
+#     km = cluster.KMeans(num_inducing, verbose=10)
 #     Z = km.fit(m.X).cluster_centers_
-#     # Z = GPy.util.misc.kmm_init(m.X, M)
+#     # Z = GPy.util.misc.kmm_init(m.X, num_inducing)
 #     m.set('iip', Z)
 #     m.set('bias', 1e-4)
 #     # optimize
--- a/GPy/examples/non_Gaussian.py
+++ b/GPy/examples/non_Gaussian.py
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@ -10,71 +10,71 @@ import numpy as np
 import GPy


-def toy_rbf_1d(max_nb_eval_optim=100):
+def toy_rbf_1d(optimizer='tnc', max_nb_eval_optim=100):
    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
    data = GPy.util.datasets.toy_rbf_1d()

-    # create simple GP model
-    m = GPy.models.GP_regression(data['X'],data['Y'])
+    # create simple GP Model
+    m = GPy.models.GPRegression(data['X'],data['Y'])

    # optimize
    m.ensure_default_constraints()
-    m.optimize(max_f_eval=max_nb_eval_optim)
+    m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
    # plot
    m.plot()
    print(m)
    return m

-def rogers_girolami_olympics(max_nb_eval_optim=100):
+def rogers_girolami_olympics(optim_iters=100):
    """Run a standard Gaussian process regression on the Rogers and Girolami olympics data."""
    data = GPy.util.datasets.rogers_girolami_olympics()

-    # create simple GP model
-    m = GPy.models.GP_regression(data['X'],data['Y'])
+    # create simple GP Model
+    m = GPy.models.GPRegression(data['X'],data['Y'])

    #set the lengthscale to be something sensible (defaults to 1)
    m['rbf_lengthscale'] = 10

    # optimize
    m.ensure_default_constraints()
-    m.optimize(max_f_eval=max_nb_eval_optim)
+    m.optimize(max_f_eval=optim_iters)

    # plot
    m.plot(plot_limits = (1850, 2050))
    print(m)
    return m

-def toy_rbf_1d_50(max_nb_eval_optim=100):
+def toy_rbf_1d_50(optim_iters=100):
    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
    data = GPy.util.datasets.toy_rbf_1d_50()

-    # create simple GP model
-    m = GPy.models.GP_regression(data['X'],data['Y'])
+    # create simple GP Model
+    m = GPy.models.GPRegression(data['X'],data['Y'])

    # optimize
    m.ensure_default_constraints()
-    m.optimize(max_f_eval=max_nb_eval_optim)
+    m.optimize(max_f_eval=optim_iters)

    # plot
    m.plot()
    print(m)
    return m

-def silhouette(max_nb_eval_optim=100):
+def silhouette(optim_iters=100):
    """Predict the pose of a figure given a silhouette. This is a task from Agarwal and Triggs 2004 ICML paper."""
    data = GPy.util.datasets.silhouette()

-    # create simple GP model
-    m = GPy.models.GP_regression(data['X'],data['Y'])
+    # create simple GP Model
+    m = GPy.models.GPRegression(data['X'],data['Y'])

    # optimize
    m.ensure_default_constraints()
-    m.optimize(messages=True,max_f_eval=max_nb_eval_optim)
+    m.optimize(messages=True,max_f_eval=optim_iters)

    print(m)
    return m

-def coregionalisation_toy2(max_nb_eval_optim=100):
+def coregionalisation_toy2(optim_iters=100):
    """
    A simple demonstration of coregionalisation on two sinusoidal functions.
    """
@ -87,13 +87,13 @@ def coregionalisation_toy2(max_nb_eval_optim=100):
    Y = np.vstack((Y1,Y2))

    k1 = GPy.kern.rbf(1) + GPy.kern.bias(1)
-    k2 = GPy.kern.coregionalise(2,1)
+    k2 = GPy.kern.Coregionalise(2,1)
    k = k1.prod(k2,tensor=True)
-    m = GPy.models.GP_regression(X,Y,kernel=k)
+    m = GPy.models.GPRegression(X,Y,kernel=k)
    m.constrain_fixed('.*rbf_var',1.)
    #m.constrain_positive('.*kappa')
    m.ensure_default_constraints()
-    m.optimize('sim',messages=1,max_f_eval=max_nb_eval_optim)
+    m.optimize('sim',messages=1,max_f_eval=optim_iters)

    pb.figure()
    Xtest1 = np.hstack((np.linspace(0,9,100)[:,None],np.zeros((100,1))))
@ -106,7 +106,7 @@ def coregionalisation_toy2(max_nb_eval_optim=100):
    pb.plot(X2[:,0],Y2[:,0],'gx',mew=2)
    return m

-def coregionalisation_toy(max_nb_eval_optim=100):
+def coregionalisation_toy(optim_iters=100):
    """
    A simple demonstration of coregionalisation on two sinusoidal functions.
    """
@ -119,13 +119,13 @@ def coregionalisation_toy(max_nb_eval_optim=100):
    Y = np.vstack((Y1,Y2))

    k1 = GPy.kern.rbf(1)
-    k2 = GPy.kern.coregionalise(2,2)
+    k2 = GPy.kern.Coregionalise(2,2)
    k = k1.prod(k2,tensor=True)
-    m = GPy.models.GP_regression(X,Y,kernel=k)
+    m = GPy.models.GPRegression(X,Y,kernel=k)
    m.constrain_fixed('.*rbf_var',1.)
    #m.constrain_positive('kappa')
    m.ensure_default_constraints()
-    m.optimize(max_f_eval=max_nb_eval_optim)
+    m.optimize(max_f_eval=optim_iters)

    pb.figure()
    Xtest1 = np.hstack((np.linspace(0,9,100)[:,None],np.zeros((100,1))))
@ -139,7 +139,7 @@ def coregionalisation_toy(max_nb_eval_optim=100):
    return m


-def coregionalisation_sparse(max_nb_eval_optim=100):
+def coregionalisation_sparse(optim_iters=100):
    """
    A simple demonstration of coregionalisation on two sinusoidal functions using sparse approximations.
    """
@ -151,21 +151,21 @@ def coregionalisation_sparse(max_nb_eval_optim=100):
    Y2 = -np.sin(X2) + np.random.randn(*X2.shape)*0.05
    Y = np.vstack((Y1,Y2))

-    M = 40
-    Z = np.hstack((np.random.rand(M,1)*8,np.random.randint(0,2,M)[:,None]))
+    num_inducing = 40
+    Z = np.hstack((np.random.rand(num_inducing,1)*8,np.random.randint(0,2,num_inducing)[:,None]))

    k1 = GPy.kern.rbf(1)
-    k2 = GPy.kern.coregionalise(2,2)
+    k2 = GPy.kern.Coregionalise(2,2)
    k = k1.prod(k2,tensor=True) + GPy.kern.white(2,0.001)

-    m = GPy.models.sparse_GP_regression(X,Y,kernel=k,Z=Z)
-    m.scale_factor = 10000.
+    m = GPy.models.SparseGPRegression(X,Y,kernel=k,Z=Z)
    m.constrain_fixed('.*rbf_var',1.)
-    #m.constrain_positive('kappa')
    m.constrain_fixed('iip')
+    m.constrain_bounded('noise_variance',1e-3,1e-1)
    m.ensure_default_constraints()
-    m.optimize_restarts(5, robust=True, messages=1, max_f_eval=max_nb_eval_optim)
+    m.optimize_restarts(5, robust=True, messages=1, max_f_eval=optim_iters)

+    #plotting:
    pb.figure()
    Xtest1 = np.hstack((np.linspace(0,9,100)[:,None],np.zeros((100,1))))
    Xtest2 = np.hstack((np.linspace(0,9,100)[:,None],np.ones((100,1))))
@ -181,7 +181,7 @@ def coregionalisation_sparse(max_nb_eval_optim=100):
    return m


-def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000, max_nb_eval_optim=100):
+def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000, optim_iters=300):
    """Show an example of a multimodal error surface for Gaussian process regression. Gene 939 has bimodal behaviour where the noisey mode is higher."""

    # Contour over a range of length scales and signal/noise ratios.
@ -197,7 +197,7 @@ def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000
    data['Y'] = data['Y'] - np.mean(data['Y'])

    lls = GPy.examples.regression._contour_data(data, length_scales, log_SNRs, GPy.kern.rbf)
-    pb.contour(length_scales, log_SNRs, np.exp(lls), 20)
+    pb.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=pb.cm.jet)
    ax = pb.gca()
    pb.xlabel('length scale')
    pb.ylabel('log_10 SNR')
@ -211,18 +211,20 @@ def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000
    optim_point_y = np.empty(2)
    np.random.seed(seed=seed)
    for i in range(0, model_restarts):
-        kern = GPy.kern.rbf(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.)) + GPy.kern.white(1,variance=np.random.exponential(1.))
+        #kern = GPy.kern.rbf(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.))
+        kern = GPy.kern.rbf(1, variance=np.random.uniform(1e-3,1), lengthscale=np.random.uniform(5,50))

-        m = GPy.models.GP_regression(data['X'],data['Y'], kernel=kern)
-        optim_point_x[0] = m.get('rbf_lengthscale')
-        optim_point_y[0] = np.log10(m.get('rbf_variance')) - np.log10(m.get('white_variance'));
+        m = GPy.models.GPRegression(data['X'],data['Y'], kernel=kern)
+        m['noise_variance'] = np.random.uniform(1e-3,1)
+        optim_point_x[0] = m['rbf_lengthscale']
+        optim_point_y[0] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);

        # optimize
        m.ensure_default_constraints()
-        m.optimize(xtol=1e-6, ftol=1e-6, max_f_eval=max_nb_eval_optim)
+        m.optimize('scg', xtol=1e-6, ftol=1e-6, max_f_eval=optim_iters)

-        optim_point_x[1] = m.get('rbf_lengthscale')
-        optim_point_y[1] = np.log10(m.get('rbf_variance')) - np.log10(m.get('white_variance'));
+        optim_point_x[1] = m['rbf_lengthscale']
+        optim_point_y[1] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);

        pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1]-optim_point_x[0], optim_point_y[1]-optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k')
        models.append(m)
@ -231,42 +233,35 @@ def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000
    ax.set_ylim(ylim)
    return (models, lls)

-def _contour_data(data, length_scales, log_SNRs, signal_kernel_call=GPy.kern.rbf):
+def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf):
    """Evaluate the GP objective function for a given data set for a range of signal to noise ratios and a range of lengthscales.

    :data_set: A data set from the utils.datasets director.
    :length_scales: a list of length scales to explore for the contour plot.
    :log_SNRs: a list of base 10 logarithm signal to noise ratios to explore for the contour plot.
-    :signal_kernel: a kernel to use for the 'signal' portion of the data."""
+    :kernel_call: a kernel constructor used to build the kernel for the 'signal' portion of the data."""

    lls = []
    total_var = np.var(data['Y'])
+    kernel = kernel_call(1, variance=1., lengthscale=1.)
+    Model = GPy.models.GPRegression(data['X'], data['Y'], kernel=kernel)
    for log_SNR in log_SNRs:
-        SNR = 10**log_SNR
+        SNR = 10.**log_SNR
+        noise_var = total_var/(1.+SNR)
+        signal_var = total_var - noise_var
+        Model.kern['.*variance'] = signal_var
+        Model['noise_variance'] = noise_var
        length_scale_lls = []
+
        for length_scale in length_scales:
-            noise_var = 1.
-            signal_var = SNR
-            noise_var = noise_var/(noise_var + signal_var)*total_var
-            signal_var = signal_var/(noise_var + signal_var)*total_var
+            Model['.*lengthscale'] = length_scale
+            length_scale_lls.append(Model.log_likelihood())

-            signal_kernel = signal_kernel_call(1, variance=signal_var, lengthscale=length_scale)
-            noise_kernel = GPy.kern.white(1, variance=noise_var)
-            kernel = signal_kernel + noise_kernel
-            K = kernel.K(data['X'])
-            total_var = (np.dot(np.dot(data['Y'].T,GPy.util.linalg.pdinv(K)[0]), data['Y'])/data['Y'].shape[0])[0,0]
-            noise_var *= total_var
-            signal_var *= total_var
-
-            kernel = signal_kernel_call(1, variance=signal_var, lengthscale=length_scale) + GPy.kern.white(1, variance=noise_var)
-
-            model = GPy.models.GP_regression(data['X'], data['Y'], kernel=kernel)
-            model.constrain_positive('')
-            length_scale_lls.append(model.log_likelihood())
        lls.append(length_scale_lls)
+
    return np.array(lls)

-def sparse_GP_regression_1D(N = 400, M = 5, max_nb_eval_optim=100):
+def sparse_GP_regression_1D(N = 400, num_inducing = 5, optim_iters=100):
    """Run a 1D example of a sparse GP regression."""
    # sample inputs and outputs
    X = np.random.uniform(-3.,3.,(N,1))
@ -275,17 +270,17 @@ def sparse_GP_regression_1D(N = 400, M = 5, max_nb_eval_optim=100):
    rbf =  GPy.kern.rbf(1)
    noise = GPy.kern.white(1)
    kernel = rbf + noise
-    # create simple GP model
-    m = GPy.models.sparse_GP_regression(X, Y, kernel, M=M)
+    # create simple GP Model
+    m = GPy.models.SparseGPRegression(X, Y, kernel, num_inducing=num_inducing)

    m.ensure_default_constraints()

    m.checkgrad(verbose=1)
-    m.optimize('tnc', messages = 1, max_f_eval=max_nb_eval_optim)
+    m.optimize('tnc', messages = 1, max_f_eval=optim_iters)
    m.plot()
    return m

-def sparse_GP_regression_2D(N = 400, M = 50, max_nb_eval_optim=100):
+def sparse_GP_regression_2D(N = 400, num_inducing = 50, optim_iters=100):
    """Run a 2D example of a sparse GP regression."""
    X = np.random.uniform(-3.,3.,(N,2))
    Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(N,1)*0.05
@ -295,8 +290,8 @@ def sparse_GP_regression_2D(N = 400, M = 50, max_nb_eval_optim=100):
    noise = GPy.kern.white(2)
    kernel = rbf + noise

-    # create simple GP model
-    m = GPy.models.sparse_GP_regression(X,Y,kernel, M = M)
+    # create simple GP Model
+    m = GPy.models.SparseGPRegression(X,Y,kernel, num_inducing = num_inducing)

    # constrain all parameters to be positive (but not inducing inputs)
    m.ensure_default_constraints()
@ -305,13 +300,12 @@ def sparse_GP_regression_2D(N = 400, M = 50, max_nb_eval_optim=100):
    m.checkgrad()

    # optimize and plot
-    pb.figure()
-    m.optimize('tnc', messages = 1, max_f_eval=max_nb_eval_optim)
+    m.optimize('tnc', messages = 1, max_f_eval=optim_iters)
    m.plot()
    print(m)
    return m

-def uncertain_inputs_sparse_regression(max_nb_eval_optim=100):
+def uncertain_inputs_sparse_regression(optim_iters=100):
    """Run a 1D example of a sparse GP regression with uncertain inputs."""
    fig, axes = pb.subplots(1,2,figsize=(12,5))

@ -324,18 +318,18 @@ def uncertain_inputs_sparse_regression(max_nb_eval_optim=100):

    k = GPy.kern.rbf(1) + GPy.kern.white(1)

-    # create simple GP model - no input uncertainty on this one
-    m = GPy.models.sparse_GP_regression(X, Y, kernel=k, Z=Z)
+    # create simple GP Model - no input uncertainty on this one
+    m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z)
    m.ensure_default_constraints()
-    m.optimize('scg', messages=1, max_f_eval=max_nb_eval_optim)
+    m.optimize('scg', messages=1, max_f_eval=optim_iters)
    m.plot(ax=axes[0])
    axes[0].set_title('no input uncertainty')


-    #the same model with uncertainty
-    m = GPy.models.sparse_GP_regression(X, Y, kernel=k, Z=Z, X_variance=S)
+    #the same Model with uncertainty
+    m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z, X_variance=S)
    m.ensure_default_constraints()
-    m.optimize('scg', messages=1, max_f_eval=max_nb_eval_optim)
+    m.optimize('scg', messages=1, max_f_eval=optim_iters)
    m.plot(ax=axes[1])
    axes[1].set_title('with input uncertainty')
    print(m)
--- a/GPy/examples/tutorials.py
+++ b/GPy/examples/tutorials.py
@ -19,7 +19,7 @@ def tuto_GP_regression():

    kernel = GPy.kern.rbf(input_dim=1, variance=1., lengthscale=1.)

-    m = GPy.models.GP_regression(X,Y,kernel)
+    m = GPy.models.GPRegression(X, Y, kernel)

    print m
    m.plot()
@ -46,7 +46,7 @@ def tuto_GP_regression():
    ker = GPy.kern.Matern52(2,ARD=True) + GPy.kern.white(2)

    # create simple GP model
-    m = GPy.models.GP_regression(X,Y,ker)
+    m = GPy.models.GPRegression(X, Y, ker)

    # constrain all parameters to be positive
    m.constrain_positive('')
@ -114,7 +114,8 @@ def tuto_kernel_overview():
    Y = 0.5*X[:,:1] + 0.5*X[:,1:] + 2*np.sin(X[:,:1]) * np.sin(X[:,1:])

    # Create GP regression model
-    m = GPy.models.GP_regression(X,Y,Kanova)
+    m = GPy.models.GPRegression(X, Y, Kanova)
+    pb.figure(figsize=(5,5))
    m.plot()
   
    pb.figure(figsize=(20,3))
@ -140,5 +145,5 @@ def model_interaction():
    X = np.random.randn(20,1)
    Y = np.sin(X) + np.random.randn(*X.shape)*0.01 + 5.
    k = GPy.kern.rbf(1) + GPy.kern.bias(1)
-    return GPy.models.GP_regression(X,Y,kernel=k)
+    return GPy.models.GPRegression(X, Y, kernel=k)

--- a/GPy/inference/optimization.py
+++ b/GPy/inference/optimization.py
@ -1,18 +1,16 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-import pdb
 import pylab as pb
 import datetime as dt
 from scipy import optimize
-import numpy as np

 try:
    import rasmussens_minimize as rasm
    rasm_available = True
 except ImportError:
    rasm_available = False
-from SCG import SCG
+from scg import SCG

 class Optimizer():
    """
@ -51,9 +49,9 @@ class Optimizer():
        start = dt.datetime.now()
        self.opt(**kwargs)
        end = dt.datetime.now()
-        self.time = str(end-start)
+        self.time = str(end - start)

-    def opt(self, f_fp = None, f = None, fp = None):
+    def opt(self, f_fp=None, f=None, fp=None):
        raise NotImplementedError, "this needs to be implemented to use the optimizer class"

    def plot(self):
@ -78,7 +76,7 @@ class opt_tnc(Optimizer):
        Optimizer.__init__(self, *args, **kwargs)
        self.opt_name = "TNC (Scipy implementation)"

-    def opt(self, f_fp = None, f = None, fp = None):
+    def opt(self, f_fp=None, f=None, fp=None):
        """
        Run the TNC optimizer

@ -96,8 +94,8 @@ class opt_tnc(Optimizer):
        if self.gtol is not None:
            opt_dict['pgtol'] = self.gtol

-        opt_result = optimize.fmin_tnc(f_fp, self.x_init, messages = self.messages,
-                       maxfun = self.max_f_eval, **opt_dict)
+        opt_result = optimize.fmin_tnc(f_fp, self.x_init, messages=self.messages,
+                       maxfun=self.max_f_eval, **opt_dict)
        self.x_opt = opt_result[0]
        self.f_opt = f_fp(self.x_opt)[0]
        self.funct_eval = opt_result[1]
@ -108,7 +106,7 @@ class opt_lbfgsb(Optimizer):
        Optimizer.__init__(self, *args, **kwargs)
        self.opt_name = "L-BFGS-B (Scipy implementation)"

-    def opt(self, f_fp = None, f = None, fp = None):
+    def opt(self, f_fp=None, f=None, fp=None):
        """
        Run the optimizer

@ -130,8 +128,8 @@ class opt_lbfgsb(Optimizer):
        if self.gtol is not None:
            opt_dict['pgtol'] = self.gtol

-        opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint = iprint,
-                                            maxfun = self.max_f_eval, **opt_dict)
+        opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint=iprint,
+                                            maxfun=self.max_f_eval, **opt_dict)
        self.x_opt = opt_result[0]
        self.f_opt = f_fp(self.x_opt)[0]
        self.funct_eval = opt_result[2]['funcalls']
@ -142,12 +140,12 @@ class opt_simplex(Optimizer):
        Optimizer.__init__(self, *args, **kwargs)
        self.opt_name = "Nelder-Mead simplex routine (via Scipy)"

-    def opt(self, f_fp = None, f = None, fp = None):
+    def opt(self, f_fp=None, f=None, fp=None):
        """
        The simplex optimizer does not require gradients.
        """

-        statuses = ['Converged', 'Maximum number of function evaluations made','Maximum number of iterations reached']
+        statuses = ['Converged', 'Maximum number of function evaluations made', 'Maximum number of iterations reached']

        opt_dict = {}
        if self.xtol is not None:
@ -157,8 +155,8 @@ class opt_simplex(Optimizer):
        if self.gtol is not None:
            print "WARNING: simplex doesn't have an gtol arg, so I'm going to ignore it"

-        opt_result = optimize.fmin(f, self.x_init, (), disp = self.messages,
-                   maxfun = self.max_f_eval, full_output=True, **opt_dict)
+        opt_result = optimize.fmin(f, self.x_init, (), disp=self.messages,
+                   maxfun=self.max_f_eval, full_output=True, **opt_dict)

        self.x_opt = opt_result[0]
        self.f_opt = opt_result[1]
@ -172,7 +170,7 @@ class opt_rasm(Optimizer):
        Optimizer.__init__(self, *args, **kwargs)
        self.opt_name = "Rasmussen's Conjugate Gradient"

-    def opt(self, f_fp = None, f = None, fp = None):
+    def opt(self, f_fp=None, f=None, fp=None):
        """
        Run Rasmussen's Conjugate Gradient optimizer
        """
@ -189,8 +187,8 @@ class opt_rasm(Optimizer):
        if self.gtol is not None:
            print "WARNING: minimize doesn't have an gtol arg, so I'm going to ignore it"

-        opt_result = rasm.minimize(self.x_init, f_fp, (), messages = self.messages,
-                                   maxnumfuneval = self.max_f_eval)
+        opt_result = rasm.minimize(self.x_init, f_fp, (), messages=self.messages,
+                                   maxnumfuneval=self.max_f_eval)
        self.x_opt = opt_result[0]
        self.f_opt = opt_result[1][-1]
        self.funct_eval = opt_result[2]
@ -203,7 +201,7 @@ class opt_SCG(Optimizer):
        Optimizer.__init__(self, *args, **kwargs)
        self.opt_name = "Scaled Conjugate Gradients"

-    def opt(self, f_fp = None, f = None, fp = None):
+    def opt(self, f_fp=None, f=None, fp=None):
        assert not f is None
        assert not fp is None
        opt_result = SCG(f, fp, self.x_init, display=self.messages,
@ -218,7 +216,7 @@ class opt_SCG(Optimizer):
        self.status = opt_result[3]

 def get_optimizer(f_min):
-    from SGD import opt_SGD
+    from sgd import opt_SGD

    optimizers = {'fmin_tnc': opt_tnc,
          'simplex': opt_simplex,
--- a/GPy/inference/scg.py
+++ b/GPy/inference/scg.py
--- a/GPy/inference/sgd.py
+++ b/GPy/inference/sgd.py
@ -11,17 +11,17 @@ class opt_SGD(Optimizer):
    Optimize using stochastic gradient descent.

    *** Parameters ***
-    model: reference to the model object
+    Model: reference to the Model object
    iterations: number of iterations
    learning_rate: learning rate
    momentum: momentum

    """

-    def __init__(self, start, iterations = 10, learning_rate = 1e-4, momentum = 0.9, model = None, messages = False, batch_size = 1, self_paced = False, center = True, iteration_file = None, learning_rate_adaptation=None, actual_iter=None, schedule=None, **kwargs):
+    def __init__(self, start, iterations = 10, learning_rate = 1e-4, momentum = 0.9, Model = None, messages = False, batch_size = 1, self_paced = False, center = True, iteration_file = None, learning_rate_adaptation=None, actual_iter=None, schedule=None, **kwargs):
        self.opt_name = "Stochastic Gradient Descent"

-        self.model = model
+        self.Model = Model
        self.iterations = iterations
        self.momentum = momentum
        self.learning_rate = learning_rate
@ -42,17 +42,17 @@ class opt_SGD(Optimizer):
                self.learning_rate_0 = self.learning_rate.mean()

        self.schedule = schedule
-        # if len([p for p in self.model.kern.parts if p.name == 'bias']) == 1:
+        # if len([p for p in self.Model.kern.parts if p.name == 'bias']) == 1:
        #     self.param_traces.append(('bias',[]))
-        # if len([p for p in self.model.kern.parts if p.name == 'linear']) == 1:
+        # if len([p for p in self.Model.kern.parts if p.name == 'linear']) == 1:
        #     self.param_traces.append(('linear',[]))
-        # if len([p for p in self.model.kern.parts if p.name == 'rbf']) == 1:
+        # if len([p for p in self.Model.kern.parts if p.name == 'rbf']) == 1:
        #     self.param_traces.append(('rbf_var',[]))

        self.param_traces = dict(self.param_traces)
        self.fopt_trace = []

-        num_params = len(self.model._get_params())
+        num_params = len(self.Model._get_params())
        if isinstance(self.learning_rate, float):
            self.learning_rate = np.ones((num_params,)) * self.learning_rate

@ -84,7 +84,7 @@ class opt_SGD(Optimizer):
        return (np.isnan(data).sum(axis=1) == 0)

    def check_for_missing(self, data):
-        if sp.sparse.issparse(self.model.likelihood.Y):
+        if sp.sparse.issparse(self.Model.likelihood.Y):
            return True
        else:
            return np.isnan(data).sum() > 0
@ -107,32 +107,32 @@ class opt_SGD(Optimizer):

    def shift_constraints(self, j):

-        constrained_indices = copy.deepcopy(self.model.constrained_indices)
+        constrained_indices = copy.deepcopy(self.Model.constrained_indices)

        for c, constraint in enumerate(constrained_indices):
            mask = (np.ones_like(constrained_indices[c]) == 1)
            for i in range(len(constrained_indices[c])):
                pos = np.where(j == constrained_indices[c][i])[0]
                if len(pos) == 1:
-                    self.model.constrained_indices[c][i] = pos
+                    self.Model.constrained_indices[c][i] = pos
                else:
                    mask[i] = False

-            self.model.constrained_indices[c] = self.model.constrained_indices[c][mask]
+            self.Model.constrained_indices[c] = self.Model.constrained_indices[c][mask]
        return constrained_indices
        # back them up
-        # bounded_i = copy.deepcopy(self.model.constrained_bounded_indices)
-        # bounded_l = copy.deepcopy(self.model.constrained_bounded_lowers)
-        # bounded_u = copy.deepcopy(self.model.constrained_bounded_uppers)
+        # bounded_i = copy.deepcopy(self.Model.constrained_bounded_indices)
+        # bounded_l = copy.deepcopy(self.Model.constrained_bounded_lowers)
+        # bounded_u = copy.deepcopy(self.Model.constrained_bounded_uppers)

        # for b in range(len(bounded_i)): # for each group of constraints
        #     for bc in range(len(bounded_i[b])):
        #         pos = np.where(j == bounded_i[b][bc])[0]
        #         if len(pos) == 1:
-        #             pos2 = np.where(self.model.constrained_bounded_indices[b] == bounded_i[b][bc])[0][0]
-        #             self.model.constrained_bounded_indices[b][pos2] = pos[0]
+        #             pos2 = np.where(self.Model.constrained_bounded_indices[b] == bounded_i[b][bc])[0][0]
+        #             self.Model.constrained_bounded_indices[b][pos2] = pos[0]
        #         else:
-        #             if len(self.model.constrained_bounded_indices[b]) == 1:
+        #             if len(self.Model.constrained_bounded_indices[b]) == 1:
        #                 # if it's the last index to be removed
        #                 # the logic here is just a mess. If we remove the last one, then all the
        #                 # b-indices change and we have to iterate through everything to find our
@ -140,35 +140,35 @@ class opt_SGD(Optimizer):
        #                 raise NotImplementedError

        #             else: # just remove it from the indices
-        #                 mask = self.model.constrained_bounded_indices[b] != bc
-        #                 self.model.constrained_bounded_indices[b] = self.model.constrained_bounded_indices[b][mask]
+        #                 mask = self.Model.constrained_bounded_indices[b] != bc
+        #                 self.Model.constrained_bounded_indices[b] = self.Model.constrained_bounded_indices[b][mask]


        # # here we shift the positive constraints. We cycle through each positive
        # # constraint
-        # positive = self.model.constrained_positive_indices.copy()
+        # positive = self.Model.constrained_positive_indices.copy()
        # mask = (np.ones_like(positive) == 1)
        # for p in range(len(positive)):
        #     # we now check whether the constrained index appears in the j vector
        #     # (the vector of the "active" indices)
-        #     pos = np.where(j == self.model.constrained_positive_indices[p])[0]
+        #     pos = np.where(j == self.Model.constrained_positive_indices[p])[0]
        #     if len(pos) == 1:
-        #         self.model.constrained_positive_indices[p] = pos
+        #         self.Model.constrained_positive_indices[p] = pos
        #     else:
        #         mask[p] = False
-        # self.model.constrained_positive_indices = self.model.constrained_positive_indices[mask]
+        # self.Model.constrained_positive_indices = self.Model.constrained_positive_indices[mask]

        # return (bounded_i, bounded_l, bounded_u), positive

    def restore_constraints(self, c):#b, p):
-        # self.model.constrained_bounded_indices = b[0]
-        # self.model.constrained_bounded_lowers = b[1]
-        # self.model.constrained_bounded_uppers = b[2]
-        # self.model.constrained_positive_indices = p
-        self.model.constrained_indices = c
+        # self.Model.constrained_bounded_indices = b[0]
+        # self.Model.constrained_bounded_lowers = b[1]
+        # self.Model.constrained_bounded_uppers = b[2]
+        # self.Model.constrained_positive_indices = p
+        self.Model.constrained_indices = c

    def get_param_shapes(self, N = None, input_dim = None):
-        model_name = self.model.__class__.__name__
+        model_name = self.Model.__class__.__name__
        if model_name == 'GPLVM':
            return [(N, input_dim)]
        if model_name == 'Bayesian_GPLVM':
@ -179,37 +179,37 @@ class opt_SGD(Optimizer):
    def step_with_missing_data(self, f_fp, X, step, shapes):
        N, input_dim = X.shape

-        if not sp.sparse.issparse(self.model.likelihood.Y):
-            Y = self.model.likelihood.Y
-            samples = self.non_null_samples(self.model.likelihood.Y)
-            self.model.N = samples.sum()
+        if not sp.sparse.issparse(self.Model.likelihood.Y):
+            Y = self.Model.likelihood.Y
+            samples = self.non_null_samples(self.Model.likelihood.Y)
+            self.Model.N = samples.sum()
            Y = Y[samples]
        else:
-            samples = self.model.likelihood.Y.nonzero()[0]
-            self.model.N = len(samples)
-            Y = np.asarray(self.model.likelihood.Y[samples].todense(), dtype = np.float64)
+            samples = self.Model.likelihood.Y.nonzero()[0]
+            self.Model.N = len(samples)
+            Y = np.asarray(self.Model.likelihood.Y[samples].todense(), dtype = np.float64)

-        if self.model.N == 0 or Y.std() == 0.0:
-            return 0, step, self.model.N
+        if self.Model.N == 0 or Y.std() == 0.0:
+            return 0, step, self.Model.N

-        self.model.likelihood._offset = Y.mean()
-        self.model.likelihood._scale = Y.std()
-        self.model.likelihood.set_data(Y)
-        # self.model.likelihood.V = self.model.likelihood.Y*self.model.likelihood.precision
+        self.Model.likelihood._offset = Y.mean()
+        self.Model.likelihood._scale = Y.std()
+        self.Model.likelihood.set_data(Y)
+        # self.Model.likelihood.V = self.Model.likelihood.Y*self.Model.likelihood.precision

-        sigma = self.model.likelihood._variance
-        self.model.likelihood._variance = None # invalidate cache
-        self.model.likelihood._set_params(sigma)
+        sigma = self.Model.likelihood._variance
+        self.Model.likelihood._variance = None # invalidate cache
+        self.Model.likelihood._set_params(sigma)


        j = self.subset_parameter_vector(self.x_opt, samples, shapes)
-        self.model.X = X[samples]
+        self.Model.X = X[samples]

-        model_name = self.model.__class__.__name__
+        model_name = self.Model.__class__.__name__

        if model_name == 'Bayesian_GPLVM':
-            self.model.likelihood.YYT = np.dot(self.model.likelihood.Y, self.model.likelihood.Y.T)
-            self.model.likelihood.trYYT = np.trace(self.model.likelihood.YYT)
+            self.Model.likelihood.YYT = np.dot(self.Model.likelihood.Y, self.Model.likelihood.Y.T)
+            self.Model.likelihood.trYYT = np.trace(self.Model.likelihood.YYT)

        ci = self.shift_constraints(j)
        f, fp = f_fp(self.x_opt[j])
@ -218,18 +218,18 @@ class opt_SGD(Optimizer):
        self.x_opt[j] -= step[j]
        self.restore_constraints(ci)

-        self.model.grads[j] = fp
+        self.Model.grads[j] = fp
        # restore likelihood _offset and _scale, otherwise when we call set_data(y) on
        # the next feature, it will get normalized with the mean and std of this one.
-        self.model.likelihood._offset = 0
-        self.model.likelihood._scale = 1
+        self.Model.likelihood._offset = 0
+        self.Model.likelihood._scale = 1

-        return f, step, self.model.N
+        return f, step, self.Model.N

    def adapt_learning_rate(self, t, D):
        if self.learning_rate_adaptation == 'adagrad':
            if t > 0:
-                g_k = self.model.grads
+                g_k = self.Model.grads
                self.s_k += np.square(g_k)
                t0 = 100.0
                self.learning_rate = 0.1/(t0 + np.sqrt(self.s_k))
@ -245,8 +245,8 @@ class opt_SGD(Optimizer):


        elif self.learning_rate_adaptation == 'semi_pesky':
-            if self.model.__class__.__name__ == 'Bayesian_GPLVM':
-                g_t = self.model.grads
+            if self.Model.__class__.__name__ == 'Bayesian_GPLVM':
+                g_t = self.Model.grads
                if t == 0:
                    self.hbar_t = 0.0
                    self.tau_t = 100.0
@ -259,28 +259,28 @@ class opt_SGD(Optimizer):


    def opt(self, f_fp=None, f=None, fp=None):
-        self.x_opt = self.model._get_params_transformed()
+        self.x_opt = self.Model._get_params_transformed()
        self.grads = []

-        X, Y = self.model.X.copy(), self.model.likelihood.Y.copy()
+        X, Y = self.Model.X.copy(), self.Model.likelihood.Y.copy()

-        self.model.likelihood.YYT = 0
-        self.model.likelihood.trYYT = 0
-        self.model.likelihood._offset = 0.0
-        self.model.likelihood._scale = 1.0
+        self.Model.likelihood.YYT = 0
+        self.Model.likelihood.trYYT = 0
+        self.Model.likelihood._offset = 0.0
+        self.Model.likelihood._scale = 1.0

-        N, input_dim = self.model.X.shape
-        D = self.model.likelihood.Y.shape[1]
-        num_params = self.model._get_params()
+        N, input_dim = self.Model.X.shape
+        D = self.Model.likelihood.Y.shape[1]
+        num_params = self.Model._get_params()
        self.trace = []
-        missing_data = self.check_for_missing(self.model.likelihood.Y)
+        missing_data = self.check_for_missing(self.Model.likelihood.Y)

        step = np.zeros_like(num_params)
        for it in range(self.iterations):
            if self.actual_iter != None:
                it = self.actual_iter

-            self.model.grads = np.zeros_like(self.x_opt) # TODO this is ugly
+            self.Model.grads = np.zeros_like(self.x_opt) # TODO this is ugly

            if it == 0 or self.self_paced is False:
                features = np.random.permutation(Y.shape[1])
@ -292,29 +292,29 @@ class opt_SGD(Optimizer):
            NLL = []
            import pylab as plt
            for count, j in enumerate(features):
-                self.model.D = len(j)
-                self.model.likelihood.D = len(j)
-                self.model.likelihood.set_data(Y[:, j])
-                # self.model.likelihood.V = self.model.likelihood.Y*self.model.likelihood.precision
+                self.Model.input_dim = len(j)
+                self.Model.likelihood.input_dim = len(j)
+                self.Model.likelihood.set_data(Y[:, j])
+                # self.Model.likelihood.V = self.Model.likelihood.Y*self.Model.likelihood.precision

-                sigma = self.model.likelihood._variance
-                self.model.likelihood._variance = None # invalidate cache
-                self.model.likelihood._set_params(sigma)
+                sigma = self.Model.likelihood._variance
+                self.Model.likelihood._variance = None # invalidate cache
+                self.Model.likelihood._set_params(sigma)

                if missing_data:
                    shapes = self.get_param_shapes(N, input_dim)
                    f, step, Nj = self.step_with_missing_data(f_fp, X, step, shapes)
                else:
-                    self.model.likelihood.YYT = np.dot(self.model.likelihood.Y, self.model.likelihood.Y.T)
-                    self.model.likelihood.trYYT = np.trace(self.model.likelihood.YYT)
+                    self.Model.likelihood.YYT = np.dot(self.Model.likelihood.Y, self.Model.likelihood.Y.T)
+                    self.Model.likelihood.trYYT = np.trace(self.Model.likelihood.YYT)
                    Nj = N
                    f, fp = f_fp(self.x_opt)
-                    self.model.grads = fp.copy()
+                    self.Model.grads = fp.copy()
                    step = self.momentum * step + self.learning_rate * fp
                    self.x_opt -= step

                if self.messages == 2:
-                    noise = self.model.likelihood._variance
+                    noise = self.Model.likelihood._variance
                    status = "evaluating {feature: 5d}/{tot: 5d} \t f: {f: 2.3f} \t non-missing: {nm: 4d}\t noise: {noise: 2.4f}\r".format(feature = count, tot = len(features), f = f, nm = Nj, noise = noise)
                    sys.stdout.write(status)
                    sys.stdout.flush()
@ -328,19 +328,19 @@ class opt_SGD(Optimizer):
                # plt.plot(self.param_traces['noise'])

                # for k in self.param_traces.keys():
-                #     self.param_traces[k].append(self.model.get(k)[0])
-            self.grads.append(self.model.grads.tolist())
+                #     self.param_traces[k].append(self.Model.get(k)[0])
+            self.grads.append(self.Model.grads.tolist())
            # should really be a sum(), but earlier samples in the iteration will have a very crappy ll
            self.f_opt = np.mean(NLL)
-            self.model.N = N
-            self.model.X = X
-            self.model.D = D
-            self.model.likelihood.N = N
-            self.model.likelihood.D = D
-            self.model.likelihood.Y = Y
-            sigma = self.model.likelihood._variance
-            self.model.likelihood._variance = None # invalidate cache
-            self.model.likelihood._set_params(sigma)
+            self.Model.N = N
+            self.Model.X = X
+            self.Model.input_dim = D
+            self.Model.likelihood.N = N
+            self.Model.likelihood.input_dim = D
+            self.Model.likelihood.Y = Y
+            sigma = self.Model.likelihood._variance
+            self.Model.likelihood._variance = None # invalidate cache
+            self.Model.likelihood._set_params(sigma)

            self.trace.append(self.f_opt)
            if self.iteration_file is not None:
--- a/GPy/kern/Brownian.py
+++ b/GPy/kern/Brownian.py
@ -2,26 +2,26 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np

 def theta(x):
    """Heavisdie step function"""
    return np.where(x>=0.,1.,0.)

-class Brownian(kernpart):
+class Brownian(Kernpart):
    """
    Brownian Motion kernel.

-    :param D: the number of input dimensions
-    :type D: int
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
    :param variance:
    :type variance: float
    """
-    def __init__(self,D,variance=1.):
-        self.D = D
-        assert self.D==1, "Brownian motion in 1D only"
-        self.Nparam = 1.
+    def __init__(self,input_dim,variance=1.):
+        self.input_dim = input_dim
+        assert self.input_dim==1, "Brownian motion in 1D only"
+        self.num_params = 1.
        self.name = 'Brownian'
        self._set_params(np.array([variance]).flatten())

--- a/GPy/kern/Matern32.py
+++ b/GPy/kern/Matern32.py
@ -2,22 +2,20 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np
-import hashlib
-from ..util.linalg import pdinv,mdot
 from scipy import integrate

-class Matern32(kernpart):
+class Matern32(Kernpart):
    """
    Matern 3/2 kernel:

    .. math::

-       k(r) = \\sigma^2 (1 + \\sqrt{3} r) \exp(- \sqrt{3} r) \\ \\ \\ \\  \\text{ where  } r = \sqrt{\sum_{i=1}^D \\frac{(x_i-y_i)^2}{\ell_i^2} }
+       k(r) = \\sigma^2 (1 + \\sqrt{3} r) \exp(- \sqrt{3} r) \\ \\ \\ \\  \\text{ where  } r = \sqrt{\sum_{i=1}^{\\mathrm{input\\_dim}} \\frac{(x_i-y_i)^2}{\ell_i^2} }

-    :param D: the number of input dimensions
-    :type D: int
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
    :param variance: the variance :math:`\sigma^2`
    :type variance: float
    :param lengthscale: the vector of lengthscale :math:`\ell_i`
@ -28,11 +26,11 @@ class Matern32(kernpart):

    """

-    def __init__(self,D,variance=1.,lengthscale=None,ARD=False):
-        self.D = D
+    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False):
+        self.input_dim = input_dim
        self.ARD = ARD
        if ARD == False:
-            self.Nparam = 2
+            self.num_params = 2
            self.name = 'Mat32'
            if lengthscale is not None:
                lengthscale = np.asarray(lengthscale)
@ -40,78 +38,78 @@ class Matern32(kernpart):
            else:
                lengthscale = np.ones(1)
        else:
-            self.Nparam = self.D + 1
+            self.num_params = self.input_dim + 1
            self.name = 'Mat32'
            if lengthscale is not None:
                lengthscale = np.asarray(lengthscale)
-                assert lengthscale.size == self.D, "bad number of lengthscales"
+                assert lengthscale.size == self.input_dim, "bad number of lengthscales"
            else:
-                lengthscale = np.ones(self.D)
-        self._set_params(np.hstack((variance,lengthscale.flatten())))
+                lengthscale = np.ones(self.input_dim)
+        self._set_params(np.hstack((variance, lengthscale.flatten())))

    def _get_params(self):
        """return the value of the parameters."""
-        return np.hstack((self.variance,self.lengthscale))
+        return np.hstack((self.variance, self.lengthscale))

-    def _set_params(self,x):
+    def _set_params(self, x):
        """set the value of the parameters."""
-        assert x.size == self.Nparam
+        assert x.size == self.num_params
        self.variance = x[0]
        self.lengthscale = x[1:]

    def _get_param_names(self):
        """return parameter names."""
-        if self.Nparam == 2:
-            return ['variance','lengthscale']
+        if self.num_params == 2:
+            return ['variance', 'lengthscale']
        else:
-            return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscale.size)]
+            return ['variance'] + ['lengthscale_%i' % i for i in range(self.lengthscale.size)]

-    def K(self,X,X2,target):
+    def K(self, X, X2, target):
        """Compute the covariance matrix between X and X2."""
        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
-        np.add(self.variance*(1+np.sqrt(3.)*dist)*np.exp(-np.sqrt(3.)*dist), target,target)
+        dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))
+        np.add(self.variance * (1 + np.sqrt(3.) * dist) * np.exp(-np.sqrt(3.) * dist), target, target)

-    def Kdiag(self,X,target):
+    def Kdiag(self, X, target):
        """Compute the diagonal of the covariance matrix associated to X."""
-        np.add(target,self.variance,target)
+        np.add(target, self.variance, target)

-    def dK_dtheta(self,dL_dK,X,X2,target):
+    def dK_dtheta(self, dL_dK, X, X2, target):
        """derivative of the covariance matrix with respect to the parameters."""
        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
-        dvar = (1+np.sqrt(3.)*dist)*np.exp(-np.sqrt(3.)*dist)
-        invdist = 1./np.where(dist!=0.,dist,np.inf)
-        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscale**3
-        #dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
-        target[0] += np.sum(dvar*dL_dK)
+        dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))
+        dvar = (1 + np.sqrt(3.) * dist) * np.exp(-np.sqrt(3.) * dist)
+        invdist = 1. / np.where(dist != 0., dist, np.inf)
+        dist2M = np.square(X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 3
+        # dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
+        target[0] += np.sum(dvar * dL_dK)
        if self.ARD == True:
-            dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
-            #dl = self.variance*dvar[:,:,None]*dist2M*invdist[:,:,None]
-            target[1:] += (dl*dL_dK[:,:,None]).sum(0).sum(0)
+            dl = (self.variance * 3 * dist * np.exp(-np.sqrt(3.) * dist))[:, :, np.newaxis] * dist2M * invdist[:, :, np.newaxis]
+            # dl = self.variance*dvar[:,:,None]*dist2M*invdist[:,:,None]
+            target[1:] += (dl * dL_dK[:, :, None]).sum(0).sum(0)
        else:
-            dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist)) * dist2M.sum(-1)*invdist
-            #dl = self.variance*dvar*dist2M.sum(-1)*invdist
-            target[1] += np.sum(dl*dL_dK)
+            dl = (self.variance * 3 * dist * np.exp(-np.sqrt(3.) * dist)) * dist2M.sum(-1) * invdist
+            # dl = self.variance*dvar*dist2M.sum(-1)*invdist
+            target[1] += np.sum(dl * dL_dK)

-    def dKdiag_dtheta(self,dL_dKdiag,X,target):
+    def dKdiag_dtheta(self, dL_dKdiag, X, target):
        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
        target[0] += np.sum(dL_dKdiag)

-    def dK_dX(self,dL_dK,X,X2,target):
+    def dK_dX(self, dL_dK, X, X2, target):
        """derivative of the covariance matrix with respect to X."""
        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None]
-        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
-        dK_dX = - np.transpose(3*self.variance*dist*np.exp(-np.sqrt(3)*dist)*ddist_dX,(1,0,2))
-        target += np.sum(dK_dX*dL_dK.T[:,:,None],0)
+        dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None]
+        ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
+        dK_dX = -np.transpose(3 * self.variance * dist * np.exp(-np.sqrt(3) * dist) * ddist_dX, (1, 0, 2))
+        target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)

-    def dKdiag_dX(self,dL_dKdiag,X,target):
+    def dKdiag_dX(self, dL_dKdiag, X, target):
        pass

-    def Gram_matrix(self,F,F1,F2,lower,upper):
+    def Gram_matrix(self, F, F1, F2, lower, upper):
        """
-        Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to D=1.
+        Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to input_dim=1.

        :param F: vector of functions
        :type F: np.array
@ -122,16 +120,16 @@ class Matern32(kernpart):
        :param lower,upper: boundaries of the input domain
        :type lower,upper: floats
        """
-        assert self.D == 1
-        def L(x,i):
-            return(3./self.lengthscale**2*F[i](x) + 2*np.sqrt(3)/self.lengthscale*F1[i](x) + F2[i](x))
+        assert self.input_dim == 1
+        def L(x, i):
+            return(3. / self.lengthscale ** 2 * F[i](x) + 2 * np.sqrt(3) / self.lengthscale * F1[i](x) + F2[i](x))
        n = F.shape[0]
-        G = np.zeros((n,n))
+        G = np.zeros((n, n))
        for i in range(n):
-            for j in range(i,n):
-                G[i,j] = G[j,i] = integrate.quad(lambda x : L(x,i)*L(x,j),lower,upper)[0]
-        Flower = np.array([f(lower) for f in F])[:,None]
-        F1lower = np.array([f(lower) for f in F1])[:,None]
-        #print "OLD \n", np.dot(F1lower,F1lower.T), "\n \n"
-        #return(G)
-        return(self.lengthscale**3/(12.*np.sqrt(3)*self.variance) * G + 1./self.variance*np.dot(Flower,Flower.T) + self.lengthscale**2/(3.*self.variance)*np.dot(F1lower,F1lower.T))
+            for j in range(i, n):
+                G[i, j] = G[j, i] = integrate.quad(lambda x : L(x, i) * L(x, j), lower, upper)[0]
+        Flower = np.array([f(lower) for f in F])[:, None]
+        F1lower = np.array([f(lower) for f in F1])[:, None]
+        # print "OLD \n", np.dot(F1lower,F1lower.T), "\n \n"
+        # return(G)
+        return(self.lengthscale ** 3 / (12.*np.sqrt(3) * self.variance) * G + 1. / self.variance * np.dot(Flower, Flower.T) + self.lengthscale ** 2 / (3.*self.variance) * np.dot(F1lower, F1lower.T))
--- a/GPy/kern/Matern52.py
+++ b/GPy/kern/Matern52.py
@ -2,21 +2,21 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np
 import hashlib
 from scipy import integrate

-class Matern52(kernpart):
+class Matern52(Kernpart):
    """
    Matern 5/2 kernel:

    .. math::

-       k(r) = \sigma^2 (1 + \sqrt{5} r + \\frac53 r^2) \exp(- \sqrt{5} r) \ \ \ \ \  \\text{ where  } r = \sqrt{\sum_{i=1}^D \\frac{(x_i-y_i)^2}{\ell_i^2} }
+       k(r) = \sigma^2 (1 + \sqrt{5} r + \\frac53 r^2) \exp(- \sqrt{5} r) \ \ \ \ \  \\text{ where  } r = \sqrt{\sum_{i=1}^{\\mathrm{input\\_dim}} \\frac{(x_i-y_i)^2}{\ell_i^2} }

-    :param D: the number of input dimensions
-    :type D: int
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
    :param variance: the variance :math:`\sigma^2`
    :type variance: float
    :param lengthscale: the vector of lengthscale :math:`\ell_i`
@ -26,11 +26,11 @@ class Matern52(kernpart):
    :rtype: kernel object

    """
-    def __init__(self,D,variance=1.,lengthscale=None,ARD=False):
-        self.D = D
+    def __init__(self,input_dim,variance=1.,lengthscale=None,ARD=False):
+        self.input_dim = input_dim
        self.ARD = ARD
        if ARD == False:
-            self.Nparam = 2
+            self.num_params = 2
            self.name = 'Mat52'
            if lengthscale is not None:
                lengthscale = np.asarray(lengthscale)
@ -38,13 +38,13 @@ class Matern52(kernpart):
            else:
                lengthscale = np.ones(1)
        else:
-            self.Nparam = self.D + 1
+            self.num_params = self.input_dim + 1
            self.name = 'Mat52'
            if lengthscale is not None:
                lengthscale = np.asarray(lengthscale)
-                assert lengthscale.size == self.D, "bad number of lengthscales"
+                assert lengthscale.size == self.input_dim, "bad number of lengthscales"
            else:
-                lengthscale = np.ones(self.D)
+                lengthscale = np.ones(self.input_dim)
        self._set_params(np.hstack((variance,lengthscale.flatten())))

    def _get_params(self):
@ -53,13 +53,13 @@ class Matern52(kernpart):

    def _set_params(self,x):
        """set the value of the parameters."""
-        assert x.size == self.Nparam
+        assert x.size == self.num_params
        self.variance = x[0]
        self.lengthscale = x[1:]

    def _get_param_names(self):
        """return parameter names."""
-        if self.Nparam == 2:
+        if self.num_params == 2:
            return ['variance','lengthscale']
        else:
            return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscale.size)]
@ -109,7 +109,7 @@ class Matern52(kernpart):

    def Gram_matrix(self,F,F1,F2,F3,lower,upper):
        """
-        Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to D=1.
+        Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to input_dim=1.

        :param F: vector of functions
        :type F: np.array
@ -122,7 +122,7 @@ class Matern52(kernpart):
        :param lower,upper: boundaries of the input domain
        :type lower,upper: floats
        """
-        assert self.D == 1
+        assert self.input_dim == 1
        def L(x,i):
            return(5*np.sqrt(5)/self.lengthscale**3*F[i](x) + 15./self.lengthscale**2*F1[i](x)+ 3*np.sqrt(5)/self.lengthscale*F2[i](x) + F3[i](x))
        n = F.shape[0]
--- a/GPy/kern/init.py
+++ b/GPy/kern/init.py
@ -2,7 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from constructors import rbf, Matern32, Matern52, exponential, linear, white, bias, finite_dimensional, spline, Brownian, periodic_exponential, periodic_Matern32, periodic_Matern52, prod, symmetric, coregionalise, rational_quadratic, fixed, rbfcos, independent_outputs
+from constructors import rbf, Matern32, Matern52, exponential, linear, white, bias, finite_dimensional, spline, Brownian, periodic_exponential, periodic_Matern32, periodic_Matern52, prod, symmetric, Coregionalise, rational_quadratic, Fixed, rbfcos, IndependentOutputs
 try:
    from constructors import rbf_sympy, sympykern # these depend on sympy
 except:
--- a/GPy/kern/bias.py
+++ b/GPy/kern/bias.py
@ -2,20 +2,20 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np
 import hashlib

-class bias(kernpart):
-    def __init__(self,D,variance=1.):
+class bias(Kernpart):
+    def __init__(self,input_dim,variance=1.):
        """
-        :param D: the number of input dimensions
-        :type D: int
+        :param input_dim: the number of input dimensions
+        :type input_dim: int
        :param variance: the variance of the kernel
        :type variance: float
        """
-        self.D = D
-        self.Nparam = 1
+        self.input_dim = input_dim
+        self.num_params = 1
        self.name = 'bias'
        self._set_params(np.array([variance]).flatten())

--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@ -12,7 +12,7 @@ from exponential import exponential as exponentialpart
 from Matern32 import Matern32 as Matern32part
 from Matern52 import Matern52 as Matern52part
 from bias import bias as biaspart
-from fixed import fixed as fixedpart
+from fixed import Fixed as fixedpart
 from finite_dimensional import finite_dimensional as finite_dimensionalpart
 from spline import spline as splinepart
 from Brownian import Brownian as Brownianpart
@ -21,10 +21,10 @@ from periodic_Matern32 import periodic_Matern32 as periodic_Matern32part
 from periodic_Matern52 import periodic_Matern52 as periodic_Matern52part
 from prod import prod as prodpart
 from symmetric import symmetric as symmetric_part
-from coregionalise import coregionalise as coregionalise_part
+from coregionalise import Coregionalise as coregionalise_part
 from rational_quadratic import rational_quadratic as rational_quadraticpart
 from rbfcos import rbfcos as rbfcospart
-from independent_outputs import independent_outputs as independent_output_part
+from independent_outputs import IndependentOutputs as independent_output_part
 #TODO these s=constructors are not as clean as we'd like. Tidy the code up
 #using meta-classes to make the objects construct properly wthout them.

@ -33,8 +33,8 @@ def rbf(D,variance=1., lengthscale=None,ARD=False):
    """
    Construct an RBF kernel

-    :param D: dimensionality of the kernel, obligatory
-    :type D: int
+    :param input_dim: dimensionality of the kernel, obligatory
+    :type input_dim: int
    :param variance: the variance of the kernel
    :type variance: float
    :param lengthscale: the lengthscale of the kernel
@ -51,7 +51,7 @@ def linear(D,variances=None,ARD=False):

     Arguments
     ---------
-     D (int), obligatory
+     input_dim (int), obligatory
     variances (np.ndarray)
     ARD (boolean)
    """
@ -64,7 +64,7 @@ def white(D,variance=1.):

     Arguments
     ---------
-     D (int), obligatory
+     input_dim (int), obligatory
     variance (float)
    """
    part = whitepart(D,variance)
@ -74,8 +74,8 @@ def exponential(D,variance=1., lengthscale=None, ARD=False):
    """
    Construct an exponential kernel

-    :param D: dimensionality of the kernel, obligatory
-    :type D: int
+    :param input_dim: dimensionality of the kernel, obligatory
+    :type input_dim: int
    :param variance: the variance of the kernel
    :type variance: float
    :param lengthscale: the lengthscale of the kernel
@ -90,8 +90,8 @@ def Matern32(D,variance=1., lengthscale=None, ARD=False):
    """
     Construct a Matern 3/2 kernel.

-    :param D: dimensionality of the kernel, obligatory
-    :type D: int
+    :param input_dim: dimensionality of the kernel, obligatory
+    :type input_dim: int
    :param variance: the variance of the kernel
    :type variance: float
    :param lengthscale: the lengthscale of the kernel
@ -106,8 +106,8 @@ def Matern52(D,variance=1., lengthscale=None, ARD=False):
    """
     Construct a Matern 5/2 kernel.

-    :param D: dimensionality of the kernel, obligatory
-    :type D: int
+    :param input_dim: dimensionality of the kernel, obligatory
+    :type input_dim: int
    :param variance: the variance of the kernel
    :type variance: float
    :param lengthscale: the lengthscale of the kernel
@ -124,7 +124,7 @@ def bias(D,variance=1.):

     Arguments
     ---------
-     D (int), obligatory
+     input_dim (int), obligatory
     variance (float)
    """
    part = biaspart(D,variance)
@ -133,7 +133,7 @@ def bias(D,variance=1.):
 def finite_dimensional(D,F,G,variances=1.,weights=None):
    """
    Construct a finite dimensional kernel.
-    D: int - the number of input dimensions
+    input_dim: int - the number of input dimensions
    F: np.array of functions with shape (n,) - the n basis functions
    G: np.array with shape (n,n) - the Gram matrix associated to F
    variances : np.ndarray with shape (n,)
@ -145,8 +145,8 @@ def spline(D,variance=1.):
    """
    Construct a spline kernel.

-    :param D: Dimensionality of the kernel
-    :type D: int
+    :param input_dim: Dimensionality of the kernel
+    :type input_dim: int
    :param variance: the variance of the kernel
    :type variance: float
    """
@ -157,8 +157,8 @@ def Brownian(D,variance=1.):
    """
    Construct a Brownian motion kernel.

-    :param D: Dimensionality of the kernel
-    :type D: int
+    :param input_dim: Dimensionality of the kernel
+    :type input_dim: int
    :param variance: the variance of the kernel
    :type variance: float
    """
@ -204,8 +204,8 @@ def periodic_exponential(D=1,variance=1., lengthscale=None, period=2*np.pi,n_fre
    """
    Construct an periodic exponential kernel

-    :param D: dimensionality, only defined for D=1
-    :type D: int
+    :param input_dim: dimensionality, only defined for input_dim=1
+    :type input_dim: int
    :param variance: the variance of the kernel
    :type variance: float
    :param lengthscale: the lengthscale of the kernel
@ -222,8 +222,8 @@ def periodic_Matern32(D,variance=1., lengthscale=None, period=2*np.pi,n_freq=10,
    """
     Construct a periodic Matern 3/2 kernel.

-     :param D: dimensionality, only defined for D=1
-     :type D: int
+     :param input_dim: dimensionality, only defined for input_dim=1
+     :type input_dim: int
     :param variance: the variance of the kernel
     :type variance: float
     :param lengthscale: the lengthscale of the kernel
@ -240,8 +240,8 @@ def periodic_Matern52(D,variance=1., lengthscale=None, period=2*np.pi,n_freq=10,
    """
     Construct a periodic Matern 5/2 kernel.

-     :param D: dimensionality, only defined for D=1
-     :type D: int
+     :param input_dim: dimensionality, only defined for input_dim=1
+     :type input_dim: int
     :param variance: the variance of the kernel
     :type variance: float
     :param lengthscale: the lengthscale of the kernel
@ -256,14 +256,14 @@ def periodic_Matern52(D,variance=1., lengthscale=None, period=2*np.pi,n_freq=10,

 def prod(k1,k2,tensor=False):
    """
-     Construct a product kernel over D from two kernels over D
+     Construct a product kernel over input_dim from two kernels over input_dim

    :param k1, k2: the kernels to multiply
    :type k1, k2: kernpart
    :rtype: kernel object
    """
    part = prodpart(k1,k2,tensor)
-    return kern(part.D, [part])
+    return kern(part.input_dim, [part])

 def symmetric(k):
    """
@ -273,7 +273,7 @@ def symmetric(k):
    k_.parts = [symmetric_part(p) for p in k.parts]
    return k_

-def coregionalise(Nout,R=1, W=None, kappa=None):
+def Coregionalise(Nout,R=1, W=None, kappa=None):
    p = coregionalise_part(Nout,R,W,kappa)
    return kern(1,[p])

@ -282,8 +282,8 @@ def rational_quadratic(D,variance=1., lengthscale=1., power=1.):
    """
     Construct rational quadratic kernel.

-    :param D: the number of input dimensions
-    :type D: int (D=1 is the only value currently supported)
+    :param input_dim: the number of input dimensions
+    :type input_dim: int (input_dim=1 is the only value currently supported)
    :param variance: the variance :math:`\sigma^2`
    :type variance: float
    :param lengthscale: the lengthscale :math:`\ell`
@ -294,13 +294,13 @@ def rational_quadratic(D,variance=1., lengthscale=1., power=1.):
    part = rational_quadraticpart(D,variance, lengthscale, power)
    return kern(D, [part])

-def fixed(D, K, variance=1.):
+def Fixed(D, K, variance=1.):
    """
-     Construct a fixed effect kernel.
+     Construct a Fixed effect kernel.

     Arguments
     ---------
-     D (int), obligatory
+     input_dim (int), obligatory
     K (np.array), obligatory
     variance (float)
    """
@ -314,13 +314,13 @@ def rbfcos(D,variance=1.,frequencies=None,bandwidths=None,ARD=False):
    part = rbfcospart(D,variance,frequencies,bandwidths,ARD)
    return kern(D,[part])

-def independent_outputs(k):
+def IndependentOutputs(k):
    """
    Construct a kernel with independent outputs from an existing kernel
    """
    for sl in k.input_slices:
        assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)"
    parts = [independent_output_part(p) for p in k.parts]
-    return kern(k.D+1,parts)
+    return kern(k.input_dim+1,parts)


--- a/GPy/kern/coregionalise.py
+++ b/GPy/kern/coregionalise.py
@ -1,18 +1,18 @@
 # Copyright (c) 2012, James Hensman and Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np
 from GPy.util.linalg import mdot, pdinv
 import pdb
 from scipy import weave

-class coregionalise(kernpart):
+class Coregionalise(Kernpart):
    """
    Kernel for Intrinsic Corregionalization Models
    """
    def __init__(self,Nout,R=1, W=None, kappa=None):
-        self.D = 1
+        self.input_dim = 1
        self.name = 'coregion'
        self.Nout = Nout
        self.R = R
@ -26,14 +26,14 @@ class coregionalise(kernpart):
        else:
            assert kappa.shape==(self.Nout,)
        self.kappa = kappa
-        self.Nparam = self.Nout*(self.R + 1)
+        self.num_params = self.Nout*(self.R + 1)
        self._set_params(np.hstack([self.W.flatten(),self.kappa]))

    def _get_params(self):
        return np.hstack([self.W.flatten(),self.kappa])

    def _set_params(self,x):
-        assert x.size == self.Nparam
+        assert x.size == self.num_params
        self.kappa = x[-self.Nout:]
        self.W = x[:-self.Nout].reshape(self.Nout,self.R)
        self.B = np.dot(self.W,self.W.T) + np.diag(self.kappa)
@ -69,14 +69,14 @@ class coregionalise(kernpart):
        else:
            index2 = np.asarray(index2,dtype=np.int)
            code="""
-            for(int i=0;i<M; i++){
+            for(int i=0;i<num_inducing; i++){
              for(int j=0; j<N; j++){
-                  target[i+j*M] += B[Nout*index[j]+index2[i]];
+                  target[i+j*num_inducing] += B[Nout*index[j]+index2[i]];
                }
              }
            """
-            N,M,B,Nout = index.size,index2.size, self.B, self.Nout
-            weave.inline(code,['target','index','index2','N','M','B','Nout'])
+            N,num_inducing,B,Nout = index.size,index2.size, self.B, self.Nout
+            weave.inline(code,['target','index','index2','N','num_inducing','B','Nout'])


    def Kdiag(self,index,target):
@ -91,14 +91,14 @@ class coregionalise(kernpart):
            index2 = np.asarray(index2,dtype=np.int)

        code="""
-        for(int i=0; i<M; i++){
+        for(int i=0; i<num_inducing; i++){
          for(int j=0; j<N; j++){
-            dL_dK_small[index[j] + Nout*index2[i]] += dL_dK[i+j*M];
+            dL_dK_small[index[j] + Nout*index2[i]] += dL_dK[i+j*num_inducing];
          }
        }
        """
-        N, M, Nout = index.size, index2.size, self.Nout
-        weave.inline(code, ['N','M','Nout','dL_dK','dL_dK_small','index','index2'])
+        N, num_inducing, Nout = index.size, index2.size, self.Nout
+        weave.inline(code, ['N','num_inducing','Nout','dL_dK','dL_dK_small','index','index2'])

        dkappa = np.diag(dL_dK_small)
        dL_dK_small += dL_dK_small.T
--- a/GPy/kern/exponential.py
+++ b/GPy/kern/exponential.py
@ -2,21 +2,20 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np
-import hashlib
 from scipy import integrate

-class exponential(kernpart):
+class exponential(Kernpart):
    """
    Exponential kernel (aka Ornstein-Uhlenbeck or Matern 1/2)

    .. math::

-       k(r) = \sigma^2 \exp(- r) \ \ \ \ \  \\text{ where  } r = \sqrt{\sum_{i=1}^D \\frac{(x_i-y_i)^2}{\ell_i^2} }
+       k(r) = \sigma^2 \exp(- r) \ \ \ \ \  \\text{ where  } r = \sqrt{\sum_{i=1}^{input\_dim} \\frac{(x_i-y_i)^2}{\ell_i^2} }

-    :param D: the number of input dimensions
-    :type D: int
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
    :param variance: the variance :math:`\sigma^2`
    :type variance: float
    :param lengthscale: the vector of lengthscale :math:`\ell_i`
@ -26,11 +25,11 @@ class exponential(kernpart):
    :rtype: kernel object

    """
-    def __init__(self,D,variance=1.,lengthscale=None,ARD=False):
-        self.D = D
+    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False):
+        self.input_dim = input_dim
        self.ARD = ARD
        if ARD == False:
-            self.Nparam = 2
+            self.num_params = 2
            self.name = 'exp'
            if lengthscale is not None:
                lengthscale = np.asarray(lengthscale)
@ -38,76 +37,76 @@ class exponential(kernpart):
            else:
                lengthscale = np.ones(1)
        else:
-            self.Nparam = self.D + 1
+            self.num_params = self.input_dim + 1
            self.name = 'exp'
            if lengthscale is not None:
                lengthscale = np.asarray(lengthscale)
-                assert lengthscale.size == self.D, "bad number of lengthscales"
+                assert lengthscale.size == self.input_dim, "bad number of lengthscales"
            else:
-                lengthscale = np.ones(self.D)
-        self._set_params(np.hstack((variance,lengthscale.flatten())))
+                lengthscale = np.ones(self.input_dim)
+        self._set_params(np.hstack((variance, lengthscale.flatten())))

    def _get_params(self):
        """return the value of the parameters."""
-        return np.hstack((self.variance,self.lengthscale))
+        return np.hstack((self.variance, self.lengthscale))

-    def _set_params(self,x):
+    def _set_params(self, x):
        """set the value of the parameters."""
-        assert x.size == self.Nparam
+        assert x.size == self.num_params
        self.variance = x[0]
        self.lengthscale = x[1:]

    def _get_param_names(self):
        """return parameter names."""
-        if self.Nparam == 2:
-            return ['variance','lengthscale']
+        if self.num_params == 2:
+            return ['variance', 'lengthscale']
        else:
-            return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscale.size)]
+            return ['variance'] + ['lengthscale_%i' % i for i in range(self.lengthscale.size)]

-    def K(self,X,X2,target):
+    def K(self, X, X2, target):
        """Compute the covariance matrix between X and X2."""
        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
-        np.add(self.variance*np.exp(-dist), target,target)
+        dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))
+        np.add(self.variance * np.exp(-dist), target, target)

-    def Kdiag(self,X,target):
+    def Kdiag(self, X, target):
        """Compute the diagonal of the covariance matrix associated to X."""
-        np.add(target,self.variance,target)
+        np.add(target, self.variance, target)

-    def dK_dtheta(self,dL_dK,X,X2,target):
+    def dK_dtheta(self, dL_dK, X, X2, target):
        """derivative of the covariance matrix with respect to the parameters."""
        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
-        invdist = 1./np.where(dist!=0.,dist,np.inf)
-        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscale**3
+        dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))
+        invdist = 1. / np.where(dist != 0., dist, np.inf)
+        dist2M = np.square(X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 3
        dvar = np.exp(-dist)
-        target[0] += np.sum(dvar*dL_dK)
+        target[0] += np.sum(dvar * dL_dK)
        if self.ARD == True:
-            dl = self.variance*dvar[:,:,None]*dist2M*invdist[:,:,None]
-            target[1:] += (dl*dL_dK[:,:,None]).sum(0).sum(0)
+            dl = self.variance * dvar[:, :, None] * dist2M * invdist[:, :, None]
+            target[1:] += (dl * dL_dK[:, :, None]).sum(0).sum(0)
        else:
-            dl = self.variance*dvar*dist2M.sum(-1)*invdist
-            target[1] += np.sum(dl*dL_dK)
+            dl = self.variance * dvar * dist2M.sum(-1) * invdist
+            target[1] += np.sum(dl * dL_dK)

-    def dKdiag_dtheta(self,dL_dKdiag,X,target):
+    def dKdiag_dtheta(self, dL_dKdiag, X, target):
        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
-        #NB: derivative of diagonal elements wrt lengthscale is 0
+        # NB: derivative of diagonal elements wrt lengthscale is 0
        target[0] += np.sum(dL_dKdiag)

-    def dK_dX(self,dL_dK,X,X2,target):
+    def dK_dX(self, dL_dK, X, X2, target):
        """derivative of the covariance matrix with respect to X."""
        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None]
-        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
-        dK_dX = - np.transpose(self.variance*np.exp(-dist)*ddist_dX,(1,0,2))
-        target += np.sum(dK_dX*dL_dK.T[:,:,None],0)
+        dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None]
+        ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
+        dK_dX = -np.transpose(self.variance * np.exp(-dist) * ddist_dX, (1, 0, 2))
+        target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)

-    def dKdiag_dX(self,dL_dKdiag,X,target):
+    def dKdiag_dX(self, dL_dKdiag, X, target):
        pass

-    def Gram_matrix(self,F,F1,lower,upper):
+    def Gram_matrix(self, F, F1, lower, upper):
        """
-        Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to D=1.
+        Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to input_dim=1.

        :param F: vector of functions
        :type F: np.array
@ -116,13 +115,13 @@ class exponential(kernpart):
        :param lower,upper: boundaries of the input domain
        :type lower,upper: floats
        """
-        assert self.D == 1
-        def L(x,i):
-            return(1./self.lengthscale*F[i](x) + F1[i](x))
+        assert self.input_dim == 1
+        def L(x, i):
+            return(1. / self.lengthscale * F[i](x) + F1[i](x))
        n = F.shape[0]
-        G = np.zeros((n,n))
+        G = np.zeros((n, n))
        for i in range(n):
-            for j in range(i,n):
-                G[i,j] = G[j,i] = integrate.quad(lambda x : L(x,i)*L(x,j),lower,upper)[0]
-        Flower = np.array([f(lower) for f in F])[:,None]
-        return(self.lengthscale/2./self.variance * G + 1./self.variance * np.dot(Flower,Flower.T))
+            for j in range(i, n):
+                G[i, j] = G[j, i] = integrate.quad(lambda x : L(x, i) * L(x, j), lower, upper)[0]
+        Flower = np.array([f(lower) for f in F])[:, None]
+        return(self.lengthscale / 2. / self.variance * G + 1. / self.variance * np.dot(Flower, Flower.T))
--- a/GPy/kern/finite_dimensional.py
+++ b/GPy/kern/finite_dimensional.py
@ -2,21 +2,21 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np
 from ..util.linalg import pdinv,mdot

-class finite_dimensional(kernpart):
-    def __init__(self, D, F, G, variance=1., weights=None):
+class finite_dimensional(Kernpart):
+    def __init__(self, input_dim, F, G, variance=1., weights=None):
        """
        Argumnents
        ----------
-        D: int - the number of input dimensions
+        input_dim: int - the number of input dimensions
        F: np.array of functions with shape (n,) - the n basis functions
        G: np.array with shape (n,n) - the Gram matrix associated to F
        weights : np.ndarray with shape (n,)
        """
-        self.D = D
+        self.input_dim = input_dim
        self.F = F
        self.G = G
        self.G_1 ,L,Li,logdet = pdinv(G)
@ -25,14 +25,14 @@ class finite_dimensional(kernpart):
            assert weights.shape==(self.n,)
        else:
            weights = np.ones(self.n)
-        self.Nparam = self.n + 1
+        self.num_params = self.n + 1
        self.name = 'finite_dim'
        self._set_params(np.hstack((variance,weights)))

    def _get_params(self):
        return np.hstack((self.variance,self.weights))
    def _set_params(self,x):
-        assert x.size == (self.Nparam)
+        assert x.size == (self.num_params)
        self.variance = x[0]
        self.weights = x[1:]
    def _get_param_names(self):
@ -48,7 +48,7 @@ class finite_dimensional(kernpart):
        product = self.variance * mdot(FX,np.diag(np.sqrt(self.weights)),self.G_1,np.diag(np.sqrt(self.weights)),FX2.T)
        np.add(product,target,target)
    def Kdiag(self,X,target):
-        product = np.diag(self.K(X,X2))
+        product = np.diag(self.K(X, X))
        np.add(target,product,target)
    def dK_dtheta(self,X,X2,target):
        """Return shape is NxMx(Ntheta)"""
--- a/GPy/kern/fixed.py
+++ b/GPy/kern/fixed.py
@ -1,42 +1,41 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np
-import hashlib

-class fixed(kernpart):
-    def __init__(self,D,K,variance=1.):
+class Fixed(Kernpart):
+    def __init__(self, input_dim, K, variance=1.):
        """
-        :param D: the number of input dimensions
-        :type D: int
+        :param input_dim: the number of input dimensions
+        :type input_dim: int
        :param variance: the variance of the kernel
        :type variance: float
        """
-        self.D = D
+        self.input_dim = input_dim
        self.fixed_K = K
-        self.Nparam = 1
-        self.name = 'fixed'
+        self.num_params = 1
+        self.name = 'Fixed'
        self._set_params(np.array([variance]).flatten())

    def _get_params(self):
        return self.variance

-    def _set_params(self,x):
-        assert x.shape==(1,)
+    def _set_params(self, x):
+        assert x.shape == (1,)
        self.variance = x

    def _get_param_names(self):
        return ['variance']

-    def K(self,X,X2,target):
+    def K(self, X, X2, target):
        target += self.variance * self.fixed_K

-    def dK_dtheta(self,partial,X,X2,target):
+    def dK_dtheta(self, partial, X, X2, target):
        target += (partial * self.fixed_K).sum()

-    def dK_dX(self, partial,X, X2, target):
+    def dK_dX(self, partial, X, X2, target):
        pass

-    def dKdiag_dX(self,partial,X,target):
+    def dKdiag_dX(self, partial, X, target):
        pass
--- a/GPy/kern/independent_outputs.py
+++ b/GPy/kern/independent_outputs.py
@ -2,7 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np

 def index_to_slices(index):
@ -31,7 +31,7 @@ def index_to_slices(index):
    [ret[ind_i].append(slice(*indexes_i)) for ind_i,indexes_i in zip(ind[switchpoints[:-1]],zip(switchpoints,switchpoints[1:]))]
    return ret

-class independent_outputs(kernpart):
+class IndependentOutputs(Kernpart):
    """
    A kernel part shich can reopresent several independent functions.
    this kernel 'switches off' parts of the matrix where the output indexes are different.
@ -41,8 +41,8 @@ class independent_outputs(kernpart):

    """
    def __init__(self,k):
-        self.D = k.D + 1
-        self.Nparam = k.Nparam
+        self.input_dim = k.input_dim + 1
+        self.num_params = k.num_params
        self.name = 'iops('+ k.name + ')'
        self.k = k

--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@ -4,32 +4,31 @@

 import numpy as np
 import pylab as pb
-from ..core.parameterised import parameterised
-from kernpart import kernpart
+from ..core.parameterised import Parameterised
+from kernpart import Kernpart
 import itertools
 from prod import prod
-from ..util.linalg import symmetrify

-class kern(parameterised):
-    def __init__(self, D, parts=[], input_slices=None):
+class kern(Parameterised):
+    def __init__(self, input_dim, parts=[], input_slices=None):
        """
        This is the main kernel class for GPy. It handles multiple (additive) kernel functions, and keeps track of variaous things like which parameters live where.

        The technical code for kernels is divided into _parts_ (see e.g. rbf.py). This obnject contains a list of parts, which are computed additively. For multiplication, special _prod_ parts are used.

-        :param D: The dimensioality of the kernel's input space
-        :type D: int
+        :param input_dim: The dimensionality of the kernel's input space
+        :type input_dim: int
        :param parts: the 'parts' (PD functions) of the kernel
-        :type parts: list of kernpart objects
+        :type parts: list of Kernpart objects
        :param input_slices: the slices on the inputs which apply to each kernel
        :type input_slices: list of slice objects, or list of bools

        """
        self.parts = parts
        self.Nparts = len(parts)
-        self.Nparam = sum([p.Nparam for p in self.parts])
+        self.num_params = sum([p.num_params for p in self.parts])

-        self.D = D
+        self.input_dim = input_dim

        # deal with input_slices
        if input_slices is None:
@ -39,11 +38,11 @@ class kern(parameterised):
            self.input_slices = [sl if type(sl) is slice else slice(None) for sl in input_slices]

        for p in self.parts:
-            assert isinstance(p, kernpart), "bad kernel part"
+            assert isinstance(p, Kernpart), "bad kernel part"

        self.compute_param_slices()

-        parameterised.__init__(self)
+        Parameterised.__init__(self)


    def plot_ARD(self, fignum=None, ax=None):
@ -80,8 +79,8 @@ class kern(parameterised):
        self.param_slices = []
        count = 0
        for p in self.parts:
-            self.param_slices.append(slice(count, count + p.Nparam))
-            count += p.Nparam
+            self.param_slices.append(slice(count, count + p.num_params))
+            count += p.num_params

    def __add__(self, other):
        """
@ -96,29 +95,29 @@ class kern(parameterised):
        :type other: GPy.kern
        """
        if tensor:
-            D = self.D + other.D
-            self_input_slices = [slice(*sl.indices(self.D)) for sl in self.input_slices]
-            other_input_indices = [sl.indices(other.D) for sl in other.input_slices]
-            other_input_slices = [slice(i[0] + self.D, i[1] + self.D, i[2]) for i in other_input_indices]
+            D = self.input_dim + other.input_dim
+            self_input_slices = [slice(*sl.indices(self.input_dim)) for sl in self.input_slices]
+            other_input_indices = [sl.indices(other.input_dim) for sl in other.input_slices]
+            other_input_slices = [slice(i[0] + self.input_dim, i[1] + self.input_dim, i[2]) for i in other_input_indices]

            newkern = kern(D, self.parts + other.parts, self_input_slices + other_input_slices)

            # transfer constraints:
-            newkern.constrained_indices = self.constrained_indices + [x + self.Nparam for x in other.constrained_indices]
+            newkern.constrained_indices = self.constrained_indices + [x + self.num_params for x in other.constrained_indices]
            newkern.constraints = self.constraints + other.constraints
-            newkern.fixed_indices = self.fixed_indices + [self.Nparam + x for x in other.fixed_indices]
+            newkern.fixed_indices = self.fixed_indices + [self.num_params + x for x in other.fixed_indices]
            newkern.fixed_values = self.fixed_values + other.fixed_values
            newkern.constraints = self.constraints + other.constraints
-            newkern.tied_indices = self.tied_indices + [self.Nparam + x for x in other.tied_indices]
+            newkern.tied_indices = self.tied_indices + [self.num_params + x for x in other.tied_indices]
        else:
-            assert self.D == other.D
-            newkern = kern(self.D, self.parts + other.parts, self.input_slices + other.input_slices)
+            assert self.input_dim == other.input_dim
+            newkern = kern(self.input_dim, self.parts + other.parts, self.input_slices + other.input_slices)
            # transfer constraints:
-            newkern.constrained_indices = self.constrained_indices + [i + self.Nparam  for i in other.constrained_indices]
+            newkern.constrained_indices = self.constrained_indices + [i + self.num_params  for i in other.constrained_indices]
            newkern.constraints = self.constraints + other.constraints
-            newkern.fixed_indices = self.fixed_indices + [self.Nparam + x for x in other.fixed_indices]
+            newkern.fixed_indices = self.fixed_indices + [self.num_params + x for x in other.fixed_indices]
            newkern.fixed_values = self.fixed_values + other.fixed_values
-            newkern.tied_indices = self.tied_indices + [self.Nparam + x for x in other.tied_indices]
+            newkern.tied_indices = self.tied_indices + [self.num_params + x for x in other.tied_indices]
        return newkern

    def __mul__(self, other):
@ -138,16 +137,16 @@ class kern(parameterised):

        slices = []
        for sl1, sl2 in itertools.product(K1.input_slices, K2.input_slices):
-            s1, s2 = [False] * K1.D, [False] * K2.D
+            s1, s2 = [False] * K1.input_dim, [False] * K2.input_dim
            s1[sl1], s2[sl2] = [True], [True]
            slices += [s1 + s2]

        newkernparts = [prod(k1, k2, tensor) for k1, k2 in itertools.product(K1.parts, K2.parts)]

        if tensor:
-            newkern = kern(K1.D + K2.D, newkernparts, slices)
+            newkern = kern(K1.input_dim + K2.input_dim, newkernparts, slices)
        else:
-            newkern = kern(K1.D, newkernparts, slices)
+            newkern = kern(K1.input_dim, newkernparts, slices)

        newkern._follow_constrains(K1, K2)
        return newkern
@ -158,13 +157,13 @@ class kern(parameterised):
        K1_param = []
        n = 0
        for k1 in K1.parts:
-            K1_param += [range(n, n + k1.Nparam)]
-            n += k1.Nparam
+            K1_param += [range(n, n + k1.num_params)]
+            n += k1.num_params
        n = 0
        K2_param = []
        for k2 in K2.parts:
-            K2_param += [range(K1.Nparam + n, K1.Nparam + n + k2.Nparam)]
-            n += k2.Nparam
+            K2_param += [range(K1.num_params + n, K1.num_params + n + k2.num_params)]
+            n += k2.num_params
        index_param = []
        for p1 in K1_param:
            for p2 in K2_param:
@ -172,12 +171,12 @@ class kern(parameterised):
        index_param = np.array(index_param)

        # Get the ties and constrains of the kernels before the multiplication
-        prev_ties = K1.tied_indices + [arr + K1.Nparam for arr in K2.tied_indices]
+        prev_ties = K1.tied_indices + [arr + K1.num_params for arr in K2.tied_indices]

-        prev_constr_ind = [K1.constrained_indices] + [K1.Nparam + i for i in K2.constrained_indices]
+        prev_constr_ind = [K1.constrained_indices] + [K1.num_params + i for i in K2.constrained_indices]
        prev_constr = K1.constraints + K2.constraints

-        # prev_constr_fix = K1.fixed_indices + [arr + K1.Nparam for arr in K2.fixed_indices]
+        # prev_constr_fix = K1.fixed_indices + [arr + K1.num_params for arr in K2.fixed_indices]
        # prev_constr_fix_values = K1.fixed_values + K2.fixed_values

        # follow the previous ties
@ -186,7 +185,7 @@ class kern(parameterised):
                index_param[np.where(index_param == j)[0]] = arr[0]

        # ties and constrains
-        for i in range(K1.Nparam + K2.Nparam):
+        for i in range(K1.num_params + K2.num_params):
            index = np.where(index_param == i)[0]
            if index.size > 1:
                self.tie_params(index)
@ -211,7 +210,7 @@ class kern(parameterised):
    def K(self, X, X2=None, which_parts='all'):
        if which_parts == 'all':
            which_parts = [True] * self.Nparts
-        assert X.shape[1] == self.D
+        assert X.shape[1] == self.input_dim
        if X2 is None:
            target = np.zeros((X.shape[0], X.shape[0]))
            [p.K(X[:, i_s], None, target=target) for p, i_s, part_i_used in zip(self.parts, self.input_slices, which_parts) if part_i_used]
@ -223,14 +222,14 @@ class kern(parameterised):
    def dK_dtheta(self, dL_dK, X, X2=None):
        """
        :param dL_dK: An array of dL_dK derivaties, dL_dK
-        :type dL_dK: Np.ndarray (N x M)
+        :type dL_dK: np.ndarray (N x num_inducing)
        :param X: Observed data inputs
-        :type X: np.ndarray (N x D)
+        :type X: np.ndarray (N x input_dim)
        :param X2: Observed dara inputs (optional, defaults to X)
-        :type X2: np.ndarray (M x D)
+        :type X2: np.ndarray (num_inducing x input_dim)
        """
-        assert X.shape[1] == self.D
-        target = np.zeros(self.Nparam)
+        assert X.shape[1] == self.input_dim
+        target = np.zeros(self.num_params)
        if X2 is None:
            [p.dK_dtheta(dL_dK, X[:, i_s], None, target[ps]) for p, i_s, ps, in zip(self.parts, self.input_slices, self.param_slices)]
        else:
@ -251,20 +250,20 @@ class kern(parameterised):
    def Kdiag(self, X, which_parts='all'):
        if which_parts == 'all':
            which_parts = [True] * self.Nparts
-        assert X.shape[1] == self.D
+        assert X.shape[1] == self.input_dim
        target = np.zeros(X.shape[0])
        [p.Kdiag(X[:, i_s], target=target) for p, i_s, part_on in zip(self.parts, self.input_slices, which_parts) if part_on]
        return target

    def dKdiag_dtheta(self, dL_dKdiag, X):
-        assert X.shape[1] == self.D
+        assert X.shape[1] == self.input_dim
        assert dL_dKdiag.size == X.shape[0]
-        target = np.zeros(self.Nparam)
+        target = np.zeros(self.num_params)
        [p.dKdiag_dtheta(dL_dKdiag, X[:, i_s], target[ps]) for p, i_s, ps in zip(self.parts, self.input_slices, self.param_slices)]
        return self._transform_gradients(target)

    def dKdiag_dX(self, dL_dKdiag, X):
-        assert X.shape[1] == self.D
+        assert X.shape[1] == self.input_dim
        target = np.zeros_like(X)
        [p.dKdiag_dX(dL_dKdiag, X[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
        return target
@ -275,7 +274,7 @@ class kern(parameterised):
        return target

    def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S):
-        target = np.zeros(self.Nparam)
+        target = np.zeros(self.num_params)
        [p.dpsi0_dtheta(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self.parts, self.param_slices, self.input_slices)]
        return self._transform_gradients(target)

@ -290,7 +289,7 @@ class kern(parameterised):
        return target

    def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S):
-        target = np.zeros((self.Nparam))
+        target = np.zeros((self.num_params))
        [p.dpsi1_dtheta(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self.parts, self.param_slices, self.input_slices)]
        return self._transform_gradients(target)

@ -300,16 +299,16 @@ class kern(parameterised):
        return target

    def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S):
-        """return shapes are N,M,input_dim"""
+        """return shapes are N,num_inducing,input_dim"""
        target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1]))
        [p.dpsi1_dmuS(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
        return target_mu, target_S

    def psi2(self, Z, mu, S):
        """
-        :param Z: np.ndarray of inducing inputs (M x input_dim)
+        :param Z: np.ndarray of inducing inputs (num_inducing x input_dim)
        :param mu, S: np.ndarrays of means and variances (each N x input_dim)
-        :returns psi2: np.ndarray (N,M,M)
+        :returns psi2: np.ndarray (N,num_inducing,num_inducing)
        """
        target = np.zeros((mu.shape[0], Z.shape[0], Z.shape[0]))
        [p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)]
@ -327,13 +326,13 @@ class kern(parameterised):
            p2.psi1(Z, mu, S, tmp2)

            prod = np.multiply(tmp1, tmp2)
-            crossterms += prod[:,:,None] + prod[:, None, :]
-            
+            crossterms += prod[:, :, None] + prod[:, None, :]
+
        target += crossterms
        return target

    def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S):
-        target = np.zeros(self.Nparam)
+        target = np.zeros(self.num_params)
        [p.dpsi2_dtheta(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, i_s, ps in zip(self.parts, self.input_slices, self.param_slices)]

        # compute the "cross" terms
@ -345,14 +344,14 @@ class kern(parameterised):

            tmp = np.zeros((mu.shape[0], Z.shape[0]))
            p1.psi1(Z, mu, S, tmp)
-            p2.dpsi1_dtheta((tmp[:,None,:]*dL_dpsi2).sum(1)*2., Z, mu, S, target[ps2])
+            p2.dpsi1_dtheta((tmp[:, None, :] * dL_dpsi2).sum(1) * 2., Z, mu, S, target[ps2])

        return self._transform_gradients(target)

    def dpsi2_dZ(self, dL_dpsi2, Z, mu, S):
        target = np.zeros_like(Z)
        [p.dpsi2_dZ(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
-        #target *= 2
+        # target *= 2

        # compute the "cross" terms
        # TODO: we need input_slices here.
@ -362,7 +361,7 @@ class kern(parameterised):
            tmp = np.zeros((mu.shape[0], Z.shape[0]))
            p1.psi1(Z, mu, S, tmp)
            tmp2 = np.zeros_like(target)
-            p2.dpsi1_dZ((tmp[:,None,:]*dL_dpsi2).sum(1).T, Z, mu, S, tmp2)
+            p2.dpsi1_dZ((tmp[:, None, :] * dL_dpsi2).sum(1).T, Z, mu, S, tmp2)
            target += tmp2

        return target * 2
@ -379,14 +378,14 @@ class kern(parameterised):

            tmp = np.zeros((mu.shape[0], Z.shape[0]))
            p1.psi1(Z, mu, S, tmp)
-            p2.dpsi1_dmuS((tmp[:,None,:]*dL_dpsi2).sum(1).T*2., Z, mu, S, target_mu, target_S)
+            p2.dpsi1_dmuS((tmp[:, None, :] * dL_dpsi2).sum(1).T * 2., Z, mu, S, target_mu, target_S)

        return target_mu, target_S

    def plot(self, x=None, plot_limits=None, which_parts='all', resolution=None, *args, **kwargs):
        if which_parts == 'all':
            which_parts = [True] * self.Nparts
-        if self.D == 1:
+        if self.input_dim == 1:
            if x is None:
                x = np.zeros((1, 1))
            else:
@ -408,7 +407,7 @@ class kern(parameterised):
            pb.xlabel("x")
            pb.ylabel("k(x,%0.1f)" % x)

-        elif self.D == 2:
+        elif self.input_dim == 2:
            if x is None:
                x = np.zeros((1, 2))
            else:
@ -430,7 +429,7 @@ class kern(parameterised):
            Xnew = np.vstack((xx.flatten(), yy.flatten())).T
            Kx = self.K(Xnew, x, which_parts)
            Kx = Kx.reshape(resolution, resolution).T
-            pb.contour(xg, yg, Kx, vmin=Kx.min(), vmax=Kx.max(), cmap=pb.cm.jet, *args, **kwargs)
+            pb.contour(xg, yg, Kx, vmin=Kx.min(), vmax=Kx.max(), cmap=pb.cm.jet, *args, **kwargs) # @UndefinedVariable
            pb.xlim(xmin[0], xmax[0])
            pb.ylim(xmin[1], xmax[1])
            pb.xlabel("x1")
--- a/GPy/kern/kernpart.py
+++ b/GPy/kern/kernpart.py
@ -2,18 +2,18 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-class kernpart(object):
-    def __init__(self,D):
+class Kernpart(object):
+    def __init__(self,input_dim):
        """
        The base class for a kernpart: a positive definite function which forms part of a kernel

-        :param D: the number of input dimensions to the function
-        :type D: int
+        :param input_dim: the number of input dimensions to the function
+        :type input_dim: int

        Do not instantiate.
        """
-        self.D = D
-        self.Nparam = 1
+        self.input_dim = input_dim
+        self.num_params = 1
        self.name = 'unnamed'

    def _get_params(self):
--- a/GPy/kern/linear.py
+++ b/GPy/kern/linear.py
@ -2,21 +2,21 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np
 from ..util.linalg import tdot
 from scipy import weave

-class linear(kernpart):
+class linear(Kernpart):
    """
    Linear kernel

    .. math::

-       k(x,y) = \sum_{i=1}^D \sigma^2_i x_iy_i
+       k(x,y) = \sum_{i=1}^{\mathrm{input\_dim}} \sigma^2_i x_iy_i

-    :param D: the number of input dimensions
-    :type D: int
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
    :param variances: the vector of variances :math:`\sigma^2_i`
    :type variances: array or list of the appropriate size (or float if there is only one variance parameter)
    :param ARD: Auto Relevance Determination. If equal to "False", the kernel has only one variance parameter \sigma^2, otherwise there is one variance parameter per dimension.
@ -24,11 +24,11 @@ class linear(kernpart):
    :rtype: kernel object
    """

-    def __init__(self, D, variances=None, ARD=False):
-        self.D = D
+    def __init__(self, input_dim, variances=None, ARD=False):
+        self.input_dim = input_dim
        self.ARD = ARD
        if ARD == False:
-            self.Nparam = 1
+            self.num_params = 1
            self.name = 'linear'
            if variances is not None:
                variances = np.asarray(variances)
@ -37,13 +37,13 @@ class linear(kernpart):
                variances = np.ones(1)
            self._Xcache, self._X2cache = np.empty(shape=(2,))
        else:
-            self.Nparam = self.D
+            self.num_params = self.input_dim
            self.name = 'linear'
            if variances is not None:
                variances = np.asarray(variances)
-                assert variances.size == self.D, "bad number of lengthscales"
+                assert variances.size == self.input_dim, "bad number of lengthscales"
            else:
-                variances = np.ones(self.D)
+                variances = np.ones(self.input_dim)
        self._set_params(variances.flatten())

        # initialize cache
@ -54,12 +54,12 @@ class linear(kernpart):
        return self.variances

    def _set_params(self, x):
-        assert x.size == (self.Nparam)
+        assert x.size == (self.num_params)
        self.variances = x
        self.variances2 = np.square(self.variances)

    def _get_param_names(self):
-        if self.Nparam == 1:
+        if self.num_params == 1:
            return ['variance']
        else:
            return ['variance_%i' % i for i in range(self.variances.size)]
@ -82,7 +82,7 @@ class linear(kernpart):
    def dK_dtheta(self, dL_dK, X, X2, target):
        if self.ARD:
            if X2 is None:
-                [np.add(target[i:i + 1], np.sum(dL_dK * tdot(X[:, i:i + 1])), target[i:i + 1]) for i in range(self.D)]
+                [np.add(target[i:i + 1], np.sum(dL_dK * tdot(X[:, i:i + 1])), target[i:i + 1]) for i in range(self.input_dim)]
            else:
                product = X[:, None, :] * X2[None, :, :]
                target += (dL_dK[:, :, None] * product).sum(0).sum(0)
@ -138,7 +138,7 @@ class linear(kernpart):

    def psi2(self, Z, mu, S, target):
        """
-        returns N,M,M matrix
+        returns an (N, num_inducing, num_inducing) array
        """
        self._psi_computations(Z, mu, S)
 #         psi2_old = self.ZZ * np.square(self.variances) * self.mu2_S[:, None, None, :]
@ -153,7 +153,7 @@ class linear(kernpart):
 #                     psi2_real[n, m, m_prime] = np.dot(tmp, (
 #                             self._Z[m_prime:m_prime + 1] * self.variances).T)
 #         mu2_S = (self._mu[:, None, :] * self._mu[:, :, None])
-#         mu2_S[:, np.arange(self.D), np.arange(self.D)] += self._S
+#         mu2_S[:, np.arange(self.input_dim), np.arange(self.input_dim)] += self._S
 #         psi2 = (self.ZA[None, :, None, :] * mu2_S[:, None]).sum(-1)
 #         psi2 = (psi2[:, :, None] * self.ZA[None, None]).sum(-1)
 #         psi2_tensor = np.tensordot(self.ZZ[None, :, :, :] * np.square(self.variances), self.mu2_S[:, None, None, :], ((3), (3))).squeeze().T
@ -168,7 +168,7 @@ class linear(kernpart):
            target += tmp.sum()

    def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
-        """Think N,M,M,input_dim """
+        """Think N,num_inducing,num_inducing,input_dim """
        self._psi_computations(Z, mu, S)
        AZZA = self.ZA.T[:, None, :, None] * self.ZA[None, :, None, :]
        AZZA = AZZA + AZZA.swapaxes(1, 2)
@ -184,7 +184,7 @@ class linear(kernpart):
        double factor,tmp;
        #pragma omp parallel for private(m,mm,q,qq,factor,tmp)
        for(n=0;n<N;n++){
-          for(m=0;m<M;m++){
+          for(m=0;m<num_inducing;m++){
            for(mm=0;mm<=m;mm++){
              //add in a factor of 2 for the off-diagonal terms (and then count them only once)
              if(m==mm)
@ -215,9 +215,9 @@ class linear(kernpart):
                         'extra_compile_args': ['-fopenmp -O3'],  #-march=native'],
                         'extra_link_args'   : ['-lgomp']}

-        N,M,input_dim = mu.shape[0],Z.shape[0],mu.shape[1]
+        N,num_inducing,input_dim = mu.shape[0],Z.shape[0],mu.shape[1]
        weave.inline(code, support_code=support_code, libraries=['gomp'],
-                     arg_names=['N','M','input_dim','mu','AZZA','AZZA_2','target_mu','target_S','dL_dpsi2'],
+                     arg_names=['N','num_inducing','input_dim','mu','AZZA','AZZA_2','target_mu','target_S','dL_dpsi2'],
                     type_converters=weave.converters.blitz,**weave_options)


@ -231,9 +231,9 @@ class linear(kernpart):
        code="""
        int n,m,mm,q;
        #pragma omp parallel for private(n,mm,q)
-        for(m=0;m<M;m++){
+        for(m=0;m<num_inducing;m++){
          for(q=0;q<input_dim;q++){
-            for(mm=0;mm<M;mm++){
+            for(mm=0;mm<num_inducing;mm++){
              for(n=0;n<N;n++){
                target(m,q) += dL_dpsi2(n,m,mm)*AZA(n,mm,q);
              }
@ -249,9 +249,9 @@ class linear(kernpart):
                         'extra_compile_args': ['-fopenmp -O3'],  #-march=native'],
                         'extra_link_args'   : ['-lgomp']}

-        N,M,input_dim = mu.shape[0],Z.shape[0],mu.shape[1]
+        N,num_inducing,input_dim = mu.shape[0],Z.shape[0],mu.shape[1]
        weave.inline(code, support_code=support_code, libraries=['gomp'],
-                     arg_names=['N','M','input_dim','AZA','target','dL_dpsi2'],
+                     arg_names=['N','num_inducing','input_dim','AZA','target','dL_dpsi2'],
                     type_converters=weave.converters.blitz,**weave_options)


@ -278,7 +278,7 @@ class linear(kernpart):
        muS_changed = not (np.array_equal(mu, self._mu) and np.array_equal(S, self._S))
        if Zv_changed:
            # Z has changed, compute Z specific stuff
-            # self.ZZ = Z[:,None,:]*Z[None,:,:] # M,M,input_dim
+            # self.ZZ = Z[:,None,:]*Z[None,:,:] # num_inducing,num_inducing,input_dim
 #             self.ZZ = np.empty((Z.shape[0], Z.shape[0], Z.shape[1]), order='F')
 #             [tdot(Z[:, i:i + 1], self.ZZ[:, :, i].T) for i in xrange(Z.shape[1])]
            self.ZA = Z * self.variances
@ -291,5 +291,5 @@ class linear(kernpart):
            self.inner[:, diag_indices[0], diag_indices[1]] += S
            self._mu, self._S = mu.copy(), S.copy()
        if Zv_changed or muS_changed:
-            self.ZAinner = np.dot(self.ZA, self.inner).swapaxes(0, 1)  # NOTE: self.ZAinner \in [M x N x input_dim]!
+            self.ZAinner = np.dot(self.ZA, self.inner).swapaxes(0, 1)  # NOTE: self.ZAinner \in [num_inducing x N x input_dim]!
            self._psi2 = np.dot(self.ZAinner, self.ZA.T)
--- a/GPy/kern/periodic_Matern32.py
+++ b/GPy/kern/periodic_Matern32.py
@ -2,21 +2,21 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np
-from GPy.util.linalg import mdot, pdinv
+from GPy.util.linalg import mdot
 from GPy.util.decorators import silence_errors

-class periodic_Matern32(kernpart):
+class periodic_Matern32(Kernpart):
    """
-    Kernel of the periodic subspace (up to a given frequency) of a Matern 3/2 RKHS. Only defined for D=1.
+    Kernel of the periodic subspace (up to a given frequency) of a Matern 3/2 RKHS. Only defined for input_dim=1.

-    :param D: the number of input dimensions
-    :type D: int
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
    :param variance: the variance of the Matern kernel
    :type variance: float
    :param lengthscale: the lengthscale of the Matern kernel
-    :type lengthscale: np.ndarray of size (D,)
+    :type lengthscale: np.ndarray of size (input_dim,)
    :param period: the period
    :type period: float
    :param n_freq: the number of frequencies considered for the periodic subspace
@ -25,17 +25,17 @@ class periodic_Matern32(kernpart):

    """

-    def __init__(self,D=1,variance=1.,lengthscale=None,period=2*np.pi,n_freq=10,lower=0.,upper=4*np.pi):
-        assert D==1, "Periodic kernels are only defined for D=1"
+    def __init__(self, input_dim=1, variance=1., lengthscale=None, period=2 * np.pi, n_freq=10, lower=0., upper=4 * np.pi):
+        assert input_dim==1, "Periodic kernels are only defined for input_dim=1"
        self.name = 'periodic_Mat32'
-        self.D = D
+        self.input_dim = input_dim
        if lengthscale is not None:
            lengthscale = np.asarray(lengthscale)
            assert lengthscale.size == 1, "Wrong size: only one lengthscale needed"
        else:
            lengthscale = np.ones(1)
        self.lower,self.upper = lower, upper
-        self.Nparam = 3
+        self.num_params = 3
        self.n_freq = n_freq
        self.n_basis = 2*n_freq
        self._set_params(np.hstack((variance,lengthscale,period)))
@ -64,7 +64,7 @@ class periodic_Matern32(kernpart):
    def _get_params(self):
        """return the value of the parameters."""
        return np.hstack((self.variance,self.lengthscale,self.period))
-    
+
    def _set_params(self,x):
        """set the value of the parameters."""
        assert x.size==3
@ -113,7 +113,7 @@ class periodic_Matern32(kernpart):

    @silence_errors
    def dK_dtheta(self,dL_dK,X,X2,target):
-        """derivative of the covariance matrix with respect to the parameters (shape is NxMxNparam)"""
+        """derivative of the covariance matrix with respect to the parameters (shape is N x num_inducing x num_params)"""
        if X2 is None: X2 = X
        FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
        FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
--- a/GPy/kern/periodic_Matern52.py
+++ b/GPy/kern/periodic_Matern52.py
@ -2,21 +2,21 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np
-from GPy.util.linalg import mdot, pdinv
+from GPy.util.linalg import mdot
 from GPy.util.decorators import silence_errors

-class periodic_Matern52(kernpart):
+class periodic_Matern52(Kernpart):
    """
-    Kernel of the periodic subspace (up to a given frequency) of a Matern 5/2 RKHS. Only defined for D=1.
+    Kernel of the periodic subspace (up to a given frequency) of a Matern 5/2 RKHS. Only defined for input_dim=1.

-    :param D: the number of input dimensions
-    :type D: int
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
    :param variance: the variance of the Matern kernel
    :type variance: float
    :param lengthscale: the lengthscale of the Matern kernel
-    :type lengthscale: np.ndarray of size (D,)
+    :type lengthscale: np.ndarray of size (input_dim,)
    :param period: the period
    :type period: float
    :param n_freq: the number of frequencies considered for the periodic subspace
@ -25,17 +25,17 @@ class periodic_Matern52(kernpart):

    """

-    def __init__(self,D=1,variance=1.,lengthscale=None,period=2*np.pi,n_freq=10,lower=0.,upper=4*np.pi):
-        assert D==1, "Periodic kernels are only defined for D=1"
+    def __init__(self,input_dim=1,variance=1.,lengthscale=None,period=2*np.pi,n_freq=10,lower=0.,upper=4*np.pi):
+        assert input_dim==1, "Periodic kernels are only defined for input_dim=1"
        self.name = 'periodic_Mat52'
-        self.D = D
+        self.input_dim = input_dim
        if lengthscale is not None:
            lengthscale = np.asarray(lengthscale)
            assert lengthscale.size == 1, "Wrong size: only one lengthscale needed"
        else:
            lengthscale = np.ones(1)
        self.lower,self.upper = lower, upper
-        self.Nparam = 3
+        self.num_params = 3
        self.n_freq = n_freq
        self.n_basis = 2*n_freq
        self._set_params(np.hstack((variance,lengthscale,period)))
@ -64,7 +64,7 @@ class periodic_Matern52(kernpart):
    def _get_params(self):
        """return the value of the parameters."""
        return np.hstack((self.variance,self.lengthscale,self.period))
-    
+
    def _set_params(self,x):
        """set the value of the parameters."""
        assert x.size==3
@ -115,7 +115,7 @@ class periodic_Matern52(kernpart):

    @silence_errors
    def dK_dtheta(self,dL_dK,X,X2,target):
-        """derivative of the covariance matrix with respect to the parameters (shape is NxMxNparam)"""
+        """derivative of the covariance matrix with respect to the parameters (shape is N x num_inducing x num_params)"""
        if X2 is None: X2 = X
        FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
        FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
@ -209,7 +209,7 @@ class periodic_Matern52(kernpart):
        F2lower = np.array(self._cos(self.basis_alpha*self.basis_omega**2,self.basis_omega,self.basis_phi+np.pi)(self.lower))[:,None]

        #dK_dvar
-        dK_dvar = 1./self.variance*mdot(FX,self.Gi,FX2.T)
+        dK_dvar = 1. / self.variance * mdot(FX, self.Gi, FX.T)

        #dK_dlen
        da_dlen = [-3*self.a[0]/self.lengthscale, -2*self.a[1]/self.lengthscale, -self.a[2]/self.lengthscale, 0.]
--- a/GPy/kern/periodic_exponential.py
+++ b/GPy/kern/periodic_exponential.py
@ -2,21 +2,21 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np
-from GPy.util.linalg import mdot, pdinv
+from GPy.util.linalg import mdot
 from GPy.util.decorators import silence_errors

-class periodic_exponential(kernpart):
+class periodic_exponential(Kernpart):
    """
-    Kernel of the periodic subspace (up to a given frequency) of a exponential (Matern 1/2) RKHS. Only defined for D=1.
+    Kernel of the periodic subspace (up to a given frequency) of a exponential (Matern 1/2) RKHS. Only defined for input_dim=1.

-    :param D: the number of input dimensions
-    :type D: int
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
    :param variance: the variance of the Matern kernel
    :type variance: float
    :param lengthscale: the lengthscale of the Matern kernel
-    :type lengthscale: np.ndarray of size (D,)
+    :type lengthscale: np.ndarray of size (input_dim,)
    :param period: the period
    :type period: float
    :param n_freq: the number of frequencies considered for the periodic subspace
@ -25,17 +25,17 @@ class periodic_exponential(kernpart):

    """

-    def __init__(self,D=1,variance=1.,lengthscale=None,period=2*np.pi,n_freq=10,lower=0.,upper=4*np.pi):
-        assert D==1, "Periodic kernels are only defined for D=1"
+    def __init__(self, input_dim=1, variance=1., lengthscale=None, period=2 * np.pi, n_freq=10, lower=0., upper=4 * np.pi):
+        assert input_dim==1, "Periodic kernels are only defined for input_dim=1"
        self.name = 'periodic_exp'
-        self.D = D
+        self.input_dim = input_dim
        if lengthscale is not None:
            lengthscale = np.asarray(lengthscale)
            assert lengthscale.size == 1, "Wrong size: only one lengthscale needed"
        else:
            lengthscale = np.ones(1)
        self.lower,self.upper = lower, upper
-        self.Nparam = 3
+        self.num_params = 3
        self.n_freq = n_freq
        self.n_basis = 2*n_freq
        self._set_params(np.hstack((variance,lengthscale,period)))
@ -64,7 +64,7 @@ class periodic_exponential(kernpart):
    def _get_params(self):
        """return the value of the parameters."""
        return np.hstack((self.variance,self.lengthscale,self.period))
-    
+
    def _set_params(self,x):
        """set the value of the parameters."""
        assert x.size==3
@ -111,7 +111,7 @@ class periodic_exponential(kernpart):

    @silence_errors
    def dK_dtheta(self,dL_dK,X,X2,target):
-        """derivative of the covariance matrix with respect to the parameters (shape is NxMxNparam)"""
+        """derivative of the covariance matrix with respect to the parameters (shape is N x num_inducing x num_params)"""
        if X2 is None: X2 = X
        FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
        FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
--- a/GPy/kern/prod.py
+++ b/GPy/kern/prod.py
@ -1,35 +1,35 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np
 import hashlib

-class prod(kernpart):
+class prod(Kernpart):
    """
    Computes the product of 2 kernels

    :param k1, k2: the kernels to multiply
-    :type k1, k2: kernpart
+    :type k1, k2: Kernpart
    :param tensor: The kernels are either multiply as functions defined on the same input space (default) or on the product of the input spaces
    :type tensor: Boolean
    :rtype: kernel object

    """
    def __init__(self,k1,k2,tensor=False):
-        self.Nparam = k1.Nparam + k2.Nparam
+        self.num_params = k1.num_params + k2.num_params
        self.name = k1.name + '<times>' + k2.name
        self.k1 = k1
        self.k2 = k2
        if tensor:
-            self.D = k1.D + k2.D
-            self.slice1 = slice(0,self.k1.D)
-            self.slice2 = slice(self.k1.D,self.k1.D+self.k2.D)
+            self.input_dim = k1.input_dim + k2.input_dim
+            self.slice1 = slice(0,self.k1.input_dim)
+            self.slice2 = slice(self.k1.input_dim,self.k1.input_dim+self.k2.input_dim)
        else:
-            assert k1.D == k2.D, "Error: The input spaces of the kernels to sum don't have the same dimension."
-            self.D = k1.D
-            self.slice1 = slice(0,self.D)
-            self.slice2 = slice(0,self.D)
+            assert k1.input_dim == k2.input_dim, "Error: The input spaces of the kernels to sum don't have the same dimension."
+            self.input_dim = k1.input_dim
+            self.slice1 = slice(0,self.input_dim)
+            self.slice2 = slice(0,self.input_dim)

        self._X, self._X2, self._params = np.empty(shape=(3,1))
        self._set_params(np.hstack((k1._get_params(),k2._get_params())))
@ -40,8 +40,8 @@ class prod(kernpart):

    def _set_params(self,x):
        """set the value of the parameters."""
-        self.k1._set_params(x[:self.k1.Nparam])
-        self.k2._set_params(x[self.k1.Nparam:])
+        self.k1._set_params(x[:self.k1.num_params])
+        self.k2._set_params(x[self.k1.num_params:])

    def _get_param_names(self):
        """return parameter names."""
@ -55,11 +55,11 @@ class prod(kernpart):
        """derivative of the covariance matrix with respect to the parameters."""
        self._K_computations(X,X2)
        if X2 is None:
-            self.k1.dK_dtheta(dL_dK*self._K2, X[:,self.slice1], None, target[:self.k1.Nparam])
-            self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.slice2], None, target[self.k1.Nparam:])
+            self.k1.dK_dtheta(dL_dK*self._K2, X[:,self.slice1], None, target[:self.k1.num_params])
+            self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.slice2], None, target[self.k1.num_params:])
        else:
-            self.k1.dK_dtheta(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:self.k1.Nparam])
-            self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[self.k1.Nparam:])
+            self.k1.dK_dtheta(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:self.k1.num_params])
+            self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[self.k1.num_params:])

    def Kdiag(self,X,target):
        """Compute the diagonal of the covariance matrix associated to X."""
@ -74,8 +74,8 @@ class prod(kernpart):
        K2 = np.zeros(X.shape[0])
        self.k1.Kdiag(X[:,self.slice1],K1)
        self.k2.Kdiag(X[:,self.slice2],K2)
-        self.k1.dKdiag_dtheta(dL_dKdiag*K2,X[:,self.slice1],target[:self.k1.Nparam])
-        self.k2.dKdiag_dtheta(dL_dKdiag*K1,X[:,self.slice2],target[self.k1.Nparam:])
+        self.k1.dKdiag_dtheta(dL_dKdiag*K2,X[:,self.slice1],target[:self.k1.num_params])
+        self.k2.dKdiag_dtheta(dL_dKdiag*K1,X[:,self.slice2],target[self.k1.num_params:])

    def dK_dX(self,dL_dK,X,X2,target):
        """derivative of the covariance matrix with respect to X."""
--- a/GPy/kern/prod_orthogonal.py
+++ b/GPy/kern/prod_orthogonal.py
@ -1,23 +1,23 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np
 import hashlib
 #from scipy import integrate # This may not be necessary (Nicolas, 20th Feb)

-class prod_orthogonal(kernpart):
+class prod_orthogonal(Kernpart):
    """
    Computes the product of 2 kernels

    :param k1, k2: the kernels to multiply
-    :type k1, k2: kernpart
+    :type k1, k2: Kernpart
    :rtype: kernel object

    """
    def __init__(self,k1,k2):
-        self.D = k1.D + k2.D
-        self.Nparam = k1.Nparam + k2.Nparam
+        self.input_dim = k1.input_dim + k2.input_dim
+        self.num_params = k1.num_params + k2.num_params
        self.name = k1.name + '<times>' + k2.name
        self.k1 = k1
        self.k2 = k2
@ -30,8 +30,8 @@ class prod_orthogonal(kernpart):

    def _set_params(self,x):
        """set the value of the parameters."""
-        self.k1._set_params(x[:self.k1.Nparam])
-        self.k2._set_params(x[self.k1.Nparam:])
+        self.k1._set_params(x[:self.k1.num_params])
+        self.k2._set_params(x[self.k1.num_params:])

    def _get_param_names(self):
        """return parameter names."""
@ -45,42 +45,42 @@ class prod_orthogonal(kernpart):
        """derivative of the covariance matrix with respect to the parameters."""
        self._K_computations(X,X2)
        if X2 is None:
-            self.k1.dK_dtheta(dL_dK*self._K2, X[:,:self.k1.D], None, target[:self.k1.Nparam])
-            self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.k1.D:], None, target[self.k1.Nparam:])
+            self.k1.dK_dtheta(dL_dK*self._K2, X[:,:self.k1.input_dim], None, target[:self.k1.num_params])
+            self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.k1.input_dim:], None, target[self.k1.num_params:])
        else:
-            self.k1.dK_dtheta(dL_dK*self._K2, X[:,:self.k1.D], X2[:,:self.k1.D], target[:self.k1.Nparam])
-            self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.k1.D:], X2[:,self.k1.D:], target[self.k1.Nparam:])
+            self.k1.dK_dtheta(dL_dK*self._K2, X[:,:self.k1.input_dim], X2[:,:self.k1.input_dim], target[:self.k1.num_params])
+            self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.k1.input_dim:], X2[:,self.k1.input_dim:], target[self.k1.num_params:])

    def Kdiag(self,X,target):
        """Compute the diagonal of the covariance matrix associated to X."""
        target1 = np.zeros(X.shape[0])
        target2 = np.zeros(X.shape[0])
-        self.k1.Kdiag(X[:,:self.k1.D],target1)
-        self.k2.Kdiag(X[:,self.k1.D:],target2)
+        self.k1.Kdiag(X[:,:self.k1.input_dim],target1)
+        self.k2.Kdiag(X[:,self.k1.input_dim:],target2)
        target += target1 * target2

    def dKdiag_dtheta(self,dL_dKdiag,X,target):
        K1 = np.zeros(X.shape[0])
        K2 = np.zeros(X.shape[0])
-        self.k1.Kdiag(X[:,:self.k1.D],K1)
-        self.k2.Kdiag(X[:,self.k1.D:],K2)
-        self.k1.dKdiag_dtheta(dL_dKdiag*K2,X[:,:self.k1.D],target[:self.k1.Nparam])
-        self.k2.dKdiag_dtheta(dL_dKdiag*K1,X[:,self.k1.D:],target[self.k1.Nparam:])
+        self.k1.Kdiag(X[:,:self.k1.input_dim],K1)
+        self.k2.Kdiag(X[:,self.k1.input_dim:],K2)
+        self.k1.dKdiag_dtheta(dL_dKdiag*K2,X[:,:self.k1.input_dim],target[:self.k1.num_params])
+        self.k2.dKdiag_dtheta(dL_dKdiag*K1,X[:,self.k1.input_dim:],target[self.k1.num_params:])

    def dK_dX(self,dL_dK,X,X2,target):
        """derivative of the covariance matrix with respect to X."""
        self._K_computations(X,X2)
-        self.k1.dK_dX(dL_dK*self._K2, X[:,:self.k1.D], X2[:,:self.k1.D], target)
-        self.k2.dK_dX(dL_dK*self._K1, X[:,self.k1.D:], X2[:,self.k1.D:], target)
+        self.k1.dK_dX(dL_dK*self._K2, X[:,:self.k1.input_dim], X2[:,:self.k1.input_dim], target)
+        self.k2.dK_dX(dL_dK*self._K1, X[:,self.k1.input_dim:], X2[:,self.k1.input_dim:], target)

    def dKdiag_dX(self, dL_dKdiag, X, target):
        K1 = np.zeros(X.shape[0])
        K2 = np.zeros(X.shape[0])
-        self.k1.Kdiag(X[:,0:self.k1.D],K1)
-        self.k2.Kdiag(X[:,self.k1.D:],K2)
+        self.k1.Kdiag(X[:,0:self.k1.input_dim],K1)
+        self.k2.Kdiag(X[:,self.k1.input_dim:],K2)

-        self.k1.dK_dX(dL_dKdiag*K2, X[:,:self.k1.D], target)
-        self.k2.dK_dX(dL_dKdiag*K1, X[:,self.k1.D:], target)
+        self.k1.dK_dX(dL_dKdiag*K2, X[:,:self.k1.input_dim], target)
+        self.k2.dK_dX(dL_dKdiag*K1, X[:,self.k1.input_dim:], target)

    def _K_computations(self,X,X2):
        if not (np.array_equal(X,self._X) and np.array_equal(X2,self._X2) and np.array_equal(self._params , self._get_params())):
@ -90,12 +90,12 @@ class prod_orthogonal(kernpart):
                self._X2 = None
                self._K1 = np.zeros((X.shape[0],X.shape[0]))
                self._K2 = np.zeros((X.shape[0],X.shape[0]))
-                self.k1.K(X[:,:self.k1.D],None,self._K1)
-                self.k2.K(X[:,self.k1.D:],None,self._K2)
+                self.k1.K(X[:,:self.k1.input_dim],None,self._K1)
+                self.k2.K(X[:,self.k1.input_dim:],None,self._K2)
            else:
                self._X2 = X2.copy()
                self._K1 = np.zeros((X.shape[0],X2.shape[0]))
                self._K2 = np.zeros((X.shape[0],X2.shape[0]))
-                self.k1.K(X[:,:self.k1.D],X2[:,:self.k1.D],self._K1)
-                self.k2.K(X[:,self.k1.D:],X2[:,self.k1.D:],self._K2)
+                self.k1.K(X[:,:self.k1.input_dim],X2[:,:self.k1.input_dim],self._K1)
+                self.k2.K(X[:,self.k1.input_dim:],X2[:,self.k1.input_dim:],self._K2)

--- a/GPy/kern/rational_quadratic.py
+++ b/GPy/kern/rational_quadratic.py
@ -2,10 +2,10 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np

-class rational_quadratic(kernpart):
+class rational_quadratic(Kernpart):
    """
    rational quadratic kernel

@ -13,21 +13,21 @@ class rational_quadratic(kernpart):

       k(r) = \sigma^2 \\bigg( 1 + \\frac{r^2}{2 \ell^2} \\bigg)^{- \\alpha} \ \ \ \ \  \\text{ where  } r^2 = (x-y)^2

-    :param D: the number of input dimensions
-    :type D: int (D=1 is the only value currently supported)
+    :param input_dim: the number of input dimensions
+    :type input_dim: int (input_dim=1 is the only value currently supported)
    :param variance: the variance :math:`\sigma^2`
    :type variance: float
    :param lengthscale: the lengthscale :math:`\ell`
    :type lengthscale: float
    :param power: the power :math:`\\alpha`
    :type power: float
-    :rtype: kernpart object
+    :rtype: Kernpart object

    """
-    def __init__(self,D,variance=1.,lengthscale=1.,power=1.):
-        assert D == 1, "For this kernel we assume D=1"
-        self.D = D
-        self.Nparam = 3
+    def __init__(self,input_dim,variance=1.,lengthscale=1.,power=1.):
+        assert input_dim == 1, "For this kernel we assume input_dim=1"
+        self.input_dim = input_dim
+        self.num_params = 3
        self.name = 'rat_quad'
        self.variance = variance
        self.lengthscale = lengthscale
--- a/GPy/kern/rbf.py
+++ b/GPy/kern/rbf.py
@ -2,13 +2,13 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np
 import hashlib
 from scipy import weave
 from ..util.linalg import tdot

-class rbf(kernpart):
+class rbf(Kernpart):
    """
    Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel:

@ -18,8 +18,8 @@ class rbf(kernpart):

    where \ell_i is the lengthscale, \sigma^2 the variance and d the dimensionality of the input.

-    :param D: the number of input dimensions
-    :type D: int
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
    :param variance: the variance of the kernel
    :type variance: float
    :param lengthscale: the vector of lengthscale of the kernel
@ -31,76 +31,76 @@ class rbf(kernpart):
    .. Note: this object implements both the ARD and 'spherical' version of the function
    """

-    def __init__(self,D,variance=1.,lengthscale=None,ARD=False):
-        self.D = D
+    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False):
+        self.input_dim = input_dim
        self.name = 'rbf'
        self.ARD = ARD
        if not ARD:
-            self.Nparam = 2
+            self.num_params = 2
            if lengthscale is not None:
                lengthscale = np.asarray(lengthscale)
                assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
            else:
                lengthscale = np.ones(1)
        else:
-            self.Nparam = self.D + 1
+            self.num_params = self.input_dim + 1
            if lengthscale is not None:
                lengthscale = np.asarray(lengthscale)
-                assert lengthscale.size == self.D, "bad number of lengthscales"
+                assert lengthscale.size == self.input_dim, "bad number of lengthscales"
            else:
-                lengthscale = np.ones(self.D)
+                lengthscale = np.ones(self.input_dim)

-        self._set_params(np.hstack((variance,lengthscale.flatten())))
+        self._set_params(np.hstack((variance, lengthscale.flatten())))

-        #initialize cache
-        self._Z, self._mu, self._S = np.empty(shape=(3,1))
-        self._X, self._X2, self._params = np.empty(shape=(3,1))
+        # initialize cache
+        self._Z, self._mu, self._S = np.empty(shape=(3, 1))
+        self._X, self._X2, self._params = np.empty(shape=(3, 1))

-        #a set of optional args to pass to weave
+        # a set of optional args to pass to weave
        self.weave_options = {'headers'           : ['<omp.h>'],
-                         'extra_compile_args': ['-fopenmp -O3'],  #-march=native'],
+                         'extra_compile_args': ['-fopenmp -O3'], # -march=native'],
                         'extra_link_args'   : ['-lgomp']}



    def _get_params(self):
-        return np.hstack((self.variance,self.lengthscale))
+        return np.hstack((self.variance, self.lengthscale))

-    def _set_params(self,x):
-        assert x.size==(self.Nparam)
+    def _set_params(self, x):
+        assert x.size == (self.num_params)
        self.variance = x[0]
        self.lengthscale = x[1:]
        self.lengthscale2 = np.square(self.lengthscale)
-        #reset cached results
-        self._X, self._X2, self._params = np.empty(shape=(3,1))
-        self._Z, self._mu, self._S = np.empty(shape=(3,1)) # cached versions of Z,mu,S
+        # reset cached results
+        self._X, self._X2, self._params = np.empty(shape=(3, 1))
+        self._Z, self._mu, self._S = np.empty(shape=(3, 1)) # cached versions of Z,mu,S

    def _get_param_names(self):
-        if self.Nparam == 2:
-            return ['variance','lengthscale']
+        if self.num_params == 2:
+            return ['variance', 'lengthscale']
        else:
-            return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscale.size)]
+            return ['variance'] + ['lengthscale_%i' % i for i in range(self.lengthscale.size)]

-    def K(self,X,X2,target):
-        self._K_computations(X,X2)
-        target += self.variance*self._K_dvar
+    def K(self, X, X2, target):
+        self._K_computations(X, X2)
+        target += self.variance * self._K_dvar

-    def Kdiag(self,X,target):
-        np.add(target,self.variance,target)
+    def Kdiag(self, X, target):
+        np.add(target, self.variance, target)

-    def dK_dtheta(self,dL_dK,X,X2,target):
-        self._K_computations(X,X2)
-        target[0] += np.sum(self._K_dvar*dL_dK)
+    def dK_dtheta(self, dL_dK, X, X2, target):
+        self._K_computations(X, X2)
+        target[0] += np.sum(self._K_dvar * dL_dK)
        if self.ARD:
-            dvardLdK = self._K_dvar*dL_dK
-            var_len3 = self.variance/np.power(self.lengthscale,3)
+            dvardLdK = self._K_dvar * dL_dK
+            var_len3 = self.variance / np.power(self.lengthscale, 3)
            if X2 is None:
-                #save computation for the symmetrical case
+                # save computation for the symmetrical case
                dvardLdK += dvardLdK.T
                code = """
                int q,i,j;
                double tmp;
-                for(q=0; q<D; q++){
+                for(q=0; q<input_dim; q++){
                  tmp = 0;
                  for(i=0; i<N; i++){
                    for(j=0; j<i; j++){
@ -110,39 +110,39 @@ class rbf(kernpart):
                  target(q+1) += var_len3(q)*tmp;
                }
                """
-                N,M,D = X.shape[0], X.shape[0], self.D
+                N, num_inducing, input_dim = X.shape[0], X.shape[0], self.input_dim
            else:
                code = """
                int q,i,j;
                double tmp;
-                for(q=0; q<D; q++){
+                for(q=0; q<input_dim; q++){
                  tmp = 0;
                  for(i=0; i<N; i++){
-                    for(j=0; j<M; j++){
+                    for(j=0; j<num_inducing; j++){
                      tmp += (X(i,q)-X2(j,q))*(X(i,q)-X2(j,q))*dvardLdK(i,j);
                    }
                  }
                  target(q+1) += var_len3(q)*tmp;
                }
                """
-                N,M,D = X.shape[0], X2.shape[0], self.D
-            #[np.add(target[1+q:2+q],var_len3[q]*np.sum(dvardLdK*np.square(X[:,q][:,None]-X2[:,q][None,:])),target[1+q:2+q]) for q in range(self.D)]
-            weave.inline(code, arg_names=['N','M','D','X','X2','target','dvardLdK','var_len3'],
-                 type_converters=weave.converters.blitz,**self.weave_options)
+                N, num_inducing, input_dim = X.shape[0], X2.shape[0], self.input_dim
+            # [np.add(target[1+q:2+q],var_len3[q]*np.sum(dvardLdK*np.square(X[:,q][:,None]-X2[:,q][None,:])),target[1+q:2+q]) for q in range(self.input_dim)]
+            weave.inline(code, arg_names=['N','num_inducing','input_dim','X','X2','target','dvardLdK','var_len3'],
+                 type_converters=weave.converters.blitz, **self.weave_options)
        else:
-            target[1] += (self.variance/self.lengthscale)*np.sum(self._K_dvar*self._K_dist2*dL_dK)
+            target[1] += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dK)

-    def dKdiag_dtheta(self,dL_dKdiag,X,target):
-        #NB: derivative of diagonal elements wrt lengthscale is 0
+    def dKdiag_dtheta(self, dL_dKdiag, X, target):
+        # NB: derivative of diagonal elements wrt lengthscale is 0
        target[0] += np.sum(dL_dKdiag)

-    def dK_dX(self,dL_dK,X,X2,target):
-        self._K_computations(X,X2)
-        _K_dist = X[:,None,:]-X2[None,:,:] #don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
-        dK_dX = (-self.variance/self.lengthscale2)*np.transpose(self._K_dvar[:,:,np.newaxis]*_K_dist,(1,0,2))
-        target += np.sum(dK_dX*dL_dK.T[:,:,None],0)
+    def dK_dX(self, dL_dK, X, X2, target):
+        self._K_computations(X, X2)
+        _K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
+        dK_dX = (-self.variance / self.lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2))
+        target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)

-    def dKdiag_dX(self,dL_dKdiag,X,target):
+    def dKdiag_dX(self, dL_dKdiag, X, target):
        pass


@ -150,101 +150,100 @@ class rbf(kernpart):
    #             PSI statistics            #
    #---------------------------------------#

-    def psi0(self,Z,mu,S,target):
+    def psi0(self, Z, mu, S, target):
        target += self.variance

-    def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target):
+    def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S, target):
        target[0] += np.sum(dL_dpsi0)

-    def dpsi0_dmuS(self,dL_dpsi0,Z,mu,S,target_mu,target_S):
+    def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S, target_mu, target_S):
        pass

-    def psi1(self,Z,mu,S,target):
-        self._psi_computations(Z,mu,S)
+    def psi1(self, Z, mu, S, target):
+        self._psi_computations(Z, mu, S)
        target += self._psi1

-    def dpsi1_dtheta(self,dL_dpsi1,Z,mu,S,target):
-        self._psi_computations(Z,mu,S)
-        denom_deriv = S[:,None,:]/(self.lengthscale**3+self.lengthscale*S[:,None,:])
-        d_length = self._psi1[:,:,None]*(self.lengthscale*np.square(self._psi1_dist/(self.lengthscale2+S[:,None,:])) + denom_deriv)
-        target[0] += np.sum(dL_dpsi1*self._psi1/self.variance)
-        dpsi1_dlength = d_length*dL_dpsi1[:,:,None]
+    def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target):
+        self._psi_computations(Z, mu, S)
+        denom_deriv = S[:, None, :] / (self.lengthscale ** 3 + self.lengthscale * S[:, None, :])
+        d_length = self._psi1[:, :, None] * (self.lengthscale * np.square(self._psi1_dist / (self.lengthscale2 + S[:, None, :])) + denom_deriv)
+        target[0] += np.sum(dL_dpsi1 * self._psi1 / self.variance)
+        dpsi1_dlength = d_length * dL_dpsi1[:, :, None]
        if not self.ARD:
            target[1] += dpsi1_dlength.sum()
        else:
            target[1:] += dpsi1_dlength.sum(0).sum(0)

-    def dpsi1_dZ(self,dL_dpsi1,Z,mu,S,target):
-        self._psi_computations(Z,mu,S)
-        denominator = (self.lengthscale2*(self._psi1_denom))
-        dpsi1_dZ = - self._psi1[:,:,None] * ((self._psi1_dist/denominator))
-        target += np.sum(dL_dpsi1.T[:,:,None] * dpsi1_dZ, 0)
+    def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target):
+        self._psi_computations(Z, mu, S)
+        denominator = (self.lengthscale2 * (self._psi1_denom))
+        dpsi1_dZ = -self._psi1[:, :, None] * ((self._psi1_dist / denominator))
+        target += np.sum(dL_dpsi1.T[:, :, None] * dpsi1_dZ, 0)

-    def dpsi1_dmuS(self,dL_dpsi1,Z,mu,S,target_mu,target_S):
-        self._psi_computations(Z,mu,S)
-        tmp = self._psi1[:,:,None]/self.lengthscale2/self._psi1_denom
-        target_mu += np.sum(dL_dpsi1.T[:, :, None]*tmp*self._psi1_dist,1)
-        target_S += np.sum(dL_dpsi1.T[:, :, None]*0.5*tmp*(self._psi1_dist_sq-1),1)
+    def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S):
+        self._psi_computations(Z, mu, S)
+        tmp = self._psi1[:, :, None] / self.lengthscale2 / self._psi1_denom
+        target_mu += np.sum(dL_dpsi1.T[:, :, None] * tmp * self._psi1_dist, 1)
+        target_S += np.sum(dL_dpsi1.T[:, :, None] * 0.5 * tmp * (self._psi1_dist_sq - 1), 1)

-    def psi2(self,Z,mu,S,target):
-        self._psi_computations(Z,mu,S)
+    def psi2(self, Z, mu, S, target):
+        self._psi_computations(Z, mu, S)
        target += self._psi2

-    def dpsi2_dtheta(self,dL_dpsi2,Z,mu,S,target):
-        """Shape N,M,M,Ntheta"""
-        self._psi_computations(Z,mu,S)
-        d_var = 2.*self._psi2/self.variance
-        d_length = 2.*self._psi2[:,:,:,None]*(self._psi2_Zdist_sq*self._psi2_denom + self._psi2_mudist_sq + S[:,None,None,:]/self.lengthscale2)/(self.lengthscale*self._psi2_denom)
+    def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target):
+        """Shape N,num_inducing,num_inducing,Ntheta"""
+        self._psi_computations(Z, mu, S)
+        d_var = 2.*self._psi2 / self.variance
+        d_length = 2.*self._psi2[:, :, :, None] * (self._psi2_Zdist_sq * self._psi2_denom + self._psi2_mudist_sq + S[:, None, None, :] / self.lengthscale2) / (self.lengthscale * self._psi2_denom)

-        target[0] += np.sum(dL_dpsi2*d_var)
-        dpsi2_dlength = d_length*dL_dpsi2[:,:,:,None]
+        target[0] += np.sum(dL_dpsi2 * d_var)
+        dpsi2_dlength = d_length * dL_dpsi2[:, :, :, None]
        if not self.ARD:
            target[1] += dpsi2_dlength.sum()
        else:
            target[1:] += dpsi2_dlength.sum(0).sum(0).sum(0)

-    def dpsi2_dZ(self,dL_dpsi2,Z,mu,S,target):
-        self._psi_computations(Z,mu,S)
-        term1 = self._psi2_Zdist/self.lengthscale2 # M, M, input_dim
-        term2 = self._psi2_mudist/self._psi2_denom/self.lengthscale2 # N, M, M, input_dim
-        dZ = self._psi2[:,:,:,None] * (term1[None] + term2)
-        target += (dL_dpsi2[:,:,:,None]*dZ).sum(0).sum(0)
-
-    def dpsi2_dmuS(self,dL_dpsi2,Z,mu,S,target_mu,target_S):
-        """Think N,M,M,input_dim """
-        self._psi_computations(Z,mu,S)
-        tmp = self._psi2[:,:,:,None]/self.lengthscale2/self._psi2_denom
-        target_mu += -2.*(dL_dpsi2[:,:,:,None]*tmp*self._psi2_mudist).sum(1).sum(1)
-        target_S += (dL_dpsi2[:,:,:,None]*tmp*(2.*self._psi2_mudist_sq-1)).sum(1).sum(1)
+    def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target):
+        self._psi_computations(Z, mu, S)
+        term1 = self._psi2_Zdist / self.lengthscale2 # num_inducing, num_inducing, input_dim
+        term2 = self._psi2_mudist / self._psi2_denom / self.lengthscale2 # N, num_inducing, num_inducing, input_dim
+        dZ = self._psi2[:, :, :, None] * (term1[None] + term2)
+        target += (dL_dpsi2[:, :, :, None] * dZ).sum(0).sum(0)

+    def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
+        """Think N,num_inducing,num_inducing,input_dim """
+        self._psi_computations(Z, mu, S)
+        tmp = self._psi2[:, :, :, None] / self.lengthscale2 / self._psi2_denom
+        target_mu += -2.*(dL_dpsi2[:, :, :, None] * tmp * self._psi2_mudist).sum(1).sum(1)
+        target_S += (dL_dpsi2[:, :, :, None] * tmp * (2.*self._psi2_mudist_sq - 1)).sum(1).sum(1)

    #---------------------------------------#
    #            Precomputations            #
    #---------------------------------------#

-    def _K_computations(self,X,X2):
-        if not (np.array_equal(X,self._X) and np.array_equal(X2,self._X2) and np.array_equal(self._params , self._get_params())):
+    def _K_computations(self, X, X2):
+        if not (np.array_equal(X, self._X) and np.array_equal(X2, self._X2) and np.array_equal(self._params , self._get_params())):
            self._X = X.copy()
            self._params == self._get_params().copy()
            if X2 is None:
                self._X2 = None
-                X = X/self.lengthscale
-                Xsquare = np.sum(np.square(X),1)
-                self._K_dist2 = -2.*tdot(X) + (Xsquare[:,None] + Xsquare[None,:])
+                X = X / self.lengthscale
+                Xsquare = np.sum(np.square(X), 1)
+                self._K_dist2 = -2.*tdot(X) + (Xsquare[:, None] + Xsquare[None, :])
            else:
                self._X2 = X2.copy()
-                X = X/self.lengthscale
-                X2 = X2/self.lengthscale
-                self._K_dist2 = -2.*np.dot(X, X2.T) + (np.sum(np.square(X),1)[:,None] + np.sum(np.square(X2),1)[None,:])
-            self._K_dvar = np.exp(-0.5*self._K_dist2)
+                X = X / self.lengthscale
+                X2 = X2 / self.lengthscale
+                self._K_dist2 = -2.*np.dot(X, X2.T) + (np.sum(np.square(X), 1)[:, None] + np.sum(np.square(X2), 1)[None, :])
+            self._K_dvar = np.exp(-0.5 * self._K_dist2)

-    def _psi_computations(self,Z,mu,S):
-        #here are the "statistics" for psi1 and psi2
+    def _psi_computations(self, Z, mu, S):
+        # here are the "statistics" for psi1 and psi2
        if not np.array_equal(Z, self._Z):
            #Z has changed, compute Z specific stuff
-            self._psi2_Zhat = 0.5*(Z[:,None,:] +Z[None,:,:]) # M,M,input_dim
-            self._psi2_Zdist = 0.5*(Z[:,None,:]-Z[None,:,:]) # M,M,input_dim
-            self._psi2_Zdist_sq = np.square(self._psi2_Zdist/self.lengthscale) # M,M,input_dim
+            self._psi2_Zhat = 0.5*(Z[:,None,:] +Z[None,:,:]) # num_inducing,num_inducing,input_dim
+            self._psi2_Zdist = 0.5*(Z[:,None,:]-Z[None,:,:]) # num_inducing,num_inducing,input_dim
+            self._psi2_Zdist_sq = np.square(self._psi2_Zdist/self.lengthscale) # num_inducing,num_inducing,input_dim
            self._Z = Z

        if not (np.array_equal(Z, self._Z) and np.array_equal(mu, self._mu) and np.array_equal(S, self._S)):
@ -258,39 +257,39 @@ class rbf(kernpart):
            self._psi1 = self.variance*np.exp(self._psi1_exponent)

            #psi2
-            self._psi2_denom = 2.*S[:,None,None,:]/self.lengthscale2+1. # N,M,M,input_dim
+            self._psi2_denom = 2.*S[:,None,None,:]/self.lengthscale2+1. # N,num_inducing,num_inducing,input_dim
            self._psi2_mudist, self._psi2_mudist_sq, self._psi2_exponent, _ = self.weave_psi2(mu,self._psi2_Zhat)
-            #self._psi2_mudist = mu[:,None,None,:]-self._psi2_Zhat #N,M,M,input_dim
+            #self._psi2_mudist = mu[:,None,None,:]-self._psi2_Zhat #N,num_inducing,num_inducing,input_dim
            #self._psi2_mudist_sq = np.square(self._psi2_mudist)/(self.lengthscale2*self._psi2_denom)
-            #self._psi2_exponent = np.sum(-self._psi2_Zdist_sq -self._psi2_mudist_sq -0.5*np.log(self._psi2_denom),-1) #N,M,M
-            self._psi2 = np.square(self.variance)*np.exp(self._psi2_exponent) # N,M,M
+            #self._psi2_exponent = np.sum(-self._psi2_Zdist_sq -self._psi2_mudist_sq -0.5*np.log(self._psi2_denom),-1) #N,num_inducing,num_inducing
+            self._psi2 = np.square(self.variance)*np.exp(self._psi2_exponent) # N,num_inducing,num_inducing

            #store matrices for caching
            self._Z, self._mu, self._S = Z, mu,S

    def weave_psi2(self,mu,Zhat):
        N,input_dim = mu.shape
-        M = Zhat.shape[0]
+        num_inducing = Zhat.shape[0]

-        mudist = np.empty((N,M,M,input_dim))
-        mudist_sq = np.empty((N,M,M,input_dim))
-        psi2_exponent = np.zeros((N,M,M))
-        psi2 = np.empty((N,M,M))
+        mudist = np.empty((N,num_inducing,num_inducing,input_dim))
+        mudist_sq = np.empty((N,num_inducing,num_inducing,input_dim))
+        psi2_exponent = np.zeros((N,num_inducing,num_inducing))
+        psi2 = np.empty((N,num_inducing,num_inducing))

        psi2_Zdist_sq = self._psi2_Zdist_sq
-        _psi2_denom = self._psi2_denom.squeeze().reshape(N,self.D)
-        half_log_psi2_denom = 0.5*np.log(self._psi2_denom).squeeze().reshape(N,self.D)
+        _psi2_denom = self._psi2_denom.squeeze().reshape(N, self.input_dim)
+        half_log_psi2_denom = 0.5 * np.log(self._psi2_denom).squeeze().reshape(N, self.input_dim)
        variance_sq = float(np.square(self.variance))
        if self.ARD:
            lengthscale2 = self.lengthscale2
        else:
-            lengthscale2 = np.ones(input_dim)*self.lengthscale2
+            lengthscale2 = np.ones(input_dim) * self.lengthscale2
        code = """
        double tmp;

        #pragma omp parallel for private(tmp)
        for (int n=0; n<N; n++){
-            for (int m=0; m<M; m++){
+            for (int m=0; m<num_inducing; m++){
               for (int mm=0; mm<(m+1); mm++){
                   for (int q=0; q<input_dim; q++){
                       //compute mudist
@ -325,7 +324,7 @@ class rbf(kernpart):
        #include <math.h>
        """
        weave.inline(code, support_code=support_code, libraries=['gomp'],
-                     arg_names=['N','M','input_dim','mu','Zhat','mudist_sq','mudist','lengthscale2','_psi2_denom','psi2_Zdist_sq','psi2_exponent','half_log_psi2_denom','psi2','variance_sq'],
-                     type_converters=weave.converters.blitz,**self.weave_options)
+                     arg_names=['N','num_inducing','input_dim','mu','Zhat','mudist_sq','mudist','lengthscale2','_psi2_denom','psi2_Zdist_sq','psi2_exponent','half_log_psi2_denom','psi2','variance_sq'],
+                     type_converters=weave.converters.blitz, **self.weave_options)

-        return mudist,mudist_sq, psi2_exponent, psi2
+        return mudist, mudist_sq, psi2_exponent, psi2
--- a/GPy/kern/rbfcos.py
+++ b/GPy/kern/rbfcos.py
@ -3,32 +3,32 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np

-class rbfcos(kernpart):
-    def __init__(self,D,variance=1.,frequencies=None,bandwidths=None,ARD=False):
-        self.D = D
+class rbfcos(Kernpart):
+    def __init__(self,input_dim,variance=1.,frequencies=None,bandwidths=None,ARD=False):
+        self.input_dim = input_dim
        self.name = 'rbfcos'
-        if self.D>10:
+        if self.input_dim>10:
            print "Warning: the rbfcos kernel requires a lot of memory for high dimensional inputs"
        self.ARD = ARD

-        #set the default frequencies and bandwidths, appropriate Nparam
+        #set the default frequencies and bandwidths, appropriate num_params
        if ARD:
-            self.Nparam = 2*self.D + 1
+            self.num_params = 2*self.input_dim + 1
            if frequencies is not None:
                frequencies = np.asarray(frequencies)
-                assert frequencies.size == self.D, "bad number of frequencies"
+                assert frequencies.size == self.input_dim, "bad number of frequencies"
            else:
-                frequencies = np.ones(self.D)
+                frequencies = np.ones(self.input_dim)
            if bandwidths is not None:
                bandwidths = np.asarray(bandwidths)
-                assert bandwidths.size == self.D, "bad number of bandwidths"
+                assert bandwidths.size == self.input_dim, "bad number of bandwidths"
            else:
-                bandwidths = np.ones(self.D)
+                bandwidths = np.ones(self.input_dim)
        else:
-            self.Nparam = 3
+            self.num_params = 3
            if frequencies is not None:
                frequencies = np.asarray(frequencies)
                assert frequencies.size == 1, "Exactly one frequency needed for non-ARD kernel"
@ -51,19 +51,19 @@ class rbfcos(kernpart):
        return np.hstack((self.variance,self.frequencies, self.bandwidths))

    def _set_params(self,x):
-        assert x.size==(self.Nparam)
+        assert x.size==(self.num_params)
        if self.ARD:
            self.variance = x[0]
-            self.frequencies = x[1:1+self.D]
-            self.bandwidths = x[1+self.D:]
+            self.frequencies = x[1:1+self.input_dim]
+            self.bandwidths = x[1+self.input_dim:]
        else:
            self.variance, self.frequencies, self.bandwidths = x

    def _get_param_names(self):
-        if self.Nparam == 3:
+        if self.num_params == 3:
            return ['variance','frequency','bandwidth']
        else:
-            return ['variance']+['frequency_%i'%i for i in range(self.D)]+['bandwidth_%i'%i for i in range(self.D)]
+            return ['variance']+['frequency_%i'%i for i in range(self.input_dim)]+['bandwidth_%i'%i for i in range(self.input_dim)]

    def K(self,X,X2,target):
        self._K_computations(X,X2)
@ -76,9 +76,9 @@ class rbfcos(kernpart):
        self._K_computations(X,X2)
        target[0] += np.sum(dL_dK*self._dvar)
        if self.ARD:
-            for q in xrange(self.D):
+            for q in xrange(self.input_dim):
                target[q+1] += -2.*np.pi*self.variance*np.sum(dL_dK*self._dvar*np.tan(2.*np.pi*self._dist[:,:,q]*self.frequencies[q])*self._dist[:,:,q])
-                target[q+1+self.D] += -2.*np.pi**2*self.variance*np.sum(dL_dK*self._dvar*self._dist2[:,:,q])
+                target[q+1+self.input_dim] += -2.*np.pi**2*self.variance*np.sum(dL_dK*self._dvar*self._dist2[:,:,q])
        else:
            target[1] += -2.*np.pi*self.variance*np.sum(dL_dK*self._dvar*np.sum(np.tan(2.*np.pi*self._dist*self.frequencies)*self._dist,-1))
            target[2] += -2.*np.pi**2*self.variance*np.sum(dL_dK*self._dvar*self._dist2.sum(-1))
@ -100,13 +100,13 @@ class rbfcos(kernpart):
            self._X = X.copy()
            self._X2 = X2.copy()

-            #do the distances: this will be high memory for large D
+            #do the distances: this will be high memory for large input_dim
            #NB: we don't take the abs of the dist because cos is symmetric
            self._dist = X[:,None,:] - X2[None,:,:]
            self._dist2 = np.square(self._dist)

            #ensure the next section is computed:
-            self._params = np.empty(self.Nparam)
+            self._params = np.empty(self.num_params)

        if not np.all(self._params == self._get_params()):
            self._params == self._get_params().copy()
--- a/GPy/kern/spline.py
+++ b/GPy/kern/spline.py
@ -2,28 +2,28 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np
 import hashlib
 def theta(x):
    """Heaviside step function"""
    return np.where(x>=0.,1.,0.)

-class spline(kernpart):
+class spline(Kernpart):
    """
    Spline kernel

-    :param D: the number of input dimensions (fixed to 1 right now TODO)
-    :type D: int
+    :param input_dim: the number of input dimensions (fixed to 1 right now TODO)
+    :type input_dim: int
    :param variance: the variance of the kernel
    :type variance: float

    """

-    def __init__(self,D,variance=1.,lengthscale=1.):
-        self.D = D
-        assert self.D==1
-        self.Nparam = 1
+    def __init__(self,input_dim,variance=1.,lengthscale=1.):
+        self.input_dim = input_dim
+        assert self.input_dim==1
+        self.num_params = 1
        self.name = 'spline'
        self._set_params(np.squeeze(variance))

--- a/GPy/kern/symmetric.py
+++ b/GPy/kern/symmetric.py
@ -1,27 +1,27 @@
 # Copyright (c) 2012 James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np

-class symmetric(kernpart):
+class symmetric(Kernpart):
    """
    Symmetrical kernels

    :param k: the kernel to symmetrify
-    :type k: kernpart
+    :type k: Kernpart
    :param transform: the transform to use in symmetrification (allows symmetry on specified axes)
-    :type transform: A numpy array (D x D) specifiying the transform
-    :rtype: kernpart
+    :type transform: A numpy array (input_dim x input_dim) specifying the transform
+    :rtype: Kernpart

    """
    def __init__(self,k,transform=None):
        if transform is None:
-            transform = np.eye(k.D)*-1.
-        assert transform.shape == (k.D, k.D)
+            transform = np.eye(k.input_dim)*-1.
+        assert transform.shape == (k.input_dim, k.input_dim)
        self.transform = transform
-        self.D = k.D
-        self.Nparam = k.Nparam
+        self.input_dim = k.input_dim
+        self.num_params = k.num_params
        self.name = k.name + '_symm'
        self.k = k
        self._set_params(k._get_params())
--- a/GPy/kern/sympykern.py
+++ b/GPy/kern/sympykern.py
@ -9,9 +9,9 @@ import sys
 current_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
 import tempfile
 import pdb
-from kernpart import kernpart
+from kernpart import Kernpart

-class spkern(kernpart):
+class spkern(Kernpart):
    """
    A kernel object, where all the hard work in done by sympy.

@ -26,7 +26,7 @@ class spkern(kernpart):
     - to handle multiple inputs, call them x1, z1, etc
     - to handle multpile correlated outputs, you'll need to define each covariance function and 'cross' variance function. TODO
    """
-    def __init__(self,D,k,param=None):
+    def __init__(self,input_dim,k,param=None):
        self.name='sympykern'
        self._sp_k = k
        sp_vars = [e for e in k.atoms() if e.is_Symbol]
@ -35,15 +35,15 @@ class spkern(kernpart):
        assert all([x.name=='x%i'%i for i,x in enumerate(self._sp_x)])
        assert all([z.name=='z%i'%i for i,z in enumerate(self._sp_z)])
        assert len(self._sp_x)==len(self._sp_z)
-        self.D = len(self._sp_x)
-        assert self.D == D
+        self.input_dim = len(self._sp_x)
+        assert self.input_dim == input_dim
        self._sp_theta = sorted([e for e in sp_vars if not (e.name[0]=='x' or e.name[0]=='z')],key=lambda e:e.name)
-        self.Nparam = len(self._sp_theta)
+        self.num_params = len(self._sp_theta)

        #deal with param
        if param is None:
-            param = np.ones(self.Nparam)
-        assert param.size==self.Nparam
+            param = np.ones(self.num_params)
+        assert param.size==self.num_params
        self._set_params(param)

        #Differentiate!
@ -69,15 +69,15 @@ class spkern(kernpart):

    def compute_psi_stats(self):
        #define some normal distributions
-        mus = [sp.var('mu%i'%i,real=True) for i in range(self.D)]
-        Ss = [sp.var('S%i'%i,positive=True) for i in range(self.D)]
+        mus = [sp.var('mu%i'%i,real=True) for i in range(self.input_dim)]
+        Ss = [sp.var('S%i'%i,positive=True) for i in range(self.input_dim)]
        normals = [(2*sp.pi*Si)**(-0.5)*sp.exp(-0.5*(xi-mui)**2/Si) for xi, mui, Si in zip(self._sp_x, mus, Ss)]

        #do some integration!
        #self._sp_psi0 = ??
        self._sp_psi1 = self._sp_k
-        for i in range(self.D):
-            print 'perfoming integrals %i of %i'%(i+1,2*self.D)
+        for i in range(self.input_dim):
+            print 'perfoming integrals %i of %i'%(i+1,2*self.input_dim)
            sys.stdout.flush()
            self._sp_psi1 *= normals[i]
            self._sp_psi1 = sp.integrate(self._sp_psi1,(self._sp_x[i],-sp.oo,sp.oo))
@ -85,10 +85,10 @@ class spkern(kernpart):
        self._sp_psi1 = self._sp_psi1.simplify()

        #and here's psi2 (eek!)
-        zprime = [sp.Symbol('zp%i'%i) for i in range(self.D)]
+        zprime = [sp.Symbol('zp%i'%i) for i in range(self.input_dim)]
        self._sp_psi2 = self._sp_k.copy()*self._sp_k.copy().subs(zip(self._sp_z,zprime))
-        for i in range(self.D):
-            print 'perfoming integrals %i of %i'%(self.D+i+1,2*self.D)
+        for i in range(self.input_dim):
+            print 'perfoming integrals %i of %i'%(self.input_dim+i+1,2*self.input_dim)
            sys.stdout.flush()
            self._sp_psi2 *= normals[i]
            self._sp_psi2 = sp.integrate(self._sp_psi2,(self._sp_x[i],-sp.oo,sp.oo))
@ -113,21 +113,21 @@ class spkern(kernpart):
        self._function_code = re.sub('DiracDelta\(.+?,.+?\)','0.0',self._function_code)

        #Here's some code to do the looping for K
-        arglist = ", ".join(["X[i*D+%s]"%x.name[1:] for x in self._sp_x]\
-                + ["Z[j*D+%s]"%z.name[1:] for z in self._sp_z]\
-                + ["param[%i]"%i for i in range(self.Nparam)])
+        arglist = ", ".join(["X[i*input_dim+%s]"%x.name[1:] for x in self._sp_x]\
+                + ["Z[j*input_dim+%s]"%z.name[1:] for z in self._sp_z]\
+                + ["param[%i]"%i for i in range(self.num_params)])

        self._K_code =\
        """
        int i;
        int j;
        int N = target_array->dimensions[0];
-        int M = target_array->dimensions[1];
-        int D = X_array->dimensions[1];
+        int num_inducing = target_array->dimensions[1];
+        int input_dim = X_array->dimensions[1];
        //#pragma omp parallel for private(j)
        for (i=0;i<N;i++){
-            for (j=0;j<M;j++){
-                target[i*M+j] = k(%s);
+            for (j=0;j<num_inducing;j++){
+                target[i*num_inducing+j] = k(%s);
            }
        }
        %s
@ -140,7 +140,7 @@ class spkern(kernpart):
        """
        int i;
        int N = target_array->dimensions[0];
-        int D = X_array->dimensions[1];
+        int input_dim = X_array->dimensions[1];
        //#pragma omp parallel for
        for (i=0;i<N;i++){
                target[i] = k(%s);
@ -149,17 +149,17 @@ class spkern(kernpart):
        """%(diag_arglist,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed

        #here's some code to compute gradients
-        funclist = '\n'.join([' '*16 + 'target[%i] += partial[i*M+j]*dk_d%s(%s);'%(i,theta.name,arglist) for i,theta in  enumerate(self._sp_theta)])
+        funclist = '\n'.join([' '*16 + 'target[%i] += partial[i*num_inducing+j]*dk_d%s(%s);'%(i,theta.name,arglist) for i,theta in  enumerate(self._sp_theta)])
        self._dK_dtheta_code =\
        """
        int i;
        int j;
        int N = partial_array->dimensions[0];
-        int M = partial_array->dimensions[1];
-        int D = X_array->dimensions[1];
+        int num_inducing = partial_array->dimensions[1];
+        int input_dim = X_array->dimensions[1];
        //#pragma omp parallel for private(j)
        for (i=0;i<N;i++){
-            for (j=0;j<M;j++){
+            for (j=0;j<num_inducing;j++){
 %s
            }
        }
@ -169,12 +169,12 @@ class spkern(kernpart):
        #here's some code to compute gradients for Kdiag TODO: thius is yucky.
        diag_funclist = re.sub('Z','X',funclist,count=0)
        diag_funclist = re.sub('j','i',diag_funclist)
-        diag_funclist = re.sub('partial\[i\*M\+i\]','partial[i]',diag_funclist)
+        diag_funclist = re.sub('partial\[i\*num_inducing\+i\]','partial[i]',diag_funclist)
        self._dKdiag_dtheta_code =\
        """
        int i;
        int N = partial_array->dimensions[0];
-        int D = X_array->dimensions[1];
+        int input_dim = X_array->dimensions[1];
        for (i=0;i<N;i++){
                %s
        }
@ -182,20 +182,20 @@ class spkern(kernpart):
        """%(diag_funclist,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed

        #Here's some code to do gradients wrt x
-        gradient_funcs = "\n".join(["target[i*D+%i] += partial[i*M+j]*dk_dx%i(%s);"%(q,q,arglist) for q in range(self.D)])
+        gradient_funcs = "\n".join(["target[i*input_dim+%i] += partial[i*num_inducing+j]*dk_dx%i(%s);"%(q,q,arglist) for q in range(self.input_dim)])
        self._dK_dX_code = \
        """
        int i;
        int j;
        int N = partial_array->dimensions[0];
-        int M = partial_array->dimensions[1];
-        int D = X_array->dimensions[1];
+        int num_inducing = partial_array->dimensions[1];
+        int input_dim = X_array->dimensions[1];
        //#pragma omp parallel for private(j)
        for (i=0;i<N; i++){
-            for (j=0; j<M; j++){
+            for (j=0; j<num_inducing; j++){
                %s
-                //if(isnan(target[i*D+2])){printf("%%f\\n",dk_dx2(X[i*D+0], X[i*D+1], X[i*D+2], Z[j*D+0], Z[j*D+1], Z[j*D+2], param[0], param[1], param[2], param[3], param[4], param[5]));}
-                //if(isnan(target[i*D+2])){printf("%%f,%%f,%%i,%%i\\n", X[i*D+2], Z[j*D+2],i,j);}
+                //if(isnan(target[i*input_dim+2])){printf("%%f\\n",dk_dx2(X[i*input_dim+0], X[i*input_dim+1], X[i*input_dim+2], Z[j*input_dim+0], Z[j*input_dim+1], Z[j*input_dim+2], param[0], param[1], param[2], param[3], param[4], param[5]));}
+                //if(isnan(target[i*input_dim+2])){printf("%%f,%%f,%%i,%%i\\n", X[i*input_dim+2], Z[j*input_dim+2],i,j);}

            }
        }
@ -208,8 +208,8 @@ class spkern(kernpart):
        int i;
        int j;
        int N = partial_array->dimensions[0];
-        int M = 0;
-        int D = X_array->dimensions[1];
+        int num_inducing = 0;
+        int input_dim = X_array->dimensions[1];
        for (i=0;i<N; i++){
            j = i;
            %s
--- a/GPy/kern/white.py
+++ b/GPy/kern/white.py
@ -2,20 +2,20 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from kernpart import kernpart
+from kernpart import Kernpart
 import numpy as np
-class white(kernpart):
+class white(Kernpart):
    """
    White noise kernel.

-    :param D: the number of input dimensions
-    :type D: int
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
    :param variance:
    :type variance: float
    """
-    def __init__(self,D,variance=1.):
-        self.D = D
-        self.Nparam = 1
+    def __init__(self,input_dim,variance=1.):
+        self.input_dim = input_dim
+        self.num_params = 1
        self.name = 'white'
        self._set_params(np.array([variance]).flatten())
        self._psi1 = 0 # TODO: more elegance here
--- a/GPy/likelihoods/init.py
+++ b/GPy/likelihoods/init.py
@ -1,4 +1,4 @@
-from EP import EP
-from Gaussian import Gaussian
+from ep import EP
+from gaussian import Gaussian
 # TODO: from Laplace import Laplace
 import likelihood_functions as functions
--- a/GPy/likelihoods/ep.py
+++ b/GPy/likelihoods/ep.py
@ -4,23 +4,23 @@ from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot
 from likelihood import likelihood

 class EP(likelihood):
-    def __init__(self,data,likelihood_function,epsilon=1e-3,power_ep=[1.,1.]):
+    def __init__(self,data,LikelihoodFunction,epsilon=1e-3,power_ep=[1.,1.]):
        """
        Expectation Propagation

        Arguments
        ---------
        epsilon : Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
-        likelihood_function : a likelihood function (see likelihood_functions.py)
+        LikelihoodFunction : a likelihood function (see likelihood_functions.py)
        """
-        self.likelihood_function = likelihood_function
+        self.LikelihoodFunction = LikelihoodFunction
        self.epsilon = epsilon
        self.eta, self.delta = power_ep
        self.data = data
-        self.N, self.D = self.data.shape
+        self.N, self.output_dim = self.data.shape
        self.is_heteroscedastic = True
        self.Nparams = 0
-        self._transf_data = self.likelihood_function._preprocess_values(data)
+        self._transf_data = self.LikelihoodFunction._preprocess_values(data)

        #Initial values - Likelihood approximation parameters:
        #p(y|f) = t(f|tau_tilde,v_tilde)
@ -48,7 +48,7 @@ class EP(likelihood):
    def predictive_values(self,mu,var,full_cov):
        if full_cov:
            raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
-        return self.likelihood_function.predictive_values(mu,var)
+        return self.LikelihoodFunction.predictive_values(mu,var)

    def _get_params(self):
        return np.zeros(0)
@ -110,7 +110,7 @@ class EP(likelihood):
                self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i]
                self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i]
                #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
                #Site parameters update
                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
@ -139,7 +139,7 @@ class EP(likelihood):
        The expectation-propagation algorithm with sparse pseudo-input.
        For nomenclature see ... 2013.
        """
-        M = Kmm.shape[0]
+        num_inducing = Kmm.shape[0]

        #TODO: this doesn't work with uncertain inputs!

@ -200,7 +200,7 @@ class EP(likelihood):
                self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
                self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
                #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
                #Site parameters update
                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
@ -235,7 +235,7 @@ class EP(likelihood):
        The expectation-propagation algorithm with sparse pseudo-input.
        For nomenclature see Naish-Guzman and Holden, 2008.
        """
-        M = Kmm.shape[0]
+        num_inducing = Kmm.shape[0]

        """
        Prior approximation parameters:
@ -258,7 +258,7 @@ class EP(likelihood):
        mu = w + P*Gamma
        """
        self.w = np.zeros(self.N)
-        self.Gamma = np.zeros(M)
+        self.Gamma = np.zeros(num_inducing)
        mu = np.zeros(self.N)
        P = P0.copy()
        R = R0.copy()
@ -295,7 +295,7 @@ class EP(likelihood):
                self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
                self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
                #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
                #Site parameters update
                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
@ -305,10 +305,10 @@ class EP(likelihood):
                dtd1 = Delta_tau*Diag[i] + 1.
                dii = Diag[i]
                Diag[i] = dii - (Delta_tau * dii**2.)/dtd1
-                pi_ = P[i,:].reshape(1,M)
+                pi_ = P[i,:].reshape(1,num_inducing)
                P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_
                Rp_i = np.dot(R,pi_.T)
-                RTR = np.dot(R.T,np.dot(np.eye(M) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R))
+                RTR = np.dot(R.T,np.dot(np.eye(num_inducing) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R))
                R = jitchol(RTR).T
                self.w[i] += (Delta_v - Delta_tau*self.w[i])*dii/dtd1
                self.Gamma += (Delta_v - Delta_tau*mu[i])*np.dot(RTR,P[i,:].T)
@ -321,7 +321,7 @@ class EP(likelihood):
            Diag = Diag0 * Iplus_Dprod_i
            P = Iplus_Dprod_i[:,None] * P0
            safe_diag = np.where(Diag0 < self.tau_tilde, self.tau_tilde/(1.+Diag0*self.tau_tilde), (1. - Iplus_Dprod_i)/Diag0)
-            L = jitchol(np.eye(M) + np.dot(RPT0,safe_diag[:,None]*RPT0.T))
+            L = jitchol(np.eye(num_inducing) + np.dot(RPT0,safe_diag[:,None]*RPT0.T))
            R,info = linalg.lapack.flapack.dtrtrs(L,R0,lower=1)
            RPT = np.dot(R,P.T)
            Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@ -15,7 +15,7 @@ class Gaussian(likelihood):
        self.is_heteroscedastic = False
        self.Nparams = 1
        self.Z = 0. # a correction factor which accounts for the approximation made
-        N, self.D = data.shape
+        N, self.output_dim = data.shape

        # normalization
        if normalize:
@ -24,8 +24,8 @@ class Gaussian(likelihood):
            # Don't scale outputs which have zero variance to zero.
            self._scale[np.nonzero(self._scale == 0.)] = 1.0e-3
        else:
-            self._offset = np.zeros((1, self.D))
-            self._scale = np.ones((1, self.D))
+            self._offset = np.zeros((1, self.output_dim))
+            self._scale = np.ones((1, self.output_dim))

        self.set_data(data)

@ -35,7 +35,7 @@ class Gaussian(likelihood):
    def set_data(self, data):
        self.data = data
        self.N, D = data.shape
-        assert D == self.D
+        assert D == self.output_dim
        self.Y = (self.data - self._offset) / self._scale
        if D > self.N:
            self.YYT = np.dot(self.Y, self.Y.T)
@ -52,9 +52,9 @@ class Gaussian(likelihood):

    def _set_params(self, x):
        x = np.float64(x)
-        if self._variance != x:
+        if np.all(self._variance != x):
            if x == 0.:
-                self.precision = None
+                self.precision = np.inf
                self.V = None
            else:
                self.precision = 1. / x
@ -68,9 +68,9 @@ class Gaussian(likelihood):
        """
        mean = mu * self._scale + self._offset
        if full_cov:
-            if self.D > 1:
+            if self.output_dim > 1:
                raise NotImplementedError, "TODO"
-                # Note. for D>1, we need to re-normalise all the outputs independently.
+                # Note. for output_dim>1, we need to re-normalise all the outputs independently.
                # This will mess up computations of diag(true_var), below.
                # note that the upper, lower quantiles should be the same shape as mean
            # Augment the output variance with the likelihood variance and rescale.
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@ -10,12 +10,12 @@ from ..util.plot import gpplot
 from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
 import link_functions

-class likelihood_function(object):
+class LikelihoodFunction(object):
    """
    Likelihood class for doing Expectation propagation

    :param Y: observed output (Nx1 numpy.darray)
-    ..Note:: Y values allowed depend on the likelihood_function used
+    ..Note:: Y values allowed depend on the LikelihoodFunction used
    """
    def __init__(self,link):
        if link == self._analytical:
@ -69,7 +69,7 @@ class likelihood_function(object):
        sigma2_hat = m2 - mu_hat**2 # Second central moment
        return float(Z_hat), float(mu_hat), float(sigma2_hat)

-class binomial(likelihood_function):
+class Binomial(LikelihoodFunction):
    """
    Probit likelihood
    Y is expected to take values in {-1,1}
@ -82,7 +82,7 @@ class binomial(likelihood_function):
        self._analytical = link_functions.probit
        if not link:
            link = self._analytical
-        super(binomial, self).__init__(link)
+        super(Binomial, self).__init__(link)

    def _distribution(self,gp,obs):
        pass
@ -134,7 +134,7 @@ class binomial(likelihood_function):
        p_975 = stats.norm.cdf(norm_975/np.sqrt(1+var))
        return mean[:,None], np.nan*var, p_025[:,None], p_975[:,None] # TODO: var

-class Poisson(likelihood_function):
+class Poisson(LikelihoodFunction):
    """
    Poisson likelihood
    Y is expected to take values in {0,1,2,...}
--- a/GPy/models/init.py
+++ b/GPy/models/init.py
@ -1,15 +1,12 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-
-from GP_regression import GP_regression
-from GP_classification import GP_classification
-from sparse_GP_regression import sparse_GP_regression
-from sparse_GP_classification import sparse_GP_classification
-from GPLVM import GPLVM
-from warped_GP import warpedGP
-from sparse_GPLVM import sparse_GPLVM
-from Bayesian_GPLVM import Bayesian_GPLVM
+from gp_regression import GPRegression
+from gp_classification import GPClassification
+from sparse_gp_regression import SparseGPRegression
+from sparse_gp_classification import SparseGPClassification
+from fitc_classification import FITCClassification
+from gplvm import GPLVM
+from warped_gp import WarpedGP
+from bayesian_gplvm import BayesianGPLVM
 from mrd import MRD
-from generalized_FITC import generalized_FITC
-from FITC import FITC
--- a/GPy/models/bayesian_gplvm.py
+++ b/GPy/models/bayesian_gplvm.py
@ -2,21 +2,16 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
-import pylab as pb
-import sys, pdb
-from GPLVM import GPLVM
-from ..core import sparse_GP
-from GPy.util.linalg import pdinv
+from ..core import SparseGP
 from ..likelihoods import Gaussian
 from .. import kern
-from numpy.linalg.linalg import LinAlgError
 import itertools
 from matplotlib.colors import colorConverter
-from matplotlib.figure import SubplotParams
 from GPy.inference.optimization import SCG
 from GPy.util import plot_latent
+from GPy.models.gplvm import GPLVM

-class Bayesian_GPLVM(sparse_GP, GPLVM):
+class BayesianGPLVM(SparseGP, GPLVM):
    """
    Bayesian Gaussian Process Latent Variable Model

@ -28,7 +23,7 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
    :type init: 'PCA'|'random'

    """
-    def __init__(self, likelihood_or_Y, input_dim, X=None, X_variance=None, init='PCA', M=10,
+    def __init__(self, likelihood_or_Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
                 Z=None, kernel=None, oldpsave=10, _debug=False,
                 **kwargs):
        if type(likelihood_or_Y) is np.ndarray:
@ -44,7 +39,7 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
            X_variance = np.clip((np.ones_like(X) * 0.5) + .01 * np.random.randn(*X.shape), 0.001, 1)

        if Z is None:
-            Z = np.random.permutation(X.copy())[:M]
+            Z = np.random.permutation(X.copy())[:num_inducing]
        assert Z.shape[1] == X.shape[1]

        if kernel is None:
@ -64,7 +59,7 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
            self._savedpsiKmm = []
            self._savedABCD = []

-        sparse_GP.__init__(self, X, likelihood, kernel, Z=Z, X_variance=X_variance, **kwargs)
+        SparseGP.__init__(self, X, likelihood, kernel, Z=Z, X_variance=X_variance, **kwargs)
        self._set_params(self._get_params())

    @property
@ -78,21 +73,21 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
        self._oldps.insert(0, p.copy())

    def _get_param_names(self):
-        X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.N)], [])
-        S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.N)], [])
-        return (X_names + S_names + sparse_GP._get_param_names(self))
+        X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
+        S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
+        return (X_names + S_names + SparseGP._get_param_names(self))

    def _get_params(self):
        """
        Horizontally stacks the parameters in order to present them to the optimizer.
-        The resulting 1-D array has this structure:
+        The resulting 1-D array has this structure:

        ===============================================================
        |       mu       |        S        |    Z    | theta |  beta  |
        ===============================================================

        """
-        x = np.hstack((self.X.flatten(), self.X_variance.flatten(), sparse_GP._get_params(self)))
+        x = np.hstack((self.X.flatten(), self.X_variance.flatten(), SparseGP._get_params(self)))
        return x

    def _clipped(self, x):
@ -101,10 +96,10 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
    def _set_params(self, x, save_old=True, save_count=0):
 #         try:
            x = self._clipped(x)
-            N, input_dim = self.N, self.input_dim
+            N, input_dim = self.num_data, self.input_dim
            self.X = x[:self.X.size].reshape(N, input_dim).copy()
            self.X_variance = x[(N * input_dim):(2 * N * input_dim)].reshape(N, input_dim).copy()
-            sparse_GP._set_params(self, x[(2 * N * input_dim):])
+            SparseGP._set_params(self, x[(2 * N * input_dim):])
 #             self.oldps = x
 #         except (LinAlgError, FloatingPointError, ZeroDivisionError):
 #             print "\rWARNING: Caught LinAlgError, continueing without setting            "
@ -131,10 +126,10 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
    def KL_divergence(self):
        var_mean = np.square(self.X).sum()
        var_S = np.sum(self.X_variance - np.log(self.X_variance))
-        return 0.5 * (var_mean + var_S) - 0.5 * self.input_dim * self.N
+        return 0.5 * (var_mean + var_S) - 0.5 * self.input_dim * self.num_data

    def log_likelihood(self):
-        ll = sparse_GP.log_likelihood(self)
+        ll = SparseGP.log_likelihood(self)
        kl = self.KL_divergence()

 #         if ll < -2E4:
@ -151,14 +146,14 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
                self._savedpsiKmm.append([self.f_call, [self.Kmm, self.dL_dKmm]])
 #                 sf2 = self.scale_factor ** 2
                if self.likelihood.is_heteroscedastic:
-                    A = -0.5 * self.N * self.D * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.V * self.likelihood.Y)
-#                     B = -0.5 * self.D * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self.A) * sf2)
-                    B = -0.5 * self.D * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self.A))
+                    A = -0.5 * self.num_data * self.input_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.V * self.likelihood.Y)
+#                     B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self.A) * sf2)
+                    B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self.A))
                else:
-                    A = -0.5 * self.N * self.D * (np.log(2.*np.pi) + np.log(self.likelihood._variance)) - 0.5 * self.likelihood.precision * self.likelihood.trYYT
-#                     B = -0.5 * self.D * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self.A) * sf2)
-                    B = -0.5 * self.D * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self.A))
-                C = -self.D * (np.sum(np.log(np.diag(self.LB)))) # + 0.5 * self.M * np.log(sf2))
+                    A = -0.5 * self.num_data * self.input_dim * (np.log(2.*np.pi) + np.log(self.likelihood._variance)) - 0.5 * self.likelihood.precision * self.likelihood.trYYT
+#                     B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self.A) * sf2)
+                    B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self.A))
+                C = -self.input_dim * (np.sum(np.log(np.diag(self.LB)))) # + 0.5 * self.num_inducing * np.log(sf2))
                D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V))
                self._savedABCD.append([self.f_call, A, B, C, D])

@ -181,7 +176,7 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
 #         d_dS = (dL_dS).flatten()
        # ========================
        self.dbound_dmuS = np.hstack((d_dmu, d_dS))
-        self.dbound_dZtheta = sparse_GP._log_likelihood_gradients(self)
+        self.dbound_dZtheta = SparseGP._log_likelihood_gradients(self)
        return self._clipped(np.hstack((self.dbound_dmuS.flatten(), self.dbound_dZtheta)))

    def plot_latent(self, *args, **kwargs):
@ -200,7 +195,7 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
        means = np.zeros((N_test, input_dim))
        covars = np.zeros((N_test, input_dim))

-        dpsi0 = -0.5 * self.D * self.likelihood.precision
+        dpsi0 = -0.5 * self.input_dim * self.likelihood.precision
        dpsi2 = self.dL_dpsi2[0][None, :, :] # TODO: this may change if we ignore het. likelihoods
        V = self.likelihood.precision * Y
        dpsi1 = np.dot(self.Cpsi1V, V.T)
@ -263,7 +258,7 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):

    def __getstate__(self):
        return (self.likelihood, self.input_dim, self.X, self.X_variance,
-                self.init, self.M, self.Z, self.kern,
+                self.init, self.num_inducing, self.Z, self.kern,
                self.oldpsave, self._debug)

    def __setstate__(self, state):
@ -271,11 +266,11 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):

    def _debug_filter_params(self, x):
        start, end = 0, self.X.size,
-        X = x[start:end].reshape(self.N, self.input_dim)
+        X = x[start:end].reshape(self.num_data, self.input_dim)
        start, end = end, end + self.X_variance.size
-        X_v = x[start:end].reshape(self.N, self.input_dim)
-        start, end = end, end + (self.M * self.input_dim)
-        Z = x[start:end].reshape(self.M, self.input_dim)
+        X_v = x[start:end].reshape(self.num_data, self.input_dim)
+        start, end = end, end + (self.num_inducing * self.input_dim)
+        Z = x[start:end].reshape(self.num_inducing, self.input_dim)
        start, end = end, end + self.input_dim
        theta = x[start:]
        return X, X_v, Z, theta
@ -353,12 +348,12 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
        figs.append(pylab.figure("BGPLVM DEBUG Kmm", figsize=(12, 6)))
        fig = figs[-1]
        ax8 = fig.add_subplot(121)
-        ax8.text(.5, .5, r"${\mathbf{A,B,C,D}}$", color='k', alpha=.5, transform=ax8.transAxes,
+        ax8.text(.5, .5, r"${\mathbf{A,B,C,input_dim}}$", color='k', alpha=.5, transform=ax8.transAxes,
                 ha='center', va='center')
        ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 1], label='A')
        ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 2], label='B')
        ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 3], label='C')
-        ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 4], label='D')
+        ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 4], label='input_dim')
        ax8.legend()
        figs[-1].canvas.draw()
        figs[-1].tight_layout(rect=(.15, 0, 1, .86))
--- a/GPy/models/fitc.py
+++ b/GPy/models/fitc.py
@ -0,0 +1,252 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+import pylab as pb
+from ..util.linalg import mdot, jitchol, chol_inv, tdot, symmetrify, pdinv
+from ..util.plot import gpplot
+from .. import kern
+from scipy import stats, linalg
+from GPy.core.sparse_gp import SparseGP
+
+def backsub_both_sides(L, X):
+    """
+    Return L^-T * X * L^-1 using two triangular back-substitutions.
+
+    :param L: lower Cholesky factor
+    :param X: matrix to transform; the original note assumes it is symmetric
+    :returns: the transformed matrix
+    """
+    solve_triangular = linalg.lapack.flapack.dtrtrs
+    # first pass: trans=1 solves against the transpose of L
+    half = solve_triangular(L, np.asfortranarray(X), lower=1, trans=1)[0]
+    # second pass on the transposed intermediate, then transpose back
+    full = solve_triangular(L, np.asfortranarray(half.T), lower=1, trans=1)[0]
+    return full.T
+
+class FITC(SparseGP):
+
+    def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False):
+        """
+        Set up the FITC model on top of the SparseGP machinery.
+
+        :param X: inputs
+        :param likelihood: a likelihood instance, containing the observed data
+        :param kernel: the kernel (covariance function)
+        :param Z: inducing inputs
+        :param X_variance: input variances, or None (default)
+        :param normalize_X: forwarded unchanged to SparseGP
+        """
+        # Z and X_variance were accepted but never handed to the parent
+        # constructor; forward them by keyword, the way BayesianGPLVM calls
+        # SparseGP.__init__.
+        super(FITC, self).__init__(X, likelihood, kernel, Z=Z, X_variance=X_variance, normalize_X=normalize_X)
+
+    def update_likelihood_approximation(self):
+        """
+        Refresh the likelihood approximation, then update the GP.
+
+        The likelihood's ``fit_FITC`` routine is run on the current Kmm, psi1
+        and psi0, after which the parameter vector is read back and re-set.
+
+        Original notes: for a Gaussian (or direct: TODO) likelihood no
+        iteration is required. Diag(Knn - Qnn) is added to the noise term to
+        reuse the tools already implemented in SparseGP, and the true precision
+        is 'true_precision', not 'precision'.
+
+        :raises NotImplementedError: when the model has uncertain inputs
+        """
+        if self.has_uncertain_inputs:
+            raise NotImplementedError("FITC approximation not implemented for uncertain inputs")
+        self.likelihood.fit_FITC(self.Kmm, self.psi1, self.psi0)
+        # round-trip the parameter vector to update the GP
+        current = self._get_params()
+        self._set_params(current)
+
+    def _computations(self):
+        """
+        Recompute the FITC intermediate matrices and gradient terms.
+
+        Reads self.Kmm, self.psi0, self.psi1 and the likelihood, and stores:
+        the factors Lm, Lmi, LB, LBi; Qnn, Diag0, beta_star, V_star, A, B,
+        psi1V and _LBi_Lmi_psi1V; the partial derivatives _dL_dpsi0,
+        _dL_dpsi1 and _dL_dKmm; the per-datum accumulators _dpsi1_dtheta,
+        _dpsi1_dX, _dKmm_dtheta and _dKmm_dX; and partial_for_likelihood.
+
+        :raises NotImplementedError: for uncertain inputs, or for a
+            heteroscedastic likelihood that has parameters (Nparams != 0)
+        """
+        # factor Kmm
+        self.Lm = jitchol(self.Kmm)
+        self.Lmi, _ = linalg.lapack.flapack.dtrtrs(self.Lm, np.eye(self.num_inducing), lower=1)
+        Lmipsi1 = np.dot(self.Lmi, self.psi1)
+        self.Qnn = np.dot(Lmipsi1.T, Lmipsi1).copy()
+        self.Diag0 = self.psi0 - np.diag(self.Qnn)
+        self.beta_star = self.likelihood.precision / (1. + self.likelihood.precision * self.Diag0[:, None]) # Includes Diag0 in the precision
+        self.V_star = self.beta_star * self.likelihood.Y
+
+        # The rather complex computations of self.A
+        if self.has_uncertain_inputs:
+            raise NotImplementedError
+        if self.likelihood.is_heteroscedastic:
+            # The EP and Gaussian likelihoods store the number of data columns
+            # as output_dim; neither sets an input_dim attribute.
+            assert self.likelihood.output_dim == 1
+        tmp = self.psi1 * (np.sqrt(self.beta_star.flatten().reshape(1, self.num_data)))
+        tmp, _ = linalg.lapack.flapack.dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1)
+        self.A = tdot(tmp)
+
+        # factor B
+        self.B = np.eye(self.num_inducing) + self.A
+        self.LB = jitchol(self.B)
+        self.LBi = chol_inv(self.LB)
+        self.psi1V = np.dot(self.psi1, self.V_star)
+
+        Lmi_psi1V, _ = linalg.lapack.flapack.dtrtrs(self.Lm, np.asfortranarray(self.psi1V), lower=1, trans=0)
+        self._LBi_Lmi_psi1V, _ = linalg.lapack.flapack.dtrtrs(self.LB, np.asfortranarray(Lmi_psi1V), lower=1, trans=0)
+
+        Kmmipsi1 = np.dot(self.Lmi.T, Lmipsi1)
+        b_psi1_Ki = self.beta_star * Kmmipsi1.T
+        Kmmi = np.dot(self.Lmi.T, self.Lmi)
+        LBiLmi = np.dot(self.LBi, self.Lmi)
+        LBL_inv = np.dot(LBiLmi.T, LBiLmi)
+        VVT = np.outer(self.V_star, self.V_star)
+        psi1beta = self.psi1 * self.beta_star.T
+        H = self.Kmm + mdot(self.psi1, psi1beta.T)
+        LH = jitchol(H)
+        LHi = chol_inv(LH)
+        Hi = np.dot(LHi.T, LHi)
+
+        betapsi1TLmiLBi = np.dot(psi1beta.T, LBiLmi.T)
+        alpha = np.array([np.dot(a.T, a) for a in betapsi1TLmiLBi])[:, None]
+        gamma_1 = mdot(VVT, self.psi1.T, Hi)
+        pHip = mdot(self.psi1.T, Hi, self.psi1)
+        gamma_2 = mdot(self.beta_star * pHip, self.V_star)
+        gamma_3 = self.V_star * gamma_2
+
+        self._dL_dpsi0 = -0.5 * self.beta_star # dA_dpsi0: logdet(self.beta_star)
+        self._dL_dpsi0 += .5 * self.V_star ** 2 # dA_psi0: yT*beta_star*y
+        self._dL_dpsi0 += .5 * alpha # dC_dpsi0
+        # gamma_2 is exactly the product that was previously recomputed here
+        self._dL_dpsi0 += 0.5 * gamma_2 ** 2 - self.V_star * mdot(self.V_star.T, pHip * self.beta_star).T # dD_dpsi0
+
+        self._dL_dpsi1 = b_psi1_Ki.copy() # dA_dpsi1: logdet(self.beta_star)
+        self._dL_dpsi1 += -np.dot(psi1beta.T, LBL_inv) # dC_dpsi1
+        self._dL_dpsi1 += gamma_1 - mdot(psi1beta.T, Hi, self.psi1, gamma_1) # dD_dpsi1
+
+        self._dL_dKmm = -0.5 * np.dot(Kmmipsi1, b_psi1_Ki) # dA_dKmm: logdet(self.beta_star)
+        self._dL_dKmm += .5 * (LBL_inv - Kmmi) + mdot(LBL_inv, psi1beta, Kmmipsi1.T) # dC_dKmm
+        self._dL_dKmm += -.5 * mdot(Hi, self.psi1, gamma_1) # dD_dKmm
+
+        self._dpsi1_dtheta = 0
+        self._dpsi1_dX = 0
+        self._dKmm_dtheta = 0
+        self._dKmm_dX = 0
+
+        self._dpsi1_dX_jkj = 0
+        self._dpsi1_dtheta_jkj = 0
+
+        # accumulate the diagonal-correction gradients one datum at a time
+        for i, V_n, alpha_n, gamma_n, gamma_k in zip(range(self.num_data), self.V_star, alpha, gamma_2, gamma_3):
+            K_pp_K = np.dot(Kmmipsi1[:, i:(i + 1)], Kmmipsi1[:, i:(i + 1)].T)
+
+            # Diag_dpsi1 = Diag_dA_dpsi1: yT*beta_star*y + Diag_dC_dpsi1 +Diag_dD_dpsi1
+            _dpsi1 = (-V_n ** 2 - alpha_n + 2.*gamma_k - gamma_n ** 2) * Kmmipsi1.T[i:(i + 1), :]
+
+            # Diag_dKmm = Diag_dA_dKmm: yT*beta_star*y +Diag_dC_dKmm +Diag_dD_dKmm
+            _dKmm = .5 * (V_n ** 2 + alpha_n + gamma_n ** 2 - 2.*gamma_k) * K_pp_K # Diag_dD_dKmm
+
+            self._dpsi1_dtheta += self.kern.dK_dtheta(_dpsi1, self.X[i:i + 1, :], self.Z)
+            self._dKmm_dtheta += self.kern.dK_dtheta(_dKmm, self.Z)
+
+            self._dKmm_dX += 2.*self.kern.dK_dX(_dKmm , self.Z)
+            self._dpsi1_dX += self.kern.dK_dX(_dpsi1.T, self.Z, self.X[i:i + 1, :])
+
+        # the partial derivative vector for the likelihood
+        if self.likelihood.Nparams == 0:
+            # save computation here.
+            self.partial_for_likelihood = None
+        elif self.likelihood.is_heteroscedastic:
+            raise NotImplementedError("heteroscedatic derivates not implemented")
+        else:
+            # likelihood is not heteroscedastic
+            dbstar_dnoise = self.likelihood.precision * (self.beta_star ** 2 * self.Diag0[:, None] - self.beta_star)
+            Lmi_psi1 = mdot(self.Lmi, self.psi1)
+
+            # NOTE(review): the self.input_dim factors below look like a product
+            # of the bulk D -> input_dim rename; confirm whether the number of
+            # output columns was intended.
+            dA_dnoise = 0.5 * self.input_dim * (dbstar_dnoise / self.beta_star).sum() - 0.5 * self.input_dim * np.sum(self.likelihood.Y ** 2 * dbstar_dnoise)
+            dC_dnoise = -0.5 * np.sum(mdot(self.LBi.T, self.LBi, Lmi_psi1) * Lmi_psi1 * dbstar_dnoise.T)
+
+            # Only this pair of D-terms ever reached the result; an earlier
+            # pair that was immediately overwritten has been dropped.
+            dD_dnoise_1 = mdot(self.V_star.T, self.psi1.T, self.Lmi.T, self.LBi.T, self.LBi, self.Lmi, self.psi1, dbstar_dnoise * self.likelihood.Y)
+            dD_dnoise_2 = 0.5 * mdot(self.V_star.T, self.psi1.T, Hi, self.psi1, dbstar_dnoise * self.psi1.T, Hi, self.psi1, self.V_star)
+            dD_dnoise = dD_dnoise_1 + dD_dnoise_2
+
+            self.partial_for_likelihood = dA_dnoise + dC_dnoise + dD_dnoise
+
+    def log_likelihood(self):
+        """ Compute the (lower bound on the) log marginal likelihood """
+        # A: 2*pi normaliser, plus half the summed log of beta_star, minus half the sum of V_star * Y
+        normaliser = -0.5 * self.num_data * self.input_dim * np.log(2.*np.pi)
+        half_sum_log_beta_star = 0.5 * np.sum(np.log(self.beta_star))
+        half_sum_V_star_Y = 0.5 * np.sum(self.V_star * self.likelihood.Y)
+        term_A = normaliser + half_sum_log_beta_star - half_sum_V_star_Y
+
+        # C: minus input_dim times the summed log of the diagonal of LB
+        sum_log_diag_LB = np.sum(np.log(np.diag(self.LB)))
+        term_C = -self.input_dim * sum_log_diag_LB
+
+        # D: half the sum of squared entries of _LBi_Lmi_psi1V
+        term_D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V))
+
+        return term_A + term_C + term_D
+
+    def _log_likelihood_gradients(self):
+        """
+        Return the gradients of the log likelihood as one flat vector.
+
+        The vector is the horizontal concatenation, in this order, of:
+        the flattened result of dL_dZ(), the result of dL_dtheta(), and the
+        likelihood gradients obtained from self.partial_for_likelihood.
+        """
+        return np.hstack((self.dL_dZ().flatten(), self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))
+
+    def dL_dtheta(self):
+        """
+        Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel
+
+        The kernel is asked for the gradient contributions of _dL_dpsi0 (diagonal at X),
+        _dL_dpsi1 (between X and Z) and _dL_dKmm (at Z); the precomputed _dKmm_dtheta and
+        _dpsi1_dtheta terms are then added.
+
+        :raises NotImplementedError: if the model has uncertain inputs
+        """
+        if self.has_uncertain_inputs:
+            raise NotImplementedError("FITC approximation not implemented for uncertain inputs")
+
+        dL_dtheta = self.kern.dKdiag_dtheta(self._dL_dpsi0, self.X)
+        dL_dtheta += self.kern.dK_dtheta(self._dL_dpsi1, self.X, self.Z)
+        dL_dtheta += self.kern.dK_dtheta(self._dL_dKmm, X=self.Z)
+        # terms accumulated by the computations step and stored on the model
+        dL_dtheta += self._dKmm_dtheta
+        dL_dtheta += self._dpsi1_dtheta
+        return dL_dtheta
+
+    def dL_dZ(self):
+        """
+        Compute and return the derivative of the log marginal likelihood wrt the inducing inputs Z
+
+        The kernel is asked for the gradient contributions of _dL_dpsi1 (transposed, between Z and X)
+        and _dL_dKmm (at Z); the precomputed _dpsi1_dX and _dKmm_dX terms are then added.
+
+        :raises NotImplementedError: if the model has uncertain inputs
+        """
+        if self.has_uncertain_inputs:
+            raise NotImplementedError("FITC approximation not implemented for uncertain inputs")
+
+        dL_dZ = self.kern.dK_dX(self._dL_dpsi1.T, self.Z, self.X)
+        # NOTE(review): the factor 2 is presumably because Z enters Kmm through both kernel arguments - confirm
+        dL_dZ += 2. * self.kern.dK_dX(self._dL_dKmm, X=self.Z)
+        # terms accumulated by the computations step and stored on the model
+        dL_dZ += self._dpsi1_dX
+        dL_dZ += self._dKmm_dX
+        return dL_dZ
+
+    def _raw_predict(self, Xnew, which_parts, full_cov=False):
+        """
+        Make a prediction for the generalized FITC model
+
+        Only heteroscedastic likelihoods are handled. As a side effect the attributes
+        Diag, P, RPT0, L, R, RPT, Sigma, w, Gamma, mu and mu_H are recomputed and stored on the model.
+
+        :param Xnew: input prediction data, passed to the kernel together with self.Z
+        :param which_parts: forwarded unchanged to the kernel's K and Kdiag calls
+        :param full_cov: if True the full predictive covariance is returned, otherwise only its diagonal (as a column)
+        :type full_cov: bool
+        :returns: the tuple (mu_star[:, None], var)
+        :raises NotImplementedError: if the likelihood is not heteroscedastic
+        """
+        if not self.likelihood.is_heteroscedastic:
+            raise NotImplementedError("homoscedastic fitc not implemented")
+
+        # NOTE(review): the triangular solves below go through linalg.flapack - confirm this
+        # attribute exists in the supported SciPy versions
+        Iplus_Dprod_i = 1. / (1. + self.Diag0 * self.likelihood.precision.flatten())
+        self.Diag = self.Diag0 * Iplus_Dprod_i
+        self.P = Iplus_Dprod_i[:, None] * self.psi1.T
+        self.RPT0 = np.dot(self.Lmi, self.psi1)
+        self.L = np.linalg.cholesky(np.eye(self.num_inducing) + np.dot(self.RPT0, ((1. - Iplus_Dprod_i) / self.Diag0)[:, None] * self.RPT0.T))
+        self.R, _ = linalg.flapack.dtrtrs(self.L, self.Lmi, lower=1)
+        self.RPT = np.dot(self.R, self.P.T)
+        self.Sigma = np.diag(self.Diag) + np.dot(self.RPT.T, self.RPT)
+        self.w = self.Diag * self.likelihood.v_tilde
+        self.Gamma = np.dot(self.R.T, np.dot(self.RPT, self.likelihood.v_tilde))
+        self.mu = self.w + np.dot(self.P, self.Gamma)
+
+        # q(u|f) = N(u| R0i*mu_u*f, R0i*C*R0i.T)
+
+        # Ci = I + (RPT0)Di(RPT0).T
+        # C = I - [RPT0] * (input_dim+[RPT0].T*[RPT0])^-1*[RPT0].T
+        #   = I - [RPT0] * (input_dim + self.Qnn)^-1 * [RPT0].T
+        #   = I - [RPT0] * (U*U.T)^-1 * [RPT0].T
+        #   = I - V.T * V
+        U = np.linalg.cholesky(np.diag(self.Diag0) + self.Qnn)
+        V, _ = linalg.flapack.dtrtrs(U, self.RPT0.T, lower=1)
+        C = np.eye(self.num_inducing) - np.dot(V.T, V)
+        mu_u = np.dot(C, self.RPT0) * (1. / self.Diag0[None, :])
+
+        # q(u|y) = N(u| R0i*mu_H,R0i*Sigma_H*R0i.T)
+        mu_H = np.dot(mu_u, self.mu)
+        self.mu_H = mu_H
+        Sigma_H = C + np.dot(mu_u, np.dot(self.Sigma, mu_u.T))
+
+        # q(f_star|y) = N(f_star|mu_star,sigma2_star)
+        Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
+        KR0T = np.dot(Kx.T, self.Lmi.T)
+        mu_star = np.dot(KR0T, mu_H)
+        if full_cov:
+            Kxx = self.kern.K(Xnew, which_parts=which_parts)
+            var = Kxx + np.dot(KR0T, np.dot(Sigma_H - np.eye(self.num_inducing), KR0T.T))
+        else:
+            Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts)
+            var = (Kxx + np.sum(KR0T.T * np.dot(Sigma_H - np.eye(self.num_inducing), KR0T.T), 0))[:, None]
+        return mu_star[:, None], var
--- a/GPy/models/fitc_classification.py
+++ b/GPy/models/fitc_classification.py
@ -0,0 +1,47 @@
+# Copyright (c) 2013, Ricardo Andrade
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+from ..core import FITC
+from .. import likelihoods
+from .. import kern
+from ..likelihoods import likelihood
+
+class FITCClassification(FITC):
+    """
+    FITC approximation for classification
+
+    This is a thin wrapper around the FITC class, with a set of sensible defaults
+
+    :param X: input observations
+    :param Y: observed values
+    :param likelihood: a GPy likelihood, defaults to EP over a Binomial (with probit link_function) built from Y
+    :param kernel: a GPy kernel, defaults to rbf+white (white variance 1e-3)
+    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
+    :type normalize_X: False|True
+    :param normalize_Y:  accepted for interface compatibility; not used by this constructor
+    :type normalize_Y: False|True
+    :param Z: inducing inputs; when None, M rows of X are picked at random
+    :type Z: np.ndarray with as many columns as X | None
+    :param M: number of inducing inputs drawn from X (ignored if Z is not None)
+    :type M: int
+    :rtype: model object
+
+    """
+
+    def __init__(self, X, Y=None, likelihood=None, kernel=None, normalize_X=False, normalize_Y=False, Z=None, M=10):
+        if kernel is None:
+            kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1], 1e-3)
+
+        if likelihood is None:
+            distribution = likelihoods.likelihood_functions.Binomial()
+            likelihood = likelihoods.EP(Y, distribution)
+        elif Y is not None:
+            # array_equal also reports a size mismatch as "different" instead of
+            # failing inside the elementwise comparison
+            if not np.array_equal(Y.flatten(), likelihood.data.flatten()):
+                raise Warning('likelihood.data and Y are different.')
+
+        if Z is None:
+            # default inducing inputs: a random subset of (at most) M training inputs
+            i = np.random.permutation(X.shape[0])[:M]
+            Z = X[i].copy()
+        else:
+            assert Z.shape[1] == X.shape[1], "Z and X must have the same number of columns"
+
+        FITC.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X)
+        self._set_params(self._get_params())
--- a/GPy/models/generalized_fitc.py
+++ b/GPy/models/generalized_fitc.py
@ -2,20 +2,17 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
-import pylab as pb
-from ..util.linalg import mdot, jitchol, chol_inv, pdinv, trace_dot
-from ..util.plot import gpplot
-from .. import kern
-from scipy import stats, linalg
-from ..core import sparse_GP
+from scipy import linalg
+from GPy.core.sparse_gp import SparseGP
+from GPy.util.linalg import mdot

-def backsub_both_sides(L,X):
+def backsub_both_sides(L, X):
    """ Return L^-T * X * L^-1, assumuing X is symmetrical and L is lower cholesky"""
-    tmp,_ = linalg.lapack.flapack.dtrtrs(L,np.asfortranarray(X),lower=1,trans=1)
-    return linalg.lapack.flapack.dtrtrs(L,np.asfortranarray(tmp.T),lower=1,trans=1)[0].T
+    tmp, _ = linalg.lapack.flapack.dtrtrs(L, np.asfortranarray(X), lower=1, trans=1)
+    return linalg.lapack.flapack.dtrtrs(L, np.asfortranarray(tmp.T), lower=1, trans=1)[0].T


-class generalized_FITC(sparse_GP):
+class GeneralizedFITC(SparseGP):
    """
    Naish-Guzman, A. and Holden, S. (2008) implemantation of EP with FITC.

@ -28,25 +25,26 @@ class generalized_FITC(sparse_GP):
    :param X_variance: The variance in the measurements of X (Gaussian variance)
    :type X_variance: np.ndarray (N x input_dim) | None
    :param Z: inducing inputs (optional, see note)
-    :type Z: np.ndarray (M x input_dim) | None
-    :param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
-    :type M: int
+    :type Z: np.ndarray (num_inducing x input_dim) | None
+    :param num_inducing : Number of inducing points (optional, default 10. Ignored if Z is not None)
+    :type num_inducing: int
    :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
    :type normalize_(X|Y): bool
    """

    def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False):
+
        self.Z = Z
-        self.M = self.Z.shape[0]
+        self.num_inducing = self.Z.shape[0]
        self.true_precision = likelihood.precision

-        super(generalized_FITC, self).__init__(X, likelihood, kernel=kernel, Z=self.Z, X_variance=X_variance, normalize_X=normalize_X)
+        super(GeneralizedFITC, self).__init__(X, likelihood, kernel=kernel, Z=self.Z, X_variance=X_variance, normalize_X=normalize_X)
        self._set_params(self._get_params())

    def _set_params(self, p):
-        self.Z = p[:self.M*self.input_dim].reshape(self.M, self.input_dim)
-        self.kern._set_params(p[self.Z.size:self.Z.size+self.kern.Nparam])
-        self.likelihood._set_params(p[self.Z.size+self.kern.Nparam:])
+        self.Z = p[:self.num_inducing * self.input_dim].reshape(self.num_inducing, self.input_dim)
+        self.kern._set_params(p[self.Z.size:self.Z.size + self.kern.num_params])
+        self.likelihood._set_params(p[self.Z.size + self.kern.num_params:])
        self._compute_kernel_matrices()
        self._computations()
        self._FITC_computations()
@ -58,15 +56,15 @@ class generalized_FITC(sparse_GP):
        For a Gaussian (or direct: TODO) likelihood, no iteration is required:
        this function does nothing

-        Diag(Knn - Qnn) is added to the noise term to use the tools already implemented in sparse_GP.
+        Diag(Knn - Qnn) is added to the noise term to use the tools already implemented in SparseGP.
        The true precison is now 'true_precision' not 'precision'.
        """
        if self.has_uncertain_inputs:
            raise NotImplementedError, "FITC approximation not implemented for uncertain inputs"
        else:
-            self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0)
+            self.likelihood.fit_FITC(self.Kmm, self.psi1, self.psi0)
            self.true_precision = self.likelihood.precision # Save the true precision
-            self.likelihood.precision = self.true_precision/(1. + self.true_precision*self.Diag0[:,None]) # Add the diagonal element of the FITC approximation
+            self.likelihood.precision = self.true_precision / (1. + self.true_precision * self.Diag0[:, None]) # Add the diagonal element of the FITC approximation
            self._set_params(self._get_params()) # update the GP

    def _FITC_computations(self):
@ -75,40 +73,40 @@ class generalized_FITC(sparse_GP):
        but adds a diagonal term to the covariance matrix: diag(Knn - Qnn).
        This function:
            - computes the FITC diagonal term
-            - removes the extra terms computed in the sparse_GP approximation
+            - removes the extra terms computed in the SparseGP approximation
            - computes the likelihood gradients wrt the true precision.
        """
-        #NOTE the true precison is now 'true_precision' not 'precision'
+        # NOTE the true precision is now 'true_precision' not 'precision'
        if self.likelihood.is_heteroscedastic:

            # Compute generalized FITC's diagonal term of the covariance
-            self.Lmi,info = linalg.lapack.flapack.dtrtrs(self.Lm,np.eye(self.M),lower=1)
-            Lmipsi1 = np.dot(self.Lmi,self.psi1)
-            self.Qnn = np.dot(Lmipsi1.T,Lmipsi1)
-            #self.Kmmi, Lm, Lmi, Kmm_logdet = pdinv(self.Kmm)
-            #self.Qnn = mdot(self.psi1.T,self.Kmmi,self.psi1)
-            #a = kj
+            self.Lmi, info = linalg.lapack.flapack.dtrtrs(self.Lm, np.eye(self.num_inducing), lower=1)
+            Lmipsi1 = np.dot(self.Lmi, self.psi1)
+            self.Qnn = np.dot(Lmipsi1.T, Lmipsi1)
+            # self.Kmmi, Lm, Lmi, Kmm_logdet = pdinv(self.Kmm)
+            # self.Qnn = mdot(self.psi1.T,self.Kmmi,self.psi1)
+            # a = kj
            self.Diag0 = self.psi0 - np.diag(self.Qnn)
-            Iplus_Dprod_i = 1./(1.+ self.Diag0 * self.true_precision.flatten())
+            Iplus_Dprod_i = 1. / (1. + self.Diag0 * self.true_precision.flatten())
            self.Diag = self.Diag0 * Iplus_Dprod_i

-            self.P = Iplus_Dprod_i[:,None] * self.psi1.T
-            self.RPT0 = np.dot(self.Lmi,self.psi1)
-            self.L = np.linalg.cholesky(np.eye(self.M) + np.dot(self.RPT0,((1. - Iplus_Dprod_i)/self.Diag0)[:,None]*self.RPT0.T))
-            self.R,info = linalg.flapack.dtrtrs(self.L,self.Lmi,lower=1)
-            self.RPT = np.dot(self.R,self.P.T)
-            self.Sigma = np.diag(self.Diag) + np.dot(self.RPT.T,self.RPT)
+            self.P = Iplus_Dprod_i[:, None] * self.psi1.T
+            self.RPT0 = np.dot(self.Lmi, self.psi1)
+            self.L = np.linalg.cholesky(np.eye(self.num_inducing) + np.dot(self.RPT0, ((1. - Iplus_Dprod_i) / self.Diag0)[:, None] * self.RPT0.T))
+            self.R, info = linalg.lapack.dtrtrs(self.L, self.Lmi, lower=1)
+            self.RPT = np.dot(self.R, self.P.T)
+            self.Sigma = np.diag(self.Diag) + np.dot(self.RPT.T, self.RPT)
            self.w = self.Diag * self.likelihood.v_tilde
-            self.Gamma = np.dot(self.R.T, np.dot(self.RPT,self.likelihood.v_tilde))
-            self.mu = self.w + np.dot(self.P,self.Gamma)
+            self.Gamma = np.dot(self.R.T, np.dot(self.RPT, self.likelihood.v_tilde))
+            self.mu = self.w + np.dot(self.P, self.Gamma)

            # Remove extra term from dL_dpsi1
-            self.dL_dpsi1 -= mdot(self.Lmi.T,Lmipsi1*self.likelihood.precision.flatten().reshape(1,self.N))
+            self.dL_dpsi1 -= mdot(self.Lmi.T,Lmipsi1 * self.likelihood.precision.flatten().reshape(1,self.num_data))
            #self.Kmmi, Lm, Lmi, Kmm_logdet = pdinv(self.Kmm)
-            #self.dL_dpsi1 -= mdot(self.Kmmi,self.psi1*self.likelihood.precision.flatten().reshape(1,self.N)) #dB
+            #self.dL_dpsi1 -= mdot(self.Kmmi,self.psi1*self.likelihood.precision.flatten().reshape(1,self.num_data)) #dB

            #########333333
-            #self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)
+            # self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)
            #########333333


@ -116,16 +114,16 @@ class generalized_FITC(sparse_GP):
        else:
            raise NotImplementedError, "homoscedastic fitc not implemented"
            # Remove extra term from dL_dpsi1
-            #self.dL_dpsi1 += -mdot(self.Kmmi,self.psi1*self.likelihood.precision) #dB
+            # self.dL_dpsi1 += -mdot(self.Kmmi,self.psi1*self.likelihood.precision) #dB

        sf = self.scale_factor
-        sf2 = sf**2
+        sf2 = sf ** 2

        # Remove extra term from dL_dKmm
-        self.dL_dKmm += 0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi)*sf2 # dB
+        self.dL_dKmm += 0.5 * self.input_dim * mdot(self.Lmi.T, self.A, self.Lmi) * sf2 # dB
        self.dL_dpsi0 = None

-        #the partial derivative vector for the likelihood
+        # the partial derivative vector for the likelihood
        if self.likelihood.Nparams == 0:
            self.partial_for_likelihood = None
        elif self.likelihood.is_heteroscedastic:
@ -133,8 +131,8 @@ class generalized_FITC(sparse_GP):
        else:
            raise NotImplementedError, "homoscedastic derivatives not implemented"
            #likelihood is not heterscedatic
-            #self.partial_for_likelihood =   - 0.5 * self.N*self.D*self.likelihood.precision + 0.5 * np.sum(np.square(self.likelihood.Y))*self.likelihood.precision**2
-            #self.partial_for_likelihood += 0.5 * self.D * trace_dot(self.Bi,self.A)*self.likelihood.precision
+            #self.partial_for_likelihood =   - 0.5 * self.num_data*self.input_dim*self.likelihood.precision + 0.5 * np.sum(np.square(self.likelihood.Y))*self.likelihood.precision**2
+            #self.partial_for_likelihood += 0.5 * self.input_dim * trace_dot(self.Bi,self.A)*self.likelihood.precision
            #self.partial_for_likelihood += self.likelihood.precision*(0.5*trace_dot(self.psi2_beta_scaled,self.E*sf2) - np.trace(self.Cpsi1VVpsi1))
        #TODO partial derivative vector for the likelihood not implemented

@ -142,28 +140,28 @@ class generalized_FITC(sparse_GP):
        """
        Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel
        """
-        dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z)
+        dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm, self.Z)
        if self.has_uncertain_inputs:
            raise NotImplementedError, "heteroscedatic derivates not implemented"
        else:
-            #NOTE in sparse_GP this would include the gradient wrt psi0
-            dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1,self.Z,self.X)
+            # NOTE in SparseGP this would include the gradient wrt psi0
+            dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1, self.Z, self.X)
        return dL_dtheta


    def log_likelihood(self):
        """ Compute the (lower bound on the) log marginal likelihood """
-        sf2 = self.scale_factor**2
+        sf2 = self.scale_factor ** 2
        if self.likelihood.is_heteroscedastic:
-            A = -0.5*self.N*self.D*np.log(2.*np.pi) +0.5*np.sum(np.log(self.likelihood.precision)) -0.5*np.sum(self.V*self.likelihood.Y)
+            A = -0.5*self.num_data*self.input_dim*np.log(2.*np.pi) +0.5*np.sum(np.log(self.likelihood.precision)) -0.5*np.sum(self.V*self.likelihood.Y)
        else:
-            A = -0.5*self.N*self.D*(np.log(2.*np.pi) + np.log(self.likelihood._variance)) -0.5*self.likelihood.precision*self.likelihood.trYYT
-        C = -self.D * (np.sum(np.log(np.diag(self.LB))) + 0.5*self.M*np.log(sf2))
-        #C = -0.5*self.D * (self.B_logdet + self.M*np.log(sf2))
+            A = -0.5*self.num_data*self.input_dim*(np.log(2.*np.pi) + np.log(self.likelihood._variance)) -0.5*self.likelihood.precision*self.likelihood.trYYT
+        C = -self.input_dim * (np.sum(np.log(np.diag(self.LB))) + 0.5*self.num_inducing*np.log(sf2))
+        #C = -0.5*self.input_dim * (self.B_logdet + self.num_inducing*np.log(sf2))
        D = 0.5*np.sum(np.square(self._LBi_Lmi_psi1V))
        #self.Cpsi1VVpsi1 = np.dot(self.Cpsi1V,self.psi1V.T)
        #D_ = 0.5*np.trace(self.Cpsi1VVpsi1)
-        return A+C+D
+        return A + C + D

    def _raw_predict(self, Xnew, which_parts, full_cov=False):
        if self.likelihood.is_heteroscedastic:
@ -177,35 +175,35 @@ class generalized_FITC(sparse_GP):
            # q(u|f) = N(u| R0i*mu_u*f, R0i*C*R0i.T)

            # Ci = I + (RPT0)Di(RPT0).T
-            # C = I - [RPT0] * (D+[RPT0].T*[RPT0])^-1*[RPT0].T
-            #   = I - [RPT0] * (D + self.Qnn)^-1 * [RPT0].T
+            # C = I - [RPT0] * (input_dim+[RPT0].T*[RPT0])^-1*[RPT0].T
+            #   = I - [RPT0] * (input_dim + self.Qnn)^-1 * [RPT0].T
            #   = I - [RPT0] * (U*U.T)^-1 * [RPT0].T
            #   = I - V.T * V
            U = np.linalg.cholesky(np.diag(self.Diag0) + self.Qnn)
-            V,info = linalg.flapack.dtrtrs(U,self.RPT0.T,lower=1)
-            C = np.eye(self.M) - np.dot(V.T,V)
-            mu_u = np.dot(C,self.RPT0)*(1./self.Diag0[None,:])
-            #self.C = C
-            #self.RPT0 = np.dot(self.R0,self.Knm.T) P0.T
-            #self.mu_u = mu_u
-            #self.U = U
+            V, info = linalg.flapack.dtrtrs(U, self.RPT0.T, lower=1)
+            C = np.eye(self.num_inducing) - np.dot(V.T, V)
+            mu_u = np.dot(C, self.RPT0) * (1. / self.Diag0[None, :])
+            # self.C = C
+            # self.RPT0 = np.dot(self.R0,self.Knm.T) P0.T
+            # self.mu_u = mu_u
+            # self.U = U
            # q(u|y) = N(u| R0i*mu_H,R0i*Sigma_H*R0i.T)
-            mu_H = np.dot(mu_u,self.mu)
+            mu_H = np.dot(mu_u, self.mu)
            self.mu_H = mu_H
-            Sigma_H = C + np.dot(mu_u,np.dot(self.Sigma,mu_u.T))
+            Sigma_H = C + np.dot(mu_u, np.dot(self.Sigma, mu_u.T))
            # q(f_star|y) = N(f_star|mu_star,sigma2_star)
            Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
-            KR0T = np.dot(Kx.T,self.Lmi.T)
-            mu_star = np.dot(KR0T,mu_H)
+            KR0T = np.dot(Kx.T, self.Lmi.T)
+            mu_star = np.dot(KR0T, mu_H)
            if full_cov:
-                Kxx = self.kern.K(Xnew,which_parts=which_parts)
-                var = Kxx + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.M),KR0T.T))
+                Kxx = self.kern.K(Xnew, which_parts=which_parts)
+                var = Kxx + np.dot(KR0T, np.dot(Sigma_H - np.eye(self.num_inducing), KR0T.T))
            else:
-                Kxx = self.kern.Kdiag(Xnew,which_parts=which_parts)
-                Kxx_ = self.kern.K(Xnew,which_parts=which_parts) # TODO: RA, is this line needed?
-                var_ = Kxx_ + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.M),KR0T.T)) # TODO: RA, is this line needed?
-                var = (Kxx + np.sum(KR0T.T*np.dot(Sigma_H - np.eye(self.M),KR0T.T),0))[:,None]
-            return mu_star[:,None],var
+                Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts)
+                Kxx_ = self.kern.K(Xnew, which_parts=which_parts) # TODO: RA, is this line needed?
+                var_ = Kxx_ + np.dot(KR0T, np.dot(Sigma_H - np.eye(self.num_inducing), KR0T.T)) # TODO: RA, is this line needed?
+                var = (Kxx + np.sum(KR0T.T * np.dot(Sigma_H - np.eye(self.num_inducing), KR0T.T), 0))[:, None]
+            return mu_star[:, None], var
        else:
            raise NotImplementedError, "homoscedastic fitc not implemented"
            """
--- a/GPy/models/gp_classification.py
+++ b/GPy/models/gp_classification.py
@ -7,15 +7,15 @@ from ..core import GP
 from .. import likelihoods
 from .. import kern

-class GP_classification(GP):
+class GPClassification(GP):
    """
    Gaussian Process classification

-    This is a thin wrapper around the models.GP class, with a set of sensible defalts
+    This is a thin wrapper around the models.GP class, with a set of sensible defaults

    :param X: input observations
    :param Y: observed values
-    :param likelihood: a GPy likelihood, defaults to binomial with probit link_function
+    :param likelihood: a GPy likelihood, defaults to Binomial with probit link_function
    :param kernel: a GPy kernel, defaults to rbf
    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
@ -31,7 +31,7 @@ class GP_classification(GP):
            kernel = kern.rbf(X.shape[1])

        if likelihood is None:
-            distribution = likelihoods.likelihood_functions.binomial()
+            distribution = likelihoods.likelihood_functions.Binomial()
            likelihood = likelihoods.EP(Y, distribution)
        elif Y is not None:
            if not all(Y.flatten() == likelihood.data.flatten()):
--- a/GPy/models/gp_regression.py
+++ b/GPy/models/gp_regression.py
@ -7,11 +7,11 @@ from ..core import GP
 from .. import likelihoods
 from .. import kern

-class GP_regression(GP):
+class GPRegression(GP):
    """
    Gaussian Process model for regression

-    This is a thin wrapper around the models.GP class, with a set of sensible defalts
+    This is a thin wrapper around the models.GP class, with a set of sensible defaults

    :param X: input observations
    :param Y: observed values
--- a/GPy/models/gplvm.py
+++ b/GPy/models/gplvm.py
@ -6,7 +6,7 @@ import numpy as np
 import pylab as pb
 import sys, pdb
 from .. import kern
-from ..core import model
+from ..core import Model
 from ..util.linalg import pdinv, PCA
 from ..core import GP
 from ..likelihoods import Gaussian
@ -42,13 +42,13 @@ class GPLVM(GP):
            return np.random.randn(Y.shape[0], input_dim)

    def _get_param_names(self):
-        return sum([['X_%i_%i'%(n,q) for q in range(self.input_dim)] for n in range(self.N)],[]) + GP._get_param_names(self)
+        return sum([['X_%i_%i'%(n,q) for q in range(self.input_dim)] for n in range(self.num_data)],[]) + GP._get_param_names(self)

    def _get_params(self):
        return np.hstack((self.X.flatten(), GP._get_params(self)))

    def _set_params(self,x):
-        self.X = x[:self.N*self.input_dim].reshape(self.N,self.input_dim).copy()
+        self.X = x[:self.num_data*self.input_dim].reshape(self.num_data,self.input_dim).copy()
        GP._set_params(self, x[self.X.size:])

    def _log_likelihood_gradients(self):
--- a/GPy/models/mrd.py
+++ b/GPy/models/mrd.py
@ -3,17 +3,16 @@ Created on 10 Apr 2013

@author: Max Zwiessele
 '''
-from GPy.core import model
-from GPy.models.Bayesian_GPLVM import Bayesian_GPLVM
-from GPy.core import sparse_GP
+from GPy.core import Model
+from GPy.core import SparseGP
 from GPy.util.linalg import PCA
-from scipy import linalg
 import numpy
 import itertools
 import pylab
 from GPy.kern.kern import kern
+from GPy.models.bayesian_gplvm import BayesianGPLVM

-class MRD(model):
+class MRD(Model):
    """
    Do MRD on given Datasets in Ylist.
    All Ys in likelihood_list are in [N x Dn], where Dn can be different per Yn,
@ -34,18 +33,18 @@ class MRD(model):
    :param X_variance:
        Initial latent space variance
    :param init: [cooncat|single|random]
-        initialization method to use: 
+        initialization method to use:
            *concat: PCA on concatenated outputs
            *single: PCA on each output
            *random: random
-    :param M:
+    :param num_inducing:
        number of inducing inputs to use
    :param Z:
        initial inducing inputs
    :param kernels: list of kernels or kernel shared for all BGPLVMS
    :type kernels: [GPy.kern.kern] | GPy.kern.kern | None (default)
    """
-    def __init__(self, likelihood_or_Y_list, input_dim, M=10, names=None,
+    def __init__(self, likelihood_or_Y_list, input_dim, num_inducing=10, names=None,
                 kernels=None, initx='PCA',
                 initz='permute', _debug=False, **kw):
        if names is None:
@ -62,24 +61,24 @@ class MRD(model):
        assert not ('kernel' in kw), "pass kernels through `kernels` argument"

        self.input_dim = input_dim
-        self.M = M
+        self.num_inducing = num_inducing
        self._debug = _debug

        self._init = True
        X = self._init_X(initx, likelihood_or_Y_list)
        Z = self._init_Z(initz, X)
-        self.bgplvms = [Bayesian_GPLVM(l, input_dim=input_dim, kernel=k, X=X, Z=Z, M=self.M, **kw) for l, k in zip(likelihood_or_Y_list, kernels)]
+        self.bgplvms = [BayesianGPLVM(l, input_dim=input_dim, kernel=k, X=X, Z=Z, num_inducing=self.num_inducing, **kw) for l, k in zip(likelihood_or_Y_list, kernels)]
        del self._init

        self.gref = self.bgplvms[0]
-        nparams = numpy.array([0] + [sparse_GP._get_params(g).size - g.Z.size for g in self.bgplvms])
+        nparams = numpy.array([0] + [SparseGP._get_params(g).size - g.Z.size for g in self.bgplvms])
        self.nparams = nparams.cumsum()

-        self.N = self.gref.N
-        self.NQ = self.N * self.input_dim
-        self.MQ = self.M * self.input_dim
+        self.num_data = self.gref.num_data
+        self.NQ = self.num_data * self.input_dim
+        self.MQ = self.num_inducing * self.input_dim

-        model.__init__(self) # @UndefinedVariable
+        Model.__init__(self) # @UndefinedVariable
        self._set_params(self._get_params())

    @property
@ -143,15 +142,15 @@ class MRD(model):
        self._init_Z(initz, self.X)

    def _get_param_names(self):
-        # X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.N)], [])
-        # S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.N)], [])
+        # X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
+        # S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
        n1 = self.gref._get_param_names()
        n1var = n1[:self.NQ * 2 + self.MQ]
        map_names = lambda ns, name: map(lambda x: "{1}_{0}".format(*x),
                                         itertools.izip(ns,
                                                        itertools.repeat(name)))
        return list(itertools.chain(n1var, *(map_names(\
-                sparse_GP._get_param_names(g)[self.MQ:], n) \
+                SparseGP._get_param_names(g)[self.MQ:], n) \
                for g, n in zip(self.bgplvms, self.names))))

    def _get_params(self):
@ -165,14 +164,14 @@ class MRD(model):
        X = self.gref.X.ravel()
        X_var = self.gref.X_variance.ravel()
        Z = self.gref.Z.ravel()
-        thetas = [sparse_GP._get_params(g)[g.Z.size:] for g in self.bgplvms]
+        thetas = [SparseGP._get_params(g)[g.Z.size:] for g in self.bgplvms]
        params = numpy.hstack([X, X_var, Z, numpy.hstack(thetas)])
        return params

 #     def _set_var_params(self, g, X, X_var, Z):
-#         g.X = X.reshape(self.N, self.input_dim)
-#         g.X_variance = X_var.reshape(self.N, self.input_dim)
-#         g.Z = Z.reshape(self.M, self.input_dim)
+#         g.X = X.reshape(self.num_data, self.input_dim)
+#         g.X_variance = X_var.reshape(self.num_data, self.input_dim)
+#         g.Z = Z.reshape(self.num_inducing, self.input_dim)
 #
 #     def _set_kern_params(self, g, p):
 #         g.kern._set_params(p[:g.kern.Nparam])
@ -206,7 +205,7 @@ class MRD(model):
    def log_likelihood(self):
        ll = -self.gref.KL_divergence()
        for g in self.bgplvms:
-            ll += sparse_GP.log_likelihood(g)
+            ll += SparseGP.log_likelihood(g)
        return ll

    def _log_likelihood_gradients(self):
@ -215,7 +214,7 @@ class MRD(model):
        dLdmu -= dKLmu
        dLdS -= dKLdS
        dLdmuS = numpy.hstack((dLdmu.flatten(), dLdS.flatten())).flatten()
-        dldzt1 = reduce(lambda a, b: a + b, (sparse_GP._log_likelihood_gradients(g)[:self.MQ] for g in self.bgplvms))
+        dldzt1 = reduce(lambda a, b: a + b, (SparseGP._log_likelihood_gradients(g)[:self.MQ] for g in self.bgplvms))

        return numpy.hstack((dLdmuS,
                             dldzt1,
@ -250,9 +249,9 @@ class MRD(model):
        if X is None:
            X = self.X
        if init in "permute":
-            Z = numpy.random.permutation(X.copy())[:self.M]
+            Z = numpy.random.permutation(X.copy())[:self.num_inducing]
        elif init in "random":
-            Z = numpy.random.randn(self.M, self.input_dim) * X.var()
+            Z = numpy.random.randn(self.num_inducing, self.input_dim) * X.var()
        self.Z = Z
        return Z

@ -274,8 +273,8 @@ class MRD(model):
        else:
            return pylab.gcf()

-    def plot_X_1d(self):
-        return self.gref.plot_X_1d()
+    def plot_X_1d(self, *a, **kw):
+        return self.gref.plot_X_1d(*a, **kw)

    def plot_X(self, fignum=None, ax=None):
        fig = self._handle_plotting(fignum, ax, lambda i, g, ax: ax.imshow(g.X))
--- a/GPy/models/sparse_GPLVM.py
+++ b/GPy/models/sparse_GPLVM.py
@ -1,61 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-import pylab as pb
-import sys, pdb
-# from .. import kern
-# from ..core import model
-# from ..util.linalg import pdinv, PCA
-from GPLVM import GPLVM
-from sparse_GP_regression import sparse_GP_regression
-
-class sparse_GPLVM(sparse_GP_regression, GPLVM):
-    """
-    Sparse Gaussian Process Latent Variable Model
-
-    :param Y: observed data
-    :type Y: np.ndarray
-    :param input_dim: latent dimensionality
-    :type input_dim: int
-    :param init: initialisation method for the latent space
-    :type init: 'PCA'|'random'
-
-    """
-    def __init__(self, Y, input_dim, kernel=None, init='PCA', M=10):
-        X = self.initialise_latent(init, input_dim, Y)
-        sparse_GP_regression.__init__(self, X, Y, kernel=kernel,M=M)
-
-    def _get_param_names(self):
-        return (sum([['X_%i_%i'%(n,q) for q in range(self.input_dim)] for n in range(self.N)],[])
-                + sparse_GP_regression._get_param_names(self))
-
-    def _get_params(self):
-        return np.hstack((self.X.flatten(), sparse_GP_regression._get_params(self)))
-
-    def _set_params(self,x):
-        self.X = x[:self.X.size].reshape(self.N,self.input_dim).copy()
-        sparse_GP_regression._set_params(self, x[self.X.size:])
-
-    def log_likelihood(self):
-        return sparse_GP_regression.log_likelihood(self)
-
-    def dL_dX(self):
-        dL_dX = self.kern.dKdiag_dX(self.dL_dpsi0,self.X)
-        dL_dX += self.kern.dK_dX(self.dL_dpsi1.T,self.X,self.Z)
-
-        return dL_dX
-
-    def _log_likelihood_gradients(self):
-        return np.hstack((self.dL_dX().flatten(), sparse_GP_regression._log_likelihood_gradients(self)))
-
-    def plot(self):
-        GPLVM.plot(self)
-        #passing Z without a small amout of jitter will induce the white kernel where we don;t want it!
-        mu, var, upper, lower = sparse_GP_regression.predict(self, self.Z+np.random.randn(*self.Z.shape)*0.0001)
-        pb.plot(mu[:, 0] , mu[:, 1], 'ko')
-
-    def plot_latent(self, *args, **kwargs):
-        input_1, input_2 = GPLVM.plot_latent(*args, **kwargs)
-        pb.plot(m.Z[:, input_1], m.Z[:, input_2], '^w')
--- a/GPy/models/sparse_gp_classification.py
+++ b/GPy/models/sparse_gp_classification.py
@ -3,21 +3,20 @@


 import numpy as np
-from ..core import sparse_GP
+from ..core import SparseGP
 from .. import likelihoods
 from .. import kern
 from ..likelihoods import likelihood
-from GP_regression import GP_regression

-class sparse_GP_classification(sparse_GP):
+class SparseGPClassification(SparseGP):
    """
    sparse Gaussian Process model for classification

-    This is a thin wrapper around the sparse_GP class, with a set of sensible defalts
+    This is a thin wrapper around the SparseGP class, with a set of sensible defaults

    :param X: input observations
    :param Y: observed values
-    :param likelihood: a GPy likelihood, defaults to binomial with probit link_function
+    :param likelihood: a GPy likelihood, defaults to Binomial with probit link_function
    :param kernel: a GPy kernel, defaults to rbf+white
    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
@ -25,26 +24,24 @@ class sparse_GP_classification(sparse_GP):
    :type normalize_Y: False|True
    :rtype: model object

-    .. Note:: Multiple independent outputs are allowed using columns of Y
-
    """

-    def __init__(self, X, Y=None, likelihood=None, kernel=None, normalize_X=False, normalize_Y=False, Z=None, M=10):
+    def __init__(self, X, Y=None, likelihood=None, kernel=None, normalize_X=False, normalize_Y=False, Z=None, num_inducing=10):
        if kernel is None:
            kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1],1e-3)

        if likelihood is None:
-            distribution = likelihoods.likelihood_functions.binomial()
+            distribution = likelihoods.likelihood_functions.Binomial()
            likelihood = likelihoods.EP(Y, distribution)
        elif Y is not None:
            if not all(Y.flatten() == likelihood.data.flatten()):
                raise Warning, 'likelihood.data and Y are different.'

        if Z is None:
-            i = np.random.permutation(X.shape[0])[:M]
+            i = np.random.permutation(X.shape[0])[:num_inducing]
            Z = X[i].copy()
        else:
            assert Z.shape[1]==X.shape[1]

-        sparse_GP.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X)
+        SparseGP.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X)
        self._set_params(self._get_params())
--- a/GPy/models/sparse_gp_regression.py
+++ b/GPy/models/sparse_gp_regression.py
@ -3,17 +3,15 @@


 import numpy as np
-from ..core import sparse_GP
+from ..core import SparseGP
 from .. import likelihoods
 from .. import kern
-from ..likelihoods import likelihood
-from GP_regression import GP_regression

-class sparse_GP_regression(sparse_GP):
+class SparseGPRegression(SparseGP):
    """
    Gaussian Process model for regression

-    This is a thin wrapper around the sparse_GP class, with a set of sensible defalts
+    This is a thin wrapper around the SparseGP class, with a set of sensible defaults

    :param X: input observations
    :param Y: observed values
@ -28,20 +26,20 @@ class sparse_GP_regression(sparse_GP):

    """

-    def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False, Z=None, M=10, X_variance=None):
-        #kern defaults to rbf (plus white for stability)
+    def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False, Z=None, num_inducing=10, X_variance=None):
+        # kern defaults to rbf (plus white for stability)
        if kernel is None:
-            kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1],1e-3)
+            kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1], 1e-3)

-        #Z defaults to a subset of the data
+        # Z defaults to a subset of the data
        if Z is None:
-            i = np.random.permutation(X.shape[0])[:M]
+            i = np.random.permutation(X.shape[0])[:num_inducing]
            Z = X[i].copy()
        else:
-            assert Z.shape[1]==X.shape[1]
+            assert Z.shape[1] == X.shape[1]

-        #likelihood defaults to Gaussian
-        likelihood = likelihoods.Gaussian(Y,normalize=normalize_Y)
+        # likelihood defaults to Gaussian
+        likelihood = likelihoods.Gaussian(Y, normalize=normalize_Y)

-        sparse_GP.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X, X_variance=X_variance)
+        SparseGP.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X, X_variance=X_variance)
        self._set_params(self._get_params())
--- a/GPy/models/sparse_gplvm.py
+++ b/GPy/models/sparse_gplvm.py
@ -0,0 +1,61 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+import pylab as pb
+import sys, pdb
+from GPy.models.sparse_gp_regression import SparseGPRegression
+from GPy.models.gplvm import GPLVM
+# from .. import kern
+# from ..core import model
+# from ..util.linalg import pdinv, PCA
+
+class SparseGPLVM(SparseGPRegression, GPLVM):
+    """
+    Sparse Gaussian Process Latent Variable Model
+
+    :param Y: observed data
+    :type Y: np.ndarray
+    :param input_dim: latent dimensionality
+    :type input_dim: int
+    :param init: initialisation method for the latent space
+    :type init: 'PCA'|'random'
+
+    """
+    def __init__(self, Y, input_dim, kernel=None, init='PCA', num_inducing=10):
+        X = self.initialise_latent(init, input_dim, Y)
+        SparseGPRegression.__init__(self, X, Y, kernel=kernel, num_inducing=num_inducing)
+
+    def _get_param_names(self):
+        return (sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
+                + SparseGPRegression._get_param_names(self))
+
+    def _get_params(self):
+        return np.hstack((self.X.flatten(), SparseGPRegression._get_params(self)))
+
+    def _set_params(self, x):
+        self.X = x[:self.X.size].reshape(self.num_data, self.input_dim).copy()
+        SparseGPRegression._set_params(self, x[self.X.size:])
+
+    def log_likelihood(self):
+        return SparseGPRegression.log_likelihood(self)
+
+    def dL_dX(self):
+        dL_dX = self.kern.dKdiag_dX(self.dL_dpsi0, self.X)
+        dL_dX += self.kern.dK_dX(self.dL_dpsi1.T, self.X, self.Z)
+
+        return dL_dX
+
+    def _log_likelihood_gradients(self):
+        return np.hstack((self.dL_dX().flatten(), SparseGPRegression._log_likelihood_gradients(self)))
+
+    def plot(self):
+        GPLVM.plot(self)
+        # passing Z without a small amount of jitter will induce the white kernel where we don't want it!
+        mu, var, upper, lower = SparseGPRegression.predict(self, self.Z + np.random.randn(*self.Z.shape) * 0.0001)
+        pb.plot(mu[:, 0] , mu[:, 1], 'ko')
+
+    def plot_latent(self, *args, **kwargs):
+        input_1, input_2 = GPLVM.plot_latent(*args, **kwargs)
+        pb.plot(m.Z[:, input_1], m.Z[:, input_2], '^w')
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@ -3,25 +3,21 @@


 import numpy as np
-from .. import kern
-from ..core import model
-from ..util.linalg import pdinv
-from ..util.plot import gpplot
 from ..util.warping_functions import *
-from GP_regression import GP_regression
 from ..core import GP
 from .. import likelihoods
-from .. import kern
+from GPy.util.warping_functions import TanhWarpingFunction_d
+from GPy import kern

-class warpedGP(GP):
-    def __init__(self, X, Y, kernel=None, warping_function = None, warping_terms = 3, normalize_X=False, normalize_Y=False):
+class WarpedGP(GP):
+    def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3, normalize_X=False, normalize_Y=False):

        if kernel is None:
            kernel = kern.rbf(X.shape[1])

        if warping_function == None:
            self.warping_function = TanhWarpingFunction_d(warping_terms)
-            self.warping_params = (np.random.randn(self.warping_function.n_terms*3+1,) * 1)
+            self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1,) * 1)

        Y = self._scale_data(Y)
        self.has_uncertain_inputs = False
@ -35,10 +31,10 @@ class warpedGP(GP):
    def _scale_data(self, Y):
        self._Ymax = Y.max()
        self._Ymin = Y.min()
-        return (Y-self._Ymin)/(self._Ymax-self._Ymin) - 0.5
+        return (Y - self._Ymin) / (self._Ymax - self._Ymin) - 0.5

    def _unscale_data(self, Y):
-        return (Y + 0.5)*(self._Ymax - self._Ymin) + self._Ymin
+        return (Y + 0.5) * (self._Ymax - self._Ymin) + self._Ymin

    def _set_params(self, x):
        self.warping_params = x[:self.warping_function.num_parameters]
@ -68,15 +64,15 @@ class warpedGP(GP):
        alpha = np.dot(self.Ki, self.likelihood.Y.flatten())
        warping_grads = self.warping_function_gradients(alpha)

-        warping_grads = np.append(warping_grads[:,:-1].flatten(), warping_grads[0,-1])
+        warping_grads = np.append(warping_grads[:, :-1].flatten(), warping_grads[0, -1])
        return np.hstack((warping_grads.flatten(), ll_grads.flatten()))

    def warping_function_gradients(self, Kiy):
        grad_y = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
        grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed, self.warping_params,
-                                                                 return_covar_chain = True)
-        djac_dpsi = ((1.0/grad_y[:,:, None, None])*grad_y_psi).sum(axis=0).sum(axis=0)
-        dquad_dpsi = (Kiy[:,None,None,None] * grad_psi).sum(axis=0).sum(axis=0)
+                                                                 return_covar_chain=True)
+        djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
+        dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)

        return -dquad_dpsi + djac_dpsi

--- a/GPy/testing/bgplvm_tests.py
+++ b/GPy/testing/bgplvm_tests.py
@ -4,70 +4,71 @@
 import unittest
 import numpy as np
 import GPy
+from GPy.models.bayesian_gplvm import BayesianGPLVM

 class BGPLVMTests(unittest.TestCase):
    def test_bias_kern(self):
-        N, M, input_dim, D = 10, 3, 2, 4
+        N, num_inducing, input_dim, D = 10, 3, 2, 4
        X = np.random.rand(N, input_dim)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
-        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
        Y -= Y.mean(axis=0)
        k = GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
-        m = GPy.models.Bayesian_GPLVM(Y, input_dim, kernel = k,  M=M)
+        m = BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())

    def test_linear_kern(self):
-        N, M, input_dim, D = 10, 3, 2, 4
+        N, num_inducing, input_dim, D = 10, 3, 2, 4
        X = np.random.rand(N, input_dim)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
-        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
        Y -= Y.mean(axis=0)
        k = GPy.kern.linear(input_dim) + GPy.kern.white(input_dim, 0.00001)
-        m = GPy.models.Bayesian_GPLVM(Y, input_dim, kernel = k,  M=M)
+        m = BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())

    def test_rbf_kern(self):
-        N, M, input_dim, D = 10, 3, 2, 4
+        N, num_inducing, input_dim, D = 10, 3, 2, 4
        X = np.random.rand(N, input_dim)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
-        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
        Y -= Y.mean(axis=0)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
-        m = GPy.models.Bayesian_GPLVM(Y, input_dim, kernel = k,  M=M)
+        m = BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())

    def test_rbf_bias_kern(self):
-        N, M, input_dim, D = 10, 3, 2, 4
+        N, num_inducing, input_dim, D = 10, 3, 2, 4
        X = np.random.rand(N, input_dim)
        k = GPy.kern.rbf(input_dim) +  GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
-        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
        Y -= Y.mean(axis=0)
        k = GPy.kern.rbf(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
-        m = GPy.models.Bayesian_GPLVM(Y, input_dim, kernel = k,  M=M)
+        m = BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())

    #@unittest.skip('psi2 cross terms are NotImplemented for this combination')
    def test_linear_bias_kern(self):
-        N, M, input_dim, D = 30, 5, 4, 30
+        N, num_inducing, input_dim, D = 30, 5, 4, 30
        X = np.random.rand(N, input_dim)
        k = GPy.kern.linear(input_dim) +  GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
-        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
        Y -= Y.mean(axis=0)
        k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
-        m = GPy.models.Bayesian_GPLVM(Y, input_dim, kernel = k,  M=M)
+        m = BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())
--- a/GPy/testing/examples_tests.py
+++ b/GPy/testing/examples_tests.py
@ -9,40 +9,41 @@ import pkgutil
 import os
 import random
 from nose.tools import nottest
+import sys

 class ExamplesTests(unittest.TestCase):
-    def _checkgrad(self, model):
-        self.assertTrue(model.checkgrad())
+    def _checkgrad(self, Model):
+        self.assertTrue(Model.checkgrad())

-    def _model_instance(self, model):
-        self.assertTrue(isinstance(model, GPy.models))
+    def _model_instance(self, Model):
+        self.assertTrue(isinstance(Model, GPy.models))

 """
-def model_instance_generator(model):
+def model_instance_generator(Model):
    def check_model_returned(self):
-        self._model_instance(model)
+        self._model_instance(Model)
    return check_model_returned

-def checkgrads_generator(model):
+def checkgrads_generator(Model):
    def model_checkgrads(self):
-        self._checkgrad(model)
+        self._checkgrad(Model)
    return model_checkgrads
 """

-def model_checkgrads(model):
-    model.randomize()
-    assert model.checkgrad()
+def model_checkgrads(Model):
+    Model.randomize()
+    assert Model.checkgrad()


-def model_instance(model):
-    assert isinstance(model, GPy.core.model)
+def model_instance(Model):
+    assert isinstance(Model, GPy.core.Model)

@nottest
 def test_models():
    examples_path = os.path.dirname(GPy.examples.__file__)
-    #Load modules
+    # Load modules
    for loader, module_name, is_pkg in pkgutil.iter_modules([examples_path]):
-        #Load examples
+        # Load examples
        module_examples = loader.find_module(module_name).load_module(module_name)
        print "MODULE", module_examples
        print "Before"
@ -56,26 +57,27 @@ def test_models():
                continue

            print "Testing example: ", example[0]
-            #Generate model
-            model = example[1]()
-            print model
+            # Generate Model
+            Model = example[1]()
+            print Model

-            #Create tests for instance check
+            # Create tests for instance check
            """
-            test = model_instance_generator(model)
+            test = model_instance_generator(Model)
            test.__name__ = 'test_instance_%s' % example[0]
            setattr(ExamplesTests, test.__name__, test)

            #Create tests for checkgrads check
-            test = checkgrads_generator(model)
+            test = checkgrads_generator(Model)
            test.__name__ = 'test_checkgrads_%s' % example[0]
            setattr(ExamplesTests, test.__name__, test)
            """
            model_checkgrads.description = 'test_checkgrads_%s' % example[0]
-            yield model_checkgrads, model
+            yield model_checkgrads, Model
            model_instance.description = 'test_instance_%s' % example[0]
-            yield model_instance, model
+            yield model_instance, Model

 if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
-    unittest.main()
+    # unittest.main()
+    test_models()
--- a/GPy/testing/gplvm_tests.py
+++ b/GPy/testing/gplvm_tests.py
@ -7,11 +7,11 @@ import GPy

 class GPLVMTests(unittest.TestCase):
    def test_bias_kern(self):
-        N, M, input_dim, D = 10, 3, 2, 4
+        N, num_inducing, input_dim, D = 10, 3, 2, 4
        X = np.random.rand(N, input_dim)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
-        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
        k = GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = GPy.models.GPLVM(Y, input_dim, kernel = k)
        m.ensure_default_constraints()
@ -19,11 +19,11 @@ class GPLVMTests(unittest.TestCase):
        self.assertTrue(m.checkgrad())

    def test_linear_kern(self):
-        N, M, input_dim, D = 10, 3, 2, 4
+        N, num_inducing, input_dim, D = 10, 3, 2, 4
        X = np.random.rand(N, input_dim)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
-        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
        k = GPy.kern.linear(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = GPy.models.GPLVM(Y, input_dim, kernel = k)
        m.ensure_default_constraints()
@ -31,11 +31,11 @@ class GPLVMTests(unittest.TestCase):
        self.assertTrue(m.checkgrad())

    def test_rbf_kern(self):
-        N, M, input_dim, D = 10, 3, 2, 4
+        N, num_inducing, input_dim, D = 10, 3, 2, 4
        X = np.random.rand(N, input_dim)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
-        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = GPy.models.GPLVM(Y, input_dim, kernel = k)
        m.ensure_default_constraints()
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@ -12,7 +12,7 @@ class KernelTests(unittest.TestCase):
        K.constrain_fixed('2')
        X = np.random.rand(5,5)
        Y = np.ones((5,1))
-        m = GPy.models.GP_regression(X,Y,K)
+        m = GPy.models.GPRegression(X,Y,K)
        self.assertTrue(m.checkgrad())

    def test_fixedkernel(self):
@ -21,9 +21,9 @@ class KernelTests(unittest.TestCase):
        """
        X = np.random.rand(30, 4)
        K = np.dot(X, X.T)
-        kernel = GPy.kern.fixed(4, K)
+        kernel = GPy.kern.Fixed(4, K)
        Y = np.ones((30,1))
-        m = GPy.models.GP_regression(X,Y,kernel=kernel)
+        m = GPy.models.GPRegression(X,Y,kernel=kernel)
        self.assertTrue(m.checkgrad())

    def test_coregionalisation(self):
@ -36,9 +36,9 @@ class KernelTests(unittest.TestCase):
        Y = np.vstack((Y1,Y2))

        k1 = GPy.kern.rbf(1) + GPy.kern.bias(1)
-        k2 = GPy.kern.coregionalise(2,1)
+        k2 = GPy.kern.Coregionalise(2,1)
        k = k1.prod(k2,tensor=True)
-        m = GPy.models.GP_regression(X,Y,kernel=k)
+        m = GPy.models.GPRegression(X,Y,kernel=k)
        self.assertTrue(m.checkgrad())


--- a/GPy/testing/mrd_tests.py
+++ b/GPy/testing/mrd_tests.py
@ -14,16 +14,16 @@ class MRDTests(unittest.TestCase):

    def test_gradients(self):
        num_m = 3
-        N, M, input_dim, D = 20, 8, 6, 20
+        N, num_inducing, input_dim, D = 20, 8, 6, 20
        X = np.random.rand(N, input_dim)

        k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim)
        K = k.K(X)

-        Ylist = [np.random.multivariate_normal(np.zeros(N), K, D).T for _ in range(num_m)]
+        Ylist = [np.random.multivariate_normal(np.zeros(N), K, input_dim).T for _ in range(num_m)]
        likelihood_list = [GPy.likelihoods.Gaussian(Y) for Y in Ylist]

-        m = GPy.models.MRD(likelihood_list, input_dim=input_dim, kernels=k, M=M)
+        m = GPy.models.MRD(likelihood_list, input_dim=input_dim, kernels=k, num_inducing=num_inducing)
        m.ensure_default_constraints()

        self.assertTrue(m.checkgrad())
--- a/GPy/testing/prior_tests.py
+++ b/GPy/testing/prior_tests.py
@ -13,7 +13,7 @@ class PriorTests(unittest.TestCase):
        y  = b*X + C + 1*np.sin(X)
        y += 0.05*np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
-        m = GPy.models.GP_regression(X, y)
+        m = GPy.models.GPRegression(X, y)
        m.ensure_default_constraints()
        lognormal = GPy.priors.LogGaussian(1, 2)
        m.set_prior('rbf', lognormal)
@ -27,7 +27,7 @@ class PriorTests(unittest.TestCase):
        y  = b*X + C + 1*np.sin(X)
        y += 0.05*np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
-        m = GPy.models.GP_regression(X, y)
+        m = GPy.models.GPRegression(X, y)
        m.ensure_default_constraints()
        Gamma = GPy.priors.Gamma(1, 1)
        m.set_prior('rbf', Gamma)
@ -41,7 +41,7 @@ class PriorTests(unittest.TestCase):
        y  = b*X + C + 1*np.sin(X)
        y += 0.05*np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
-        m = GPy.models.GP_regression(X, y)
+        m = GPy.models.GPRegression(X, y)
        m.ensure_default_constraints()
        gaussian = GPy.priors.Gaussian(1, 1)
        success = False
--- a/GPy/testing/psi_stat_expactation_tests.py
+++ b/GPy/testing/psi_stat_expactation_tests.py
@ -21,36 +21,36 @@ def ard(p):

@testing.deepTest(__test__)
 class Test(unittest.TestCase):
-    D = 9
-    M = 4
+    input_dim = 9
+    num_inducing = 4
    N = 3
    Nsamples = 6e6

    def setUp(self):
        self.kerns = (
-#                       (GPy.kern.rbf(self.D, ARD=True) +
-#                        GPy.kern.linear(self.D, ARD=True) +
-#                        GPy.kern.bias(self.D) +
-#                        GPy.kern.white(self.D)),
-                      (GPy.kern.rbf(self.D, np.random.rand(), np.random.rand(self.D), ARD=True) +
-                       GPy.kern.rbf(self.D, np.random.rand(), np.random.rand(self.D), ARD=True) +
-                       GPy.kern.linear(self.D, np.random.rand(self.D), ARD=True) +
-                       GPy.kern.bias(self.D) +
-                       GPy.kern.white(self.D)),
-#                       GPy.kern.rbf(self.D), GPy.kern.rbf(self.D, ARD=True),
-#                       GPy.kern.linear(self.D, ARD=False), GPy.kern.linear(self.D, ARD=True),
-#                       GPy.kern.linear(self.D) + GPy.kern.bias(self.D),
-#                       GPy.kern.rbf(self.D) + GPy.kern.bias(self.D),
-#                       GPy.kern.linear(self.D) + GPy.kern.bias(self.D) + GPy.kern.white(self.D),
-#                       GPy.kern.rbf(self.D) + GPy.kern.bias(self.D) + GPy.kern.white(self.D),
-#                       GPy.kern.bias(self.D), GPy.kern.white(self.D),
+#                       (GPy.kern.rbf(self.input_dim, ARD=True) +
+#                        GPy.kern.linear(self.input_dim, ARD=True) +
+#                        GPy.kern.bias(self.input_dim) +
+#                        GPy.kern.white(self.input_dim)),
+                      (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
+                       GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
+                       GPy.kern.linear(self.input_dim, np.random.rand(self.input_dim), ARD=True) +
+                       GPy.kern.bias(self.input_dim) +
+                       GPy.kern.white(self.input_dim)),
+#                       GPy.kern.rbf(self.input_dim), GPy.kern.rbf(self.input_dim, ARD=True),
+#                       GPy.kern.linear(self.input_dim, ARD=False), GPy.kern.linear(self.input_dim, ARD=True),
+#                       GPy.kern.linear(self.input_dim) + GPy.kern.bias(self.input_dim),
+#                       GPy.kern.rbf(self.input_dim) + GPy.kern.bias(self.input_dim),
+#                       GPy.kern.linear(self.input_dim) + GPy.kern.bias(self.input_dim) + GPy.kern.white(self.input_dim),
+#                       GPy.kern.rbf(self.input_dim) + GPy.kern.bias(self.input_dim) + GPy.kern.white(self.input_dim),
+#                       GPy.kern.bias(self.input_dim), GPy.kern.white(self.input_dim),
                      )
-        self.q_x_mean = np.random.randn(self.D)
-        self.q_x_variance = np.exp(np.random.randn(self.D))
-        self.q_x_samples = np.random.randn(self.Nsamples, self.D) * np.sqrt(self.q_x_variance) + self.q_x_mean
-        self.Z = np.random.randn(self.M, self.D)
-        self.q_x_mean.shape = (1, self.D)
-        self.q_x_variance.shape = (1, self.D)
+        self.q_x_mean = np.random.randn(self.input_dim)
+        self.q_x_variance = np.exp(np.random.randn(self.input_dim))
+        self.q_x_samples = np.random.randn(self.Nsamples, self.input_dim) * np.sqrt(self.q_x_variance) + self.q_x_mean
+        self.Z = np.random.randn(self.num_inducing, self.input_dim)
+        self.q_x_mean.shape = (1, self.input_dim)
+        self.q_x_variance.shape = (1, self.input_dim)

    def test_psi0(self):
        for kern in self.kerns:
@ -63,7 +63,7 @@ class Test(unittest.TestCase):
        for kern in self.kerns:
            Nsamples = 100
            psi1 = kern.psi1(self.Z, self.q_x_mean, self.q_x_variance)
-            K_ = np.zeros((Nsamples, self.M))
+            K_ = np.zeros((Nsamples, self.num_inducing))
            diffs = []
            for i, q_x_sample_stripe in enumerate(np.array_split(self.q_x_samples, self.Nsamples / Nsamples)):
                K = kern.K(q_x_sample_stripe, self.Z)
@ -89,7 +89,7 @@ class Test(unittest.TestCase):
        for kern in self.kerns:
            Nsamples = 100
            psi2 = kern.psi2(self.Z, self.q_x_mean, self.q_x_variance)
-            K_ = np.zeros((self.M, self.M))
+            K_ = np.zeros((self.num_inducing, self.num_inducing))
            diffs = []
            for i, q_x_sample_stripe in enumerate(np.array_split(self.q_x_samples, self.Nsamples / Nsamples)):
                K = kern.K(q_x_sample_stripe, self.Z)
--- a/GPy/testing/psi_stat_gradient_tests.py
+++ b/GPy/testing/psi_stat_gradient_tests.py
@ -8,23 +8,23 @@ import numpy

 import GPy
 import itertools
-from GPy.core import model
+from GPy.core import Model

-class PsiStatModel(model):
-    def __init__(self, which, X, X_variance, Z, M, kernel):
+class PsiStatModel(Model):
+    def __init__(self, which, X, X_variance, Z, num_inducing, kernel):
        self.which = which
        self.X = X
        self.X_variance = X_variance
        self.Z = Z
        self.N, self.input_dim = X.shape
-        self.M, input_dim = Z.shape
+        self.num_inducing, input_dim = Z.shape
        assert self.input_dim == input_dim, "shape missmatch: Z:{!s} X:{!s}".format(Z.shape, X.shape)
        self.kern = kernel
        super(PsiStatModel, self).__init__()
        self.psi_ = self.kern.__getattribute__(self.which)(self.Z, self.X, self.X_variance)
    def _get_param_names(self):
        Xnames = ["{}_{}_{}".format(what, i, j) for what, i, j in itertools.product(['X', 'X_variance'], range(self.N), range(self.input_dim))]
-        Znames = ["Z_{}_{}".format(i, j) for i, j in itertools.product(range(self.M), range(self.input_dim))]
+        Znames = ["Z_{}_{}".format(i, j) for i, j in itertools.product(range(self.num_inducing), range(self.input_dim))]
        return Xnames + Znames + self.kern._get_param_names()
    def _get_params(self):
        return numpy.hstack([self.X.flatten(), self.X_variance.flatten(), self.Z.flatten(), self.kern._get_params()])
@ -34,7 +34,7 @@ class PsiStatModel(model):
        start, end = end, end + self.X_variance.size
        self.X_variance = x[start: end].reshape(self.N, self.input_dim)
        start, end = end, end + self.Z.size
-        self.Z = x[start: end].reshape(self.M, self.input_dim)
+        self.Z = x[start: end].reshape(self.num_inducing, self.input_dim)
        self.kern._set_params(x[end:])
    def log_likelihood(self):
        return self.kern.__getattribute__(self.which)(self.Z, self.X, self.X_variance).sum()
@ -43,19 +43,19 @@ class PsiStatModel(model):
        try:
            psiZ = self.kern.__getattribute__("d" + self.which + "_dZ")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance)
        except AttributeError:
-            psiZ = numpy.zeros(self.M * self.input_dim)
+            psiZ = numpy.zeros(self.num_inducing * self.input_dim)
        thetagrad = self.kern.__getattribute__("d" + self.which + "_dtheta")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance).flatten()
        return numpy.hstack((psimu.flatten(), psiS.flatten(), psiZ.flatten(), thetagrad))

 class DPsiStatTest(unittest.TestCase):
    input_dim = 5
    N = 50
-    M = 10
-    D = 20
+    num_inducing = 10
+    input_dim = 20
    X = numpy.random.randn(N, input_dim)
    X_var = .5 * numpy.ones_like(X) + .4 * numpy.clip(numpy.random.randn(*X.shape), 0, 1)
-    Z = numpy.random.permutation(X)[:M]
-    Y = X.dot(numpy.random.randn(input_dim, D))
+    Z = numpy.random.permutation(X)[:num_inducing]
+    Y = X.dot(numpy.random.randn(input_dim, input_dim))
 #     kernels = [GPy.kern.linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)), GPy.kern.rbf(input_dim, ARD=True), GPy.kern.bias(input_dim)]

    kernels = [GPy.kern.linear(input_dim), GPy.kern.rbf(input_dim), GPy.kern.bias(input_dim),
@ -65,42 +65,39 @@ class DPsiStatTest(unittest.TestCase):
    def testPsi0(self):
        for k in self.kernels:
            m = PsiStatModel('psi0', X=self.X, X_variance=self.X_var, Z=self.Z,
-                         M=self.M, kernel=k)
-            try:
-                assert m.checkgrad(), "{} x psi0".format("+".join(map(lambda x: x.name, k.parts)))
-            except:
-                import ipdb;ipdb.set_trace()
+                             num_inducing=self.num_inducing, kernel=k)
+            assert m.checkgrad(), "{} x psi0".format("+".join(map(lambda x: x.name, k.parts)))

 #     def testPsi1(self):
 #         for k in self.kernels:
 #             m = PsiStatModel('psi1', X=self.X, X_variance=self.X_var, Z=self.Z,
-#                      M=self.M, kernel=k)
+#                      num_inducing=self.num_inducing, kernel=k)
 #             assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k.parts)))

    def testPsi2_lin(self):
        k = self.kernels[0]
        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
-                     M=self.M, kernel=k)
+                     num_inducing=self.num_inducing, kernel=k)
        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))
    def testPsi2_lin_bia(self):
        k = self.kernels[3]
        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
-                     M=self.M, kernel=k)
+                     num_inducing=self.num_inducing, kernel=k)
        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))
    def testPsi2_rbf(self):
        k = self.kernels[1]
        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
-                     M=self.M, kernel=k)
+                     num_inducing=self.num_inducing, kernel=k)
        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))
    def testPsi2_rbf_bia(self):
        k = self.kernels[-1]
        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
-                     M=self.M, kernel=k)
+                     num_inducing=self.num_inducing, kernel=k)
        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))
    def testPsi2_bia(self):
        k = self.kernels[2]
        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
-                     M=self.M, kernel=k)
+                     num_inducing=self.num_inducing, kernel=k)
        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))


@ -108,25 +105,25 @@ if __name__ == "__main__":
    import sys
    interactive = 'i' in sys.argv
    if interactive:
-#         N, M, input_dim, D = 30, 5, 4, 30
+#         N, num_inducing, input_dim, input_dim = 30, 5, 4, 30
 #         X = numpy.random.rand(N, input_dim)
 #         k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
 #         K = k.K(X)
-#         Y = numpy.random.multivariate_normal(numpy.zeros(N), K, D).T
+#         Y = numpy.random.multivariate_normal(numpy.zeros(N), K, input_dim).T
 #         Y -= Y.mean(axis=0)
 #         k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
-#         m = GPy.models.Bayesian_GPLVM(Y, input_dim, kernel=k, M=M)
+#         m = GPy.models.Bayesian_GPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
 #         m.ensure_default_constraints()
 #         m.randomize()
 # #         self.assertTrue(m.checkgrad())
        numpy.random.seed(0)
        input_dim = 5
        N = 50
-        M = 10
+        num_inducing = 10
        D = 15
        X = numpy.random.randn(N, input_dim)
        X_var = .5 * numpy.ones_like(X) + .1 * numpy.clip(numpy.random.randn(*X.shape), 0, 1)
-        Z = numpy.random.permutation(X)[:M]
+        Z = numpy.random.permutation(X)[:num_inducing]
        Y = X.dot(numpy.random.randn(input_dim, D))
 #         kernel = GPy.kern.bias(input_dim)
 #
@ -136,22 +133,22 @@ if __name__ == "__main__":

 #         for k in kernels:
 #             m = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
-#                      M=M, kernel=k)
+#                      num_inducing=num_inducing, kernel=k)
 #             assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k.parts)))
 #
 #         m0 = PsiStatModel('psi0', X=X, X_variance=X_var, Z=Z,
-#                          M=M, kernel=GPy.kern.linear(input_dim))
+#                          num_inducing=num_inducing, kernel=GPy.kern.linear(input_dim))
 #         m1 = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
-#                          M=M, kernel=kernel)
+#                          num_inducing=num_inducing, kernel=kernel)
 #         m1 = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
-#                          M=M, kernel=kernel)
+#                          num_inducing=num_inducing, kernel=kernel)
 #         m2 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
-#                          M=M, kernel=GPy.kern.rbf(input_dim))
+#                          num_inducing=num_inducing, kernel=GPy.kern.rbf(input_dim))
        m3 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
-                         M=M, kernel=GPy.kern.linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)))
+                         num_inducing=num_inducing, kernel=GPy.kern.linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)))
        m3.ensure_default_constraints()
        # + GPy.kern.bias(input_dim))
 #         m4 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
-#                          M=M, kernel=GPy.kern.rbf(input_dim) + GPy.kern.bias(input_dim))
+#                          num_inducing=num_inducing, kernel=GPy.kern.rbf(input_dim) + GPy.kern.bias(input_dim))
    else:
        unittest.main()
--- a/GPy/testing/sparse_gplvm_tests.py
+++ b/GPy/testing/sparse_gplvm_tests.py
@ -4,41 +4,42 @@
 import unittest
 import numpy as np
 import GPy
+from GPy.models.sparse_gplvm import SparseGPLVM

 class sparse_GPLVMTests(unittest.TestCase):
    def test_bias_kern(self):
-        N, M, input_dim, D = 10, 3, 2, 4
+        N, num_inducing, input_dim, D = 10, 3, 2, 4
        X = np.random.rand(N, input_dim)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
-        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
        k = GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
-        m = GPy.models.sparse_GPLVM(Y, input_dim, kernel = k, M=M)
+        m = SparseGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())

    @unittest.skip('linear kernels do not have dKdiag_dX')
    def test_linear_kern(self):
-        N, M, input_dim, D = 10, 3, 2, 4
+        N, num_inducing, input_dim, D = 10, 3, 2, 4
        X = np.random.rand(N, input_dim)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
-        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
        k = GPy.kern.linear(input_dim) + GPy.kern.white(input_dim, 0.00001)
-        m = GPy.models.sparse_GPLVM(Y, input_dim, kernel = k, M=M)
+        m = SparseGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())

    def test_rbf_kern(self):
-        N, M, input_dim, D = 10, 3, 2, 4
+        N, num_inducing, input_dim, D = 10, 3, 2, 4
        X = np.random.rand(N, input_dim)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
-        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
-        m = GPy.models.sparse_GPLVM(Y, input_dim, kernel = k, M=M)
+        m = SparseGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())
--- a/GPy/testing/unit_tests.py
+++ b/GPy/testing/unit_tests.py
@ -5,33 +5,33 @@
 import unittest
 import numpy as np
 import GPy
+from GPy.likelihoods.likelihood_functions import Binomial

 class GradientTests(unittest.TestCase):
    def setUp(self):
        ######################################
-        ## 1 dimensional example
+        # # 1 dimensional example

        # sample inputs and outputs
-        self.X1D = np.random.uniform(-3.,3.,(20,1))
-        self.Y1D = np.sin(self.X1D)+np.random.randn(20,1)*0.05
+        self.X1D = np.random.uniform(-3., 3., (20, 1))
+        self.Y1D = np.sin(self.X1D) + np.random.randn(20, 1) * 0.05

        ######################################
-        ## 2 dimensional example
+        # # 2 dimensional example

        # sample inputs and outputs
-        self.X2D = np.random.uniform(-3.,3.,(40,2))
-        self.Y2D = np.sin(self.X2D[:,0:1]) * np.sin(self.X2D[:,1:2])+np.random.randn(40,1)*0.05
+        self.X2D = np.random.uniform(-3., 3., (40, 2))
+        self.Y2D = np.sin(self.X2D[:, 0:1]) * np.sin(self.X2D[:, 1:2]) + np.random.randn(40, 1) * 0.05

-    def check_model_with_white(self, kern, model_type='GP_regression', dimension=1):
-        #Get the correct gradients
+    def check_model_with_white(self, kern, model_type='GPRegression', dimension=1):
+        # Get the correct gradients
        if dimension == 1:
            X = self.X1D
            Y = self.Y1D
        else:
            X = self.X2D
            Y = self.Y2D
-
-        #Get model type (GP_regression, GP_sparse_regression, etc)
+        # Get model type (GPRegression, SparseGPRegression, etc)
        model_fit = getattr(GPy.models, model_type)

        noise = GPy.kern.white(dimension)
@ -42,114 +42,114 @@ class GradientTests(unittest.TestCase):
        # contrain all parameters to be positive
        self.assertTrue(m.checkgrad())

-    def test_gp_regression_rbf_1d(self):
+    def test_GPRegression_rbf_1d(self):
        ''' Testing the GP regression with rbf kernel with white kernel on 1d data '''
        rbf = GPy.kern.rbf(1)
-        self.check_model_with_white(rbf, model_type='GP_regression', dimension=1)
+        self.check_model_with_white(rbf, model_type='GPRegression', dimension=1)

-    def test_GP_regression_rbf_2D(self):
+    def test_GPRegression_rbf_2D(self):
        ''' Testing the GP regression with rbf and white kernel on 2d data '''
        rbf = GPy.kern.rbf(2)
-        self.check_model_with_white(rbf, model_type='GP_regression', dimension=2)
+        self.check_model_with_white(rbf, model_type='GPRegression', dimension=2)

-    def test_GP_regression_rbf_ARD_2D(self):
+    def test_GPRegression_rbf_ARD_2D(self):
        ''' Testing the GP regression with rbf and white kernel on 2d data '''
-        k = GPy.kern.rbf(2,ARD=True)
-        self.check_model_with_white(k, model_type='GP_regression', dimension=2)
+        k = GPy.kern.rbf(2, ARD=True)
+        self.check_model_with_white(k, model_type='GPRegression', dimension=2)

-    def test_GP_regression_matern52_1D(self):
+    def test_GPRegression_matern52_1D(self):
        ''' Testing the GP regression with matern52 kernel on 1d data '''
        matern52 = GPy.kern.Matern52(1)
-        self.check_model_with_white(matern52, model_type='GP_regression', dimension=1)
+        self.check_model_with_white(matern52, model_type='GPRegression', dimension=1)

-    def test_GP_regression_matern52_2D(self):
+    def test_GPRegression_matern52_2D(self):
        ''' Testing the GP regression with matern52 kernel on 2d data '''
        matern52 = GPy.kern.Matern52(2)
-        self.check_model_with_white(matern52, model_type='GP_regression', dimension=2)
+        self.check_model_with_white(matern52, model_type='GPRegression', dimension=2)

-    def test_GP_regression_matern52_ARD_2D(self):
+    def test_GPRegression_matern52_ARD_2D(self):
        ''' Testing the GP regression with matern52 kernel on 2d data '''
-        matern52 = GPy.kern.Matern52(2,ARD=True)
-        self.check_model_with_white(matern52, model_type='GP_regression', dimension=2)
+        matern52 = GPy.kern.Matern52(2, ARD=True)
+        self.check_model_with_white(matern52, model_type='GPRegression', dimension=2)

-    def test_GP_regression_matern32_1D(self):
+    def test_GPRegression_matern32_1D(self):
        ''' Testing the GP regression with matern32 kernel on 1d data '''
        matern32 = GPy.kern.Matern32(1)
-        self.check_model_with_white(matern32, model_type='GP_regression', dimension=1)
+        self.check_model_with_white(matern32, model_type='GPRegression', dimension=1)

-    def test_GP_regression_matern32_2D(self):
+    def test_GPRegression_matern32_2D(self):
        ''' Testing the GP regression with matern32 kernel on 2d data '''
        matern32 = GPy.kern.Matern32(2)
-        self.check_model_with_white(matern32, model_type='GP_regression', dimension=2)
+        self.check_model_with_white(matern32, model_type='GPRegression', dimension=2)

-    def test_GP_regression_matern32_ARD_2D(self):
+    def test_GPRegression_matern32_ARD_2D(self):
        ''' Testing the GP regression with matern32 kernel on 2d data '''
-        matern32 = GPy.kern.Matern32(2,ARD=True)
-        self.check_model_with_white(matern32, model_type='GP_regression', dimension=2)
+        matern32 = GPy.kern.Matern32(2, ARD=True)
+        self.check_model_with_white(matern32, model_type='GPRegression', dimension=2)

-    def test_GP_regression_exponential_1D(self):
+    def test_GPRegression_exponential_1D(self):
        ''' Testing the GP regression with exponential kernel on 1d data '''
        exponential = GPy.kern.exponential(1)
-        self.check_model_with_white(exponential, model_type='GP_regression', dimension=1)
+        self.check_model_with_white(exponential, model_type='GPRegression', dimension=1)

-    def test_GP_regression_exponential_2D(self):
+    def test_GPRegression_exponential_2D(self):
        ''' Testing the GP regression with exponential kernel on 2d data '''
        exponential = GPy.kern.exponential(2)
-        self.check_model_with_white(exponential, model_type='GP_regression', dimension=2)
+        self.check_model_with_white(exponential, model_type='GPRegression', dimension=2)

-    def test_GP_regression_exponential_ARD_2D(self):
+    def test_GPRegression_exponential_ARD_2D(self):
        ''' Testing the GP regression with exponential kernel on 2d data '''
-        exponential = GPy.kern.exponential(2,ARD=True)
-        self.check_model_with_white(exponential, model_type='GP_regression', dimension=2)
+        exponential = GPy.kern.exponential(2, ARD=True)
+        self.check_model_with_white(exponential, model_type='GPRegression', dimension=2)

-    def test_GP_regression_bias_kern_1D(self):
+    def test_GPRegression_bias_kern_1D(self):
        ''' Testing the GP regression with bias kernel on 1d data '''
        bias = GPy.kern.bias(1)
-        self.check_model_with_white(bias, model_type='GP_regression', dimension=1)
+        self.check_model_with_white(bias, model_type='GPRegression', dimension=1)

-    def test_GP_regression_bias_kern_2D(self):
+    def test_GPRegression_bias_kern_2D(self):
        ''' Testing the GP regression with bias kernel on 2d data '''
        bias = GPy.kern.bias(2)
-        self.check_model_with_white(bias, model_type='GP_regression', dimension=2)
+        self.check_model_with_white(bias, model_type='GPRegression', dimension=2)

-    def test_GP_regression_linear_kern_1D_ARD(self):
+    def test_GPRegression_linear_kern_1D_ARD(self):
        ''' Testing the GP regression with linear kernel on 1d data '''
-        linear = GPy.kern.linear(1,ARD=True)
-        self.check_model_with_white(linear, model_type='GP_regression', dimension=1)
+        linear = GPy.kern.linear(1, ARD=True)
+        self.check_model_with_white(linear, model_type='GPRegression', dimension=1)

-    def test_GP_regression_linear_kern_2D_ARD(self):
+    def test_GPRegression_linear_kern_2D_ARD(self):
        ''' Testing the GP regression with linear kernel on 2d data '''
-        linear = GPy.kern.linear(2,ARD=True)
-        self.check_model_with_white(linear, model_type='GP_regression', dimension=2)
+        linear = GPy.kern.linear(2, ARD=True)
+        self.check_model_with_white(linear, model_type='GPRegression', dimension=2)

-    def test_GP_regression_linear_kern_1D(self):
+    def test_GPRegression_linear_kern_1D(self):
        ''' Testing the GP regression with linear kernel on 1d data '''
        linear = GPy.kern.linear(1)
-        self.check_model_with_white(linear, model_type='GP_regression', dimension=1)
+        self.check_model_with_white(linear, model_type='GPRegression', dimension=1)

-    def test_GP_regression_linear_kern_2D(self):
+    def test_GPRegression_linear_kern_2D(self):
        ''' Testing the GP regression with linear kernel on 2d data '''
        linear = GPy.kern.linear(2)
-        self.check_model_with_white(linear, model_type='GP_regression', dimension=2)
+        self.check_model_with_white(linear, model_type='GPRegression', dimension=2)

-    def test_sparse_GP_regression_rbf_white_kern_1d(self):
+    def test_SparseGPRegression_rbf_white_kern_1d(self):
        ''' Testing the sparse GP regression with rbf kernel with white kernel on 1d data '''
        rbf = GPy.kern.rbf(1)
-        self.check_model_with_white(rbf, model_type='sparse_GP_regression', dimension=1)
+        self.check_model_with_white(rbf, model_type='SparseGPRegression', dimension=1)

-    def test_sparse_GP_regression_rbf_white_kern_2D(self):
+    def test_SparseGPRegression_rbf_white_kern_2D(self):
        ''' Testing the sparse GP regression with rbf and white kernel on 2d data '''
        rbf = GPy.kern.rbf(2)
-        self.check_model_with_white(rbf, model_type='sparse_GP_regression', dimension=2)
+        self.check_model_with_white(rbf, model_type='SparseGPRegression', dimension=2)

    def test_GPLVM_rbf_bias_white_kern_2D(self):
        """ Testing GPLVM with rbf + bias and white kernel """
        N, input_dim, D = 50, 1, 2
        X = np.random.rand(N, input_dim)
-        k = GPy.kern.rbf(input_dim, 0.5, 0.9*np.ones((1,))) + GPy.kern.bias(input_dim, 0.1) + GPy.kern.white(input_dim, 0.05)
+        k = GPy.kern.rbf(input_dim, 0.5, 0.9 * np.ones((1,))) + GPy.kern.bias(input_dim, 0.1) + GPy.kern.white(input_dim, 0.05)
        K = k.K(X)
-        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
-        m = GPy.models.GPLVM(Y, input_dim, kernel = k)
+        Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
+        m = GPy.models.GPLVM(Y, input_dim, kernel=k)
        m.ensure_default_constraints()
        self.assertTrue(m.checkgrad())

@ -159,43 +159,46 @@ class GradientTests(unittest.TestCase):
        X = np.random.rand(N, input_dim)
        k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim, 0.1) + GPy.kern.white(input_dim, 0.05)
        K = k.K(X)
-        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
-        m = GPy.models.GPLVM(Y, input_dim, init = 'PCA', kernel = k)
+        Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
+        m = GPy.models.GPLVM(Y, input_dim, init='PCA', kernel=k)
        m.ensure_default_constraints()
        self.assertTrue(m.checkgrad())

    def test_GP_EP_probit(self):
        N = 20
-        X = np.hstack([np.random.normal(5,2,N/2),np.random.normal(10,2,N/2)])[:,None]
-        Y = np.hstack([np.ones(N/2),np.zeros(N/2)])[:,None]
+        X = np.hstack([np.random.normal(5, 2, N / 2), np.random.normal(10, 2, N / 2)])[:, None]
+        Y = np.hstack([np.ones(N / 2), np.zeros(N / 2)])[:, None]
        kernel = GPy.kern.rbf(1)
-        distribution = GPy.likelihoods.likelihood_functions.binomial()
+        distribution = GPy.likelihoods.likelihood_functions.Binomial()
        likelihood = GPy.likelihoods.EP(Y, distribution)
        m = GPy.core.GP(X, likelihood, kernel)
        m.ensure_default_constraints()
        m.update_likelihood_approximation()
        self.assertTrue(m.checkgrad())
-        #self.assertTrue(m.EPEM)
+        # self.assertTrue(m.EPEM)

    def test_sparse_EP_DTC_probit(self):
        N = 20
-        X = np.hstack([np.random.normal(5,2,N/2),np.random.normal(10,2,N/2)])[:,None]
-        Y = np.hstack([np.ones(N/2),np.zeros(N/2)])[:,None]
-        Z = np.linspace(0,15,4)[:,None]
+        X = np.hstack([np.random.normal(5, 2, N / 2), np.random.normal(10, 2, N / 2)])[:, None]
+        Y = np.hstack([np.ones(N / 2), np.zeros(N / 2)])[:, None]
+        Z = np.linspace(0, 15, 4)[:, None]
        kernel = GPy.kern.rbf(1)
-        distribution = GPy.likelihoods.likelihood_functions.binomial()
+        distribution = GPy.likelihoods.likelihood_functions.Binomial()
        likelihood = GPy.likelihoods.EP(Y, distribution)
-        m = GPy.core.sparse_GP(X, likelihood, kernel,Z)
+        m = GPy.core.SparseGP(X, likelihood, kernel, Z)
        m.ensure_default_constraints()
        m.update_likelihood_approximation()
        self.assertTrue(m.checkgrad())

    def test_generalized_FITC(self):
        N = 20
-        X = np.hstack([np.random.rand(N/2)+1,np.random.rand(N/2)-1])[:,None]
+        X = np.hstack([np.random.rand(N / 2) + 1, np.random.rand(N / 2) - 1])[:, None]
        k = GPy.kern.rbf(1) + GPy.kern.white(1)
        Y = np.hstack([np.ones(N/2),-np.ones(N/2)])[:,None]
-        likelihood = GPy.inference.likelihoods.binomial(Y)
+
+        distribution = GPy.likelihoods.likelihood_functions.Binomial()
+        likelihood = GPy.likelihoods.EP(Y, distribution)
+        #likelihood = GPy.inference.likelihoods.Binomial(Y)
        m = GPy.models.generalized_FITC(X,likelihood,k,inducing=4)
        m.constrain_positive('(var|len)')
        m.approximate_likelihood()
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@ -22,7 +22,7 @@ def fetch_dataset(resource, save_name = None, save_file = True, messages = True)
        print "Downloading resource: " , resource, " ... ",
    response = url.urlopen(resource)
    # TODO: Some error checking...
-    # ... 
+    # ...
    html = response.read()
    response.close()
    if save_file:
@ -33,8 +33,6 @@ def fetch_dataset(resource, save_name = None, save_file = True, messages = True)
            if messages:
                print "Done!"
    return html
-        
-    

 def della_gatta_TRP63_gene_expression(gene_number=None):
    mat_data = scipy.io.loadmat(os.path.join(data_path, 'DellaGattadata.mat'))
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@ -1,87 +1,80 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-#tdot function courtesy of Ian Murray:
+# tdot function courtesy of Ian Murray:
 # Iain Murray, April 2013. iain contactable via iainmurray.net
 # http://homepages.inf.ed.ac.uk/imurray2/code/tdot/tdot.py

 import numpy as np
-from scipy import linalg, optimize, weave
-import pylab as pb
-import Tango
-import sys
-import re
-import pdb
-import cPickle
+from scipy import linalg, weave
 import types
 import ctypes
 from ctypes import byref, c_char, c_int, c_double # TODO
-#import scipy.lib.lapack
-import scipy as sp
+# import scipy.lib.lapack

 try:
-    _blaslib = ctypes.cdll.LoadLibrary(np.core._dotblas.__file__)
+    _blaslib = ctypes.cdll.LoadLibrary(np.core._dotblas.__file__) # @UndefinedVariable
    _blas_available = True
 except:
    _blas_available = False

-def trace_dot(a,b):
+def trace_dot(a, b):
    """
    efficiently compute the trace of the matrix product of a and b
    """
-    return np.sum(a*b)
+    return np.sum(a * b)

 def mdot(*args):
-   """Multiply all the arguments using matrix product rules.
-   The output is equivalent to multiplying the arguments one by one
-   from left to right using dot().
-   Precedence can be controlled by creating tuples of arguments,
-   for instance mdot(a,((b,c),d)) multiplies a (a*((b*c)*d)).
-   Note that this means the output of dot(a,b) and mdot(a,b) will differ if
-   a or b is a pure tuple of numbers.
-   """
-   if len(args)==1:
-       return args[0]
-   elif len(args)==2:
-       return _mdot_r(args[0],args[1])
-   else:
-       return _mdot_r(args[:-1],args[-1])
+    """Multiply all the arguments using matrix product rules.
+    The output is equivalent to multiplying the arguments one by one
+    from left to right using dot().
+    Precedence can be controlled by creating tuples of arguments,
+    for instance mdot(a,((b,c),d)) multiplies a (a*((b*c)*d)).
+    Note that this means the output of dot(a,b) and mdot(a,b) will differ if
+    a or b is a pure tuple of numbers.
+    """
+    if len(args) == 1:
+        return args[0]
+    elif len(args) == 2:
+        return _mdot_r(args[0], args[1])
+    else:
+        return _mdot_r(args[:-1], args[-1])

-def _mdot_r(a,b):
-   """Recursive helper for mdot"""
-   if type(a)==types.TupleType:
-       if len(a)>1:
-           a = mdot(*a)
-       else:
-           a = a[0]
-   if type(b)==types.TupleType:
-       if len(b)>1:
-           b = mdot(*b)
-       else:
-           b = b[0]
-   return np.dot(a,b)
+def _mdot_r(a, b):
+    """Recursive helper for mdot"""
+    if type(a) == types.TupleType:
+        if len(a) > 1:
+            a = mdot(*a)
+        else:
+            a = a[0]
+    if type(b) == types.TupleType:
+        if len(b) > 1:
+            b = mdot(*b)
+        else:
+            b = b[0]
+    return np.dot(a, b)

-def jitchol(A,maxtries=5):
+def jitchol(A, maxtries=5):
    A = np.asfortranarray(A)
-    L,info = linalg.lapack.flapack.dpotrf(A,lower=1)
-    if info ==0:
+    L, info = linalg.lapack.flapack.dpotrf(A, lower=1)
+    if info == 0:
        return L
    else:
        diagA = np.diag(A)
-        if np.any(diagA<0.):
+        if np.any(diagA < 0.):
            raise linalg.LinAlgError, "not pd: negative diagonal elements"
-        jitter= diagA.mean()*1e-6
-        for i in range(1,maxtries+1):
+        jitter = diagA.mean() * 1e-6
+        for i in range(1, maxtries + 1):
            print 'Warning: adding jitter of {:.10e}'.format(jitter)
            try:
-                return linalg.cholesky(A+np.eye(A.shape[0]).T*jitter, lower = True)
+                return linalg.cholesky(A + np.eye(A.shape[0]).T * jitter, lower=True)
            except:
                jitter *= 10
-        raise linalg.LinAlgError,"not positive definite, even with jitter."
+        raise linalg.LinAlgError, "not positive definite, even with jitter."



-def jitchol_old(A,maxtries=5):
+def jitchol_old(A, maxtries=5):
    """
    :param A : An almost pd square matrix

@ -93,20 +86,20 @@ def jitchol_old(A,maxtries=5):
      np.allclose(sp.linalg.cholesky(XXT, lower = True), np.triu(sp.linalg.cho_factor(XXT)[0]).T)
    """
    try:
-        return linalg.cholesky(A, lower = True)
+        return linalg.cholesky(A, lower=True)
    except linalg.LinAlgError:
        diagA = np.diag(A)
-        if np.any(diagA<0.):
+        if np.any(diagA < 0.):
            raise linalg.LinAlgError, "not pd: negative diagonal elements"
-        jitter= diagA.mean()*1e-6
-        for i in range(1,maxtries+1):
+        jitter = diagA.mean() * 1e-6
+        for i in range(1, maxtries + 1):
            print '\rWarning: adding jitter of {:.10e}                        '.format(jitter),
            try:
-                return linalg.cholesky(A+np.eye(A.shape[0]).T*jitter, lower = True)
+                return linalg.cholesky(A + np.eye(A.shape[0]).T * jitter, lower=True)
            except:
                jitter *= 10

-        raise linalg.LinAlgError,"not positive definite, even with jitter."
+        raise linalg.LinAlgError, "not positive definite, even with jitter."

 def pdinv(A, *args):
    """
@ -125,7 +118,7 @@ def pdinv(A, *args):
    logdet = 2.*np.sum(np.log(np.diag(L)))
    Li = chol_inv(L)
    Ai, _ = linalg.lapack.flapack.dpotri(L)
-    #Ai = np.tril(Ai) + np.tril(Ai,-1).T
+    # Ai = np.tril(Ai) + np.tril(Ai,-1).T
    symmetrify(Ai)

    return Ai, L, Li, logdet
@ -140,7 +133,7 @@ def chol_inv(L):

    """

-    return linalg.lapack.flapack.dtrtri(L, lower = True)[0]
+    return linalg.lapack.flapack.dtrtri(L, lower=True)[0]


 def multiple_pdinv(A):
@ -155,11 +148,11 @@ def multiple_pdinv(A):
    hld: 0.5* the log of the determinants of A
    """
    N = A.shape[-1]
-    chols = [jitchol(A[:,:,i]) for i in range(N)]
+    chols = [jitchol(A[:, :, i]) for i in range(N)]
    halflogdets = [np.sum(np.log(np.diag(L[0]))) for L in chols]
-    invs = [linalg.lapack.flapack.dpotri(L[0],True)[0] for L in chols]
-    invs = [np.triu(I)+np.triu(I,1).T for I in invs]
-    return np.dstack(invs),np.array(halflogdets)
+    invs = [linalg.lapack.flapack.dpotri(L[0], True)[0] for L in chols]
+    invs = [np.triu(I) + np.triu(I, 1).T for I in invs]
+    return np.dstack(invs), np.array(halflogdets)


 def PCA(Y, input_dim):
@ -179,18 +172,18 @@ def PCA(Y, input_dim):
    if not np.allclose(Y.mean(axis=0), 0.0):
        print "Y is not zero mean, centering it locally (GPy.util.linalg.PCA)"

-        #Y -= Y.mean(axis=0)
+        # Y -= Y.mean(axis=0)

-    Z = linalg.svd(Y-Y.mean(axis=0), full_matrices = False)
-    [X, W] = [Z[0][:,0:input_dim], np.dot(np.diag(Z[1]), Z[2]).T[:,0:input_dim]]
+    Z = linalg.svd(Y - Y.mean(axis=0), full_matrices=False)
+    [X, W] = [Z[0][:, 0:input_dim], np.dot(np.diag(Z[1]), Z[2]).T[:, 0:input_dim]]
    v = X.std(axis=0)
    X /= v;
    W *= v;
    return X, W.T


-def tdot_numpy(mat,out=None):
-    return np.dot(mat,mat.T,out)
+def tdot_numpy(mat, out=None):
+    return np.dot(mat, mat.T, out)

 def tdot_blas(mat, out=None):
    """returns np.dot(mat, mat.T), but faster for large 2D arrays of doubles."""
@ -198,16 +191,16 @@ def tdot_blas(mat, out=None):
        return np.dot(mat, mat.T)
    nn = mat.shape[0]
    if out is None:
-        out = np.zeros((nn,nn))
+        out = np.zeros((nn, nn))
    else:
        assert(out.dtype == 'float64')
-        assert(out.shape == (nn,nn))
+        assert(out.shape == (nn, nn))
        # FIXME: should allow non-contiguous out, and copy output into it:
        assert(8 in out.strides)
        # zeroing needed because of dumb way I copy across triangular answer
        out[:] = 0.0

-    ## Call to DSYRK from BLAS
+    # # Call to DSYRK from BLAS
    # If already in Fortran order (rare), and has the right sorts of strides I
    # could avoid the copy. I also thought swapping to cblas API would allow use
    # of C order. However, I tried that and had errors with large matrices:
@ -226,17 +219,17 @@ def tdot_blas(mat, out=None):
    _blaslib.dsyrk_(byref(UPLO), byref(TRANS), byref(N), byref(K),
            byref(ALPHA), A, byref(LDA), byref(BETA), C, byref(LDC))

-    symmetrify(out,upper=True)
+    symmetrify(out, upper=True)

    return out

 def tdot(*args, **kwargs):
    if _blas_available:
-        return tdot_blas(*args,**kwargs)
+        return tdot_blas(*args, **kwargs)
    else:
-        return tdot_numpy(*args,**kwargs)
+        return tdot_numpy(*args, **kwargs)

-def DSYR_blas(A,x,alpha=1.):
+def DSYR_blas(A, x, alpha=1.):
    """
    Performs a symmetric rank-1 update operation:
    A <- A + alpha * np.dot(x,x.T)
@ -256,9 +249,9 @@ def DSYR_blas(A,x,alpha=1.):
    INCX = c_int(1)
    _blaslib.dsyr_(byref(UPLO), byref(N), byref(ALPHA),
            x_, byref(INCX), A_, byref(LDA))
-    symmetrify(A,upper=True)
+    symmetrify(A, upper=True)

-def DSYR_numpy(A,x,alpha=1.):
+def DSYR_numpy(A, x, alpha=1.):
    """
    Performs a symmetric rank-1 update operation:
    A <- A + alpha * np.dot(x,x.T)
@ -269,23 +262,23 @@ def DSYR_numpy(A,x,alpha=1.):
    :param x: Nx1 np.array
    :param alpha: scalar
    """
-    A += alpha*np.dot(x[:,None],x[None,:])
+    A += alpha * np.dot(x[:, None], x[None, :])


 def DSYR(*args, **kwargs):
    if _blas_available:
-        return DSYR_blas(*args,**kwargs)
+        return DSYR_blas(*args, **kwargs)
    else:
-        return DSYR_numpy(*args,**kwargs)
+        return DSYR_numpy(*args, **kwargs)

-def symmetrify(A,upper=False):
+def symmetrify(A, upper=False):
    """
    Take the square matrix A and make it symmetrical by copting elements from the lower half to the upper

    works IN PLACE.
    """
-    N,M = A.shape
-    assert N==M
+    N, M = A.shape
+    assert N == M
    c_contig_code = """
    int iN;
    for (int i=1; i<N; i++){
@ -305,13 +298,13 @@ def symmetrify(A,upper=False):
    }
    """
    if A.flags['C_CONTIGUOUS'] and upper:
-        weave.inline(f_contig_code,['A','N'], extra_compile_args=['-O3'])
+        weave.inline(f_contig_code, ['A', 'N'], extra_compile_args=['-O3'])
    elif A.flags['C_CONTIGUOUS'] and not upper:
-        weave.inline(c_contig_code,['A','N'], extra_compile_args=['-O3'])
+        weave.inline(c_contig_code, ['A', 'N'], extra_compile_args=['-O3'])
    elif A.flags['F_CONTIGUOUS'] and upper:
-        weave.inline(c_contig_code,['A','N'], extra_compile_args=['-O3'])
+        weave.inline(c_contig_code, ['A', 'N'], extra_compile_args=['-O3'])
    elif A.flags['F_CONTIGUOUS'] and not upper:
-        weave.inline(f_contig_code,['A','N'], extra_compile_args=['-O3'])
+        weave.inline(f_contig_code, ['A', 'N'], extra_compile_args=['-O3'])
    else:
        if upper:
            tmp = np.tril(A.T)
@ -319,15 +312,15 @@ def symmetrify(A,upper=False):
            tmp = np.tril(A)
        A[:] = 0.0
        A += tmp
-        A += np.tril(tmp,-1).T
+        A += np.tril(tmp, -1).T


 def symmetrify_murray(A):
    A += A.T
    nn = A.shape[0]
-    A[[range(nn),range(nn)]] /= 2.0
+    A[[range(nn), range(nn)]] /= 2.0

-def cholupdate(L,x):
+def cholupdate(L, x):
    """
    update the LOWER cholesky factor of a pd matrix IN PLACE

@ -337,7 +330,7 @@ def cholupdate(L,x):
    support_code = """
    #include <math.h>
    """
-    code="""
+    code = """
    double r,c,s;
    int j,i;
    for(j=0; j<N; j++){
@ -353,11 +346,11 @@ def cholupdate(L,x):
    """
    x = x.copy()
    N = x.size
-    weave.inline(code, support_code=support_code, arg_names=['N','L','x'], type_converters=weave.converters.blitz)
+    weave.inline(code, support_code=support_code, arg_names=['N', 'L', 'x'], type_converters=weave.converters.blitz)

-def backsub_both_sides(L, X,transpose='left'):
+def backsub_both_sides(L, X, transpose='left'):
    """ Return L^-T * X * L^-1, assumuing X is symmetrical and L is lower cholesky"""
-    if transpose=='left':
+    if transpose == 'left':
        tmp, _ = linalg.lapack.flapack.dtrtrs(L, np.asfortranarray(X), lower=1, trans=1)
        return linalg.lapack.flapack.dtrtrs(L, np.asfortranarray(tmp.T), lower=1, trans=1)[0].T
    else:
--- a/GPy/util/plot_latent.py
+++ b/GPy/util/plot_latent.py
@ -2,9 +2,9 @@ import pylab as pb
 import numpy as np
 from .. import util

-def plot_latent(model, labels=None, which_indices=None, resolution=50, ax=None, marker='o', s=40):
+def plot_latent(Model, labels=None, which_indices=None, resolution=50, ax=None, marker='o', s=40):
    """
-    :param labels: a np.array of size model.N containing labels for the points (can be number, strings, etc)
+    :param labels: a np.array of size Model.N containing labels for the points (can be number, strings, etc)
    :param resolution: the resolution of the grid on which to evaluate the predictive variance
    """
    if ax is None:
@ -12,26 +12,26 @@ def plot_latent(model, labels=None, which_indices=None, resolution=50, ax=None,
    util.plot.Tango.reset()

    if labels is None:
-        labels = np.ones(model.N)
+        labels = np.ones(Model.N)
    if which_indices is None:
-        if model.input_dim==1:
+        if Model.input_dim==1:
            input_1 = 0
            input_2 = None
-        if model.input_dim==2:
+        if Model.input_dim==2:
            input_1, input_2 = 0,1
        else:
            try:
-                input_1, input_2 = np.argsort(model.input_sensitivity())[:2]
+                input_1, input_2 = np.argsort(Model.input_sensitivity())[:2]
            except:
                raise ValueError, "cannot Atomatically determine which dimensions to plot, please pass 'which_indices'"
    else:
        input_1, input_2 = which_indices

    #first, plot the output variance as a function of the latent space
-    Xtest, xx,yy,xmin,xmax = util.plot.x_frame2D(model.X[:,[input_1, input_2]],resolution=resolution)
-    Xtest_full = np.zeros((Xtest.shape[0], model.X.shape[1]))
+    Xtest, xx,yy,xmin,xmax = util.plot.x_frame2D(Model.X[:,[input_1, input_2]],resolution=resolution)
+    Xtest_full = np.zeros((Xtest.shape[0], Model.X.shape[1]))
    Xtest_full[:, :2] = Xtest
-    mu, var, low, up = model.predict(Xtest_full)
+    mu, var, low, up = Model.predict(Xtest_full)
    var = var[:, :1]
    ax.imshow(var.reshape(resolution, resolution).T,
              extent=[xmin[0], xmax[0], xmin[1], xmax[1]], cmap=pb.cm.binary,interpolation='bilinear',origin='lower')
@ -55,12 +55,12 @@ def plot_latent(model, labels=None, which_indices=None, resolution=50, ax=None,
            m = marker

        index = np.nonzero(labels==ul)[0]
-        if model.input_dim==1:
-            x = model.X[index,input_1]
+        if Model.input_dim==1:
+            x = Model.X[index,input_1]
            y = np.zeros(index.size)
        else:
-            x = model.X[index,input_1]
-            y = model.X[index,input_2]
+            x = Model.X[index,input_1]
+            y = Model.X[index,input_2]
        ax.scatter(x, y, marker=m, s=s, color=util.plot.Tango.nextMedium(), label=this_label)

    ax.set_xlabel('latent dimension %i'%input_1)
@ -76,16 +76,16 @@ def plot_latent(model, labels=None, which_indices=None, resolution=50, ax=None,
    return ax


-def plot_latent_indices(model, which_indices=None, *args, **kwargs):
+def plot_latent_indices(Model, which_indices=None, *args, **kwargs):

    if which_indices is None:
        try:
-            input_1, input_2 = np.argsort(model.input_sensitivity())[:2]
+            input_1, input_2 = np.argsort(Model.input_sensitivity())[:2]
        except:
            raise ValueError, "cannot Automatically determine which dimensions to plot, please pass 'which_indices'"
    else:
        input_1, input_2 = which_indices
-    ax = plot_latent(model, which_indices=[input_1, input_2], *args, **kwargs)
+    ax = plot_latent(Model, which_indices=[input_1, input_2], *args, **kwargs)
    # TODO: Here test if there are inducing points...
-    ax.plot(model.Z[:, input_1], model.Z[:, input_2], '^w')
+    ax.plot(Model.Z[:, input_1], Model.Z[:, input_2], '^w')
    return ax
--- a/GPy/util/visualize.py
+++ b/GPy/util/visualize.py
@ -43,16 +43,16 @@ class vector_show(data_show):


 class lvm(data_show):
-    def __init__(self, vals, model, data_visualize, latent_axes=None, sense_axes=None, latent_index=[0,1]):
-        """Visualize a latent variable model
+    def __init__(self, vals, Model, data_visualize, latent_axes=None, sense_axes=None, latent_index=[0,1]):
+        """Visualize a latent variable Model

-        :param model: the latent variable model to visualize.
+        :param Model: the latent variable Model to visualize.
        :param data_visualize: the object used to visualize the data which has been modelled.
        :type data_visualize: visualize.data_show  type.
        :param latent_axes: the axes where the latent visualization should be plotted.
        """
        if vals == None:
-            vals = model.X[0]
+            vals = Model.X[0]

        data_show.__init__(self, vals, axes=latent_axes)

@ -68,13 +68,13 @@ class lvm(data_show):
            self.cid = latent_axes[0].figure.canvas.mpl_connect('axes_enter_event', self.on_enter)

        self.data_visualize = data_visualize
-        self.model = model
+        self.Model = Model
        self.latent_axes = latent_axes
        self.sense_axes = sense_axes
        self.called = False
        self.move_on = False
        self.latent_index = latent_index
-        self.latent_dim = model.input_dim
+        self.latent_dim = Model.input_dim

        # The red cross which shows current latent point.
        self.latent_values = vals
@ -85,7 +85,7 @@ class lvm(data_show):
    def modify(self, vals):
        """When latent values are modified update the latent representation and ulso update the output visualization."""
        self.vals = vals.copy()
-        y = self.model.predict(self.vals)[0]
+        y = self.Model.predict(self.vals)[0]
        self.data_visualize.modify(y)
        self.latent_handle.set_data(self.vals[self.latent_index[0]], self.vals[self.latent_index[1]])
        self.axes.figure.canvas.draw()
@ -113,15 +113,15 @@ class lvm(data_show):
        # A click in the bar chart axis for selection a dimension.
        if self.sense_axes != None:
            self.sense_axes.cla()
-            self.sense_axes.bar(np.arange(self.model.input_dim),1./self.model.input_sensitivity(),color='b')
+            self.sense_axes.bar(np.arange(self.Model.input_dim),1./self.Model.input_sensitivity(),color='b')

            if self.latent_index[1] == self.latent_index[0]:
-                self.sense_axes.bar(np.array(self.latent_index[0]),1./self.model.input_sensitivity()[self.latent_index[0]],color='y')
-                self.sense_axes.bar(np.array(self.latent_index[1]),1./self.model.input_sensitivity()[self.latent_index[1]],color='y')
+                self.sense_axes.bar(np.array(self.latent_index[0]),1./self.Model.input_sensitivity()[self.latent_index[0]],color='y')
+                self.sense_axes.bar(np.array(self.latent_index[1]),1./self.Model.input_sensitivity()[self.latent_index[1]],color='y')

            else:
-                self.sense_axes.bar(np.array(self.latent_index[0]),1./self.model.input_sensitivity()[self.latent_index[0]],color='g')
-                self.sense_axes.bar(np.array(self.latent_index[1]),1./self.model.input_sensitivity()[self.latent_index[1]],color='r')
+                self.sense_axes.bar(np.array(self.latent_index[0]),1./self.Model.input_sensitivity()[self.latent_index[0]],color='g')
+                self.sense_axes.bar(np.array(self.latent_index[1]),1./self.Model.input_sensitivity()[self.latent_index[1]],color='r')

            self.sense_axes.figure.canvas.draw()

@ -131,21 +131,21 @@ class lvm_subplots(lvm):
    latent_axes is a np array of dimension np.ceil(input_dim/2),
    one for each pair of the latent dimensions.
    """
-    def __init__(self, vals, model, data_visualize, latent_axes=None, sense_axes=None):
-        self.nplots = int(np.ceil(model.input_dim/2.))+1
+    def __init__(self, vals, Model, data_visualize, latent_axes=None, sense_axes=None):
+        self.nplots = int(np.ceil(Model.input_dim/2.))+1
        assert len(latent_axes)==self.nplots
        if vals==None:
-            vals = model.X[0, :]
+            vals = Model.X[0, :]
        self.latent_values = vals 

        for i, axis in enumerate(latent_axes):
            if i == self.nplots-1:
-                if self.nplots*2!=model.input_dim:
+                if self.nplots*2!=Model.input_dim:
                    latent_index = [i*2, i*2]
-                lvm.__init__(self, self.latent_vals, model, data_visualize, axis, sense_axes, latent_index=latent_index)
+                lvm.__init__(self, self.latent_vals, Model, data_visualize, axis, sense_axes, latent_index=latent_index)
            else:
                latent_index = [i*2, i*2+1]
-                lvm.__init__(self, self.latent_vals, model, data_visualize, axis, latent_index=latent_index)
+                lvm.__init__(self, self.latent_vals, Model, data_visualize, axis, latent_index=latent_index)



@ -158,7 +158,7 @@ class lvm_dimselect(lvm):
    GPy.examples.dimensionality_reduction.BGPVLM_oil()

    """
-    def __init__(self, vals, model, data_visualize, latent_axes=None, sense_axes=None, latent_index=[0, 1]):
+    def __init__(self, vals, Model, data_visualize, latent_axes=None, sense_axes=None, latent_index=[0, 1]):
        if latent_axes==None and sense_axes==None:
            self.fig,(latent_axes,self.sense_axes) = plt.subplots(1,2)
        elif sense_axes==None:
@ -167,14 +167,14 @@ class lvm_dimselect(lvm):
        else:
            self.sense_axes = sense_axes
        
-        lvm.__init__(self,vals,model,data_visualize,latent_axes,sense_axes,latent_index)
+        lvm.__init__(self,vals,Model,data_visualize,latent_axes,sense_axes,latent_index)
        print "use left and right mouse butons to select dimensions"


    def on_click(self, event):

        if event.inaxes==self.sense_axes:
-            new_index = max(0,min(int(np.round(event.xdata-0.5)),self.model.input_dim-1))
+            new_index = max(0,min(int(np.round(event.xdata-0.5)),self.Model.input_dim-1))
            if event.button == 1:
                # Make it red if and y-axis (red=port=left) if it is a left button click
                self.latent_index[1] = new_index                
@ -185,7 +185,7 @@ class lvm_dimselect(lvm):
            self.show_sensitivities()

            self.latent_axes.cla()
-            self.model.plot_latent(which_indices=self.latent_index,
+            self.Model.plot_latent(which_indices=self.latent_index,
                                   ax=self.latent_axes)
            self.latent_handle = self.latent_axes.plot([0],[0],'rx',mew=2)[0]
            self.modify(self.latent_values)
@ -199,7 +199,7 @@ class lvm_dimselect(lvm):

    def on_leave(self,event):
        latent_values = self.latent_values.copy()
-        y = self.model.predict(latent_values[None,:])[0]
+        y = self.Model.predict(latent_values[None,:])[0]
        self.data_visualize.modify(y)


@ -221,7 +221,7 @@ class image_show(data_show):
        if not self.palette == []: # Can just show the image (self.set_image() took care of setting the palette)
            self.handle = self.axes.imshow(self.vals, interpolation='nearest')
        else: # Use a boring gray map.
-            self.handle = self.axes.imshow(self.vals, cmap=plt.cm.gray, interpolation='nearest')
+            self.handle = self.axes.imshow(self.vals, cmap=plt.cm.gray, interpolation='nearest') # @UndefinedVariable
        plt.show()

    def modify(self, vals):