Merge branch 'newGP' of github.com:SheffieldML/GPy into newGP

Conflicts: GPy/likelihoods/EP.py GPy/likelihoods/likelihood_functions.py
2026-05-08 11:32:39 +02:00 · 2013-02-01 13:19:59 +00:00 · 2013-02-01 13:19:59 +00:00 · 879fa138e1
commit 879fa138e1
parent 182c4c7d64 346f9dd8bd
7 changed files with 464 additions and 369 deletions
--- a/GPy/likelihoods/EP.py
+++ b/GPy/likelihoods/EP.py
@ -1,11 +1,9 @@
 import numpy as np
 import random
 from scipy import stats, linalg
 #from ..core import model
 from ..util.linalg import pdinv,mdot,jitchol
-from ..util.plot import gpplot
+from likelihood import likelihood
-class EP:
+class EP(likelihood):
    def __init__(self,data,likelihood_function,epsilon=1e-3,power_ep=[1.,1.]):
        """
        Expectation Propagation
@ -20,11 +18,10 @@ class EP:
        self.eta, self.delta = power_ep
        self.data = data
        self.N = self.data.size
        self.is_heteroscedastic = True
-        """
+        #Initial values - Likelihood approximation parameters:
-        Initial values - Likelihood approximation parameters:
+        #p(y|f) = t(f|tau_tilde,v_tilde)
        p(y|f) = t(f|tau_tilde,v_tilde)
        """
        self.tau_tilde = np.zeros(self.N)
        self.v_tilde = np.zeros(self.N)
@ -51,9 +48,11 @@ class EP:
        mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model
        sigma_sum = 1./self.tau_ + 1./self.tau_tilde
        mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2
-        Z_ep = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant
+        self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep
-        self.Y, self.beta, self.Z =  mu_tilde[:,None],self.tau_tilde[:,None], Z_ep
+
-        self.variance = np.diag(1./self.beta.flatten())
+        self.Y =  mu_tilde[:,None]
        self.precsion = self.tau_tilde[:,None]
        self.covariance_matrix = np.diag(1./self.precision)
    def fit_full(self,K):
        """
--- a/GPy/likelihoods/Gaussian.py
+++ b/GPy/likelihoods/Gaussian.py
@ -1,7 +1,9 @@
 import numpy as np
 from likelihood import likelihood
-class Gaussian:
+class Gaussian(likelihood):
    def __init__(self,data,variance=1.,normalize=False):
        self.is_heteroscedastic = False
        self.data = data
        self.N,D = data.shape
        self.Z = 0. # a correction factor which accounts for the approximation made
@ -19,6 +21,7 @@ class Gaussian:
        self.YYT = np.dot(self.Y,self.Y.T)
        self._set_params(np.asarray(variance))
    def _get_params(self):
        return np.asarray(self._variance)
@ -27,7 +30,8 @@ class Gaussian:
    def _set_params(self,x):
        self._variance = x
-        self.variance = np.eye(self.N)*self._variance
+        self.covariance_matrix = np.eye(self.N)*self._variance
        self.precision = 1./self._variance
    def predictive_values(self,mu,var):
        """
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@ -0,0 +1,35 @@
 import numpy as np
 class likelihood:
    """
    The atom for a likelihood class
    This object interfaces the GP and the data. The most basic likelihood
    (Gaussian) inherits directly from this, as does the EP algorithm
    Some things must be defined for this to work properly:
    self.Y : the effective Gaussian target of the GP
    self.N, self.D : Y.shape
    self.covariance_matrix : the effective (noise) covariance of the GP targets
    self.Z : a factor which gets added to the likelihood (0 for a Gaussian, Z_EP for EP)
    self.is_heteroscedastic : enables significant computational savings in GP
    self.precision : a scalar or vector representation of the effective target precision
    self.YYT : (optional) = np.dot(self.Y, self.Y.T) enables computational savings for D>N
    """
    def __init__(self,data):
        raise ValueError, "this class is not to be instantiated"
    def _get_params(self):
        raise NotImplementedError
    def _get_param_names(self):
        raise NotImplementedError
    def _set_params(self,x):
        raise NotImplementedError
    def fit(self):
        raise NotImplementedError
    def _gradients(self,partial):
        raise NotImplementedError
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@ -9,18 +9,22 @@ import pylab as pb
 from ..util.plot import gpplot
 #from . import EP
-class likelihood:
+class likelihood_function:
    """
    Likelihood class for doing Expectation propagation
    :param Y: observed output (Nx1 numpy.darray)
-    ..Note:: Y values allowed depend on the likelihood used
+    ..Note:: Y values allowed depend on the likelihood_function used
    """
    def __init__(self,location=0,scale=1):
        self.location = location
        self.scale = scale
 <<<<<<< HEAD
 class Probit(likelihood):
 =======
 class probit(likelihood_function):
 >>>>>>> 346f9dd8bd3207959b87ded258e55aeb094f1ea3
    """
    Probit likelihood
    Y is expected to take values in {-1,1}
@ -29,6 +33,11 @@ class Probit(likelihood):
    L(x) = \\Phi (Y_i*f_i)
    $$
    """
 <<<<<<< HEAD
 =======
    def __init__(self,location=0,scale=1):
        likelihood_function.__init__(self,Y,location,scale)
 >>>>>>> 346f9dd8bd3207959b87ded258e55aeb094f1ea3
    def moments_match(self,data_i,tau_i,v_i):
        """
@ -57,7 +66,11 @@ class Probit(likelihood):
        p_95 = np.ones([mu.size])
        return mean, p_05, p_95
 <<<<<<< HEAD
 class Poisson(likelihood):
 =======
 class poisson(likelihood_function):
 >>>>>>> 346f9dd8bd3207959b87ded258e55aeb094f1ea3
    """
    Poisson likelihood
    Y is expected to take values in {0,1,2,...}
@ -66,6 +79,12 @@ class Poisson(likelihood):
    L(x) = \exp(\lambda) * \lambda**Y_i / Y_i!
    $$
    """
 <<<<<<< HEAD
 =======
    def __init__(self,Y,location=0,scale=1):
        assert len(Y[Y<0]) == 0, "Output cannot have negative values"
        likelihood_function.__init__(self,Y,location,scale)
 >>>>>>> 346f9dd8bd3207959b87ded258e55aeb094f1ea3
    def moments_match(self,i,tau_i,v_i):
        """
@ -129,7 +148,54 @@ class Poisson(likelihood):
        Compute  mean, and conficence interval (percentiles 5 and 95) of the  prediction
        """
        mean = np.exp(mu*self.scale + self.location)
 <<<<<<< HEAD
        tmp = stats.poisson.ppf(np.array([.05,.95]),mu)
        p_05 = tmp[:,0]
        p_95 = tmp[:,1]
        return mean,p_05,p_95
 =======
        if all:
            tmp = stats.poisson.ppf(np.array([.05,.95]),mu)
            p_05 = tmp[:,0]
            p_95 = tmp[:,1]
            return mean,mean,p_05,p_95
        else:
            return mean
    def _log_likelihood_gradients():
        raise NotImplementedError
    def plot(self,X,mu,var,phi,X_obs,Z=None,samples=0):
        assert X_obs.shape[1] == 1, 'Number of dimensions must be 1'
        gpplot(X,phi,phi.flatten())
        pb.plot(X_obs,self.Y,'kx',mew=1.5)
        if samples:
            phi_samples = np.vstack([np.random.poisson(phi.flatten(),phi.size) for s in range(samples)])
            pb.plot(X,phi_samples.T, alpha = 0.4, c='#3465a4', linewidth = 0.8)
        if Z is not None:
            pb.plot(Z,Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12)
 class gaussian(likelihood_function):
    """
    Gaussian likelihood
    Y is expected to take values in (-inf,inf)
    """
    def moments_match(self,i,tau_i,v_i):
        """
        Moments match of the marginal approximation in EP algorithm
        :param i: number of observation (int)
        :param tau_i: precision of the cavity distribution (float)
        :param v_i: mean/variance of the cavity distribution (float)
        """
        mu = v_i/tau_i
        sigma = np.sqrt(1./tau_i)
        s = 1. if self.Y[i] == 0 else 1./self.Y[i]
        sigma2_hat = 1./(1./sigma**2 + 1./s**2)
        mu_hat = sigma2_hat*(mu/sigma**2 + self.Y[i]/s**2)
        Z_hat = 1./np.sqrt(2*np.pi) * 1./np.sqrt(sigma**2+s**2) * np.exp(-.5*(mu-self.Y[i])**2/(sigma**2 + s**2))
        return Z_hat, mu_hat, sigma2_hat
    def _log_likelihood_gradients():
        raise NotImplementedError
 >>>>>>> 346f9dd8bd3207959b87ded258e55aeb094f1ea3
--- a/GPy/models/sparse_GP.py
+++ b/GPy/models/sparse_GP.py
@ -6,37 +6,36 @@ import pylab as pb
 from ..util.linalg import mdot, jitchol, chol_inv, pdinv
 from ..util.plot import gpplot
 from .. import kern
 from ..inference.likelihoods import likelihood
 from GP import GP
 #Still TODO:
 # make use of slices properly (kernel can now do this)
 # enable heteroscedatic noise (kernel will need to compute psi2 as a (NxMxM) array)
 class sparse_GP(GP):
    """
-    Variational sparse GP model (Regression)
+    Variational sparse GP model
    :param X: inputs
    :type X: np.ndarray (N x Q)
-    :param Y: observed data
+    :param likelihood: a likelihood instance, containing the observed data
-    :type Y: np.ndarray of observations (N x D)
+    :type likelihood: GPy.likelihood.(Gaussian | EP)
    :param kernel : the kernel/covariance function. See link kernels
    :type kernel: a GPy kernel
    :param Z: inducing inputs (optional, see note)
    :type Z: np.ndarray (M x Q) | None
    :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
    :type X_uncertainty: np.ndarray (N x Q) | None
    :param Z: inducing inputs (optional, see note)
    :type Z: np.ndarray (M x Q) | None
    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
    :param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
    :type M: int
    :param beta: noise precision. TODO> ignore beta if doing EP
    :type beta: float
    :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
    :type normalize_(X|Y): bool
    """
-    def __init__(self,X,Y=None,kernel=None,X_uncertainty=None,beta=100.,Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False,likelihood=None,method_ep='DTC',epsilon_ep=1e-3,power_ep=[1.,1.]):
+    def __init__(self,X,likelihood,kernel, X_uncertainty=None, Z=None,Zslices=None,M=10,normalize_X=False):
        self.scale_factor = 1000.0# a scaling factor to help keep the algorithm stable
        if Z is None:
            self.Z = np.random.permutation(X.copy())[:M]
@ -52,140 +51,91 @@ class sparse_GP(GP):
            self.has_uncertain_inputs=True
            self.X_uncertainty = X_uncertainty
-        GP.__init__(self, X=X, Y=Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y,likelihood=likelihood,epsilon_ep=epsilon_ep,power_ep=power_ep)
+        GP.__init__(self, X, Y, kernel=kernel, normalize_X=normalize_X, Xslices=Xslices)
        #normalise X uncertainty also
        if self.has_uncertain_inputs:
            self.X_uncertainty /= np.square(self._Xstd)
        if not self.EP:
            self.trYYT = np.sum(np.square(self.Y))
        else:
            self.method_ep = method_ep
        #normalise X uncertainty also
        if self.has_uncertain_inputs:
            self.X_uncertainty /= np.square(self._Xstd)
    def _set_params(self, p):
        self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
        if not self.EP:
            self.beta = p[self.M*self.Q]
            self.kern._set_params(p[self.Z.size + 1:])
        else:
            self.kern._set_params(p[self.Z.size:])
            if self.Y is None:
                self.Y = np.ones([self.N,1])
        self._compute_kernel_matrices()
        self._computations()
    def _get_params(self):
        if not self.EP:
            return np.hstack([self.Z.flatten(),self.beta,self.kern._get_params_transformed()])
        else:
            return np.hstack([self.Z.flatten(),self.kern._get_params_transformed()])
    def _get_param_names(self):
        if not self.EP:
            return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern._get_param_names_transformed()
        else:
            return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + self.kern._get_param_names_transformed()
    def _compute_kernel_matrices(self):
        # kernel computations, using BGPLVM notation
        #TODO: slices for psi statistics (easy enough)
        self.Kmm = self.kern.K(self.Z)
        if self.has_uncertain_inputs:
            if not self.EP:
                self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty)#.sum() NOTE psi0 is now a vector
                self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
                self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
                #self.psi2_beta_scaled = ?
            else:
                raise NotImplementedError, "uncertain_inputs not yet supported for EP"
        else:
            self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices)#.sum()
            self.psi1 = self.kern.K(self.Z,self.X)
            self.psi2 = np.dot(self.psi1,self.psi1.T)
            self.psi2_beta_scaled = np.dot(self.psi1,self.beta*self.psi1.T)
    def _computations(self):
        # TODO find routine to multiply triangular matrices
-        self.V = self.beta*self.Y
+        #TODO: slices for psi statistics (easy enough)
        sf = self.scale_factor
        sf2 = sf**2
        # kernel computations, using BGPLVM notation
        self.Kmm = self.kern.K(self.Z)
        if self.has_uncertain_inputs:
            self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum()
            self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
            self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
            self.psi2_beta_scaled = (self.psi2*(self.beta/sf2)).sum(0)
        else:
            self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum()
            self.psi1 = self.kern.K(self.Z,self.X)
            tmp = self.psi1*(np.sqrt(self.likelihood.beta)/sf)
            self.psi2_beta_scaled = np.dot(tmp,tmp.T)
        self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)#+np.eye(self.M)*1e-3)
        self.V = (self.likelihood.beta/self.scale_factor)*self.Y
        self.A = mdot(self.Lmi, self.psi2_beta_scaled, self.Lmi.T)
        self.B = np.eye(self.M)/sf2 + self.A
        self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)
        self.psi1V = np.dot(self.psi1, self.V)
        self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
-        self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
+        self.C = mdot(self.Lmi.T, self.Bi, self.Lmi)
-        self.A = mdot(self.Lmi, self.psi2_beta_scaled, self.Lmi.T)
+        self.E = mdot(self.C, self.psi1VVpsi1/sf2, self.C.T)
        self.B = np.eye(self.M) + self.A
        self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)
        self.LLambdai = np.dot(self.LBi, self.Lmi)
        self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi)
        self.C = mdot(self.LLambdai, self.psi1V)
        self.G =  mdot(self.LBL_inv, self.psi1VVpsi1, self.LBL_inv.T)
        self.trace_K_beta_scaled = (self.psi0*self.beta).sum() - np.trace(self.A)
        if not self.EP:
            self.trace_K = self.psi0.sum() - np.trace(self.A)/self.beta
-        # Compute dL_dpsi
+        # Compute dL_dpsi # FIXME
-        self.dL_dpsi1 = mdot(self.LLambdai.T,self.C,self.V.T)
+        self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N)
-        if not self.EP:
+        self.dL_dpsi1 = mdot(self.V, self.psi1V.T,self.C).T
-            self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N)
+        self.dL_dpsi2 = 0.5 * self.beta * self.D * self.Kmmi[None,:,:] # dB
-            if self.has_uncertain_inputs:
+        self.dL_dpsi2 += - 0.5 * self.beta/sf2 * self.D * self.C[None,:,:] # dC
-                self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G)
+        self.dL_dpsi2 += - 0.5 * self.beta * self.E[None,:,:] # dD
            else:
                self.dL_dpsi2_ = - 0.5 * (self.D*(self.LBL_inv - self.Kmmi) + self.G)
        else:
            self.dL_dpsi0 = - 0.5 * self.D * self.beta.flatten()
            if not self.has_uncertain_inputs:
                self.dL_dpsi2_ = - 0.5 * (self.D*(self.LBL_inv - self.Kmmi) + self.G)
        # Compute dL_dKmm
-        self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi) # dB
+        self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi)*sf2 # dB
-        self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*mdot(self.LBL_inv, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC
+        self.dL_dKmm += -0.5 * self.D * (- self.C/sf2 - 2.*mdot(self.C, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC
-        self.dL_dKmm +=  np.dot(np.dot(self.G,self.psi2_beta_scaled) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE
+        self.dL_dKmm +=  np.dot(np.dot(self.E*sf2, self.psi2_beta_scaled) - np.dot(self.C, self.psi1VVpsi1), self.Kmmi) + 0.5*self.E # dD
-    def approximate_likelihood(self):
+
-        assert not isinstance(self.likelihood, gaussian), "EP is only available for non-gaussian likelihoods"
+    def _set_params(self, p):
-        if self.method_ep == 'DTC':
+        self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
-            self.ep_approx = DTC(self.Kmm,self.likelihood,self.psi1,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta])
+        self.beta = p[self.M*self.Q] # FIXME
-        elif self.method_ep == 'FITC':
+        self.kern._set_params(p[self.Z.size + 1:])
            self.ep_approx = FITC(self.Kmm,self.likelihood,self.psi1,self.psi0,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta])
        else:
            self.ep_approx = Full(self.X,self.likelihood,self.kernel,inducing=None,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta])
        self.beta, self.Y, self.Z_ep = self.ep_approx.fit_EP()
        self.trbetaYYT = np.sum(np.square(self.Y)*self.beta)
        self._computations()
    def _get_params(self):
        return np.hstack([self.Z.flatten(),GP._get_params(self))
    def _get_param_names(self):
        return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + GP._get_param_names(self)
    def log_likelihood(self):
-        """
+        """ Compute the (lower bound on the) log marginal likelihood """
-        Compute the (lower bound on the) log marginal likelihood
+        sf2 = self.scale_factor**2
-        """
+        A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta)) -0.5*self.beta*self.trYYT # FIXME
-        if not self.EP:
+        B = -0.5*self.D*(self.beta*self.psi0-np.trace(self.A)*sf2)# FIXME
-            A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta))
+        C = -0.5*self.D * (self.B_logdet + self.M*np.log(sf2))
-            D = -0.5*self.beta*self.trYYT
+        D = +0.5*np.sum(self.psi1VVpsi1 * self.C)
-        else:
+        return A+B+C+D
            A = -0.5*self.D*(self.N*np.log(2.*np.pi) - np.sum(np.log(self.beta)))
            D = -0.5*self.trbetaYYT
        B = -0.5*self.D*self.trace_K_beta_scaled
        C = -0.5*self.D * self.B_logdet
        E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv)
        return A+B+C+D+E
    def _log_likelihood_gradients(self):
        return np.hstack([self.dL_dZ().flatten(), GP._log_likelihood_gradients(self)])
        # FIXME: move this into the lieklihood class
    def dL_dbeta(self):
-        """
+        sf2 = self.scale_factor**2
-        Compute the gradient of the log likelihood wrt beta.
+        dA_dbeta =   0.5 * self.N*self.D/self.beta - 0.5 * self.trYYT
-        """
+        dB_dbeta = - 0.5 * self.D * (self.psi0 - np.trace(self.A)/self.beta*sf2)
        #TODO: suport heteroscedatic noise
        dA_dbeta =   0.5 * self.N*self.D/self.beta
        dB_dbeta = - 0.5 * self.D * self.trace_K
        dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)/self.beta
-        dD_dbeta = - 0.5 * self.trYYT
+        dD_dbeta = np.sum((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) * self.psi1VVpsi1 )/self.beta
        tmp = mdot(self.LBi.T, self.LLambdai, self.psi1V)
        dE_dbeta = (np.sum(np.square(self.C)) - 0.5 * np.sum(self.A * np.dot(tmp, tmp.T)))/self.beta
-        return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta)
+        return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta)
    def dL_dtheta(self):
        """
@ -195,10 +145,10 @@ class sparse_GP(GP):
        if self.has_uncertain_inputs:
            dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty)
            dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
-            dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape
+            dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.dL_dpsi1.T, self.Z,self.X, self.X_uncertainty)
        else:
            #re-cast computations in psi2 back to psi1:
-            dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2_,self.beta.T*self.psi1) #dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
+            dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2.sum(0),self.psi1)
            dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X)
            dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X)
@ -208,48 +158,36 @@ class sparse_GP(GP):
        """
        The derivative of the bound wrt the inducing inputs Z
        """
-        dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z,)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ
+        dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ
        if self.has_uncertain_inputs:
-            dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
+            dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1,self.Z,self.X, self.X_uncertainty)
-            dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty)
+            dL_dZ += 2.*self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # 'stripes'
        else:
            #re-cast computations in psi2 back to psi1:
-            dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2_,self.beta.T*self.psi1)#dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
+            dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2.sum(0),self.psi1)
            dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X)
        return dL_dZ
    def _log_likelihood_gradients(self):
        if not self.EP:
            return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()])
        else:
            return np.hstack([self.dL_dZ().flatten(), self.dL_dtheta()])
    def _raw_predict(self, Xnew, slices, full_cov=False):
        """Internal helper function for making predictions, does not account for normalisation"""
        Kx = self.kern.K(self.Z, Xnew)
-        mu = mdot(Kx.T, self.LBL_inv, self.psi1V)
+        mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V)
-        phi = None
+
        if full_cov:
            Kxx = self.kern.K(Xnew)
-            var = Kxx - mdot(Kx.T, (self.Kmmi - self.LBL_inv), Kx)
+            var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx)
            if not self.EP:
                var += np.eye(Xnew.shape[0])/self.beta
            else:
                raise NotImplementedError, "full_cov = True not implemented for EP"
        else:
            Kxx = self.kern.Kdiag(Xnew)
-            var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.LBL_inv, Kx),0)
+            var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0)
-            if not self.EP:
+
-                var += 1./self.beta
+        return mu,var
            else:
                phi = self.likelihood.predictive_mean(mu,var)
        return mu,var,phi
    def plot(self, *args, **kwargs):
        """
        Plot the fitted model: just call the GP_regression plot function and then add inducing inputs
        """
-        GP.plot(self,*args,**kwargs)
+        GP_regression.plot(self,*args,**kwargs)
        if self.Q==1:
            pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12)
            if self.has_uncertain_inputs:
--- a/GPy/models/sparse_GP_old.py
+++ b/GPy/models/sparse_GP_old.py
@ -0,0 +1,258 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 import pylab as pb
 from ..util.linalg import mdot, jitchol, chol_inv, pdinv
 from ..util.plot import gpplot
 from .. import kern
 from GP import GP
 #Still TODO:
 # make use of slices properly (kernel can now do this)
 # enable heteroscedatic noise (kernel will need to compute psi2 as a (NxMxM) array)
 class sparse_GP(GP):
    """
    Variational sparse GP model (Regression)
    :param X: inputs
    :type X: np.ndarray (N x Q)
    :param Y: observed data
    :type Y: np.ndarray of observations (N x D)
    :param kernel : the kernel/covariance function. See link kernels
    :type kernel: a GPy kernel
    :param Z: inducing inputs (optional, see note)
    :type Z: np.ndarray (M x Q) | None
    :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
    :type X_uncertainty: np.ndarray (N x Q) | None
    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
    :param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
    :type M: int
    :param beta: noise precision. TODO> ignore beta if doing EP
    :type beta: float
    :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
    :type normalize_(X|Y): bool
    """
    def __init__(self,X,Y=None,kernel=None,X_uncertainty=None,beta=100.,Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False,likelihood=None,method_ep='DTC',epsilon_ep=1e-3,power_ep=[1.,1.]):
        if Z is None:
            self.Z = np.random.permutation(X.copy())[:M]
            self.M = M
        else:
            assert Z.shape[1]==X.shape[1]
            self.Z = Z
            self.M = Z.shape[0]
        if X_uncertainty is None:
            self.has_uncertain_inputs=False
        else:
            assert X_uncertainty.shape==X.shape
            self.has_uncertain_inputs=True
            self.X_uncertainty = X_uncertainty
        GP.__init__(self, X=X, Y=Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y,likelihood=likelihood,epsilon_ep=epsilon_ep,power_ep=power_ep)
        #normalise X uncertainty also
        if self.has_uncertain_inputs:
            self.X_uncertainty /= np.square(self._Xstd)
        if not self.EP:
            self.trYYT = np.sum(np.square(self.Y))
        else:
            self.method_ep = method_ep
        #normalise X uncertainty also
        if self.has_uncertain_inputs:
            self.X_uncertainty /= np.square(self._Xstd)
    def _set_params(self, p):
        self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
        if not self.EP:
            self.beta = p[self.M*self.Q]
            self.kern._set_params(p[self.Z.size + 1:])
        else:
            self.kern._set_params(p[self.Z.size:])
            if self.Y is None:
                self.Y = np.ones([self.N,1])
        self._compute_kernel_matrices()
        self._computations()
    def _get_params(self):
        if not self.EP:
            return np.hstack([self.Z.flatten(),self.beta,self.kern._get_params_transformed()])
        else:
            return np.hstack([self.Z.flatten(),self.kern._get_params_transformed()])
    def _get_param_names(self):
        if not self.EP:
            return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern._get_param_names_transformed()
        else:
            return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + self.kern._get_param_names_transformed()
    def _compute_kernel_matrices(self):
        # kernel computations, using BGPLVM notation
        #TODO: slices for psi statistics (easy enough)
        self.Kmm = self.kern.K(self.Z)
        if self.has_uncertain_inputs:
            if not self.EP:
                self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty)#.sum() NOTE psi0 is now a vector
                self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
                self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
                #self.psi2_beta_scaled = ?
            else:
                raise NotImplementedError, "uncertain_inputs not yet supported for EP"
        else:
            self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices)#.sum()
            self.psi1 = self.kern.K(self.Z,self.X)
            self.psi2 = np.dot(self.psi1,self.psi1.T)
            self.psi2_beta_scaled = np.dot(self.psi1,self.beta*self.psi1.T)
    def _computations(self):
        # TODO find routine to multiply triangular matrices
        self.V = self.beta*self.Y
        self.psi1V = np.dot(self.psi1, self.V)
        self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
        self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
        self.A = mdot(self.Lmi, self.psi2_beta_scaled, self.Lmi.T)
        self.B = np.eye(self.M) + self.A
        self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)
        self.LLambdai = np.dot(self.LBi, self.Lmi)
        self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi)
        self.C = mdot(self.LLambdai, self.psi1V)
        self.G =  mdot(self.LBL_inv, self.psi1VVpsi1, self.LBL_inv.T)
        self.trace_K_beta_scaled = (self.psi0*self.beta).sum() - np.trace(self.A)
        if not self.EP:
            self.trace_K = self.psi0.sum() - np.trace(self.A)/self.beta
        # Compute dL_dpsi
        self.dL_dpsi1 = mdot(self.LLambdai.T,self.C,self.V.T)
        if not self.EP:
            self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N)
            if self.has_uncertain_inputs:
                self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G)
            else:
                self.dL_dpsi2_ = - 0.5 * (self.D*(self.LBL_inv - self.Kmmi) + self.G)
        else:
            self.dL_dpsi0 = - 0.5 * self.D * self.beta.flatten()
            if not self.has_uncertain_inputs:
                self.dL_dpsi2_ = - 0.5 * (self.D*(self.LBL_inv - self.Kmmi) + self.G)
        # Compute dL_dKmm
        self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi) # dB
        self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*mdot(self.LBL_inv, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC
        self.dL_dKmm +=  np.dot(np.dot(self.G,self.psi2_beta_scaled) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE
    def approximate_likelihood(self):
        assert not isinstance(self.likelihood, gaussian), "EP is only available for non-gaussian likelihoods"
        if self.method_ep == 'DTC':
            self.ep_approx = DTC(self.Kmm,self.likelihood,self.psi1,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta])
        elif self.method_ep == 'FITC':
            self.ep_approx = FITC(self.Kmm,self.likelihood,self.psi1,self.psi0,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta])
        else:
            self.ep_approx = Full(self.X,self.likelihood,self.kernel,inducing=None,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta])
        self.beta, self.Y, self.Z_ep = self.ep_approx.fit_EP()
        self.trbetaYYT = np.sum(np.square(self.Y)*self.beta)
        self._computations()
    def log_likelihood(self):
        """
        Compute the (lower bound on the) log marginal likelihood
        """
        if not self.EP:
            A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta))
            D = -0.5*self.beta*self.trYYT
        else:
            A = -0.5*self.D*(self.N*np.log(2.*np.pi) - np.sum(np.log(self.beta)))
            D = -0.5*self.trbetaYYT
        B = -0.5*self.D*self.trace_K_beta_scaled
        C = -0.5*self.D * self.B_logdet
        E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv)
        return A+B+C+D+E
    def dL_dbeta(self):
        """
        Compute the gradient of the log likelihood wrt beta.
        """
        #TODO: suport heteroscedatic noise
        dA_dbeta =   0.5 * self.N*self.D/self.beta
        dB_dbeta = - 0.5 * self.D * self.trace_K
        dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)/self.beta
        dD_dbeta = - 0.5 * self.trYYT
        tmp = mdot(self.LBi.T, self.LLambdai, self.psi1V)
        dE_dbeta = (np.sum(np.square(self.C)) - 0.5 * np.sum(self.A * np.dot(tmp, tmp.T)))/self.beta
        return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta)
    def dL_dtheta(self):
        """
        Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel
        """
        dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z)
        if self.has_uncertain_inputs:
            dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty)
            dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
            dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape
        else:
            #re-cast computations in psi2 back to psi1:
            dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2_,self.beta.T*self.psi1) #dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
            dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X)
            dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X)
        return dL_dtheta
    def dL_dZ(self):
        """
        The derivative of the bound wrt the inducing inputs Z
        """
        dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z,)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ
        if self.has_uncertain_inputs:
            dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
            dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty)
        else:
            #re-cast computations in psi2 back to psi1:
            dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2_,self.beta.T*self.psi1)#dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
            dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X)
        return dL_dZ
    def _log_likelihood_gradients(self):
        if not self.EP:
            return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()])
        else:
            return np.hstack([self.dL_dZ().flatten(), self.dL_dtheta()])
    def _raw_predict(self, Xnew, slices, full_cov=False):
        """Internal helper function for making predictions, does not account for normalisation"""
        Kx = self.kern.K(self.Z, Xnew)
        mu = mdot(Kx.T, self.LBL_inv, self.psi1V)
        phi = None
        if full_cov:
            Kxx = self.kern.K(Xnew)
            var = Kxx - mdot(Kx.T, (self.Kmmi - self.LBL_inv), Kx)
            if not self.EP:
                var += np.eye(Xnew.shape[0])/self.beta
            else:
                raise NotImplementedError, "full_cov = True not implemented for EP"
        else:
            Kxx = self.kern.Kdiag(Xnew)
            var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.LBL_inv, Kx),0)
            if not self.EP:
                var += 1./self.beta
            else:
                phi = self.likelihood.predictive_mean(mu,var)
        return mu,var,phi
    def plot(self, *args, **kwargs):
        """
        Plot the fitted model: just call the GP_regression plot function and then add inducing inputs
        """
        GP.plot(self,*args,**kwargs)
        if self.Q==1:
            pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12)
            if self.has_uncertain_inputs:
                pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten()))
        if self.Q==2:
            pb.plot(self.Z[:,0],self.Z[:,1],'wo')
--- a/GPy/models/sparse_GP_regression.py
+++ b/GPy/models/sparse_GP_regression.py
@ -1,205 +0,0 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 import pylab as pb
 from ..util.linalg import mdot, jitchol, chol_inv, pdinv
 from ..util.plot import gpplot
 from .. import kern
 from ..inference.likelihoods import likelihood
 from GP_regression import GP_regression
 #Still TODO:
 # make use of slices properly (kernel can now do this)
 # enable heteroscedatic noise (kernel will need to compute psi2 as a (NxMxM) array)
 class sparse_GP_regression(GP_regression):
    """
    Variational sparse GP model (Regression)
    :param X: inputs
    :type X: np.ndarray (N x Q)
    :param Y: observed data
    :type Y: np.ndarray of observations (N x D)
    :param kernel : the kernel/covariance function. See link kernels
    :type kernel: a GPy kernel
    :param Z: inducing inputs (optional, see note)
    :type Z: np.ndarray (M x Q) | None
    :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
    :type X_uncertainty: np.ndarray (N x Q) | None
    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
    :param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
    :type M: int
    :param beta: noise precision. TODO> ignore beta if doing EP
    :type beta: float
    :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
    :type normalize_(X|Y): bool
    """
    def __init__(self,X,Y,kernel=None, X_uncertainty=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False):
        self.scale_factor = 1000.0
        self.beta = beta
        if Z is None:
            self.Z = np.random.permutation(X.copy())[:M]
            self.M = M
        else:
            assert Z.shape[1]==X.shape[1]
            self.Z = Z
            self.M = Z.shape[0]
        if X_uncertainty is None:
            self.has_uncertain_inputs=False
        else:
            assert X_uncertainty.shape==X.shape
            self.has_uncertain_inputs=True
            self.X_uncertainty = X_uncertainty
        GP_regression.__init__(self, X, Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y)
        self.trYYT = np.sum(np.square(self.Y))
        #normalise X uncertainty also
        if self.has_uncertain_inputs:
            self.X_uncertainty /= np.square(self._Xstd)
    def _computations(self):
        # TODO find routine to multiply triangular matrices
        #TODO: slices for psi statistics (easy enough)
        # kernel computations, using BGPLVM notation
        self.Kmm = self.kern.K(self.Z)
        if self.has_uncertain_inputs:
            self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum()
            self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
            self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
            self.psi2_beta_scaled = (self.psi2*(self.beta/self.scale_factor**2)).sum(0)
        else:
            self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum()
            self.psi1 = self.kern.K(self.Z,self.X)
            #self.psi2 = np.dot(self.psi1,self.psi1.T)
            #self.psi2 = self.psi1.T[:,:,None]*self.psi1.T[:,None,:]
            tmp = self.psi1/(self.scale_factor/np.sqrt(self.beta))
            self.psi2_beta_scaled = np.dot(tmp,tmp.T)
        sf = self.scale_factor
        sf2 = sf**2
        self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)#+np.eye(self.M)*1e-3)
        self.V = (self.beta/self.scale_factor)*self.Y
        self.A = mdot(self.Lmi, self.psi2_beta_scaled, self.Lmi.T)
        self.B = np.eye(self.M)/sf2 + self.A
        self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)
        self.psi1V = np.dot(self.psi1, self.V)
        self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
        self.C = mdot(self.Lmi.T, self.Bi, self.Lmi)
        self.E = mdot(self.C, self.psi1VVpsi1/sf2, self.C.T)
        # Compute dL_dpsi
        self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N)
        self.dL_dpsi1 = mdot(self.V, self.psi1V.T,self.C).T
        self.dL_dpsi2 = 0.5 * self.beta * self.D * self.Kmmi[None,:,:] # dB
        self.dL_dpsi2 += - 0.5 * self.beta/sf2 * self.D * self.C[None,:,:] # dC
        self.dL_dpsi2 += - 0.5 * self.beta * self.E[None,:,:] # dD
        # Compute dL_dKmm
        self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi)*sf2 # dB
        self.dL_dKmm += -0.5 * self.D * (- self.C/sf2 - 2.*mdot(self.C, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC
        self.dL_dKmm +=  np.dot(np.dot(self.E*sf2, self.psi2_beta_scaled) - np.dot(self.C, self.psi1VVpsi1), self.Kmmi) + 0.5*self.E # dD
    def _set_params(self, p):
        self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
        self.beta = p[self.M*self.Q]
        self.kern._set_params(p[self.Z.size + 1:])
        self._computations()
    def _get_params(self):
        return np.hstack([self.Z.flatten(),self.beta,self.kern._get_params_transformed()])
    def _get_param_names(self):
        return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern._get_param_names_transformed()
    def log_likelihood(self):
        """ Compute the (lower bound on the) log marginal likelihood """
        sf2 = self.scale_factor**2
        A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta)) -0.5*self.beta*self.trYYT
        B = -0.5*self.D*(self.beta*self.psi0-np.trace(self.A)*sf2)
        C = -0.5*self.D * (self.B_logdet + self.M*np.log(sf2))
        D = +0.5*np.sum(self.psi1VVpsi1 * self.C)
        return A+B+C+D
    def _log_likelihood_gradients(self):
        return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()])
    def dL_dbeta(self):
        """
        Compute the gradient of the log likelihood wrt beta.
        """
        #TODO: suport heteroscedatic noise
        sf2 = self.scale_factor**2
        dA_dbeta =   0.5 * self.N*self.D/self.beta - 0.5 * self.trYYT
        dB_dbeta = - 0.5 * self.D * (self.psi0 - np.trace(self.A)/self.beta*sf2)
        dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)/self.beta
        dD_dbeta = np.sum((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) * self.psi1VVpsi1 )/self.beta
        return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta)
    def dL_dtheta(self):
        """
        Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel
        """
        dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z)
        if self.has_uncertain_inputs:
            dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty)
            dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
            dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.dL_dpsi1.T, self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape
        else:
            #re-cast computations in psi2 back to psi1:
            dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2.sum(0),self.psi1)
            dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X)
            dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X)
        return dL_dtheta
    def dL_dZ(self):
        """
        The derivative of the bound wrt the inducing inputs Z
        """
        dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ
        if self.has_uncertain_inputs:
            dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1,self.Z,self.X, self.X_uncertainty)
            dL_dZ += 2.*self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # 'stripes'
        else:
            #re-cast computations in psi2 back to psi1:
            dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2.sum(0),self.psi1)
            dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X)
        return dL_dZ
    def _raw_predict(self, Xnew, slices, full_cov=False):
        """Internal helper function for making predictions, does not account for normalisation"""
        Kx = self.kern.K(self.Z, Xnew)
        mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V)
        if full_cov:
            Kxx = self.kern.K(Xnew)
            var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) + np.eye(Xnew.shape[0])/self.beta # TODO: This beta doesn't belong here in the EP case.
        else:
            Kxx = self.kern.Kdiag(Xnew)
            var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0) + 1./self.beta # TODO: This beta doesn't belong here in the EP case.
        return mu,var
    def plot(self, *args, **kwargs):
        """
        Plot the fitted model: just call the GP_regression plot function and then add inducing inputs
        """
        GP_regression.plot(self,*args,**kwargs)
        if self.Q==1:
            pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12)
            if self.has_uncertain_inputs:
                pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten()))
        if self.Q==2:
            pb.plot(self.Z[:,0],self.Z[:,1],'wo')