Merge pull request #542 from SheffieldML/EP_refactor

Merge in the EP refactoring
This commit is contained in:
Zhenwen Dai 2017-09-06 10:08:08 +01:00 committed by GitHub
commit 0fb273a41c
9 changed files with 721 additions and 202 deletions

View file

@ -6,9 +6,111 @@ from paramz import ObsAr
from . import ExactGaussianInference, VarDTC from . import ExactGaussianInference, VarDTC
from ...util import diag from ...util import diag
from .posterior import PosteriorEP as Posterior from .posterior import PosteriorEP as Posterior
from ...likelihoods import Gaussian
log_2_pi = np.log(2*np.pi) log_2_pi = np.log(2*np.pi)
#Four wrapper classes to help modularisation of different EP versions
class marginalMoments(object):
    """Per-site marginal moments of the tilted distribution.

    Holds the normalising constant Z_hat, mean mu_hat and variance
    sigma2_hat for each of the num_data sites, as flat float64 arrays.
    """
    def __init__(self, num_data):
        shape = (num_data,)
        self.Z_hat = np.empty(shape, dtype=np.float64)
        self.mu_hat = np.empty(shape, dtype=np.float64)
        self.sigma2_hat = np.empty(shape, dtype=np.float64)
class cavityParams(object):
    """Cavity distribution natural parameters (tau, v), one entry per site."""
    def __init__(self, num_data):
        self.tau = np.empty(num_data, dtype=np.float64)
        self.v = np.empty(num_data, dtype=np.float64)

    def _update_i(self, eta, ga_approx, post_params, i):
        """Form cavity i by removing a fraction eta of site i from the posterior marginal."""
        sigma_i = post_params.Sigma_diag[i]
        self.tau[i] = 1. / sigma_i - eta * ga_approx.tau[i]
        self.v[i] = post_params.mu[i] / sigma_i - eta * ga_approx.v[i]
class gaussianApproximation(object):
    """Gaussian site (factor) approximation parameters (v, tau), one entry per site."""
    def __init__(self, v, tau):
        self.tau = tau
        self.v = v

    def _update_i(self, eta, delta, post_params, marg_moments, i):
        """Damped update of site i from the new marginal moments.

        Returns the (possibly clipped) increments (delta_tau, delta_v)
        actually applied to the site parameters.
        """
        scale = delta / eta
        delta_tau = scale * (1. / marg_moments.sigma2_hat[i] - 1. / post_params.Sigma_diag[i])
        delta_v = scale * (marg_moments.mu_hat[i] / marg_moments.sigma2_hat[i]
                           - post_params.mu[i] / post_params.Sigma_diag[i])
        previous_tau = self.tau[i]
        self.tau[i] += delta_tau
        # Enforce positivity of tau_tilde. Even though positivity is guaranteed
        # for log-concave sites, numerical error can still make it negative, and
        # tau_tilde must stay positive to update the marginal likelihood without
        # running into stability issues.
        eps = np.finfo(float).eps
        if self.tau[i] < eps:
            self.tau[i] = eps
            delta_tau = self.tau[i] - previous_tau
        self.v[i] += delta_v
        return (delta_tau, delta_v)
class posteriorParamsBase(object):
    """Shared container for posterior parameters: q(f|X,Y) = N(f|mu, Sigma).

    Subclasses override _update_rank1 and _recompute; the base versions
    are no-ops so callers can invoke them unconditionally.
    """
    def __init__(self, mu, Sigma_diag):
        self.mu = mu
        self.Sigma_diag = Sigma_diag

    def _update_rank1(self, *args):
        # No-op in the base class.
        pass

    def _recompute(self, *args):
        # No-op in the base class.
        pass
class posteriorParams(posteriorParamsBase):
def __init__(self, mu, Sigma, L=None):
self.Sigma = Sigma
self.L = L
Sigma_diag = np.diag(self.Sigma)
super(posteriorParams, self).__init__(mu, Sigma_diag)
def _update_rank1(self, delta_tau, ga_approx, i):
ci = delta_tau/(1.+ delta_tau*self.Sigma_diag[i])
DSYR(self.Sigma, self.Sigma[:,i].copy(), -ci)
self.mu = np.dot(self.Sigma, ga_approx.v)
@staticmethod
def _recompute(K, ga_approx):
num_data = len(ga_approx.tau)
tau_tilde_root = np.sqrt(ga_approx.tau)
Sroot_tilde_K = tau_tilde_root[:,None] * K
B = np.eye(num_data) + Sroot_tilde_K * tau_tilde_root[None,:]
L = jitchol(B)
V, _ = dtrtrs(L, Sroot_tilde_K, lower=1)
Sigma = K - np.dot(V.T,V) #K - KS^(1/2)BS^(1/2)K = (K^(-1) + \Sigma^(-1))^(-1)
mu = np.dot(Sigma,ga_approx.v)
return posteriorParams(mu=mu, Sigma=Sigma, L=L)
class posteriorParamsDTC(posteriorParamsBase):
def __init__(self, mu, Sigma_diag):
super(posteriorParamsDTC, self).__init__(mu, Sigma_diag)
def _update_rank1(self, LLT, Kmn, delta_v, delta_tau, i):
#DSYR(Sigma, Sigma[:,i].copy(), -delta_tau/(1.+ delta_tau*Sigma[i,i]))
DSYR(LLT,Kmn[:,i].copy(),delta_tau)
L = jitchol(LLT)
V,info = dtrtrs(L,Kmn,lower=1)
self.Sigma_diag = np.maximum(np.sum(V*V,-2), np.finfo(float).eps) #diag(K_nm (L L^\top)^(-1)) K_mn
si = np.sum(V.T*V[:,i],-1) #(V V^\top)[:,i]
self.mu += (delta_v-delta_tau*self.mu[i])*si
#mu = np.dot(Sigma, v_tilde)
@staticmethod
def _recompute(LLT0, Kmn, ga_approx):
LLT = LLT0 + np.dot(Kmn*ga_approx.tau[None,:],Kmn.T)
L = jitchol(LLT)
V, _ = dtrtrs(L,Kmn,lower=1)
#Sigma_diag = np.sum(V*V,-2)
#Knmv_tilde = np.dot(Kmn,v_tilde)
#mu = np.dot(V2.T,Knmv_tilde)
Sigma = np.dot(V.T,V)
mu = np.dot(Sigma, ga_approx.v)
Sigma_diag = np.diag(Sigma).copy()
return posteriorParamsDTC(mu, Sigma_diag), LLT
class EPBase(object): class EPBase(object):
def __init__(self, epsilon=1e-6, eta=1., delta=1., always_reset=False, max_iters=np.inf, ep_mode="alternated", parallel_updates=False): def __init__(self, epsilon=1e-6, eta=1., delta=1., always_reset=False, max_iters=np.inf, ep_mode="alternated", parallel_updates=False):
""" """
@ -28,6 +130,7 @@ class EPBase(object):
:parallel_updates: boolean. If true, updates of the parameters of the sites in parallel :parallel_updates: boolean. If true, updates of the parameters of the sites in parallel
""" """
super(EPBase, self).__init__() super(EPBase, self).__init__()
self.always_reset = always_reset self.always_reset = always_reset
self.epsilon, self.eta, self.delta, self.max_iters = epsilon, eta, delta, max_iters self.epsilon, self.eta, self.delta, self.max_iters = epsilon, eta, delta, max_iters
self.ep_mode = ep_mode self.ep_mode = ep_mode
@ -35,7 +138,7 @@ class EPBase(object):
self.reset() self.reset()
def reset(self): def reset(self):
self.old_mutilde, self.old_vtilde = None, None self.ga_approx_old = None
self._ep_approximation = None self._ep_approximation = None
def on_optimization_start(self): def on_optimization_start(self):
@ -45,6 +148,11 @@ class EPBase(object):
# TODO: update approximation in the end as well? Maybe even with a switch? # TODO: update approximation in the end as well? Maybe even with a switch?
pass pass
def _stop_criteria(self, ga_approx):
tau_diff = np.mean(np.square(ga_approx.tau-self.ga_approx_old.tau))
v_diff = np.mean(np.square(ga_approx.v-self.ga_approx_old.v))
return ((tau_diff < self.epsilon) and (v_diff < self.epsilon))
def __setstate__(self, state): def __setstate__(self, state):
super(EPBase, self).__setstate__(state[0]) super(EPBase, self).__setstate__(state[0])
self.epsilon, self.eta, self.delta = state[1] self.epsilon, self.eta, self.delta = state[1]
@ -67,19 +175,18 @@ class EP(EPBase, ExactGaussianInference):
if self.ep_mode=="nested": if self.ep_mode=="nested":
#Force EP at each step of the optimization #Force EP at each step of the optimization
self._ep_approximation = None self._ep_approximation = None
mu, Sigma, mu_tilde, tau_tilde, log_Z_tilde = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata) post_params, ga_approx, cav_params, log_Z_tilde = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata)
elif self.ep_mode=="alternated": elif self.ep_mode=="alternated":
if getattr(self, '_ep_approximation', None) is None: if getattr(self, '_ep_approximation', None) is None:
#if we don't yet have the results of runnign EP, run EP and store the computed factors in self._ep_approximation #if we don't yet have the results of runnign EP, run EP and store the computed factors in self._ep_approximation
mu, Sigma, mu_tilde, tau_tilde, log_Z_tilde = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata) post_params, ga_approx, cav_params, log_Z_tilde = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata)
else: else:
#if we've already run EP, just use the existing approximation stored in self._ep_approximation #if we've already run EP, just use the existing approximation stored in self._ep_approximation
mu, Sigma, mu_tilde, tau_tilde, log_Z_tilde = self._ep_approximation post_params, ga_approx, cav_params, log_Z_tilde = self._ep_approximation
else: else:
raise ValueError("ep_mode value not valid") raise ValueError("ep_mode value not valid")
v_tilde = mu_tilde * tau_tilde return self._inference(Y, K, ga_approx, cav_params, likelihood, Y_metadata=Y_metadata, Z_tilde=log_Z_tilde)
return self._inference(K, tau_tilde, v_tilde, likelihood, Y_metadata=Y_metadata, Z_tilde=log_Z_tilde.sum())
def expectation_propagation(self, K, Y, likelihood, Y_metadata): def expectation_propagation(self, K, Y, likelihood, Y_metadata):
@ -90,41 +197,57 @@ class EP(EPBase, ExactGaussianInference):
# than ObsArrays # than ObsArrays
Y = Y.values.copy() Y = Y.values.copy()
#Initial values - Marginal moments #Initial values - Marginal moments, cavity params, gaussian approximation params and posterior params
Z_hat = np.empty(num_data,dtype=np.float64) marg_moments = marginalMoments(num_data)
mu_hat = np.empty(num_data,dtype=np.float64) cav_params = cavityParams(num_data)
sigma2_hat = np.empty(num_data,dtype=np.float64) ga_approx, post_params = self._init_approximations(K, num_data)
tau_cav = np.empty(num_data,dtype=np.float64) #Approximation
v_cav = np.empty(num_data,dtype=np.float64) stop = False
iterations = 0
while not stop and (iterations < self.max_iters):
self._local_updates(num_data, cav_params, post_params, marg_moments, ga_approx, likelihood, Y, Y_metadata)
#(re) compute Sigma and mu using full Cholesky decompy
post_params = posteriorParams._recompute(K, ga_approx)
#monitor convergence
if iterations > 0:
stop = self._stop_criteria(ga_approx)
self.ga_approx_old = gaussianApproximation(ga_approx.v.copy(), ga_approx.tau.copy())
iterations += 1
# Z_tilde after removing the terms that can lead to infinite terms due to tau_tilde close to zero.
# These terms cancel with the corresponding terms in the marginal log-likelihood
log_Z_tilde = self._log_Z_tilde(marg_moments, ga_approx, cav_params)
# - 0.5*np.log(tau_tilde) + 0.5*(v_tilde*v_tilde*1./tau_tilde)
return (post_params, ga_approx, cav_params, log_Z_tilde)
def _init_approximations(self, K, num_data):
#initial values - Gaussian factors #initial values - Gaussian factors
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma) #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
if self.old_mutilde is None: if self.ga_approx_old is None:
tau_tilde, mu_tilde, v_tilde = np.zeros((3, num_data)) v_tilde, tau_tilde = np.zeros((2, num_data))
ga_approx = gaussianApproximation(v_tilde, tau_tilde)
Sigma = K.copy() Sigma = K.copy()
diag.add(Sigma, 1e-7) diag.add(Sigma, 1e-7)
mu = np.zeros(num_data) mu = np.zeros(num_data)
post_params = posteriorParams(mu, Sigma)
else: else:
assert self.old_mutilde.size == num_data, "data size mis-match: did you change the data? try resetting!" assert self.ga_approx_old.v.size == num_data, "data size mis-match: did you change the data? try resetting!"
mu_tilde, v_tilde = self.old_mutilde, self.old_vtilde ga_approx = gaussianApproximation(self.ga_approx_old.v, self.ga_approx_old.tau)
tau_tilde = v_tilde/mu_tilde post_params = posteriorParams._recompute(K, ga_approx)
mu, Sigma, _ = self._ep_compute_posterior(K, tau_tilde, v_tilde) diag.add(post_params.Sigma, 1e-7)
diag.add(Sigma, 1e-7)
# TODO: Check the log-marginal under both conditions and choose the best one # TODO: Check the log-marginal under both conditions and choose the best one
return (ga_approx, post_params)
#Approximation def _local_updates(self, num_data, cav_params, post_params, marg_moments, ga_approx, likelihood, Y, Y_metadata, update_order=None):
tau_diff = self.epsilon + 1. if update_order is None:
v_diff = self.epsilon + 1. update_order = np.random.permutation(num_data)
tau_tilde_old = np.nan
v_tilde_old = np.nan
iterations = 0
while ((tau_diff > self.epsilon) or (v_diff > self.epsilon)) and (iterations < self.max_iters):
update_order = np.random.permutation(num_data)
for i in update_order: for i in update_order:
#Cavity distribution parameters #Cavity distribution parameters
tau_cav[i] = 1./Sigma[i,i] - self.eta*tau_tilde[i] cav_params._update_i(self.eta, ga_approx, post_params, i)
v_cav[i] = mu[i]/Sigma[i,i] - self.eta*v_tilde[i]
if Y_metadata is not None: if Y_metadata is not None:
# Pick out the relavent metadata for Yi # Pick out the relavent metadata for Yi
Y_metadata_i = {} Y_metadata_i = {}
@ -133,91 +256,45 @@ class EP(EPBase, ExactGaussianInference):
else: else:
Y_metadata_i = None Y_metadata_i = None
#Marginal moments #Marginal moments
Z_hat[i], mu_hat[i], sigma2_hat[i] = likelihood.moments_match_ep(Y[i], tau_cav[i], v_cav[i], Y_metadata_i=Y_metadata_i) marg_moments.Z_hat[i], marg_moments.mu_hat[i], marg_moments.sigma2_hat[i] = likelihood.moments_match_ep(Y[i], cav_params.tau[i], cav_params.v[i], Y_metadata_i=Y_metadata_i)
#Site parameters update
delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
tau_tilde_prev = tau_tilde[i]
tau_tilde[i] += delta_tau
# Enforce positivity of tau_tilde. Even though this is guaranteed for logconcave sites, it is still possible #Site parameters update
# to get negative values due to numerical errors. Moreover, the value of tau_tilde should be positive in order to delta_tau, delta_v = ga_approx._update_i(self.eta, self.delta, post_params, marg_moments, i)
# update the marginal likelihood without inestability issues.
if tau_tilde[i] < np.finfo(float).eps:
tau_tilde[i] = np.finfo(float).eps
delta_tau = tau_tilde[i] - tau_tilde_prev
v_tilde[i] += delta_v
if self.parallel_updates == False: if self.parallel_updates == False:
#Posterior distribution parameters update post_params._update_rank1(delta_tau, ga_approx, i)
ci = delta_tau/(1.+ delta_tau*Sigma[i,i])
DSYR(Sigma, Sigma[:,i].copy(), -ci)
mu = np.dot(Sigma, v_tilde)
#(re) compute Sigma and mu using full Cholesky decompy def _log_Z_tilde(self, marg_moments, ga_approx, cav_params):
mu, Sigma, _ = self._ep_compute_posterior(K, tau_tilde, v_tilde) return np.sum((np.log(marg_moments.Z_hat) + 0.5*np.log(2*np.pi) + 0.5*np.log(1+ga_approx.tau/cav_params.tau) - 0.5 * ((ga_approx.v)**2 * 1./(cav_params.tau + ga_approx.tau))
+ 0.5*(cav_params.v * ( ( (ga_approx.tau/cav_params.tau) * cav_params.v - 2.0 * ga_approx.v ) * 1./(cav_params.tau + ga_approx.tau)))))
#monitor convergence
if iterations > 0:
tau_diff = np.mean(np.square(tau_tilde-tau_tilde_old))
v_diff = np.mean(np.square(v_tilde-v_tilde_old))
tau_tilde_old = tau_tilde.copy()
v_tilde_old = v_tilde.copy()
iterations += 1
mu_tilde = v_tilde/tau_tilde def _ep_marginal(self, K, ga_approx, Z_tilde):
mu_cav = v_cav/tau_cav post_params = posteriorParams._recompute(K, ga_approx)
sigma2_sigma2tilde = 1./tau_cav + 1./tau_tilde
# Z_tilde after removing the terms that can lead to infinite terms due to tau_tilde close to zero.
# This terms cancel with the coreresponding terms in the marginal loglikelihood
log_Z_tilde = (np.log(Z_hat) + 0.5*np.log(2*np.pi) + 0.5*np.log(1+tau_tilde/tau_cav)
- 0.5 * ((v_tilde)**2 * 1./(tau_cav + tau_tilde)) + 0.5*(v_cav * ( ( (tau_tilde/tau_cav) * v_cav - 2.0 * v_tilde ) * 1./(tau_cav + tau_tilde))))
# - 0.5*np.log(tau_tilde) + 0.5*(v_tilde*v_tilde*1./tau_tilde)
self.old_mutilde = mu_tilde
self.old_vtilde = v_tilde
return mu, Sigma, mu_tilde, tau_tilde, log_Z_tilde
def _ep_compute_posterior(self, K, tau_tilde, v_tilde):
num_data = len(tau_tilde)
tau_tilde_root = np.sqrt(tau_tilde)
Sroot_tilde_K = tau_tilde_root[:,None] * K
B = np.eye(num_data) + Sroot_tilde_K * tau_tilde_root[None,:]
L = jitchol(B)
V, _ = dtrtrs(L, Sroot_tilde_K, lower=1)
Sigma = K - np.dot(V.T,V) #K - KS^(1/2)BS^(1/2)K = (K^(-1) + \Sigma^(-1))^(-1)
mu = np.dot(Sigma,v_tilde)
return (mu, Sigma, L)
def _ep_marginal(self, K, tau_tilde, v_tilde, Z_tilde):
mu, Sigma, L = self._ep_compute_posterior(K, tau_tilde, v_tilde)
# Gaussian log marginal excluding terms that can go to infinity due to arbitrarily small tau_tilde. # Gaussian log marginal excluding terms that can go to infinity due to arbitrarily small tau_tilde.
# These terms cancel out with the terms excluded from Z_tilde # These terms cancel out with the terms excluded from Z_tilde
B_logdet = np.sum(2.0*np.log(np.diag(L))) B_logdet = np.sum(2.0*np.log(np.diag(post_params.L)))
log_marginal = 0.5*(-len(tau_tilde) * log_2_pi - B_logdet + np.sum(v_tilde * np.dot(Sigma,v_tilde))) log_marginal = 0.5*(-len(ga_approx.tau) * log_2_pi - B_logdet + np.sum(ga_approx.v * np.dot(post_params.Sigma,ga_approx.v)))
log_marginal += Z_tilde log_marginal += Z_tilde
return log_marginal, mu, Sigma, L return log_marginal, post_params
def _inference(self, K, tau_tilde, v_tilde, likelihood, Z_tilde, Y_metadata=None): def _inference(self, Y, K, ga_approx, cav_params, likelihood, Z_tilde, Y_metadata=None):
log_marginal, mu, Sigma, L = self._ep_marginal(K, tau_tilde, v_tilde, Z_tilde) log_marginal, post_params = self._ep_marginal(K, ga_approx, Z_tilde)
tau_tilde_root = np.sqrt(tau_tilde) tau_tilde_root = np.sqrt(ga_approx.tau)
Sroot_tilde_K = tau_tilde_root[:,None] * K Sroot_tilde_K = tau_tilde_root[:,None] * K
aux_alpha , _ = dpotrs(L, np.dot(Sroot_tilde_K, v_tilde), lower=1) aux_alpha , _ = dpotrs(post_params.L, np.dot(Sroot_tilde_K, ga_approx.v), lower=1)
alpha = (v_tilde - tau_tilde_root * aux_alpha)[:,None] #(K + Sigma^(\tilde))^(-1) /mu^(/tilde) alpha = (ga_approx.v - tau_tilde_root * aux_alpha)[:,None] #(K + Sigma^(\tilde))^(-1) /mu^(/tilde)
LWi, _ = dtrtrs(L, np.diag(tau_tilde_root), lower=1) LWi, _ = dtrtrs(post_params.L, np.diag(tau_tilde_root), lower=1)
Wi = np.dot(LWi.T,LWi) Wi = np.dot(LWi.T,LWi)
symmetrify(Wi) #(K + Sigma^(\tilde))^(-1) symmetrify(Wi) #(K + Sigma^(\tilde))^(-1)
dL_dK = 0.5 * (tdot(alpha) - Wi) dL_dK = 0.5 * (tdot(alpha) - Wi)
dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK), Y_metadata) dL_dthetaL = likelihood.ep_gradients(Y, cav_params.tau, cav_params.v, np.diag(dL_dK), Y_metadata=Y_metadata, quad_mode='gh')
return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL, 'dL_dm':alpha} return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL, 'dL_dm':alpha}
@ -244,24 +321,25 @@ class EPDTC(EPBase, VarDTC):
if self.ep_mode=="nested": if self.ep_mode=="nested":
#Force EP at each step of the optimization #Force EP at each step of the optimization
self._ep_approximation = None self._ep_approximation = None
mu, Sigma_diag, mu_tilde, tau_tilde, log_Z_tilde = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata) post_params, ga_approx, log_Z_tilde = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata)
elif self.ep_mode=="alternated": elif self.ep_mode=="alternated":
if getattr(self, '_ep_approximation', None) is None: if getattr(self, '_ep_approximation', None) is None:
#if we don't yet have the results of runnign EP, run EP and store the computed factors in self._ep_approximation #if we don't yet have the results of runnign EP, run EP and store the computed factors in self._ep_approximation
mu, Sigma_diag, mu_tilde, tau_tilde, log_Z_tilde = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata) post_params, ga_approx, log_Z_tilde = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata)
else: else:
#if we've already run EP, just use the existing approximation stored in self._ep_approximation #if we've already run EP, just use the existing approximation stored in self._ep_approximation
mu, Sigma_diag, mu_tilde, tau_tilde, log_Z_tilde = self._ep_approximation post_params, ga_approx, log_Z_tilde = self._ep_approximation
else: else:
raise ValueError("ep_mode value not valid") raise ValueError("ep_mode value not valid")
return super(EPDTC, self).inference(kern, X, Z, likelihood, mu_tilde, mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
return super(EPDTC, self).inference(kern, X, Z, likelihood, ObsAr(mu_tilde[:,None]),
mean_function=mean_function, mean_function=mean_function,
Y_metadata=Y_metadata, Y_metadata=Y_metadata,
precision=tau_tilde, precision=ga_approx.tau,
Lm=Lm, dL_dKmm=dL_dKmm, Lm=Lm, dL_dKmm=dL_dKmm,
psi0=psi0, psi1=psi1, psi2=psi2, Z_tilde=log_Z_tilde.sum()) psi0=psi0, psi1=psi1, psi2=psi2, Z_tilde=log_Z_tilde)
def expectation_propagation(self, Kmm, Kmn, Y, likelihood, Y_metadata): def expectation_propagation(self, Kmm, Kmn, Y, likelihood, Y_metadata):
@ -272,119 +350,91 @@ class EPDTC(EPBase, VarDTC):
# than ObsArrays # than ObsArrays
Y = Y.values.copy() Y = Y.values.copy()
#Initial values - Marginal moments #Initial values - Marginal moments, cavity params, gaussian approximation params and posterior params
Z_hat = np.zeros(num_data,dtype=np.float64) marg_moments = marginalMoments(num_data)
mu_hat = np.zeros(num_data,dtype=np.float64) cav_params = cavityParams(num_data)
sigma2_hat = np.zeros(num_data,dtype=np.float64) ga_approx, post_params, LLT0, LLT = self._init_approximations(Kmm, Kmn, num_data)
tau_cav = np.empty(num_data,dtype=np.float64) #Approximation
v_cav = np.empty(num_data,dtype=np.float64) stop = False
iterations = 0
while not stop and (iterations < self.max_iters):
self._local_updates(num_data, LLT0, LLT, Kmn, cav_params, post_params, marg_moments, ga_approx, likelihood, Y, Y_metadata)
#(re) compute Sigma, Sigma_diag and mu using full Cholesky decompy
post_params, LLT = posteriorParamsDTC._recompute(LLT0, Kmn, ga_approx)
post_params.Sigma_diag = np.maximum(post_params.Sigma_diag, np.finfo(float).eps)
#monitor convergence
if iterations > 0:
stop = self._stop_criteria(ga_approx)
self.ga_approx_old = gaussianApproximation(ga_approx.v.copy(), ga_approx.tau.copy())
iterations += 1
log_Z_tilde = self._log_Z_tilde(marg_moments, ga_approx, cav_params)
return post_params, ga_approx, log_Z_tilde
def _log_Z_tilde(self, marg_moments, ga_approx, cav_params):
mu_tilde = ga_approx.v/ga_approx.tau
mu_cav = cav_params.v/cav_params.tau
sigma2_sigma2tilde = 1./cav_params.tau + 1./ga_approx.tau
return np.sum((np.log(marg_moments.Z_hat) + 0.5*np.log(2*np.pi) + 0.5*np.log(sigma2_sigma2tilde)
+ 0.5*((mu_cav - mu_tilde)**2) / (sigma2_sigma2tilde)))
def _init_approximations(self, Kmm, Kmn, num_data):
#initial values - Gaussian factors #initial values - Gaussian factors
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma) #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
LLT0 = Kmm.copy() LLT0 = Kmm.copy()
Lm = jitchol(LLT0) #K_m = L_m L_m^\top Lm = jitchol(LLT0) #K_m = L_m L_m^\top
Vm,info = dtrtrs(Lm,Kmn,lower=1) Vm,info = dtrtrs(Lm, Kmn,lower=1)
# Lmi = dtrtri(Lm) # Lmi = dtrtri(Lm)
# Kmmi = np.dot(Lmi.T,Lmi) # Kmmi = np.dot(Lmi.T,Lmi)
# KmmiKmn = np.dot(Kmmi,Kmn) # KmmiKmn = np.dot(Kmmi,Kmn)
# Qnn_diag = np.sum(Kmn*KmmiKmn,-2) # Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
Qnn_diag = np.sum(Vm*Vm,-2) #diag(Knm Kmm^(-1) Kmn) Qnn_diag = np.sum(Vm*Vm,-2) #diag(Knm Kmm^(-1) Kmn)
#diag.add(LLT0, 1e-8) #diag.add(LLT0, 1e-8)
if self.old_mutilde is None: if self.ga_approx_old is None:
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma) #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
LLT = LLT0.copy() #Sigma = K.copy() LLT = LLT0.copy() #Sigma = K.copy()
mu = np.zeros(num_data) mu = np.zeros(num_data)
Sigma_diag = Qnn_diag.copy() + 1e-8 Sigma_diag = Qnn_diag.copy() + 1e-8
tau_tilde, mu_tilde, v_tilde = np.zeros((3, num_data)) v_tilde, tau_tilde = np.zeros((2, num_data))
ga_approx = gaussianApproximation(v_tilde, tau_tilde)
post_params = posteriorParamsDTC(mu, Sigma_diag)
else: else:
assert self.old_mutilde.size == num_data, "data size mis-match: did you change the data? try resetting!" assert self.ga_approx_old.v.size == num_data, "data size mis-match: did you change the data? try resetting!"
mu_tilde, v_tilde = self.old_mutilde, self.old_vtilde ga_approx = gaussianApproximation(self.ga_approx_old.v, self.ga_approx_old.tau)
tau_tilde = v_tilde/mu_tilde post_params, LLT = posteriorParamsDTC._recompute(LLT0, Kmn, ga_approx)
mu, Sigma_diag, LLT = self._ep_compute_posterior(LLT0, Kmn, tau_tilde, v_tilde) post_params.Sigma_diag += 1e-8
Sigma_diag += 1e-8
# TODO: Check the log-marginal under both conditions and choose the best one # TODO: Check the log-marginal under both conditions and choose the best one
#Approximation return (ga_approx, post_params, LLT0, LLT)
tau_diff = self.epsilon + 1.
v_diff = self.epsilon + 1. def _local_updates(self, num_data, LLT0, LLT, Kmn, cav_params, post_params, marg_moments, ga_approx, likelihood, Y, Y_metadata, update_order=None):
tau_tilde_old = np.nan if update_order is None:
v_tilde_old = np.nan
iterations = 0
while ((tau_diff > self.epsilon) or (v_diff > self.epsilon)) and (iterations < self.max_iters):
update_order = np.random.permutation(num_data) update_order = np.random.permutation(num_data)
for i in update_order: for i in update_order:
#Cavity distribution parameters
tau_cav[i] = 1./Sigma_diag[i] - self.eta*tau_tilde[i]
v_cav[i] = mu[i]/Sigma_diag[i] - self.eta*v_tilde[i]
if Y_metadata is not None:
# Pick out the relavent metadata for Yi
Y_metadata_i = {}
for key in Y_metadata.keys():
Y_metadata_i[key] = Y_metadata[key][i, :]
else:
Y_metadata_i = None
#Marginal moments #Cavity distribution parameters
Z_hat[i], mu_hat[i], sigma2_hat[i] = likelihood.moments_match_ep(Y[i], tau_cav[i], v_cav[i], Y_metadata_i=Y_metadata_i) cav_params._update_i(self.eta, ga_approx, post_params, i)
#Site parameters update
delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
tau_tilde_prev = tau_tilde[i]
tau_tilde[i] += delta_tau
# Enforce positivity of tau_tilde. Even though this is guaranteed for logconcave sites, it is still possible
# to get negative values due to numerical errors. Moreover, the value of tau_tilde should be positive in order to
# update the marginal likelihood without inestability issues.
if tau_tilde[i] < np.finfo(float).eps:
tau_tilde[i] = np.finfo(float).eps
delta_tau = tau_tilde[i] - tau_tilde_prev
v_tilde[i] += delta_v
#Posterior distribution parameters update if Y_metadata is not None:
if self.parallel_updates == False: # Pick out the relavent metadata for Yi
#DSYR(Sigma, Sigma[:,i].copy(), -delta_tau/(1.+ delta_tau*Sigma[i,i])) Y_metadata_i = {}
DSYR(LLT,Kmn[:,i].copy(),delta_tau) for key in Y_metadata.keys():
L = jitchol(LLT) Y_metadata_i[key] = Y_metadata[key][i, :]
V,info = dtrtrs(L,Kmn,lower=1) else:
Sigma_diag = np.maximum(np.sum(V*V,-2), np.finfo(float).eps) #diag(K_nm (L L^\top)^(-1)) K_mn Y_metadata_i = None
si = np.sum(V.T*V[:,i],-1) #(V V^\top)[:,i]
mu += (delta_v-delta_tau*mu[i])*si
#mu = np.dot(Sigma, v_tilde)
#(re) compute Sigma, Sigma_diag and mu using full Cholesky decompy #Marginal moments
mu, Sigma_diag, LLT = self._ep_compute_posterior(LLT0, Kmn, tau_tilde, v_tilde) marg_moments.Z_hat[i], marg_moments.mu_hat[i], marg_moments.sigma2_hat[i] = likelihood.moments_match_ep(Y[i], cav_params.tau[i], cav_params.v[i], Y_metadata_i=Y_metadata_i)
Sigma_diag = np.maximum(Sigma_diag, np.finfo(float).eps) #Site parameters update
delta_tau, delta_v = ga_approx._update_i(self.eta, self.delta, post_params, marg_moments, i)
#monitor convergence #Posterior distribution parameters update
if iterations>0: if self.parallel_updates == False:
tau_diff = np.mean(np.square(tau_tilde-tau_tilde_old)) post_params._update_rank1(LLT, Kmn, delta_v, delta_tau, i)
v_diff = np.mean(np.square(v_tilde-v_tilde_old))
tau_tilde_old = tau_tilde.copy()
v_tilde_old = v_tilde.copy()
iterations += 1
mu_tilde = v_tilde/tau_tilde
mu_cav = v_cav/tau_cav
sigma2_sigma2tilde = 1./tau_cav + 1./tau_tilde
log_Z_tilde = (np.log(Z_hat) + 0.5*np.log(2*np.pi) + 0.5*np.log(sigma2_sigma2tilde)
+ 0.5*((mu_cav - mu_tilde)**2) / (sigma2_sigma2tilde))
self.old_mutilde = mu_tilde
self.old_vtilde = v_tilde
return mu, Sigma_diag, ObsAr(mu_tilde[:,None]), tau_tilde, log_Z_tilde
def _ep_compute_posterior(self, LLT0, Kmn, tau_tilde, v_tilde):
LLT = LLT0 + np.dot(Kmn*tau_tilde[None,:],Kmn.T)
L = jitchol(LLT)
V, _ = dtrtrs(L,Kmn,lower=1)
#Sigma_diag = np.sum(V*V,-2)
#Knmv_tilde = np.dot(Kmn,v_tilde)
#mu = np.dot(V2.T,Knmv_tilde)
Sigma = np.dot(V.T,V)
mu = np.dot(Sigma,v_tilde)
Sigma_diag = np.diag(Sigma).copy()
return (mu, Sigma_diag, LLT)

View file

@ -66,7 +66,14 @@ class Binomial(Likelihood):
np.testing.assert_array_equal(N.shape, y.shape) np.testing.assert_array_equal(N.shape, y.shape)
nchoosey = special.gammaln(N+1) - special.gammaln(y+1) - special.gammaln(N-y+1) nchoosey = special.gammaln(N+1) - special.gammaln(y+1) - special.gammaln(N-y+1)
return nchoosey + y*np.log(inv_link_f) + (N-y)*np.log(1.-inv_link_f)
Ny = N-y
t1 = np.zeros(y.shape)
t2 = np.zeros(y.shape)
t1[y>0] = y[y>0]*np.log(inv_link_f[y>0])
t2[Ny>0] = Ny[Ny>0]*np.log(1.-inv_link_f[Ny>0])
return nchoosey + t1 + t2
def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None): def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
""" """
@ -86,7 +93,13 @@ class Binomial(Likelihood):
N = Y_metadata['trials'] N = Y_metadata['trials']
np.testing.assert_array_equal(N.shape, y.shape) np.testing.assert_array_equal(N.shape, y.shape)
return y/inv_link_f - (N-y)/(1.-inv_link_f) Ny = N-y
t1 = np.zeros(y.shape)
t2 = np.zeros(y.shape)
t1[y>0] = y[y>0]/inv_link_f[y>0]
t2[Ny>0] = (Ny[Ny>0])/(1.-inv_link_f[Ny>0])
return t1 - t2
def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None): def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None):
""" """
@ -111,7 +124,13 @@ class Binomial(Likelihood):
""" """
N = Y_metadata['trials'] N = Y_metadata['trials']
np.testing.assert_array_equal(N.shape, y.shape) np.testing.assert_array_equal(N.shape, y.shape)
return -y/np.square(inv_link_f) - (N-y)/np.square(1.-inv_link_f) Ny = N-y
t1 = np.zeros(y.shape)
t2 = np.zeros(y.shape)
t1[y>0] = -y[y>0]/np.square(inv_link_f[y>0])
t2[Ny>0] = -(Ny[Ny>0])/np.square(1.-inv_link_f[Ny>0])
return t1+t2
def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None): def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None):
""" """
@ -135,8 +154,14 @@ class Binomial(Likelihood):
N = Y_metadata['trials'] N = Y_metadata['trials']
np.testing.assert_array_equal(N.shape, y.shape) np.testing.assert_array_equal(N.shape, y.shape)
inv_link_f2 = np.square(inv_link_f) #inv_link_f2 = np.square(inv_link_f) #TODO Remove. Why is this here?
return 2*y/inv_link_f**3 - 2*(N-y)/(1.-inv_link_f)**3
Ny = N-y
t1 = np.zeros(y.shape)
t2 = np.zeros(y.shape)
t1[y>0] = 2*y[y>0]/inv_link_f[y>0]**3
t2[Ny>0] = - 2*(Ny[Ny>0])/(1.-inv_link_f[Ny>0])**3
return t1 + t2
def samples(self, gp, Y_metadata=None, **kw): def samples(self, gp, Y_metadata=None, **kw):
""" """

View file

@ -57,7 +57,10 @@ class Gaussian(Likelihood):
def update_gradients(self, grad): def update_gradients(self, grad):
self.variance.gradient = grad self.variance.gradient = grad
def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None): def ep_gradients(self, Y, cav_tau, cav_v, dL_dKdiag, Y_metadata=None, quad_mode='gk', boost_grad=1.):
return self.exact_inference_gradients(dL_dKdiag)
def exact_inference_gradients(self, dL_dKdiag, Y_metadata=None):
return dL_dKdiag.sum() return dL_dKdiag.sum()
def _preprocess_values(self, Y): def _preprocess_values(self, Y):

View file

@ -6,8 +6,12 @@ from scipy import stats,special
import scipy as sp import scipy as sp
from . import link_functions from . import link_functions
from ..util.misc import chain_1, chain_2, chain_3, blockify_dhess_dtheta, blockify_third, blockify_hessian, safe_exp from ..util.misc import chain_1, chain_2, chain_3, blockify_dhess_dtheta, blockify_third, blockify_hessian, safe_exp
from ..util.quad_integrate import quadgk_int
from scipy.integrate import quad from scipy.integrate import quad
from functools import partial
import warnings import warnings
from ..core.parameterization import Parameterized from ..core.parameterization import Parameterized
class Likelihood(Parameterized): class Likelihood(Parameterized):
@ -223,6 +227,91 @@ class Likelihood(Parameterized):
self.__gh_points = np.polynomial.hermite.hermgauss(T) self.__gh_points = np.polynomial.hermite.hermgauss(T)
return self.__gh_points return self.__gh_points
def ep_gradients(self, Y, cav_tau, cav_v, dL_dKdiag, Y_metadata=None, quad_mode='gk', boost_grad=1.):
    """Gradients of the EP log marginal likelihood w.r.t. likelihood parameters.

    Each data point contributes E[d log p(y_i|f_i)/dtheta] under its tilted
    (cavity x likelihood) distribution, evaluated by numerical quadrature;
    the per-site contributions are then summed.

    :param Y: observed outputs, shape (num_data, output_dim)
    :param cav_tau: cavity precisions, same shape as Y
    :param cav_v: cavity natural means (tau * mu), same shape as Y
    :param dL_dKdiag: unused here; kept so the signature matches
        exact_inference_gradients-style callers
    :param Y_metadata: optional dict of per-point metadata arrays
    :param quad_mode: 'gk' (Gauss-Kronrod) or 'gh' (Gauss-Hermite)
    :param boost_grad: scalar multiplier applied to the summed gradient
    :returns: gradient vector, one entry per likelihood parameter
    """
    if self.size > 0:
        tau, v, Y = cav_tau.flatten(), cav_v.flatten(), Y.flatten()
        # Cavity moments from natural parameters.
        mu = v / tau
        sigma2 = 1. / tau
        # Slice the metadata row-wise so each site sees only its own entry.
        Y_metadata_list = []
        for index in range(len(Y)):
            Y_metadata_i = {}
            if Y_metadata is not None:
                for key in Y_metadata.keys():
                    Y_metadata_i[key] = Y_metadata[key][index, :]
            Y_metadata_list.append(Y_metadata_i)
        if quad_mode == 'gk':
            # One Gauss-Kronrod integral per data point; after transposing
            # with zip, rows index parameters and columns index data points.
            # list(...) is required: np.vstack/np.hstack no longer accept
            # bare generators (and Python 3's zip is a generator).
            quads = np.vstack(list(zip(*map(self.integrate_gk, Y, mu,
                                            np.sqrt(sigma2), Y_metadata_list))))
        elif quad_mode == 'gh':
            quads = np.hstack(list(zip(*map(self.integrate_gh, Y, mu,
                                            np.sqrt(sigma2)))))
            quads = quads.T
        else:
            raise Exception("no other quadrature mode available")
        # Sum site contributions, ignoring NaNs from failed quadratures.
        dL_dtheta = boost_grad * np.nansum(quads, axis=1)
    else:
        # No tunable likelihood parameters.
        dL_dtheta = np.zeros(self.num_params)
    return dL_dtheta
def integrate_gk(self, Y, mu, sigma, Y_metadata_i=None):
    """Site gradient for one data point via Gauss-Kronrod quadrature.

    Integrates likelihood(f) * N(f; mu, sigma^2) * dlogpdf_dtheta(f) over
    the whole real line, giving the (unnormalised) expected parameter
    derivative under the tilted distribution of this site.

    :param Y: observed output for this site
    :param mu: cavity mean
    :param sigma: cavity standard deviation
    :param Y_metadata_i: optional metadata dict for this single point
    :returns: integral value(s), one per likelihood parameter
    """
    norm_const = np.sqrt(2. * np.pi)

    def integrand(f):
        # Unnormalised tilted weight: exact likelihood times cavity Gaussian.
        cavity = np.exp(-0.5 * np.square((f - mu) / sigma)) / (norm_const * sigma)
        weight = np.exp(self.logpdf_link(f, Y, Y_metadata_i)) * cavity
        return weight * self.dlogpdf_dtheta(f, Y, Y_metadata_i)

    # Infinite limits: quadgk_int handles the substitution onto (-1, 1).
    return quadgk_int(integrand, fmin=-np.inf, fmax=np.inf)
def integrate_gh(self, Y, mu, sigma, Y_metadata_i=None, gh_points=None):
    """Site gradient for one data point via Gauss-Hermite quadrature.

    Computes E_f{d log p(y_i|f_i)/dtheta} where theta is a likelihood
    parameter and the expectation is taken under an approximation to the
    marginal posterior (the unnormalised product of the Gaussian cavity
    and the exact likelihood term). Needed for gradients of the EP
    marginal likelihood estimate Z_EP w.r.t. likelihood parameters.

    :param Y: observed output for this site (scalar, broadcast over nodes)
    :param mu: cavity mean
    :param sigma: cavity standard deviation
    :param Y_metadata_i: optional metadata dict for this single point
    :param gh_points: optional (nodes, weights) pair; defaults to 32-point rule
    :returns: quadrature estimate, one entry per likelihood parameter
    """
    if gh_points is None:
        gh_x, gh_w = self._gh_points(32)
    else:
        gh_x, gh_w = gh_points
    # Map the Hermite nodes onto the cavity: f = sqrt(2)*sigma*x + mu.
    X = gh_x[None, :] * np.sqrt(2.) * sigma + mu
    # X is a grid of candidate f values; Y broadcasts against it.
    lik_weight = np.exp(self.logpdf_link(X, Y, Y_metadata_i))
    lik_weight = lik_weight.repeat(self.num_params, 0)
    dtheta = self.dlogpdf_dtheta(X, Y, Y_metadata_i)
    # Element-wise product of the two flattened grids, restored to
    # dtheta's original shape.
    products = [p * q for p, q in zip(lik_weight.flatten(), dtheta.flatten())]
    fn = np.array(products).reshape(dtheta.shape)
    return np.dot(fn, gh_w) / np.sqrt(np.pi)
def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None): def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
""" """
Use Gauss-Hermite Quadrature to compute Use Gauss-Hermite Quadrature to compute

View file

@ -0,0 +1,147 @@
import numpy as np
import unittest
import GPy
from GPy.models import GradientChecker
fixed_seed = 10
from nose.tools import with_setup, nottest
# this file will contain some high level tests, this is not unit testing, but will give us a higher level estimate
# if things are going well under the hood.
class TestObservationModels(unittest.TestCase):
    """High-level smoke tests: EP inference variants (alternated, nested,
    fractional) should roughly agree with Laplace inference on simple
    classification and robust-regression problems."""

    def setUp(self):
        np.random.seed(fixed_seed)
        self.N = 100
        self.D = 2
        self.X = np.random.rand(self.N, self.D)
        self.real_noise_std = 0.05
        noise = np.random.randn(*self.X[:, 0].shape) * self.real_noise_std
        self.Y = (np.sin(self.X[:, 0] * 2 * np.pi) + noise)[:, None]
        self.num_points = self.X.shape[0]
        self.f = np.random.rand(self.N, 1)
        # FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin int is the documented drop-in replacement.
        self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=int)[:, None]
        # self.binary_Y[self.binary_Y == 0.0] = -1.0
        self.positive_Y = np.exp(self.Y.copy())
        self.Y_noisy = self.Y.copy()
        self.Y_verynoisy = self.Y.copy()
        self.Y_noisy[75] += 1.3  # inject a single outlier for the robust-likelihood test
        self.init_var = 0.15
        self.deg_free = 4.
        censored = np.zeros_like(self.Y)
        # NOTE(review): replace=True can draw duplicate indices, so fewer than
        # N/2 distinct points may be censored — presumably intentional;
        # confirm if exact coverage matters.
        random_inds = np.random.choice(self.N, int(self.N / 2), replace=True)
        censored[random_inds] = 1
        self.Y_metadata = dict()
        self.Y_metadata['censored'] = censored
        self.kernel1 = GPy.kern.RBF(self.X.shape[1]) + GPy.kern.White(self.X.shape[1])

    def tearDown(self):
        self.Y = None
        self.X = None
        self.binary_Y = None
        self.positive_Y = None
        self.kernel1 = None

    @with_setup(setUp, tearDown)
    def testEPClassification(self):
        # Compare Laplace (ground truth) against the three EP variants on
        # binary classification with a Bernoulli likelihood.
        bernoulli = GPy.likelihoods.Bernoulli()
        laplace_inf = GPy.inference.latent_function_inference.Laplace()
        ep_inf_alt = GPy.inference.latent_function_inference.EP(ep_mode='alternated')
        ep_inf_nested = GPy.inference.latent_function_inference.EP(ep_mode='nested')
        ep_inf_fractional = GPy.inference.latent_function_inference.EP(ep_mode='nested', eta=0.9)
        m1 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=laplace_inf)
        m1.randomize()
        m2 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_alt)
        m2.randomize()
        m3 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_nested)
        m3.randomize()
        m4 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_fractional)
        m4.randomize()
        optimizer = 'bfgs'
        # do gradcheck here ...
        # self.assertTrue(m1.checkgrad())
        # self.assertTrue(m2.checkgrad())
        # self.assertTrue(m3.checkgrad())
        # self.assertTrue(m4.checkgrad())
        m1.optimize(optimizer=optimizer, max_iters=300)
        m2.optimize(optimizer=optimizer, max_iters=300)
        m3.optimize(optimizer=optimizer, max_iters=300)
        m4.optimize(optimizer=optimizer, max_iters=300)
        # taking laplace predictions as the ground truth
        probs_mean_lap, probs_var_lap = m1.predict(self.X)
        probs_mean_ep_alt, probs_var_ep_alt = m2.predict(self.X)
        probs_mean_ep_nested, probs_var_ep_nested = m3.predict(self.X)
        # For simple single-dimension data the marginal likelihoods of the
        # Laplace and EP approximations should not be far apart.
        self.assertAlmostEqual(m1.log_likelihood(), m2.log_likelihood(), delta=1)
        self.assertAlmostEqual(m1.log_likelihood(), m3.log_likelihood(), delta=1)
        self.assertAlmostEqual(m1.log_likelihood(), m4.log_likelihood(), delta=5)
        GPy.util.classification.conf_matrix(probs_mean_lap, self.binary_Y)
        GPy.util.classification.conf_matrix(probs_mean_ep_alt, self.binary_Y)
        GPy.util.classification.conf_matrix(probs_mean_ep_nested, self.binary_Y)

    @nottest
    def rmse(self, Y, Ystar):
        # Root-mean-squared error helper (excluded from test collection).
        return np.sqrt(np.mean((Y - Ystar) ** 2))

    @with_setup(setUp, tearDown)
    def test_EP_with_StudentT(self):
        # Robust regression with an outlier: Student-t likelihood under
        # Laplace vs alternated EP inference.
        studentT = GPy.likelihoods.StudentT(deg_free=self.deg_free, sigma2=self.init_var)
        laplace_inf = GPy.inference.latent_function_inference.Laplace()
        ep_inf_alt = GPy.inference.latent_function_inference.EP(ep_mode='alternated')
        ep_inf_nested = GPy.inference.latent_function_inference.EP(ep_mode='nested')
        ep_inf_frac = GPy.inference.latent_function_inference.EP(ep_mode='nested', eta=0.7)
        m1 = GPy.core.GP(self.X.copy(), self.Y_noisy.copy(), kernel=self.kernel1.copy(), likelihood=studentT.copy(), inference_method=laplace_inf)
        # optimize
        m1['.*white'].constrain_fixed(1e-5)
        m1.randomize()
        m2 = GPy.core.GP(self.X.copy(), self.Y_noisy.copy(), kernel=self.kernel1.copy(), likelihood=studentT.copy(), inference_method=ep_inf_alt)
        m2['.*white'].constrain_fixed(1e-5)
        # m2.constrain_bounded('.*t_scale2', 0.001, 10)
        m2.randomize()
        # m3 = GPy.core.GP(self.X, self.Y_noisy.copy(), kernel=self.kernel1, likelihood=studentT.copy(), inference_method=ep_inf_nested)
        # m3['.*white'].constrain_fixed(1e-5)
        # # m3.constrain_bounded('.*t_scale2', 0.001, 10)
        # m3.randomize()
        optimizer = 'bfgs'
        m1.optimize(optimizer=optimizer, max_iters=400)
        m2.optimize(optimizer=optimizer, max_iters=400)
        # m3.optimize(optimizer=optimizer, max_iters=500)
        self.assertAlmostEqual(m1.log_likelihood(), m2.log_likelihood(), delta=200)
        # self.assertAlmostEqual(m1.log_likelihood(), m3.log_likelihood(), 3)
        preds_mean_lap, preds_var_lap = m1.predict(self.X)
        preds_mean_alt, preds_var_alt = m2.predict(self.X)
        # preds_mean_nested, preds_var_nested = m3.predict(self.X)
        rmse_lap = self.rmse(preds_mean_lap, self.Y)
        rmse_alt = self.rmse(preds_mean_alt, self.Y)
        # rmse_nested = self.rmse(preds_mean_nested, self.Y_noisy)
        # EP may be slightly worse than Laplace here, but not by much.
        if rmse_alt > rmse_lap:
            self.assertAlmostEqual(rmse_lap, rmse_alt, delta=1.5)
        # m3.optimize(optimizer=optimizer, max_iters=500)
if __name__ == "__main__":
    # Run this module's tests directly via the unittest CLI.
    unittest.main()

View file

@ -61,6 +61,18 @@ class InferenceGPEP(unittest.TestCase):
Y = lik.samples(f).reshape(-1,1) Y = lik.samples(f).reshape(-1,1)
return X, Y return X, Y
def genNoisyData(self):
np.random.seed(1)
X = np.random.rand(100,1)
self.real_std = 0.1
noise = np.random.randn(*X[:, 0].shape)*self.real_std
Y = (np.sin(X[:, 0]*2*np.pi) + noise)[:, None]
self.f = np.random.rand(X.shape[0],1)
Y_extra_noisy = Y.copy()
Y_extra_noisy[50] += 4.
# Y_extra_noisy[80:83] -= 2.
return X, Y, Y_extra_noisy
def test_inference_EP(self): def test_inference_EP(self):
from paramz import ObsAr from paramz import ObsAr
X, Y = self.genData() X, Y = self.genData()
@ -73,11 +85,45 @@ class InferenceGPEP(unittest.TestCase):
inference_method=inf, inference_method=inf,
likelihood=lik) likelihood=lik)
K = self.model.kern.K(X) K = self.model.kern.K(X)
mu, Sigma, mu_tilde, tau_tilde, log_Z_tilde = self.model.inference_method.expectation_propagation(K, ObsAr(Y), lik, None)
v_tilde = mu_tilde * tau_tilde post_params, ga_approx, cav_params, log_Z_tilde = self.model.inference_method.expectation_propagation(K, ObsAr(Y), lik, None)
p, m, d = self.model.inference_method._inference(K, tau_tilde, v_tilde, lik, Y_metadata=None, Z_tilde=log_Z_tilde.sum())
p0, m0, d0 = super(GPy.inference.latent_function_inference.expectation_propagation.EP, inf).inference(k, X,lik ,mu_tilde[:,None], mean_function=None, variance=1./tau_tilde, K=K, Z_tilde=log_Z_tilde.sum() + np.sum(- 0.5*np.log(tau_tilde) + 0.5*(v_tilde*v_tilde*1./tau_tilde))) mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
p, m, d = self.model.inference_method._inference(Y, K, ga_approx, cav_params, lik, Y_metadata=None, Z_tilde=log_Z_tilde)
p0, m0, d0 = super(GPy.inference.latent_function_inference.expectation_propagation.EP, inf).inference(k, X,lik ,mu_tilde[:,None], mean_function=None, variance=1./ga_approx.tau, K=K, Z_tilde=log_Z_tilde + np.sum(- 0.5*np.log(ga_approx.tau) + 0.5*(ga_approx.v*ga_approx.v*1./ga_approx.tau)))
assert (np.sum(np.array([m - m0,
np.sum(d['dL_dK'] - d0['dL_dK']),
np.sum(d['dL_dthetaL'] - d0['dL_dthetaL']),
np.sum(d['dL_dm'] - d0['dL_dm']),
np.sum(p._woodbury_vector - p0._woodbury_vector),
np.sum(p.woodbury_inv - p0.woodbury_inv)])) < 1e6)
# NOTE: adding a test like above for parameterized likelihood- the above test is
# only for probit likelihood which does not have any tunable hyperparameter which is why
# the term in dictionary of gradients: dL_dthetaL will always be zero. So here we repeat tests for
# student-t likelihood and heterodescastic gaussian noise case. This test simply checks if the posterior
# and gradients of log marginal are roughly the same for inference through EP and exact gaussian inference using
# the gaussian approximation for the individual likelihood site terms. For probit likelihood, it is possible to
# calculate moments analytically, but for other likelihoods, we will need to use numerical quadrature techniques,
# and it is possible that any error might creep up because of quadrature implementation.
def test_inference_EP_non_classification(self):
from paramz import ObsAr
X, Y, Y_extra_noisy = self.genNoisyData()
deg_freedom = 5.
init_noise_var = 0.08
lik_studentT = GPy.likelihoods.StudentT(deg_free=deg_freedom, sigma2=init_noise_var)
# like_gaussian_noise = GPy.likelihoods.MixedNoise()
k = GPy.kern.RBF(1, variance=2., lengthscale=1.1)
ep_inf_alt = GPy.inference.latent_function_inference.expectation_propagation.EP(max_iters=4, delta=0.5)
# ep_inf_nested = GPy.inference.latent_function_inference.expectation_propagation.EP(ep_mode='nested', max_iters=100, delta=0.5)
m = GPy.core.GP(X=X,Y=Y_extra_noisy,kernel=k,likelihood=lik_studentT,inference_method=ep_inf_alt)
K = m.kern.K(X)
post_params, ga_approx, cav_params, log_Z_tilde = m.inference_method.expectation_propagation(K, ObsAr(Y_extra_noisy), lik_studentT, None)
mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
p, m, d = m.inference_method._inference(Y_extra_noisy, K, ga_approx, cav_params, lik_studentT, Y_metadata=None, Z_tilde=log_Z_tilde)
p0, m0, d0 = super(GPy.inference.latent_function_inference.expectation_propagation.EP, ep_inf_alt).inference(k, X,lik_studentT ,mu_tilde[:,None], mean_function=None, variance=1./ga_approx.tau, K=K, Z_tilde=log_Z_tilde + np.sum(- 0.5*np.log(ga_approx.tau) + 0.5*(ga_approx.v*ga_approx.v*1./ga_approx.tau)))
assert (np.sum(np.array([m - m0, assert (np.sum(np.array([m - m0,
np.sum(d['dL_dK'] - d0['dL_dK']), np.sum(d['dL_dK'] - d0['dL_dK']),

View file

@ -0,0 +1,39 @@
from __future__ import print_function, division
import numpy as np
import GPy
import warnings
from ..util.quad_integrate import quadgk_int, quadvgk
# NOTE(review): np.testing.TestCase was an alias of unittest.TestCase that
# has been removed from modern NumPy — confirm the pinned NumPy version
# still provides it, or switch to unittest.TestCase.
class QuadTests(np.testing.TestCase):
    """
    Tests for the Gauss-Kronrod quadrature implementation.

    Each test integrates a function with a known closed-form integral and
    checks that the quadrature result matches to the stated precision,
    giving a numerical-accuracy check on the implementation.
    """
    def setUp(self):
        # No shared fixtures needed; tests build their own integrands.
        pass
    def test_infinite_quad(self):
        # Moments of exp(-x^2/2) over the real line; the zeroth moment
        # (row 0) is sqrt(2*pi).
        def f(x):
            return np.exp(-0.5*x**2)*np.power(x,np.arange(3)[:,None])
        quad_int_val = quadgk_int(f)
        real_val = np.sqrt(np.pi * 2)
        np.testing.assert_almost_equal(real_val, quad_int_val[0], decimal=7)
    def test_finite_quad(self):
        # Integral of x^2 on [1, 2] is 7/3.
        def f2(x):
            return x**2
        quad_int_val = quadvgk(f2, 1.,2.)
        real_val = 7/3.
        np.testing.assert_almost_equal(real_val, quad_int_val, decimal=5)
if __name__ == '__main__':
    # Standalone smoke check: integrate the first three moments of a
    # standard (unnormalised) Gaussian over the whole real line and
    # compare the zeroth moment against sqrt(2*pi).
    def f(x):
        return np.exp(-0.5 * x ** 2) * np.power(x, np.arange(3)[:, None])
    quad_int_val = quadgk_int(f)
    real_val = np.sqrt(np.pi*2)
    np.testing.assert_almost_equal(real_val, quad_int_val[0], decimal=7)
    print(quadgk_int(f))

View file

@ -17,3 +17,4 @@ from . import multioutput
from . import parallel from . import parallel
from . import functions from . import functions
from . import cluster_with_offset from . import cluster_with_offset
from . import quad_integrate

119
GPy/util/quad_integrate.py Normal file
View file

@ -0,0 +1,119 @@
"""
The file for utilities related to integration by quadrature methods
- will contain implementation for gaussian-kronrod integration.
"""
import numpy as np
def getSubs(Subs, XK, NK=1):
    """Subdivide each integration interval at the quadrature nodes.

    :param Subs: ndarray of shape (2, n); column i holds the (lower, upper)
        bounds of interval i
    :param XK: quadrature nodes on [-1, 1] used as interior split points
    :param NK: row count for the broadcast helper; the default of 1 works
        for any XK because NumPy broadcasting fills in the rest
    :returns: ndarray of shape (2, n*(len(XK)+1)) of refined sub-intervals,
        each original interval split at the mapped node positions
    """
    # Half-width and centre of every interval.
    M = (Subs[1, :] - Subs[0, :]) / 2
    C = (Subs[1, :] + Subs[0, :]) / 2
    # Map the [-1, 1] nodes into each interval: shape (len(XK), n).
    I = XK[:, None] * M + np.ones((NK, 1)) * C
    # Lower bounds: original lower bound followed by the interior nodes;
    # upper bounds: the interior nodes followed by the original upper bound.
    A = np.vstack((Subs[0, :], I)).flatten()
    B = np.vstack((I, Subs[1, :])).flatten()
    return np.vstack((A, B))
def quadvgk(feval, fmin, fmax, tol1=1e-5, tol2=1e-5):
    """Vectorised adaptive Gauss-Kronrod quadrature on [fmin, fmax].

    Port of http://se.mathworks.com/matlabcentral/fileexchange/18801-quadvgk
    (the Gauss-Kronrod scheme also used in GPstuff). Several stacked
    one-dimensional integrands are evaluated simultaneously over a shared
    grid of points; intervals are refined until the Kronrod and embedded
    Gauss estimates agree.

    FIX: tol1/tol2 were previously overwritten with hard-coded 1e-4 inside
    the body, silently ignoring the caller's values; the parameters are
    now honoured.

    :param feval: vectorised integrand mapping x of shape (n,) to values of
        shape (NF, n) (or (n,) for a single integrand)
    :param fmin: finite lower limit
    :param fmax: finite upper limit
    :param tol1: per-interval tolerance on |Q_kronrod - Q_gauss|
    :param tol2: minimum interval width below which refinement stops
    :returns: ndarray of shape (NF,) with the integral of each row of feval
    """
    # 15-point Kronrod nodes on [-1, 1] and their weights.
    XK = np.array([-0.991455371120813, -0.949107912342759, -0.864864423359769, -0.741531185599394,
                   -0.586087235467691, -0.405845151377397, -0.207784955007898, 0.,
                   0.207784955007898, 0.405845151377397, 0.586087235467691,
                   0.741531185599394, 0.864864423359769, 0.949107912342759, 0.991455371120813])
    WK = np.array([0.022935322010529, 0.063092092629979, 0.104790010322250, 0.140653259715525,
                   0.169004726639267, 0.190350578064785, 0.204432940075298, 0.209482141084728,
                   0.204432940075298, 0.190350578064785, 0.169004726639267,
                   0.140653259715525, 0.104790010322250, 0.063092092629979, 0.022935322010529])
    # Embedded 7-point Gauss weights (applied at the odd-indexed Kronrod nodes).
    WG = np.array([0.129484966168870, 0.279705391489277, 0.381830050505119, 0.417959183673469,
                   0.381830050505119, 0.279705391489277, 0.129484966168870])
    NK = WK.size

    def _subdivide(subs):
        # Split every interval at the mapped Kronrod nodes (cf. getSubs).
        half = (subs[1, :] - subs[0, :]) / 2
        centre = (subs[1, :] + subs[0, :]) / 2
        inner = XK[:, None] * half + centre
        lowers = np.vstack((subs[0, :], inner)).flatten()
        uppers = np.vstack((inner, subs[1, :])).flatten()
        return np.vstack((lowers, uppers))

    Subs = np.array([[fmin], [fmax]])
    # Number of stacked integrands, probed with a single dummy point.
    NF = feval(np.zeros(1)).size
    Q = np.zeros(NF)
    while Subs.size > 0:
        Subs = _subdivide(Subs)
        M = (Subs[1, :] - Subs[0, :]) / 2
        C = (Subs[1, :] + Subs[0, :]) / 2
        NM = M.size
        # All evaluation points for all current intervals in one call.
        x = (XK[:, None] * M + C).flatten()
        FV = feval(x)
        # Kronrod (Q1) and embedded Gauss (Q2) estimates per interval;
        # F has shape (NF, NM, NK) after the swap.
        F = FV.reshape(NF, NK, NM).swapaxes(2, 1)
        Q1 = np.dot(F, WK) * M
        Q2 = np.dot(F[:, :, 1::2], WG) * M
        # Accept intervals where the two estimates agree to tol1, or which
        # are already narrower than tol2; keep refining the rest.
        done = np.nonzero(np.logical_or(np.max(np.abs(Q1 - Q2), 0) < tol1,
                                        (Subs[1, :] - Subs[0, :]) < tol2))[0]
        Q = Q + np.sum(Q1[:, done], axis=1)
        Subs = np.delete(Subs, done, axis=1)
    return Q
def quadgk_int(f, fmin=-np.inf, fmax=np.inf, difftol=1e-4):
    """Integrate f over the whole real line with Gauss-Kronrod quadrature.

    The substitution x = r / (1 - r**2) maps r in (-1, 1) onto
    x in (-inf, inf), so the infinite integral becomes a finite one that
    quadvgk can handle; dx/dr = (1 + r**2) / (1 - r**2)**2 is the Jacobian.

    NOTE(review): fmin/fmax are accepted but currently ignored — the
    integral is always taken over (-inf, inf). Confirm no caller relies on
    finite bounds before using them here.

    FIX: difftol was previously overwritten with a hard-coded 1e-4 inside
    the body; the parameter is now honoured and its default is set to the
    old effective value, so default behaviour is unchanged.

    :param f: vectorised integrand mapping x of shape (n,) to (NF, n) values
    :param fmin: unused, see note above
    :param fmax: unused, see note above
    :param difftol: tolerance forwarded to quadvgk
    :returns: ndarray of shape (NF,) with the integral of each row of f
    """
    def trans_func(r):
        r2 = np.square(r)
        x = r / (1 - r2)
        dx_dr = (1 + r2) / (1 - r2) ** 2  # Jacobian of the substitution
        return f(x) * dx_dr

    return quadvgk(trans_func, -1., 1., difftol, difftol)