Merge pull request #544 from SheffieldML/devel

Release GPy 1.8.x
Zhenwen Dai 2017-09-21 18:00:44 +01:00 committed by GitHub
commit d40db2b9af
47 changed files with 14431 additions and 273 deletions

CHANGELOG.md (11416 changed lines)

File diff suppressed because it is too large.

@@ -1 +1 @@
__version__ = "1.8.0"


@@ -109,17 +109,79 @@ class GP(Model):
self.link_parameter(self.likelihood)
self.posterior = None
def to_dict(self, save_data=True):
input_dict = super(GP, self)._to_dict()
input_dict["class"] = "GPy.core.GP"
if not save_data:
input_dict["X"] = None
input_dict["Y"] = None
else:
try:
input_dict["X"] = self.X.values.tolist()
except:
input_dict["X"] = self.X.tolist()
try:
input_dict["Y"] = self.Y.values.tolist()
except:
input_dict["Y"] = self.Y.tolist()
input_dict["kernel"] = self.kern.to_dict()
input_dict["likelihood"] = self.likelihood.to_dict()
if self.mean_function is not None:
input_dict["mean_function"] = self.mean_function.to_dict()
input_dict["inference_method"] = self.inference_method.to_dict()
#FIXME: Assumes the Y_metadata is serializable. We should create a Metadata class
if self.Y_metadata is not None:
input_dict["Y_metadata"] = self.Y_metadata
if self.normalizer is not None:
input_dict["normalizer"] = self.normalizer.to_dict()
return input_dict
@staticmethod
def _from_dict(input_dict, data=None):
import GPy
import numpy as np
if (input_dict['X'] is None) or (input_dict['Y'] is None):
assert(data is not None)
input_dict["X"], input_dict["Y"] = np.array(data[0]), np.array(data[1])
elif data is not None:
print("WARNING: The model has been saved with X,Y! The original values are being overriden!")
input_dict["X"], input_dict["Y"] = np.array(data[0]), np.array(data[1])
else:
input_dict["X"], input_dict["Y"] = np.array(input_dict['X']), np.array(input_dict['Y'])
input_dict["kernel"] = GPy.kern.Kern.from_dict(input_dict["kernel"])
input_dict["likelihood"] = GPy.likelihoods.likelihood.Likelihood.from_dict(input_dict["likelihood"])
mean_function = input_dict.get("mean_function")
if mean_function is not None:
input_dict["mean_function"] = GPy.core.mapping.Mapping.from_dict(mean_function)
else:
input_dict["mean_function"] = mean_function
input_dict["inference_method"] = GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(input_dict["inference_method"])
#FIXME: Assumes the Y_metadata is serializable. We should create a Metadata class
Y_metadata = input_dict.get("Y_metadata")
input_dict["Y_metadata"] = Y_metadata
normalizer = input_dict.get("normalizer")
if normalizer is not None:
input_dict["normalizer"] = GPy.util.normalizer._Norm.from_dict(normalizer)
else:
input_dict["normalizer"] = normalizer
return GP(**input_dict)
def save_model(self, output_filename, compress=True, save_data=True):
self._save_model(output_filename, compress=compress, save_data=save_data)
# The predictive variable to be used to predict using the posterior object's
# woodbury_vector and woodbury_inv is defined as predictive_variable
# as long as the posterior has the right woodbury entries.
# It is the input variable used for the covariance between
# X_star and the posterior of the GP.
# This is usually just a link to self.X (full GP) or self.Z (sparse GP).
# Make sure to name this variable and the predict functions will "just work"
# In maths the predictive variable is:
# K_{xx} - K_{xp}W_{pp}^{-1}K_{px}
# W_{pp} := \texttt{Woodbury inv}
# p := _predictive_variable
@property
def _predictive_variable(self):
@@ -305,9 +367,9 @@ class GP(Model):
m, v = self._raw_predict(X, full_cov=False, kern=kern)
if likelihood is None:
likelihood = self.likelihood
quantiles = likelihood.predictive_quantiles(m, v, quantiles, Y_metadata=Y_metadata)
if self.normalizer is not None:
quantiles = [self.normalizer.inverse_mean(q) for q in quantiles]
return quantiles
@@ -616,4 +678,3 @@ class GP(Model):
"""
mu_star, var_star = self._raw_predict(x_test)
return self.likelihood.log_predictive_density_sampling(y_test, mu_star, var_star, Y_metadata=Y_metadata, num_samples=num_samples)
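A minimal usage sketch of the serialization API added above, on hypothetical synthetic data (it assumes the RBF kernel, the Gaussian likelihood and the exact inference object also gained to_dict in this release):

import numpy as np
import GPy

X = np.random.rand(20, 1)
Y = np.sin(6 * X) + 0.05 * np.random.randn(20, 1)
m = GPy.core.GP(X, Y,
                kernel=GPy.kern.RBF(1),
                likelihood=GPy.likelihoods.Gaussian(),
                inference_method=GPy.inference.latent_function_inference.ExactGaussianInference())

d = m.to_dict(save_data=False)              # X and Y are stored as None
m2 = GPy.core.GP.from_dict(d, data=(X, Y))  # so the data has to be supplied when rebuilding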


@@ -25,6 +25,30 @@ class Mapping(Parameterized):
def update_gradients(self, dL_dF, X):
raise NotImplementedError
def to_dict(self):
raise NotImplementedError
def _to_dict(self):
input_dict = {}
input_dict["input_dim"] = self.input_dim
input_dict["output_dim"] = self.output_dim
input_dict["name"] = self.name
return input_dict
@staticmethod
def from_dict(input_dict):
import copy
input_dict = copy.deepcopy(input_dict)
mapping_class = input_dict.pop('class')
input_dict["name"] = str(input_dict["name"])
import GPy
mapping_class = eval(mapping_class)
return mapping_class._from_dict(mapping_class, input_dict)
@staticmethod
def _from_dict(mapping_class, input_dict):
return mapping_class(**input_dict)
class Bijective_mapping(Mapping):
"""
@@ -37,5 +61,3 @@ class Bijective_mapping(Mapping):
def g(self, f):
"""Inverse mapping from output domain of the function to the inputs."""
raise NotImplementedError


@@ -8,6 +8,61 @@ class Model(ParamzModel, Priorizable):
def __init__(self, name):
super(Model, self).__init__(name) # Parameterized.__init__(self)
def _to_dict(self):
input_dict = {}
input_dict["name"] = self.name
return input_dict
def to_dict(self):
raise NotImplementedError
@staticmethod
def from_dict(input_dict, data=None):
import copy
input_dict = copy.deepcopy(input_dict)
model_class = input_dict.pop('class')
input_dict["name"] = str(input_dict["name"])
import GPy
model_class = eval(model_class)
return model_class._from_dict(input_dict, data)
@staticmethod
def _from_dict(model_class, input_dict, data=None):
return model_class(**input_dict)
def save_model(self, output_filename, compress=True, save_data=True):
raise NotImplementedError
def _save_model(self, output_filename, compress=True, save_data=True):
import json
output_dict = self.to_dict(save_data)
if compress:
import gzip
with gzip.GzipFile(output_filename + ".zip", 'w') as outfile:
json_str = json.dumps(output_dict)
json_bytes = json_str.encode('utf-8')
outfile.write(json_bytes)
else:
with open(output_filename + ".json", 'w') as outfile:
json.dump(output_dict, outfile)
@staticmethod
def load_model(output_filename, data=None):
compress = output_filename.split(".")[-1] == "zip"
import json
if compress:
import gzip
with gzip.GzipFile(output_filename, 'r') as json_data:
json_bytes = json_data.read()
json_str = json_bytes.decode('utf-8')
output_dict = json.loads(json_str)
else:
with open(output_filename) as json_data:
output_dict = json.load(json_data)
import GPy
return GPy.core.model.Model.from_dict(output_dict, data)
def log_likelihood(self):
raise NotImplementedError("this needs to be implemented to use the model class")


@@ -41,6 +41,26 @@ class LatentFunctionInference(object):
"""
pass
def _to_dict(self):
input_dict = {}
return input_dict
def to_dict(self):
raise NotImplementedError
@staticmethod
def from_dict(input_dict):
import copy
input_dict = copy.deepcopy(input_dict)
inference_class = input_dict.pop('class')
import GPy
inference_class = eval(inference_class)
return inference_class._from_dict(inference_class, input_dict)
@staticmethod
def _from_dict(inference_class, input_dict):
return inference_class(**input_dict)
class InferenceMethodList(LatentFunctionInference, list):
def on_optimization_start(self):


@@ -21,6 +21,11 @@ class ExactGaussianInference(LatentFunctionInference):
def __init__(self):
pass#self._YYTfactor_cache = caching.cache()
def to_dict(self):
input_dict = super(ExactGaussianInference, self)._to_dict()
input_dict["class"] = "GPy.inference.latent_function_inference.exact_gaussian_inference.ExactGaussianInference"
return input_dict
def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, K=None, variance=None, Z_tilde=None):
"""
Returns a Posterior class containing essential quantities of the posterior


@@ -6,11 +6,141 @@ from paramz import ObsAr
from . import ExactGaussianInference, VarDTC
from ...util import diag
from .posterior import PosteriorEP as Posterior
from ...likelihoods import Gaussian
from . import LatentFunctionInference
log_2_pi = np.log(2*np.pi)
#Four wrapper classes to help modularisation of different EP versions
class marginalMoments(object):
def __init__(self, num_data):
self.Z_hat = np.empty(num_data,dtype=np.float64)
self.mu_hat = np.empty(num_data,dtype=np.float64)
self.sigma2_hat = np.empty(num_data,dtype=np.float64)
class cavityParams(object):
def __init__(self, num_data):
self.tau = np.empty(num_data,dtype=np.float64)
self.v = np.empty(num_data,dtype=np.float64)
def _update_i(self, eta, ga_approx, post_params, i):
self.tau[i] = 1./post_params.Sigma_diag[i] - eta*ga_approx.tau[i]
self.v[i] = post_params.mu[i]/post_params.Sigma_diag[i] - eta*ga_approx.v[i]
def to_dict(self):
return {"tau": self.tau.tolist(), "v": self.v.tolist()}
@staticmethod
def from_dict(input_dict):
c = cavityParams(len(input_dict["tau"]))
c.tau = np.array(input_dict["tau"])
c.v = np.array(input_dict["v"])
return c
class gaussianApproximation(object):
def __init__(self, v, tau):
self.tau = tau
self.v = v
def _update_i(self, eta, delta, post_params, marg_moments, i):
#Site parameters update
delta_tau = delta/eta*(1./marg_moments.sigma2_hat[i] - 1./post_params.Sigma_diag[i])
delta_v = delta/eta*(marg_moments.mu_hat[i]/marg_moments.sigma2_hat[i] - post_params.mu[i]/post_params.Sigma_diag[i])
tau_tilde_prev = self.tau[i]
self.tau[i] += delta_tau
# Enforce positivity of tau_tilde. Even though this is guaranteed for logconcave sites, it is still possible
# to get negative values due to numerical errors. Moreover, the value of tau_tilde should be positive in order to
# update the marginal likelihood without running into instability issues.
if self.tau[i] < np.finfo(float).eps:
self.tau[i] = np.finfo(float).eps
delta_tau = self.tau[i] - tau_tilde_prev
self.v[i] += delta_v
return (delta_tau, delta_v)
def to_dict(self):
return {"tau": self.tau.tolist(), "v": self.v.tolist()}
@staticmethod
def from_dict(input_dict):
return gaussianApproximation(np.array(input_dict["v"]), np.array(input_dict["tau"]))
class posteriorParamsBase(object):
def __init__(self, mu, Sigma_diag):
self.mu = mu
self.Sigma_diag = Sigma_diag
def _update_rank1(self, *arg):
pass
def _recompute(self, *arg):
pass
class posteriorParams(posteriorParamsBase):
def __init__(self, mu, Sigma, L=None):
self.Sigma = Sigma
self.L = L
Sigma_diag = np.diag(self.Sigma)
super(posteriorParams, self).__init__(mu, Sigma_diag)
def _update_rank1(self, delta_tau, ga_approx, i):
ci = delta_tau/(1.+ delta_tau*self.Sigma_diag[i])
DSYR(self.Sigma, self.Sigma[:,i].copy(), -ci)
self.mu = np.dot(self.Sigma, ga_approx.v)
def to_dict(self):
#TODO: Implement a more memory efficient variant
if self.L is None:
return { "mu": self.mu.tolist(), "Sigma": self.Sigma.tolist()}
else:
return { "mu": self.mu.tolist(), "Sigma": self.Sigma.tolist(), "L": self.L.tolist()}
@staticmethod
def from_dict(input_dict):
if "L" in input_dict:
return posteriorParams(np.array(input_dict["mu"]), np.array(input_dict["Sigma"]), np.array(input_dict["L"]))
else:
return posteriorParams(np.array(input_dict["mu"]), np.array(input_dict["Sigma"]))
@staticmethod
def _recompute(K, ga_approx):
num_data = len(ga_approx.tau)
tau_tilde_root = np.sqrt(ga_approx.tau)
Sroot_tilde_K = tau_tilde_root[:,None] * K
B = np.eye(num_data) + Sroot_tilde_K * tau_tilde_root[None,:]
L = jitchol(B)
V, _ = dtrtrs(L, Sroot_tilde_K, lower=1)
Sigma = K - np.dot(V.T,V) #K - K S^(1/2) B^(-1) S^(1/2) K = (K^(-1) + \Sigma^(-1))^(-1)
mu = np.dot(Sigma,ga_approx.v)
return posteriorParams(mu=mu, Sigma=Sigma, L=L)
class posteriorParamsDTC(posteriorParamsBase):
def __init__(self, mu, Sigma_diag):
super(posteriorParamsDTC, self).__init__(mu, Sigma_diag)
def _update_rank1(self, LLT, Kmn, delta_v, delta_tau, i):
#DSYR(Sigma, Sigma[:,i].copy(), -delta_tau/(1.+ delta_tau*Sigma[i,i]))
DSYR(LLT,Kmn[:,i].copy(),delta_tau)
L = jitchol(LLT)
V,info = dtrtrs(L,Kmn,lower=1)
self.Sigma_diag = np.maximum(np.sum(V*V,-2), np.finfo(float).eps) #diag(K_nm (L L^\top)^(-1)) K_mn
si = np.sum(V.T*V[:,i],-1) #(V V^\top)[:,i]
self.mu += (delta_v-delta_tau*self.mu[i])*si
#mu = np.dot(Sigma, v_tilde)
@staticmethod
def _recompute(LLT0, Kmn, ga_approx):
LLT = LLT0 + np.dot(Kmn*ga_approx.tau[None,:],Kmn.T)
L = jitchol(LLT)
V, _ = dtrtrs(L,Kmn,lower=1)
#Sigma_diag = np.sum(V*V,-2)
#Knmv_tilde = np.dot(Kmn,v_tilde)
#mu = np.dot(V2.T,Knmv_tilde)
Sigma = np.dot(V.T,V)
mu = np.dot(Sigma, ga_approx.v)
Sigma_diag = np.diag(Sigma).copy()
return posteriorParamsDTC(mu, Sigma_diag), LLT
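For reference, the updates these wrapper classes encapsulate are the standard EP equations (notation as in Rasmussen & Williams 2006, matching the code above):

cavity (cavityParams._update_i):         \tau_{-i} = \Sigma_{ii}^{-1} - \eta\tilde{\tau}_i ,   \nu_{-i} = \mu_i\Sigma_{ii}^{-1} - \eta\tilde{\nu}_i
site (gaussianApproximation._update_i):  \Delta\tilde{\tau}_i = (\delta/\eta)(\hat{\sigma}_i^{-2} - \Sigma_{ii}^{-1}) ,   \Delta\tilde{\nu}_i = (\delta/\eta)(\hat{\mu}_i\hat{\sigma}_i^{-2} - \mu_i\Sigma_{ii}^{-1})
posterior (posteriorParams._recompute):  B = I + \tilde{S}^{1/2} K \tilde{S}^{1/2} ,   \Sigma = K - K\tilde{S}^{1/2} B^{-1} \tilde{S}^{1/2} K ,   \mu = \Sigma\tilde{\nu}

where \tilde{S} = diag(\tilde{\tau}) and (\hat{\mu}_i, \hat{\sigma}_i^2) are the moments returned by likelihood.moments_match_ep.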
class EPBase(object):
def __init__(self, epsilon=1e-6, eta=1., delta=1., always_reset=False, max_iters=np.inf, ep_mode="alternated", parallel_updates=False, loading=False):
"""
The expectation-propagation algorithm.
For nomenclature see Rasmussen & Williams 2006.
@@ -26,16 +156,20 @@ class EPBase(object):
:max_iters: int
:ep_mode: string. It can be "nested" (EP is run every time the hyperparameters change) or "alternated" (EP is run at the beginning and the hyperparameters are then optimized).
:parallel_updates: boolean. If true, the site parameters are updated in parallel.
:loading: boolean. If True, prevents the EP parameters from changing. Hack used when loading a serialized model.
"""
super(EPBase, self).__init__()
self.always_reset = always_reset
self.epsilon, self.eta, self.delta, self.max_iters = epsilon, eta, delta, max_iters
self.ep_mode = ep_mode
self.parallel_updates = parallel_updates
#FIXME: Hack for serialization. If True, prevents the EP parameters from changing when loading a serialized model
self.loading = loading
self.reset()
def reset(self):
self.ga_approx_old = None
self._ep_approximation = None
def on_optimization_start(self):
@@ -45,6 +179,11 @@ class EPBase(object):
# TODO: update approximation in the end as well? Maybe even with a switch?
pass
def _stop_criteria(self, ga_approx):
tau_diff = np.mean(np.square(ga_approx.tau-self.ga_approx_old.tau))
v_diff = np.mean(np.square(ga_approx.v-self.ga_approx_old.v))
return ((tau_diff < self.epsilon) and (v_diff < self.epsilon))
def __setstate__(self, state):
super(EPBase, self).__setstate__(state[0])
self.epsilon, self.eta, self.delta = state[1]
@@ -53,9 +192,21 @@
def __getstate__(self):
return [super(EPBase, self).__getstate__() , [self.epsilon, self.eta, self.delta]]
def _to_dict(self):
input_dict = super(EPBase, self)._to_dict()
input_dict["epsilon"]=self.epsilon
input_dict["eta"]=self.eta
input_dict["delta"]=self.delta
input_dict["always_reset"]=self.always_reset
input_dict["max_iters"]=self.max_iters
input_dict["ep_mode"]=self.ep_mode
input_dict["parallel_updates"]=self.parallel_updates
input_dict["loading"]=True
return input_dict
class EP(EPBase, ExactGaussianInference):
def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, precision=None, K=None):
if self.always_reset and not self.loading:
self.reset()
num_data, output_dim = Y.shape
@@ -64,22 +215,22 @@ class EP(EPBase, ExactGaussianInference):
if K is None:
K = kern.K(X)
if self.ep_mode=="nested" and not self.loading:
#Force EP at each step of the optimization
self._ep_approximation = None
post_params, ga_approx, cav_params, log_Z_tilde = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata)
elif self.ep_mode=="alternated" or self.loading:
if getattr(self, '_ep_approximation', None) is None:
#if we don't yet have the results of running EP, run EP and store the computed factors in self._ep_approximation
post_params, ga_approx, cav_params, log_Z_tilde = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata)
else:
#if we've already run EP, just use the existing approximation stored in self._ep_approximation
post_params, ga_approx, cav_params, log_Z_tilde = self._ep_approximation
else:
raise ValueError("ep_mode value not valid")
self.loading = False
return self._inference(Y, K, ga_approx, cav_params, likelihood, Y_metadata=Y_metadata, Z_tilde=log_Z_tilde)
def expectation_propagation(self, K, Y, likelihood, Y_metadata):
@@ -90,41 +241,57 @@ class EP(EPBase, ExactGaussianInference):
# than ObsArrays
Y = Y.values.copy()
#Initial values - Marginal moments, cavity params, gaussian approximation params and posterior params
marg_moments = marginalMoments(num_data)
cav_params = cavityParams(num_data)
ga_approx, post_params = self._init_approximations(K, num_data)
#Approximation
stop = False
iterations = 0
while not stop and (iterations < self.max_iters):
self._local_updates(num_data, cav_params, post_params, marg_moments, ga_approx, likelihood, Y, Y_metadata)
#(re) compute Sigma and mu using a full Cholesky decomposition
post_params = posteriorParams._recompute(K, ga_approx)
#monitor convergence
if iterations > 0:
stop = self._stop_criteria(ga_approx)
self.ga_approx_old = gaussianApproximation(ga_approx.v.copy(), ga_approx.tau.copy())
iterations += 1
# Z_tilde after removing the terms that can lead to infinite terms due to tau_tilde close to zero.
# These terms cancel with the corresponding terms in the marginal log-likelihood.
log_Z_tilde = self._log_Z_tilde(marg_moments, ga_approx, cav_params)
# - 0.5*np.log(tau_tilde) + 0.5*(v_tilde*v_tilde*1./tau_tilde)
return (post_params, ga_approx, cav_params, log_Z_tilde)
def _init_approximations(self, K, num_data):
#initial values - Gaussian factors
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
if self.ga_approx_old is None:
v_tilde, tau_tilde = np.zeros((2, num_data))
ga_approx = gaussianApproximation(v_tilde, tau_tilde)
Sigma = K.copy()
diag.add(Sigma, 1e-7)
mu = np.zeros(num_data)
post_params = posteriorParams(mu, Sigma)
else:
assert self.ga_approx_old.v.size == num_data, "data size mis-match: did you change the data? try resetting!"
ga_approx = gaussianApproximation(self.ga_approx_old.v, self.ga_approx_old.tau)
post_params = posteriorParams._recompute(K, ga_approx)
diag.add(post_params.Sigma, 1e-7)
# TODO: Check the log-marginal under both conditions and choose the best one
return (ga_approx, post_params)
def _local_updates(self, num_data, cav_params, post_params, marg_moments, ga_approx, likelihood, Y, Y_metadata, update_order=None):
if update_order is None:
update_order = np.random.permutation(num_data)
for i in update_order:
#Cavity distribution parameters
cav_params._update_i(self.eta, ga_approx, post_params, i)
if Y_metadata is not None:
# Pick out the relevant metadata for Yi
Y_metadata_i = {}
@@ -133,93 +300,77 @@
else:
Y_metadata_i = None
#Marginal moments
marg_moments.Z_hat[i], marg_moments.mu_hat[i], marg_moments.sigma2_hat[i] = likelihood.moments_match_ep(Y[i], cav_params.tau[i], cav_params.v[i], Y_metadata_i=Y_metadata_i)
#Site parameters update
delta_tau, delta_v = ga_approx._update_i(self.eta, self.delta, post_params, marg_moments, i)
if self.parallel_updates == False:
post_params._update_rank1(delta_tau, ga_approx, i)
def _log_Z_tilde(self, marg_moments, ga_approx, cav_params):
return np.sum((np.log(marg_moments.Z_hat) + 0.5*np.log(2*np.pi) + 0.5*np.log(1+ga_approx.tau/cav_params.tau) - 0.5 * ((ga_approx.v)**2 * 1./(cav_params.tau + ga_approx.tau))
+ 0.5*(cav_params.v * ( ( (ga_approx.tau/cav_params.tau) * cav_params.v - 2.0 * ga_approx.v ) * 1./(cav_params.tau + ga_approx.tau)))))
def _ep_marginal(self, K, ga_approx, Z_tilde):
post_params = posteriorParams._recompute(K, ga_approx)
# Gaussian log marginal excluding terms that can go to infinity due to arbitrarily small tau_tilde.
# These terms cancel out with the terms excluded from Z_tilde
B_logdet = np.sum(2.0*np.log(np.diag(post_params.L)))
log_marginal = 0.5*(-len(ga_approx.tau) * log_2_pi - B_logdet + np.sum(ga_approx.v * np.dot(post_params.Sigma,ga_approx.v)))
log_marginal += Z_tilde
return log_marginal, post_params
def _inference(self, Y, K, ga_approx, cav_params, likelihood, Z_tilde, Y_metadata=None):
log_marginal, post_params = self._ep_marginal(K, ga_approx, Z_tilde)
tau_tilde_root = np.sqrt(ga_approx.tau)
Sroot_tilde_K = tau_tilde_root[:,None] * K
aux_alpha , _ = dpotrs(post_params.L, np.dot(Sroot_tilde_K, ga_approx.v), lower=1)
alpha = (ga_approx.v - tau_tilde_root * aux_alpha)[:,None] #(K + Sigma^(\tilde))^(-1) \mu^(\tilde)
LWi, _ = dtrtrs(post_params.L, np.diag(tau_tilde_root), lower=1)
Wi = np.dot(LWi.T,LWi)
symmetrify(Wi) #(K + Sigma^(\tilde))^(-1)
dL_dK = 0.5 * (tdot(alpha) - Wi)
dL_dthetaL = likelihood.ep_gradients(Y, cav_params.tau, cav_params.v, np.diag(dL_dK), Y_metadata=Y_metadata, quad_mode='gh')
return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL, 'dL_dm':alpha}
def to_dict(self):
input_dict = super(EP, self)._to_dict()
input_dict["class"] = "GPy.inference.latent_function_inference.expectation_propagation.EP"
if self.ga_approx_old is not None:
input_dict["ga_approx_old"] = self.ga_approx_old.to_dict()
if self._ep_approximation is not None:
input_dict["_ep_approximation"] = {}
input_dict["_ep_approximation"]["post_params"] = self._ep_approximation[0].to_dict()
input_dict["_ep_approximation"]["ga_approx"] = self._ep_approximation[1].to_dict()
input_dict["_ep_approximation"]["cav_params"] = self._ep_approximation[2].to_dict()
input_dict["_ep_approximation"]["log_Z_tilde"] = self._ep_approximation[3].tolist()
return input_dict
@staticmethod
def _from_dict(inference_class, input_dict):
ga_approx_old = input_dict.pop('ga_approx_old', None)
if ga_approx_old is not None:
ga_approx_old = gaussianApproximation.from_dict(ga_approx_old)
_ep_approximation_dict = input_dict.pop('_ep_approximation', None)
_ep_approximation = None
if _ep_approximation_dict is not None:
_ep_approximation = []
_ep_approximation.append(posteriorParams.from_dict(_ep_approximation_dict["post_params"]))
_ep_approximation.append(gaussianApproximation.from_dict(_ep_approximation_dict["ga_approx"]))
_ep_approximation.append(cavityParams.from_dict(_ep_approximation_dict["cav_params"]))
_ep_approximation.append(np.array(_ep_approximation_dict["log_Z_tilde"]))
ee = EP(**input_dict)
ee.ga_approx_old = ga_approx_old
ee._ep_approximation = _ep_approximation
return ee
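A hedged sketch of how the EP serialization added here could be exercised (the Bernoulli classification setup and filename are illustrative; the module path matches the "class" string stored by EP.to_dict):

import numpy as np
import GPy

X = np.random.rand(30, 1)
Y = (np.sin(6 * X) > 0).astype(float)
ep = GPy.inference.latent_function_inference.expectation_propagation.EP(ep_mode="alternated")
m = GPy.core.GP(X, Y, kernel=GPy.kern.RBF(1),
                likelihood=GPy.likelihoods.Bernoulli(),
                inference_method=ep)
m.save_model("gp_ep")   # EP.to_dict also stores ga_approx_old and the cached _ep_approximation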
class EPDTC(EPBase, VarDTC):
def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None, Lm=None, dL_dKmm=None, psi0=None, psi1=None, psi2=None):
@@ -244,24 +395,25 @@ class EPDTC(EPBase, VarDTC):
if self.ep_mode=="nested":
#Force EP at each step of the optimization
self._ep_approximation = None
post_params, ga_approx, log_Z_tilde = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata)
elif self.ep_mode=="alternated":
if getattr(self, '_ep_approximation', None) is None:
#if we don't yet have the results of running EP, run EP and store the computed factors in self._ep_approximation
post_params, ga_approx, log_Z_tilde = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata)
else:
#if we've already run EP, just use the existing approximation stored in self._ep_approximation
post_params, ga_approx, log_Z_tilde = self._ep_approximation
else:
raise ValueError("ep_mode value not valid")
mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
return super(EPDTC, self).inference(kern, X, Z, likelihood, ObsAr(mu_tilde[:,None]),
mean_function=mean_function,
Y_metadata=Y_metadata,
precision=ga_approx.tau,
Lm=Lm, dL_dKmm=dL_dKmm,
psi0=psi0, psi1=psi1, psi2=psi2, Z_tilde=log_Z_tilde)
def expectation_propagation(self, Kmm, Kmn, Y, likelihood, Y_metadata):
@@ -272,119 +424,91 @@ class EPDTC(EPBase, VarDTC):
# than ObsArrays
Y = Y.values.copy()
#Initial values - Marginal moments, cavity params, gaussian approximation params and posterior params
marg_moments = marginalMoments(num_data)
cav_params = cavityParams(num_data)
ga_approx, post_params, LLT0, LLT = self._init_approximations(Kmm, Kmn, num_data)
#Approximation
stop = False
iterations = 0
while not stop and (iterations < self.max_iters):
self._local_updates(num_data, LLT0, LLT, Kmn, cav_params, post_params, marg_moments, ga_approx, likelihood, Y, Y_metadata)
#(re) compute Sigma, Sigma_diag and mu using a full Cholesky decomposition
post_params, LLT = posteriorParamsDTC._recompute(LLT0, Kmn, ga_approx)
post_params.Sigma_diag = np.maximum(post_params.Sigma_diag, np.finfo(float).eps)
#monitor convergence
if iterations > 0:
stop = self._stop_criteria(ga_approx)
self.ga_approx_old = gaussianApproximation(ga_approx.v.copy(), ga_approx.tau.copy())
iterations += 1
log_Z_tilde = self._log_Z_tilde(marg_moments, ga_approx, cav_params)
return post_params, ga_approx, log_Z_tilde
def _log_Z_tilde(self, marg_moments, ga_approx, cav_params):
mu_tilde = ga_approx.v/ga_approx.tau
mu_cav = cav_params.v/cav_params.tau
sigma2_sigma2tilde = 1./cav_params.tau + 1./ga_approx.tau
return np.sum((np.log(marg_moments.Z_hat) + 0.5*np.log(2*np.pi) + 0.5*np.log(sigma2_sigma2tilde)
+ 0.5*((mu_cav - mu_tilde)**2) / (sigma2_sigma2tilde)))
def _init_approximations(self, Kmm, Kmn, num_data):
#initial values - Gaussian factors
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
LLT0 = Kmm.copy()
Lm = jitchol(LLT0) #K_m = L_m L_m^\top
Vm,info = dtrtrs(Lm, Kmn,lower=1)
# Lmi = dtrtri(Lm)
# Kmmi = np.dot(Lmi.T,Lmi)
# KmmiKmn = np.dot(Kmmi,Kmn)
# Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
Qnn_diag = np.sum(Vm*Vm,-2) #diag(Knm Kmm^(-1) Kmn)
#diag.add(LLT0, 1e-8)
if self.ga_approx_old is None:
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
LLT = LLT0.copy() #Sigma = K.copy()
mu = np.zeros(num_data)
Sigma_diag = Qnn_diag.copy() + 1e-8
v_tilde, tau_tilde = np.zeros((2, num_data))
ga_approx = gaussianApproximation(v_tilde, tau_tilde)
post_params = posteriorParamsDTC(mu, Sigma_diag)
else:
assert self.ga_approx_old.v.size == num_data, "data size mis-match: did you change the data? try resetting!"
ga_approx = gaussianApproximation(self.ga_approx_old.v, self.ga_approx_old.tau)
post_params, LLT = posteriorParamsDTC._recompute(LLT0, Kmn, ga_approx)
post_params.Sigma_diag += 1e-8
# TODO: Check the log-marginal under both conditions and choose the best one
return (ga_approx, post_params, LLT0, LLT)
def _local_updates(self, num_data, LLT0, LLT, Kmn, cav_params, post_params, marg_moments, ga_approx, likelihood, Y, Y_metadata, update_order=None):
if update_order is None:
update_order = np.random.permutation(num_data)
for i in update_order:
#Cavity distribution parameters
cav_params._update_i(self.eta, ga_approx, post_params, i)
if Y_metadata is not None:
# Pick out the relevant metadata for Yi
Y_metadata_i = {}
for key in Y_metadata.keys():
Y_metadata_i[key] = Y_metadata[key][i, :]
else:
Y_metadata_i = None
#Marginal moments
marg_moments.Z_hat[i], marg_moments.mu_hat[i], marg_moments.sigma2_hat[i] = likelihood.moments_match_ep(Y[i], cav_params.tau[i], cav_params.v[i], Y_metadata_i=Y_metadata_i)
#Site parameters update
delta_tau, delta_v = ga_approx._update_i(self.eta, self.delta, post_params, marg_moments, i)
#Posterior distribution parameters update
if self.parallel_updates == False:
post_params._update_rank1(LLT, Kmn, delta_v, delta_tau, i)


@@ -13,9 +13,9 @@ class Add(CombinationKernel):
propagates gradients through.
This kernel will take over the active dims of its subkernels passed in.
NOTE: The subkernels will be copies of the original kernels, to prevent
unexpected behavior.
"""
def __init__(self, subkerns, name='sum'):
_newkerns = []
@@ -26,7 +26,7 @@ class Add(CombinationKernel):
_newkerns.append(part.copy())
else:
_newkerns.append(kern.copy())
super(Add, self).__init__(_newkerns, name)
self._exact_psicomp = self._check_exact_psicomp()
@@ -43,6 +43,11 @@ class Add(CombinationKernel):
else:
return False
def to_dict(self):
input_dict = super(Add, self)._to_dict()
input_dict["class"] = str("GPy.kern.Add")
return input_dict
@Cache_this(limit=3, force_kwargs=['which_parts'])
def K(self, X, X2=None, which_parts=None):
"""


@@ -60,6 +60,35 @@ class Kern(Parameterized):
from .psi_comp import PSICOMP_GH
self.psicomp = PSICOMP_GH()
def _to_dict(self):
input_dict = {}
input_dict["input_dim"] = self.input_dim
if isinstance(self.active_dims, np.ndarray):
input_dict["active_dims"] = self.active_dims.tolist()
else:
input_dict["active_dims"] = self.active_dims
input_dict["name"] = self.name
input_dict["useGPU"] = self.useGPU
return input_dict
def to_dict(self):
raise NotImplementedError
@staticmethod
def from_dict(input_dict):
import copy
input_dict = copy.deepcopy(input_dict)
kernel_class = input_dict.pop('class')
input_dict["name"] = str(input_dict["name"])
import GPy
kernel_class = eval(kernel_class)
return kernel_class._from_dict(kernel_class, input_dict)
@staticmethod
def _from_dict(kernel_class, input_dict):
return kernel_class(**input_dict)
def __setstate__(self, state):
self._all_dims_active = np.arange(0, max(state['active_dims']) + 1)
super(Kern, self).__setstate__(state)
@@ -342,6 +371,21 @@ class CombinationKernel(Kern):
self.extra_dims = extra_dims
self.link_parameters(*kernels)
def _to_dict(self):
input_dict = super(CombinationKernel, self)._to_dict()
input_dict["parts"] = {}
for ii in range(len(self.parts)):
input_dict["parts"][ii] = self.parts[ii].to_dict()
return input_dict
@staticmethod
def _from_dict(kernel_class, input_dict):
parts = input_dict.pop('parts', None)
subkerns = []
for pp in parts:
subkerns.append(Kern.from_dict(parts[pp]))
return kernel_class(subkerns)
@property
def parts(self):
return self.parameters
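A minimal round-trip sketch for the kernel serialization added above (parameter values are illustrative; the sum kernel goes through Add.to_dict and CombinationKernel._to_dict):

import GPy

k = GPy.kern.RBF(1, variance=2.0, lengthscale=0.5) + GPy.kern.Bias(1)
d = k.to_dict()                    # {"class": "GPy.kern.Add", "parts": {...}, ...}
k2 = GPy.kern.Kern.from_dict(d)    # dispatches on the stored "class" string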


@@ -51,6 +51,18 @@ class Linear(Kern):
self.link_parameter(self.variances)
self.psicomp = PSICOMP_Linear()
def to_dict(self):
input_dict = super(Linear, self)._to_dict()
input_dict["class"] = "GPy.kern.Linear"
input_dict["variances"] = self.variances.values.tolist()
input_dict["ARD"] = self.ARD
return input_dict
@staticmethod
def _from_dict(kernel_class, input_dict):
useGPU = input_dict.pop('useGPU', None)
return Linear(**input_dict)
@Cache_this(limit=3)
def K(self, X, X2=None):
if self.ARD:
@@ -211,5 +223,3 @@ class LinearFull(Kern):
def gradients_X_diag(self, dL_dKdiag, X):
P = np.dot(self.W, self.W.T) + np.diag(self.kappa)
return 2.*np.einsum('jk,i,ij->ik', P, dL_dKdiag, X)


@@ -400,4 +400,3 @@ class PeriodicMatern52(Periodic):
self.variance.gradient = np.sum(dK_dvar*dL_dK)
self.lengthscale.gradient = np.sum(dK_dlen*dL_dK)
self.period.gradient = np.sum(dK_dper*dL_dK)


@@ -39,6 +39,11 @@ class Prod(CombinationKernel):
kernels.insert(i, part)
super(Prod, self).__init__(kernels, name)
def to_dict(self):
input_dict = super(Prod, self)._to_dict()
input_dict["class"] = str("GPy.kern.Prod")
return input_dict
@Cache_this(limit=3, force_kwargs=['which_parts'])
def K(self, X, X2=None, which_parts=None):
if which_parts is None:
@@ -117,7 +122,7 @@ class Prod(CombinationKernel):
part_param_num = len(p.param_array) # number of parameters in the part
p.sde_update_gradient_full(gradients[part_start_param_index:(part_start_param_index+part_param_num)])
part_start_param_index += part_param_num
def sde(self):
"""
"""
@@ -131,88 +136,88 @@ class Prod(CombinationKernel):
dQc = None
dPinf = None
dP0 = None
# Assign models
for p in self.parts:
(Ft,Lt,Qct,Ht,P_inft, P0t, dFt,dQct,dP_inft,dP0t) = p.sde()
# check derivative dimensions ->
number_of_parameters = len(p.param_array)
assert dFt.shape[2] == number_of_parameters, "Dynamic matrix derivative shape is wrong"
assert dQct.shape[2] == number_of_parameters, "Diffusion matrix derivative shape is wrong"
assert dP_inft.shape[2] == number_of_parameters, "Infinite covariance matrix derivative shape is wrong"
# check derivative dimensions <-
# exception for periodic kernel
if (p.name == 'std_periodic'):
Qct = P_inft
dQct = dP_inft
dF = dkron(F,dF,Ft,dFt,'sum')
dQc = dkron(Qc,dQc,Qct,dQct,'prod')
dPinf = dkron(Pinf,dPinf,P_inft,dP_inft,'prod')
dP0 = dkron(P0,dP0,P0t,dP0t,'prod')
F = np.kron(F,np.eye(Ft.shape[0])) + np.kron(np.eye(F.shape[0]),Ft)
L = np.kron(L,Lt)
Qc = np.kron(Qc,Qct)
Pinf = np.kron(Pinf,P_inft)
P0 = np.kron(P0,P_inft)
H = np.kron(H,Ht)
return (F,L,Qc,H,Pinf,P0,dF,dQc,dPinf,dP0)
def dkron(A,dA,B,dB, operation='prod'):
"""
Function computes the derivative of Kronecker product A*B
(or Kronecker sum A+B).
Input:
-----------------------
A: 2D matrix
Some matrix
dA: 3D (or 2D matrix)
Derivatives of A
B: 2D matrix
Some matrix
dB: 3D (or 2D matrix)
Derivatives of B
operation: str 'prod' or 'sum'
Which operation is considered. If the operation is 'sum' it is assumed
that A and B are square matrices.
Output:
dC: 3D matrix
Derivative of Kronecker product A*B (or Kronecker sum A+B)
"""
if dA is None:
dA_param_num = 0
dA = np.zeros((A.shape[0], A.shape[1],1))
else:
dA_param_num = dA.shape[2]
if dB is None:
dB_param_num = 0
dB = np.zeros((B.shape[0], B.shape[1],1))
else:
dB_param_num = dB.shape[2]
# Space allocation for derivative matrix
dC = np.zeros((A.shape[0]*B.shape[0], A.shape[1]*B.shape[1], dA_param_num + dB_param_num))
for k in range(dA_param_num):
if operation == 'prod':
dC[:,:,k] = np.kron(dA[:,:,k],B)
else:
dC[:,:,k] = np.kron(dA[:,:,k],np.eye( B.shape[0] ))
for k in range(dB_param_num):
if operation == 'prod':
dC[:,:,dA_param_num+k] = np.kron(A,dB[:,:,k])
else:
dC[:,:,dA_param_num+k] = np.kron(np.eye( A.shape[0] ),dB[:,:,k])
return dC


@@ -31,6 +31,14 @@ class RBF(Stationary):
self.inv_l = Param('inv_lengthscale',1./self.lengthscale**2, Logexp())
self.link_parameter(self.inv_l)
def to_dict(self):
input_dict = super(RBF, self)._to_dict()
input_dict["class"] = "GPy.kern.RBF"
input_dict["inv_l"] = self.use_invLengthscale
if input_dict["inv_l"] == True:
input_dict["lengthscale"] = np.sqrt(1 / float(self.inv_l))
return input_dict
def K_of_r(self, r):
return self.variance * np.exp(-0.5 * r**2)
@@ -42,7 +50,7 @@ class RBF(Stationary):
def dK2_drdr_diag(self):
return -self.variance # as the diagonal of r is always filled with zeros
def __getstate__(self):
dc = super(RBF, self).__getstate__()
if self.useGPU:


@@ -93,6 +93,17 @@ class StdPeriodic(Kern):
self.link_parameters(self.variance, self.period, self.lengthscale)
def to_dict(self):
input_dict = super(StdPeriodic, self)._to_dict()
input_dict["class"] = "GPy.kern.StdPeriodic"
input_dict["variance"] = self.variance.values.tolist()
input_dict["period"] = self.period.values.tolist()
input_dict["lengthscale"] = self.lengthscale.values.tolist()
input_dict["ARD1"] = self.ARD1
input_dict["ARD2"] = self.ARD2
return input_dict
def parameters_changed(self):
"""
This function acts as a callback for each optimization iteration.


@@ -14,6 +14,11 @@ class Static(Kern):
self.variance = Param('variance', variance, Logexp())
self.link_parameters(self.variance)
def _to_dict(self):
input_dict = super(Static, self)._to_dict()
input_dict["variance"] = self.variance.values.tolist()
return input_dict
def Kdiag(self, X):
ret = np.empty((X.shape[0],), dtype=np.float64)
ret[:] = self.variance
@@ -133,6 +138,16 @@ class Bias(Static):
def __init__(self, input_dim, variance=1., active_dims=None, name='bias'):
super(Bias, self).__init__(input_dim, variance, active_dims, name)
def to_dict(self):
input_dict = super(Bias, self)._to_dict()
input_dict["class"] = "GPy.kern.Bias"
return input_dict
@staticmethod
def _from_dict(kernel_class, input_dict):
useGPU = input_dict.pop('useGPU', None)
return Bias(**input_dict)
def K(self, X, X2=None):
shape = (X.shape[0], X.shape[0] if X2 is None else X2.shape[0])
return np.full(shape, self.variance, dtype=np.float64)
@@ -250,4 +265,3 @@ class Precomputed(Fixed):
def update_gradients_diag(self, dL_dKdiag, X):
self.variance.gradient = np.einsum('i,ii', dL_dKdiag, self._index(X, None))


@@ -79,6 +79,13 @@ class Stationary(Kern):
assert self.variance.size==1
self.link_parameters(self.variance, self.lengthscale)
def _to_dict(self):
input_dict = super(Stationary, self)._to_dict()
input_dict["variance"] = self.variance.values.tolist()
input_dict["lengthscale"] = self.lengthscale.values.tolist()
input_dict["ARD"] = self.ARD
return input_dict
def K_of_r(self, r):
raise NotImplementedError("implement the covariance function as a fn of r to use this class")
@@ -351,6 +358,16 @@ class Exponential(Stationary):
def dK_dr(self, r):
return -self.K_of_r(r)
def to_dict(self):
input_dict = super(Exponential, self)._to_dict()
input_dict["class"] = "GPy.kern.Exponential"
return input_dict
@staticmethod
def _from_dict(kernel_class, input_dict):
useGPU = input_dict.pop('useGPU', None)
return Exponential(**input_dict)
# def sde(self):
# """
# Return the state space representation of the covariance.
@@ -399,6 +416,16 @@ class Matern32(Stationary):
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Mat32'):
super(Matern32, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
def to_dict(self):
input_dict = super(Matern32, self)._to_dict()
input_dict["class"] = "GPy.kern.Matern32"
return input_dict
@staticmethod
def _from_dict(kernel_class, input_dict):
useGPU = input_dict.pop('useGPU', None)
return Matern32(**input_dict)
def K_of_r(self, r):
return self.variance * (1. + np.sqrt(3.) * r) * np.exp(-np.sqrt(3.) * r)
@ -478,6 +505,16 @@ class Matern52(Stationary):
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Mat52'):
super(Matern52, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
def to_dict(self):
input_dict = super(Matern52, self)._to_dict()
input_dict["class"] = "GPy.kern.Matern52"
return input_dict
@staticmethod
def _from_dict(kernel_class, input_dict):
useGPU = input_dict.pop('useGPU', None)
return Matern52(**input_dict)
def K_of_r(self, r):
return self.variance*(1+np.sqrt(5.)*r+5./3*r**2)*np.exp(-np.sqrt(5.)*r)
@ -533,6 +570,16 @@ class ExpQuad(Stationary):
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='ExpQuad'):
super(ExpQuad, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
def to_dict(self):
input_dict = super(ExpQuad, self)._to_dict()
input_dict["class"] = "GPy.kern.ExpQuad"
return input_dict
@staticmethod
def _from_dict(kernel_class, input_dict):
useGPU = input_dict.pop('useGPU', None)
return ExpQuad(**input_dict)
def K_of_r(self, r):
return self.variance * np.exp(-0.5 * r**2)
@ -566,6 +613,17 @@ class RatQuad(Stationary):
self.power = Param('power', power, Logexp())
self.link_parameters(self.power)
def to_dict(self):
input_dict = super(RatQuad, self)._to_dict()
input_dict["class"] = "GPy.kern.RatQuad"
input_dict["power"] = self.power.values.tolist()
return input_dict
@staticmethod
def _from_dict(kernel_class, input_dict):
useGPU = input_dict.pop('useGPU', None)
return RatQuad(**input_dict)
def K_of_r(self, r):
r2 = np.square(r)
# return self.variance*np.power(1. + r2/2., -self.power)
@ -588,5 +646,3 @@ class RatQuad(Stationary):
def update_gradients_diag(self, dL_dKdiag, X):
super(RatQuad, self).update_gradients_diag(dL_dKdiag, X)
self.power.gradient = 0.
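Taken together, the new to_dict/_from_dict hooks let a kernel round-trip through a plain, JSON-friendly dictionary. A minimal sketch of the intended usage (values illustrative, assuming GPy 1.8 with these hooks in place):

import numpy as np
import GPy

# Serialise a kernel to a plain dict and rebuild it from that dict.
k = GPy.kern.Matern32(2, variance=0.5, lengthscale=[1.0, 2.0], ARD=True)
d = k.to_dict()                      # carries the class name, variance, lengthscale and ARD flag
k2 = GPy.kern.Kern.from_dict(d)      # dispatches to Matern32._from_dict

X = np.random.randn(5, 2)
assert np.allclose(k.K(X), k2.K(X))  # identical covariances before and after the round trip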


@ -7,4 +7,5 @@ from .student_t import StudentT
from .likelihood import Likelihood
from .mixed_noise import MixedNoise
from .binomial import Binomial
from .weibull import Weibull
from .loglogistic import LogLogistic


@ -29,6 +29,11 @@ class Bernoulli(Likelihood):
if isinstance(gp_link , (link_functions.Heaviside, link_functions.Probit)):
self.log_concave = True
def to_dict(self):
input_dict = super(Bernoulli, self)._to_dict()
input_dict["class"] = "GPy.likelihoods.Bernoulli"
return input_dict
def _preprocess_values(self, Y):
"""
Check if the values of the observations correspond to the values


@ -66,7 +66,14 @@ class Binomial(Likelihood):
np.testing.assert_array_equal(N.shape, y.shape)
nchoosey = special.gammaln(N+1) - special.gammaln(y+1) - special.gammaln(N-y+1)
return nchoosey + y*np.log(inv_link_f) + (N-y)*np.log(1.-inv_link_f)
Ny = N-y
t1 = np.zeros(y.shape)
t2 = np.zeros(y.shape)
t1[y>0] = y[y>0]*np.log(inv_link_f[y>0])
t2[Ny>0] = Ny[Ny>0]*np.log(1.-inv_link_f[Ny>0])
return nchoosey + t1 + t2
def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
"""
@ -86,7 +93,13 @@ class Binomial(Likelihood):
N = Y_metadata['trials']
np.testing.assert_array_equal(N.shape, y.shape)
return y/inv_link_f - (N-y)/(1.-inv_link_f)
Ny = N-y
t1 = np.zeros(y.shape)
t2 = np.zeros(y.shape)
t1[y>0] = y[y>0]/inv_link_f[y>0]
t2[Ny>0] = (Ny[Ny>0])/(1.-inv_link_f[Ny>0])
return t1 - t2
def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None):
"""
@ -111,7 +124,13 @@ class Binomial(Likelihood):
"""
N = Y_metadata['trials']
np.testing.assert_array_equal(N.shape, y.shape)
return -y/np.square(inv_link_f) - (N-y)/np.square(1.-inv_link_f)
Ny = N-y
t1 = np.zeros(y.shape)
t2 = np.zeros(y.shape)
t1[y>0] = -y[y>0]/np.square(inv_link_f[y>0])
t2[Ny>0] = -(Ny[Ny>0])/np.square(1.-inv_link_f[Ny>0])
return t1+t2
def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None):
"""
@ -135,8 +154,14 @@ class Binomial(Likelihood):
N = Y_metadata['trials']
np.testing.assert_array_equal(N.shape, y.shape)
inv_link_f2 = np.square(inv_link_f)
#inv_link_f2 = np.square(inv_link_f) #TODO Remove. Why is this here?
return 2*y/inv_link_f**3 - 2*(N-y)/(1.-inv_link_f)**3
Ny = N-y
t1 = np.zeros(y.shape)
t2 = np.zeros(y.shape)
t1[y>0] = 2*y[y>0]/inv_link_f[y>0]**3
t2[Ny>0] = - 2*(Ny[Ny>0])/(1.-inv_link_f[Ny>0])**3
return t1 + t2
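The masking in these hunks exists because the naive expressions evaluate 0*log(0) (NaN in NumPy) whenever y == 0 or N - y == 0 coincides with a saturated inverse link (exactly 0 or 1), even though the correct contribution of those terms is zero. A small illustrative sketch of the idea (hypothetical values):

import numpy as np

y = np.array([0., 3., 5.])
N = np.array([5., 5., 5.])
p = np.array([0.0, 0.6, 1.0])                      # inverse-link values, deliberately including 0 and 1

naive = y*np.log(p) + (N - y)*np.log(1. - p)       # first and last entries become nan

Ny = N - y
t1 = np.zeros_like(y)
t2 = np.zeros_like(y)
t1[y > 0] = y[y > 0]*np.log(p[y > 0])
t2[Ny > 0] = Ny[Ny > 0]*np.log(1. - p[Ny > 0])
masked = t1 + t2                                   # finite everywhere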
def samples(self, gp, Y_metadata=None, **kw):
"""


@ -46,6 +46,13 @@ class Gaussian(Likelihood):
if isinstance(gp_link, link_functions.Identity):
self.log_concave = True
def to_dict(self):
input_dict = super(Gaussian, self)._to_dict()
input_dict["class"] = "GPy.likelihoods.Gaussian"
input_dict["variance"] = self.variance.values.tolist()
return input_dict
def betaY(self,Y,Y_metadata=None):
#TODO: ~Ricardo this does not live here
raise RuntimeError("Please notify the GPy developers, this should not happen")
@ -57,7 +64,10 @@ class Gaussian(Likelihood):
def update_gradients(self, grad):
self.variance.gradient = grad
def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None):
def ep_gradients(self, Y, cav_tau, cav_v, dL_dKdiag, Y_metadata=None, quad_mode='gk', boost_grad=1.):
return self.exact_inference_gradients(dL_dKdiag)
def exact_inference_gradients(self, dL_dKdiag, Y_metadata=None):
return dL_dKdiag.sum()
def _preprocess_values(self, Y):


@ -6,8 +6,12 @@ from scipy import stats,special
import scipy as sp
from . import link_functions
from ..util.misc import chain_1, chain_2, chain_3, blockify_dhess_dtheta, blockify_third, blockify_hessian, safe_exp
from ..util.quad_integrate import quadgk_int
from scipy.integrate import quad
from functools import partial
import warnings
from ..core.parameterization import Parameterized
class Likelihood(Parameterized):
@ -40,6 +44,37 @@ class Likelihood(Parameterized):
self.gp_link = gp_link
self.log_concave = False
self.not_block_really = False
self.name = name
def to_dict(self):
raise NotImplementedError
def _to_dict(self):
input_dict = {}
input_dict["name"] = self.name
input_dict["gp_link_dict"] = self.gp_link.to_dict()
return input_dict
@staticmethod
def from_dict(input_dict):
import copy
input_dict = copy.deepcopy(input_dict)
likelihood_class = input_dict.pop('class')
input_dict["name"] = str(input_dict["name"])
name = input_dict.pop('name')
import GPy
likelihood_class = eval(likelihood_class)
return likelihood_class._from_dict(likelihood_class, input_dict)
@staticmethod
def _from_dict(likelihood_class, input_dict):
import copy
input_dict = copy.deepcopy(input_dict)
gp_link_dict = input_dict.pop('gp_link_dict')
import GPy
gp_link = GPy.likelihoods.link_functions.GPTransformation.from_dict(gp_link_dict)
input_dict["gp_link"] = gp_link
return likelihood_class(**input_dict)
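As with the kernels, these hooks are meant to make likelihoods reconstructible from plain dictionaries. A minimal sketch (illustrative, assuming GPy 1.8 with these methods in place):

import GPy

lik = GPy.likelihoods.Gaussian(variance=0.1)
d = lik.to_dict()                 # contains the class path, the variance as a list, the name and the link-function dict
lik2 = GPy.likelihoods.likelihood.Likelihood.from_dict(d)
assert float(lik2.variance) == float(lik.variance)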
def request_num_latent_functions(self, Y):
"""
@ -223,6 +258,91 @@ class Likelihood(Parameterized):
self.__gh_points = np.polynomial.hermite.hermgauss(T)
return self.__gh_points
def ep_gradients(self, Y, cav_tau, cav_v, dL_dKdiag, Y_metadata=None, quad_mode='gk', boost_grad=1.):
if self.size > 0:
shape = Y.shape
tau,v,Y = cav_tau.flatten(), cav_v.flatten(),Y.flatten()
mu = v/tau
sigma2 = 1./tau
# assert Y.shape == v.shape
dlik_dtheta = np.empty((self.size, Y.shape[0]))
# for j in range(self.size):
Y_metadata_list = []
for index in range(len(Y)):
Y_metadata_i = {}
if Y_metadata is not None:
for key in Y_metadata.keys():
Y_metadata_i[key] = Y_metadata[key][index,:]
Y_metadata_list.append(Y_metadata_i)
if quad_mode == 'gk':
f = partial(self.integrate_gk)
quads = zip(*map(f, Y.flatten(), mu.flatten(), np.sqrt(sigma2.flatten()), Y_metadata_list))
quads = np.vstack(quads)
quads.reshape(self.size, shape[0], shape[1])
elif quad_mode == 'gh':
f = partial(self.integrate_gh)
quads = zip(*map(f, Y.flatten(), mu.flatten(), np.sqrt(sigma2.flatten())))
quads = np.hstack(quads)
quads = quads.T
else:
raise Exception("no other quadrature mode available")
# do a gaussian-hermite integration
dL_dtheta_avg = boost_grad * np.nanmean(quads, axis=1)
dL_dtheta = boost_grad * np.nansum(quads, axis=1)
# dL_dtheta = boost_grad * np.nansum(dlik_dtheta, axis=1)
else:
dL_dtheta = np.zeros(self.num_params)
return dL_dtheta
def integrate_gk(self, Y, mu, sigma, Y_metadata_i=None):
# gaussian-kronrod integration.
fmin = -np.inf
fmax = np.inf
SQRT_2PI = np.sqrt(2.*np.pi)
def generate_integral(f):
a = np.exp(self.logpdf_link(f, Y, Y_metadata_i)) * np.exp(-0.5 * np.square((f - mu) / sigma)) / (
SQRT_2PI * sigma)
fn1 = a * self.dlogpdf_dtheta(f, Y, Y_metadata_i)
fn = fn1
return fn
dF_dtheta_i = quadgk_int(generate_integral, fmin=fmin, fmax=fmax)
return dF_dtheta_i
def integrate_gh(self, Y, mu, sigma, Y_metadata_i=None, gh_points=None):
# gaussian-hermite quadrature.
# "calculate site derivatives E_f{d logp(y_i|f_i)/da} where a is a likelihood parameter
# and the expectation is over the exact marginal posterior, which is not gaussian- and is
# unnormalised product of the cavity distribution(a Gaussian) and the exact likelihood term.
#
# calculate the expectation wrt the approximate marginal posterior, which should be approximately the same.
# . This term is needed for evaluating the
# gradients of the marginal likelihood estimate Z_EP wrt likelihood parameters."
# "writing it explicitly "
# use them for gaussian-hermite quadrature
SQRT_2PI = np.sqrt(2.*np.pi)
if gh_points is None:
gh_x, gh_w = self._gh_points(32)
else:
gh_x, gh_w = gh_points
X = gh_x[None,:]*np.sqrt(2.)*sigma + mu
# Here X is a grid vector of possible fi values, while Y is just a single value which will be broadcasted.
a = np.exp(self.logpdf_link(X, Y, Y_metadata_i))
a = a.repeat(self.num_params,0)
b = self.dlogpdf_dtheta(X, Y, Y_metadata_i)
old_shape = b.shape
fn = np.array([i*j for i,j in zip(a.flatten(), b.flatten())])
fn = fn.reshape(old_shape)
dF_dtheta_i = np.dot(fn, gh_w)/np.sqrt(np.pi)
return dF_dtheta_i
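integrate_gh relies on the standard Gauss-Hermite change of variables, E_{f~N(mu, sigma^2)}[g(f)] ≈ (1/sqrt(pi)) * sum_i w_i g(sqrt(2)*sigma*x_i + mu). A quick sanity check of that identity on its own (illustrative, not part of the diff):

import numpy as np

# E[f^2] under N(mu, sigma^2) is mu^2 + sigma^2; the quadrature rule should reproduce it.
mu, sigma = 0.3, 0.7
gh_x, gh_w = np.polynomial.hermite.hermgauss(32)
f = np.sqrt(2.)*sigma*gh_x + mu
approx = np.dot(f**2, gh_w)/np.sqrt(np.pi)
assert np.isclose(approx, mu**2 + sigma**2)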
def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
"""
Use Gauss-Hermite Quadrature to compute


@ -43,6 +43,25 @@ class GPTransformation(object):
""" """
raise NotImplementedError raise NotImplementedError
def to_dict(self):
raise NotImplementedError
def _to_dict(self):
return {}
@staticmethod
def from_dict(input_dict):
import copy
input_dict = copy.deepcopy(input_dict)
link_class = input_dict.pop('class')
import GPy
link_class = eval(link_class)
return link_class._from_dict(link_class, input_dict)
@staticmethod
def _from_dict(link_class, input_dict):
return link_class(**input_dict)
class Identity(GPTransformation):
"""
.. math::
@ -62,6 +81,10 @@ class Identity(GPTransformation):
def d3transf_df3(self,f):
return np.zeros_like(f)
def to_dict(self):
input_dict = super(Identity, self)._to_dict()
input_dict["class"] = "GPy.likelihoods.link_functions.Identity"
return input_dict
class Probit(GPTransformation):
"""
@ -82,6 +105,11 @@ class Probit(GPTransformation):
def d3transf_df3(self,f):
return (safe_square(f)-1.)*std_norm_pdf(f)
def to_dict(self):
input_dict = super(Probit, self)._to_dict()
input_dict["class"] = "GPy.likelihoods.link_functions.Probit"
return input_dict
class Cloglog(GPTransformation):
"""


@ -0,0 +1,304 @@
# Copyright (c) 2012 - 2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from scipy import stats, special
from ..core.parameterization import Param
from ..core.parameterization.transformations import Logexp
from . import link_functions
from .likelihood import Likelihood
class LogGaussian(Likelihood):
"""
.. math::
$$ p(y_{i}|f_{i}, z_{i}) = \\prod_{i=1}^{n} \\left(\\frac{1}{y_{i}\\sqrt{2\\pi\\sigma^{2}}}\\exp\\left(-\\frac{(\\log y_{i} - f_{i})^{2}}{2\\sigma^{2}}\\right)\\right)^{1-z_{i}} \\left(1 - \\Phi\\left(\\frac{\\log y_{i} - f_{i}}{\\sigma}\\right)\\right)^{z_{i}} $$
.. note:
where z_{i} is the censoring indicator: 0 for non-censored data, and 1 for censored data.
"""
def __init__(self,gp_link=None, sigma=1.):
if gp_link is None:
gp_link = link_functions.Identity()
# gp_link = link_functions.Log()
super(LogGaussian, self).__init__(gp_link, name='loggaussian')
self.sigma = Param('sigma', sigma, Logexp())
self.variance = Param('variance', sigma**2, Logexp())
self.link_parameter(self.variance)
# self.link_parameter()
def pdf_link(self, link_f, y, Y_metadata=None):
"""
:param link_f: latent variables link(f)
:type link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: likelihood evaluated for this point
:rtype: float
"""
return np.exp(self.logpdf_link(link_f, y, Y_metadata=Y_metadata))
def logpdf_link(self, link_f, y, Y_metadata=None):
"""
:param link_f: latent variables (link(f))
:type link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: likelihood evaluated for this point
:rtype: float
"""
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
c = np.zeros_like(y)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
uncensored = (1-c)* (-0.5*np.log(2*np.pi*self.variance) - np.log(y) - (np.log(y)-link_f)**2 /(2*self.variance) )
censored = c*np.log( 1 - stats.norm.cdf((np.log(y) - link_f)/np.sqrt(self.variance)) )
logpdf = uncensored + censored
return logpdf
def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
"""
derivative of logpdf wrt link_f param
.. math::
:param link_f: latent variables link(f)
:type link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: gradient of the log likelihood evaluated at points link(f)
:rtype: Nx1 array
"""
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
c = np.zeros_like(y)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
val = np.log(y) - link_f
val_scaled = val/np.sqrt(self.variance)
val_scaled2 = val/self.variance
uncensored = (1-c)*(val_scaled2)
a = (1- stats.norm.cdf(val_scaled))
# llg(z) = 1. / (1 - norm_cdf(r / sqrt(s2))). * (1 / sqrt(2 * pi * s2). * exp(-1 / (2. * s2). * r. ^ 2));
censored = c*( 1./a) * (np.exp(-1.* val**2 /(2*self.variance)) / np.sqrt(2*np.pi*self.variance))
# censored = c * (1. / (1 - stats.norm.cdf(val_scaled))) * (stats.norm.pdf(val_scaled))
gradient = uncensored + censored
return gradient
def d2logpdf_dlink2(self, link_f, y, Y_metadata=None):
"""
Hessian at y, given link(f), w.r.t link(f)
i.e. second derivative logpdf at y given link(f_i) and link(f_j) w.r.t link(f_i) and link(f_j)
The hessian will be 0 unless i == j
.. math::
:param link_f: latent variables link(f)
:type link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
:rtype: Nx1 array
.. Note::
Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
(the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
"""
# c = Y_metadata['censored']
# c = np.zeros((y.shape[0],))
c = np.zeros_like(y)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
val = np.log(y) - link_f
val_scaled = val/np.sqrt(self.variance)
val_scaled2 = val/self.variance
a = (1 - stats.norm.cdf(val_scaled))
uncensored = (1-c) *(-1)/self.variance
censored = c*(-np.exp(-val**2/self.variance) / ( 2*np.pi*self.variance*(a**2) ) +
val*np.exp(-(val**2)/(2*self.variance))/( np.sqrt(2*np.pi)*self.variance**(3/2.)*a) )
hessian = censored + uncensored
return hessian
def d3logpdf_dlink3(self, link_f, y, Y_metadata=None):
"""
Third order derivative of the log-likelihood function at y given link(f), w.r.t link(f)
.. math::
:param inv_link_f: latent variables link(f)
:type inv_link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: third derivative of the log likelihood evaluated at points link(f)
:rtype: Nx1 array
"""
c = np.zeros_like(y)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
val = np.log(y) - link_f
val_scaled = val/np.sqrt(self.variance)
val_scaled2 = val/self.variance
a = (1 - stats.norm.cdf(val_scaled))
uncensored = 0
censored = c *( 2*np.exp(-3*(val**2)/(2*self.variance)) / ((a**3)*(2*np.pi*self.variance)**(3/2.))
- val*np.exp(-(val**2)/self.variance)/ ( (a**2)*np.pi*self.variance**2)
- val*np.exp(-(val**2)/self.variance)/ ( (a**2)*2*np.pi*self.variance**2)
- np.exp(-(val**2)/(2*self.variance))/ ( a*(self.variance**(1.50))*np.sqrt(2*np.pi))
+ (val**2)*np.exp(-(val**2)/(2*self.variance))/ ( a*np.sqrt(2*np.pi*self.variance)*self.variance**2 ) )
d3pdf_dlink3 = uncensored + censored
return d3pdf_dlink3
def dlogpdf_link_dvar(self, link_f, y, Y_metadata=None):
"""
Gradient of the log-likelihood function at y given f, w.r.t variance parameter
.. math::
:param inv_link_f: latent variables link(f)
:type inv_link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: derivative of likelihood evaluated at points f w.r.t variance parameter
:rtype: float
"""
c = np.zeros_like(y)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
val = np.log(y) - link_f
val_scaled = val/np.sqrt(self.variance)
val_scaled2 = val/self.variance
a = (1 - stats.norm.cdf(val_scaled))
uncensored = (1-c)*(-0.5/self.variance + (val**2)/(2*(self.variance**2)) )
censored = c *( val*np.exp(-val**2/ (2*self.variance)) / (a*np.sqrt(2*np.pi)*2*(self.variance**(1.5))) )
dlogpdf_dvar = uncensored + censored
# dlogpdf_dvar = dlogpdf_dvar*self.variance
return dlogpdf_dvar
def dlogpdf_dlink_dvar(self, link_f, y, Y_metadata=None):
"""
:param link_f: latent variables link(f)
:type link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: Y_metadata not used in gaussian
:returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
:rtype: Nx1 array
"""
c = np.zeros_like(y)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
val = np.log(y) - link_f
val_scaled = val/np.sqrt(self.variance)
val_scaled2 = val/self.variance
a = (1 - stats.norm.cdf(val_scaled))
uncensored = (1-c)*(-val/(self.variance**2))
censored = c * (-val*np.exp(-val**2/self.variance)/( 4*np.pi*(self.variance**2)*(a**2)) +
(-1 + (val**2)/self.variance)*np.exp(-val**2/(2*self.variance) ) /
( a*(np.sqrt(2.*np.pi)*2*self.variance**1.5)) )
dlik_grad_dsigma = uncensored + censored
# dlik_grad_dsigma = dlik_grad_dsigma*self.variance
return dlik_grad_dsigma
def d2logpdf_dlink2_dvar(self, link_f, y, Y_metadata=None):
"""
:param link_f: latent variables link(f)
:type link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: Y_metadata not used in gaussian
:returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
:rtype: Nx1 array
"""
c = np.zeros_like(y)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
val = np.log(y) - link_f
val_scaled = val/np.sqrt(self.variance)
val_scaled2 = val/self.variance
a = (1 - stats.norm.cdf(val_scaled))
uncensored = (1-c)*( 1./(self.variance**2) )
censored = c*( val*np.exp(-3*(val**2)/(2*self.variance) )/ ((a**3)*np.sqrt(8*np.pi**3)*self.variance**(5/2.))
+ np.exp(-val**2/self.variance)/((a**2)*4*np.pi*self.variance**2)
- np.exp(-val**2/self.variance)*val**2 / ((a**2)*2*np.pi*self.variance**3)
+ np.exp(-val**2/self.variance)/ ( (a**2)*4*np.pi*self.variance**2)
- np.exp(-val**2/ (2*self.variance))*val / ( a*np.sqrt(2*np.pi)*2*self.variance**(5/2.))
- np.exp(-val**2/self.variance)*(val**2) / ((a**2)*4*np.pi*self.variance**3)
- np.exp(-val**2/ (2*self.variance))*val/ (a*np.sqrt(2*np.pi)*self.variance**(5/2.))
+ np.exp(-val**2/ (2*self.variance))*(val**3) / (a*np.sqrt(2*np.pi)*2*self.variance**(7/2.)) )
dlik_hess_dsigma = uncensored + censored
return dlik_hess_dsigma
def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
"""
:param link_f: latent variables link(f)
:type link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: Y_metadata not used in gaussian
:returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
:rtype: Nx1 array
"""
dlogpdf_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
dlogpdf_dtheta[0,:,:] = self.dlogpdf_link_dvar(f,y,Y_metadata=Y_metadata)
return dlogpdf_dtheta
def dlogpdf_dlink_dtheta(self, f, y, Y_metadata=None):
"""
:param link_f: latent variables link(f)
:type link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: Y_metadata not used in gaussian
:returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
:rtype: Nx1 array
"""
dlogpdf_dlink_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
dlogpdf_dlink_dtheta[0,:,:] = self.dlogpdf_dlink_dvar(f,y,Y_metadata=Y_metadata)
return dlogpdf_dlink_dtheta
def d2logpdf_dlink2_dtheta(self, f, y, Y_metadata=None):
"""
:param link_f: latent variables link(f)
:type link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: Y_metadata not used in gaussian
:returns: derivative of log likelihood evaluated at points link(f) w.r.t variance parameter
:rtype: Nx1 array
"""
d2logpdf_dlink2_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
d2logpdf_dlink2_dtheta[0,:,:] = self.d2logpdf_dlink2_dvar(f,y,Y_metadata=Y_metadata)
return d2logpdf_dlink2_dtheta
def update_gradients(self, grads):
"""
Pull out the gradients, be careful as the order must match the order
in which the parameters are added
"""
self.variance.gradient = grads[0]
def samples(self, gp, Y_metadata=None):
"""
Returns a set of samples of observations based on a given value of the latent variable.
:param gp: latent variable
"""
orig_shape = gp.shape
gp = gp.flatten()


@ -0,0 +1,339 @@
from __future__ import division
# Copyright (c) 2015 Alan Saul
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from scipy import stats,special
import scipy as sp
from ..core.parameterization import Param
from ..core.parameterization.transformations import Logexp
from . import link_functions
from .likelihood import Likelihood
from .link_functions import Log
class LogLogistic(Likelihood):
"""
.. math::
$$ p(y_{i}|f_{i}, z_{i}) = \\prod_{i=1}^{n} (\\frac{ry^{r-1}}{\\exp{f(x_{i})}})^{1-z_i} (1 + (\\frac{y}{\\exp(f(x_{i}))})^{r})^{z_i-2} $$
.. note:
where z_{i} is the censoring indicator- 0 for non-censored data, and 1 for censored data.
"""
def __init__(self, gp_link=None, r=1.0):
if gp_link is None:
#Parameterised not as link_f but as f
gp_link = Log()
super(LogLogistic, self).__init__(gp_link, name='LogLogistic')
self.r = Param('r_log_shape', float(r), Logexp())
self.link_parameter(self.r)
# self.censored = 'censored'
def pdf_link(self, link_f, y, Y_metadata=None):
"""
Likelihood function given link(f)
.. math::
:param link_f: latent variables link(f)
:type link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: likelihood evaluated for this point
:rtype: float
"""
return np.exp(self.logpdf_link(link_f, y, Y_metadata=Y_metadata))
def logpdf_link(self, link_f, y, Y_metadata=None):
"""
Log Likelihood Function given link(f)
.. math::
:param link_f: latent variables (link(f))
:type link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: likelihood evaluated for this point
:rtype: float
"""
# c = np.zeros((y.shape[0],))
c = np.zeros_like(link_f)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
link_f = np.clip(link_f, 1e-150, 1e100)
# y_link_f = y/link_f
# y_link_f_r = y_link_f**self.r
# y_link_f_r = np.clip(y**self.r, 1e-150, 1e200) / np.clip(link_f**self.r, 1e-150, 1e200)
# y_link_f_r = np.clip((y/link_f)**self.r, 1e-150, 1e200)
y_r = np.clip(y**self.r, 1e-150, 1e200)
link_f_r = np.clip(link_f**self.r, 1e-150, 1e200)
y_link_f_r = np.clip(y_r / link_f_r, 1e-150, 1e200)
#uncensored = (1-c)*(np.log(self.r) + (self.r+1)*np.log(y) - self.r*np.log(link_f) - 2*np.log1p(y_link_f_r))
#uncensored = (1-c)*(np.log((self.r/link_f)*y_link_f**(self.r-1)) - 2*np.log1p(y_link_f_r))
# clever way tp break it into censored and uncensored-parts ..
uncensored = (1-c)*(np.log(self.r) + (self.r-1)*np.log(y) - self.r*np.log(link_f) - 2*np.log1p(y_link_f_r))
censored = (c)*(-np.log1p(y_link_f_r))
#
return uncensored + censored
# return uncensored
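For the censoring-aware likelihoods added in this release, the indicator is passed through Y_metadata['censored']. A hypothetical usage sketch (values illustrative):

import numpy as np
import GPy

y = np.array([[1.2], [0.7], [2.5], [3.1]])
Y_metadata = {'censored': np.array([[0.], [0.], [1.], [1.]])}   # last two rows are right-censored
link_f = np.full_like(y, 1.5)

lik = GPy.likelihoods.LogLogistic(r=2.)
print(lik.logpdf_link(link_f, y, Y_metadata=Y_metadata))        # censored rows contribute only the survival term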
def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
"""
Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
.. math::
:param link_f: latent variables (f)
:type link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: gradient of likelihood evaluated at points
:rtype: Nx1 array
"""
# c = Y_metadata['censored']
# for debugging
# c = np.zeros((y.shape[0],))
c = np.zeros_like(link_f)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
#y_link_f = y/link_f
#y_link_f_r = y_link_f**self.r
y_link_f_r = np.clip(y**self.r, 1e-150, 1e200) / np.clip(link_f**self.r, 1e-150, 1e200)
#In terms of link_f
# uncensored = (1-c)*( (2*self.r*y**r)/(link_f**self.r + y**self.r) - link_f*self.r)
uncensored = (1-c)*self.r*(y_link_f_r - 1)/(link_f*(1 + y_link_f_r))
censored = c*(self.r*y_link_f_r/(link_f*y_link_f_r + link_f))
return uncensored + censored
# return uncensored
def d2logpdf_dlink2(self, link_f, y, Y_metadata=None):
"""
Hessian at y, given link(f), w.r.t link(f)
i.e. second derivative logpdf at y given link(f_i) and link(f_j) w.r.t link(f_i) and link(f_j)
The hessian will be 0 unless i == j
.. math::
:param link_f: latent variables link(f)
:type link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
:rtype: Nx1 array
.. Note::
Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
(the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
"""
# c = Y_metadata['censored']
# c = np.zeros((y.shape[0],))
c = np.zeros_like(link_f)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
y_link_f = y/link_f
y_link_f_r = y_link_f**self.r
#In terms of link_f
censored = c*(-self.r*y_link_f_r*(y_link_f_r + self.r + 1)/((link_f**2)*(y_link_f_r + 1)**2))
uncensored = (1-c)*(-self.r*(2*self.r*y_link_f_r + y_link_f**(2*self.r) - 1) / ((link_f**2)*(1+ y_link_f_r)**2))
hess = censored + uncensored
return hess
def d3logpdf_dlink3(self, link_f, y, Y_metadata=None):
"""
Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
.. math::
:param link_f: latent variables link(f)
:type link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: third derivative of likelihood evaluated at points f
:rtype: Nx1 array
"""
# c = Y_metadata['censored']
# for debugging
# c = np.zeros((y.shape[0],))
c = np.zeros_like(link_f)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
y_link_f = y/link_f
y_link_f_r = y_link_f**self.r
#In terms of link_f
censored = c*(self.r*y_link_f_r*(((self.r**2)*(-(y_link_f_r - 1))) + 3*self.r*(y_link_f_r + 1) + 2*(y_link_f_r + 1)**2)
/ ((link_f**3)*(y_link_f_r + 1)**3))
uncensored = (1-c)*(2*self.r*(-(self.r**2)*(y_link_f_r -1)*y_link_f_r + 3*self.r*(y_link_f_r + 1)*y_link_f_r + (y_link_f_r - 1)*(y_link_f_r + 1)**2)
/ ((link_f**3)*(y_link_f_r + 1)**3))
d3lik_dlink3 = censored + uncensored
return d3lik_dlink3
def dlogpdf_link_dr(self, inv_link_f, y, Y_metadata=None):
"""
Gradient of the log-likelihood function at y given f, w.r.t shape parameter
.. math::
:param inv_link_f: latent variables link(f)
:type inv_link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: derivative of likelihood evaluated at points f w.r.t variance parameter
:rtype: float
"""
# c = Y_metadata['censored']
# c = np.zeros((y.shape[0],))
c = np.zeros_like(y)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
link_f = inv_link_f #FIXME: Change names consistently...
y_link_f = y/link_f
log_y_link_f = np.log(y) - np.log(link_f)
y_link_f_r = y_link_f**self.r
#In terms of link_f
censored = c*(-y_link_f_r*log_y_link_f/(1 + y_link_f_r))
uncensored = (1-c)*(1./self.r + np.log(y) - np.log(link_f) - (2*y_link_f_r*log_y_link_f) / (1 + y_link_f_r))
dlogpdf_dr = censored + uncensored
return dlogpdf_dr
def dlogpdf_dlink_dr(self, inv_link_f, y, Y_metadata=None):
"""
Derivative of the dlogpdf_dlink w.r.t shape parameter
.. math::
:param inv_link_f: latent variables inv_link_f
:type inv_link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: derivative of likelihood evaluated at points f w.r.t variance parameter
:rtype: Nx1 array
"""
# c = np.zeros((y.shape[0],))
c = np.zeros_like(y)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
link_f = inv_link_f
y_link_f = y/link_f
y_link_f_r = y_link_f**self.r
log_y_link_f = np.log(y) - np.log(link_f)
#In terms of link_f
censored = c*(y_link_f_r*(y_link_f_r + self.r*log_y_link_f + 1)/(link_f*(y_link_f_r + 1)**2))
uncensored = (1-c)*(y_link_f**(2*self.r) + 2*self.r*y_link_f_r*log_y_link_f - 1) / (link_f*(1 + y_link_f_r)**2)
# dlogpdf_dlink_dr = uncensored
dlogpdf_dlink_dr = censored + uncensored
return dlogpdf_dlink_dr
def d2logpdf_dlink2_dr(self, inv_link_f, y, Y_metadata=None):
"""
Gradient of the hessian (d2logpdf_dlink2) w.r.t shape parameter
.. math::
:param inv_link_f: latent variables link(f)
:type inv_link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: derivative of hessian evaluated at points f and f_j w.r.t variance parameter
:rtype: Nx1 array
"""
# c = Y_metadata['censored']
# c = np.zeros((y.shape[0],))
c = np.zeros_like(y)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
link_f = inv_link_f
y_link_f = y/link_f
y_link_f_r = y_link_f**self.r
log_y_link_f = np.log(y) - np.log(link_f)
#In terms of link_f
y_link_f_2r = y_link_f**(2*self.r)
denom2 = (link_f**2)*(1 + y_link_f_r)**2
denom3 = (link_f**2)*(1 + y_link_f_r)**3
censored = c*(-((y_link_f_r + self.r + 1)*y_link_f_r)/denom2
-(self.r*(y_link_f_r + self.r + 1)*y_link_f_r*log_y_link_f)/denom2
-(self.r*y_link_f_r*(y_link_f_r*log_y_link_f + 1))/denom2
+(2*self.r*(y_link_f_r + self.r + 1)*y_link_f_2r*log_y_link_f)/denom3
)
uncensored = (1-c)*(-(2*self.r*y_link_f_r + y_link_f_2r - 1)/denom2
-(self.r*(2*y_link_f_r + 2*self.r*y_link_f_r*log_y_link_f + 2*y_link_f_2r*log_y_link_f)/denom2)
+(2*self.r*(2*self.r*y_link_f_r + y_link_f_2r - 1)*y_link_f_r*log_y_link_f)/denom3
)
d2logpdf_dlink2_dr = censored + uncensored
return d2logpdf_dlink2_dr
def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
dlogpdf_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
dlogpdf_dtheta[0, :, :] = self.dlogpdf_link_dr(f, y, Y_metadata=Y_metadata)
return dlogpdf_dtheta
def dlogpdf_dlink_dtheta(self, f, y, Y_metadata=None):
dlogpdf_dlink_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
dlogpdf_dlink_dtheta[0, :, :] = self.dlogpdf_dlink_dr(f, y, Y_metadata=Y_metadata)
return dlogpdf_dlink_dtheta
def d2logpdf_dlink2_dtheta(self, f, y, Y_metadata=None):
d2logpdf_dlink2_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
d2logpdf_dlink2_dtheta[0,:, :] = self.d2logpdf_dlink2_dr(f, y, Y_metadata=Y_metadata)
return d2logpdf_dlink2_dtheta
def update_gradients(self, grads):
"""
Pull out the gradients, be careful as the order must match the order
in which the parameters are added
"""
self.r.gradient = grads[0]
def samples(self, gp, Y_metadata=None):
"""
Returns a set of samples of observations based on a given value of the latent variable.
:param gp: latent variable
"""
orig_shape = gp.shape
gp = gp.flatten()
#rs = np.ones_like(gp)*self.r
#scales = np.ones_like(gp)*np.sqrt(self.sigma2)
#Ysim = sp.stats.fisk.rvs(rs, scale=self.gp_link.transf(gp))
Ysim = np.array([sp.stats.fisk.rvs(self.r, loc=0, scale=self.gp_link.transf(f)) for f in gp])
#np.random.fisk(self.gp_link.transf(gp), c=self.r)
return Ysim.reshape(orig_shape)

GPy/likelihoods/weibull.py (new file)

@ -0,0 +1,322 @@
# Copyright (c) 2012 - 2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from scipy import stats, special
import scipy as sp
from ..core.parameterization import Param
from ..core.parameterization.transformations import Logexp
from . import link_functions
from .likelihood import Likelihood
class Weibull(Likelihood):
"""
Implementing Weibull likelihood function ...
"""
def __init__(self, gp_link=None, beta=1.):
if gp_link is None:
#Parameterised not as link_f but as f
# gp_link = link_functions.Identity()
#Parameterised as link_f
gp_link = link_functions.Log()
super(Weibull, self).__init__(gp_link, name='Weibull')
self.r = Param('r_weibull_shape', float(beta), Logexp())
self.link_parameter(self.r)
def pdf_link(self, link_f, y, Y_metadata=None):
"""
Likelihood function given link(f)
:param link_f: latent variables link(f)
:type link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: likelihood evaluated for this point
:rtype: float
"""
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
c = np.zeros((link_f.shape[0],))
# log_objective = np.log(self.r) + (self.r - 1) * np.log(y) - link_f - (np.exp(-link_f) * (y ** self.r))
# log_objective = stats.weibull_min.pdf(y,c=self.beta,loc=link_f,scale=1.)
log_objective = self.logpdf_link(link_f, y, Y_metadata)
return np.exp(log_objective)
def logpdf_link(self, link_f, y, Y_metadata=None):
"""
Log Likelihood Function given link(f)
.. math::
\\ln p(y_{i}|\\lambda(f_{i}), z_{i}) = (1 - z_{i})\\left[\\ln r - \\ln \\lambda(f_{i}) + (r - 1)\\ln y_{i} - \\frac{y_{i}^{r}}{\\lambda(f_{i})}\\right] - z_{i}\\frac{y_{i}^{r}}{\\lambda(f_{i})}
:param link_f: latent variables (link(f))
:type link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: likelihood evaluated for this point
:rtype: float
"""
# alpha = self.gp_link.transf(gp)*self.beta sum(log(a) + (a-1).*log(y)- f - exp(-f).*y.^a)
# return (1. - alpha)*np.log(obs) + self.beta*obs - alpha * np.log(self.beta) + np.log(special.gamma(alpha))
assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
c = np.zeros_like(y)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
# uncensored = (1-c)* (np.log(self.r) + (self.r - 1) * np.log(y) - link_f - (np.exp(-link_f) * (y ** self.r)))
# censored = (-c)*np.exp(-link_f)*(y**self.r)
uncensored = (1-c)*( np.log(self.r)-np.log(link_f)+(self.r-1)*np.log(y) - y**self.r/link_f)
censored = -c*y**self.r/link_f
log_objective = uncensored + censored
return log_objective
def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
"""
Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
.. math::
\\frac{d \\ln p(y_{i}|\\lambda(f_{i}), z_{i})}{d\\lambda(f)} = (1 - z_{i})\\left[-\\frac{1}{\\lambda(f_{i})} + \\frac{y_{i}^{r}}{\\lambda(f_{i})^{2}}\\right] + z_{i}\\frac{y_{i}^{r}}{\\lambda(f_{i})^{2}}
:param link_f: latent variables (f)
:type link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: gradient of likelihood evaluated at points
:rtype: Nx1 array
"""
# grad = (1. - self.beta) / (y - link_f)
c = np.zeros_like(y)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
# uncensored = (1-c)* ( -1 + np.exp(-link_f)*(y ** self.r))
# censored = c*np.exp(-link_f)*(y**self.r)
uncensored = (1-c)*(-1/link_f + y**self.r/link_f**2)
censored = c*y**self.r/link_f**2
grad = uncensored + censored
return grad
def d2logpdf_dlink2(self, link_f, y, Y_metadata=None):
"""
Hessian at y, given link(f), w.r.t link(f)
i.e. second derivative logpdf at y given link(f_i) and link(f_j) w.r.t link(f_i) and link(f_j)
The hessian will be 0 unless i == j
.. math::
\\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}), z_{i})}{d\\lambda(f)^{2}} = (1 - z_{i})\\left[\\frac{1}{\\lambda(f_{i})^{2}} - \\frac{2 y_{i}^{r}}{\\lambda(f_{i})^{3}}\\right] - z_{i}\\frac{2 y_{i}^{r}}{\\lambda(f_{i})^{3}}
:param link_f: latent variables link(f)
:type link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
:rtype: Nx1 array
.. Note::
Will return diagonal of hessian, since every where else it is 0, as the likelihood factorizes over cases
(the distribution for y_i depends only on link(f_i) not on link(f_(j!=i))
"""
# hess = (self.beta - 1.) / (y - link_f)**2
c = np.zeros_like(y)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
# uncensored = (1-c)* (-(y ** self.r) * np.exp(-link_f))
# censored = -c*np.exp(-link_f)*y**self.r
uncensored = (1-c)*(1/link_f**2 -2*y**self.r/link_f**3)
censored = -c*2*y**self.r/link_f**3
hess = uncensored + censored
# hess = -(y ** self.r) * np.exp(-link_f)
return hess
def d3logpdf_dlink3(self, link_f, y, Y_metadata=None):
"""
Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
.. math::
\\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}), z_{i})}{d\\lambda(f)^{3}} = (1 - z_{i})\\left[-\\frac{2}{\\lambda(f_{i})^{3}} + \\frac{6 y_{i}^{r}}{\\lambda(f_{i})^{4}}\\right] + z_{i}\\frac{6 y_{i}^{r}}{\\lambda(f_{i})^{4}}
:param link_f: latent variables link(f)
:type link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: third derivative of likelihood evaluated at points f
:rtype: Nx1 array
"""
# d3lik_dlink3 = (1. - self.beta) / (y - link_f)**3
c = np.zeros_like(y)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
# uncensored = (1-c)* ((y ** self.r) * np.exp(-link_f))
# censored = c*np.exp(-link_f)*y**self.r
uncensored = (1-c)*(-2/link_f**3+ 6*y**self.r/link_f**4)
censored = c*6*y**self.r/link_f**4
d3lik_dlink3 = uncensored + censored
# d3lik_dlink3 = (y ** self.r) * np.exp(-link_f)
return d3lik_dlink3
def exact_inference_gradients(self, dL_dKdiag, Y_metadata=None):
return np.zeros(self.size)
def dlogpdf_link_dr(self, inv_link_f, y, Y_metadata=None):
"""
Gradient of the log-likelihood function at y given f, w.r.t shape parameter
.. math::
:param inv_link_f: latent variables link(f)
:type inv_link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: derivative of likelihood evaluated at points f w.r.t variance parameter
:rtype: float
"""
c = np.zeros_like(y)
link_f = inv_link_f
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
uncensored = (1-c)* (1./self.r + np.log(y) - y**self.r*np.log(y)/link_f)
censored = (-c*y**self.r*np.log(y)/link_f)
dlogpdf_dr = uncensored + censored
return dlogpdf_dr
def dlogpdf_dlink_dr(self, inv_link_f, y, Y_metadata=None):
"""
First order derivative derivative of loglikelihood wrt r:shape parameter
:param link_f: latent variables link(f)
:type link_f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: includes censoring information in dictionary key 'censored'
:returns: third derivative of likelihood evaluated at points f
:rtype: Nx1 array
"""
# dlogpdf_dlink_dr = self.beta * y**(self.beta - 1) * np.exp(-link_f)
# dlogpdf_dlink_dr = np.exp(-link_f) * (y ** self.r) * np.log(y)
c = np.zeros_like(y)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
link_f = inv_link_f
# uncensored = (1-c)*(np.exp(-link_f)* (y ** self.r) * np.log(y))
# censored = c*np.exp(-link_f)*(y**self.r)*np.log(y)
uncensored = (1-c)*(y**self.r*np.log(y)/link_f**2)
censored = c*(y**self.r*np.log(y)/link_f**2)
dlogpdf_dlink_dr = uncensored + censored
return dlogpdf_dlink_dr
def d2logpdf_dlink2_dr(self, link_f, y, Y_metadata=None):
"""
Derivative of hessian of loglikelihood wrt r-shape parameter.
:param link_f:
:param y:
:param Y_metadata:
:return:
"""
c = np.zeros_like(y)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
# uncensored = (1-c)*( -np.exp(-link_f)* (y ** self.r) * np.log(y))
# censored = -c*np.exp(-link_f)*(y**self.r)*np.log(y)
uncensored = (1-c)*-2*y**self.r*np.log(y)/link_f**3
censored = c*-2*y**self.r*np.log(y)/link_f**3
d2logpdf_dlink_dr = uncensored + censored
return d2logpdf_dlink_dr
def d3logpdf_dlink3_dr(self, link_f, y, Y_metadata=None):
"""
:param link_f:
:param y:
:param Y_metadata:
:return:
"""
c = np.zeros_like(y)
if Y_metadata is not None and 'censored' in Y_metadata.keys():
c = Y_metadata['censored']
uncensored = (1-c)* ((y**self.r)*np.exp(-link_f)*np.log1p(y))
censored = c*np.exp(-link_f)*(y**self.r)*np.log(y)
d3logpdf_dlink3_dr = uncensored + censored
return d3logpdf_dlink3_dr
def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
"""
:param f:
:param y:
:param Y_metadata:
:return:
"""
dlogpdf_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
dlogpdf_dtheta[0, :, :] = self.dlogpdf_link_dr(f, y, Y_metadata=Y_metadata)
return dlogpdf_dtheta
def dlogpdf_dlink_dtheta(self, f, y, Y_metadata=None):
"""
:param f:
:param y:
:param Y_metadata:
:return:
"""
dlogpdf_dlink_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
dlogpdf_dlink_dtheta[0, :, :] = self.dlogpdf_dlink_dr(f, y, Y_metadata)
return dlogpdf_dlink_dtheta
def d2logpdf_dlink2_dtheta(self, f, y, Y_metadata=None):
"""
:param f:
:param y:
:param Y_metadata:
:return:
"""
d2logpdf_dlink_dtheta2 = np.zeros((self.size, f.shape[0], f.shape[1]))
d2logpdf_dlink_dtheta2[0, :, :] = self.d2logpdf_dlink2_dr(f, y, Y_metadata)
return d2logpdf_dlink_dtheta2
def update_gradients(self, grads):
"""
Pull out the gradients, be careful as the order must match the order
in which the parameters are added
"""
self.r.gradient = grads[0]
def samples(self, gp, Y_metadata=None):
"""
Returns a set of samples of observations conditioned on a given value of latent variable f.
:param gp: latent variable
"""
orig_shape = gp.shape
gp = gp.flatten()
weibull_samples = np.array([sp.stats.weibull_min.rvs(self.r, loc=0, scale=self.gp_link.transf(f)) for f in gp])
return weibull_samples.reshape(orig_shape)
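A hypothetical end-to-end sketch of the new Weibull observation model combined with Laplace inference (data and settings illustrative, not taken from the diff):

import numpy as np
import GPy

X = np.random.rand(50, 1)
Y = np.random.weibull(1.5, size=(50, 1)) + 1e-6   # strictly positive targets
lik = GPy.likelihoods.Weibull(beta=1.5)
laplace = GPy.inference.latent_function_inference.Laplace()
m = GPy.core.GP(X, Y, kernel=GPy.kern.RBF(1), likelihood=lik, inference_method=laplace)
m.optimize(messages=False)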


@ -38,3 +38,9 @@ class Constant(Mapping):
def gradients_X(self, dL_dF, X):
return np.zeros_like(X)
def to_dict(self):
input_dict = super(Constant, self)._to_dict()
input_dict["class"] = "GPy.mappings.Constant"
input_dict["value"] = self.C.values[0]
return input_dict


@ -19,8 +19,7 @@ class Identity(Mapping):
def gradients_X(self, dL_dF, X):
return dL_dF
def to_dict(self):
input_dict = super(Identity, self)._to_dict()
input_dict["class"] = "GPy.mappings.Identity"
return input_dict


@ -37,3 +37,21 @@ class Linear(Mapping):
def gradients_X(self, dL_dF, X):
return np.dot(dL_dF, self.A.T)
def to_dict(self):
input_dict = super(Linear, self)._to_dict()
input_dict["class"] = "GPy.mappings.Linear"
input_dict["A"] = self.A.values.tolist()
return input_dict
@staticmethod
def _from_dict(mapping_class, input_dict):
import copy
input_dict = copy.deepcopy(input_dict)
A = np.array(input_dict.pop('A'))
l = Linear(**input_dict)
l.unlink_parameter(l.A)
l.update_model(False)
l.A = Param('A', A)
l.link_parameter(l.A)
return l


@ -35,7 +35,7 @@ class MLP(Mapping):
# Backpropagation to hidden layer.
dL_dact = np.dot(dL_dF, self.W2.T)
dL_dlayer1 = dL_dact / np.square(np.cosh(layer1))
dL_dlayer1 = dL_dact * (1 - np.square(activations))
# Finally, evaluate the first-layer gradients.
self.W1.gradient = np.dot(X.T,dL_dlayer1)
@ -47,7 +47,7 @@ class MLP(Mapping):
# Backpropagation to hidden layer.
dL_dact = np.dot(dL_dF, self.W2.T)
dL_dlayer1 = dL_dact / np.square(np.cosh(layer1))
dL_dlayer1 = dL_dact * (1 - np.square(activations))
return np.dot(dL_dlayer1, self.W1.T)
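The replacement in both hunks uses the identity 1/cosh(z)^2 = 1 - tanh(z)^2, so the backpropagated gradient can reuse the stored tanh activations instead of recomputing cosh of the pre-activations; the two expressions are mathematically equivalent:

import numpy as np

z = np.random.randn(4)
assert np.allclose(1./np.cosh(z)**2, 1. - np.tanh(z)**2)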


@ -9,6 +9,7 @@ from .gplvm import GPLVM
from .bcgplvm import BCGPLVM
from .sparse_gplvm import SparseGPLVM
from .warped_gp import WarpedGP
from .input_warped_gp import InputWarpedGP
from .bayesian_gplvm import BayesianGPLVM
from .mrd import MRD
from .gradient_checker import GradientChecker, HessianChecker, SkewChecker


@ -4,6 +4,7 @@
from ..core import GP
from .. import likelihoods
from .. import kern
import numpy as np
from ..inference.latent_function_inference.expectation_propagation import EP
class GPClassification(GP):
@ -27,3 +28,23 @@ class GPClassification(GP):
likelihood = likelihoods.Bernoulli()
GP.__init__(self, X=X, Y=Y, kernel=kernel, likelihood=likelihood, inference_method=EP(), mean_function=mean_function, name='gp_classification')
@staticmethod
def from_gp(gp):
from copy import deepcopy
gp = deepcopy(gp)
return GPClassification(gp.X, gp.Y, gp.kern, gp.likelihood, gp.inference_method, gp.mean_function, name='gp_classification')
def to_dict(self, save_data=True):
model_dict = super(GPClassification,self).to_dict(save_data)
model_dict["class"] = "GPy.models.GPClassification"
return model_dict
@staticmethod
def from_dict(input_dict, data=None):
import GPy
m = GPy.core.model.Model.from_dict(input_dict, data)
return GPClassification.from_gp(m)
def save_model(self, output_filename, compress=True, save_data=True):
self._save_model(output_filename, compress=True, save_data=True)


@ -35,3 +35,23 @@ class GPRegression(GP):
super(GPRegression, self).__init__(X, Y, kernel, likelihood, name='GP regression', Y_metadata=Y_metadata, normalizer=normalizer, mean_function=mean_function)
@staticmethod
def from_gp(gp):
from copy import deepcopy
gp = deepcopy(gp)
return GPRegression(gp.X, gp.Y, gp.kern, gp.Y_metadata, gp.normalizer, gp.likelihood.variance.values, gp.mean_function)
def to_dict(self, save_data=True):
model_dict = super(GPRegression,self).to_dict(save_data)
model_dict["class"] = "GPy.models.GPRegression"
return model_dict
@staticmethod
def _from_dict(input_dict, data=None):
import GPy
input_dict["class"] = "GPy.core.GP"
m = GPy.core.GP.from_dict(input_dict, data)
return GPRegression.from_gp(m)
def save_model(self, output_filename, compress=True, save_data=True):
self._save_model(output_filename, compress=True, save_data=True)
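A minimal sketch of the intended save/load round trip (file names illustrative; assumes the matching Model.load_model helper that accompanies _save_model in this release):

import numpy as np
import GPy

X = np.random.rand(20, 1)
Y = np.sin(X) + 0.05*np.random.randn(20, 1)
m = GPy.models.GPRegression(X, Y, GPy.kern.Matern32(1))
m.optimize()
m.save_model('gp_regression', compress=True, save_data=True)     # assumed to write gp_regression.zip when compressed

m2 = GPy.models.GPRegression.load_model('gp_regression.zip')
assert np.allclose(m.predict(X)[0], m2.predict(X)[0])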


@ -0,0 +1,149 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core import GP
from .. import likelihoods
from ..util.input_warping_functions import KumarWarping
from .. import kern
class InputWarpedGP(GP):
"""Input Warped GP
This defines a GP model that applies a warping function to the Input.
By default, it uses Kumar Warping (CDF of Kumaraswamy distribution)
Parameters
----------
X : array_like, shape = (n_samples, n_features) for input data
Y : array_like, shape = (n_samples, 1) for output data
kernel : object, optional
An instance of kernel function defined in GPy.kern
Default to Matern 32
warping_function : object, optional
An instance of warping function defined in GPy.util.input_warping_functions
Default to KumarWarping
warping_indices : list of int, optional
An list of indices of which features in X should be warped.
It is used in the Kumar warping function
normalizer : bool, optional
A bool variable indicates whether to normalize the output
Xmin : list of float, optional
The min values for every feature in X
It is used in the Kumar warping function
Xmax : list of float, optional
The max values for every feature in X
It is used in the Kumar warping function
epsilon : float, optional
We normalize X to [0+e, 1-e]. If not given, using the default value defined in KumarWarping function
Attributes
----------
X_untransformed : array_like, shape = (n_samples, n_features)
A copy of original input X
X_warped : array_like, shape = (n_samples, n_features)
Input data after warping
warping_function : object, optional
An instance of warping function defined in GPy.util.input_warping_functions
Default to KumarWarping
Notes
-----
Kumar warping uses the CDF of Kumaraswamy distribution. More on the Kumaraswamy distribution can be found at the
wiki page: https://en.wikipedia.org/wiki/Kumaraswamy_distribution
References
----------
Snoek, J.; Swersky, K.; Zemel, R. S. & Adams, R. P.
Input Warping for Bayesian Optimization of Non-stationary Functions
preprint arXiv:1402.0929, 2014
"""
def __init__(self, X, Y, kernel=None, normalizer=False, warping_function=None, warping_indices=None, Xmin=None, Xmax=None, epsilon=None):
if X.ndim == 1:
X = X.reshape(-1, 1)
self.X_untransformed = X.copy()
if kernel is None:
kernel = kern.sde_Matern32(X.shape[1], variance=1.)
self.kernel = kernel
if warping_function is None:
self.warping_function = KumarWarping(self.X_untransformed, warping_indices, epsilon, Xmin, Xmax)
else:
self.warping_function = warping_function
self.X_warped = self.transform_data(self.X_untransformed)
likelihood = likelihoods.Gaussian()
super(InputWarpedGP, self).__init__(self.X_warped, Y, likelihood=likelihood, kernel=kernel, normalizer=normalizer)
# Add the parameters in the warping function to the model parameters hierarchy
self.link_parameter(self.warping_function)
def parameters_changed(self):
"""Update the gradients of parameters for warping function
This method is called when having new values of parameters for warping function, kernels
and other parameters in a normal GP
"""
# using the warped X to update
self.X = self.transform_data(self.X_untransformed)
super(InputWarpedGP, self).parameters_changed()
# the gradient of log likelihood w.r.t. input AFTER warping is a product of dL_dK and dK_dX
dL_dX = self.kern.gradients_X(self.grad_dict['dL_dK'], self.X)
self.warping_function.update_grads(self.X_untransformed, dL_dX)
def transform_data(self, X, test_data=False):
"""Apply warping_function to some Input data
Parameters
----------
X : array_like, shape = (n_samples, n_features)
test_data: bool, optional
Default to False, should set to True when transforming test data
"""
return self.warping_function.f(X, test_data)
def log_likelihood(self):
"""Compute the marginal log likelihood
For input warping, just use the normal GP log likelihood
"""
return GP.log_likelihood(self)
def predict(self, Xnew):
"""Prediction on the new data
Parameters
----------
Xnew : array_like, shape = (n_samples, n_features)
The test data.
Returns
-------
mean : array_like, shape = (n_samples, output.dim)
Posterior mean at the location of Xnew
var : array_like, shape = (n_samples, 1)
Posterior variance at the location of Xnew
"""
Xnew_warped = self.transform_data(Xnew, test_data=True)
mean, var = super(InputWarpedGP, self).predict(Xnew_warped, kern=self.kernel, full_cov=False)
return mean, var
if __name__ == '__main__':
X = np.random.randn(100, 1)
Y = np.sin(X) + np.random.randn(100, 1)*0.05
m = InputWarpedGP(X, Y)
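    # Illustrative continuation of the demo: fit the model and predict inside the training range.
    m.optimize(messages=False)
    X_test = np.linspace(X.min(), X.max(), 50).reshape(-1, 1)
    mean, var = m.predict(X_test)
    print(m)    # the printed model includes the learned Kumar warping parameters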


@ -65,7 +65,7 @@ def plot_ARD(kernel, filtering=None, legend=False, canvas=None, **kwargs):
if canvas is None:
canvas, kwargs = pl().new_canvas(xlim=(-.5, kernel._effective_input_dim-.5), xlabel='input dimension', ylabel='sensitivity', **kwargs)
canvas, kwargs = pl().new_canvas(xlim=(-.5, kernel._effective_input_dim-.5), xlabel='input dimension', ylabel='ard contribution', **kwargs)
for i in range(ard_params.shape[0]):
if parts[i].name in filtering:


@ -0,0 +1,147 @@
import numpy as np
import unittest
import GPy
from GPy.models import GradientChecker
fixed_seed = 10
from nose.tools import with_setup, nottest
# this file will contain some high level tests, this is not unit testing, but will give us a higher level estimate
# if things are going well under the hood.
class TestObservationModels(unittest.TestCase):
def setUp(self):
np.random.seed(fixed_seed)
self.N = 100
self.D = 2
self.X = np.random.rand(self.N, self.D)
self.real_noise_std = 0.05
noise = np.random.randn(*self.X[:, 0].shape) * self.real_noise_std
self.Y = (np.sin(self.X[:, 0] * 2 * np.pi) + noise)[:, None]
self.num_points = self.X.shape[0]
self.f = np.random.rand(self.N, 1)
self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=np.int)[:, None]
# self.binary_Y[self.binary_Y == 0.0] = -1.0
self.positive_Y = np.exp(self.Y.copy())
self.Y_noisy = self.Y.copy()
self.Y_verynoisy = self.Y.copy()
self.Y_noisy[75] += 1.3
self.init_var = 0.15
self.deg_free = 4.
censored = np.zeros_like(self.Y)
random_inds = np.random.choice(self.N, int(self.N / 2), replace=True)
censored[random_inds] = 1
self.Y_metadata = dict()
self.Y_metadata['censored'] = censored
self.kernel1 = GPy.kern.RBF(self.X.shape[1]) + GPy.kern.White(self.X.shape[1])
def tearDown(self):
self.Y = None
self.X = None
self.binary_Y =None
self.positive_Y = None
self.kernel1 = None
@with_setup(setUp, tearDown)
def testEPClassification(self):
bernoulli = GPy.likelihoods.Bernoulli()
laplace_inf = GPy.inference.latent_function_inference.Laplace()
ep_inf_alt = GPy.inference.latent_function_inference.EP(ep_mode='alternated')
ep_inf_nested = GPy.inference.latent_function_inference.EP(ep_mode='nested')
ep_inf_fractional = GPy.inference.latent_function_inference.EP(ep_mode='nested', eta=0.9)
m1 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=laplace_inf)
m1.randomize()
m2 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_alt)
m2.randomize()
m3 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_nested)
m3.randomize()
#
m4 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_fractional)
m4.randomize()
optimizer = 'bfgs'
#do gradcheck here ...
# self.assertTrue(m1.checkgrad())
# self.assertTrue(m2.checkgrad())
# self.assertTrue(m3.checkgrad())
# self.assertTrue(m4.checkgrad())
m1.optimize(optimizer=optimizer, max_iters=300)
m2.optimize(optimizer=optimizer, max_iters=300)
m3.optimize(optimizer=optimizer, max_iters=300)
m4.optimize(optimizer=optimizer, max_iters=300)
# taking laplace predictions as the ground truth
probs_mean_lap, probs_var_lap = m1.predict(self.X)
probs_mean_ep_alt, probs_var_ep_alt = m2.predict(self.X)
probs_mean_ep_nested, probs_var_ep_nested = m3.predict(self.X)
# For simple single-dimensional data, the marginal likelihoods of the Laplace and EP approximations should not be far apart.
self.assertAlmostEqual(m1.log_likelihood(), m2.log_likelihood(),delta=1)
self.assertAlmostEqual(m1.log_likelihood(), m3.log_likelihood(), delta=1)
self.assertAlmostEqual(m1.log_likelihood(), m4.log_likelihood(), delta=5)
GPy.util.classification.conf_matrix(probs_mean_lap, self.binary_Y)
GPy.util.classification.conf_matrix(probs_mean_ep_alt, self.binary_Y)
GPy.util.classification.conf_matrix(probs_mean_ep_nested, self.binary_Y)
@nottest
def rmse(self, Y, Ystar):
return np.sqrt(np.mean((Y - Ystar) ** 2))
@with_setup(setUp, tearDown)
def test_EP_with_StudentT(self):
studentT = GPy.likelihoods.StudentT(deg_free=self.deg_free, sigma2=self.init_var)
laplace_inf = GPy.inference.latent_function_inference.Laplace()
ep_inf_alt = GPy.inference.latent_function_inference.EP(ep_mode='alternated')
ep_inf_nested = GPy.inference.latent_function_inference.EP(ep_mode='nested')
ep_inf_frac = GPy.inference.latent_function_inference.EP(ep_mode='nested', eta=0.7)
m1 = GPy.core.GP(self.X.copy(), self.Y_noisy.copy(), kernel=self.kernel1.copy(), likelihood=studentT.copy(), inference_method=laplace_inf)
# optimize
m1['.*white'].constrain_fixed(1e-5)
m1.randomize()
m2 = GPy.core.GP(self.X.copy(), self.Y_noisy.copy(), kernel=self.kernel1.copy(), likelihood=studentT.copy(), inference_method=ep_inf_alt)
m2['.*white'].constrain_fixed(1e-5)
# m2.constrain_bounded('.*t_scale2', 0.001, 10)
m2.randomize()
# m3 = GPy.core.GP(self.X, self.Y_noisy.copy(), kernel=self.kernel1, likelihood=studentT.copy(), inference_method=ep_inf_nested)
# m3['.*white'].constrain_fixed(1e-5)
# # m3.constrain_bounded('.*t_scale2', 0.001, 10)
# m3.randomize()
optimizer='bfgs'
m1.optimize(optimizer=optimizer,max_iters=400)
m2.optimize(optimizer=optimizer, max_iters=400)
# m3.optimize(optimizer=optimizer, max_iters=500)
self.assertAlmostEqual(m1.log_likelihood(), m2.log_likelihood(),delta=200)
# self.assertAlmostEqual(m1.log_likelihood(), m3.log_likelihood(), 3)
preds_mean_lap, preds_var_lap = m1.predict(self.X)
preds_mean_alt, preds_var_alt = m2.predict(self.X)
# preds_mean_nested, preds_var_nested = m3.predict(self.X)
rmse_lap = self.rmse(preds_mean_lap, self.Y)
rmse_alt = self.rmse(preds_mean_alt, self.Y)
# rmse_nested = self.rmse(preds_mean_nested, self.Y_noisy)
if rmse_alt > rmse_lap:
self.assertAlmostEqual(rmse_lap, rmse_alt, delta=1.5)
# m3.optimize(optimizer=optimizer, max_iters=500)
if __name__ == "__main__":
unittest.main()

View file

@ -61,6 +61,18 @@ class InferenceGPEP(unittest.TestCase):
Y = lik.samples(f).reshape(-1,1)
return X, Y
def genNoisyData(self):
np.random.seed(1)
X = np.random.rand(100,1)
self.real_std = 0.1
noise = np.random.randn(*X[:, 0].shape)*self.real_std
Y = (np.sin(X[:, 0]*2*np.pi) + noise)[:, None]
self.f = np.random.rand(X.shape[0],1)
Y_extra_noisy = Y.copy()
Y_extra_noisy[50] += 4.
# Y_extra_noisy[80:83] -= 2.
return X, Y, Y_extra_noisy
def test_inference_EP(self):
from paramz import ObsAr
X, Y = self.genData()
@ -73,11 +85,45 @@ class InferenceGPEP(unittest.TestCase):
inference_method=inf,
likelihood=lik)
K = self.model.kern.K(X)
- mu, Sigma, mu_tilde, tau_tilde, log_Z_tilde = self.model.inference_method.expectation_propagation(K, ObsAr(Y), lik, None)
- v_tilde = mu_tilde * tau_tilde
- p, m, d = self.model.inference_method._inference(K, tau_tilde, v_tilde, lik, Y_metadata=None, Z_tilde=log_Z_tilde.sum())
- p0, m0, d0 = super(GPy.inference.latent_function_inference.expectation_propagation.EP, inf).inference(k, X, lik, mu_tilde[:,None], mean_function=None, variance=1./tau_tilde, K=K, Z_tilde=log_Z_tilde.sum() + np.sum(- 0.5*np.log(tau_tilde) + 0.5*(v_tilde*v_tilde*1./tau_tilde)))
+ post_params, ga_approx, cav_params, log_Z_tilde = self.model.inference_method.expectation_propagation(K, ObsAr(Y), lik, None)
+ mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
+ p, m, d = self.model.inference_method._inference(Y, K, ga_approx, cav_params, lik, Y_metadata=None, Z_tilde=log_Z_tilde)
+ p0, m0, d0 = super(GPy.inference.latent_function_inference.expectation_propagation.EP, inf).inference(k, X, lik, mu_tilde[:,None], mean_function=None, variance=1./ga_approx.tau, K=K, Z_tilde=log_Z_tilde + np.sum(- 0.5*np.log(ga_approx.tau) + 0.5*(ga_approx.v*ga_approx.v*1./ga_approx.tau)))
assert (np.sum(np.array([m - m0,
np.sum(d['dL_dK'] - d0['dL_dK']),
np.sum(d['dL_dthetaL'] - d0['dL_dthetaL']),
np.sum(d['dL_dm'] - d0['dL_dm']),
np.sum(p._woodbury_vector - p0._woodbury_vector),
np.sum(p.woodbury_inv - p0.woodbury_inv)])) < 1e6)
# NOTE: adding a test like the one above for a parameterized likelihood. The test above
# only covers the probit likelihood, which has no tunable hyperparameters, which is why
# the dL_dthetaL term in the dictionary of gradients is always zero. So here we repeat the test for the
# Student-t likelihood and the heteroscedastic Gaussian noise case. This test simply checks whether the posterior
# and the gradients of the log marginal are roughly the same for inference through EP and for exact Gaussian inference using
# the Gaussian approximation of the individual likelihood site terms. For the probit likelihood it is possible to
# calculate the moments analytically, but for other likelihoods we need numerical quadrature,
# and errors might creep in from the quadrature implementation.
def test_inference_EP_non_classification(self):
from paramz import ObsAr
X, Y, Y_extra_noisy = self.genNoisyData()
deg_freedom = 5.
init_noise_var = 0.08
lik_studentT = GPy.likelihoods.StudentT(deg_free=deg_freedom, sigma2=init_noise_var)
# like_gaussian_noise = GPy.likelihoods.MixedNoise()
k = GPy.kern.RBF(1, variance=2., lengthscale=1.1)
ep_inf_alt = GPy.inference.latent_function_inference.expectation_propagation.EP(max_iters=4, delta=0.5)
# ep_inf_nested = GPy.inference.latent_function_inference.expectation_propagation.EP(ep_mode='nested', max_iters=100, delta=0.5)
m = GPy.core.GP(X=X,Y=Y_extra_noisy,kernel=k,likelihood=lik_studentT,inference_method=ep_inf_alt)
K = m.kern.K(X)
post_params, ga_approx, cav_params, log_Z_tilde = m.inference_method.expectation_propagation(K, ObsAr(Y_extra_noisy), lik_studentT, None)
mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
p, m, d = m.inference_method._inference(Y_extra_noisy, K, ga_approx, cav_params, lik_studentT, Y_metadata=None, Z_tilde=log_Z_tilde)
p0, m0, d0 = super(GPy.inference.latent_function_inference.expectation_propagation.EP, ep_inf_alt).inference(k, X,lik_studentT ,mu_tilde[:,None], mean_function=None, variance=1./ga_approx.tau, K=K, Z_tilde=log_Z_tilde + np.sum(- 0.5*np.log(ga_approx.tau) + 0.5*(ga_approx.v*ga_approx.v*1./ga_approx.tau)))
assert (np.sum(np.array([m - m0,
np.sum(d['dL_dK'] - d0['dL_dK']),

View file

@ -123,6 +123,11 @@ class TestNoiseModels(object):
self.var = 0.2
self.deg_free = 4.0
censored = np.zeros_like(self.Y)
random_inds = np.random.choice(self.N, int(self.N / 2), replace=True)
censored[random_inds] = 1
self.Y_metadata = dict()
self.Y_metadata['censored'] = censored
#Make a bigger step as lower bound can be quite curved
self.step = 1e-4
@ -274,6 +279,20 @@ class TestNoiseModels(object):
"Y_metadata": {'trials': self.ns}, "Y_metadata": {'trials': self.ns},
"laplace": True, "laplace": True,
}, },
"loglogistic_censored": {
"model": GPy.likelihoods.LogLogistic(),
"link_f_constraints": [self.constrain_positive],
"Y": self.positive_Y,
"Y_metadata": self.Y_metadata,
"laplace": True
},
"weibull_censored": {
"model": GPy.likelihoods.Weibull(),
"link_f_constraints": [self.constrain_positive],
"Y": self.positive_Y,
"Y_metadata": self.Y_metadata,
"laplace": True
}
#,
#GAMMA needs some work!"Gamma_default": {
#"model": GPy.likelihoods.Gamma(),

View file

@ -399,6 +399,68 @@ class MiscTests(unittest.TestCase):
m.optimize()
print(m)
def test_input_warped_gp_identity(self):
"""
An InputWarpedGP with the identity warping function should be
equivalent to a standard GP.
"""
k = GPy.kern.RBF(1)
m = GPy.models.GPRegression(self.X, self.Y, kernel=k)
m.optimize()
preds = m.predict(self.X)
warp_k = GPy.kern.RBF(1)
warp_f = GPy.util.input_warping_functions.IdentifyWarping()
warp_m = GPy.models.InputWarpedGP(self.X, self.Y, kernel=warp_k, warping_function=warp_f)
warp_m.optimize()
warp_preds = warp_m.predict(self.X)
np.testing.assert_almost_equal(preds, warp_preds, decimal=4)
def test_kumar_warping_gradient(self):
n_X = 100
np.random.seed(0)
X = np.random.randn(n_X, 2)
Y = np.sum(np.sin(X), 1).reshape(n_X, 1)
k1 = GPy.kern.Linear(2)
m1 = GPy.models.InputWarpedGP(X, Y, kernel=k1)
m1.randomize()
self.assertEquals(m1.checkgrad(), True)
k2 = GPy.kern.RBF(2)
m2 = GPy.models.InputWarpedGP(X, Y, kernel=k2)
m2.randomize()
m2.checkgrad()
self.assertEquals(m2.checkgrad(), True)
k3 = GPy.kern.Matern52(2)
m3 = GPy.models.InputWarpedGP(X, Y, kernel=k3)
m3.randomize()
m3.checkgrad()
self.assertEquals(m3.checkgrad(), True)
def test_kumar_warping_parameters(self):
np.random.seed(1)
X = np.random.rand(5, 2)
epsilon = 1e-6
# testing warping indices
warping_ind_1 = [0, 1, 2]
warping_ind_2 = [-1, 1, 2]
warping_ind_3 = [0, 1.5, 2]
self.failUnlessRaises(ValueError, GPy.util.input_warping_functions.KumarWarping, X, warping_ind_1)
self.failUnlessRaises(ValueError, GPy.util.input_warping_functions.KumarWarping, X, warping_ind_2)
self.failUnlessRaises(ValueError, GPy.util.input_warping_functions.KumarWarping, X, warping_ind_3)
# testing Xmin and Xmax
Xmin_1, Xmax_1 = None, [1, 1]
Xmin_2, Xmax_2 = [0, 0], None
Xmin_3, Xmax_3 = [0, 0, 0], [1, 1]
self.failUnlessRaises(ValueError, GPy.util.input_warping_functions.KumarWarping, X, [0, 1], epsilon, Xmin_1, Xmax_1)
self.failUnlessRaises(ValueError, GPy.util.input_warping_functions.KumarWarping, X, [0, 1], epsilon, Xmin_2, Xmax_2)
self.failUnlessRaises(ValueError, GPy.util.input_warping_functions.KumarWarping, X, [0, 1], epsilon, Xmin_3, Xmax_3)
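For reference, a hedged sketch of a construction that passes the validity checks exercised above, using the same KumarWarping API; the array values are illustrative only:
import numpy as np
import GPy

X = np.random.rand(5, 2)
warp = GPy.util.input_warping_functions.KumarWarping(
    X, warping_indices=[0, 1], epsilon=1e-6, Xmin=[0.0, 0.0], Xmax=[1.0, 1.0])
X_warped = warp.f(X)   # same shape as X; each selected column is mapped through 1 - (1 - x^a)^b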
def test_warped_gp_identity(self):
"""
A WarpedGP with the identity warping function should be

View file

@ -0,0 +1,39 @@
from __future__ import print_function, division
import numpy as np
import GPy
import warnings
from ..util.quad_integrate import quadgk_int, quadvgk
class QuadTests(np.testing.TestCase):
"""
Test file for checking the implementation of Gauss-Kronrod quadrature.
We take a function that can be integrated analytically and check whether the quadgk result agrees.
This lets us test how numerically accurate the quadrature implementation (native numpy or manual code) is.
"""
def setUp(self):
pass
def test_infinite_quad(self):
def f(x):
return np.exp(-0.5*x**2)*np.power(x,np.arange(3)[:,None])
quad_int_val = quadgk_int(f)
real_val = np.sqrt(np.pi * 2)
np.testing.assert_almost_equal(real_val, quad_int_val[0], decimal=7)
def test_finite_quad(self):
def f2(x):
return x**2
quad_int_val = quadvgk(f2, 1.,2.)
real_val = 7/3.
np.testing.assert_almost_equal(real_val, quad_int_val, decimal=5)
if __name__ == '__main__':
def f(x):
return np.exp(-0.5 * x ** 2) * np.power(x, np.arange(3)[:, None])
quad_int_val = quadgk_int(f)
real_val = np.sqrt(np.pi*2)
np.testing.assert_almost_equal(real_val, quad_int_val[0], decimal=7)
print(quadgk_int(f))

View file

@ -0,0 +1,202 @@
'''
Created on 20 April 2017
@author: pgmoren
'''
import unittest, itertools
#import cPickle as pickle
import pickle
import numpy as np
import tempfile
import GPy
from nose import SkipTest
import numpy as np
fixed_seed = 11
class Test(unittest.TestCase):
def test_serialize_deserialize_kernels(self):
k1 = GPy.kern.RBF(2, variance=1.0, lengthscale=[1.0,1.0], ARD=True)
k2 = GPy.kern.RatQuad(2, variance=2.0, lengthscale=1.0, power=2.0, active_dims = [0,1])
k3 = GPy.kern.Bias(2, variance=2.0, active_dims = [1,0])
k4 = GPy.kern.StdPeriodic(2, variance=2.0, lengthscale=1.0, period=1.0, active_dims = [1,1])
k5 = GPy.kern.Linear(2, variances=[2.0, 1.0], ARD=True, active_dims = [1,1])
k6 = GPy.kern.Exponential(2, variance=1., lengthscale=2)
k7 = GPy.kern.Matern32(2, variance=1.0, lengthscale=[1.0,3.0], ARD=True, active_dims = [1,1])
k8 = GPy.kern.Matern52(2, variance=2.0, lengthscale=[2.0,1.0], ARD=True, active_dims = [1,0])
k9 = GPy.kern.ExpQuad(2, variance=3.0, lengthscale=[1.0,2.0], ARD=True, active_dims = [0,1])
k10 = k1 + k1.copy() + k2 + k3 + k4 + k5 + k6
k11 = k1 * k2 * k2.copy() * k3 * k4 * k5
k12 = (k1 + k2) * (k3 + k4 + k5)
k13 = ((k1 + k2) * k3) + k4 + k5 * k7
k14 = ((k1 + k2) * k3) + k4 * k5 + k8
k15 = ((k1 * k2) * k3) + k4 * k5 + k8 + k9
k_list = [k1,k2,k3,k4,k5,k6,k7,k8,k9,k10,k11,k12,k13,k14,k15]
for kk in k_list:
kk_dict = kk.to_dict()
kk_r = GPy.kern.Kern.from_dict(kk_dict)
assert type(kk) == type(kk_r)
np.testing.assert_array_equal(kk[:], kk_r[:])
np.testing.assert_array_equal(np.array(kk.active_dims), np.array(kk_r.active_dims))
def test_serialize_deserialize_mappings(self):
m1 = GPy.mappings.Identity(3,2)
m2 = GPy.mappings.Constant(3,2,1)
m2_r = GPy.core.mapping.Mapping.from_dict(m2.to_dict())
np.testing.assert_array_equal(m2.C.values[:], m2_r.C.values[:])
m3 = GPy.mappings.Linear(3,2)
m3_r = GPy.core.mapping.Mapping.from_dict(m3.to_dict())
assert np.all(m3.A == m3_r.A)
m_list = [m1, m2, m3]
for mm in m_list:
mm_dict = mm.to_dict()
mm_r = GPy.core.mapping.Mapping.from_dict(mm_dict)
assert type(mm) == type(mm_r)
assert type(mm.input_dim) == type(mm_r.input_dim)
assert type(mm.output_dim) == type(mm_r.output_dim)
def test_serialize_deserialize_likelihoods(self):
l1 = GPy.likelihoods.Gaussian(GPy.likelihoods.link_functions.Identity(),variance=3.0)
l1_r = GPy.likelihoods.likelihood.Likelihood.from_dict(l1.to_dict())
l2 = GPy.likelihoods.Bernoulli(GPy.likelihoods.link_functions.Probit())
l2_r = GPy.likelihoods.likelihood.Likelihood.from_dict(l2.to_dict())
assert type(l1) == type(l1_r)
assert np.all(l1.variance == l1_r.variance)
assert type(l2) == type(l2_r)
def test_serialize_deserialize_normalizers(self):
n1 = GPy.util.normalizer.Standardize()
n1.scale_by(np.random.rand(10))
n1_r = GPy.util.normalizer._Norm.from_dict((n1.to_dict()))
assert type(n1) == type(n1_r)
assert np.all(n1.mean == n1_r.mean)
assert np.all(n1.std == n1_r.std)
def test_serialize_deserialize_link_functions(self):
l1 = GPy.likelihoods.link_functions.Identity()
l2 = GPy.likelihoods.link_functions.Probit()
l_list = [l1, l2]
for ll in l_list:
ll_dict = ll.to_dict()
ll_r = GPy.likelihoods.link_functions.GPTransformation.from_dict(ll_dict)
assert type(ll) == type(ll_r)
def test_serialize_deserialize_inference_methods(self):
e1 = GPy.inference.latent_function_inference.expectation_propagation.EP(ep_mode="nested")
e1.ga_approx_old = GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(np.random.rand(10),np.random.rand(10))
e1._ep_approximation = []
e1._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.posteriorParams(np.random.rand(10),np.random.rand(100).reshape((10,10))))
e1._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(np.random.rand(10),np.random.rand(10)))
e1._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.cavityParams(10))
e1._ep_approximation[-1].v = np.random.rand(10)
e1._ep_approximation[-1].tau = np.random.rand(10)
e1._ep_approximation.append(np.random.rand(10))
e1_r = GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(e1.to_dict())
assert type(e1) == type(e1_r)
assert e1.epsilon==e1_r.epsilon
assert e1.eta==e1_r.eta
assert e1.delta==e1_r.delta
assert e1.always_reset==e1_r.always_reset
assert e1.max_iters==e1_r.max_iters
assert e1.ep_mode==e1_r.ep_mode
assert e1.parallel_updates==e1_r.parallel_updates
np.testing.assert_array_equal(e1.ga_approx_old.tau[:], e1_r.ga_approx_old.tau[:])
np.testing.assert_array_equal(e1.ga_approx_old.v[:], e1_r.ga_approx_old.v[:])
np.testing.assert_array_equal(e1._ep_approximation[0].mu[:], e1_r._ep_approximation[0].mu[:])
np.testing.assert_array_equal(e1._ep_approximation[0].Sigma[:], e1_r._ep_approximation[0].Sigma[:])
np.testing.assert_array_equal(e1._ep_approximation[1].tau[:], e1_r._ep_approximation[1].tau[:])
np.testing.assert_array_equal(e1._ep_approximation[1].v[:], e1_r._ep_approximation[1].v[:])
np.testing.assert_array_equal(e1._ep_approximation[2].tau[:], e1_r._ep_approximation[2].tau[:])
np.testing.assert_array_equal(e1._ep_approximation[2].v[:], e1_r._ep_approximation[2].v[:])
np.testing.assert_array_equal(e1._ep_approximation[3][:], e1_r._ep_approximation[3][:])
e2 = GPy.inference.latent_function_inference.exact_gaussian_inference.ExactGaussianInference()
e2_r = GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(e2.to_dict())
assert type(e2) == type(e2_r)
def test_serialize_deserialize_model(self):
np.random.seed(fixed_seed)
N = 20
Nhalf = int(N/2)
X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[:, None]
Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
kernel = GPy.kern.RBF(1)
likelihood = GPy.likelihoods.Bernoulli()
inference_method=GPy.inference.latent_function_inference.expectation_propagation.EP(ep_mode="nested")
mean_function=None
m = GPy.core.GP(X=X, Y=Y, kernel=kernel, likelihood=likelihood, inference_method=inference_method, mean_function=mean_function, normalizer=True, name='gp_classification')
m.optimize()
m.save_model("temp_test_gp_with_data.json", compress=True, save_data=True)
m.save_model("temp_test_gp_without_data.json", compress=True, save_data=False)
m1_r = GPy.core.GP.load_model("temp_test_gp_with_data.json.zip")
m2_r = GPy.core.GP.load_model("temp_test_gp_without_data.json.zip", (X,Y))
import os
os.remove("temp_test_gp_with_data.json.zip")
os.remove("temp_test_gp_without_data.json.zip")
var = m.predict(X)[0]
var1_r = m1_r.predict(X)[0]
var2_r = m2_r.predict(X)[0]
np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
np.testing.assert_array_equal(np.array(var).flatten(), np.array(var2_r).flatten())
def test_serialize_deserialize_inference_GPRegressor(self):
np.random.seed(fixed_seed)
N = 50
N_new = 50
D = 1
X = np.random.uniform(-3., 3., (N, 1))
Y = np.sin(X) + np.random.randn(N, D) * 0.05
X_new = np.random.uniform(-3., 3., (N_new, 1))
k = GPy.kern.RBF(input_dim=1, lengthscale=10)
m = GPy.models.GPRegression(X,Y,k)
m.optimize()
m.save_model("temp_test_gp_regressor_with_data.json", compress=True, save_data=True)
m.save_model("temp_test_gp_regressor_without_data.json", compress=True, save_data=False)
m1_r = GPy.models.GPRegression.load_model("temp_test_gp_regressor_with_data.json.zip")
m2_r = GPy.models.GPRegression.load_model("temp_test_gp_regressor_without_data.json.zip", (X,Y))
import os
os.remove("temp_test_gp_regressor_with_data.json.zip")
os.remove("temp_test_gp_regressor_without_data.json.zip")
Xp = np.random.uniform(size=(int(1e5),1))
Xp[:,0] = Xp[:,0]*15-5
_, var = m.predict(Xp)
_, var1_r = m1_r.predict(Xp)
_, var2_r = m2_r.predict(Xp)
np.testing.assert_array_equal(var.flatten(), var1_r.flatten())
np.testing.assert_array_equal(var.flatten(), var2_r.flatten())
def test_serialize_deserialize_inference_GPClassifier(self):
np.random.seed(fixed_seed)
N = 50
Nhalf = int(N/2)
X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[:, None]
Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
kernel = GPy.kern.RBF(1)
m = GPy.models.GPClassification(X, Y, kernel=kernel)
m.optimize()
m.save_model("temp_test_gp_classifier_with_data.json", compress=True, save_data=True)
m.save_model("temp_test_gp_classifier_without_data.json", compress=True, save_data=False)
m1_r = GPy.models.GPClassification.load_model("temp_test_gp_classifier_with_data.json.zip")
m2_r = GPy.models.GPClassification.load_model("temp_test_gp_classifier_without_data.json.zip", (X,Y))
import os
os.remove("temp_test_gp_classifier_with_data.json.zip")
os.remove("temp_test_gp_classifier_without_data.json.zip")
var = m.predict(X)[0]
var1_r = m1_r.predict(X)[0]
var2_r = m2_r.predict(X)[0]
np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
np.testing.assert_array_equal(np.array(var).flatten(), np.array(var2_r).flatten())
if __name__ == "__main__":
#import sys;sys.argv = ['', 'Test.test_parameter_index_operations']
unittest.main()

View file

@ -17,3 +17,4 @@ from . import multioutput
from . import parallel
from . import functions
from . import cluster_with_offset
from . import input_warping_functions

View file

@ -0,0 +1,262 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core.parameterization import Parameterized, Param
from ..core.parameterization.priors import LogGaussian
class InputWarpingFunction(Parameterized):
"""Abstract class for input warping functions
"""
def __init__(self, name):
super(InputWarpingFunction, self).__init__(name=name)
def f(self, X, test=False):
raise NotImplementedError
def fgrad_x(self, X):
raise NotImplementedError
def update_grads(self, X, dL_dW):
raise NotImplementedError
class IdentifyWarping(InputWarpingFunction):
"""The identity warping function, for testing"""
def __init__(self):
super(IdentifyWarping, self).__init__(name='input_warp_identity')
def f(self, X, test_data=False):
return X
def fgrad_X(self, X):
return np.zeros(X.shape)
def update_grads(self, X, dL_dW):
pass
class InputWarpingTest(InputWarpingFunction):
"""The identity warping function, for testing"""
def __init__(self):
super(InputWarpingTest, self).__init__(name='input_warp_test')
self.a = Param('a', 1.0)
self.set_prior(LogGaussian(0.0, 0.75))
self.link_parameter(self.a)
def f(self, X, test_data=False):
return X * self.a
def fgrad_X(self, X):
return np.ones(X.shape) * self.a
def update_grads(self, X, dL_dW):
self.a.gradient[:] = np.sum(dL_dW * X)
class KumarWarping(InputWarpingFunction):
"""Kumar Warping for input data
Parameters
----------
X : array_like, shape = (n_samples, n_features)
The input data that is going to be warped
warping_indices: list of int, optional
The features that are going to be warped
Default to warp all the features
epsilon: float, optional
Used to normalize input data to [0+e, 1-e]
Default to 1e-6
Xmin : list of float, Optional
The min values for each feature defined by users
Default to the train minimum
Xmax : list of float, Optional
The max values for each feature defined by users
Default to the train maximum
Attributes
----------
warping_indices: list of int
The features that are going to be warped
Default to warp all the features
warping_dim: int
The number of features to be warped
Xmin : list of float
The min values for each feature defined by users
Default to the train minimum
Xmax : list of float
The max values for each feature defined by users
Default to the train maximum
epsilon: float
Used to normalize input data to [0+e, 1-e]
Default to 1e-6
X_normalized : array_like, shape = (n_samples, n_features)
The normalized training X
scaling : list of float, length = n_features in X
Defined as 1.0 / (self.Xmax - self.Xmin)
params : list of Param
The list of all the parameters used in Kumar Warping
num_parameters: int
The number of parameters used in Kumar Warping
"""
def __init__(self, X, warping_indices=None, epsilon=None, Xmin=None, Xmax=None):
super(KumarWarping, self).__init__(name='input_warp_kumar')
if warping_indices is not None and np.max(warping_indices) > X.shape[1] -1:
raise ValueError("Kumar warping indices exceed feature dimension")
if warping_indices is not None and np.min(warping_indices) < 0:
raise ValueError("Kumar warping indices should be larger than 0")
if warping_indices is not None and np.any(list(map(lambda x: not isinstance(x, int), warping_indices))):
raise ValueError("Kumar warping indices should be integer")
if Xmin is None and Xmax is None:
Xmin = X.min(axis=0)
Xmax = X.max(axis=0)
else:
if Xmin is None or Xmax is None:
raise ValueError("Xmin and Xmax need to be provide at the same time!")
if len(Xmin) != X.shape[1] or len(Xmax) != X.shape[1]:
raise ValueError("Xmin and Xmax should have n_feature values!")
if epsilon is None:
epsilon = 1e-6
self.epsilon = epsilon
self.Xmin = Xmin - self.epsilon
self.Xmax = Xmax + self.epsilon
self.scaling = 1.0 / (self.Xmax - self.Xmin)
self.X_normalized = (X - self.Xmin) / (self.Xmax - self.Xmin)
if warping_indices is None:
warping_indices = range(X.shape[1])
self.warping_indices = warping_indices
self.warping_dim = len(self.warping_indices)
self.num_parameters = 2 * self.warping_dim
# create parameters
self.params = [[Param('a%d' % i, 1.0), Param('b%d' % i, 1.0)] for i in range(self.warping_dim)]
# add constraints
for i in range(self.warping_dim):
self.params[i][0].constrain_bounded(0.0, 10.0)
self.params[i][1].constrain_bounded(0.0, 10.0)
# set priors and add them into handler
for i in range(self.warping_dim):
self.params[i][0].set_prior(LogGaussian(0.0, 0.75))
self.params[i][1].set_prior(LogGaussian(0.0, 0.75))
self.link_parameter(self.params[i][0])
self.link_parameter(self.params[i][1])
def f(self, X, test_data=False):
"""Apply warping_function to some Input data
Parameters:
-----------
X : array_like, shape = (n_samples, n_features)
test_data: bool, optional
Defaults to False; should be set to True when transforming test data
Returns
-------
X_warped : array_like, shape = (n_samples, n_features)
The warped input data
Math
----
f(x) = 1 - (1 - x^a)^b
"""
X_warped = X.copy()
if test_data:
X_normalized = (X - self.Xmin) / (self.Xmax - self.Xmin)
else:
X_normalized = self.X_normalized
for i_seq, i_fea in enumerate(self.warping_indices):
a, b = self.params[i_seq][0], self.params[i_seq][1]
X_warped[:, i_fea] = 1 - np.power(1 - np.power(X_normalized[:, i_fea], a), b)
return X_warped
def fgrad_X(self, X):
"""Compute the gradient of warping function with respect to X
Parameters
----------
X : array_like, shape = (n_samples, n_features)
The location to compute gradient
Returns
-------
grad : array_like, shape = (n_samples, n_features)
The gradient for every location at X
Math
----
grad = a * b * x ^(a-1) * (1 - x^a)^(b-1)
"""
grad = np.zeros(X.shape)
for i_seq, i_fea in enumerate(self.warping_indices):
a, b = self.params[i_seq][0], self.params[i_seq][1]
grad[:, i_fea] = a * b * np.power(self.X_normalized[:, i_fea], a-1) * \
np.power(1 - np.power(self.X_normalized[:, i_fea], a), b-1) * self.scaling[i_fea]
return grad
def update_grads(self, X, dL_dW):
"""Update the gradients of marginal log likelihood with respect to the parameters of warping function
Parameters
----------
X : array_like, shape = (n_samples, n_features)
The input BEFORE warping
dL_dW : array_like, shape = (n_samples, n_features)
The gradient of marginal log likelihood with respect to the Warped input
Math
----
let w = f(x), the input after warping, then
dW_da = b * (1 - x^a)^(b - 1) * x^a * ln(x)
dW_db = - (1 - x^a)^b * ln(1 - x^a)
dL_da = dL_dW * dW_da
dL_db = dL_dW * dW_db
"""
for i_seq, i_fea in enumerate(self.warping_indices):
ai, bi = self.params[i_seq][0], self.params[i_seq][1]
# cache some values to save computation
x_pow_a = np.power(self.X_normalized[:, i_fea], ai)
# compute gradient for ai, bi on all X
dz_dai = bi * np.power(1 - x_pow_a, bi-1) * x_pow_a * np.log(self.X_normalized[:, i_fea])
dz_dbi = - np.power(1 - x_pow_a, bi) * np.log(1 - x_pow_a)
# sum gradients on all the data
dL_dai = np.sum(dL_dW[:, i_fea] * dz_dai)
dL_dbi = np.sum(dL_dW[:, i_fea] * dz_dbi)
self.params[i_seq][0].gradient[:] = dL_dai
self.params[i_seq][1].gradient[:] = dL_dbi
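The analytic gradients above can be sanity-checked numerically. A small self-contained sketch in plain NumPy (the values of a and b are illustrative) comparing dW_da and dW_db from the docstring against central finite differences:
import numpy as np

def kumar(x, a, b):
    # Kumar warping on already-normalized inputs: f(x) = 1 - (1 - x^a)^b
    return 1.0 - np.power(1.0 - np.power(x, a), b)

x = np.linspace(0.05, 0.95, 10)
a, b, eps = 1.3, 0.7, 1e-6

dW_da = b * np.power(1 - x**a, b - 1) * x**a * np.log(x)
dW_db = -np.power(1 - x**a, b) * np.log(1 - x**a)

num_da = (kumar(x, a + eps, b) - kumar(x, a - eps, b)) / (2 * eps)
num_db = (kumar(x, a, b + eps) - kumar(x, a, b - eps)) / (2 * eps)

np.testing.assert_allclose(dW_da, num_da, rtol=1e-4)
np.testing.assert_allclose(dW_db, num_db, rtol=1e-4)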

View file

@ -33,6 +33,27 @@ class _Norm(object):
""" """
raise NotImplementedError raise NotImplementedError
def to_dict(self):
raise NotImplementedError
def _to_dict(self):
input_dict = {}
return input_dict
@staticmethod
def from_dict(input_dict):
import copy
input_dict = copy.deepcopy(input_dict)
normalizer_class = input_dict.pop('class')
import GPy
normalizer_class = eval(normalizer_class)
return normalizer_class._from_dict(normalizer_class, input_dict)
@staticmethod
def _from_dict(normalizer_class, input_dict):
return normalizer_class(**input_dict)
class Standardize(_Norm):
def __init__(self):
self.mean = None
@ -50,9 +71,26 @@ class Standardize(_Norm):
def scaled(self):
return self.mean is not None
def to_dict(self):
input_dict = super(Standardize, self)._to_dict()
input_dict["class"] = "GPy.util.normalizer.Standardize"
if self.mean is not None:
input_dict["mean"] = self.mean.tolist()
input_dict["std"] = self.std.tolist()
return input_dict
@staticmethod
def _from_dict(kernel_class, input_dict):
s = Standardize()
if "mean" in input_dict:
s.mean = np.array(input_dict["mean"])
if "std" in input_dict:
s.std = np.array(input_dict["std"])
return s
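A minimal round-trip sketch for these serialization hooks, using only the scale_by, to_dict and from_dict calls exercised by the serialization tests earlier in this diff:
import numpy as np
import GPy

norm = GPy.util.normalizer.Standardize()
norm.scale_by(np.random.rand(10))                         # fit mean/std to some data
restored = GPy.util.normalizer._Norm.from_dict(norm.to_dict())
assert np.allclose(norm.mean, restored.mean)
assert np.allclose(norm.std, restored.std)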
# Inverse variance to be implemented, disabling for now
# If someone in the future want to implement this,
# we need to implement the inverse variance for
# normalization. This means, we need to know the factor
# for the variance to be multiplied to the variance in
# normalized space. This is easy to compute for standardization
@ -71,7 +109,7 @@ class Standardize(_Norm):
# def inverse_mean(self, X):
# return (X + .5) * (self.ymax - self.ymin) + self.ymin
# def inverse_variance(self, var):
#
# return (var*(self.std**2))
# def scaled(self):
# return (self.ymin is not None) and (self.ymax is not None)

GPy/util/quad_integrate.py (new file, 119 lines)
View file

@ -0,0 +1,119 @@
"""
Utilities related to integration by quadrature methods.
Contains an implementation of Gauss-Kronrod integration.
"""
import numpy as np
def getSubs(Subs, XK, NK=1):
M = (Subs[1, :] - Subs[0, :]) / 2
C = (Subs[1, :] + Subs[0, :]) / 2
I = XK[:, None] * M + np.ones((NK, 1)) * C
# A = [Subs(1,:); I]
A = np.vstack((Subs[0, :], I))
# B = [I;Subs(2,:)]
B = np.vstack((I, Subs[1, :]))
# Subs = [reshape(A, 1, []);
A = A.flatten()
# reshape(B, 1, [])];
B = B.flatten()
Subs = np.vstack((A,B))
# Subs = np.concatenate((A, B), axis=0)
return Subs
def quadvgk(feval, fmin, fmax, tol1=1e-5, tol2=1e-5):
"""
numpy implementation based on the code here: http://se.mathworks.com/matlabcentral/fileexchange/18801-quadvgk
We use Gauss-Kronrod integration, as already used in GPstuff, for evaluating one-dimensional integrals.
This is vectorised quadrature, meaning that several functions can be evaluated at the same time over a grid of points.
:param feval: vectorised integrand, returning one row per function to integrate
:param fmin: lower integration limit
:param fmax: upper integration limit
:param tol1: tolerance on the difference between the Kronrod and Gauss estimates (note: reset to 1e-4 inside the function)
:param tol2: tolerance on the width of a subinterval (note: reset to 1e-4 inside the function)
:return: array of integral values, one entry per function
"""
XK = np.array([-0.991455371120813, -0.949107912342759, -0.864864423359769, -0.741531185599394,
-0.586087235467691, -0.405845151377397, -0.207784955007898, 0.,
0.207784955007898, 0.405845151377397, 0.586087235467691,
0.741531185599394, 0.864864423359769, 0.949107912342759, 0.991455371120813])
WK = np.array([0.022935322010529, 0.063092092629979, 0.104790010322250, 0.140653259715525,
0.169004726639267, 0.190350578064785, 0.204432940075298, 0.209482141084728,
0.204432940075298, 0.190350578064785, 0.169004726639267,
0.140653259715525, 0.104790010322250, 0.063092092629979, 0.022935322010529])
# 7-point Gaussian weightings
WG = np.array([0.129484966168870, 0.279705391489277, 0.381830050505119, 0.417959183673469,
0.381830050505119, 0.279705391489277, 0.129484966168870])
NK = WK.size
G = np.arange(2,NK,2)
tol1 = 1e-4
tol2 = 1e-4
Subs = np.array([[fmin],[fmax]])
# number of functions to evaluate in the feval vector of functions.
NF = feval(np.zeros(1)).size
Q = np.zeros(NF)
neval = 0
while Subs.size > 0:
Subs = getSubs(Subs,XK)
M = (Subs[1,:] - Subs[0,:]) / 2
C = (Subs[1,:] + Subs[0,:]) / 2
# NM = length(M);
NM = M.size
# x = reshape(XK * M + ones(NK, 1) * C, 1, []);
x = XK[:,None]*M + C
x = x.flatten()
FV = feval(x)
# FV = FV[:,None]
Q1 = np.zeros((NF, NM))
Q2 = np.zeros((NF, NM))
# for n=1:NF
# F = reshape(FV(n,:), NK, []);
# Q1(n,:) = M. * sum((WK * ones(1, NM)). * F);
# Q2(n,:) = M. * sum((WG * ones(1, NM)). * F(G,:));
# end
# for i in range(NF):
# F = FV
# F = F.reshape((NK,-1))
# temp_mat = np.sum(np.multiply(WK[:,None]*np.ones((1,NM)), F),axis=0)
# Q1[i,:] = np.multiply(M, temp_mat)
# temp_mat = np.sum(np.multiply(WG[:,None]*np.ones((1, NM)), F[G-1,:]), axis=0)
# Q2[i,:] = np.multiply(M, temp_mat)
# ind = np.where(np.logical_or(np.max(np.abs(Q1 -Q2) / Q1) < tol1, (Subs[1,:] - Subs[0,:]) <= tol2) > 0)[0]
# Q = Q + np.sum(Q1[:,ind], axis=1)
# np.delete(Subs, ind,axis=1)
Q1 = np.dot(FV.reshape(NF, NK, NM).swapaxes(2,1),WK)*M
Q2 = np.dot(FV.reshape(NF, NK, NM).swapaxes(2,1)[:,:,1::2],WG)*M
#ind = np.nonzero(np.logical_or(np.max(np.abs((Q1-Q2)/Q1), 0) < difftol , M < xtol))[0]
ind = np.nonzero(np.logical_or(np.max(np.abs((Q1-Q2)), 0) < tol1 , (Subs[1,:] - Subs[0,:]) < tol2))[0]
Q = Q + np.sum(Q1[:,ind], axis=1)
Subs = np.delete(Subs, ind, axis=1)
return Q
def quadgk_int(f, fmin=-np.inf, fmax=np.inf, difftol=0.1):
"""
Integrate f from fmin to fmax by the substitution
x = r / (1 - r**2):
as r goes from -1 to 1, x goes from -inf to inf.
The interval for the quadvgk function is (-1, 1), so we transform the domain from (-inf, inf) to (-1, 1).
:param f: vectorised integrand, returning one row per function to integrate
:param fmin: lower limit (the substitution assumes the default, -np.inf; the argument is currently unused)
:param fmax: upper limit (the substitution assumes the default, np.inf; the argument is currently unused)
:param difftol: quadrature tolerance (currently reset to 1e-4 inside the function)
:return: array of integral values, one entry per function
"""
difftol = 1e-4
def trans_func(r):
r2 = np.square(r)
x = r / (1-r2)
dx_dr = (1 + r2)/(1-r2)**2
return f(x)*dx_dr
integrand = quadvgk(trans_func, -1., 1., difftol, difftol)
return integrand
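A hedged usage sketch of quadgk_int, mirroring the quadrature tests earlier in this diff: the integrand returns one row per function, here the 0th, 1st and 2nd moments of the unnormalised standard Gaussian, whose exact integrals are sqrt(2*pi), 0 and sqrt(2*pi):
import numpy as np
from GPy.util.quad_integrate import quadgk_int

def integrand(x):
    # rows: exp(-x^2/2) * x^k for k = 0, 1, 2
    return np.exp(-0.5 * x ** 2) * np.power(x, np.arange(3)[:, None])

vals = quadgk_int(integrand)
expected = np.array([np.sqrt(2 * np.pi), 0.0, np.sqrt(2 * np.pi)])
np.testing.assert_allclose(vals, expected, atol=1e-4)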

View file

@ -3,7 +3,7 @@ environment:
secure: 8/ZjXFwtd1S7ixd7PJOpptupKKEDhm2da/q3unabJ00=
COVERALLS_REPO_TOKEN:
secure: d3Luic/ESkGaWnZrvWZTKrzO+xaVwJWaRCEP0F+K/9DQGPSRZsJ/Du5g3s4XF+tS
- gpy_version: 1.7.7
+ gpy_version: 1.8.0
matrix:
- PYTHON_VERSION: 2.7
MINICONDA: C:\Miniconda-x64

View file

@ -1,5 +1,5 @@
[bumpversion]
- current_version = 1.7.7
+ current_version = 1.8.0
tag = True
commit = True