Mirror of https://github.com/SheffieldML/GPy.git (synced 2026-05-15 06:52:39 +02:00)

Commit d40db2b9af: 47 changed files with 14431 additions and 273 deletions
CHANGELOG.md (11416 additions): file diff suppressed because it is too large.
@@ -1 +1 @@
__version__ = "1.7.7"
__version__ = "1.8.0"
@@ -109,6 +109,68 @@ class GP(Model):
        self.link_parameter(self.likelihood)
        self.posterior = None

    def to_dict(self, save_data=True):
        input_dict = super(GP, self)._to_dict()
        input_dict["class"] = "GPy.core.GP"
        if not save_data:
            input_dict["X"] = None
            input_dict["Y"] = None
        else:
            try:
                input_dict["X"] = self.X.values.tolist()
            except:
                input_dict["X"] = self.X.tolist()
            try:
                input_dict["Y"] = self.Y.values.tolist()
            except:
                input_dict["Y"] = self.Y.tolist()
        input_dict["kernel"] = self.kern.to_dict()
        input_dict["likelihood"] = self.likelihood.to_dict()
        if self.mean_function is not None:
            input_dict["mean_function"] = self.mean_function.to_dict()
        input_dict["inference_method"] = self.inference_method.to_dict()
        # FIXME: Assumes the Y_metadata is serializable. We should create a Metadata class.
        if self.Y_metadata is not None:
            input_dict["Y_metadata"] = self.Y_metadata
        if self.normalizer is not None:
            input_dict["normalizer"] = self.normalizer.to_dict()
        return input_dict

    @staticmethod
    def _from_dict(input_dict, data=None):
        import GPy
        import numpy as np
        if (input_dict['X'] is None) or (input_dict['Y'] is None):
            assert(data is not None)
            input_dict["X"], input_dict["Y"] = np.array(data[0]), np.array(data[1])
        elif data is not None:
            print("WARNING: The model has been saved with X, Y! The original values are being overridden!")
            input_dict["X"], input_dict["Y"] = np.array(data[0]), np.array(data[1])
        else:
            input_dict["X"], input_dict["Y"] = np.array(input_dict['X']), np.array(input_dict['Y'])
        input_dict["kernel"] = GPy.kern.Kern.from_dict(input_dict["kernel"])
        input_dict["likelihood"] = GPy.likelihoods.likelihood.Likelihood.from_dict(input_dict["likelihood"])
        mean_function = input_dict.get("mean_function")
        if mean_function is not None:
            input_dict["mean_function"] = GPy.core.mapping.Mapping.from_dict(mean_function)
        else:
            input_dict["mean_function"] = mean_function
        input_dict["inference_method"] = GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(input_dict["inference_method"])

        # FIXME: Assumes the Y_metadata is serializable. We should create a Metadata class.
        Y_metadata = input_dict.get("Y_metadata")
        input_dict["Y_metadata"] = Y_metadata

        normalizer = input_dict.get("normalizer")
        if normalizer is not None:
            input_dict["normalizer"] = GPy.util.normalizer._Norm.from_dict(normalizer)
        else:
            input_dict["normalizer"] = normalizer
        return GP(**input_dict)

    def save_model(self, output_filename, compress=True, save_data=True):
        self._save_model(output_filename, compress=compress, save_data=save_data)

    # The predictive variable to be used to predict using the posterior object's
    # woodbury_vector and woodbury_inv is defined as predictive_variable
    # as long as the posterior has the right woodbury entries.
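The round-trip added above can be exercised end to end. A minimal sketch with illustrative toy data; it assumes the concrete model class exposes to_dict the way the GP base class here does (true for the GPy 1.8.0 model classes touched by this commit, such as GPRegression):

import numpy as np
import GPy

X = np.random.uniform(0., 10., (20, 1))
Y = np.sin(X) + np.random.randn(20, 1)*0.05
m = GPy.models.GPRegression(X, Y, GPy.kern.RBF(1))

m.save_model("my_gp", compress=True, save_data=True)  # writes my_gp.zip
m2 = GPy.core.model.Model.load_model("my_gp.zip")     # rebuilds the model via from_dict
assert np.allclose(m.predict(X)[0], m2.predict(X)[0])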
@@ -616,4 +678,3 @@ class GP(Model):
        """
        mu_star, var_star = self._raw_predict(x_test)
        return self.likelihood.log_predictive_density_sampling(y_test, mu_star, var_star, Y_metadata=Y_metadata, num_samples=num_samples)
@@ -25,6 +25,30 @@ class Mapping(Parameterized):
    def update_gradients(self, dL_dF, X):
        raise NotImplementedError

    def to_dict(self):
        raise NotImplementedError

    def _to_dict(self):
        input_dict = {}
        input_dict["input_dim"] = self.input_dim
        input_dict["output_dim"] = self.output_dim
        input_dict["name"] = self.name
        return input_dict

    @staticmethod
    def from_dict(input_dict):
        import copy
        input_dict = copy.deepcopy(input_dict)
        mapping_class = input_dict.pop('class')
        input_dict["name"] = str(input_dict["name"])
        import GPy
        mapping_class = eval(mapping_class)
        return mapping_class._from_dict(mapping_class, input_dict)

    @staticmethod
    def _from_dict(mapping_class, input_dict):
        return mapping_class(**input_dict)


class Bijective_mapping(Mapping):
    """
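Every from_dict in this commit follows the same dispatch idiom: to_dict stores a fully qualified class name under the "class" key; the base class's from_dict pops that key, resolves it with eval, and hands the remaining entries to the subclass constructor through _from_dict. A self-contained sketch of the idiom (Base and Leaf are illustrative names, not GPy classes):

import copy

class Base(object):
    def to_dict(self):
        # A real subclass records its own fully qualified class name here.
        return {"class": "Leaf", "name": self.name}

    @staticmethod
    def from_dict(input_dict):
        input_dict = copy.deepcopy(input_dict)   # don't mutate the caller's dict
        cls = eval(input_dict.pop("class"))      # resolve the stored name, e.g. "Leaf" -> Leaf
        return cls(**input_dict)                 # remaining keys feed the constructor

class Leaf(Base):
    def __init__(self, name):
        self.name = name

restored = Base.from_dict(Leaf("example").to_dict())
assert isinstance(restored, Leaf) and restored.name == "example"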
@@ -37,5 +61,3 @@ class Bijective_mapping(Mapping):
    def g(self, f):
        """Inverse mapping from the output domain of the function to the inputs."""
        raise NotImplementedError
@@ -8,6 +8,61 @@ class Model(ParamzModel, Priorizable):
    def __init__(self, name):
        super(Model, self).__init__(name)  # Parameterized.__init__(self)

    def _to_dict(self):
        input_dict = {}
        input_dict["name"] = self.name
        return input_dict

    def to_dict(self):
        raise NotImplementedError

    @staticmethod
    def from_dict(input_dict, data=None):
        import copy
        input_dict = copy.deepcopy(input_dict)
        model_class = input_dict.pop('class')
        input_dict["name"] = str(input_dict["name"])
        import GPy
        model_class = eval(model_class)
        return model_class._from_dict(input_dict, data)

    @staticmethod
    def _from_dict(model_class, input_dict, data=None):
        return model_class(**input_dict)

    def save_model(self, output_filename, compress=True, save_data=True):
        raise NotImplementedError

    def _save_model(self, output_filename, compress=True, save_data=True):
        import json
        output_dict = self.to_dict(save_data)
        if compress:
            import gzip
            with gzip.GzipFile(output_filename + ".zip", 'w') as outfile:
                json_str = json.dumps(output_dict)
                json_bytes = json_str.encode('utf-8')
                outfile.write(json_bytes)
        else:
            with open(output_filename + ".json", 'w') as outfile:
                json.dump(output_dict, outfile)

    @staticmethod
    def load_model(output_filename, data=None):
        compress = output_filename.split(".")[-1] == "zip"
        import json
        if compress:
            import gzip
            with gzip.GzipFile(output_filename, 'r') as json_data:
                json_bytes = json_data.read()
                json_str = json_bytes.decode('utf-8')
                output_dict = json.loads(json_str)
        else:
            with open(output_filename) as json_data:
                output_dict = json.load(json_data)
        import GPy
        return GPy.core.model.Model.from_dict(output_dict, data)

    def log_likelihood(self):
        raise NotImplementedError("this needs to be implemented to use the model class")
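A note on the on-disk format implied by _save_model/load_model above: with compress=True the file is gzip-wrapped JSON (despite the .zip suffix it is a gzip stream, not a zip archive), so a saved model can be inspected without GPy. A small sketch, assuming a model was previously saved as my_gp.zip:

import gzip, json

with gzip.GzipFile("my_gp.zip", 'r') as f:
    d = json.loads(f.read().decode('utf-8'))
print(d["class"])        # the fully qualified model class recorded by to_dict
print(sorted(d.keys()))  # X, Y, kernel, likelihood, inference_method, ...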
@@ -41,6 +41,26 @@ class LatentFunctionInference(object):
        """
        pass

    def _to_dict(self):
        input_dict = {}
        return input_dict

    def to_dict(self):
        raise NotImplementedError

    @staticmethod
    def from_dict(input_dict):
        import copy
        input_dict = copy.deepcopy(input_dict)
        inference_class = input_dict.pop('class')
        import GPy
        inference_class = eval(inference_class)
        return inference_class._from_dict(inference_class, input_dict)

    @staticmethod
    def _from_dict(inference_class, input_dict):
        return inference_class(**input_dict)

class InferenceMethodList(LatentFunctionInference, list):

    def on_optimization_start(self):
@@ -21,6 +21,11 @@ class ExactGaussianInference(LatentFunctionInference):
    def __init__(self):
        pass  # self._YYTfactor_cache = caching.cache()

    def to_dict(self):
        input_dict = super(ExactGaussianInference, self)._to_dict()
        input_dict["class"] = "GPy.inference.latent_function_inference.exact_gaussian_inference.ExactGaussianInference"
        return input_dict

    def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, K=None, variance=None, Z_tilde=None):
        """
        Returns a Posterior class containing essential quantities of the posterior
@@ -6,11 +6,141 @@ from paramz import ObsAr
from . import ExactGaussianInference, VarDTC
from ...util import diag
from .posterior import PosteriorEP as Posterior
from ...likelihoods import Gaussian
from . import LatentFunctionInference

log_2_pi = np.log(2*np.pi)


# Four wrapper classes to help modularisation of different EP versions
class marginalMoments(object):
    def __init__(self, num_data):
        self.Z_hat = np.empty(num_data, dtype=np.float64)
        self.mu_hat = np.empty(num_data, dtype=np.float64)
        self.sigma2_hat = np.empty(num_data, dtype=np.float64)


class cavityParams(object):
    def __init__(self, num_data):
        self.tau = np.empty(num_data, dtype=np.float64)
        self.v = np.empty(num_data, dtype=np.float64)
    def _update_i(self, eta, ga_approx, post_params, i):
        self.tau[i] = 1./post_params.Sigma_diag[i] - eta*ga_approx.tau[i]
        self.v[i] = post_params.mu[i]/post_params.Sigma_diag[i] - eta*ga_approx.v[i]
    def to_dict(self):
        return {"tau": self.tau.tolist(), "v": self.v.tolist()}
    @staticmethod
    def from_dict(input_dict):
        c = cavityParams(len(input_dict["tau"]))
        c.tau = np.array(input_dict["tau"])
        c.v = np.array(input_dict["v"])
        return c


class gaussianApproximation(object):
    def __init__(self, v, tau):
        self.tau = tau
        self.v = v
    def _update_i(self, eta, delta, post_params, marg_moments, i):
        # Site parameters update
        delta_tau = delta/eta*(1./marg_moments.sigma2_hat[i] - 1./post_params.Sigma_diag[i])
        delta_v = delta/eta*(marg_moments.mu_hat[i]/marg_moments.sigma2_hat[i] - post_params.mu[i]/post_params.Sigma_diag[i])
        tau_tilde_prev = self.tau[i]
        self.tau[i] += delta_tau

        # Enforce positivity of tau_tilde. Even though this is guaranteed for log-concave sites, it is still possible
        # to get negative values due to numerical errors. Moreover, the value of tau_tilde should be positive in order to
        # update the marginal likelihood without running into instability issues.
        if self.tau[i] < np.finfo(float).eps:
            self.tau[i] = np.finfo(float).eps
            delta_tau = self.tau[i] - tau_tilde_prev
        self.v[i] += delta_v

        return (delta_tau, delta_v)
    def to_dict(self):
        return {"tau": self.tau.tolist(), "v": self.v.tolist()}
    @staticmethod
    def from_dict(input_dict):
        return gaussianApproximation(np.array(input_dict["v"]), np.array(input_dict["tau"]))


class posteriorParamsBase(object):
    def __init__(self, mu, Sigma_diag):
        self.mu = mu
        self.Sigma_diag = Sigma_diag
    def _update_rank1(self, *arg):
        pass

    def _recompute(self, *arg):
        pass

class posteriorParams(posteriorParamsBase):
    def __init__(self, mu, Sigma, L=None):
        self.Sigma = Sigma
        self.L = L
        Sigma_diag = np.diag(self.Sigma)
        super(posteriorParams, self).__init__(mu, Sigma_diag)

    def _update_rank1(self, delta_tau, ga_approx, i):
        ci = delta_tau/(1. + delta_tau*self.Sigma_diag[i])
        DSYR(self.Sigma, self.Sigma[:,i].copy(), -ci)
        self.mu = np.dot(self.Sigma, ga_approx.v)
    def to_dict(self):
        # TODO: Implement a more memory-efficient variant
        if self.L is None:
            return {"mu": self.mu.tolist(), "Sigma": self.Sigma.tolist()}
        else:
            return {"mu": self.mu.tolist(), "Sigma": self.Sigma.tolist(), "L": self.L.tolist()}
    @staticmethod
    def from_dict(input_dict):
        if "L" in input_dict:
            return posteriorParams(np.array(input_dict["mu"]), np.array(input_dict["Sigma"]), np.array(input_dict["L"]))
        else:
            return posteriorParams(np.array(input_dict["mu"]), np.array(input_dict["Sigma"]))

    @staticmethod
    def _recompute(K, ga_approx):
        num_data = len(ga_approx.tau)
        tau_tilde_root = np.sqrt(ga_approx.tau)
        Sroot_tilde_K = tau_tilde_root[:,None] * K
        B = np.eye(num_data) + Sroot_tilde_K * tau_tilde_root[None,:]
        L = jitchol(B)
        V, _ = dtrtrs(L, Sroot_tilde_K, lower=1)
        Sigma = K - np.dot(V.T, V)  # K - K S^(1/2) B^(-1) S^(1/2) K = (K^(-1) + \tilde{Sigma}^(-1))^(-1)
        mu = np.dot(Sigma, ga_approx.v)
        return posteriorParams(mu=mu, Sigma=Sigma, L=L)

class posteriorParamsDTC(posteriorParamsBase):
    def __init__(self, mu, Sigma_diag):
        super(posteriorParamsDTC, self).__init__(mu, Sigma_diag)

    def _update_rank1(self, LLT, Kmn, delta_v, delta_tau, i):
        #DSYR(Sigma, Sigma[:,i].copy(), -delta_tau/(1.+ delta_tau*Sigma[i,i]))
        DSYR(LLT, Kmn[:,i].copy(), delta_tau)
        L = jitchol(LLT)
        V, info = dtrtrs(L, Kmn, lower=1)
        self.Sigma_diag = np.maximum(np.sum(V*V, -2), np.finfo(float).eps)  # diag(K_nm (L L^\top)^(-1) K_mn)
        si = np.sum(V.T*V[:,i], -1)  # (V V^\top)[:,i]
        self.mu += (delta_v - delta_tau*self.mu[i])*si
        # mu = np.dot(Sigma, v_tilde)

    @staticmethod
    def _recompute(LLT0, Kmn, ga_approx):
        LLT = LLT0 + np.dot(Kmn*ga_approx.tau[None,:], Kmn.T)
        L = jitchol(LLT)
        V, _ = dtrtrs(L, Kmn, lower=1)
        #Sigma_diag = np.sum(V*V,-2)
        #Knmv_tilde = np.dot(Kmn,v_tilde)
        #mu = np.dot(V2.T,Knmv_tilde)
        Sigma = np.dot(V.T, V)
        mu = np.dot(Sigma, ga_approx.v)
        Sigma_diag = np.diag(Sigma).copy()
        return posteriorParamsDTC(mu, Sigma_diag), LLT

class EPBase(object):
    def __init__(self, epsilon=1e-6, eta=1., delta=1., always_reset=False, max_iters=np.inf, ep_mode="alternated", parallel_updates=False):
    def __init__(self, epsilon=1e-6, eta=1., delta=1., always_reset=False, max_iters=np.inf, ep_mode="alternated", parallel_updates=False, loading=False):
        """
        The expectation-propagation algorithm.
        For nomenclature see Rasmussen & Williams 2006.
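In equations, cavityParams._update_i and gaussianApproximation._update_i above implement the standard EP cavity and fractional site updates in natural parameters (cf. Rasmussen & Williams 2006, ch. 3.6), with \sigma_i^2 = Sigma_diag[i], \mu_i the current posterior mean, and (\hat\mu_i, \hat\sigma_i^2) the matched moments:

\tau_{-i} = \sigma_i^{-2} - \eta\,\tilde\tau_i, \qquad v_{-i} = \mu_i\,\sigma_i^{-2} - \eta\,\tilde v_i

\Delta\tilde\tau_i = \frac{\delta}{\eta}\big(\hat\sigma_i^{-2} - \sigma_i^{-2}\big), \qquad \Delta\tilde v_i = \frac{\delta}{\eta}\big(\hat\mu_i\,\hat\sigma_i^{-2} - \mu_i\,\sigma_i^{-2}\big)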
@@ -26,16 +156,20 @@ class EPBase(object):
        :max_iters: int
        :ep_mode: string. It can be "nested" (EP is run every time the hyperparameters change) or "alternated" (EP is run once at the beginning and then the hyperparameters are optimized).
        :parallel_updates: boolean. If true, the parameters of the sites are updated in parallel.
        :loading: boolean. If True, prevents the EP parameters from changing. Hack used when loading a serialized model.
        """
        super(EPBase, self).__init__()

        self.always_reset = always_reset
        self.epsilon, self.eta, self.delta, self.max_iters = epsilon, eta, delta, max_iters
        self.ep_mode = ep_mode
        self.parallel_updates = parallel_updates
        # FIXME: Hack for serialization. If True, prevents the EP parameters from changing when loading a serialized model.
        self.loading = loading
        self.reset()

    def reset(self):
        self.old_mutilde, self.old_vtilde = None, None
        self.ga_approx_old = None
        self._ep_approximation = None

    def on_optimization_start(self):
@@ -45,6 +179,11 @@ class EPBase(object):
        # TODO: update approximation in the end as well? Maybe even with a switch?
        pass

    def _stop_criteria(self, ga_approx):
        tau_diff = np.mean(np.square(ga_approx.tau - self.ga_approx_old.tau))
        v_diff = np.mean(np.square(ga_approx.v - self.ga_approx_old.v))
        return ((tau_diff < self.epsilon) and (v_diff < self.epsilon))

    def __setstate__(self, state):
        super(EPBase, self).__setstate__(state[0])
        self.epsilon, self.eta, self.delta = state[1]
@@ -53,9 +192,21 @@ class EPBase(object):
    def __getstate__(self):
        return [super(EPBase, self).__getstate__(), [self.epsilon, self.eta, self.delta]]

    def _to_dict(self):
        input_dict = super(EPBase, self)._to_dict()
        input_dict["epsilon"] = self.epsilon
        input_dict["eta"] = self.eta
        input_dict["delta"] = self.delta
        input_dict["always_reset"] = self.always_reset
        input_dict["max_iters"] = self.max_iters
        input_dict["ep_mode"] = self.ep_mode
        input_dict["parallel_updates"] = self.parallel_updates
        input_dict["loading"] = True
        return input_dict

class EP(EPBase, ExactGaussianInference):
    def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, precision=None, K=None):
        if self.always_reset:
        if self.always_reset and not self.loading:
            self.reset()

        num_data, output_dim = Y.shape
@@ -64,22 +215,22 @@ class EP(EPBase, ExactGaussianInference):
        if K is None:
            K = kern.K(X)

        if self.ep_mode=="nested":
        if self.ep_mode=="nested" and not self.loading:
            #Force EP at each step of the optimization
            self._ep_approximation = None
            mu, Sigma, mu_tilde, tau_tilde, log_Z_tilde = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata)
        elif self.ep_mode=="alternated":
            post_params, ga_approx, cav_params, log_Z_tilde = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata)
        elif self.ep_mode=="alternated" or self.loading:
            if getattr(self, '_ep_approximation', None) is None:
                #if we don't yet have the results of running EP, run EP and store the computed factors in self._ep_approximation
                mu, Sigma, mu_tilde, tau_tilde, log_Z_tilde = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata)
                post_params, ga_approx, cav_params, log_Z_tilde = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata)
            else:
                #if we've already run EP, just use the existing approximation stored in self._ep_approximation
                mu, Sigma, mu_tilde, tau_tilde, log_Z_tilde = self._ep_approximation
                post_params, ga_approx, cav_params, log_Z_tilde = self._ep_approximation
        else:
            raise ValueError("ep_mode value not valid")

        v_tilde = mu_tilde * tau_tilde
        return self._inference(K, tau_tilde, v_tilde, likelihood, Y_metadata=Y_metadata, Z_tilde=log_Z_tilde.sum())
        self.loading = False
        return self._inference(Y, K, ga_approx, cav_params, likelihood, Y_metadata=Y_metadata, Z_tilde=log_Z_tilde)

    def expectation_propagation(self, K, Y, likelihood, Y_metadata):
@@ -90,41 +241,57 @@ class EP(EPBase, ExactGaussianInference):
        # than ObsArrays
        Y = Y.values.copy()

        #Initial values - Marginal moments
        Z_hat = np.empty(num_data, dtype=np.float64)
        mu_hat = np.empty(num_data, dtype=np.float64)
        sigma2_hat = np.empty(num_data, dtype=np.float64)
        #Initial values - Marginal moments, cavity params, Gaussian approximation params and posterior params
        marg_moments = marginalMoments(num_data)
        cav_params = cavityParams(num_data)
        ga_approx, post_params = self._init_approximations(K, num_data)

        tau_cav = np.empty(num_data, dtype=np.float64)
        v_cav = np.empty(num_data, dtype=np.float64)
        #Approximation
        stop = False
        iterations = 0
        while not stop and (iterations < self.max_iters):
            self._local_updates(num_data, cav_params, post_params, marg_moments, ga_approx, likelihood, Y, Y_metadata)

            #(re)compute Sigma and mu using a full Cholesky decomposition
            post_params = posteriorParams._recompute(K, ga_approx)

            #monitor convergence
            if iterations > 0:
                stop = self._stop_criteria(ga_approx)
            self.ga_approx_old = gaussianApproximation(ga_approx.v.copy(), ga_approx.tau.copy())
            iterations += 1

        # Z_tilde after removing the terms that can lead to infinite terms due to tau_tilde close to zero.
        # These terms cancel with the corresponding terms in the marginal log-likelihood.
        log_Z_tilde = self._log_Z_tilde(marg_moments, ga_approx, cav_params)
        # - 0.5*np.log(tau_tilde) + 0.5*(v_tilde*v_tilde*1./tau_tilde)
        return (post_params, ga_approx, cav_params, log_Z_tilde)

    def _init_approximations(self, K, num_data):
        #initial values - Gaussian factors
        #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
        if self.old_mutilde is None:
            tau_tilde, mu_tilde, v_tilde = np.zeros((3, num_data))
        if self.ga_approx_old is None:
            v_tilde, tau_tilde = np.zeros((2, num_data))
            ga_approx = gaussianApproximation(v_tilde, tau_tilde)
            Sigma = K.copy()
            diag.add(Sigma, 1e-7)
            mu = np.zeros(num_data)
            post_params = posteriorParams(mu, Sigma)
        else:
            assert self.old_mutilde.size == num_data, "data size mis-match: did you change the data? try resetting!"
            mu_tilde, v_tilde = self.old_mutilde, self.old_vtilde
            tau_tilde = v_tilde/mu_tilde
            mu, Sigma, _ = self._ep_compute_posterior(K, tau_tilde, v_tilde)
            diag.add(Sigma, 1e-7)
            assert self.ga_approx_old.v.size == num_data, "data size mis-match: did you change the data? try resetting!"
            ga_approx = gaussianApproximation(self.ga_approx_old.v, self.ga_approx_old.tau)
            post_params = posteriorParams._recompute(K, ga_approx)
            diag.add(post_params.Sigma, 1e-7)
        # TODO: Check the log-marginal under both conditions and choose the best one
        return (ga_approx, post_params)

        #Approximation
        tau_diff = self.epsilon + 1.
        v_diff = self.epsilon + 1.
        tau_tilde_old = np.nan
        v_tilde_old = np.nan
        iterations = 0
        while ((tau_diff > self.epsilon) or (v_diff > self.epsilon)) and (iterations < self.max_iters):
    def _local_updates(self, num_data, cav_params, post_params, marg_moments, ga_approx, likelihood, Y, Y_metadata, update_order=None):
        if update_order is None:
            update_order = np.random.permutation(num_data)
        for i in update_order:
            #Cavity distribution parameters
            tau_cav[i] = 1./Sigma[i,i] - self.eta*tau_tilde[i]
            v_cav[i] = mu[i]/Sigma[i,i] - self.eta*v_tilde[i]
            cav_params._update_i(self.eta, ga_approx, post_params, i)

            if Y_metadata is not None:
                # Pick out the relevant metadata for Yi
                Y_metadata_i = {}
@@ -133,93 +300,77 @@ class EP(EPBase, ExactGaussianInference):
            else:
                Y_metadata_i = None
            #Marginal moments
            Z_hat[i], mu_hat[i], sigma2_hat[i] = likelihood.moments_match_ep(Y[i], tau_cav[i], v_cav[i], Y_metadata_i=Y_metadata_i)
            #Site parameters update
            delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
            delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
            tau_tilde_prev = tau_tilde[i]
            tau_tilde[i] += delta_tau
            marg_moments.Z_hat[i], marg_moments.mu_hat[i], marg_moments.sigma2_hat[i] = likelihood.moments_match_ep(Y[i], cav_params.tau[i], cav_params.v[i], Y_metadata_i=Y_metadata_i)

            # Enforce positivity of tau_tilde. Even though this is guaranteed for log-concave sites, it is still possible
            # to get negative values due to numerical errors. Moreover, the value of tau_tilde should be positive in order to
            # update the marginal likelihood without instability issues.
            if tau_tilde[i] < np.finfo(float).eps:
                tau_tilde[i] = np.finfo(float).eps
                delta_tau = tau_tilde[i] - tau_tilde_prev
            v_tilde[i] += delta_v
            #Site parameters update
            delta_tau, delta_v = ga_approx._update_i(self.eta, self.delta, post_params, marg_moments, i)

            if self.parallel_updates == False:
                #Posterior distribution parameters update
                ci = delta_tau/(1. + delta_tau*Sigma[i,i])
                DSYR(Sigma, Sigma[:,i].copy(), -ci)
                mu = np.dot(Sigma, v_tilde)
                post_params._update_rank1(delta_tau, ga_approx, i)

        #(re)compute Sigma and mu using a full Cholesky decomposition
        mu, Sigma, _ = self._ep_compute_posterior(K, tau_tilde, v_tilde)
    def _log_Z_tilde(self, marg_moments, ga_approx, cav_params):
        return np.sum((np.log(marg_moments.Z_hat) + 0.5*np.log(2*np.pi) + 0.5*np.log(1+ga_approx.tau/cav_params.tau) - 0.5 * ((ga_approx.v)**2 * 1./(cav_params.tau + ga_approx.tau))
                       + 0.5*(cav_params.v * ( ( (ga_approx.tau/cav_params.tau) * cav_params.v - 2.0 * ga_approx.v ) * 1./(cav_params.tau + ga_approx.tau)))))

        #monitor convergence
        if iterations > 0:
            tau_diff = np.mean(np.square(tau_tilde-tau_tilde_old))
            v_diff = np.mean(np.square(v_tilde-v_tilde_old))
        tau_tilde_old = tau_tilde.copy()
        v_tilde_old = v_tilde.copy()

        iterations += 1

        mu_tilde = v_tilde/tau_tilde
        mu_cav = v_cav/tau_cav
        sigma2_sigma2tilde = 1./tau_cav + 1./tau_tilde

        # Z_tilde after removing the terms that can lead to infinite terms due to tau_tilde close to zero.
        # These terms cancel with the corresponding terms in the marginal log-likelihood.
        log_Z_tilde = (np.log(Z_hat) + 0.5*np.log(2*np.pi) + 0.5*np.log(1+tau_tilde/tau_cav)
                       - 0.5 * ((v_tilde)**2 * 1./(tau_cav + tau_tilde)) + 0.5*(v_cav * ( ( (tau_tilde/tau_cav) * v_cav - 2.0 * v_tilde ) * 1./(tau_cav + tau_tilde))))
        # - 0.5*np.log(tau_tilde) + 0.5*(v_tilde*v_tilde*1./tau_tilde)

        self.old_mutilde = mu_tilde
        self.old_vtilde = v_tilde

        return mu, Sigma, mu_tilde, tau_tilde, log_Z_tilde

    def _ep_compute_posterior(self, K, tau_tilde, v_tilde):
        num_data = len(tau_tilde)
        tau_tilde_root = np.sqrt(tau_tilde)
        Sroot_tilde_K = tau_tilde_root[:,None] * K
        B = np.eye(num_data) + Sroot_tilde_K * tau_tilde_root[None,:]
        L = jitchol(B)
        V, _ = dtrtrs(L, Sroot_tilde_K, lower=1)
        Sigma = K - np.dot(V.T, V)  # K - K S^(1/2) B^(-1) S^(1/2) K = (K^(-1) + \tilde{Sigma}^(-1))^(-1)
        mu = np.dot(Sigma, v_tilde)
        return (mu, Sigma, L)

    def _ep_marginal(self, K, tau_tilde, v_tilde, Z_tilde):
        mu, Sigma, L = self._ep_compute_posterior(K, tau_tilde, v_tilde)
    def _ep_marginal(self, K, ga_approx, Z_tilde):
        post_params = posteriorParams._recompute(K, ga_approx)

        # Gaussian log marginal excluding terms that can go to infinity due to arbitrarily small tau_tilde.
        # These terms cancel out with the terms excluded from Z_tilde.
        B_logdet = np.sum(2.0*np.log(np.diag(L)))
        log_marginal = 0.5*(-len(tau_tilde) * log_2_pi - B_logdet + np.sum(v_tilde * np.dot(Sigma, v_tilde)))
        B_logdet = np.sum(2.0*np.log(np.diag(post_params.L)))
        log_marginal = 0.5*(-len(ga_approx.tau) * log_2_pi - B_logdet + np.sum(ga_approx.v * np.dot(post_params.Sigma, ga_approx.v)))
        log_marginal += Z_tilde

        return log_marginal, mu, Sigma, L
        return log_marginal, post_params

    def _inference(self, K, tau_tilde, v_tilde, likelihood, Z_tilde, Y_metadata=None):
        log_marginal, mu, Sigma, L = self._ep_marginal(K, tau_tilde, v_tilde, Z_tilde)
    def _inference(self, Y, K, ga_approx, cav_params, likelihood, Z_tilde, Y_metadata=None):
        log_marginal, post_params = self._ep_marginal(K, ga_approx, Z_tilde)

        tau_tilde_root = np.sqrt(tau_tilde)
        tau_tilde_root = np.sqrt(ga_approx.tau)
        Sroot_tilde_K = tau_tilde_root[:,None] * K

        aux_alpha, _ = dpotrs(L, np.dot(Sroot_tilde_K, v_tilde), lower=1)
        alpha = (v_tilde - tau_tilde_root * aux_alpha)[:,None]  # (K + \tilde{Sigma})^(-1) \tilde{mu}
        LWi, _ = dtrtrs(L, np.diag(tau_tilde_root), lower=1)
        aux_alpha, _ = dpotrs(post_params.L, np.dot(Sroot_tilde_K, ga_approx.v), lower=1)
        alpha = (ga_approx.v - tau_tilde_root * aux_alpha)[:,None]  # (K + \tilde{Sigma})^(-1) \tilde{mu}
        LWi, _ = dtrtrs(post_params.L, np.diag(tau_tilde_root), lower=1)
        Wi = np.dot(LWi.T, LWi)
        symmetrify(Wi)  # (K + \tilde{Sigma})^(-1)

        dL_dK = 0.5 * (tdot(alpha) - Wi)
        dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK), Y_metadata)

        dL_dthetaL = likelihood.ep_gradients(Y, cav_params.tau, cav_params.v, np.diag(dL_dK), Y_metadata=Y_metadata, quad_mode='gh')
        return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK': dL_dK, 'dL_dthetaL': dL_dthetaL, 'dL_dm': alpha}

    def to_dict(self):
        input_dict = super(EP, self)._to_dict()
        input_dict["class"] = "GPy.inference.latent_function_inference.expectation_propagation.EP"
        if self.ga_approx_old is not None:
            input_dict["ga_approx_old"] = self.ga_approx_old.to_dict()
        if self._ep_approximation is not None:
            input_dict["_ep_approximation"] = {}
            input_dict["_ep_approximation"]["post_params"] = self._ep_approximation[0].to_dict()
            input_dict["_ep_approximation"]["ga_approx"] = self._ep_approximation[1].to_dict()
            input_dict["_ep_approximation"]["cav_params"] = self._ep_approximation[2].to_dict()
            input_dict["_ep_approximation"]["log_Z_tilde"] = self._ep_approximation[3].tolist()

        return input_dict

    @staticmethod
    def _from_dict(inference_class, input_dict):
        ga_approx_old = input_dict.pop('ga_approx_old', None)
        if ga_approx_old is not None:
            ga_approx_old = gaussianApproximation.from_dict(ga_approx_old)
        _ep_approximation_dict = input_dict.pop('_ep_approximation', None)
        _ep_approximation = []
        if _ep_approximation_dict is not None:
            _ep_approximation.append(posteriorParams.from_dict(_ep_approximation_dict["post_params"]))
            _ep_approximation.append(gaussianApproximation.from_dict(_ep_approximation_dict["ga_approx"]))
            _ep_approximation.append(cavityParams.from_dict(_ep_approximation_dict["cav_params"]))
            _ep_approximation.append(np.array(_ep_approximation_dict["log_Z_tilde"]))
        ee = EP(**input_dict)
        ee.ga_approx_old = ga_approx_old
        ee._ep_approximation = _ep_approximation
        return ee

class EPDTC(EPBase, VarDTC):
    def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None, Lm=None, dL_dKmm=None, psi0=None, psi1=None, psi2=None):
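Written out, the _log_Z_tilde sum added to the EP class above is

\log\tilde{Z} = \sum_i \Big[ \log\hat{Z}_i + \tfrac{1}{2}\log 2\pi + \tfrac{1}{2}\log\big(1 + \tilde\tau_i/\tau_{-i}\big) - \frac{\tilde v_i^2}{2(\tau_{-i} + \tilde\tau_i)} + \frac{v_{-i}\big((\tilde\tau_i/\tau_{-i})\,v_{-i} - 2\tilde v_i\big)}{2(\tau_{-i} + \tilde\tau_i)} \Big]

with site parameters (\tilde v, \tilde\tau) from ga_approx and cavity parameters (v_{-}, \tau_{-}) from cav_params; the terms that diverge as \tilde\tau_i \to 0 are omitted because, as the comments note, they cancel against the corresponding terms of the marginal log-likelihood in _ep_marginal.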
@@ -244,24 +395,25 @@ class EPDTC(EPBase, VarDTC):
        if self.ep_mode=="nested":
            #Force EP at each step of the optimization
            self._ep_approximation = None
            mu, Sigma_diag, mu_tilde, tau_tilde, log_Z_tilde = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata)
            post_params, ga_approx, log_Z_tilde = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata)
        elif self.ep_mode=="alternated":
            if getattr(self, '_ep_approximation', None) is None:
                #if we don't yet have the results of running EP, run EP and store the computed factors in self._ep_approximation
                mu, Sigma_diag, mu_tilde, tau_tilde, log_Z_tilde = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata)
                post_params, ga_approx, log_Z_tilde = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata)
            else:
                #if we've already run EP, just use the existing approximation stored in self._ep_approximation
                mu, Sigma_diag, mu_tilde, tau_tilde, log_Z_tilde = self._ep_approximation
                post_params, ga_approx, log_Z_tilde = self._ep_approximation
        else:
            raise ValueError("ep_mode value not valid")

        return super(EPDTC, self).inference(kern, X, Z, likelihood, mu_tilde,
        mu_tilde = ga_approx.v / ga_approx.tau.astype(float)

        return super(EPDTC, self).inference(kern, X, Z, likelihood, ObsAr(mu_tilde[:,None]),
                                            mean_function=mean_function,
                                            Y_metadata=Y_metadata,
                                            precision=tau_tilde,
                                            precision=ga_approx.tau,
                                            Lm=Lm, dL_dKmm=dL_dKmm,
                                            psi0=psi0, psi1=psi1, psi2=psi2, Z_tilde=log_Z_tilde.sum())
                                            psi0=psi0, psi1=psi1, psi2=psi2, Z_tilde=log_Z_tilde)

    def expectation_propagation(self, Kmm, Kmn, Y, likelihood, Y_metadata):
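The conversion mu_tilde = ga_approx.v / ga_approx.tau.astype(float) in the new code is the usual natural-to-moment reparameterization of the Gaussian sites,

\tilde\mu_i = \tilde v_i / \tilde\tau_i, \qquad \tilde\sigma_i^2 = 1/\tilde\tau_i,

which is the form the parent VarDTC inference consumes: site means as pseudo-observations (the ObsAr argument) and site precisions via the precision keyword.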
@@ -272,51 +424,78 @@ class EPDTC(EPBase, VarDTC):
        # than ObsArrays
        Y = Y.values.copy()

        #Initial values - Marginal moments
        Z_hat = np.zeros(num_data, dtype=np.float64)
        mu_hat = np.zeros(num_data, dtype=np.float64)
        sigma2_hat = np.zeros(num_data, dtype=np.float64)
        #Initial values - Marginal moments, cavity params, Gaussian approximation params and posterior params
        marg_moments = marginalMoments(num_data)
        cav_params = cavityParams(num_data)
        ga_approx, post_params, LLT0, LLT = self._init_approximations(Kmm, Kmn, num_data)

        tau_cav = np.empty(num_data, dtype=np.float64)
        v_cav = np.empty(num_data, dtype=np.float64)
        #Approximation
        stop = False
        iterations = 0
        while not stop and (iterations < self.max_iters):
            self._local_updates(num_data, LLT0, LLT, Kmn, cav_params, post_params, marg_moments, ga_approx, likelihood, Y, Y_metadata)
            #(re)compute Sigma, Sigma_diag and mu using a full Cholesky decomposition
            post_params, LLT = posteriorParamsDTC._recompute(LLT0, Kmn, ga_approx)
            post_params.Sigma_diag = np.maximum(post_params.Sigma_diag, np.finfo(float).eps)

            #monitor convergence
            if iterations > 0:
                stop = self._stop_criteria(ga_approx)
            self.ga_approx_old = gaussianApproximation(ga_approx.v.copy(), ga_approx.tau.copy())
            iterations += 1

        log_Z_tilde = self._log_Z_tilde(marg_moments, ga_approx, cav_params)

        return post_params, ga_approx, log_Z_tilde

    def _log_Z_tilde(self, marg_moments, ga_approx, cav_params):
        mu_tilde = ga_approx.v/ga_approx.tau
        mu_cav = cav_params.v/cav_params.tau
        sigma2_sigma2tilde = 1./cav_params.tau + 1./ga_approx.tau

        return np.sum((np.log(marg_moments.Z_hat) + 0.5*np.log(2*np.pi) + 0.5*np.log(sigma2_sigma2tilde)
                       + 0.5*((mu_cav - mu_tilde)**2) / (sigma2_sigma2tilde)))

    def _init_approximations(self, Kmm, Kmn, num_data):
        #initial values - Gaussian factors
        #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
        LLT0 = Kmm.copy()
        Lm = jitchol(LLT0)  # K_m = L_m L_m^\top
        Vm,info = dtrtrs(Lm,Kmn,lower=1)
        Vm,info = dtrtrs(Lm, Kmn,lower=1)
        # Lmi = dtrtri(Lm)
        # Kmmi = np.dot(Lmi.T,Lmi)
        # KmmiKmn = np.dot(Kmmi,Kmn)
        # Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
        Qnn_diag = np.sum(Vm*Vm, -2)  # diag(Knm Kmm^(-1) Kmn)
        #diag.add(LLT0, 1e-8)
        if self.old_mutilde is None:
        if self.ga_approx_old is None:
            #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
            LLT = LLT0.copy()  # Sigma = K.copy()
            mu = np.zeros(num_data)
            Sigma_diag = Qnn_diag.copy() + 1e-8
            tau_tilde, mu_tilde, v_tilde = np.zeros((3, num_data))
            v_tilde, tau_tilde = np.zeros((2, num_data))
            ga_approx = gaussianApproximation(v_tilde, tau_tilde)
            post_params = posteriorParamsDTC(mu, Sigma_diag)

        else:
            assert self.old_mutilde.size == num_data, "data size mis-match: did you change the data? try resetting!"
            mu_tilde, v_tilde = self.old_mutilde, self.old_vtilde
            tau_tilde = v_tilde/mu_tilde
            mu, Sigma_diag, LLT = self._ep_compute_posterior(LLT0, Kmn, tau_tilde, v_tilde)
            Sigma_diag += 1e-8
            assert self.ga_approx_old.v.size == num_data, "data size mis-match: did you change the data? try resetting!"
            ga_approx = gaussianApproximation(self.ga_approx_old.v, self.ga_approx_old.tau)
            post_params, LLT = posteriorParamsDTC._recompute(LLT0, Kmn, ga_approx)
            post_params.Sigma_diag += 1e-8

        # TODO: Check the log-marginal under both conditions and choose the best one

        #Approximation
        tau_diff = self.epsilon + 1.
        v_diff = self.epsilon + 1.
        tau_tilde_old = np.nan
        v_tilde_old = np.nan
        iterations = 0
        while ((tau_diff > self.epsilon) or (v_diff > self.epsilon)) and (iterations < self.max_iters):
        return (ga_approx, post_params, LLT0, LLT)

    def _local_updates(self, num_data, LLT0, LLT, Kmn, cav_params, post_params, marg_moments, ga_approx, likelihood, Y, Y_metadata, update_order=None):
        if update_order is None:
            update_order = np.random.permutation(num_data)
        for i in update_order:

            #Cavity distribution parameters
            tau_cav[i] = 1./Sigma_diag[i] - self.eta*tau_tilde[i]
            v_cav[i] = mu[i]/Sigma_diag[i] - self.eta*v_tilde[i]
            cav_params._update_i(self.eta, ga_approx, post_params, i)

            if Y_metadata is not None:
                # Pick out the relevant metadata for Yi
                Y_metadata_i = {}
@@ -326,65 +505,10 @@ class EPDTC(EPBase, VarDTC):
                Y_metadata_i = None

            #Marginal moments
            Z_hat[i], mu_hat[i], sigma2_hat[i] = likelihood.moments_match_ep(Y[i], tau_cav[i], v_cav[i], Y_metadata_i=Y_metadata_i)
            marg_moments.Z_hat[i], marg_moments.mu_hat[i], marg_moments.sigma2_hat[i] = likelihood.moments_match_ep(Y[i], cav_params.tau[i], cav_params.v[i], Y_metadata_i=Y_metadata_i)
            #Site parameters update
            delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
            delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
            tau_tilde_prev = tau_tilde[i]
            tau_tilde[i] += delta_tau

            # Enforce positivity of tau_tilde. Even though this is guaranteed for log-concave sites, it is still possible
            # to get negative values due to numerical errors. Moreover, the value of tau_tilde should be positive in order to
            # update the marginal likelihood without instability issues.
            if tau_tilde[i] < np.finfo(float).eps:
                tau_tilde[i] = np.finfo(float).eps
                delta_tau = tau_tilde[i] - tau_tilde_prev
            v_tilde[i] += delta_v
            delta_tau, delta_v = ga_approx._update_i(self.eta, self.delta, post_params, marg_moments, i)

            #Posterior distribution parameters update
            if self.parallel_updates == False:
                #DSYR(Sigma, Sigma[:,i].copy(), -delta_tau/(1.+ delta_tau*Sigma[i,i]))
                DSYR(LLT, Kmn[:,i].copy(), delta_tau)
                L = jitchol(LLT)
                V,info = dtrtrs(L, Kmn, lower=1)
                Sigma_diag = np.maximum(np.sum(V*V, -2), np.finfo(float).eps)  # diag(K_nm (L L^\top)^(-1) K_mn)
                si = np.sum(V.T*V[:,i], -1)  # (V V^\top)[:,i]
                mu += (delta_v - delta_tau*mu[i])*si
                #mu = np.dot(Sigma, v_tilde)

        #(re)compute Sigma, Sigma_diag and mu using a full Cholesky decomposition
        mu, Sigma_diag, LLT = self._ep_compute_posterior(LLT0, Kmn, tau_tilde, v_tilde)
        Sigma_diag = np.maximum(Sigma_diag, np.finfo(float).eps)

        #monitor convergence
        if iterations > 0:
            tau_diff = np.mean(np.square(tau_tilde-tau_tilde_old))
            v_diff = np.mean(np.square(v_tilde-v_tilde_old))
        tau_tilde_old = tau_tilde.copy()
        v_tilde_old = v_tilde.copy()
        iterations += 1

        mu_tilde = v_tilde/tau_tilde
        mu_cav = v_cav/tau_cav
        sigma2_sigma2tilde = 1./tau_cav + 1./tau_tilde

        log_Z_tilde = (np.log(Z_hat) + 0.5*np.log(2*np.pi) + 0.5*np.log(sigma2_sigma2tilde)
                       + 0.5*((mu_cav - mu_tilde)**2) / (sigma2_sigma2tilde))

        self.old_mutilde = mu_tilde
        self.old_vtilde = v_tilde

        return mu, Sigma_diag, ObsAr(mu_tilde[:,None]), tau_tilde, log_Z_tilde

    def _ep_compute_posterior(self, LLT0, Kmn, tau_tilde, v_tilde):
        LLT = LLT0 + np.dot(Kmn*tau_tilde[None,:], Kmn.T)
        L = jitchol(LLT)
        V, _ = dtrtrs(L, Kmn, lower=1)
        #Sigma_diag = np.sum(V*V,-2)
        #Knmv_tilde = np.dot(Kmn,v_tilde)
        #mu = np.dot(V2.T,Knmv_tilde)
        Sigma = np.dot(V.T, V)
        mu = np.dot(Sigma, v_tilde)
        Sigma_diag = np.diag(Sigma).copy()

        return (mu, Sigma_diag, LLT)
                post_params._update_rank1(LLT, Kmn, delta_v, delta_tau, i)
@@ -43,6 +43,11 @@ class Add(CombinationKernel):
        else:
            return False

    def to_dict(self):
        input_dict = super(Add, self)._to_dict()
        input_dict["class"] = str("GPy.kern.Add")
        return input_dict

    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def K(self, X, X2=None, which_parts=None):
        """
@@ -60,6 +60,35 @@ class Kern(Parameterized):
            from .psi_comp import PSICOMP_GH
            self.psicomp = PSICOMP_GH()

    def _to_dict(self):
        input_dict = {}
        input_dict["input_dim"] = self.input_dim
        if isinstance(self.active_dims, np.ndarray):
            input_dict["active_dims"] = self.active_dims.tolist()
        else:
            input_dict["active_dims"] = self.active_dims
        input_dict["name"] = self.name
        input_dict["useGPU"] = self.useGPU
        return input_dict

    def to_dict(self):
        raise NotImplementedError

    @staticmethod
    def from_dict(input_dict):
        import copy
        input_dict = copy.deepcopy(input_dict)
        kernel_class = input_dict.pop('class')
        input_dict["name"] = str(input_dict["name"])
        import GPy
        kernel_class = eval(kernel_class)
        return kernel_class._from_dict(kernel_class, input_dict)

    @staticmethod
    def _from_dict(kernel_class, input_dict):
        return kernel_class(**input_dict)


    def __setstate__(self, state):
        self._all_dims_active = np.arange(0, max(state['active_dims']) + 1)
        super(Kern, self).__setstate__(state)
@@ -342,6 +371,21 @@ class CombinationKernel(Kern):
        self.extra_dims = extra_dims
        self.link_parameters(*kernels)

    def _to_dict(self):
        input_dict = super(CombinationKernel, self)._to_dict()
        input_dict["parts"] = {}
        for ii in range(len(self.parts)):
            input_dict["parts"][ii] = self.parts[ii].to_dict()
        return input_dict

    @staticmethod
    def _from_dict(kernel_class, input_dict):
        parts = input_dict.pop('parts', None)
        subkerns = []
        for pp in parts:
            subkerns.append(Kern.from_dict(parts[pp]))
        return kernel_class(subkerns)

    @property
    def parts(self):
        return self.parameters
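The kernel side of the serialization can be exercised on its own. A minimal sketch with illustrative hyperparameter values, assuming GPy 1.8.0 where these methods exist:

import numpy as np
import GPy

k = GPy.kern.RBF(2, variance=1.5, lengthscale=0.7) + GPy.kern.Bias(2)
d = k.to_dict()                  # {"class": "GPy.kern.Add", "parts": {...}, ...}
k2 = GPy.kern.Kern.from_dict(d)  # dispatches on the "class" entry, rebuilding each part

X = np.random.randn(5, 2)
assert np.allclose(k.K(X), k2.K(X))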
@@ -51,6 +51,18 @@ class Linear(Kern):
        self.link_parameter(self.variances)
        self.psicomp = PSICOMP_Linear()

    def to_dict(self):
        input_dict = super(Linear, self)._to_dict()
        input_dict["class"] = "GPy.kern.Linear"
        input_dict["variances"] = self.variances.values.tolist()
        input_dict["ARD"] = self.ARD
        return input_dict

    @staticmethod
    def _from_dict(kernel_class, input_dict):
        useGPU = input_dict.pop('useGPU', None)
        return Linear(**input_dict)

    @Cache_this(limit=3)
    def K(self, X, X2=None):
        if self.ARD:

@@ -211,5 +223,3 @@ class LinearFull(Kern):
    def gradients_X_diag(self, dL_dKdiag, X):
        P = np.dot(self.W, self.W.T) + np.diag(self.kappa)
        return 2.*np.einsum('jk,i,ij->ik', P, dL_dKdiag, X)
@@ -400,4 +400,3 @@ class PeriodicMatern52(Periodic):
        self.variance.gradient = np.sum(dK_dvar*dL_dK)
        self.lengthscale.gradient = np.sum(dK_dlen*dL_dK)
        self.period.gradient = np.sum(dK_dper*dL_dK)
@@ -39,6 +39,11 @@ class Prod(CombinationKernel):
            kernels.insert(i, part)
        super(Prod, self).__init__(kernels, name)

    def to_dict(self):
        input_dict = super(Prod, self)._to_dict()
        input_dict["class"] = str("GPy.kern.Prod")
        return input_dict

    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def K(self, X, X2=None, which_parts=None):
        if which_parts is None:
@@ -31,6 +31,14 @@ class RBF(Stationary):
            self.inv_l = Param('inv_lengthscale', 1./self.lengthscale**2, Logexp())
            self.link_parameter(self.inv_l)

    def to_dict(self):
        input_dict = super(RBF, self)._to_dict()
        input_dict["class"] = "GPy.kern.RBF"
        input_dict["inv_l"] = self.use_invLengthscale
        if input_dict["inv_l"] == True:
            input_dict["lengthscale"] = np.sqrt(1 / float(self.inv_l))
        return input_dict

    def K_of_r(self, r):
        return self.variance * np.exp(-0.5 * r**2)
@@ -93,6 +93,17 @@ class StdPeriodic(Kern):

        self.link_parameters(self.variance, self.period, self.lengthscale)

    def to_dict(self):
        input_dict = super(StdPeriodic, self)._to_dict()
        input_dict["class"] = "GPy.kern.StdPeriodic"
        input_dict["variance"] = self.variance.values.tolist()
        input_dict["period"] = self.period.values.tolist()
        input_dict["lengthscale"] = self.lengthscale.values.tolist()
        input_dict["ARD1"] = self.ARD1
        input_dict["ARD2"] = self.ARD2
        return input_dict


    def parameters_changed(self):
        """
        This function acts as a callback for each optimization iteration.
@@ -14,6 +14,11 @@ class Static(Kern):
        self.variance = Param('variance', variance, Logexp())
        self.link_parameters(self.variance)

    def _to_dict(self):
        input_dict = super(Static, self)._to_dict()
        input_dict["variance"] = self.variance.values.tolist()
        return input_dict

    def Kdiag(self, X):
        ret = np.empty((X.shape[0],), dtype=np.float64)
        ret[:] = self.variance

@@ -133,6 +138,16 @@ class Bias(Static):
    def __init__(self, input_dim, variance=1., active_dims=None, name='bias'):
        super(Bias, self).__init__(input_dim, variance, active_dims, name)

    def to_dict(self):
        input_dict = super(Bias, self)._to_dict()
        input_dict["class"] = "GPy.kern.Bias"
        return input_dict

    @staticmethod
    def _from_dict(kernel_class, input_dict):
        useGPU = input_dict.pop('useGPU', None)
        return Bias(**input_dict)

    def K(self, X, X2=None):
        shape = (X.shape[0], X.shape[0] if X2 is None else X2.shape[0])
        return np.full(shape, self.variance, dtype=np.float64)

@@ -250,4 +265,3 @@ class Precomputed(Fixed):

    def update_gradients_diag(self, dL_dKdiag, X):
        self.variance.gradient = np.einsum('i,ii', dL_dKdiag, self._index(X, None))
@@ -79,6 +79,13 @@ class Stationary(Kern):
        assert self.variance.size == 1
        self.link_parameters(self.variance, self.lengthscale)

    def _to_dict(self):
        input_dict = super(Stationary, self)._to_dict()
        input_dict["variance"] = self.variance.values.tolist()
        input_dict["lengthscale"] = self.lengthscale.values.tolist()
        input_dict["ARD"] = self.ARD
        return input_dict

    def K_of_r(self, r):
        raise NotImplementedError("implement the covariance function as a fn of r to use this class")

@@ -351,6 +358,16 @@ class Exponential(Stationary):
    def dK_dr(self, r):
        return -self.K_of_r(r)

    def to_dict(self):
        input_dict = super(Exponential, self)._to_dict()
        input_dict["class"] = "GPy.kern.Exponential"
        return input_dict

    @staticmethod
    def _from_dict(kernel_class, input_dict):
        useGPU = input_dict.pop('useGPU', None)
        return Exponential(**input_dict)

    # def sde(self):
    #     """
    #     Return the state space representation of the covariance.

@@ -399,6 +416,16 @@ class Matern32(Stationary):
    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Mat32'):
        super(Matern32, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)

    def to_dict(self):
        input_dict = super(Matern32, self)._to_dict()
        input_dict["class"] = "GPy.kern.Matern32"
        return input_dict

    @staticmethod
    def _from_dict(kernel_class, input_dict):
        useGPU = input_dict.pop('useGPU', None)
        return Matern32(**input_dict)

    def K_of_r(self, r):
        return self.variance * (1. + np.sqrt(3.) * r) * np.exp(-np.sqrt(3.) * r)

@@ -478,6 +505,16 @@ class Matern52(Stationary):
    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Mat52'):
        super(Matern52, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)

    def to_dict(self):
        input_dict = super(Matern52, self)._to_dict()
        input_dict["class"] = "GPy.kern.Matern52"
        return input_dict

    @staticmethod
    def _from_dict(kernel_class, input_dict):
        useGPU = input_dict.pop('useGPU', None)
        return Matern52(**input_dict)

    def K_of_r(self, r):
        return self.variance*(1+np.sqrt(5.)*r+5./3*r**2)*np.exp(-np.sqrt(5.)*r)

@@ -533,6 +570,16 @@ class ExpQuad(Stationary):
    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='ExpQuad'):
        super(ExpQuad, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)

    def to_dict(self):
        input_dict = super(ExpQuad, self)._to_dict()
        input_dict["class"] = "GPy.kern.ExpQuad"
        return input_dict

    @staticmethod
    def _from_dict(kernel_class, input_dict):
        useGPU = input_dict.pop('useGPU', None)
        return ExpQuad(**input_dict)

    def K_of_r(self, r):
        return self.variance * np.exp(-0.5 * r**2)

@@ -566,6 +613,17 @@ class RatQuad(Stationary):
        self.power = Param('power', power, Logexp())
        self.link_parameters(self.power)

    def to_dict(self):
        input_dict = super(RatQuad, self)._to_dict()
        input_dict["class"] = "GPy.kern.RatQuad"
        input_dict["power"] = self.power.values.tolist()
        return input_dict

    @staticmethod
    def _from_dict(kernel_class, input_dict):
        useGPU = input_dict.pop('useGPU', None)
        return RatQuad(**input_dict)

    def K_of_r(self, r):
        r2 = np.square(r)
        # return self.variance*np.power(1. + r2/2., -self.power)

@@ -588,5 +646,3 @@ class RatQuad(Stationary):
    def update_gradients_diag(self, dL_dKdiag, X):
        super(RatQuad, self).update_gradients_diag(dL_dKdiag, X)
        self.power.gradient = 0.
@@ -7,4 +7,5 @@ from .student_t import StudentT
from .likelihood import Likelihood
from .mixed_noise import MixedNoise
from .binomial import Binomial

from .weibull import Weibull
from .loglogistic import LogLogistic
@@ -29,6 +29,11 @@ class Bernoulli(Likelihood):
        if isinstance(gp_link, (link_functions.Heaviside, link_functions.Probit)):
            self.log_concave = True

    def to_dict(self):
        input_dict = super(Bernoulli, self)._to_dict()
        input_dict["class"] = "GPy.likelihoods.Bernoulli"
        return input_dict

    def _preprocess_values(self, Y):
        """
        Check if the values of the observations correspond to the values
@@ -66,7 +66,14 @@ class Binomial(Likelihood):
        np.testing.assert_array_equal(N.shape, y.shape)

        nchoosey = special.gammaln(N+1) - special.gammaln(y+1) - special.gammaln(N-y+1)
        return nchoosey + y*np.log(inv_link_f) + (N-y)*np.log(1.-inv_link_f)

        Ny = N-y
        t1 = np.zeros(y.shape)
        t2 = np.zeros(y.shape)
        t1[y>0] = y[y>0]*np.log(inv_link_f[y>0])
        t2[Ny>0] = Ny[Ny>0]*np.log(1.-inv_link_f[Ny>0])

        return nchoosey + t1 + t2

    def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
        """

@@ -86,7 +93,13 @@ class Binomial(Likelihood):
        N = Y_metadata['trials']
        np.testing.assert_array_equal(N.shape, y.shape)

        return y/inv_link_f - (N-y)/(1.-inv_link_f)
        Ny = N-y
        t1 = np.zeros(y.shape)
        t2 = np.zeros(y.shape)
        t1[y>0] = y[y>0]/inv_link_f[y>0]
        t2[Ny>0] = (Ny[Ny>0])/(1.-inv_link_f[Ny>0])

        return t1 - t2

    def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None):
        """

@@ -111,7 +124,13 @@ class Binomial(Likelihood):
        """
        N = Y_metadata['trials']
        np.testing.assert_array_equal(N.shape, y.shape)
        return -y/np.square(inv_link_f) - (N-y)/np.square(1.-inv_link_f)
        Ny = N-y
        t1 = np.zeros(y.shape)
        t2 = np.zeros(y.shape)
        t1[y>0] = -y[y>0]/np.square(inv_link_f[y>0])
        t2[Ny>0] = -(Ny[Ny>0])/np.square(1.-inv_link_f[Ny>0])
        return t1+t2


    def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None):
        """

@@ -135,8 +154,14 @@ class Binomial(Likelihood):
        N = Y_metadata['trials']
        np.testing.assert_array_equal(N.shape, y.shape)

        inv_link_f2 = np.square(inv_link_f)
        return 2*y/inv_link_f**3 - 2*(N-y)/(1.-inv_link_f)**3
        #inv_link_f2 = np.square(inv_link_f) #TODO Remove. Why is this here?

        Ny = N-y
        t1 = np.zeros(y.shape)
        t2 = np.zeros(y.shape)
        t1[y>0] = 2*y[y>0]/inv_link_f[y>0]**3
        t2[Ny>0] = -2*(Ny[Ny>0])/(1.-inv_link_f[Ny>0])**3
        return t1 + t2

    def samples(self, gp, Y_metadata=None, **kw):
        """
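The four rewritten Binomial derivatives above all apply the same numerical guard: terms with coefficient y or N-y are evaluated only where that coefficient is nonzero, so expressions like 0*log(0) or 0/0 at the boundaries of the inverse link can no longer produce NaNs. A standalone sketch of the idiom:

import numpy as np

def safe_y_log_p(y, p):
    # y*log(p) evaluated only where y > 0, so 0*log(0) never appears.
    out = np.zeros(y.shape)
    m = y > 0
    out[m] = y[m] * np.log(p[m])
    return out

y = np.array([0., 3., 5.])
p = np.array([0., 0.5, 1.])   # inverse-link values may hit the boundary exactly
print(safe_y_log_p(y, p))     # [ 0. -2.0794  0. ], no NaN at y == 0, p == 0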
@@ -46,6 +46,13 @@ class Gaussian(Likelihood):
        if isinstance(gp_link, link_functions.Identity):
            self.log_concave = True

    def to_dict(self):
        input_dict = super(Gaussian, self)._to_dict()
        input_dict["class"] = "GPy.likelihoods.Gaussian"
        input_dict["variance"] = self.variance.values.tolist()
        return input_dict


    def betaY(self, Y, Y_metadata=None):
        # TODO: ~Ricardo this does not live here
        raise RuntimeError("Please notify the GPy developers, this should not happen")
@@ -57,7 +64,10 @@ class Gaussian(Likelihood):
    def update_gradients(self, grad):
        self.variance.gradient = grad

    def exact_inference_gradients(self, dL_dKdiag, Y_metadata=None):
    def ep_gradients(self, Y, cav_tau, cav_v, dL_dKdiag, Y_metadata=None, quad_mode='gk', boost_grad=1.):
        return self.exact_inference_gradients(dL_dKdiag)

    def exact_inference_gradients(self, dL_dKdiag, Y_metadata=None):
        return dL_dKdiag.sum()

    def _preprocess_values(self, Y):
@ -6,8 +6,12 @@ from scipy import stats,special
|
|||
import scipy as sp
|
||||
from . import link_functions
|
||||
from ..util.misc import chain_1, chain_2, chain_3, blockify_dhess_dtheta, blockify_third, blockify_hessian, safe_exp
|
||||
from ..util.quad_integrate import quadgk_int
|
||||
from scipy.integrate import quad
|
||||
from functools import partial
|
||||
|
||||
import warnings
|
||||
|
||||
from ..core.parameterization import Parameterized
|
||||
|
||||
class Likelihood(Parameterized):
|
||||
|
|
@ -40,6 +44,37 @@ class Likelihood(Parameterized):
        self.gp_link = gp_link
        self.log_concave = False
        self.not_block_really = False
        self.name = name

    def to_dict(self):
        raise NotImplementedError

    def _to_dict(self):
        input_dict = {}
        input_dict["name"] = self.name
        input_dict["gp_link_dict"] = self.gp_link.to_dict()
        return input_dict

    @staticmethod
    def from_dict(input_dict):
        import copy
        input_dict = copy.deepcopy(input_dict)
        likelihood_class = input_dict.pop('class')
        input_dict["name"] = str(input_dict["name"])
        name = input_dict.pop('name')
        import GPy
        likelihood_class = eval(likelihood_class)
        return likelihood_class._from_dict(likelihood_class, input_dict)

    @staticmethod
    def _from_dict(likelihood_class, input_dict):
        import copy
        input_dict = copy.deepcopy(input_dict)
        gp_link_dict = input_dict.pop('gp_link_dict')
        import GPy
        gp_link = GPy.likelihoods.link_functions.GPTransformation.from_dict(gp_link_dict)
        input_dict["gp_link"] = gp_link
        return likelihood_class(**input_dict)
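# A minimal round-trip sketch of the serialization API above (an illustrative aside;
# assumes GPy is importable and uses the Gaussian likelihood, whose to_dict is added
# earlier in this diff):
import GPy
lik = GPy.likelihoods.Gaussian(variance=0.1)
d = lik.to_dict()   # {'class': 'GPy.likelihoods.Gaussian', 'name': ..., 'gp_link_dict': ..., 'variance': [...]}
lik2 = GPy.likelihoods.likelihood.Likelihood.from_dict(d)   # dispatches on d['class']
assert isinstance(lik2, GPy.likelihoods.Gaussian)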
    def request_num_latent_functions(self, Y):
        """
@ -223,6 +258,91 @@ class Likelihood(Parameterized):
            self.__gh_points = np.polynomial.hermite.hermgauss(T)
        return self.__gh_points

    def ep_gradients(self, Y, cav_tau, cav_v, dL_dKdiag, Y_metadata=None, quad_mode='gk', boost_grad=1.):
        if self.size > 0:
            shape = Y.shape
            tau, v, Y = cav_tau.flatten(), cav_v.flatten(), Y.flatten()
            mu = v/tau
            sigma2 = 1./tau

            # assert Y.shape == v.shape
            dlik_dtheta = np.empty((self.size, Y.shape[0]))
            # for j in range(self.size):
            Y_metadata_list = []
            for index in range(len(Y)):
                Y_metadata_i = {}
                if Y_metadata is not None:
                    for key in Y_metadata.keys():
                        Y_metadata_i[key] = Y_metadata[key][index,:]
                Y_metadata_list.append(Y_metadata_i)

            if quad_mode == 'gk':
                f = partial(self.integrate_gk)
                quads = zip(*map(f, Y.flatten(), mu.flatten(), np.sqrt(sigma2.flatten()), Y_metadata_list))
                quads = np.vstack(quads)
                quads.reshape(self.size, shape[0], shape[1])
            elif quad_mode == 'gh':
                f = partial(self.integrate_gh)
                quads = zip(*map(f, Y.flatten(), mu.flatten(), np.sqrt(sigma2.flatten())))
                quads = np.hstack(quads)
                quads = quads.T
            else:
                raise Exception("no other quadrature mode available")
            # do a Gauss-Hermite integration
            dL_dtheta_avg = boost_grad * np.nanmean(quads, axis=1)
            dL_dtheta = boost_grad * np.nansum(quads, axis=1)
            # dL_dtheta = boost_grad * np.nansum(dlik_dtheta, axis=1)
        else:
            dL_dtheta = np.zeros(self.num_params)
        return dL_dtheta
    def integrate_gk(self, Y, mu, sigma, Y_metadata_i=None):
        # Gauss-Kronrod integration.
        fmin = -np.inf
        fmax = np.inf
        SQRT_2PI = np.sqrt(2.*np.pi)
        def generate_integral(f):
            a = np.exp(self.logpdf_link(f, Y, Y_metadata_i)) * np.exp(-0.5 * np.square((f - mu) / sigma)) / (SQRT_2PI * sigma)
            fn1 = a * self.dlogpdf_dtheta(f, Y, Y_metadata_i)
            fn = fn1
            return fn

        dF_dtheta_i = quadgk_int(generate_integral, fmin=fmin, fmax=fmax)
        return dF_dtheta_i
    def integrate_gh(self, Y, mu, sigma, Y_metadata_i=None, gh_points=None):
        # Gauss-Hermite quadrature.
        # Calculate the site derivatives E_f{d logp(y_i|f_i)/da}, where a is a likelihood
        # parameter and the expectation is over the exact marginal posterior, which is not
        # Gaussian: it is the unnormalised product of the cavity distribution (a Gaussian)
        # and the exact likelihood term.
        #
        # Here we calculate the expectation wrt the approximate marginal posterior, which
        # should be approximately the same. This term is needed for evaluating the
        # gradients of the marginal likelihood estimate Z_EP wrt the likelihood parameters.

        SQRT_2PI = np.sqrt(2.*np.pi)
        if gh_points is None:
            gh_x, gh_w = self._gh_points(32)
        else:
            gh_x, gh_w = gh_points

        X = gh_x[None,:]*np.sqrt(2.)*sigma + mu

        # Here X is a grid vector of candidate f_i values, while Y is a single value which is broadcast.
        a = np.exp(self.logpdf_link(X, Y, Y_metadata_i))
        a = a.repeat(self.num_params,0)
        b = self.dlogpdf_dtheta(X, Y, Y_metadata_i)
        old_shape = b.shape
        fn = np.array([i*j for i,j in zip(a.flatten(), b.flatten())])
        fn = fn.reshape(old_shape)

        dF_dtheta_i = np.dot(fn, gh_w)/np.sqrt(np.pi)
        return dF_dtheta_i
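# A standalone sketch of the Gauss-Hermite change of variables used above: to approximate
# E_{N(f; mu, sigma^2)}[g(f)], substitute f = sqrt(2)*sigma*x + mu, giving
# (1/sqrt(pi)) * sum_k w_k g(sqrt(2)*sigma*x_k + mu).
import numpy as np
gh_x, gh_w = np.polynomial.hermite.hermgauss(32)
mu, sigma = 0.3, 0.8
g = lambda f: f**2                                # E[f^2] = mu^2 + sigma^2
approx = np.dot(g(np.sqrt(2.)*sigma*gh_x + mu), gh_w)/np.sqrt(np.pi)
assert np.isclose(approx, mu**2 + sigma**2)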
    def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
        """
        Use Gauss-Hermite quadrature to compute
@ -43,6 +43,25 @@ class GPTransformation(object):
        """
        raise NotImplementedError

    def to_dict(self):
        raise NotImplementedError

    def _to_dict(self):
        return {}

    @staticmethod
    def from_dict(input_dict):
        import copy
        input_dict = copy.deepcopy(input_dict)
        link_class = input_dict.pop('class')
        import GPy
        link_class = eval(link_class)
        return link_class._from_dict(link_class, input_dict)

    @staticmethod
    def _from_dict(link_class, input_dict):
        return link_class(**input_dict)
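# A minimal round-trip sketch for the link-function serialization above (an illustrative
# aside; assumes GPy is importable — Identity.to_dict is added just below in this diff):
import GPy
link = GPy.likelihoods.link_functions.Identity()
d = link.to_dict()   # {'class': 'GPy.likelihoods.link_functions.Identity'}
link2 = GPy.likelihoods.link_functions.GPTransformation.from_dict(d)
assert isinstance(link2, GPy.likelihoods.link_functions.Identity)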
class Identity(GPTransformation):
    """
    .. math::
@ -62,6 +81,10 @@ class Identity(GPTransformation):
    def d3transf_df3(self,f):
        return np.zeros_like(f)

    def to_dict(self):
        input_dict = super(Identity, self)._to_dict()
        input_dict["class"] = "GPy.likelihoods.link_functions.Identity"
        return input_dict

class Probit(GPTransformation):
    """
@ -82,6 +105,11 @@ class Probit(GPTransformation):
    def d3transf_df3(self,f):
        return (safe_square(f)-1.)*std_norm_pdf(f)

    def to_dict(self):
        input_dict = super(Probit, self)._to_dict()
        input_dict["class"] = "GPy.likelihoods.link_functions.Probit"
        return input_dict


class Cloglog(GPTransformation):
    """
304
GPy/likelihoods/loggaussian.py
Normal file
@ -0,0 +1,304 @@
# Copyright (c) 2012 - 2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)

import numpy as np
from scipy import stats, special
from ..core.parameterization import Param
from ..core.parameterization.transformations import Logexp
from . import link_functions
from .likelihood import Likelihood


class LogGaussian(Likelihood):
    """
    Censored log-Gaussian likelihood.

    .. math::
        p(y_{i}|f_{i}, z_{i}) = \\left(\\frac{1}{y_{i}\\sqrt{2\\pi\\sigma^{2}}}\\exp\\left(-\\frac{(\\ln y_{i} - f_{i})^{2}}{2\\sigma^{2}}\\right)\\right)^{1-z_{i}}\\left(1 - \\Phi\\left(\\frac{\\ln y_{i} - f_{i}}{\\sigma}\\right)\\right)^{z_{i}}

    .. note:
        where z_{i} is the censoring indicator: 0 for non-censored data, 1 for censored data.
    """
    def __init__(self, gp_link=None, sigma=1.):
        if gp_link is None:
            gp_link = link_functions.Identity()
            # gp_link = link_functions.Log()

        super(LogGaussian, self).__init__(gp_link, name='loggaussian')

        self.sigma = Param('sigma', sigma, Logexp())
        self.variance = Param('variance', sigma**2, Logexp())
        self.link_parameter(self.variance)
        # self.link_parameter()
    def pdf_link(self, link_f, y, Y_metadata=None):
        """
        :param link_f: latent variables link(f)
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: likelihood evaluated for this point
        :rtype: float
        """
        return np.exp(self.logpdf_link(link_f, y, Y_metadata=Y_metadata))

    def logpdf_link(self, link_f, y, Y_metadata=None):
        """
        :param link_f: latent variables (link(f))
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: log likelihood evaluated for this point
        :rtype: float
        """
        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        c = np.zeros_like(y)
        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']

        uncensored = (1-c)*(-0.5*np.log(2*np.pi*self.variance) - np.log(y) - (np.log(y)-link_f)**2/(2*self.variance))
        censored = c*np.log(1 - stats.norm.cdf((np.log(y) - link_f)/np.sqrt(self.variance)))
        logpdf = uncensored + censored
        return logpdf
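# A quick standalone consistency check (an illustrative aside, not part of the file):
# for uncensored data the expression above is the log pdf of a lognormal with log-scale f
# and standard deviation sigma; for censored data it is the log survival function.
import numpy as np
from scipy import stats
f, s2, y = 0.4, 0.25, 1.7
uncensored = -0.5*np.log(2*np.pi*s2) - np.log(y) - (np.log(y) - f)**2/(2*s2)
assert np.isclose(uncensored, stats.lognorm.logpdf(y, np.sqrt(s2), scale=np.exp(f)))
censored = np.log(1 - stats.norm.cdf((np.log(y) - f)/np.sqrt(s2)))
assert np.isclose(censored, stats.lognorm.logsf(y, np.sqrt(s2), scale=np.exp(f)))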
    def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
        """
        Derivative of the log pdf at y, given link(f), w.r.t link(f)

        .. math::

        :param link_f: latent variables link(f)
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: gradient of the log likelihood evaluated at points link(f)
        :rtype: Nx1 array
        """
        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        c = np.zeros_like(y)
        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']

        val = np.log(y) - link_f
        val_scaled = val/np.sqrt(self.variance)
        val_scaled2 = val/self.variance
        uncensored = (1-c)*(val_scaled2)
        a = (1 - stats.norm.cdf(val_scaled))
        # llg(z) = 1./(1 - norm_cdf(r/sqrt(s2))) .* (1/sqrt(2*pi*s2) .* exp(-1/(2.*s2).*r.^2))
        censored = c*(1./a) * (np.exp(-1.*val**2/(2*self.variance)) / np.sqrt(2*np.pi*self.variance))
        # censored = c * (1./(1 - stats.norm.cdf(val_scaled))) * (stats.norm.pdf(val_scaled))
        gradient = uncensored + censored
        return gradient
    def d2logpdf_dlink2(self, link_f, y, Y_metadata=None):
        """
        Hessian at y, given link(f), w.r.t link(f)
        i.e. second derivative logpdf at y given link(f_i) and link(f_j) w.r.t link(f_i) and link(f_j)
        The hessian will be 0 unless i == j

        .. math::

        :param link_f: latent variables link(f)
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
        :rtype: Nx1 array

        .. Note::
            Will return diagonal of hessian, since everywhere else it is 0, as the likelihood factorizes over cases
            (the distribution for y_i depends only on link(f_i), not on link(f_{j!=i}))
        """
        # c = Y_metadata['censored']
        # c = np.zeros((y.shape[0],))
        c = np.zeros_like(y)
        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']

        val = np.log(y) - link_f
        val_scaled = val/np.sqrt(self.variance)
        val_scaled2 = val/self.variance
        a = (1 - stats.norm.cdf(val_scaled))
        uncensored = (1-c)*(-1)/self.variance
        censored = c*(-np.exp(-val**2/self.variance)/(2*np.pi*self.variance*(a**2)) +
                      val*np.exp(-(val**2)/(2*self.variance))/(np.sqrt(2*np.pi)*self.variance**(3/2.)*a))
        hessian = censored + uncensored
        return hessian
    def d3logpdf_dlink3(self, link_f, y, Y_metadata=None):
        """
        Third order derivative of the log-likelihood at y given link(f), w.r.t link(f)

        .. math::

        :param link_f: latent variables link(f)
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: third derivative of the log likelihood evaluated at points link(f)
        :rtype: Nx1 array
        """
        c = np.zeros_like(y)
        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']

        val = np.log(y) - link_f
        val_scaled = val/np.sqrt(self.variance)
        val_scaled2 = val/self.variance
        a = (1 - stats.norm.cdf(val_scaled))
        uncensored = 0
        censored = c*(2*np.exp(-3*(val**2)/(2*self.variance))/((a**3)*(2*np.pi*self.variance)**(3/2.))
                      - val*np.exp(-(val**2)/self.variance)/((a**2)*np.pi*self.variance**2)
                      - val*np.exp(-(val**2)/self.variance)/((a**2)*2*np.pi*self.variance**2)
                      - np.exp(-(val**2)/(2*self.variance))/(a*(self.variance**1.5)*np.sqrt(2*np.pi))
                      + (val**2)*np.exp(-(val**2)/(2*self.variance))/(a*np.sqrt(2*np.pi*self.variance)*self.variance**2))
        d3pdf_dlink3 = uncensored + censored
        return d3pdf_dlink3
    def dlogpdf_link_dvar(self, link_f, y, Y_metadata=None):
        """
        Gradient of the log-likelihood function at y given link(f), w.r.t the variance parameter

        .. math::

        :param link_f: latent variables link(f)
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: derivative of the log likelihood evaluated at points link(f), w.r.t the variance parameter
        :rtype: Nx1 array
        """

        c = np.zeros_like(y)
        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']

        val = np.log(y) - link_f
        val_scaled = val/np.sqrt(self.variance)
        val_scaled2 = val/self.variance
        a = (1 - stats.norm.cdf(val_scaled))
        uncensored = (1-c)*(-0.5/self.variance + (val**2)/(2*(self.variance**2)))
        censored = c*(val*np.exp(-val**2/(2*self.variance))/(a*np.sqrt(2*np.pi)*2*(self.variance**1.5)))
        dlogpdf_dvar = uncensored + censored
        # dlogpdf_dvar = dlogpdf_dvar*self.variance
        return dlogpdf_dvar
    def dlogpdf_dlink_dvar(self, link_f, y, Y_metadata=None):
        """
        Derivative of dlogpdf_dlink w.r.t the variance parameter

        :param link_f: latent variables link(f)
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: derivative of the gradient of the log likelihood evaluated at points link(f), w.r.t the variance parameter
        :rtype: Nx1 array
        """
        c = np.zeros_like(y)
        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']

        val = np.log(y) - link_f
        val_scaled = val/np.sqrt(self.variance)
        val_scaled2 = val/self.variance
        a = (1 - stats.norm.cdf(val_scaled))
        uncensored = (1-c)*(-val/(self.variance**2))
        censored = c*(-val*np.exp(-val**2/self.variance)/(4*np.pi*(self.variance**2)*(a**2)) +
                      (-1 + (val**2)/self.variance)*np.exp(-val**2/(2*self.variance)) /
                      (a*(np.sqrt(2.*np.pi)*2*self.variance**1.5)))
        dlik_grad_dsigma = uncensored + censored
        # dlik_grad_dsigma = dlik_grad_dsigma*self.variance
        return dlik_grad_dsigma
    def d2logpdf_dlink2_dvar(self, link_f, y, Y_metadata=None):
        """
        Derivative of the hessian (d2logpdf_dlink2) w.r.t the variance parameter

        :param link_f: latent variables link(f)
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: derivative of the hessian of the log likelihood evaluated at points link(f), w.r.t the variance parameter
        :rtype: Nx1 array
        """
        c = np.zeros_like(y)
        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']
        val = np.log(y) - link_f
        val_scaled = val/np.sqrt(self.variance)
        val_scaled2 = val/self.variance
        a = (1 - stats.norm.cdf(val_scaled))
        uncensored = (1-c)*(1./(self.variance**2))
        censored = c*(val*np.exp(-3*(val**2)/(2*self.variance))/((a**3)*np.sqrt(8*np.pi**3)*self.variance**(5/2.))
                      + np.exp(-val**2/self.variance)/((a**2)*4*np.pi*self.variance**2)
                      - np.exp(-val**2/self.variance)*val**2/((a**2)*2*np.pi*self.variance**3)
                      + np.exp(-val**2/self.variance)/((a**2)*4*np.pi*self.variance**2)
                      - np.exp(-val**2/(2*self.variance))*val/(a*np.sqrt(2*np.pi)*2*self.variance**(5/2.))
                      - np.exp(-val**2/self.variance)*(val**2)/((a**2)*4*np.pi*self.variance**3)
                      - np.exp(-val**2/(2*self.variance))*val/(a*np.sqrt(2*np.pi)*self.variance**(5/2.))
                      + np.exp(-val**2/(2*self.variance))*(val**3)/(a*np.sqrt(2*np.pi)*2*self.variance**(7/2.)))
        dlik_hess_dsigma = uncensored + censored
        return dlik_hess_dsigma
    def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
        """
        :param f: latent variables link(f)
        :type f: NxD array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: stack of derivatives of the log likelihood w.r.t each linked parameter (here only the variance)
        :rtype: (num_params)xNxD array
        """
        dlogpdf_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
        dlogpdf_dtheta[0,:,:] = self.dlogpdf_link_dvar(f,y,Y_metadata=Y_metadata)
        return dlogpdf_dtheta

    def dlogpdf_dlink_dtheta(self, f, y, Y_metadata=None):
        """
        :param f: latent variables link(f)
        :type f: NxD array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: stack of derivatives of dlogpdf_dlink w.r.t each linked parameter
        :rtype: (num_params)xNxD array
        """
        dlogpdf_dlink_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
        dlogpdf_dlink_dtheta[0,:,:] = self.dlogpdf_dlink_dvar(f,y,Y_metadata=Y_metadata)
        return dlogpdf_dlink_dtheta

    def d2logpdf_dlink2_dtheta(self, f, y, Y_metadata=None):
        """
        :param f: latent variables link(f)
        :type f: NxD array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: stack of derivatives of d2logpdf_dlink2 w.r.t each linked parameter
        :rtype: (num_params)xNxD array
        """
        d2logpdf_dlink2_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
        d2logpdf_dlink2_dtheta[0,:,:] = self.d2logpdf_dlink2_dvar(f,y,Y_metadata=Y_metadata)
        return d2logpdf_dlink2_dtheta
    def update_gradients(self, grads):
        """
        Pull out the gradients; be careful as the order must match the order
        in which the parameters are added
        """
        self.variance.gradient = grads[0]

    def samples(self, gp, Y_metadata=None):
        """
        Returns a set of samples of observations based on a given value of the latent variable.

        :param gp: latent variable
        """
        orig_shape = gp.shape
        gp = gp.flatten()
339
GPy/likelihoods/loglogistic.py
Normal file
@ -0,0 +1,339 @@
from __future__ import division
# Copyright (c) 2015 Alan Saul
# Licensed under the BSD 3-clause license (see LICENSE.txt)

import numpy as np
from scipy import stats,special
import scipy as sp
from ..core.parameterization import Param
from ..core.parameterization.transformations import Logexp
from . import link_functions
from .likelihood import Likelihood
from .link_functions import Log

class LogLogistic(Likelihood):
    """
    .. math::
        p(y_{i}|f_{i}, z_{i}) = \\prod_{i=1}^{n} \\left(\\frac{ry^{r-1}}{\\exp{f(x_{i})}}\\right)^{1-z_i} \\left(1 + \\left(\\frac{y}{\\exp(f(x_{i}))}\\right)^{r}\\right)^{z_i-2}

    .. note:
        where z_{i} is the censoring indicator: 0 for non-censored data, 1 for censored data.
    """

    def __init__(self, gp_link=None, r=1.0):
        if gp_link is None:
            # Parameterised not as link_f but as f
            gp_link = Log()

        super(LogLogistic, self).__init__(gp_link, name='LogLogistic')
        self.r = Param('r_log_shape', float(r), Logexp())
        self.link_parameter(self.r)
        # self.censored = 'censored'
    def pdf_link(self, link_f, y, Y_metadata=None):
        """
        Likelihood function given link(f)

        .. math::

        :param link_f: latent variables link(f)
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: likelihood evaluated for this point
        :rtype: float
        """
        return np.exp(self.logpdf_link(link_f, y, Y_metadata=Y_metadata))


    def logpdf_link(self, link_f, y, Y_metadata=None):
        """
        Log likelihood function given link(f)

        .. math::

        :param link_f: latent variables (link(f))
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: log likelihood evaluated for this point
        :rtype: float
        """
        # c = np.zeros((y.shape[0],))
        c = np.zeros_like(link_f)
        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']

        link_f = np.clip(link_f, 1e-150, 1e100)
        # y_link_f = y/link_f
        # y_link_f_r = y_link_f**self.r
        # y_link_f_r = np.clip(y**self.r, 1e-150, 1e200) / np.clip(link_f**self.r, 1e-150, 1e200)
        # y_link_f_r = np.clip((y/link_f)**self.r, 1e-150, 1e200)
        y_r = np.clip(y**self.r, 1e-150, 1e200)
        link_f_r = np.clip(link_f**self.r, 1e-150, 1e200)
        y_link_f_r = np.clip(y_r / link_f_r, 1e-150, 1e200)
        #uncensored = (1-c)*(np.log(self.r) + (self.r+1)*np.log(y) - self.r*np.log(link_f) - 2*np.log1p(y_link_f_r))
        #uncensored = (1-c)*(np.log((self.r/link_f)*y_link_f**(self.r-1)) - 2*np.log1p(y_link_f_r))

        # split the log pdf into censored and uncensored parts
        uncensored = (1-c)*(np.log(self.r) + (self.r-1)*np.log(y) - self.r*np.log(link_f) - 2*np.log1p(y_link_f_r))
        censored = (c)*(-np.log1p(y_link_f_r))
        return uncensored + censored
        # return uncensored
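# A quick standalone consistency check (an illustrative aside, not part of the file):
# for uncensored data the expression above is the log pdf of scipy's fisk (log-logistic)
# distribution with shape r and scale link_f; for censored data it is the log survival
# function.
import numpy as np
from scipy import stats
r, link_f, y = 2.0, 1.5, 0.9
uncensored = np.log(r) + (r - 1)*np.log(y) - r*np.log(link_f) - 2*np.log1p((y/link_f)**r)
assert np.isclose(uncensored, stats.fisk.logpdf(y, r, scale=link_f))
censored = -np.log1p((y/link_f)**r)
assert np.isclose(censored, stats.fisk.logsf(y, r, scale=link_f))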
    def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
        """
        Gradient of the log likelihood function at y, given link(f), w.r.t link(f)

        .. math::

        :param link_f: latent variables link(f)
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: gradient of the log likelihood evaluated at points link(f)
        :rtype: Nx1 array
        """
        # c = Y_metadata['censored']
        # for debugging
        # c = np.zeros((y.shape[0],))
        c = np.zeros_like(link_f)

        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']

        #y_link_f = y/link_f
        #y_link_f_r = y_link_f**self.r
        y_link_f_r = np.clip(y**self.r, 1e-150, 1e200) / np.clip(link_f**self.r, 1e-150, 1e200)

        #In terms of link_f
        # uncensored = (1-c)*( (2*self.r*y**r)/(link_f**self.r + y**self.r) - link_f*self.r)
        uncensored = (1-c)*self.r*(y_link_f_r - 1)/(link_f*(1 + y_link_f_r))
        censored = c*(self.r*y_link_f_r/(link_f*y_link_f_r + link_f))
        return uncensored + censored
        # return uncensored
    def d2logpdf_dlink2(self, link_f, y, Y_metadata=None):
        """
        Hessian at y, given link(f), w.r.t link(f)
        i.e. second derivative logpdf at y given link(f_i) and link(f_j) w.r.t link(f_i) and link(f_j)
        The hessian will be 0 unless i == j

        .. math::

        :param link_f: latent variables link(f)
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
        :rtype: Nx1 array

        .. Note::
            Will return diagonal of hessian, since everywhere else it is 0, as the likelihood factorizes over cases
            (the distribution for y_i depends only on link(f_i), not on link(f_{j!=i}))
        """
        # c = Y_metadata['censored']
        # c = np.zeros((y.shape[0],))
        c = np.zeros_like(link_f)

        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']

        y_link_f = y/link_f
        y_link_f_r = y_link_f**self.r

        #In terms of link_f
        censored = c*(-self.r*y_link_f_r*(y_link_f_r + self.r + 1)/((link_f**2)*(y_link_f_r + 1)**2))
        uncensored = (1-c)*(-self.r*(2*self.r*y_link_f_r + y_link_f**(2*self.r) - 1)/((link_f**2)*(1 + y_link_f_r)**2))
        hess = censored + uncensored
        return hess
    def d3logpdf_dlink3(self, link_f, y, Y_metadata=None):
        """
        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)

        .. math::

        :param link_f: latent variables link(f)
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: third derivative of likelihood evaluated at points f
        :rtype: Nx1 array
        """
        # c = Y_metadata['censored']
        # for debugging
        # c = np.zeros((y.shape[0],))
        c = np.zeros_like(link_f)

        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']
        y_link_f = y/link_f
        y_link_f_r = y_link_f**self.r

        #In terms of link_f
        censored = c*(self.r*y_link_f_r*(((self.r**2)*(-(y_link_f_r - 1))) + 3*self.r*(y_link_f_r + 1) + 2*(y_link_f_r + 1)**2)
                      / ((link_f**3)*(y_link_f_r + 1)**3))
        uncensored = (1-c)*(2*self.r*(-(self.r**2)*(y_link_f_r - 1)*y_link_f_r + 3*self.r*(y_link_f_r + 1)*y_link_f_r + (y_link_f_r - 1)*(y_link_f_r + 1)**2)
                            / ((link_f**3)*(y_link_f_r + 1)**3))

        d3lik_dlink3 = censored + uncensored
        return d3lik_dlink3
    def dlogpdf_link_dr(self, inv_link_f, y, Y_metadata=None):
        """
        Gradient of the log-likelihood function at y given f, w.r.t the shape parameter

        .. math::

        :param inv_link_f: latent variables link(f)
        :type inv_link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: derivative of the log likelihood evaluated at points f, w.r.t the shape parameter
        :rtype: Nx1 array
        """
        # c = Y_metadata['censored']
        # c = np.zeros((y.shape[0],))
        c = np.zeros_like(y)
        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']

        link_f = inv_link_f #FIXME: Change names consistently...
        y_link_f = y/link_f
        log_y_link_f = np.log(y) - np.log(link_f)
        y_link_f_r = y_link_f**self.r

        #In terms of link_f
        censored = c*(-y_link_f_r*log_y_link_f/(1 + y_link_f_r))
        uncensored = (1-c)*(1./self.r + np.log(y) - np.log(link_f) - (2*y_link_f_r*log_y_link_f)/(1 + y_link_f_r))

        dlogpdf_dr = censored + uncensored
        return dlogpdf_dr
    def dlogpdf_dlink_dr(self, inv_link_f, y, Y_metadata=None):
        """
        Derivative of dlogpdf_dlink w.r.t the shape parameter

        .. math::

        :param inv_link_f: latent variables inv_link_f
        :type inv_link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: derivative of the gradient of the log likelihood evaluated at points f, w.r.t the shape parameter
        :rtype: Nx1 array
        """
        # c = np.zeros((y.shape[0],))
        c = np.zeros_like(y)
        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']
        link_f = inv_link_f
        y_link_f = y/link_f
        y_link_f_r = y_link_f**self.r
        log_y_link_f = np.log(y) - np.log(link_f)

        #In terms of link_f
        censored = c*(y_link_f_r*(y_link_f_r + self.r*log_y_link_f + 1)/(link_f*(y_link_f_r + 1)**2))
        uncensored = (1-c)*(y_link_f**(2*self.r) + 2*self.r*y_link_f_r*log_y_link_f - 1)/(link_f*(1 + y_link_f_r)**2)

        # dlogpdf_dlink_dr = uncensored
        dlogpdf_dlink_dr = censored + uncensored
        return dlogpdf_dlink_dr
    def d2logpdf_dlink2_dr(self, inv_link_f, y, Y_metadata=None):
        """
        Gradient of the hessian (d2logpdf_dlink2) w.r.t the shape parameter

        .. math::

        :param inv_link_f: latent variables link(f)
        :type inv_link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: derivative of the hessian evaluated at points f and f_j, w.r.t the shape parameter
        :rtype: Nx1 array
        """
        # c = Y_metadata['censored']

        # c = np.zeros((y.shape[0],))
        c = np.zeros_like(y)
        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']
        link_f = inv_link_f
        y_link_f = y/link_f
        y_link_f_r = y_link_f**self.r
        log_y_link_f = np.log(y) - np.log(link_f)

        #In terms of link_f
        y_link_f_2r = y_link_f**(2*self.r)
        denom2 = (link_f**2)*(1 + y_link_f_r)**2
        denom3 = (link_f**2)*(1 + y_link_f_r)**3

        censored = c*(-((y_link_f_r + self.r + 1)*y_link_f_r)/denom2
                      - (self.r*(y_link_f_r + self.r + 1)*y_link_f_r*log_y_link_f)/denom2
                      - (self.r*y_link_f_r*(y_link_f_r*log_y_link_f + 1))/denom2
                      + (2*self.r*(y_link_f_r + self.r + 1)*y_link_f_2r*log_y_link_f)/denom3
                      )

        uncensored = (1-c)*(-(2*self.r*y_link_f_r + y_link_f_2r - 1)/denom2
                            - (self.r*(2*y_link_f_r + 2*self.r*y_link_f_r*log_y_link_f + 2*y_link_f_2r*log_y_link_f)/denom2)
                            + (2*self.r*(2*self.r*y_link_f_r + y_link_f_2r - 1)*y_link_f_r*log_y_link_f)/denom3
                            )
        d2logpdf_dlink2_dr = censored + uncensored

        return d2logpdf_dlink2_dr
    def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
        dlogpdf_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
        dlogpdf_dtheta[0, :, :] = self.dlogpdf_link_dr(f, y, Y_metadata=Y_metadata)
        return dlogpdf_dtheta

    def dlogpdf_dlink_dtheta(self, f, y, Y_metadata=None):
        dlogpdf_dlink_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
        dlogpdf_dlink_dtheta[0, :, :] = self.dlogpdf_dlink_dr(f, y, Y_metadata=Y_metadata)
        return dlogpdf_dlink_dtheta

    def d2logpdf_dlink2_dtheta(self, f, y, Y_metadata=None):
        d2logpdf_dlink2_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
        d2logpdf_dlink2_dtheta[0,:, :] = self.d2logpdf_dlink2_dr(f, y, Y_metadata=Y_metadata)
        return d2logpdf_dlink2_dtheta
    def update_gradients(self, grads):
        """
        Pull out the gradients; be careful as the order must match the order
        in which the parameters are added
        """
        self.r.gradient = grads[0]

    def samples(self, gp, Y_metadata=None):
        """
        Returns a set of samples of observations based on a given value of the latent variable.

        :param gp: latent variable
        """
        orig_shape = gp.shape
        gp = gp.flatten()
        #rs = np.ones_like(gp)*self.r
        #scales = np.ones_like(gp)*np.sqrt(self.sigma2)
        #Ysim = sp.stats.fisk.rvs(rs, scale=self.gp_link.transf(gp))
        Ysim = np.array([sp.stats.fisk.rvs(self.r, loc=0, scale=self.gp_link.transf(f)) for f in gp])
        #np.random.fisk(self.gp_link.transf(gp), c=self.r)
        return Ysim.reshape(orig_shape)
322
GPy/likelihoods/weibull.py
Normal file
@ -0,0 +1,322 @@
# Copyright (c) 2012 - 2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)


import numpy as np
from scipy import stats, special
import scipy as sp
from ..core.parameterization import Param
from ..core.parameterization.transformations import Logexp
from . import link_functions
from .likelihood import Likelihood


class Weibull(Likelihood):
    """
    Implements the Weibull likelihood function, with optional censoring.
    """

    def __init__(self, gp_link=None, beta=1.):
        if gp_link is None:
            # Parameterised not as link_f but as f
            # gp_link = link_functions.Identity()
            # Parameterised as link_f
            gp_link = link_functions.Log()
        super(Weibull, self).__init__(gp_link, name='Weibull')

        self.r = Param('r_weibull_shape', float(beta), Logexp())
        self.link_parameter(self.r)
    def pdf_link(self, link_f, y, Y_metadata=None):
        """
        Likelihood function given link(f)

        :param link_f: latent variables link(f)
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: likelihood evaluated for this point
        :rtype: float
        """
        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        c = np.zeros((link_f.shape[0],))

        # log_objective = np.log(self.r) + (self.r - 1) * np.log(y) - link_f - (np.exp(-link_f) * (y ** self.r))
        # log_objective = stats.weibull_min.pdf(y,c=self.beta,loc=link_f,scale=1.)
        log_objective = self.logpdf_link(link_f, y, Y_metadata)
        return np.exp(log_objective)

    def logpdf_link(self, link_f, y, Y_metadata=None):
        """
        Log likelihood function given link(f)

        .. math::
            \\ln p(y_{i}|\\lambda(f_{i}), z_{i}) = (1-z_{i})\\left(\\ln r - \\ln \\lambda(f_{i}) + (r-1)\\ln y_{i} - \\frac{y_{i}^{r}}{\\lambda(f_{i})}\\right) - z_{i}\\frac{y_{i}^{r}}{\\lambda(f_{i})}

        :param link_f: latent variables (link(f))
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: log likelihood evaluated for this point
        :rtype: float
        """
        # alpha = self.gp_link.transf(gp)*self.beta sum(log(a) + (a-1).*log(y)- f - exp(-f).*y.^a)
        # return (1. - alpha)*np.log(obs) + self.beta*obs - alpha * np.log(self.beta) + np.log(special.gamma(alpha))
        assert np.atleast_1d(link_f).shape == np.atleast_1d(y).shape
        c = np.zeros_like(y)
        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']

        # uncensored = (1-c)* (np.log(self.r) + (self.r - 1) * np.log(y) - link_f - (np.exp(-link_f) * (y ** self.r)))
        # censored = (-c)*np.exp(-link_f)*(y**self.r)
        uncensored = (1-c)*(np.log(self.r) - np.log(link_f) + (self.r-1)*np.log(y) - y**self.r/link_f)
        censored = -c*y**self.r/link_f

        log_objective = uncensored + censored
        return log_objective
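# A quick standalone consistency check (an illustrative aside, not part of the file): the
# uncensored term above is a Weibull log pdf with shape r once the scale is read as
# lambda = scale**r, and the censored term is the matching log survival function.
import numpy as np
from scipy import stats
r, lam, y = 1.5, 2.0, 0.7
uncensored = np.log(r) - np.log(lam) + (r - 1)*np.log(y) - y**r/lam
assert np.isclose(uncensored, stats.weibull_min.logpdf(y, r, scale=lam**(1./r)))
censored = -y**r/lam
assert np.isclose(censored, stats.weibull_min.logsf(y, r, scale=lam**(1./r)))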
    def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
        """
        Gradient of the log likelihood function at y, given link(f), w.r.t link(f)

        .. math::
            \\frac{d \\ln p(y_{i}|\\lambda(f_{i}), z_{i})}{d\\lambda(f_{i})} = (1-z_{i})\\left(-\\frac{1}{\\lambda(f_{i})} + \\frac{y_{i}^{r}}{\\lambda(f_{i})^{2}}\\right) + z_{i}\\frac{y_{i}^{r}}{\\lambda(f_{i})^{2}}

        :param link_f: latent variables link(f)
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: gradient of the log likelihood evaluated at points link(f)
        :rtype: Nx1 array
        """
        # grad = (1. - self.beta) / (y - link_f)
        c = np.zeros_like(y)
        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']

        # uncensored = (1-c)* ( -1 + np.exp(-link_f)*(y ** self.r))
        # censored = c*np.exp(-link_f)*(y**self.r)
        uncensored = (1-c)*(-1/link_f + y**self.r/link_f**2)
        censored = c*y**self.r/link_f**2
        grad = uncensored + censored
        return grad
    def d2logpdf_dlink2(self, link_f, y, Y_metadata=None):
        """
        Hessian at y, given link(f), w.r.t link(f)
        i.e. second derivative logpdf at y given link(f_i) and link(f_j) w.r.t link(f_i) and link(f_j)
        The hessian will be 0 unless i == j

        .. math::
            \\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}), z_{i})}{d\\lambda(f_{i})^{2}} = (1-z_{i})\\left(\\frac{1}{\\lambda(f_{i})^{2}} - \\frac{2y_{i}^{r}}{\\lambda(f_{i})^{3}}\\right) - z_{i}\\frac{2y_{i}^{r}}{\\lambda(f_{i})^{3}}

        :param link_f: latent variables link(f)
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: Diagonal of hessian matrix (second derivative of likelihood evaluated at points f)
        :rtype: Nx1 array

        .. Note::
            Will return diagonal of hessian, since everywhere else it is 0, as the likelihood factorizes over cases
            (the distribution for y_i depends only on link(f_i), not on link(f_{j!=i}))
        """
        # hess = (self.beta - 1.) / (y - link_f)**2
        c = np.zeros_like(y)
        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']

        # uncensored = (1-c)* (-(y ** self.r) * np.exp(-link_f))
        # censored = -c*np.exp(-link_f)*y**self.r
        uncensored = (1-c)*(1/link_f**2 - 2*y**self.r/link_f**3)
        censored = -c*2*y**self.r/link_f**3
        hess = uncensored + censored
        # hess = -(y ** self.r) * np.exp(-link_f)
        return hess
    def d3logpdf_dlink3(self, link_f, y, Y_metadata=None):
        """
        Third order derivative log-likelihood function at y given link(f) w.r.t link(f)

        .. math::
            \\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}), z_{i})}{d\\lambda(f_{i})^{3}} = (1-z_{i})\\left(-\\frac{2}{\\lambda(f_{i})^{3}} + \\frac{6y_{i}^{r}}{\\lambda(f_{i})^{4}}\\right) + z_{i}\\frac{6y_{i}^{r}}{\\lambda(f_{i})^{4}}

        :param link_f: latent variables link(f)
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: third derivative of likelihood evaluated at points f
        :rtype: Nx1 array
        """
        # d3lik_dlink3 = (1. - self.beta) / (y - link_f)**3

        c = np.zeros_like(y)
        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']
        # uncensored = (1-c)* ((y ** self.r) * np.exp(-link_f))
        # censored = c*np.exp(-link_f)*y**self.r
        uncensored = (1-c)*(-2/link_f**3 + 6*y**self.r/link_f**4)
        censored = c*6*y**self.r/link_f**4

        d3lik_dlink3 = uncensored + censored
        # d3lik_dlink3 = (y ** self.r) * np.exp(-link_f)
        return d3lik_dlink3
    def exact_inference_gradients(self, dL_dKdiag, Y_metadata=None):
        return np.zeros(self.size)
    def dlogpdf_link_dr(self, inv_link_f, y, Y_metadata=None):
        """
        Gradient of the log-likelihood function at y given f, w.r.t the shape parameter r

        .. math::

        :param inv_link_f: latent variables link(f)
        :type inv_link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: derivative of the log likelihood evaluated at points f, w.r.t the shape parameter
        :rtype: Nx1 array
        """
        c = np.zeros_like(y)
        link_f = inv_link_f
        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']
        uncensored = (1-c)*(1./self.r + np.log(y) - y**self.r*np.log(y)/link_f)
        censored = (-c*y**self.r*np.log(y)/link_f)
        dlogpdf_dr = uncensored + censored
        return dlogpdf_dr
    def dlogpdf_dlink_dr(self, inv_link_f, y, Y_metadata=None):
        """
        Derivative of dlogpdf_dlink w.r.t the shape parameter r

        :param inv_link_f: latent variables link(f)
        :type inv_link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: derivative of the gradient of the log likelihood, w.r.t the shape parameter
        :rtype: Nx1 array
        """
        # dlogpdf_dlink_dr = self.beta * y**(self.beta - 1) * np.exp(-link_f)
        # dlogpdf_dlink_dr = np.exp(-link_f) * (y ** self.r) * np.log(y)
        c = np.zeros_like(y)
        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']

        link_f = inv_link_f
        # uncensored = (1-c)*(np.exp(-link_f)* (y ** self.r) * np.log(y))
        # censored = c*np.exp(-link_f)*(y**self.r)*np.log(y)
        uncensored = (1-c)*(y**self.r*np.log(y)/link_f**2)
        censored = c*(y**self.r*np.log(y)/link_f**2)
        dlogpdf_dlink_dr = uncensored + censored
        return dlogpdf_dlink_dr
    def d2logpdf_dlink2_dr(self, link_f, y, Y_metadata=None):
        """
        Derivative of the hessian (d2logpdf_dlink2) w.r.t the shape parameter r

        :param link_f: latent variables link(f)
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: derivative of the hessian w.r.t the shape parameter
        :rtype: Nx1 array
        """

        c = np.zeros_like(y)
        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']

        # uncensored = (1-c)*( -np.exp(-link_f)* (y ** self.r) * np.log(y))
        # censored = -c*np.exp(-link_f)*(y**self.r)*np.log(y)
        uncensored = (1-c)*-2*y**self.r*np.log(y)/link_f**3
        censored = c*-2*y**self.r*np.log(y)/link_f**3
        d2logpdf_dlink_dr = uncensored + censored

        return d2logpdf_dlink_dr
    def d3logpdf_dlink3_dr(self, link_f, y, Y_metadata=None):
        """
        Derivative of d3logpdf_dlink3 w.r.t the shape parameter r

        :param link_f: latent variables link(f)
        :type link_f: Nx1 array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: derivative of the third derivative w.r.t the shape parameter
        :rtype: Nx1 array
        """
        c = np.zeros_like(y)
        if Y_metadata is not None and 'censored' in Y_metadata.keys():
            c = Y_metadata['censored']

        uncensored = (1-c)*((y**self.r)*np.exp(-link_f)*np.log1p(y))
        censored = c*np.exp(-link_f)*(y**self.r)*np.log(y)
        d3logpdf_dlink3_dr = uncensored + censored
        return d3logpdf_dlink3_dr
    def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
        """
        :param f: latent variables link(f)
        :type f: NxD array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: stack of derivatives of the log likelihood w.r.t each linked parameter (here only r)
        :rtype: (num_params)xNxD array
        """
        dlogpdf_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
        dlogpdf_dtheta[0, :, :] = self.dlogpdf_link_dr(f, y, Y_metadata=Y_metadata)
        return dlogpdf_dtheta

    def dlogpdf_dlink_dtheta(self, f, y, Y_metadata=None):
        """
        :param f: latent variables link(f)
        :type f: NxD array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: stack of derivatives of dlogpdf_dlink w.r.t each linked parameter
        :rtype: (num_params)xNxD array
        """
        dlogpdf_dlink_dtheta = np.zeros((self.size, f.shape[0], f.shape[1]))
        dlogpdf_dlink_dtheta[0, :, :] = self.dlogpdf_dlink_dr(f, y, Y_metadata)
        return dlogpdf_dlink_dtheta

    def d2logpdf_dlink2_dtheta(self, f, y, Y_metadata=None):
        """
        :param f: latent variables link(f)
        :type f: NxD array
        :param y: data
        :type y: Nx1 array
        :param Y_metadata: includes censoring information in dictionary key 'censored'
        :returns: stack of derivatives of d2logpdf_dlink2 w.r.t each linked parameter
        :rtype: (num_params)xNxD array
        """
        d2logpdf_dlink_dtheta2 = np.zeros((self.size, f.shape[0], f.shape[1]))
        d2logpdf_dlink_dtheta2[0, :, :] = self.d2logpdf_dlink2_dr(f, y, Y_metadata)
        return d2logpdf_dlink_dtheta2
    def update_gradients(self, grads):
        """
        Pull out the gradients; be careful as the order must match the order
        in which the parameters are added
        """
        self.r.gradient = grads[0]

    def samples(self, gp, Y_metadata=None):
        """
        Returns a set of samples of observations conditioned on a given value of the latent variable f.

        :param gp: latent variable
        """
        orig_shape = gp.shape
        gp = gp.flatten()
        weibull_samples = np.array([sp.stats.weibull_min.rvs(self.r, loc=0, scale=self.gp_link.transf(f)) for f in gp])
        return weibull_samples.reshape(orig_shape)
@ -38,3 +38,9 @@ class Constant(Mapping):

    def gradients_X(self, dL_dF, X):
        return np.zeros_like(X)

    def to_dict(self):
        input_dict = super(Constant, self)._to_dict()
        input_dict["class"] = "GPy.mappings.Constant"
        input_dict["value"] = self.C.values[0]
        return input_dict
@ -19,8 +19,7 @@ class Identity(Mapping):
    def gradients_X(self, dL_dF, X):
        return dL_dF

    def to_dict(self):
        input_dict = super(Identity, self)._to_dict()
        input_dict["class"] = "GPy.mappings.Identity"
        return input_dict
@ -37,3 +37,21 @@ class Linear(Mapping):

    def gradients_X(self, dL_dF, X):
        return np.dot(dL_dF, self.A.T)

    def to_dict(self):
        input_dict = super(Linear, self)._to_dict()
        input_dict["class"] = "GPy.mappings.Linear"
        input_dict["A"] = self.A.values.tolist()
        return input_dict

    @staticmethod
    def _from_dict(mapping_class, input_dict):
        import copy
        input_dict = copy.deepcopy(input_dict)
        A = np.array(input_dict.pop('A'))
        l = Linear(**input_dict)
        l.unlink_parameter(l.A)
        l.update_model(False)
        l.A = Param('A', A)
        l.link_parameter(l.A)
        return l
@ -35,7 +35,7 @@ class MLP(Mapping):

        # Backpropagation to hidden layer.
        dL_dact = np.dot(dL_dF, self.W2.T)
        dL_dlayer1 = dL_dact / np.square(np.cosh(layer1))
        dL_dlayer1 = dL_dact * (1 - np.square(activations))

        # Finally, evaluate the first-layer gradients.
        self.W1.gradient = np.dot(X.T,dL_dlayer1)

@ -47,7 +47,7 @@ class MLP(Mapping):

        # Backpropagation to hidden layer.
        dL_dact = np.dot(dL_dF, self.W2.T)
        dL_dlayer1 = dL_dact / np.square(np.cosh(layer1))
        dL_dlayer1 = dL_dact * (1 - np.square(activations))

        return np.dot(dL_dlayer1, self.W1.T)
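# A minimal numeric check of the identity behind the change above: with a tanh hidden
# layer where activations = tanh(layer1), 1/cosh(x)**2 equals 1 - tanh(x)**2, so the new
# expression reuses the stored activations instead of recomputing cosh.
import numpy as np
layer1 = np.random.randn(4, 3)
activations = np.tanh(layer1)
assert np.allclose(1./np.square(np.cosh(layer1)), 1. - np.square(activations))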
@ -9,6 +9,7 @@ from .gplvm import GPLVM
from .bcgplvm import BCGPLVM
from .sparse_gplvm import SparseGPLVM
from .warped_gp import WarpedGP
from .input_warped_gp import InputWarpedGP
from .bayesian_gplvm import BayesianGPLVM
from .mrd import MRD
from .gradient_checker import GradientChecker, HessianChecker, SkewChecker
@ -4,6 +4,7 @@
from ..core import GP
from .. import likelihoods
from .. import kern
import numpy as np
from ..inference.latent_function_inference.expectation_propagation import EP

class GPClassification(GP):
@ -27,3 +28,23 @@ class GPClassification(GP):
        likelihood = likelihoods.Bernoulli()

        GP.__init__(self, X=X, Y=Y, kernel=kernel, likelihood=likelihood, inference_method=EP(), mean_function=mean_function, name='gp_classification')

    @staticmethod
    def from_gp(gp):
        from copy import deepcopy
        gp = deepcopy(gp)
        return GPClassification(gp.X, gp.Y, gp.kern, gp.likelihood, gp.inference_method, gp.mean_function, name='gp_classification')

    def to_dict(self, save_data=True):
        model_dict = super(GPClassification,self).to_dict(save_data)
        model_dict["class"] = "GPy.models.GPClassification"
        return model_dict

    @staticmethod
    def from_dict(input_dict, data=None):
        import GPy
        m = GPy.core.model.Model.from_dict(input_dict, data)
        return GPClassification.from_gp(m)

    def save_model(self, output_filename, compress=True, save_data=True):
        self._save_model(output_filename, compress=compress, save_data=save_data)
@ -35,3 +35,23 @@ class GPRegression(GP):

        super(GPRegression, self).__init__(X, Y, kernel, likelihood, name='GP regression', Y_metadata=Y_metadata, normalizer=normalizer, mean_function=mean_function)

    @staticmethod
    def from_gp(gp):
        from copy import deepcopy
        gp = deepcopy(gp)
        return GPRegression(gp.X, gp.Y, gp.kern, gp.Y_metadata, gp.normalizer, gp.likelihood.variance.values, gp.mean_function)

    def to_dict(self, save_data=True):
        model_dict = super(GPRegression,self).to_dict(save_data)
        model_dict["class"] = "GPy.models.GPRegression"
        return model_dict

    @staticmethod
    def _from_dict(input_dict, data=None):
        import GPy
        input_dict["class"] = "GPy.core.GP"
        m = GPy.core.GP.from_dict(input_dict, data)
        return GPRegression.from_gp(m)

    def save_model(self, output_filename, compress=True, save_data=True):
        self._save_model(output_filename, compress=compress, save_data=save_data)
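# A minimal save/load sketch using the methods added above (an illustrative aside with
# toy data; assumes GPy is importable and that Model.from_dict dispatches on the "class"
# entry to reach GPRegression._from_dict):
import numpy as np
import GPy
X = np.random.rand(20, 1)
Y = np.sin(X) + 0.05*np.random.randn(20, 1)
m = GPy.models.GPRegression(X, Y)
d = m.to_dict(save_data=True)            # plain, JSON-serializable dict
m2 = GPy.core.model.Model.from_dict(d)   # rebuilds a GPRegression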
149
GPy/models/input_warped_gp.py
Normal file
@ -0,0 +1,149 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)

import numpy as np

from ..core import GP
from .. import likelihoods
from ..util.input_warping_functions import KumarWarping
from .. import kern


class InputWarpedGP(GP):
    """Input Warped GP

    This defines a GP model that applies a warping function to the input.
    By default, it uses Kumar warping (the CDF of the Kumaraswamy distribution).

    Parameters
    ----------
    X : array_like, shape = (n_samples, n_features)
        Input data

    Y : array_like, shape = (n_samples, 1)
        Output data

    kernel : object, optional
        An instance of a kernel function defined in GPy.kern.
        Defaults to Matern 32.

    warping_function : object, optional
        An instance of a warping function defined in GPy.util.input_warping_functions.
        Defaults to KumarWarping.

    warping_indices : list of int, optional
        A list of indices of which features in X should be warped.
        It is used in the Kumar warping function.

    normalizer : bool, optional
        Whether to normalize the output.

    Xmin : list of float, optional
        The min values for every feature in X.
        It is used in the Kumar warping function.

    Xmax : list of float, optional
        The max values for every feature in X.
        It is used in the Kumar warping function.

    epsilon : float, optional
        We normalize X to [0+e, 1-e]. If not given, the default value defined in the
        KumarWarping function is used.

    Attributes
    ----------
    X_untransformed : array_like, shape = (n_samples, n_features)
        A copy of the original input X

    X_warped : array_like, shape = (n_samples, n_features)
        Input data after warping

    warping_function : object, optional
        An instance of a warping function defined in GPy.util.input_warping_functions.
        Defaults to KumarWarping.

    Notes
    -----
    Kumar warping uses the CDF of the Kumaraswamy distribution. More on the Kumaraswamy
    distribution can be found at the wiki page:
    https://en.wikipedia.org/wiki/Kumaraswamy_distribution

    References
    ----------
    Snoek, J.; Swersky, K.; Zemel, R. S. & Adams, R. P.
    Input Warping for Bayesian Optimization of Non-stationary Functions
    preprint arXiv:1402.0929, 2014
    """
    def __init__(self, X, Y, kernel=None, normalizer=False, warping_function=None, warping_indices=None, Xmin=None, Xmax=None, epsilon=None):
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        self.X_untransformed = X.copy()

        if kernel is None:
            kernel = kern.sde_Matern32(X.shape[1], variance=1.)
        self.kernel = kernel

        if warping_function is None:
            self.warping_function = KumarWarping(self.X_untransformed, warping_indices, epsilon, Xmin, Xmax)
        else:
            self.warping_function = warping_function

        self.X_warped = self.transform_data(self.X_untransformed)
        likelihood = likelihoods.Gaussian()
        super(InputWarpedGP, self).__init__(self.X_warped, Y, likelihood=likelihood, kernel=kernel, normalizer=normalizer)

        # Add the parameters of the warping function to the model parameter hierarchy
        self.link_parameter(self.warping_function)

    def parameters_changed(self):
        """Update the gradients of the parameters of the warping function

        This method is called when there are new values for the parameters of the
        warping function, the kernels and the other parameters of a normal GP
        """
        # use the warped X to update
        self.X = self.transform_data(self.X_untransformed)
        super(InputWarpedGP, self).parameters_changed()
        # the gradient of the log likelihood w.r.t. the input AFTER warping is the product of dL_dK and dK_dX
        dL_dX = self.kern.gradients_X(self.grad_dict['dL_dK'], self.X)
        self.warping_function.update_grads(self.X_untransformed, dL_dX)
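# A small sketch of the Kumar warp this model applies by default (assuming the standard
# Kumaraswamy-CDF parameterisation with per-dimension parameters a, b > 0; the exact
# internals of KumarWarping may differ): inputs are normalised to (0, 1) and then warped
# monotonically.
import numpy as np

def kumar_warp(x, a, b):
    # CDF of the Kumaraswamy distribution on (0, 1)
    return 1. - (1. - x**a)**b

x = np.linspace(0.01, 0.99, 5)
print(kumar_warp(x, a=2.0, b=0.5))   # monotone; concavity depends on a and b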
    def transform_data(self, X, test_data=False):
        """Apply warping_function to some input data

        Parameters
        ----------
        X : array_like, shape = (n_samples, n_features)

        test_data: bool, optional
            Default to False, should be set to True when transforming test data
        """
        return self.warping_function.f(X, test_data)

    def log_likelihood(self):
        """Compute the marginal log likelihood

        For input warping, just use the normal GP log likelihood
        """
        return GP.log_likelihood(self)
    def predict(self, Xnew):
        """Prediction on the new data

        Parameters
        ----------
        Xnew : array_like, shape = (n_samples, n_features)
            The test data.

        Returns
        -------
        mean : array_like, shape = (n_samples, output.dim)
            Posterior mean at the location of Xnew

        var : array_like, shape = (n_samples, 1)
            Posterior variance at the location of Xnew
        """
        Xnew_warped = self.transform_data(Xnew, test_data=True)
        mean, var = super(InputWarpedGP, self).predict(Xnew_warped, kern=self.kernel, full_cov=False)
        return mean, var

if __name__ == '__main__':
    X = np.random.randn(100, 1)
    Y = np.sin(X) + np.random.randn(100, 1)*0.05
    m = InputWarpedGP(X, Y)
@ -65,7 +65,7 @@ def plot_ARD(kernel, filtering=None, legend=False, canvas=None, **kwargs):

    if canvas is None:
        canvas, kwargs = pl().new_canvas(xlim=(-.5, kernel._effective_input_dim-.5), xlabel='input dimension', ylabel='sensitivity', **kwargs)
        canvas, kwargs = pl().new_canvas(xlim=(-.5, kernel._effective_input_dim-.5), xlabel='input dimension', ylabel='ard contribution', **kwargs)

    for i in range(ard_params.shape[0]):
        if parts[i].name in filtering:
147
GPy/testing/ep_likelihood_tests.py
Normal file
@ -0,0 +1,147 @@

import numpy as np
import unittest
import GPy
from GPy.models import GradientChecker

fixed_seed = 10
from nose.tools import with_setup, nottest


# This file contains some high-level tests; this is not unit testing, but it gives us a
# higher-level estimate of whether things are going well under the hood.
class TestObservationModels(unittest.TestCase):
    def setUp(self):
        np.random.seed(fixed_seed)
        self.N = 100
        self.D = 2
        self.X = np.random.rand(self.N, self.D)

        self.real_noise_std = 0.05
        noise = np.random.randn(*self.X[:, 0].shape) * self.real_noise_std
        self.Y = (np.sin(self.X[:, 0] * 2 * np.pi) + noise)[:, None]
        self.num_points = self.X.shape[0]
        self.f = np.random.rand(self.N, 1)
        self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=np.int)[:, None]
        # self.binary_Y[self.binary_Y == 0.0] = -1.0
        self.positive_Y = np.exp(self.Y.copy())

        self.Y_noisy = self.Y.copy()
        self.Y_verynoisy = self.Y.copy()
        self.Y_noisy[75] += 1.3

        self.init_var = 0.15
        self.deg_free = 4.
        censored = np.zeros_like(self.Y)
        random_inds = np.random.choice(self.N, int(self.N / 2), replace=True)
        censored[random_inds] = 1
        self.Y_metadata = dict()
        self.Y_metadata['censored'] = censored
        self.kernel1 = GPy.kern.RBF(self.X.shape[1]) + GPy.kern.White(self.X.shape[1])

    def tearDown(self):
        self.Y = None
        self.X = None
        self.binary_Y = None
        self.positive_Y = None
        self.kernel1 = None
@with_setup(setUp, tearDown)
|
||||
def testEPClassification(self):
|
||||
bernoulli = GPy.likelihoods.Bernoulli()
|
||||
laplace_inf = GPy.inference.latent_function_inference.Laplace()
|
||||
|
||||
ep_inf_alt = GPy.inference.latent_function_inference.EP(ep_mode='alternated')
|
||||
ep_inf_nested = GPy.inference.latent_function_inference.EP(ep_mode='nested')
|
||||
ep_inf_fractional = GPy.inference.latent_function_inference.EP(ep_mode='nested', eta=0.9)
|
||||
|
||||
m1 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=laplace_inf)
|
||||
m1.randomize()
|
||||
|
||||
m2 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_alt)
|
||||
m2.randomize()
|
||||
|
||||
m3 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_nested)
|
||||
m3.randomize()
|
||||
#
|
||||
m4 = GPy.core.GP(self.X, self.binary_Y.copy(), kernel=self.kernel1.copy(), likelihood=bernoulli.copy(), inference_method=ep_inf_fractional)
|
||||
m4.randomize()
|
||||
|
||||
optimizer = 'bfgs'
|
||||
|
||||
#do gradcheck here ...
|
||||
# self.assertTrue(m1.checkgrad())
|
||||
# self.assertTrue(m2.checkgrad())
|
||||
# self.assertTrue(m3.checkgrad())
|
||||
# self.assertTrue(m4.checkgrad())
|
||||
|
||||
m1.optimize(optimizer=optimizer, max_iters=300)
|
||||
m2.optimize(optimizer=optimizer, max_iters=300)
|
||||
m3.optimize(optimizer=optimizer, max_iters=300)
|
||||
m4.optimize(optimizer=optimizer, max_iters=300)
|
||||
|
||||
# taking laplace predictions as the ground truth
|
||||
probs_mean_lap, probs_var_lap = m1.predict(self.X)
|
||||
probs_mean_ep_alt, probs_var_ep_alt = m2.predict(self.X)
|
||||
probs_mean_ep_nested, probs_var_ep_nested = m3.predict(self.X)
|
||||
|
||||
# for simple single dimension data , marginal likelihood for laplace and EP approximations should not be so far apart.
|
||||
self.assertAlmostEqual(m1.log_likelihood(), m2.log_likelihood(),delta=1)
|
||||
self.assertAlmostEqual(m1.log_likelihood(), m3.log_likelihood(), delta=1)
|
||||
self.assertAlmostEqual(m1.log_likelihood(), m4.log_likelihood(), delta=5)
|
||||
|
||||
GPy.util.classification.conf_matrix(probs_mean_lap, self.binary_Y)
|
||||
GPy.util.classification.conf_matrix(probs_mean_ep_alt, self.binary_Y)
|
||||
GPy.util.classification.conf_matrix(probs_mean_ep_nested, self.binary_Y)
|
||||
|
||||
@nottest
|
||||
def rmse(self, Y, Ystar):
|
||||
return np.sqrt(np.mean((Y - Ystar) ** 2))
|
||||
|
||||
@with_setup(setUp, tearDown)
|
||||
def test_EP_with_StudentT(self):
|
||||
studentT = GPy.likelihoods.StudentT(deg_free=self.deg_free, sigma2=self.init_var)
|
||||
laplace_inf = GPy.inference.latent_function_inference.Laplace()
|
||||
|
||||
ep_inf_alt = GPy.inference.latent_function_inference.EP(ep_mode='alternated')
|
||||
ep_inf_nested = GPy.inference.latent_function_inference.EP(ep_mode='nested')
|
||||
ep_inf_frac = GPy.inference.latent_function_inference.EP(ep_mode='nested', eta=0.7)
|
||||
|
||||
m1 = GPy.core.GP(self.X.copy(), self.Y_noisy.copy(), kernel=self.kernel1.copy(), likelihood=studentT.copy(), inference_method=laplace_inf)
|
||||
# optimize
|
||||
m1['.*white'].constrain_fixed(1e-5)
|
||||
m1.randomize()
|
||||
|
||||
m2 = GPy.core.GP(self.X.copy(), self.Y_noisy.copy(), kernel=self.kernel1.copy(), likelihood=studentT.copy(), inference_method=ep_inf_alt)
|
||||
m2['.*white'].constrain_fixed(1e-5)
|
||||
# m2.constrain_bounded('.*t_scale2', 0.001, 10)
|
||||
m2.randomize()
|
||||
|
||||
# m3 = GPy.core.GP(self.X, self.Y_noisy.copy(), kernel=self.kernel1, likelihood=studentT.copy(), inference_method=ep_inf_nested)
|
||||
# m3['.*white'].constrain_fixed(1e-5)
|
||||
# # m3.constrain_bounded('.*t_scale2', 0.001, 10)
|
||||
# m3.randomize()
|
||||
|
||||
optimizer='bfgs'
|
||||
m1.optimize(optimizer=optimizer,max_iters=400)
|
||||
m2.optimize(optimizer=optimizer, max_iters=400)
|
||||
# m3.optimize(optimizer=optimizer, max_iters=500)
|
||||
|
||||
self.assertAlmostEqual(m1.log_likelihood(), m2.log_likelihood(),delta=200)
|
||||
|
||||
# self.assertAlmostEqual(m1.log_likelihood(), m3.log_likelihood(), 3)
|
||||
|
||||
preds_mean_lap, preds_var_lap = m1.predict(self.X)
|
||||
preds_mean_alt, preds_var_alt = m2.predict(self.X)
|
||||
# preds_mean_nested, preds_var_nested = m3.predict(self.X)
|
||||
rmse_lap = self.rmse(preds_mean_lap, self.Y)
|
||||
rmse_alt = self.rmse(preds_mean_alt, self.Y)
|
||||
# rmse_nested = self.rmse(preds_mean_nested, self.Y_noisy)
|
||||
|
||||
if rmse_alt > rmse_lap:
|
||||
self.assertAlmostEqual(rmse_lap, rmse_alt, delta=1.5)
|
||||
# m3.optimize(optimizer=optimizer, max_iters=500)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
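As a quick reference for the EP variants exercised above, a rough gloss: the constructor arguments are exactly those used in the tests, while the comments are our reading of what the modes do.

from GPy.inference.latent_function_inference import EP

ep_alt    = EP(ep_mode='alternated')        # alternate EP updates with hyperparameter steps
ep_nested = EP(ep_mode='nested')            # run EP to convergence inside each objective evaluation
ep_frac   = EP(ep_mode='nested', eta=0.9)   # fractional (power) EP: sites raised to the power eta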
@@ -61,6 +61,18 @@ class InferenceGPEP(unittest.TestCase):
         Y = lik.samples(f).reshape(-1,1)
         return X, Y

+    def genNoisyData(self):
+        np.random.seed(1)
+        X = np.random.rand(100,1)
+        self.real_std = 0.1
+        noise = np.random.randn(*X[:, 0].shape)*self.real_std
+        Y = (np.sin(X[:, 0]*2*np.pi) + noise)[:, None]
+        self.f = np.random.rand(X.shape[0],1)
+        Y_extra_noisy = Y.copy()
+        Y_extra_noisy[50] += 4.
+        # Y_extra_noisy[80:83] -= 2.
+        return X, Y, Y_extra_noisy
+
     def test_inference_EP(self):
         from paramz import ObsAr
         X, Y = self.genData()
@ -73,11 +85,45 @@ class InferenceGPEP(unittest.TestCase):
|
|||
inference_method=inf,
|
||||
likelihood=lik)
|
||||
K = self.model.kern.K(X)
|
||||
mu, Sigma, mu_tilde, tau_tilde, log_Z_tilde = self.model.inference_method.expectation_propagation(K, ObsAr(Y), lik, None)
|
||||
|
||||
v_tilde = mu_tilde * tau_tilde
|
||||
p, m, d = self.model.inference_method._inference(K, tau_tilde, v_tilde, lik, Y_metadata=None, Z_tilde=log_Z_tilde.sum())
|
||||
p0, m0, d0 = super(GPy.inference.latent_function_inference.expectation_propagation.EP, inf).inference(k, X,lik ,mu_tilde[:,None], mean_function=None, variance=1./tau_tilde, K=K, Z_tilde=log_Z_tilde.sum() + np.sum(- 0.5*np.log(tau_tilde) + 0.5*(v_tilde*v_tilde*1./tau_tilde)))
|
||||
post_params, ga_approx, cav_params, log_Z_tilde = self.model.inference_method.expectation_propagation(K, ObsAr(Y), lik, None)
|
||||
|
||||
mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
|
||||
p, m, d = self.model.inference_method._inference(Y, K, ga_approx, cav_params, lik, Y_metadata=None, Z_tilde=log_Z_tilde)
|
||||
p0, m0, d0 = super(GPy.inference.latent_function_inference.expectation_propagation.EP, inf).inference(k, X,lik ,mu_tilde[:,None], mean_function=None, variance=1./ga_approx.tau, K=K, Z_tilde=log_Z_tilde + np.sum(- 0.5*np.log(ga_approx.tau) + 0.5*(ga_approx.v*ga_approx.v*1./ga_approx.tau)))
|
||||
|
||||
assert (np.sum(np.array([m - m0,
|
||||
np.sum(d['dL_dK'] - d0['dL_dK']),
|
||||
np.sum(d['dL_dthetaL'] - d0['dL_dthetaL']),
|
||||
np.sum(d['dL_dm'] - d0['dL_dm']),
|
||||
np.sum(p._woodbury_vector - p0._woodbury_vector),
|
||||
np.sum(p.woodbury_inv - p0.woodbury_inv)])) < 1e6)
|
||||
|
||||
# NOTE: adding a test like above for parameterized likelihood- the above test is
|
||||
# only for probit likelihood which does not have any tunable hyperparameter which is why
|
||||
# the term in dictionary of gradients: dL_dthetaL will always be zero. So here we repeat tests for
|
||||
# student-t likelihood and heterodescastic gaussian noise case. This test simply checks if the posterior
|
||||
# and gradients of log marginal are roughly the same for inference through EP and exact gaussian inference using
|
||||
# the gaussian approximation for the individual likelihood site terms. For probit likelihood, it is possible to
|
||||
# calculate moments analytically, but for other likelihoods, we will need to use numerical quadrature techniques,
|
||||
# and it is possible that any error might creep up because of quadrature implementation.
|
||||
def test_inference_EP_non_classification(self):
|
||||
from paramz import ObsAr
|
||||
X, Y, Y_extra_noisy = self.genNoisyData()
|
||||
deg_freedom = 5.
|
||||
init_noise_var = 0.08
|
||||
lik_studentT = GPy.likelihoods.StudentT(deg_free=deg_freedom, sigma2=init_noise_var)
|
||||
# like_gaussian_noise = GPy.likelihoods.MixedNoise()
|
||||
k = GPy.kern.RBF(1, variance=2., lengthscale=1.1)
|
||||
ep_inf_alt = GPy.inference.latent_function_inference.expectation_propagation.EP(max_iters=4, delta=0.5)
|
||||
# ep_inf_nested = GPy.inference.latent_function_inference.expectation_propagation.EP(ep_mode='nested', max_iters=100, delta=0.5)
|
||||
m = GPy.core.GP(X=X,Y=Y_extra_noisy,kernel=k,likelihood=lik_studentT,inference_method=ep_inf_alt)
|
||||
K = m.kern.K(X)
|
||||
post_params, ga_approx, cav_params, log_Z_tilde = m.inference_method.expectation_propagation(K, ObsAr(Y_extra_noisy), lik_studentT, None)
|
||||
|
||||
mu_tilde = ga_approx.v / ga_approx.tau.astype(float)
|
||||
p, m, d = m.inference_method._inference(Y_extra_noisy, K, ga_approx, cav_params, lik_studentT, Y_metadata=None, Z_tilde=log_Z_tilde)
|
||||
p0, m0, d0 = super(GPy.inference.latent_function_inference.expectation_propagation.EP, ep_inf_alt).inference(k, X,lik_studentT ,mu_tilde[:,None], mean_function=None, variance=1./ga_approx.tau, K=K, Z_tilde=log_Z_tilde + np.sum(- 0.5*np.log(ga_approx.tau) + 0.5*(ga_approx.v*ga_approx.v*1./ga_approx.tau)))
|
||||
|
||||
assert (np.sum(np.array([m - m0,
|
||||
np.sum(d['dL_dK'] - d0['dL_dK']),
|
||||
|
|
|
|||
|
|
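The identity behind this check, as we read the test, in the code's notation (log Z̃ = log_Z_tilde, ṽ = ga_approx.v, τ̃ = ga_approx.tau): after EP converges, each likelihood site is replaced by a Gaussian pseudo-observation with mean ṽ_i/τ̃_i and variance 1/τ̃_i, and the EP log marginal equals the exact-Gaussian one on those pseudo-observations plus the site normalizers,

\log Z_{\mathrm{EP}}
  = \log Z_{\mathrm{Gauss}}\!\left(y = \tilde v/\tilde\tau,\ \sigma^2 = 1/\tilde\tau\right)
  + \log \tilde Z
  + \sum_i \left( -\tfrac{1}{2}\log \tilde\tau_i + \frac{\tilde v_i^{\,2}}{2\,\tilde\tau_i} \right)

which is exactly the corrected Z_tilde passed to the super().inference call above.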
@@ -123,6 +123,11 @@ class TestNoiseModels(object):

         self.var = 0.2
         self.deg_free = 4.0
+        censored = np.zeros_like(self.Y)
+        random_inds = np.random.choice(self.N, int(self.N / 2), replace=True)
+        censored[random_inds] = 1
+        self.Y_metadata = dict()
+        self.Y_metadata['censored'] = censored

         #Make a bigger step as lower bound can be quite curved
         self.step = 1e-4
@@ -274,6 +279,20 @@ class TestNoiseModels(object):
                 "Y_metadata": {'trials': self.ns},
                 "laplace": True,
             },
+            "loglogistic_censored": {
+                "model": GPy.likelihoods.LogLogistic(),
+                "link_f_constraints": [self.constrain_positive],
+                "Y": self.positive_Y,
+                "Y_metadata": self.Y_metadata,
+                "laplace": True
+            },
+            "weibull_censored": {
+                "model": GPy.likelihoods.Weibull(),
+                "link_f_constraints": [self.constrain_positive],
+                "Y": self.positive_Y,
+                "Y_metadata": self.Y_metadata,
+                "laplace": True
+            }
             #,
             #GAMMA needs some work!"Gamma_default": {
             #"model": GPy.likelihoods.Gamma(),
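A sketch of how the censoring metadata added above reaches a model. The data is hypothetical; we assume, as the tests do, that a 1 in Y_metadata['censored'] marks a censored observation and that the survival-style likelihoods (LogLogistic, Weibull) read this flag:

import numpy as np
import GPy

N = 50
X = np.random.rand(N, 1)
Y = np.exp(np.random.randn(N, 1) * 0.3)            # positive, survival-style targets
censored = np.zeros_like(Y)
censored[np.random.choice(N, N // 2, replace=False)] = 1

m = GPy.core.GP(X, Y, kernel=GPy.kern.RBF(1),
                likelihood=GPy.likelihoods.LogLogistic(),
                inference_method=GPy.inference.latent_function_inference.Laplace(),
                Y_metadata={'censored': censored})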
@@ -399,6 +399,68 @@ class MiscTests(unittest.TestCase):
         m.optimize()
         print(m)

+    def test_input_warped_gp_identity(self):
+        """
+        An InputWarpedGP with the identity warping function should be
+        equal to a standard GP.
+        """
+        k = GPy.kern.RBF(1)
+        m = GPy.models.GPRegression(self.X, self.Y, kernel=k)
+        m.optimize()
+        preds = m.predict(self.X)
+
+        warp_k = GPy.kern.RBF(1)
+        warp_f = GPy.util.input_warping_functions.IdentifyWarping()
+        warp_m = GPy.models.InputWarpedGP(self.X, self.Y, kernel=warp_k, warping_function=warp_f)
+        warp_m.optimize()
+        warp_preds = warp_m.predict(self.X)
+
+        np.testing.assert_almost_equal(preds, warp_preds, decimal=4)
+
+    def test_kumar_warping_gradient(self):
+        n_X = 100
+        np.random.seed(0)
+        X = np.random.randn(n_X, 2)
+        Y = np.sum(np.sin(X), 1).reshape(n_X, 1)
+
+        k1 = GPy.kern.Linear(2)
+        m1 = GPy.models.InputWarpedGP(X, Y, kernel=k1)
+        m1.randomize()
+        self.assertEquals(m1.checkgrad(), True)
+
+        k2 = GPy.kern.RBF(2)
+        m2 = GPy.models.InputWarpedGP(X, Y, kernel=k2)
+        m2.randomize()
+        m2.checkgrad()
+        self.assertEquals(m2.checkgrad(), True)
+
+        k3 = GPy.kern.Matern52(2)
+        m3 = GPy.models.InputWarpedGP(X, Y, kernel=k3)
+        m3.randomize()
+        m3.checkgrad()
+        self.assertEquals(m3.checkgrad(), True)
+
+    def test_kumar_warping_parameters(self):
+        np.random.seed(1)
+        X = np.random.rand(5, 2)
+        epsilon = 1e-6
+
+        # testing warping indices
+        warping_ind_1 = [0, 1, 2]
+        warping_ind_2 = [-1, 1, 2]
+        warping_ind_3 = [0, 1.5, 2]
+        self.failUnlessRaises(ValueError, GPy.util.input_warping_functions.KumarWarping, X, warping_ind_1)
+        self.failUnlessRaises(ValueError, GPy.util.input_warping_functions.KumarWarping, X, warping_ind_2)
+        self.failUnlessRaises(ValueError, GPy.util.input_warping_functions.KumarWarping, X, warping_ind_3)
+
+        # testing Xmin and Xmax
+        Xmin_1, Xmax_1 = None, [1, 1]
+        Xmin_2, Xmax_2 = [0, 0], None
+        Xmin_3, Xmax_3 = [0, 0, 0], [1, 1]
+        self.failUnlessRaises(ValueError, GPy.util.input_warping_functions.KumarWarping, X, [0, 1], epsilon, Xmin_1, Xmax_1)
+        self.failUnlessRaises(ValueError, GPy.util.input_warping_functions.KumarWarping, X, [0, 1], epsilon, Xmin_2, Xmax_2)
+        self.failUnlessRaises(ValueError, GPy.util.input_warping_functions.KumarWarping, X, [0, 1], epsilon, Xmin_3, Xmax_3)
+
     def test_warped_gp_identity(self):
         """
         A WarpedGP with the identity warping function should be
GPy/testing/quadrature_tests.py (new file, 39 lines)
@@ -0,0 +1,39 @@
from __future__ import print_function, division
import numpy as np
import GPy
import warnings
from ..util.quad_integrate import quadgk_int, quadvgk


class QuadTests(np.testing.TestCase):
    """
    Test file for checking the implementation of Gauss-Kronrod quadrature.
    We take a function which can be integrated analytically and check whether the quadgk result matches.
    Through this file we can test how numerically accurate the quadrature implementation is.
    """
    def setUp(self):
        pass

    def test_infinite_quad(self):
        def f(x):
            return np.exp(-0.5*x**2)*np.power(x, np.arange(3)[:, None])
        quad_int_val = quadgk_int(f)
        real_val = np.sqrt(np.pi * 2)
        np.testing.assert_almost_equal(real_val, quad_int_val[0], decimal=7)

    def test_finite_quad(self):
        def f2(x):
            return x**2
        quad_int_val = quadvgk(f2, 1., 2.)
        real_val = 7/3.
        np.testing.assert_almost_equal(real_val, quad_int_val, decimal=5)


if __name__ == '__main__':
    def f(x):
        return np.exp(-0.5 * x ** 2) * np.power(x, np.arange(3)[:, None])

    quad_int_val = quadgk_int(f)
    real_val = np.sqrt(np.pi*2)
    np.testing.assert_almost_equal(real_val, quad_int_val[0], decimal=7)
    print(quadgk_int(f))
GPy/testing/serialization_tests.py (new file, 202 lines)
@@ -0,0 +1,202 @@
'''
Created on 20 April 2017

@author: pgmoren
'''
import unittest, itertools
#import cPickle as pickle
import pickle
import numpy as np
import tempfile
import GPy
from nose import SkipTest

fixed_seed = 11


class Test(unittest.TestCase):
    def test_serialize_deserialize_kernels(self):
        k1 = GPy.kern.RBF(2, variance=1.0, lengthscale=[1.0,1.0], ARD=True)
        k2 = GPy.kern.RatQuad(2, variance=2.0, lengthscale=1.0, power=2.0, active_dims=[0,1])
        k3 = GPy.kern.Bias(2, variance=2.0, active_dims=[1,0])
        k4 = GPy.kern.StdPeriodic(2, variance=2.0, lengthscale=1.0, period=1.0, active_dims=[1,1])
        k5 = GPy.kern.Linear(2, variances=[2.0, 1.0], ARD=True, active_dims=[1,1])
        k6 = GPy.kern.Exponential(2, variance=1., lengthscale=2)
        k7 = GPy.kern.Matern32(2, variance=1.0, lengthscale=[1.0,3.0], ARD=True, active_dims=[1,1])
        k8 = GPy.kern.Matern52(2, variance=2.0, lengthscale=[2.0,1.0], ARD=True, active_dims=[1,0])
        k9 = GPy.kern.ExpQuad(2, variance=3.0, lengthscale=[1.0,2.0], ARD=True, active_dims=[0,1])
        k10 = k1 + k1.copy() + k2 + k3 + k4 + k5 + k6
        k11 = k1 * k2 * k2.copy() * k3 * k4 * k5
        k12 = (k1 + k2) * (k3 + k4 + k5)
        k13 = ((k1 + k2) * k3) + k4 + k5 * k7
        k14 = ((k1 + k2) * k3) + k4 * k5 + k8
        k15 = ((k1 * k2) * k3) + k4 * k5 + k8 + k9

        k_list = [k1,k2,k3,k4,k5,k6,k7,k8,k9,k10,k11,k12,k13,k14,k15]

        for kk in k_list:
            kk_dict = kk.to_dict()
            kk_r = GPy.kern.Kern.from_dict(kk_dict)
            assert type(kk) == type(kk_r)
            np.testing.assert_array_equal(kk[:], kk_r[:])
            np.testing.assert_array_equal(np.array(kk.active_dims), np.array(kk_r.active_dims))

    def test_serialize_deserialize_mappings(self):
        m1 = GPy.mappings.Identity(3,2)
        m2 = GPy.mappings.Constant(3,2,1)
        m2_r = GPy.core.mapping.Mapping.from_dict(m2.to_dict())
        np.testing.assert_array_equal(m2.C.values[:], m2_r.C.values[:])
        m3 = GPy.mappings.Linear(3,2)
        m3_r = GPy.core.mapping.Mapping.from_dict(m3.to_dict())
        assert np.all(m3.A == m3_r.A)

        m_list = [m1, m2, m3]
        for mm in m_list:
            mm_dict = mm.to_dict()
            mm_r = GPy.core.mapping.Mapping.from_dict(mm_dict)
            assert type(mm) == type(mm_r)
            assert type(mm.input_dim) == type(mm_r.input_dim)
            assert type(mm.output_dim) == type(mm_r.output_dim)

    def test_serialize_deserialize_likelihoods(self):
        l1 = GPy.likelihoods.Gaussian(GPy.likelihoods.link_functions.Identity(), variance=3.0)
        l1_r = GPy.likelihoods.likelihood.Likelihood.from_dict(l1.to_dict())
        l2 = GPy.likelihoods.Bernoulli(GPy.likelihoods.link_functions.Probit())
        l2_r = GPy.likelihoods.likelihood.Likelihood.from_dict(l2.to_dict())
        assert type(l1) == type(l1_r)
        assert np.all(l1.variance == l1_r.variance)
        assert type(l2) == type(l2_r)

    def test_serialize_deserialize_normalizers(self):
        n1 = GPy.util.normalizer.Standardize()
        n1.scale_by(np.random.rand(10))
        n1_r = GPy.util.normalizer._Norm.from_dict(n1.to_dict())
        assert type(n1) == type(n1_r)
        assert np.all(n1.mean == n1_r.mean)
        assert np.all(n1.std == n1_r.std)

    def test_serialize_deserialize_link_functions(self):
        l1 = GPy.likelihoods.link_functions.Identity()
        l2 = GPy.likelihoods.link_functions.Probit()
        l_list = [l1, l2]
        for ll in l_list:
            ll_dict = ll.to_dict()
            ll_r = GPy.likelihoods.link_functions.GPTransformation.from_dict(ll_dict)
            assert type(ll) == type(ll_r)

    def test_serialize_deserialize_inference_methods(self):
        e1 = GPy.inference.latent_function_inference.expectation_propagation.EP(ep_mode="nested")
        e1.ga_approx_old = GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(np.random.rand(10), np.random.rand(10))
        e1._ep_approximation = []
        e1._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.posteriorParams(np.random.rand(10), np.random.rand(100).reshape((10,10))))
        e1._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.gaussianApproximation(np.random.rand(10), np.random.rand(10)))
        e1._ep_approximation.append(GPy.inference.latent_function_inference.expectation_propagation.cavityParams(10))
        e1._ep_approximation[-1].v = np.random.rand(10)
        e1._ep_approximation[-1].tau = np.random.rand(10)
        e1._ep_approximation.append(np.random.rand(10))
        e1_r = GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(e1.to_dict())

        assert type(e1) == type(e1_r)
        assert e1.epsilon == e1_r.epsilon
        assert e1.eta == e1_r.eta
        assert e1.delta == e1_r.delta
        assert e1.always_reset == e1_r.always_reset
        assert e1.max_iters == e1_r.max_iters
        assert e1.ep_mode == e1_r.ep_mode
        assert e1.parallel_updates == e1_r.parallel_updates

        np.testing.assert_array_equal(e1.ga_approx_old.tau[:], e1_r.ga_approx_old.tau[:])
        np.testing.assert_array_equal(e1.ga_approx_old.v[:], e1_r.ga_approx_old.v[:])
        np.testing.assert_array_equal(e1._ep_approximation[0].mu[:], e1_r._ep_approximation[0].mu[:])
        np.testing.assert_array_equal(e1._ep_approximation[0].Sigma[:], e1_r._ep_approximation[0].Sigma[:])
        np.testing.assert_array_equal(e1._ep_approximation[1].tau[:], e1_r._ep_approximation[1].tau[:])
        np.testing.assert_array_equal(e1._ep_approximation[1].v[:], e1_r._ep_approximation[1].v[:])
        np.testing.assert_array_equal(e1._ep_approximation[2].tau[:], e1_r._ep_approximation[2].tau[:])
        np.testing.assert_array_equal(e1._ep_approximation[2].v[:], e1_r._ep_approximation[2].v[:])
        np.testing.assert_array_equal(e1._ep_approximation[3][:], e1_r._ep_approximation[3][:])

        e2 = GPy.inference.latent_function_inference.exact_gaussian_inference.ExactGaussianInference()
        e2_r = GPy.inference.latent_function_inference.LatentFunctionInference.from_dict(e2.to_dict())

        assert type(e2) == type(e2_r)

    def test_serialize_deserialize_model(self):
        np.random.seed(fixed_seed)
        N = 20
        Nhalf = int(N/2)
        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[:, None]
        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
        kernel = GPy.kern.RBF(1)
        likelihood = GPy.likelihoods.Bernoulli()
        inference_method = GPy.inference.latent_function_inference.expectation_propagation.EP(ep_mode="nested")
        mean_function = None
        m = GPy.core.GP(X=X, Y=Y, kernel=kernel, likelihood=likelihood, inference_method=inference_method, mean_function=mean_function, normalizer=True, name='gp_classification')
        m.optimize()
        m.save_model("temp_test_gp_with_data.json", compress=True, save_data=True)
        m.save_model("temp_test_gp_without_data.json", compress=True, save_data=False)
        m1_r = GPy.core.GP.load_model("temp_test_gp_with_data.json.zip")
        m2_r = GPy.core.GP.load_model("temp_test_gp_without_data.json.zip", (X,Y))
        import os
        os.remove("temp_test_gp_with_data.json.zip")
        os.remove("temp_test_gp_without_data.json.zip")
        var = m.predict(X)[0]
        var1_r = m1_r.predict(X)[0]
        var2_r = m2_r.predict(X)[0]
        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var2_r).flatten())

    def test_serialize_deserialize_inference_GPRegressor(self):
        np.random.seed(fixed_seed)
        N = 50
        N_new = 50
        D = 1
        X = np.random.uniform(-3., 3., (N, 1))
        Y = np.sin(X) + np.random.randn(N, D) * 0.05
        X_new = np.random.uniform(-3., 3., (N_new, 1))
        k = GPy.kern.RBF(input_dim=1, lengthscale=10)
        m = GPy.models.GPRegression(X, Y, k)
        m.optimize()
        m.save_model("temp_test_gp_regressor_with_data.json", compress=True, save_data=True)
        m.save_model("temp_test_gp_regressor_without_data.json", compress=True, save_data=False)
        m1_r = GPy.models.GPRegression.load_model("temp_test_gp_regressor_with_data.json.zip")
        m2_r = GPy.models.GPRegression.load_model("temp_test_gp_regressor_without_data.json.zip", (X,Y))
        import os
        os.remove("temp_test_gp_regressor_with_data.json.zip")
        os.remove("temp_test_gp_regressor_without_data.json.zip")

        Xp = np.random.uniform(size=(int(1e5),1))
        Xp[:,0] = Xp[:,0]*15-5

        _, var = m.predict(Xp)
        _, var1_r = m1_r.predict(Xp)
        _, var2_r = m2_r.predict(Xp)
        np.testing.assert_array_equal(var.flatten(), var1_r.flatten())
        np.testing.assert_array_equal(var.flatten(), var2_r.flatten())

    def test_serialize_deserialize_inference_GPClassifier(self):
        np.random.seed(fixed_seed)
        N = 50
        Nhalf = int(N/2)
        X = np.hstack([np.random.normal(5, 2, Nhalf), np.random.normal(10, 2, Nhalf)])[:, None]
        Y = np.hstack([np.ones(Nhalf), np.zeros(Nhalf)])[:, None]
        kernel = GPy.kern.RBF(1)
        m = GPy.models.GPClassification(X, Y, kernel=kernel)
        m.optimize()
        m.save_model("temp_test_gp_classifier_with_data.json", compress=True, save_data=True)
        m.save_model("temp_test_gp_classifier_without_data.json", compress=True, save_data=False)
        m1_r = GPy.models.GPClassification.load_model("temp_test_gp_classifier_with_data.json.zip")
        m2_r = GPy.models.GPClassification.load_model("temp_test_gp_classifier_without_data.json.zip", (X,Y))
        import os
        os.remove("temp_test_gp_classifier_with_data.json.zip")
        os.remove("temp_test_gp_classifier_without_data.json.zip")

        var = m.predict(X)[0]
        var1_r = m1_r.predict(X)[0]
        var2_r = m2_r.predict(X)[0]
        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var1_r).flatten())
        np.testing.assert_array_equal(np.array(var).flatten(), np.array(var2_r).flatten())


if __name__ == "__main__":
    #import sys;sys.argv = ['', 'Test.test_parameter_index_operations']
    unittest.main()
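Distilled from the model tests above, the save/load round trip looks like this (the temporary file name is arbitrary; save_model with compress=True appends .zip to it):

import numpy as np
import GPy

X = np.random.uniform(-3., 3., (50, 1))
Y = np.sin(X) + np.random.randn(50, 1) * 0.05
m = GPy.models.GPRegression(X, Y, GPy.kern.RBF(1))
m.optimize()

m.save_model("gp.json", compress=True, save_data=False)            # writes gp.json.zip
m2 = GPy.models.GPRegression.load_model("gp.json.zip", (X, Y))     # data must be re-supplied
np.testing.assert_array_equal(m.predict(X)[0].flatten(), m2.predict(X)[0].flatten())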
@@ -17,3 +17,4 @@ from . import multioutput
 from . import parallel
 from . import functions
 from . import cluster_with_offset
+from . import input_warping_functions
GPy/util/input_warping_functions.py (new file, 262 lines)
@@ -0,0 +1,262 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)

import numpy as np
from ..core.parameterization import Parameterized, Param
from ..core.parameterization.priors import LogGaussian


class InputWarpingFunction(Parameterized):
    """Abstract class for input warping functions"""

    def __init__(self, name):
        super(InputWarpingFunction, self).__init__(name=name)

    def f(self, X, test=False):
        raise NotImplementedError

    def fgrad_x(self, X):
        raise NotImplementedError

    def update_grads(self, X, dL_dW):
        raise NotImplementedError


class IdentifyWarping(InputWarpingFunction):
    """The identity warping function, for testing"""
    def __init__(self):
        super(IdentifyWarping, self).__init__(name='input_warp_identity')

    def f(self, X, test_data=False):
        return X

    def fgrad_X(self, X):
        return np.zeros(X.shape)

    def update_grads(self, X, dL_dW):
        pass


class InputWarpingTest(InputWarpingFunction):
    """A one-parameter linear warping function, for testing"""
    def __init__(self):
        super(InputWarpingTest, self).__init__(name='input_warp_test')
        self.a = Param('a', 1.0)
        self.set_prior(LogGaussian(0.0, 0.75))
        self.link_parameter(self.a)

    def f(self, X, test_data=False):
        return X * self.a

    def fgrad_X(self, X):
        return np.ones(X.shape) * self.a

    def update_grads(self, X, dL_dW):
        self.a.gradient[:] = np.sum(dL_dW * X)


class KumarWarping(InputWarpingFunction):
    """Kumar warping for input data

    Parameters
    ----------
    X : array_like, shape = (n_samples, n_features)
        The input data that is going to be warped

    warping_indices: list of int, optional
        The features that are going to be warped
        Defaults to warping all the features

    epsilon: float, optional
        Used to normalize input data to [0+e, 1-e]
        Defaults to 1e-6

    Xmin : list of float, optional
        The min values for each feature, defined by the user
        Defaults to the training minimum

    Xmax : list of float, optional
        The max values for each feature, defined by the user
        Defaults to the training maximum

    Attributes
    ----------
    warping_indices: list of int
        The features that are going to be warped
        Defaults to warping all the features

    warping_dim: int
        The number of features to be warped

    Xmin : list of float
        The min values for each feature, defined by the user
        Defaults to the training minimum

    Xmax : list of float
        The max values for each feature, defined by the user
        Defaults to the training maximum

    epsilon: float
        Used to normalize input data to [0+e, 1-e]
        Defaults to 1e-6

    X_normalized : array_like, shape = (n_samples, n_features)
        The normalized training X

    scaling : list of float, length = n_features in X
        Defined as 1.0 / (self.Xmax - self.Xmin)

    params : list of Param
        The list of all the parameters used in Kumar warping

    num_parameters: int
        The number of parameters used in Kumar warping
    """

    def __init__(self, X, warping_indices=None, epsilon=None, Xmin=None, Xmax=None):
        super(KumarWarping, self).__init__(name='input_warp_kumar')

        if warping_indices is not None and np.max(warping_indices) > X.shape[1] - 1:
            raise ValueError("Kumar warping indices exceed feature dimension")

        if warping_indices is not None and np.min(warping_indices) < 0:
            raise ValueError("Kumar warping indices should not be negative")

        if warping_indices is not None and np.any(list(map(lambda x: not isinstance(x, int), warping_indices))):
            raise ValueError("Kumar warping indices should be integers")

        if Xmin is None and Xmax is None:
            Xmin = X.min(axis=0)
            Xmax = X.max(axis=0)
        else:
            if Xmin is None or Xmax is None:
                raise ValueError("Xmin and Xmax need to be provided at the same time!")
            if len(Xmin) != X.shape[1] or len(Xmax) != X.shape[1]:
                raise ValueError("Xmin and Xmax should have n_feature values!")

        if epsilon is None:
            epsilon = 1e-6
        self.epsilon = epsilon

        self.Xmin = Xmin - self.epsilon
        self.Xmax = Xmax + self.epsilon
        self.scaling = 1.0 / (self.Xmax - self.Xmin)
        self.X_normalized = (X - self.Xmin) / (self.Xmax - self.Xmin)

        if warping_indices is None:
            warping_indices = range(X.shape[1])

        self.warping_indices = warping_indices
        self.warping_dim = len(self.warping_indices)
        self.num_parameters = 2 * self.warping_dim

        # create parameters
        self.params = [[Param('a%d' % i, 1.0), Param('b%d' % i, 1.0)] for i in range(self.warping_dim)]

        # add constraints
        for i in range(self.warping_dim):
            self.params[i][0].constrain_bounded(0.0, 10.0)
            self.params[i][1].constrain_bounded(0.0, 10.0)

        # set priors and add the parameters into the handler
        for i in range(self.warping_dim):
            self.params[i][0].set_prior(LogGaussian(0.0, 0.75))
            self.params[i][1].set_prior(LogGaussian(0.0, 0.75))
            self.link_parameter(self.params[i][0])
            self.link_parameter(self.params[i][1])

    def f(self, X, test_data=False):
        """Apply the warping function to some input data

        Parameters
        ----------
        X : array_like, shape = (n_samples, n_features)

        test_data: bool, optional
            Defaults to False; should be set to True when transforming test data

        Returns
        -------
        X_warped : array_like, shape = (n_samples, n_features)
            The warped input data

        Math
        ----
        f(x) = 1 - (1 - x^a)^b
        """
        X_warped = X.copy()
        if test_data:
            X_normalized = (X - self.Xmin) / (self.Xmax - self.Xmin)
        else:
            X_normalized = self.X_normalized

        for i_seq, i_fea in enumerate(self.warping_indices):
            a, b = self.params[i_seq][0], self.params[i_seq][1]
            X_warped[:, i_fea] = 1 - np.power(1 - np.power(X_normalized[:, i_fea], a), b)
        return X_warped

    def fgrad_X(self, X):
        """Compute the gradient of the warping function with respect to X

        Parameters
        ----------
        X : array_like, shape = (n_samples, n_features)
            The locations at which to compute the gradient

        Returns
        -------
        grad : array_like, shape = (n_samples, n_features)
            The gradient at every location in X

        Math
        ----
        grad = a * b * x^(a-1) * (1 - x^a)^(b-1)
        """
        grad = np.zeros(X.shape)
        for i_seq, i_fea in enumerate(self.warping_indices):
            a, b = self.params[i_seq][0], self.params[i_seq][1]
            grad[:, i_fea] = a * b * np.power(self.X_normalized[:, i_fea], a-1) * \
                np.power(1 - np.power(self.X_normalized[:, i_fea], a), b-1) * self.scaling[i_fea]
        return grad

    def update_grads(self, X, dL_dW):
        """Update the gradients of the marginal log likelihood with respect to the parameters of the warping function

        Parameters
        ----------
        X : array_like, shape = (n_samples, n_features)
            The input BEFORE warping

        dL_dW : array_like, shape = (n_samples, n_features)
            The gradient of the marginal log likelihood with respect to the warped input

        Math
        ----
        Let w = f(x) be the input after warping; then
        dW_da = b * (1 - x^a)^(b - 1) * x^a * ln(x)
        dW_db = - (1 - x^a)^b * ln(1 - x^a)
        dL_da = dL_dW * dW_da
        dL_db = dL_dW * dW_db
        """
        for i_seq, i_fea in enumerate(self.warping_indices):
            ai, bi = self.params[i_seq][0], self.params[i_seq][1]

            # cache a value to save some computation
            x_pow_a = np.power(self.X_normalized[:, i_fea], ai)

            # compute the gradients for ai, bi over all of X
            dz_dai = bi * np.power(1 - x_pow_a, bi-1) * x_pow_a * np.log(self.X_normalized[:, i_fea])
            dz_dbi = - np.power(1 - x_pow_a, bi) * np.log(1 - x_pow_a)

            # sum the gradients over all the data
            dL_dai = np.sum(dL_dW[:, i_fea] * dz_dai)
            dL_dbi = np.sum(dL_dW[:, i_fea] * dz_dbi)
            self.params[i_seq][0].gradient[:] = dL_dai
            self.params[i_seq][1].gradient[:] = dL_dbi
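A quick numeric sanity check of the formulas in the docstrings above (plain numpy, independent of GPy; x, a, b are arbitrary values in the valid ranges):

import numpy as np

def kumar(x, a, b):
    return 1 - (1 - x**a)**b          # f(x) = 1 - (1 - x^a)^b

x, a, b, eps = 0.3, 1.5, 2.0, 1e-7
dW_da = b * (1 - x**a)**(b - 1) * x**a * np.log(x)
dW_db = -((1 - x**a)**b) * np.log(1 - x**a)

# compare against one-sided finite differences
assert abs(dW_da - (kumar(x, a + eps, b) - kumar(x, a, b)) / eps) < 1e-5
assert abs(dW_db - (kumar(x, a, b + eps) - kumar(x, a, b)) / eps) < 1e-5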
@@ -33,6 +33,27 @@ class _Norm(object):
         """
         raise NotImplementedError

+    def to_dict(self):
+        raise NotImplementedError
+
+    def _to_dict(self):
+        input_dict = {}
+        return input_dict
+
+    @staticmethod
+    def from_dict(input_dict):
+        import copy
+        input_dict = copy.deepcopy(input_dict)
+        normalizer_class = input_dict.pop('class')
+        import GPy
+        normalizer_class = eval(normalizer_class)
+        return normalizer_class._from_dict(normalizer_class, input_dict)
+
+    @staticmethod
+    def _from_dict(normalizer_class, input_dict):
+        return normalizer_class(**input_dict)
+

 class Standardize(_Norm):
     def __init__(self):
         self.mean = None
@@ -50,6 +71,23 @@ class Standardize(_Norm):
     def scaled(self):
         return self.mean is not None

+    def to_dict(self):
+        input_dict = super(Standardize, self)._to_dict()
+        input_dict["class"] = "GPy.util.normalizer.Standardize"
+        if self.mean is not None:
+            input_dict["mean"] = self.mean.tolist()
+            input_dict["std"] = self.std.tolist()
+        return input_dict
+
+    @staticmethod
+    def _from_dict(kernel_class, input_dict):
+        s = Standardize()
+        if "mean" in input_dict:
+            s.mean = np.array(input_dict["mean"])
+        if "std" in input_dict:
+            s.std = np.array(input_dict["std"])
+        return s
+
 # Inverse variance to be implemented, disabling for now
 # If someone in the future wants to implement this,
 # we need to implement the inverse variance for
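The serialization tests earlier in this diff exercise these hooks; the round trip in isolation:

import numpy as np
import GPy

n = GPy.util.normalizer.Standardize()
n.scale_by(np.random.rand(10))                        # fits mean and std
n2 = GPy.util.normalizer._Norm.from_dict(n.to_dict())
assert np.all(n.mean == n2.mean) and np.all(n.std == n2.std)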
GPy/util/quad_integrate.py (new file, 119 lines)
@@ -0,0 +1,119 @@
"""
The file for utilities related to integration by quadrature methods
- contains an implementation of Gauss-Kronrod integration.
"""
import numpy as np

def getSubs(Subs, XK, NK=1):
    M = (Subs[1, :] - Subs[0, :]) / 2
    C = (Subs[1, :] + Subs[0, :]) / 2
    I = XK[:, None] * M + np.ones((NK, 1)) * C
    # A = [Subs(1,:); I]
    A = np.vstack((Subs[0, :], I))
    # B = [I; Subs(2,:)]
    B = np.vstack((I, Subs[1, :]))
    # Subs = [reshape(A, 1, []); reshape(B, 1, [])]
    A = A.flatten()
    B = B.flatten()
    Subs = np.vstack((A, B))
    return Subs

def quadvgk(feval, fmin, fmax, tol1=1e-5, tol2=1e-5):
    """
    The numpy implementation makes use of the code at http://se.mathworks.com/matlabcentral/fileexchange/18801-quadvgk
    We use the Gauss-Kronrod integration already used in GPstuff for evaluating one-dimensional integrals.
    This is vectorised quadrature, which means that several functions can be evaluated at the same time over a grid
    of points.
    :param feval: vector-valued integrand
    :param fmin: lower integration limit
    :param fmax: upper integration limit
    :param tol1: tolerance on the Gauss/Kronrod difference
    :param tol2: minimum subinterval width
    :return: the integral of each component of feval
    """
    # 15-point Kronrod abscissae and weightings
    XK = np.array([-0.991455371120813, -0.949107912342759, -0.864864423359769, -0.741531185599394,
                   -0.586087235467691, -0.405845151377397, -0.207784955007898, 0.,
                   0.207784955007898, 0.405845151377397, 0.586087235467691,
                   0.741531185599394, 0.864864423359769, 0.949107912342759, 0.991455371120813])
    WK = np.array([0.022935322010529, 0.063092092629979, 0.104790010322250, 0.140653259715525,
                   0.169004726639267, 0.190350578064785, 0.204432940075298, 0.209482141084728,
                   0.204432940075298, 0.190350578064785, 0.169004726639267,
                   0.140653259715525, 0.104790010322250, 0.063092092629979, 0.022935322010529])
    # 7-point Gaussian weightings
    WG = np.array([0.129484966168870, 0.279705391489277, 0.381830050505119, 0.417959183673469,
                   0.381830050505119, 0.279705391489277, 0.129484966168870])

    NK = WK.size
    G = np.arange(2, NK, 2)
    tol1 = 1e-4   # note: overrides the arguments above
    tol2 = 1e-4
    Subs = np.array([[fmin], [fmax]])
    # number of functions to evaluate in the feval vector of functions
    NF = feval(np.zeros(1)).size
    Q = np.zeros(NF)
    neval = 0
    while Subs.size > 0:
        Subs = getSubs(Subs, XK)
        M = (Subs[1,:] - Subs[0,:]) / 2
        C = (Subs[1,:] + Subs[0,:]) / 2
        NM = M.size
        # x = reshape(XK * M + ones(NK, 1) * C, 1, [])
        x = XK[:,None]*M + C
        x = x.flatten()
        FV = feval(x)
        Q1 = np.zeros((NF, NM))
        Q2 = np.zeros((NF, NM))

        # vectorised version of the MATLAB loop:
        #   for n=1:NF
        #       F = reshape(FV(n,:), NK, []);
        #       Q1(n,:) = M .* sum((WK * ones(1, NM)) .* F);
        #       Q2(n,:) = M .* sum((WG * ones(1, NM)) .* F(G,:));
        #   end
        Q1 = np.dot(FV.reshape(NF, NK, NM).swapaxes(2,1), WK) * M
        Q2 = np.dot(FV.reshape(NF, NK, NM).swapaxes(2,1)[:,:,1::2], WG) * M
        # accept subintervals where the Gauss and Kronrod estimates agree,
        # or which have already become very narrow
        ind = np.nonzero(np.logical_or(np.max(np.abs(Q1 - Q2), 0) < tol1, (Subs[1,:] - Subs[0,:]) < tol2))[0]
        Q = Q + np.sum(Q1[:, ind], axis=1)
        Subs = np.delete(Subs, ind, axis=1)
    return Q

def quadgk_int(f, fmin=-np.inf, fmax=np.inf, difftol=0.1):
    """
    Integrate f from fmin to fmax,
    doing the integration by the substitution
        x = r / (1 - r**2)
    As r goes from -1 to 1, x goes from -inf to inf.
    The interval for the quadvgk function is (-1, 1), so we transform the space from (-inf, inf) to (-1, 1).
    :param f: integrand
    :param fmin: lower limit
    :param fmax: upper limit
    :param difftol: tolerance passed on to quadvgk
    :return: the integral of each component of f
    """
    difftol = 1e-4
    def trans_func(r):
        r2 = np.square(r)
        x = r / (1 - r2)
        dx_dr = (1 + r2) / (1 - r2)**2
        return f(x) * dx_dr

    integrand = quadvgk(trans_func, -1., 1., difftol, difftol)
    return integrand
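To see the substitution at work: with x = r/(1-r^2) we get dx/dr = (1+r^2)/(1-r^2)^2, so the integral of f over (-inf, inf) becomes the integral of f(r/(1-r^2)) * (1+r^2)/(1-r^2)^2 over (-1, 1), which quadvgk can handle. A self-contained check against the Gaussian integral, mirroring the quadrature tests above:

import numpy as np
from GPy.util.quad_integrate import quadgk_int

def gauss(x):
    return np.exp(-0.5 * x**2)

val = quadgk_int(gauss)                      # integrates over (-inf, inf) via the substitution
np.testing.assert_almost_equal(val[0], np.sqrt(2 * np.pi), decimal=7)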
@@ -3,7 +3,7 @@ environment:
     secure: 8/ZjXFwtd1S7ixd7PJOpptupKKEDhm2da/q3unabJ00=
   COVERALLS_REPO_TOKEN:
     secure: d3Luic/ESkGaWnZrvWZTKrzO+xaVwJWaRCEP0F+K/9DQGPSRZsJ/Du5g3s4XF+tS
-  gpy_version: 1.7.7
+  gpy_version: 1.8.0
 matrix:
   - PYTHON_VERSION: 2.7
     MINICONDA: C:\Miniconda-x64
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.7.7
+current_version = 1.8.0
 tag = True
 commit = True