Merge branch 'mean_functions' into devel

James Hensman 2015-04-01 09:16:08 +01:00
commit 8c71d52b7f
27 changed files with 461 additions and 137 deletions

View file

@@ -5,6 +5,7 @@ import numpy as np
 import sys
 from .. import kern
 from model import Model
+from mapping import Mapping
 from parameterization import ObsAr
 from .. import likelihoods
 from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
@@ -34,7 +35,7 @@ class GP(Model):
     """
-    def __init__(self, X, Y, kernel, likelihood, inference_method=None, name='gp', Y_metadata=None, normalizer=False):
+    def __init__(self, X, Y, kernel, likelihood, mean_function=None, inference_method=None, name='gp', Y_metadata=None, normalizer=False):
         super(GP, self).__init__(name)
         assert X.ndim == 2
@@ -75,6 +76,15 @@ class GP(Model):
         assert isinstance(likelihood, likelihoods.Likelihood)
         self.likelihood = likelihood

+        #handle the mean function
+        self.mean_function = mean_function
+        if mean_function is not None:
+            assert isinstance(self.mean_function, Mapping)
+            assert mean_function.input_dim == self.input_dim
+            assert mean_function.output_dim == self.output_dim
+            self.link_parameter(mean_function)
+
         #find a sensible inference method
         logger.info("initializing inference method")
         if inference_method is None:
@@ -153,9 +163,11 @@ class GP(Model):
         This method is not designed to be called manually, the framework is set up to automatically call this method upon changes to parameters, if you call
         this method yourself, there may be unexpected consequences.
         """
-        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.Y_metadata)
+        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.mean_function, self.Y_metadata)
         self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
         self.kern.update_gradients_full(self.grad_dict['dL_dK'], self.X)
+        if self.mean_function is not None:
+            self.mean_function.update_gradients(self.grad_dict['dL_dm'], self.X)

     def log_likelihood(self):
         """
@@ -192,6 +204,10 @@ class GP(Model):
         #force mu to be a column vector
         if len(mu.shape)==1: mu = mu[:,None]
+        #add the mean function in
+        if self.mean_function is not None:
+            mu += self.mean_function.f(_Xnew)
         return mu, var

     def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None):
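The plumbing above is everything needed to attach a parameterised mean function to a plain GP: the mapping is linked as a parameter, its gradient is fed from grad_dict['dL_dm'], and its value is added back at predict time. A minimal usage sketch, along the lines of the examples added later in this commit (the toy data here is made up):

import numpy as np
import GPy

X = np.linspace(0, 10, 50).reshape(-1, 1)
Y = np.sin(X) + 3*X + 0.1*np.random.randn(*X.shape)

# the new keyword: any GPy mapping with matching input/output dimensions
m = GPy.core.GP(X, Y, kernel=GPy.kern.RBF(1),
                likelihood=GPy.likelihoods.Gaussian(),
                mean_function=GPy.mappings.Linear(1, 1))
m.optimize()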

View file

@@ -19,7 +19,7 @@ class SparseGP(GP):
     This model allows (approximate) inference using variational DTC or FITC
     (Gaussian likelihoods) as well as non-conjugate sparse methods based on
     these.

     This is not for missing data, as the implementation for missing data involves
     some inefficient optimization routine decisions.
     See missing data SparseGP implementation in py:class:'~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch'.
@@ -39,7 +39,7 @@ class SparseGP(GP):
     """
-    def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
+    def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, inference_method=None,
                  name='sparse gp', Y_metadata=None, normalizer=False):
         #pick a sensible inference method
         if inference_method is None:
@@ -53,7 +53,7 @@ class SparseGP(GP):
         self.Z = Param('inducing inputs', Z)
         self.num_inducing = Z.shape[0]

-        GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
+        GP.__init__(self, X, Y, kernel, likelihood, mean_function, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)

         logger.info("Adding Z as parameter")
         self.link_parameter(self.Z, index=0)
@@ -61,7 +61,7 @@ class SparseGP(GP):
     def has_uncertain_inputs(self):
         return isinstance(self.X, VariationalPosterior)

     def set_Z(self, Z, trigger_update=True):
         if trigger_update: self.update_model(False)
         self.unlink_parameter(self.Z)
@@ -110,8 +110,8 @@ class SparseGP(GP):
     def _raw_predict(self, Xnew, full_cov=False, kern=None):
         """
         Make a prediction for the latent function values.

         For certain inputs we give back a full_cov of shape NxN;
         if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is off,
         we take only the diagonal elements across N.
@@ -136,6 +136,9 @@ class SparseGP(GP):
             else:
                 Kxx = kern.Kdiag(Xnew)
                 var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
+            #add in the mean function
+            if self.mean_function is not None:
+                mu += self.mean_function.f(Xnew)
         else:
             psi0_star = self.kern.psi0(self.Z, Xnew)
             psi1_star = self.kern.psi1(self.Z, Xnew)
@@ -165,4 +168,5 @@ class SparseGP(GP):
                     var[i] = var_
                 else:
                     var[i] = np.diag(var_)+p0-t2

         return mu, var

View file

@@ -9,7 +9,7 @@ from ..inference.latent_function_inference import SVGP as svgp_inf

 class SVGP(SparseGP):
-    def __init__(self, X, Y, Z, kernel, likelihood, name='SVGP', Y_metadata=None, batchsize=None):
+    def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, name='SVGP', Y_metadata=None, batchsize=None):
         """
         Stochastic Variational GP.
@@ -38,7 +38,7 @@ class SVGP(SparseGP):
         #create the SVI inference method
         inf_method = svgp_inf()

-        SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, inference_method=inf_method,
+        SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, mean_function=mean_function, inference_method=inf_method,
                           name=name, Y_metadata=Y_metadata, normalizer=False)

         self.m = Param('q_u_mean', np.zeros((self.num_inducing, Y.shape[1])))
@@ -48,7 +48,7 @@ class SVGP(SparseGP):
         self.link_parameter(self.m)

     def parameters_changed(self):
-        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
+        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.mean_function, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))

         #update the kernel gradients
         self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z)
@@ -65,6 +65,13 @@ class SVGP(SparseGP):
         self.m.gradient = self.grad_dict['dL_dm']
         self.chol.gradient = self.grad_dict['dL_dchol']

+        if self.mean_function is not None:
+            self.mean_function.update_gradients(self.grad_dict['dL_dmfX'], self.X)
+            g = self.mean_function.gradient[:].copy()
+            self.mean_function.update_gradients(self.grad_dict['dL_dmfZ'], self.Z)
+            self.mean_function.gradient[:] += g
+            self.Z.gradient[:] += self.mean_function.gradients_X(self.grad_dict['dL_dmfZ'], self.Z)
+
     def set_data(self, X, Y):
         """
         Set the data without calling parameters_changed to avoid wasted computation
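A note on the gradient accumulation in parameters_changed above: the mean function enters the SVGP bound twice, through m(X) in the likelihood term and through m(Z) in the prior over the inducing outputs, so its parameter gradient is the sum of two chain-rule terms,

    dL/dtheta = (dm(X)/dtheta)^T * dL_dmfX + (dm(Z)/dtheta)^T * dL_dmfZ,

which is why the code copies the gradient after the first update_gradients call and adds it back after the second. And because Z is itself a free parameter, Z.gradient additionally picks up dL_dmfZ pushed back through the mapping's gradients_X.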

View file

@@ -505,3 +505,48 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
     print m
     return m

+def simple_mean_function(max_iters=100, optimize=True, plot=True):
+    """
+    The simplest possible mean function. No parameters, just a simple sinusoid.
+    """
+    #create simple mean function
+    mf = GPy.core.Mapping(1,1)
+    mf.f = np.sin
+    mf.update_gradients = lambda a,b: None
+
+    X = np.linspace(0,10,50).reshape(-1,1)
+    Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape)
+
+    k = GPy.kern.RBF(1)
+    lik = GPy.likelihoods.Gaussian()
+    m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
+    if optimize:
+        m.optimize(max_iters=max_iters)
+    if plot:
+        m.plot(plot_limits=(-10,15))
+    return m
+
+def parametric_mean_function(max_iters=100, optimize=True, plot=True):
+    """
+    A linear mean function with parameters that we'll learn alongside the kernel.
+    """
+    #create a linear mean function
+    mf = GPy.mappings.Linear(1,1)
+
+    X = np.linspace(0,10,50).reshape(-1,1)
+    Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape) + 3*X
+
+    k = GPy.kern.RBF(1)
+    lik = GPy.likelihoods.Gaussian()
+    m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
+    if optimize:
+        m.optimize(max_iters=max_iters)
+    if plot:
+        m.plot()
+    return m

View file

@@ -20,7 +20,8 @@ class DTC(LatentFunctionInference):
     def __init__(self):
         self.const_jitter = 1e-6

-    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
+    def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
+        assert mean_function is None, "inference with a mean function not implemented"
         assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."

         num_inducing, _ = Z.shape
@@ -88,7 +89,8 @@ class vDTC(object):
     def __init__(self):
         self.const_jitter = 1e-6

-    def inference(self, kern, X, X_variance, Z, likelihood, Y, Y_metadata):
+    def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
+        assert mean_function is None, "inference with a mean function not implemented"
         assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."

         num_inducing, _ = Z.shape

View file

@@ -36,11 +36,18 @@ class ExactGaussianInference(LatentFunctionInference):
             #print "WARNING: N>D of Y, we need caching of L, such that L*L^T = Y, returning Y still!"
             return Y

-    def inference(self, kern, X, likelihood, Y, Y_metadata=None):
+    def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None):
         """
         Returns a Posterior class containing essential quantities of the posterior
         """
-        YYT_factor = self.get_YYTfactor(Y)
+        if mean_function is None:
+            m = 0
+        else:
+            m = mean_function.f(X)
+
+        YYT_factor = self.get_YYTfactor(Y-m)

         K = kern.K(X)
@@ -56,4 +63,4 @@ class ExactGaussianInference(LatentFunctionInference):
         dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK),Y_metadata)

-        return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}
+        return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL, 'dL_dm':alpha}
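For reference, a minimal numpy sketch of the centred computation above, assuming a single-output Y, a fixed Gaussian noise variance sigma2 and a callable mean_f (hypothetical names, not GPy API): subtracting m = mean_f(X) leaves the rest of the algebra untouched, and the gradient of the log marginal with respect to the mean values falls out as alpha, which is why 'dL_dm' can simply reuse it.

import numpy as np

def exact_inference_sketch(K, Y, sigma2, mean_f=None, X=None):
    m = mean_f(X) if mean_f is not None else 0.
    Ky = K + np.eye(K.shape[0]) * sigma2            # noise-corrupted covariance
    L = np.linalg.cholesky(Ky)
    alpha = np.linalg.solve(L.T, np.linalg.solve(L, Y - m))   # Ky^{-1} (Y - m)
    # log p(Y) = -0.5 (Y-m)^T Ky^{-1} (Y-m) - sum(log diag(L)) - 0.5 N log(2 pi)
    log_marginal = (-0.5 * np.sum((Y - m) * alpha)
                    - np.sum(np.log(np.diag(L)))
                    - 0.5 * K.shape[0] * np.log(2 * np.pi))
    dL_dm = alpha   # d log p(Y) / d m, returned as grad_dict['dL_dm']
    return alpha, log_marginal, dL_dm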

View file

@@ -33,7 +33,8 @@ class EP(LatentFunctionInference):
         # TODO: update approximation in the end as well? Maybe even with a switch?
         pass

-    def inference(self, kern, X, likelihood, Y, Y_metadata=None, Z=None):
+    def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, Z=None):
+        assert mean_function is None, "inference with a mean function not implemented"
         num_data, output_dim = Y.shape
         assert output_dim ==1, "ep in 1D only (for now!)"

View file

@@ -64,7 +64,8 @@ class EPDTC(LatentFunctionInference):
         self.old_mutilde, self.old_vtilde = None, None
         self._ep_approximation = None

-    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
+    def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
+        assert mean_function is None, "inference with a mean function not implemented"
         num_data, output_dim = Y.shape
         assert output_dim ==1, "ep in 1D only (for now!)"

View file

@@ -18,7 +18,8 @@ class FITC(LatentFunctionInference):
     """
     const_jitter = 1e-6

-    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
+    def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
+        assert mean_function is None, "inference with a mean function not implemented"
         num_inducing, _ = Z.shape
         num_data, output_dim = Y.shape

View file

@@ -39,10 +39,12 @@ class Laplace(LatentFunctionInference):
         self.first_run = True
         self._previous_Ki_fhat = None

-    def inference(self, kern, X, likelihood, Y, Y_metadata=None):
+    def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None):
         """
         Returns a Posterior class containing essential quantities of the posterior
         """
+        assert mean_function is None, "inference with a mean function not implemented"
+
         # Compute K
         K = kern.K(X)

View file

@@ -15,7 +15,7 @@ class Posterior(object):
     the function at any new point x_* by integrating over this posterior.
     """
-    def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None):
+    def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None, prior_mean=0):
         """
         woodbury_chol : a lower triangular matrix L that satisfies posterior_covariance = K - K L^{-T} L^{-1} K
         woodbury_vector : a matrix (or vector, as Nx1 matrix) M which satisfies posterior_mean = K M
@@ -67,6 +67,7 @@ class Posterior(object):
         #option 2:
         self._mean = mean
         self._covariance = cov
+        self._prior_mean = prior_mean

         #compute this lazily
         self._precision = None
@@ -175,7 +176,7 @@ class Posterior(object):
         $$
         """
         if self._woodbury_vector is None:
-            self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean)
+            self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean - self._prior_mean)
         return self._woodbury_vector

     @property
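With a nonzero prior mean the woodbury vector is computed from the centred posterior mean, M = K^{-1}(mu - m(X)), which is exactly the `self.mean - self._prior_mean` in the dpotrs call above. Downstream predictions then recover

    mu_* = m(X_*) + K_{*X} K^{-1} (mu - m(X)),

consistent with GP._raw_predict adding mean_function.f(Xnew) back on top of the zero-mean prediction.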

View file

@@ -6,7 +6,8 @@ from posterior import Posterior

 class SVGP(LatentFunctionInference):
-    def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
+    def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
         num_inducing = Z.shape[0]
         num_data, num_outputs = Y.shape
@@ -22,6 +23,15 @@ class SVGP(LatentFunctionInference):
         #S = S + np.eye(S.shape[0])*1e-5*np.max(np.max(S))
         #Si, Lnew, _,_ = linalg.pdinv(S)

+        #compute mean function stuff
+        if mean_function is not None:
+            prior_mean_u = mean_function.f(Z)
+            prior_mean_f = mean_function.f(X)
+        else:
+            prior_mean_u = np.zeros((num_inducing, num_outputs))
+            prior_mean_f = np.zeros((num_data, num_outputs))
+
         #compute kernel related stuff
         Kmm = kern.K(Z)
         Knm = kern.K(X, Z)
@@ -30,17 +40,31 @@ class SVGP(LatentFunctionInference):
         #compute the marginal means and variances of q(f)
         A = np.dot(Knm, Kmmi)
-        mu = np.dot(A, q_u_mean)
+        mu = prior_mean_f + np.dot(A, q_u_mean - prior_mean_u)
         v = Knn_diag[:,None] - np.sum(A*Knm,1)[:,None] + np.sum(A[:,:,None] * np.einsum('ij,jkl->ikl', A, S),1)

         #compute the KL term
         Kmmim = np.dot(Kmmi, q_u_mean)
         KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.einsum('ij,ijk->k', Kmmi, S) + 0.5*np.sum(q_u_mean*Kmmim,0)
         KL = KLs.sum()
-        dKL_dm = Kmmim
+
+        #gradient of the KL term (assuming zero mean function)
+        dKL_dm = Kmmim.copy()
         dKL_dS = 0.5*(Kmmi[:,:,None] - Si)
         dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(-1)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T)

+        if mean_function is not None:
+            #adjust KL term for mean function
+            Kmmi_mfZ = np.dot(Kmmi, prior_mean_u)
+            KL += -np.sum(q_u_mean*Kmmi_mfZ)
+            KL += 0.5*np.sum(Kmmi_mfZ*prior_mean_u)
+
+            #adjust gradient for mean function
+            dKL_dm -= Kmmi_mfZ
+            dKL_dKmm += Kmmim.dot(Kmmi_mfZ.T)
+            dKL_dKmm -= 0.5*Kmmi_mfZ.dot(Kmmi_mfZ.T)
+
+            #compute gradients for mean_function
+            dKL_dmfZ = Kmmi_mfZ - Kmmim
+
         #quadrature for the likelihood
         F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v, Y_metadata=Y_metadata)
@@ -50,11 +74,9 @@ class SVGP(LatentFunctionInference):
         if dF_dthetaL is not None:
             dF_dthetaL = dF_dthetaL.sum(1).sum(1)*batch_scale

-        #derivatives of expected likelihood
+        #derivatives of expected likelihood, assuming zero mean function
         Adv = A.T[:,:,None]*dF_dv[None,:,:] # As if dF_Dv is diagonal
         Admu = A.T.dot(dF_dmu)
-        #AdvA = np.einsum('ijk,jl->ilk', Adv, A)
-        #AdvA = np.dot(A.T, Adv).swapaxes(0,1)
         AdvA = np.dstack([np.dot(A.T, Adv[:,:,i].T) for i in range(num_outputs)])
         tmp = np.einsum('ijk,jlk->il', AdvA, S).dot(Kmmi)
         dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(-1) - tmp - tmp.T
@@ -64,6 +86,14 @@ class SVGP(LatentFunctionInference):
         dF_dm = Admu
         dF_dS = AdvA

+        #adjust gradient to account for mean function
+        if mean_function is not None:
+            dF_dmfX = dF_dmu.copy()
+            dF_dmfZ = -Admu
+            dF_dKmn -= np.dot(Kmmi_mfZ, dF_dmu.T)
+            dF_dKmm += Admu.dot(Kmmi_mfZ.T)
+
         #sum (gradients of) expected likelihood and KL part
         log_marginal = F.sum() - KL
         dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS- dKL_dS, dF_dKmm- dKL_dKmm, dF_dKmn
@@ -71,4 +101,8 @@ class SVGP(LatentFunctionInference):
         dL_dchol = np.dstack([2.*np.dot(dL_dS[:,:,i], L[:,:,i]) for i in range(num_outputs)])
         dL_dchol = choleskies.triang_to_flat(dL_dchol)

-        return Posterior(mean=q_u_mean, cov=S, K=Kmm), log_marginal, {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv.sum(1), 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
+        grad_dict = {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv.sum(1), 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
+        if mean_function is not None:
+            grad_dict['dL_dmfZ'] = dF_dmfZ - dKL_dmfZ
+            grad_dict['dL_dmfX'] = dF_dmfX
+        return Posterior(mean=q_u_mean, cov=S, K=Kmm, prior_mean=prior_mean_u), log_marginal, grad_dict

View file

@@ -6,6 +6,20 @@ from kern import CombinationKernel
 from ...util.caching import Cache_this
 import itertools

+def numpy_invalid_op_as_exception(func):
+    """
+    A decorator that allows catching numpy invalid operations
+    as exceptions (the default behaviour is raising warnings).
+    """
+    def func_wrapper(*args, **kwargs):
+        np.seterr(invalid='raise')
+        result = func(*args, **kwargs)
+        np.seterr(invalid='warn')
+        return result
+    return func_wrapper
+
 class Prod(CombinationKernel):
     """
     Computes the product of 2 kernels
@@ -46,18 +60,20 @@ class Prod(CombinationKernel):
             self.parts[0].update_gradients_full(dL_dK*self.parts[1].K(X,X2), X, X2)
             self.parts[1].update_gradients_full(dL_dK*self.parts[0].K(X,X2), X, X2)
         else:
-            k = self.K(X,X2)*dL_dK
-            for p in self.parts:
-                p.update_gradients_full(k/p.K(X,X2),X,X2)
+            for combination in itertools.combinations(self.parts, len(self.parts) - 1):
+                prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
+                to_update = list(set(self.parts) - set(combination))[0]
+                to_update.update_gradients_full(dL_dK * prod, X, X2)

     def update_gradients_diag(self, dL_dKdiag, X):
         if len(self.parts)==2:
             self.parts[0].update_gradients_diag(dL_dKdiag*self.parts[1].Kdiag(X), X)
             self.parts[1].update_gradients_diag(dL_dKdiag*self.parts[0].Kdiag(X), X)
         else:
-            k = self.Kdiag(X)*dL_dKdiag
-            for p in self.parts:
-                p.update_gradients_diag(k/p.Kdiag(X),X)
+            for combination in itertools.combinations(self.parts, len(self.parts) - 1):
+                prod = reduce(np.multiply, [p.Kdiag(X) for p in combination])
+                to_update = list(set(self.parts) - set(combination))[0]
+                to_update.update_gradients_diag(dL_dKdiag * prod, X)

     def gradients_X(self, dL_dK, X, X2=None):
         target = np.zeros(X.shape)
@@ -65,9 +81,10 @@ class Prod(CombinationKernel):
             target += self.parts[0].gradients_X(dL_dK*self.parts[1].K(X, X2), X, X2)
             target += self.parts[1].gradients_X(dL_dK*self.parts[0].K(X, X2), X, X2)
         else:
-            k = self.K(X,X2)*dL_dK
-            for p in self.parts:
-                target += p.gradients_X(k/p.K(X,X2),X,X2)
+            for combination in itertools.combinations(self.parts, len(self.parts) - 1):
+                prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
+                to_update = list(set(self.parts) - set(combination))[0]
+                target += to_update.gradients_X(dL_dK * prod, X, X2)
         return target

     def gradients_X_diag(self, dL_dKdiag, X):
@@ -80,3 +97,5 @@ class Prod(CombinationKernel):
         for p in self.parts:
             target += p.gradients_X_diag(k/p.Kdiag(X),X)
         return target
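The leave-one-out products above exist to avoid dividing by kernel values. A small self-contained sketch of the failure mode, with K1, K2, K3 as hypothetical stand-ins for self.parts[i].K(X, X2): dividing the full product by one factor turns any zero entry into 0/0 = NaN, while multiplying the remaining factors directly stays finite (this is what the new KernelTestsProductWithZeroValues test later in this commit exercises).

import numpy as np
from functools import reduce

K1 = np.array([[0., 1.], [1., 0.]])   # a factor with zeros, e.g. a linear kernel
K2 = np.ones((2, 2))
K3 = 2 * np.ones((2, 2))
parts = [K1, K2, K3]

full = reduce(np.multiply, parts)
old_factor = full / K1                        # 0/0 -> NaN in the gradient
new_factor = reduce(np.multiply, [K2, K3])    # leave-one-out product: finite

print(np.isnan(old_factor).any())   # True
print(np.isnan(new_factor).any())   # False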

View file

@@ -4,4 +4,5 @@
 from kernel import Kernel
 from linear import Linear
 from mlp import MLP
-#from rbf import RBF
+from additive import Additive
+from compound import Compound

View file

@@ -2,8 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
-from ..core.mapping import Mapping
-import GPy
+from ..core import Mapping

 class Additive(Mapping):
     """
@@ -27,8 +26,6 @@ class Additive(Mapping):
         Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
         self.mapping1 = mapping1
         self.mapping2 = mapping2
-        self.num_params = self.mapping1.num_params + self.mapping2.num_params
-        self.name = self.mapping1.name + '+' + self.mapping2.name

     def f(self, X):
         return self.mapping1.f(X) + self.mapping2.f(X)

GPy/mappings/compound.py (new file, 39 lines)

View file

@@ -0,0 +1,39 @@
+# Copyright (c) 2015, James Hensman and Alan Saul
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from ..core import Mapping
+
+class Compound(Mapping):
+    """
+    Mapping based on passing one mapping through another
+
+    .. math::
+
+       f(\mathbf{x}) = f_2(f_1(\mathbf{x}))
+
+    :param mapping1: first mapping
+    :type mapping1: GPy.mappings.Mapping
+    :param mapping2: second mapping
+    :type mapping2: GPy.mappings.Mapping
+    """
+    def __init__(self, mapping1, mapping2):
+        assert(mapping1.output_dim==mapping2.input_dim)
+        input_dim, output_dim = mapping1.input_dim, mapping2.output_dim
+        Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
+        self.mapping1 = mapping1
+        self.mapping2 = mapping2
+        self.link_parameters(self.mapping1, self.mapping2)
+
+    def f(self, X):
+        return self.mapping2.f(self.mapping1.f(X))
+
+    def update_gradients(self, dL_dF, X):
+        hidden = self.mapping1.f(X)
+        self.mapping2.update_gradients(dL_dF, hidden)
+        self.mapping1.update_gradients(self.mapping2.gradients_X(dL_dF, hidden), X)
+
+    def gradients_X(self, dL_dF, X):
+        hidden = self.mapping1.f(X)
+        return self.mapping1.gradients_X(self.mapping2.gradients_X(dL_dF, hidden), X)
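update_gradients here is just the chain rule: the gradient for mapping1's parameters is dL_dF pushed back through mapping2's input gradient, evaluated at the hidden layer. A usage sketch mirroring the test added later in this commit (the dimensions are arbitrary):

import numpy as np
import GPy

m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
m2 = GPy.mappings.Linear(input_dim=2, output_dim=1)
mapping = GPy.mappings.Compound(m1, m2)          # f(x) = m2.f(m1.f(x))
print(mapping.f(np.random.randn(10, 3)).shape)   # (10, 1)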

View file

@@ -36,16 +36,16 @@ class Kernel(Mapping):
         Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
         self.kern = kernel
         self.Z = Z
-        self.num_bases, Zdim = X.shape
+        self.num_bases, Zdim = Z.shape
         assert Zdim == self.input_dim
-        self.A = GPy.core.Param('A', np.random.randn(self.num_bases, self.output_dim))
-        self.add_parameter(self.A)
+        self.A = Param('A', np.random.randn(self.num_bases, self.output_dim))
+        self.link_parameter(self.A)

     def f(self, X):
         return np.dot(self.kern.K(X, self.Z), self.A)

     def update_gradients(self, dL_dF, X):
-        self.kern.update_gradients_full(np.dot(dL_dF, self.A.T))
+        self.kern.update_gradients_full(np.dot(dL_dF, self.A.T), X, self.Z)
         self.A.gradient = np.dot( self.kern.K(self.Z, X), dL_dF)

     def gradients_X(self, dL_dF, X):

View file

@@ -26,8 +26,8 @@ class Linear(Mapping):
     def __init__(self, input_dim, output_dim, name='linmap'):
         Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
-        self.A = GPy.core.Param('A', np.random.randn(self.input_dim, self.output_dim))
-        self.add_parameter(self.A)
+        self.A = Param('A', np.random.randn(self.input_dim, self.output_dim))
+        self.link_parameter(self.A)

     def f(self, X):
         return np.dot(X, self.A)

View file

@@ -11,32 +11,45 @@ class MLP(Mapping):
     """
     def __init__(self, input_dim=1, output_dim=1, hidden_dim=3, name='mlpmap'):
-        super(MLP).__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
+        super(MLP, self).__init__(input_dim=input_dim, output_dim=output_dim, name=name)
         self.hidden_dim = hidden_dim
         self.W1 = Param('W1', np.random.randn(self.input_dim, self.hidden_dim))
         self.b1 = Param('b1', np.random.randn(self.hidden_dim))
         self.W2 = Param('W2', np.random.randn(self.hidden_dim, self.output_dim))
         self.b2 = Param('b2', np.random.randn(self.output_dim))
+        self.link_parameters(self.W1, self.b1, self.W2, self.b2)

     def f(self, X):
-        N, D = X.shape
-        activations = np.tanh(np.dot(X,self.W1) + self.b1)
-        self.out = np.dot(self.activations,self.W2) + self.b2
-        return self.output_fn(self.out)
+        layer1 = np.dot(X, self.W1) + self.b1
+        activations = np.tanh(layer1)
+        return np.dot(activations, self.W2) + self.b2

     def update_gradients(self, dL_dF, X):
-        activations = np.tanh(np.dot(X,self.W1) + self.b1)
+        layer1 = np.dot(X,self.W1) + self.b1
+        activations = np.tanh(layer1)
         #Evaluate second-layer gradients.
         self.W2.gradient = np.dot(activations.T, dL_dF)
         self.b2.gradient = np.sum(dL_dF, 0)
         # Backpropagation to hidden layer.
-        delta_hid = np.dot(dL_dF, self.W2.T) * (1.0 - activations**2)
+        dL_dact = np.dot(dL_dF, self.W2.T)
+        dL_dlayer1 = dL_dact / np.square(np.cosh(layer1))
         # Finally, evaluate the first-layer gradients.
-        self.W1.gradients = np.dot(X.T,delta_hid)
-        self.b1.gradients = np.sum(delta_hid, 0)
+        self.W1.gradient = np.dot(X.T,dL_dlayer1)
+        self.b1.gradient = np.sum(dL_dlayer1, 0)
+
+    def gradients_X(self, dL_dF, X):
+        layer1 = np.dot(X,self.W1) + self.b1
+        activations = np.tanh(layer1)
+        # Backpropagation to hidden layer.
+        dL_dact = np.dot(dL_dF, self.W2.T)
+        dL_dlayer1 = dL_dact / np.square(np.cosh(layer1))
+        return np.dot(dL_dlayer1, self.W1.T)
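Both forms of the hidden-layer backprop factor are equivalent, since d/dz tanh(z) = 1 - tanh^2(z) = 1/cosh^2(z); the rewrite just computes the derivative from the pre-activation layer1 rather than from activations, which lets f, update_gradients and the new gradients_X share the same intermediate quantities.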

View file

@@ -43,10 +43,11 @@ class SparseGPMiniBatch(SparseGP):
     def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
                  name='sparse gp', Y_metadata=None, normalizer=False,
                  missing_data=False, stochastic=False, batchsize=1):
-        #pick a sensible inference method
+        # pick a sensible inference method
         if inference_method is None:
             if isinstance(likelihood, likelihoods.Gaussian):
-                inference_method = var_dtc.VarDTC(limit=1 if not self.missing_data else Y.shape[1])
+                inference_method = var_dtc.VarDTC(limit=1 if not missing_data else Y.shape[1])
             else:
                 #inference_method = ??
                 raise NotImplementedError, "what to do what to do?"

View file

@@ -1,7 +1,6 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
 from ..util.warping_functions import *
 from ..core import GP
@@ -10,14 +9,16 @@ from GPy.util.warping_functions import TanhWarpingFunction_d
 from GPy import kern

 class WarpedGP(GP):
-    def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3, normalize_X=False, normalize_Y=False):
+    def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3):
         if kernel is None:
-            kernel = kern.rbf(X.shape[1])
+            kernel = kern.RBF(X.shape[1])

         if warping_function == None:
             self.warping_function = TanhWarpingFunction_d(warping_terms)
             self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1,) * 1)
+        else:
+            self.warping_function = warping_function

         self.scale_data = False
         if self.scale_data:
@@ -25,10 +26,10 @@ class WarpedGP(GP):
         self.has_uncertain_inputs = False
         self.Y_untransformed = Y.copy()
         self.predict_in_warped_space = False
-        likelihood = likelihoods.Gaussian(self.transform_data(), normalize=normalize_Y)
-        GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
-        self._set_params(self._get_params())
+        likelihood = likelihoods.Gaussian()
+        GP.__init__(self, X, self.transform_data(), likelihood=likelihood, kernel=kernel)
+        self.link_parameter(self.warping_function)

     def _scale_data(self, Y):
         self._Ymax = Y.max()
@@ -38,62 +39,55 @@ class WarpedGP(GP):
     def _unscale_data(self, Y):
         return (Y + 0.5) * (self._Ymax - self._Ymin) + self._Ymin

-    def _set_params(self, x):
-        self.warping_params = x[:self.warping_function.num_parameters]
-        Y = self.transform_data()
-        self.likelihood.set_data(Y)
-        GP._set_params(self, x[self.warping_function.num_parameters:].copy())
-
-    def _get_params(self):
-        return np.hstack((self.warping_params.flatten().copy(), GP._get_params(self).copy()))
-
-    def _get_param_names(self):
-        warping_names = self.warping_function._get_param_names()
-        param_names = GP._get_param_names(self)
-        return warping_names + param_names
-
-    def transform_data(self):
-        Y = self.warping_function.f(self.Y_untransformed.copy(), self.warping_params).copy()
-        return Y
-
-    def log_likelihood(self):
-        ll = GP.log_likelihood(self)
-        jacobian = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
-        return ll + np.log(jacobian).sum()
-
-    def _log_likelihood_gradients(self):
-        ll_grads = GP._log_likelihood_gradients(self)
-        alpha = np.dot(self.Ki, self.likelihood.Y.flatten())
-        warping_grads = self.warping_function_gradients(alpha)
-        warping_grads = np.append(warping_grads[:, :-1].flatten(), warping_grads[0, -1])
-        return np.hstack((warping_grads.flatten(), ll_grads.flatten()))
-
-    def warping_function_gradients(self, Kiy):
-        grad_y = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
-        grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed, self.warping_params,
-                                                                 return_covar_chain=True)
-        djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
-        dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
-        return -dquad_dpsi + djac_dpsi
+    def parameters_changed(self):
+        self.Y[:] = self.transform_data()
+        super(WarpedGP, self).parameters_changed()
+
+        Kiy = self.posterior.woodbury_vector.flatten()
+
+        grad_y = self.warping_function.fgrad_y(self.Y_untransformed)
+        grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed,
+                                                                 return_covar_chain=True)
+        djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
+        dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
+        warping_grads = -dquad_dpsi + djac_dpsi
+        self.warping_function.psi.gradient[:] = warping_grads[:, :-1]
+        self.warping_function.d.gradient[:] = warping_grads[0, -1]
+
+    def transform_data(self):
+        Y = self.warping_function.f(self.Y_untransformed.copy()).copy()
+        return Y
+
+    def log_likelihood(self):
+        ll = GP.log_likelihood(self)
+        jacobian = self.warping_function.fgrad_y(self.Y_untransformed)
+        return ll + np.log(jacobian).sum()

     def plot_warping(self):
-        self.warping_function.plot(self.warping_params, self.Y_untransformed.min(), self.Y_untransformed.max())
+        self.warping_function.plot(self.Y_untransformed.min(), self.Y_untransformed.max())

-    def predict(self, Xnew, which_parts='all', full_cov=False, pred_init=None):
+    def predict(self, Xnew, which_parts='all', pred_init=None):
         # normalize X values
-        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
-        mu, var = GP._raw_predict(self, Xnew, full_cov=full_cov, which_parts=which_parts)
+        # Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
+        mu, var = GP._raw_predict(self, Xnew)

         # now push through likelihood
-        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
+        mean, var = self.likelihood.predictive_values(mu, var)

         if self.predict_in_warped_space:
-            mean = self.warping_function.f_inv(mean, self.warping_params, y=pred_init)
-            var = self.warping_function.f_inv(var, self.warping_params)
+            mean = self.warping_function.f_inv(mean, y=pred_init)
+            var = self.warping_function.f_inv(var)
         if self.scale_data:
             mean = self._unscale_data(mean)
-        return mean, var, _025pm, _975pm
+        return mean, var
+
+if __name__ == '__main__':
+    X = np.random.randn(100, 1)
+    Y = np.sin(X) + np.random.randn(100, 1)*0.05
+    m = WarpedGP(X, Y)
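For reference, the gradient assembled in parameters_changed comes from the warped-GP objective

    log p(y) = log N(f(y) | 0, K + sigma^2 I) + sum_i log f'(y_i),

whose derivative with respect to the warping parameters psi is

    d/dpsi = -(K_y^{-1} f(y))^T df(y)/dpsi + sum_i (1/f'(y_i)) df'(y_i)/dpsi,

i.e. dquad_dpsi (built from the woodbury vector Kiy = K_y^{-1} f(y)) and djac_dpsi respectively.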

View file

@@ -6,7 +6,11 @@ try:
     from matplotlib.patches import Polygon
     from matplotlib.collections import PatchCollection
     #from matplotlib import cm
-    pb.ion()
+    try:
+        __IPYTHON__
+        pb.ion()
+    except NameError:
+        pass
 except:
     pass
 import re

View file

@@ -256,13 +256,23 @@ class KernelGradientTestsContinuous(unittest.TestCase):
         k.randomize()
         self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))

+    def test_Prod1(self):
+        k = GPy.kern.RBF(self.D) * GPy.kern.Linear(self.D)
+        k.randomize()
+        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+
     def test_Prod2(self):
-        k = (GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D))
+        k = GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D)
         k.randomize()
         self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))

     def test_Prod3(self):
-        k = (GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D))
+        k = GPy.kern.RBF(self.D) * GPy.kern.Linear(self.D) * GPy.kern.Bias(self.D)
+        k.randomize()
+        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+
+    def test_Prod4(self):
+        k = GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D) * GPy.kern.Matern32(2, active_dims=[0,1])
         k.randomize()
         self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
@@ -401,11 +411,27 @@ class Coregionalize_weave_test(unittest.TestCase):
         GPy.util.config.config.set('weave', 'working', 'False')

+class KernelTestsProductWithZeroValues(unittest.TestCase):
+    def setUp(self):
+        self.X = np.array([[0,1],[1,0]])
+        self.k = GPy.kern.Linear(2) * GPy.kern.Bias(2)
+
+    def test_zero_valued_kernel_full(self):
+        self.k.update_gradients_full(1, self.X)
+        self.assertFalse(np.isnan(self.k['linear.variances'].gradient),
+                         "Gradient resulted in NaN")
+
+    def test_zero_valued_kernel_gradients_X(self):
+        target = self.k.gradients_X(1, self.X)
+        self.assertFalse(np.any(np.isnan(target)),
+                         "Gradient resulted in NaN")
+
 if __name__ == "__main__":
     print "Running unit tests, please be (very) patient..."
     unittest.main()

 # np.random.seed(0)
 # N0 = 3
 # N1 = 9

View file

@@ -0,0 +1,72 @@
+# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import unittest
+import numpy as np
+import GPy
+
+class MappingGradChecker(GPy.core.Model):
+    """
+    This class has everything we need to check the gradient of a mapping. It
+    implements a simple likelihood which is a weighted sum of the outputs of
+    the mapping. The gradients are checked against the parameters of the
+    mapping and the input.
+    """
+    def __init__(self, mapping, X, name='map_grad_check'):
+        super(MappingGradChecker, self).__init__(name)
+        self.mapping = mapping
+        self.link_parameter(self.mapping)
+        self.X = GPy.core.Param('X', X)
+        self.link_parameter(self.X)
+        self.dL_dY = np.random.randn(self.X.shape[0], self.mapping.output_dim)
+
+    def log_likelihood(self):
+        return np.sum(self.mapping.f(self.X) * self.dL_dY)
+
+    def parameters_changed(self):
+        self.X.gradient = self.mapping.gradients_X(self.dL_dY, self.X)
+        self.mapping.update_gradients(self.dL_dY, self.X)
+
+class MappingTests(unittest.TestCase):
+
+    def test_kernelmapping(self):
+        X = np.random.randn(100,3)
+        Z = np.random.randn(10,3)
+        mapping = GPy.mappings.Kernel(3, 2, Z, GPy.kern.RBF(3))
+        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+
+    def test_linearmapping(self):
+        mapping = GPy.mappings.Linear(3, 2)
+        X = np.random.randn(100,3)
+        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+
+    def test_mlpmapping(self):
+        mapping = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
+        X = np.random.randn(100,3)
+        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+
+    def test_addmapping(self):
+        m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
+        m2 = GPy.mappings.Linear(input_dim=3, output_dim=2)
+        mapping = GPy.mappings.Additive(m1, m2)
+        X = np.random.randn(100,3)
+        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+
+    def test_compoundmapping(self):
+        m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
+        Z = np.random.randn(10,2)
+        m2 = GPy.mappings.Kernel(2, 4, Z, GPy.kern.RBF(2))
+        mapping = GPy.mappings.Compound(m1, m2)
+        X = np.random.randn(100,3)
+        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+
+if __name__ == "__main__":
+    print "Running unit tests, please be (very) patient..."
+    unittest.main()

View file

@@ -32,3 +32,23 @@ class SVGP_classification(np.testing.TestCase):
         self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k)

     def test_grad(self):
         assert self.m.checkgrad(step=1e-4)

+class SVGP_Poisson_with_meanfunction(np.testing.TestCase):
+    """
+    Inference in the SVGP with a Poisson likelihood and a linear mean function
+    """
+    def setUp(self):
+        X = np.linspace(0,10,100).reshape(-1,1)
+        Z = np.linspace(0,10,10).reshape(-1,1)
+        latent_f = np.exp(0.1*X * 0.05*X**2)
+        Y = np.array([np.random.poisson(f) for f in latent_f.flatten()]).reshape(-1,1)
+        mf = GPy.mappings.Linear(1,1)
+        lik = GPy.likelihoods.Poisson()
+        k = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)
+        self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k, mean_function=mf)
+
+    def test_grad(self):
+        assert self.m.checkgrad(step=1e-4)

View file

@@ -96,16 +96,21 @@ def jitchol(A, maxtries=5):
         num_tries = 1
         while num_tries <= maxtries and np.isfinite(jitter):
             try:
+                print jitter
                 L = linalg.cholesky(A + np.eye(A.shape[0]) * jitter, lower=True)
+                logging.warning('Added {} rounds of jitter, jitter of {:.10e}\n'.format(num_tries, jitter))
                 return L
             except:
                 jitter *= 10
+            finally:
                 num_tries += 1
-        raise linalg.LinAlgError, "not positive definite, even with jitter."
         import traceback
-        logging.warning('\n'.join(['Added {} rounds of jitter, jitter of {:.10e}'.format(num_tries-1, jitter),
-                                   ' in '+traceback.format_list(traceback.extract_stack(limit=2)[-2:-1])[0][2:]]))
-        raise linalg.LinAlgError, "not positive definite, even with jitter."
+        try: raise
+        except:
+            logging.warning('\n'.join(['Added jitter of {:.10e}'.format(jitter),
+                                       ' in '+traceback.format_list(traceback.extract_stack(limit=2)[-2:-1])[0][2:]]))
+            import ipdb;ipdb.set_trace()
+            return L
# def dtrtri(L, lower=1):
#     """

View file

@@ -1,17 +1,18 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
+from GPy.core.parameterization import Parameterized, Param
+from ..core.parameterization.transformations import Logexp

-class WarpingFunction(object):
+class WarpingFunction(Parameterized):
     """
     abstract function for warping
     z = f(y)
     """
-    def __init__(self):
-        raise NotImplementedError
+    def __init__(self, name):
+        super(WarpingFunction, self).__init__(name=name)

     def f(self,y,psi):
         """function transformation
@@ -34,9 +35,10 @@ class WarpingFunction(object):
     def _get_param_names(self):
         raise NotImplementedError

-    def plot(self, psi, xmin, xmax):
+    def plot(self, xmin, xmax):
+        psi = self.psi
         y = np.arange(xmin, xmax, 0.01)
-        f_y = self.f(y, psi)
+        f_y = self.f(y)
         from matplotlib import pyplot as plt
         plt.figure()
         plt.plot(y, f_y)
@@ -50,6 +52,7 @@ class TanhWarpingFunction(WarpingFunction):
         """n_terms specifies the number of tanh terms to be used"""
         self.n_terms = n_terms
         self.num_parameters = 3 * self.n_terms
+        super(TanhWarpingFunction, self).__init__(name='warp_tanh')

     def f(self,y,psi):
         """
@@ -163,8 +166,18 @@ class TanhWarpingFunction_d(WarpingFunction):
         """n_terms specifies the number of tanh terms to be used"""
         self.n_terms = n_terms
         self.num_parameters = 3 * self.n_terms + 1
+        self.psi = np.ones((self.n_terms, 3))

-    def f(self,y,psi):
+        super(TanhWarpingFunction_d, self).__init__(name='warp_tanh')
+        self.psi = Param('psi', self.psi)
+        self.psi[:, :2].constrain_positive()
+        self.d = Param('%s' % ('d'), 1.0, Logexp())
+        self.link_parameter(self.psi)
+        self.link_parameter(self.d)
+
+    def f(self,y):
         """
         Transform y with f using parameter vector psi
         psi = [[a,b,c]]
@@ -175,9 +188,9 @@ class TanhWarpingFunction_d(WarpingFunction):
         #1. check that number of params is consistent
         # assert psi.shape[0] == self.n_terms, 'inconsistent parameter dimensions'
         # assert psi.shape[1] == 4, 'inconsistent parameter dimensions'
-        mpsi = psi.copy()
-        d = psi[-1]
-        mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)
+        d = self.d
+        mpsi = self.psi

         #3. transform data
         z = d*y.copy()
@@ -187,7 +200,7 @@ class TanhWarpingFunction_d(WarpingFunction):
         return z

-    def f_inv(self, z, psi, max_iterations=1000, y=None):
+    def f_inv(self, z, max_iterations=1000, y=None):
         """
         calculate the numerical inverse of f
@@ -198,12 +211,12 @@ class TanhWarpingFunction_d(WarpingFunction):
         z = z.copy()
         if y is None:
             y = np.ones_like(z)

         it = 0
         update = np.inf
         while it == 0 or (np.abs(update).sum() > 1e-10 and it < max_iterations):
-            update = (self.f(y, psi) - z)/self.fgrad_y(y, psi)
+            update = (self.f(y) - z)/self.fgrad_y(y)
             y -= update
             it += 1
         if it == max_iterations:
@@ -212,7 +225,7 @@ class TanhWarpingFunction_d(WarpingFunction):
         return y

-    def fgrad_y(self, y, psi, return_precalc = False):
+    def fgrad_y(self, y, return_precalc = False):
         """
         gradient of f w.r.t to y ([N x 1])
@@ -221,9 +234,8 @@ class TanhWarpingFunction_d(WarpingFunction):
         """
-        mpsi = psi.copy()
-        d = psi[-1]
-        mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)
+        d = self.d
+        mpsi = self.psi

         # vectorized version
@@ -240,7 +252,7 @@ class TanhWarpingFunction_d(WarpingFunction):
         return GRAD

-    def fgrad_y_psi(self, y, psi, return_covar_chain = False):
+    def fgrad_y_psi(self, y, return_covar_chain = False):
         """
         gradient of f w.r.t to y and psi
@@ -248,10 +260,10 @@ class TanhWarpingFunction_d(WarpingFunction):
         """
-        mpsi = psi.copy()
-        mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)
-        w, s, r, d = self.fgrad_y(y, psi, return_precalc = True)
+        mpsi = self.psi
+        w, s, r, d = self.fgrad_y(y, return_precalc = True)

         gradients = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4))
         for i in range(len(mpsi)):