diff --git a/GPy/__init__.py b/GPy/__init__.py
index 1dc3e6b1..9ee3d935 100644
--- a/GPy/__init__.py
+++ b/GPy/__init__.py
@@ -14,6 +14,8 @@ from . import testing
 from . import kern
 from . import plotting
 
+from .util import normalizer
+
 # backwards compatibility
 import sys
 backwards_compatibility = ['lists_and_dicts', 'observable_array', 'ties_and_remappings', 'index_operations']
diff --git a/GPy/__version__.py b/GPy/__version__.py
index a955fdae..3e8d9f94 100644
--- a/GPy/__version__.py
+++ b/GPy/__version__.py
@@ -1 +1 @@
-__version__ = "1.2.1"
+__version__ = "1.4.0"
diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 9cbe3daf..f0910d9d 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -2,17 +2,17 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
-from .. import kern
-from GPy.core.model import Model
-from paramz import ObsAr
+from .model import Model
+from .parameterization.variational import VariationalPosterior
 from .mapping import Mapping
 from .. import likelihoods
+from .. import kern
 from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
-from GPy.core.parameterization.variational import VariationalPosterior
+from ..util.normalizer import Standardize
+from paramz import ObsAr
 
 import logging
 import warnings
-from GPy.util.normalizer import MeanNorm
 logger = logging.getLogger("GP")
 
 class GP(Model):
@@ -28,7 +28,7 @@ class GP(Model):
     :param Norm normalizer:
         normalize the outputs Y.
         Prediction will be un-normalized using this normalizer.
-        If normalizer is None, we will normalize using MeanNorm.
+        If normalizer is None, we will normalize using Standardize.
         If normalizer is False, no normalization will be done.
 
     .. Note:: Multiple independent outputs are allowed using columns of Y
@@ -49,7 +49,7 @@ class GP(Model):
         logger.info("initializing Y")
 
         if normalizer is True:
-            self.normalizer = MeanNorm()
+            self.normalizer = Standardize()
         elif normalizer is False:
             self.normalizer = None
         else:
@@ -64,6 +64,7 @@ class GP(Model):
             self.Y_normalized = self.Y
         else:
             self.Y = Y
+            self.Y_normalized = self.Y
 
         if Y.shape[0] != self.num_data:
             #There can be cases where we want inputs than outputs, for example if we have multiple latent
@@ -248,14 +249,16 @@ class GP(Model):
         """
         #predict the latent function values
         mu, var = self._raw_predict(Xnew, full_cov=full_cov, kern=kern)
-        if self.normalizer is not None:
-            mu, var = self.normalizer.inverse_mean(mu), self.normalizer.inverse_variance(var)
 
         if include_likelihood:
             # now push through likelihood
             if likelihood is None:
                 likelihood = self.likelihood
             mu, var = likelihood.predictive_values(mu, var, full_cov, Y_metadata=Y_metadata)
+
+        if self.normalizer is not None:
+            mu, var = self.normalizer.inverse_mean(mu), self.normalizer.inverse_variance(var)
+
         return mu, var
 
     def predict_noiseless(self,  Xnew, full_cov=False, Y_metadata=None, kern=None):
@@ -300,11 +303,14 @@ class GP(Model):
         :rtype: [np.ndarray (Xnew x self.output_dim), np.ndarray (Xnew x self.output_dim)]
         """
         m, v = self._raw_predict(X,  full_cov=False, kern=kern)
-        if self.normalizer is not None:
-            m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
         if likelihood is None:
             likelihood = self.likelihood
-        return likelihood.predictive_quantiles(m, v, quantiles, Y_metadata=Y_metadata)
+            
+        quantiles = likelihood.predictive_quantiles(m, v, quantiles, Y_metadata=Y_metadata)
+        
+        if self.normalizer is not None:
+            quantiles = [self.normalizer.inverse_mean(q) for q in quantiles]
+        return quantiles
 
     def predictive_gradients(self, Xnew, kern=None):
         """
diff --git a/GPy/core/parameterization/priorizable.py b/GPy/core/parameterization/priorizable.py
index 1d1153c7..eb82e862 100644
--- a/GPy/core/parameterization/priorizable.py
+++ b/GPy/core/parameterization/priorizable.py
@@ -16,7 +16,7 @@ class Priorizable(Parameterizable):
 
     def __setstate__(self, state):
         super(Priorizable, self).__setstate__(state)
-        self._index_operations['priors'] = self.priors
+        #self._index_operations['priors'] = self.priors
 
 
     #===========================================================================
diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py
index 11734564..19919f97 100644
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@@ -550,3 +550,34 @@ def parametric_mean_function(max_iters=100, optimize=True, plot=True):
     return m
 
 
+def warped_gp_cubic_sine(max_iters=100):
+    """
+    A test replicating the cubic sine regression problem from
+    Snelson's paper.
+    """
+    X = (2 * np.pi) * np.random.random(151) - np.pi
+    Y = np.sin(X) + np.random.normal(0,0.2,151)
+    Y = np.array([np.power(abs(y),float(1)/3) * (1,-1)[y<0] for y in Y])
+    X = X[:, None]
+    Y = Y[:, None]
+
+    warp_k = GPy.kern.RBF(1)
+    warp_f = GPy.util.warping_functions.TanhFunction(n_terms=2)
+    warp_m = GPy.models.WarpedGP(X, Y, kernel=warp_k, warping_function=warp_f)
+    warp_m['.*\.d'].constrain_fixed(1.0)
+    m = GPy.models.GPRegression(X, Y)
+    m.optimize_restarts(parallel=False, robust=True, num_restarts=5, max_iters=max_iters)
+    warp_m.optimize_restarts(parallel=False, robust=True, num_restarts=5, max_iters=max_iters)
+    #m.optimize(max_iters=max_iters)
+    #warp_m.optimize(max_iters=max_iters)
+
+    print(warp_m)
+    print(warp_m['.*warp.*'])
+
+    warp_m.predict_in_warped_space = False
+    warp_m.plot(title="Warped GP - Latent space")
+    warp_m.predict_in_warped_space = True
+    warp_m.plot(title="Warped GP - Warped space")
+    m.plot(title="Standard GP")
+    warp_m.plot_warping()
+    pb.show()
diff --git a/GPy/kern/src/add.py b/GPy/kern/src/add.py
index 99fe809b..ca20e5a9 100644
--- a/GPy/kern/src/add.py
+++ b/GPy/kern/src/add.py
@@ -13,15 +13,21 @@ class Add(CombinationKernel):
     propagates gradients through.
 
     This kernel will take over the active dims of it's subkernels passed in.
+    
+    NOTE: The subkernels will be copies of the original kernels, to prevent 
+    unexpected behavior. 
     """
     def __init__(self, subkerns, name='sum'):
-        for i, kern in enumerate(subkerns[:]):
+        _newkerns = []
+        for kern in subkerns:
             if isinstance(kern, Add):
-                del subkerns[i]
-                for part in kern.parts[::-1]:
+                for part in kern.parts:
                     #kern.unlink_parameter(part)
-                    subkerns.insert(i, part.copy())
-        super(Add, self).__init__(subkerns, name)
+                    _newkerns.append(part.copy())
+            else:
+                _newkerns.append(kern.copy())
+                    
+        super(Add, self).__init__(_newkerns, name)
         self._exact_psicomp = self._check_exact_psicomp()
 
     def _check_exact_psicomp(self):
diff --git a/GPy/kern/src/static.py b/GPy/kern/src/static.py
index 5cf4a1c9..0ffd8112 100644
--- a/GPy/kern/src/static.py
+++ b/GPy/kern/src/static.py
@@ -135,9 +135,7 @@ class Bias(Static):
 
     def K(self, X, X2=None):
         shape = (X.shape[0], X.shape[0] if X2 is None else X2.shape[0])
-        ret = np.empty(shape, dtype=np.float64)
-        ret[:] = self.variance
-        return ret
+        return np.full(shape, self.variance, dtype=np.float64)
 
     def update_gradients_full(self, dL_dK, X, X2=None):
         self.variance.gradient = dL_dK.sum()
@@ -146,9 +144,7 @@ class Bias(Static):
         self.variance.gradient = dL_dKdiag.sum()
 
     def psi2(self, Z, variational_posterior):
-        ret = np.empty((Z.shape[0], Z.shape[0]), dtype=np.float64)
-        ret[:] = self.variance*self.variance*variational_posterior.shape[0]
-        return ret
+        return np.full((Z.shape[0], Z.shape[0]), self.variance*self.variance*variational_posterior.shape[0], dtype=np.float64)
 
     def psi2n(self, Z, variational_posterior):
         ret = np.empty((variational_posterior.mean.shape[0], Z.shape[0], Z.shape[0]), dtype=np.float64)
diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py
index 654b1938..6d19d8a5 100644
--- a/GPy/models/__init__.py
+++ b/GPy/models/__init__.py
@@ -26,3 +26,5 @@ from .dpgplvm import DPBayesianGPLVM
 from .state_space_model import StateSpace
 
 from .ibp_lfm import IBPLFM
+
+from .gp_offset_regression import GPOffsetRegression
diff --git a/GPy/models/gp_offset_regression.py b/GPy/models/gp_offset_regression.py
new file mode 100644
index 00000000..b843e08d
--- /dev/null
+++ b/GPy/models/gp_offset_regression.py
@@ -0,0 +1,95 @@
+# Copyright (c) 2012 - 2014 the GPy Austhors (see AUTHORS.txt)
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+# Written by Mike Smith. michaeltsmith.org.uk
+
+import numpy as np
+from ..core import GP
+from .. import likelihoods
+from .. import kern
+from ..core import Param
+
+class GPOffsetRegression(GP):
+    """
+    Gaussian Process model for offset regression
+
+    :param X: input observations, we assume for this class that this has one dimension of actual inputs and the last dimension should be the index of the cluster (so X should be Nx2)
+    :param Y: observed values (Nx1?)
+    :param kernel: a GPy kernel, defaults to rbf
+    :param Norm normalizer: [False]
+    :param noise_var: the noise variance for Gaussian likelhood, defaults to 1.
+
+        Normalize Y with the norm given.
+        If normalizer is False, no normalization will be done
+        If it is None, we use GaussianNorm(alization)
+
+    .. Note:: Multiple independent outputs are allowed using columns of Y
+
+    """
+
+    def __init__(self, X, Y, kernel=None, Y_metadata=None, normalizer=None, noise_var=1., mean_function=None):
+
+        assert X.shape[1]>1, "Need at least two input dimensions, as last dimension is the label of the cluster"
+        if kernel is None:
+            kernel = kern.RBF(X.shape[1]-1)
+
+        #self._log_marginal_likelihood = np.nan #todo
+        
+        likelihood = likelihoods.Gaussian(variance=noise_var)
+        self.X_fixed = X[:,:-1]
+        self.selected = np.array([int(x) for x in X[:,-1]])
+        
+        
+        super(GPOffsetRegression, self).__init__(X, Y, kernel, likelihood, name='GP offset regression', Y_metadata=Y_metadata, normalizer=normalizer, mean_function=mean_function)
+        maxcluster = np.max(self.selected)
+        self.offset = Param('offset', np.zeros(maxcluster))
+        #self.offset.set_prior(...)
+        self.link_parameter(self.offset)
+        
+    #def dr_doffset(self, X, sel): #how much r changes wrt the offset hyperparameters
+        
+    #def dL_doffset(self, X, sel):
+    #    dL_dr = self.dK_dr_via_X(X, X) * dL_dK
+                
+        
+    def dr_doffset(self,X,sel,delta):
+        #given an input matrix, X and the offsets (delta)
+        #finds dr/dDelta
+        #returns them as a list, one for each offset (delta).        
+        #get the input values
+
+        #a matrix G represents the effect of increasing the offset on the radius passed to the kernel for each input. For example
+        #what effect will increasing offset 4 have on the kernel output of inputs 5 and 8? Answer: Gs[4][5,8]... (positive or negative)
+        Gs = []
+        for i,d in enumerate(delta):
+            #X[sel==(i+1)]-=d
+            G = np.repeat(np.array(sel==(i+1))[:,None]*1,len(X),axis=1) - np.repeat(np.array(sel==(i+1))[None,:]*1,len(X),axis=0)
+            Gs.append(G)
+        #does subtracting the two Xs end up positive or negative (if negative we need to flip the sign in G).
+        w = np.repeat(X,len(X),axis=1) - np.repeat(X.T,len(X),axis=0)
+        dr_doffsets = []
+        for i,d in enumerate(delta):
+            dr_doffset = np.sign(w * Gs[i])
+            #print "dr_doffset %d" % i
+            #print dr_doffset
+            #print Gs[i]
+            #print w
+            dr_doffsets.append(dr_doffset)
+            
+        #lastly we need to divide by the lengthscale: So far we've found d(X_i - X_j)/dOffsets
+        #we want dr/dOffsets. (X_i - X_j)/lengthscale = r
+        dr_doffsets /= self.kern.lengthscale 
+        return dr_doffsets
+        
+    def parameters_changed(self):
+        offsets = np.hstack([0.0,self.offset.values])[:,None]
+        
+        self.X = self.X_fixed - offsets[self.selected]
+        super(GPOffsetRegression, self).parameters_changed()
+        
+        dL_dr = self.kern.dK_dr_via_X(self.X, self.X) * self.grad_dict['dL_dK']
+        
+        dr_doff = self.dr_doffset(self.X,self.selected,self.offset.values)
+        for i in range(len(dr_doff)):
+            dL_doff = dL_dr * dr_doff[i]
+            self.offset.gradient[i] = -np.sum(dL_doff)
+        
diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index df947d3e..a24401fe 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -2,65 +2,56 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
-from ..util.warping_functions import *
+#from ..util.warping_functions import *
 from ..core import GP
 from .. import likelihoods
-from GPy.util.warping_functions import TanhWarpingFunction_d
+from paramz import ObsAr
+#from GPy.util.warping_functions import TanhFunction
+from ..util.warping_functions import TanhFunction
 from GPy import kern
 
 class WarpedGP(GP):
-    def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3):
-
+    """
+    This defines a GP Regression model that applies a 
+    warping function to the output.
+    """
+    def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3, normalizer=False):
         if kernel is None:
             kernel = kern.RBF(X.shape[1])
-
         if warping_function == None:
-            self.warping_function = TanhWarpingFunction_d(warping_terms)
+            self.warping_function = TanhFunction(warping_terms)
             self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1) * 1)
         else:
             self.warping_function = warping_function
-
-        self.scale_data = False
-        if self.scale_data:
-            Y = self._scale_data(Y)
-        #self.has_uncertain_inputs = False
-        self.Y_untransformed = Y.copy()
-        self.predict_in_warped_space = True
         likelihood = likelihoods.Gaussian()
-
-        GP.__init__(self, X, self.transform_data(), likelihood=likelihood, kernel=kernel)
+        super(WarpedGP, self).__init__(X, Y.copy(), likelihood=likelihood, kernel=kernel, normalizer=normalizer)
+        self.Y_normalized = self.Y_normalized.copy()
+        self.Y_untransformed = self.Y_normalized.copy()
+        self.predict_in_warped_space = True
         self.link_parameter(self.warping_function)
 
-    def _scale_data(self, Y):
-        self._Ymax = Y.max()
-        self._Ymin = Y.min()
-        return (Y - self._Ymin) / (self._Ymax - self._Ymin) - 0.5
-
-    def _unscale_data(self, Y):
-        return (Y + 0.5) * (self._Ymax - self._Ymin) + self._Ymin
+    def set_XY(self, X=None, Y=None):
+        super(WarpedGP, self).set_XY(X, Y)
+        self.Y_untransformed = self.Y_normalized.copy()
+        self.update_model(True)
 
     def parameters_changed(self):
-        self.Y[:] = self.transform_data()
+        """
+        Notice that we update the warping function gradients here.
+        """
+        self.Y_normalized[:] = self.transform_data()
         super(WarpedGP, self).parameters_changed()
-
         Kiy = self.posterior.woodbury_vector.flatten()
-
-        grad_y = self.warping_function.fgrad_y(self.Y_untransformed)
-        grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed,
-                                                                 return_covar_chain=True)
-        djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
-        dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
-
-        warping_grads = -dquad_dpsi + djac_dpsi
-
-        self.warping_function.psi.gradient[:] = warping_grads[:, :-1]
-        self.warping_function.d.gradient[:] = warping_grads[0, -1]
+        self.warping_function.update_grads(self.Y_untransformed, Kiy)
 
     def transform_data(self):
         Y = self.warping_function.f(self.Y_untransformed.copy()).copy()
         return Y
 
     def log_likelihood(self):
+        """
+        Notice we add the jacobian of the warping function here.
+        """
         ll = GP.log_likelihood(self)
         jacobian = self.warping_function.fgrad_y(self.Y_untransformed)
         return ll + np.log(jacobian).sum()
@@ -73,36 +64,42 @@ class WarpedGP(GP):
         arg2 = np.ones(shape=gh_samples.shape).dot(mean.T)
         return self.warping_function.f_inv(arg1 + arg2, y=pred_init)
 
-    def _get_warped_mean(self, mean, std, pred_init=None, deg_gauss_hermite=100):
+    def _get_warped_mean(self, mean, std, pred_init=None, deg_gauss_hermite=20):
         """
         Calculate the warped mean by using Gauss-Hermite quadrature.
         """
         gh_samples, gh_weights = np.polynomial.hermite.hermgauss(deg_gauss_hermite)
-        gh_samples = gh_samples[:,None]
-        gh_weights = gh_weights[None,:]
+        gh_samples = gh_samples[:, None]
+        gh_weights = gh_weights[None, :]
         return gh_weights.dot(self._get_warped_term(mean, std, gh_samples)) / np.sqrt(np.pi)
 
-    def _get_warped_variance(self, mean, std, pred_init=None, deg_gauss_hermite=100):
+    def _get_warped_variance(self, mean, std, pred_init=None, deg_gauss_hermite=20):
         """
         Calculate the warped variance by using Gauss-Hermite quadrature.
         """
         gh_samples, gh_weights = np.polynomial.hermite.hermgauss(deg_gauss_hermite)
-        gh_samples = gh_samples[:,None]
-        gh_weights = gh_weights[None,:]
+        gh_samples = gh_samples[:, None]
+        gh_weights = gh_weights[None, :]
         arg1 = gh_weights.dot(self._get_warped_term(mean, std, gh_samples, 
                                                     pred_init=pred_init) ** 2) / np.sqrt(np.pi)
         arg2 = self._get_warped_mean(mean, std, pred_init=pred_init,
                                      deg_gauss_hermite=deg_gauss_hermite)
         return arg1 - (arg2 ** 2)
 
-    def predict(self, Xnew, which_parts='all', pred_init=None, full_cov=False, Y_metadata=None,
-                median=False, deg_gauss_hermite=100):
-        # normalize X values
-        # Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
-        mu, var = GP._raw_predict(self, Xnew)
-
+    def predict(self, Xnew, kern=None, pred_init=None, Y_metadata=None,
+                median=False, deg_gauss_hermite=20, likelihood=None):
+        """
+        Prediction results depend on:
+        - The value of the self.predict_in_warped_space flag
+        - The median flag passed as argument
+        The likelihood keyword is never used, it is just to follow the plotting API.
+        """
+        #mu, var = GP._raw_predict(self, Xnew)
         # now push through likelihood
-        mean, var = self.likelihood.predictive_values(mu, var)
+        #mean, var = self.likelihood.predictive_values(mu, var)
+        
+        mean, var = super(WarpedGP, self).predict(Xnew, kern=kern, full_cov=False, likelihood=likelihood)
+
 
         if self.predict_in_warped_space:
             std = np.sqrt(var)
@@ -116,13 +113,9 @@ class WarpedGP(GP):
         else:
             wmean = mean
             wvar = var
-
-        if self.scale_data:
-            pred = self._unscale_data(pred)
-
         return wmean, wvar
 
-    def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None):
+    def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None, likelihood=None, kern=None):
         """
         Get the predictive quantiles around the prediction at X
 
@@ -133,19 +126,38 @@ class WarpedGP(GP):
         :returns: list of quantiles for each X and predictive quantiles for interval combination
         :rtype: [np.ndarray (Xnew x self.input_dim), np.ndarray (Xnew x self.input_dim)]
         """
-        m, v = self._raw_predict(X,  full_cov=False)
-        if self.normalizer is not None:
-            m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
-        a, b = self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata)
-        #return [a, b]
-        if not self.predict_in_warped_space:
-            return [a, b]
-        #print a.shape
-        new_a = self.warping_function.f_inv(a)
-        new_b = self.warping_function.f_inv(b)
+        qs = super(WarpedGP, self).predict_quantiles(X, quantiles, Y_metadata=Y_metadata, likelihood=likelihood, kern=kern)
+        if self.predict_in_warped_space:
+            return [self.warping_function.f_inv(q) for q in qs]
+        return qs
+        #m, v = self._raw_predict(X,  full_cov=False)
+        #if self.normalizer is not None:
+        #    m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
+        #a, b = self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata)
+        #if not self.predict_in_warped_space:
+        #    return [a, b]
+        #new_a = self.warping_function.f_inv(a)
+        #new_b = self.warping_function.f_inv(b)
+        #return [new_a, new_b]
 
-        return [new_a, new_b]
-        #return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata)
+    def log_predictive_density(self, x_test, y_test, Y_metadata=None):
+        """
+        Calculation of the log predictive density. Notice we add
+        the jacobian of the warping function here.
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param x_test: test locations (x_{*})
+        :type x_test: (Nx1) array
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param Y_metadata: metadata associated with the test points
+        """
+        mu_star, var_star = self._raw_predict(x_test)
+        fy = self.warping_function.f(y_test)
+        ll_lpd = self.likelihood.log_predictive_density(fy, mu_star, var_star, Y_metadata=Y_metadata)
+        return ll_lpd + np.log(self.warping_function.fgrad_y(y_test))
 
 
 if __name__ == '__main__':
diff --git a/GPy/plotting/abstract_plotting_library.py b/GPy/plotting/abstract_plotting_library.py
index 64061b99..95377ed7 100644
--- a/GPy/plotting/abstract_plotting_library.py
+++ b/GPy/plotting/abstract_plotting_library.py
@@ -61,6 +61,8 @@ class AbstractPlottingLibrary(object):
         """
         Get a new figure with nrows and ncolumns subplots.
         Does not initialize the canvases yet.
+        
+        There is individual kwargs for the individual plotting libraries to use.
         """
         raise NotImplementedError("Implement all plot functions in AbstractPlottingLibrary in order to use your own plotting library")
     
diff --git a/GPy/plotting/gpy_plot/plot_util.py b/GPy/plotting/gpy_plot/plot_util.py
index 0d472d06..7bd1723f 100644
--- a/GPy/plotting/gpy_plot/plot_util.py
+++ b/GPy/plotting/gpy_plot/plot_util.py
@@ -31,6 +31,7 @@
 import numpy as np
 from scipy import sparse
 import itertools
+from ...models import WarpedGP
 
 def in_ipynb():
     try:
@@ -295,6 +296,10 @@ def get_x_y_var(model):
         Y = model.Y.values
     except AttributeError:
         Y = model.Y
+
+    if isinstance(model, WarpedGP) and not model.predict_in_warped_space:
+        Y = model.Y_normalized
+    
     if sparse.issparse(Y): Y = Y.todense().view(np.ndarray)
     return X, X_variance, Y
 
@@ -381,4 +386,4 @@ def x_frame2D(X,plot_limits=None,resolution=None):
     resolution = resolution or 50
     xx, yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution]
     Xnew = np.c_[xx.flat, yy.flat]
-    return Xnew, xx, yy, xmin, xmax
\ No newline at end of file
+    return Xnew, xx, yy, xmin, xmax
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index e4411e23..1e0731ab 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -91,18 +91,32 @@ class MiscTests(unittest.TestCase):
         k = GPy.kern.RBF(1)
         m2 = GPy.models.GPRegression(self.X, (Y-mu)/std, kernel=k, normalizer=False)
         m2[:] = m[:]
+        
         mu1, var1 = m.predict(m.X, full_cov=True)
         mu2, var2 = m2.predict(m2.X, full_cov=True)
         np.testing.assert_allclose(mu1, (mu2*std)+mu)
-        np.testing.assert_allclose(var1, var2)
+        np.testing.assert_allclose(var1, var2*std**2)
+        
         mu1, var1 = m.predict(m.X, full_cov=False)
         mu2, var2 = m2.predict(m2.X, full_cov=False)
+        
         np.testing.assert_allclose(mu1, (mu2*std)+mu)
-        np.testing.assert_allclose(var1, var2)
+        np.testing.assert_allclose(var1, var2*std**2)
 
         q50n = m.predict_quantiles(m.X, (50,))
         q50 = m2.predict_quantiles(m2.X, (50,))
+        
         np.testing.assert_allclose(q50n[0], (q50[0]*std)+mu)
+        
+        # Test variance component:
+        qs = np.array([2.5, 97.5])
+        # The quantiles get computed before unormalization
+        # And transformed using the mean transformation:
+        c = np.random.choice(self.X.shape[0])
+        q95 = m2.predict_quantiles(self.X[[c]], qs)
+        mu, var = m2.predict(self.X[[c]])
+        from scipy.stats import norm
+        np.testing.assert_allclose((mu+(norm.ppf(qs/100.)*np.sqrt(var))).flatten(), np.array(q95).flatten())
 
     def check_jacobian(self):
         try:
@@ -361,51 +375,95 @@ class MiscTests(unittest.TestCase):
         preds = m.predict(self.X)
 
         warp_k = GPy.kern.RBF(1)
-        warp_f = GPy.util.warping_functions.IdentityFunction()
-        warp_m = GPy.models.WarpedGP(self.X, self.Y, kernel=warp_k, warping_function=warp_f)
+        warp_f = GPy.util.warping_functions.IdentityFunction(closed_inverse=False)
+        warp_m = GPy.models.WarpedGP(self.X, self.Y, kernel=warp_k, 
+                                     warping_function=warp_f)
         warp_m.optimize()
         warp_preds = warp_m.predict(self.X)
 
-        np.testing.assert_almost_equal(preds, warp_preds)
+        warp_k_exact = GPy.kern.RBF(1)
+        warp_f_exact = GPy.util.warping_functions.IdentityFunction()
+        warp_m_exact = GPy.models.WarpedGP(self.X, self.Y, kernel=warp_k_exact, 
+                                           warping_function=warp_f_exact)
+        warp_m_exact.optimize()
+        warp_preds_exact = warp_m_exact.predict(self.X)
+ 
+        np.testing.assert_almost_equal(preds, warp_preds, decimal=4)
+        np.testing.assert_almost_equal(preds, warp_preds_exact, decimal=4)
 
-    @unittest.skip('Comment this to plot the modified sine function')
-    def test_warped_gp_sine(self):
+    def test_warped_gp_log(self):
         """
-        A test replicating the sine regression problem from
-        Snelson's paper.
+        A WarpedGP with the log warping function should be
+        equal to a standard GP with log labels.
+        Note that we predict the median here.
+        """
+        k = GPy.kern.RBF(1)
+        Y = np.abs(self.Y)
+        logY = np.log(Y)
+        m = GPy.models.GPRegression(self.X, logY, kernel=k)
+        m.optimize()
+        preds = m.predict(self.X)[0]
+
+        warp_k = GPy.kern.RBF(1)
+        warp_f = GPy.util.warping_functions.LogFunction(closed_inverse=False)
+        warp_m = GPy.models.WarpedGP(self.X, Y, kernel=warp_k, 
+                                     warping_function=warp_f)
+        warp_m.optimize()
+        warp_preds = warp_m.predict(self.X, median=True)[0]
+
+        warp_k_exact = GPy.kern.RBF(1)
+        warp_f_exact = GPy.util.warping_functions.LogFunction()
+        warp_m_exact = GPy.models.WarpedGP(self.X, Y, kernel=warp_k_exact, 
+                                           warping_function=warp_f_exact)
+        warp_m_exact.optimize(messages=True)
+        warp_preds_exact = warp_m_exact.predict(self.X, median=True)[0]
+ 
+        np.testing.assert_almost_equal(np.exp(preds), warp_preds, decimal=4)
+        np.testing.assert_almost_equal(np.exp(preds), warp_preds_exact, decimal=4)
+
+    def test_warped_gp_cubic_sine(self, max_iters=100):
+        """
+        A test replicating the cubic sine regression problem from
+        Snelson's paper. This test doesn't have any assertions, it's
+        just to ensure coverage of the tanh warping function code.
         """
         X = (2 * np.pi) * np.random.random(151) - np.pi
-        Y = np.sin(X) + np.random.normal(0,0.1,151)
-        Y = np.exp(Y) - 5
-        #Y = np.array([np.power(abs(y),float(1)/3) * (1,-1)[y<0] for y in Y]) + 0
+        Y = np.sin(X) + np.random.normal(0,0.2,151)
+        Y = np.array([np.power(abs(y),float(1)/3) * (1,-1)[y<0] for y in Y])
+        X = X[:, None]
+        Y = Y[:, None]
 
-        #np.seterr(over='raise')
-        import matplotlib.pyplot as plt
-        warp_k = GPy.kern.RBF(1)
-        warp_f = GPy.util.warping_functions.TanhWarpingFunction_d(n_terms=2)
-        warp_m = GPy.models.WarpedGP(X[:, None], Y[:, None], kernel=warp_k, warping_function=warp_f)
-        #warp_m['.*variance.*'].constrain_fixed(0.25)
-        #warp_m['.*lengthscale.*'].constrain_fixed(1)
-        #warp_m['warp_tanh.d'].constrain_fixed(1)
-        #warp_m.randomize()
-        #warp_m['.*warp_tanh.psi*'][:,0:2].constrain_bounded(0,100)
-        #warp_m['.*warp_tanh.psi*'][:,0:1].constrain_fixed(1)
-
-        #print(warp_m.checkgrad())
-        #warp_m.plot()
-        #plt.show()
-
-        warp_m.optimize_restarts(parallel=True, robust=True)
-        #print(warp_m.checkgrad())
-        print(warp_m)
-        print(warp_m['.*warp.*'])
+        warp_m = GPy.models.WarpedGP(X, Y)#, kernel=warp_k)#, warping_function=warp_f)
+        warp_m['.*\.d'].constrain_fixed(1.0)
+        warp_m.optimize_restarts(parallel=False, robust=False, num_restarts=5, 
+                                 max_iters=max_iters)
+        warp_m.predict(X)
+        warp_m.predict_quantiles(X)
+        warp_m.log_predictive_density(X, Y)
         warp_m.predict_in_warped_space = False
         warp_m.plot()
         warp_m.predict_in_warped_space = True
         warp_m.plot()
-        warp_f.plot(X.min()-10, X.max()+10)
-        plt.show()
+        
+    def test_offset_regression(self):
+        #Tests GPy.models.GPOffsetRegression. Using two small time series
+        #from a sine wave, we confirm the algorithm determines that the
+        #likelihood is maximised when the offset hyperparameter is approximately
+        #equal to the actual offset in X between the two time series.
+        offset = 3
+        X1 = np.arange(0,50,5.0)[:,None]
+        X2 = np.arange(0+offset,50+offset,5.0)[:,None]
+        X = np.vstack([X1,X2])
+        ind = np.vstack([np.zeros([10,1]),np.ones([10,1])])
+        X = np.hstack([X,ind])
+        Y = np.sin((X[0:10,0])/30.0)[:,None]
+        Y = np.vstack([Y,Y])
 
+        m = GPy.models.GPOffsetRegression(X,Y)
+        m.rbf.lengthscale=5.0 #make it something other than one to check our gradients properly!
+        assert m.checkgrad(), "Gradients of offset parameters don't match numerical approximations."
+        m.optimize()
+        assert np.abs(m.offset[0]-offset)<0.1, ("GPOffsetRegression model failing to estimate correct offset (value estimated = %0.2f instead of %0.2f)" % (m.offset[0], offset))
 
 
 class GradientTests(np.testing.TestCase):
diff --git a/GPy/testing/util_tests.py b/GPy/testing/util_tests.py
index 3c6241f3..ba3d7ddf 100644
--- a/GPy/testing/util_tests.py
+++ b/GPy/testing/util_tests.py
@@ -29,6 +29,7 @@
 #===============================================================================
 
 import unittest, numpy as np
+import GPy
 
 class TestDebug(unittest.TestCase):
     def test_checkFinite(self):
@@ -107,3 +108,50 @@ class TestDebug(unittest.TestCase):
         self.assertTrue(d[tuple(X[:,4])] == d[tuple(X[:,0])] == [0, 4, 5])
         self.assertTrue(d[tuple(X[:,1])] == [1])
 
+    def test_offset_cluster(self):
+        #Tests the GPy.util.cluster_with_offset.cluster utility with a small
+        #test data set. Not using random noise just in case it occasionally
+        #causes it not to cluster correctly.
+        #groundtruth cluster identifiers are: [0,1,1,0]
+        
+        #data contains a list of the four sets of time series (3 per data point)      
+
+        data = [np.array([[ 2.18094245,  1.96529789,  2.00265523,  2.18218742,  2.06795428],
+                [ 1.62254829,  1.75748448,  1.83879347,  1.87531326,  1.52503496],
+                [ 1.54589609,  1.61607914,  2.00463192,  1.48771394,  1.63339218]]),
+         np.array([[ 2.86766106,  2.97953437,  2.91958876,  2.92510506,  3.03239241],
+                [ 2.57368423,  2.59954886,  3.10000395,  2.75806125,  2.89865704],
+                [ 2.58916318,  2.53698259,  2.63858411,  2.63102504,  2.51853901]]),
+         np.array([[ 2.77834168,  2.9618564 ,  2.88482141,  3.24259745,  2.9716821 ],
+                [ 2.60675576,  2.67095624,  2.94824436,  2.80520631,  2.87247516],
+                [ 2.49543562,  2.5492281 ,  2.6505866 ,  2.65015308,  2.59738616]]),
+         np.array([[ 1.76783086,  2.21666738,  2.07939706,  1.9268263 ,  2.23360121],
+                [ 1.94305547,  1.94648592,  2.1278921 ,  2.09481457,  2.08575238],
+                [ 1.69336013,  1.72285186,  1.6339506 ,  1.61212022,  1.39198698]])]
+
+        #inputs contains their associated X values
+        
+        inputs = [np.array([[ 0.        ],
+                [ 0.68040097],
+                [ 1.20316795],
+                [ 1.798749  ],
+                [ 2.14891733]]), np.array([[ 0.        ],
+                [ 0.51910637],
+                [ 0.98259352],
+                [ 1.57442965],
+                [ 1.82515098]]), np.array([[ 0.        ],
+                [ 0.66645478],
+                [ 1.59464591],
+                [ 1.69769551],
+                [ 1.80932752]]), np.array([[ 0.        ],
+                [ 0.87512108],
+                [ 1.71881079],
+                [ 2.67162871],
+                [ 3.23761907]])]
+                    
+        #try doing the clustering
+        active = GPy.util.cluster_with_offset.cluster(data,inputs)
+        #check to see that the clustering has correctly clustered the time series.
+        clusters = set([frozenset(cluster) for cluster in active])
+        assert set([1,2]) in clusters, "Offset Clustering algorithm failed"
+        assert set([0,3]) in clusters, "Offset Clustering algoirthm failed"
diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py
index fb1eb0d6..685551fd 100644
--- a/GPy/util/__init__.py
+++ b/GPy/util/__init__.py
@@ -16,3 +16,4 @@ from . import initialization
 from . import multioutput
 from . import parallel
 from . import functions
+from . import cluster_with_offset
diff --git a/GPy/util/cluster_with_offset.py b/GPy/util/cluster_with_offset.py
new file mode 100644
index 00000000..d1e896f1
--- /dev/null
+++ b/GPy/util/cluster_with_offset.py
@@ -0,0 +1,184 @@
+# Copyright (c) 2016, Mike Smith
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import GPy
+import numpy as np
+import sys #so I can print dots
+
+def get_log_likelihood(inputs,data,clust):
+    """Get the LL of a combined set of clusters, ignoring time series offsets.
+    
+    Get the log likelihood of a cluster without worrying about the fact
+    different time series are offset. We're using it here really for those
+    cases in which we only have one cluster to get the loglikelihood of.
+    
+    arguments:
+    inputs -- the 'X's in a list, one item per cluster
+    data -- the 'Y's in a list, one item per cluster
+    clust -- list of clusters to use
+    
+    returns a tuple:
+    log likelihood and the offset (which is always zero for this model)
+    """
+ 
+    S = data[0].shape[0] #number of time series
+    
+    #build a new dataset from the clusters, by combining all clusters together
+    X = np.zeros([0,1])
+    Y = np.zeros([0,S])
+    
+    #for each person in the cluster,
+    #add their inputs and data to the new dataset
+    for p in clust:
+        X = np.vstack([X,inputs[p]])
+        Y = np.vstack([Y,data[p].T])
+        
+    #find the loglikelihood. We just add together the LL for each time series.
+    #ll=0
+    #for s in range(S):
+    #    m = GPy.models.GPRegression(X,Y[:,s][:,None])
+    #    m.optimize()
+    #    ll+=m.log_likelihood()
+
+    m = GPy.models.GPRegression(X,Y)
+    m.optimize()
+    ll=m.log_likelihood()    
+    return ll,0
+
+def get_log_likelihood_offset(inputs,data,clust):
+    """Get the log likelihood of a combined set of clusters, fitting the offsets
+    
+    arguments:
+    inputs -- the 'X's in a list, one item per cluster
+    data -- the 'Y's in a list, one item per cluster
+    clust -- list of clusters to use
+    
+    returns a tuple:
+    log likelihood and the offset
+    """    
+        
+    #if we've only got one cluster, the model has an error, so we want to just
+    #use normal GPRegression.
+    if len(clust)==1: 
+        return get_log_likelihood(inputs,data,clust)
+                
+    S = data[0].shape[0] #number of time series
+        
+    X = np.zeros([0,2]) #notice the extra column, this is for the cluster index
+    Y = np.zeros([0,S])
+    
+    #for each person in the cluster, add their inputs and data to the new
+    #dataset. Note we add an index identifying which person is which data point.
+    #This is for the offset model to use, to allow it to know which data points
+    #to shift.
+    for i,p in enumerate(clust):
+        idx = i*np.ones([inputs[p].shape[0],1])
+        X = np.vstack([X,np.hstack([inputs[p],idx])])
+        Y = np.vstack([Y,data[p].T])
+        
+    m = GPy.models.GPOffsetRegression(X,Y)
+    #TODO: How to select a sensible prior?
+    m.offset.set_prior(GPy.priors.Gaussian(0,20)) 
+    #TODO: Set a sensible start value for the length scale,
+    #make it long to help the offset fit.
+    
+    m.optimize()
+    
+    ll = m.log_likelihood()
+    offset = m.offset.values[0]
+    return ll,offset
+
+def cluster(data,inputs,verbose=False):
+    """Clusters data
+    
+    Using the new offset model, this method uses a greedy algorithm to cluster
+    the data. It starts with all the data points in separate clusters and tests
+    whether combining them increases the overall log-likelihood (LL). It then
+    iteratively joins pairs of clusters which cause the greatest increase in
+    the LL, until no join increases the LL.
+    
+    arguments:
+    inputs -- the 'X's in a list, one item per cluster
+    data -- the 'Y's in a list, one item per cluster
+    
+    returns a list of the clusters.    
+    """
+    N=len(data)
+    
+    
+    #Define a set of N active cluster
+    active = []
+    for p in range(0,N):
+        active.append([p])
+
+    loglikes = np.zeros(len(active))
+    loglikes[:] = None
+
+    pairloglikes = np.zeros([len(active),len(active)])
+    pairloglikes[:] = None
+    pairoffset = np.zeros([len(active),len(active)])
+
+    it = 0
+    while True:
+    
+        if verbose:
+            it +=1
+            print("Iteration %d" % it)
+        
+        #Compute the log-likelihood of each cluster (add them together)
+        for clusti in range(len(active)):
+            if verbose:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+            if np.isnan(loglikes[clusti]):
+                loglikes[clusti], unused_offset = get_log_likelihood_offset(inputs,data,[clusti])
+
+            #try combining with each other cluster...
+            for clustj in range(clusti): #count from 0 to clustj-1
+                temp = [clusti,clustj]
+                if np.isnan(pairloglikes[clusti,clustj]):
+                    pairloglikes[clusti,clustj],pairoffset[clusti,clustj] = get_log_likelihood_offset(inputs,data,temp)
+
+        seploglikes = np.repeat(loglikes[:,None].T,len(loglikes),0)+np.repeat(loglikes[:,None],len(loglikes),1)
+        loglikeimprovement = pairloglikes - seploglikes #how much likelihood improves with clustering
+        top = np.unravel_index(np.nanargmax(pairloglikes-seploglikes), pairloglikes.shape)
+
+        #if loglikeimprovement.shape[0]<3:
+        # #no more clustering to do - this shouldn't happen really unless
+        # #we've set the threshold to apply clustering to less than 0
+        #    break 
+        
+        #if theres further clustering to be done...
+        if loglikeimprovement[top[0],top[1]]>0:
+            active[top[0]].extend(active[top[1]])
+            offset=pairoffset[top[0],top[1]]
+            inputs[top[0]] = np.vstack([inputs[top[0]],inputs[top[1]]-offset])
+            data[top[0]] = np.hstack([data[top[0]],data[top[1]]])
+            del inputs[top[1]]
+            del data[top[1]]
+            del active[top[1]]
+
+            #None = message to say we need to recalculate
+            pairloglikes[:,top[0]] = None 
+            pairloglikes[top[0],:] = None 
+            pairloglikes = np.delete(pairloglikes,top[1],0)
+            pairloglikes = np.delete(pairloglikes,top[1],1)
+            loglikes[top[0]] = None
+            loglikes = np.delete(loglikes,top[1])
+        else:
+            break
+            
+        #if loglikeimprovement[top[0],top[1]]>0:
+        #    print "joined"
+        #    print top
+        #    print offset
+        #    print offsets
+        #    print offsets[top[1]]-offsets[top[0]]
+
+    #TODO Add a way to return the offsets applied to all the time series
+    return active 
+
+#starttime = time.time()
+#active = cluster(data,inputs)
+#endtime = time.time()
+#print "TOTAL TIME %0.4f" % (endtime-starttime)
diff --git a/GPy/util/normalizer.py b/GPy/util/normalizer.py
index 854726c4..78ad945d 100644
--- a/GPy/util/normalizer.py
+++ b/GPy/util/normalizer.py
@@ -6,7 +6,7 @@ Created on Aug 27, 2014
 import logging
 import numpy as np
 
-class Norm(object):
+class _Norm(object):
     def __init__(self):
         pass
     def scale_by(self, Y):
@@ -18,7 +18,8 @@ class Norm(object):
         """
         Project Y into normalized space
         """
-        raise NotImplementedError
+        if not self.scaled():
+            raise AttributeError("Norm object not initialized yet, try calling scale_by(data) first.")
     def inverse_mean(self, X):
         """
         Project the normalized object X into space of Y
@@ -31,15 +32,46 @@ class Norm(object):
         Whether this Norm object has been initialized.
         """
         raise NotImplementedError
-class MeanNorm(Norm):
+
+class Standardize(_Norm):
     def __init__(self):
         self.mean = None
     def scale_by(self, Y):
         Y = np.ma.masked_invalid(Y, copy=False)
         self.mean = Y.mean(0).view(np.ndarray)
+        self.std = Y.std(0).view(np.ndarray)
     def normalize(self, Y):
-        return Y-self.mean
+        super(Standardize, self).normalize(Y)
+        return (Y-self.mean)/self.std
     def inverse_mean(self, X):
-        return X+self.mean
+        return (X*self.std)+self.mean
+    def inverse_variance(self, var):
+        return (var*(self.std**2))
     def scaled(self):
         return self.mean is not None
+
+# Inverse variance to be implemented, disabling for now
+# If someone in the future want to implement this,
+# we need to implement the inverse variance for 
+# normalization. This means, we need to know the factor
+# for the variance to be multiplied to the variance in
+# normalized space. This is easy to compute for standardization
+# (see above) but gets tricky here.
+# class Normalize(_Norm):
+#     def __init__(self):
+#         self.ymin = None
+#         self.ymax = None
+#     def scale_by(self, Y):
+#         Y = np.ma.masked_invalid(Y, copy=False)
+#         self.ymin = Y.min(0).view(np.ndarray)
+#         self.ymax = Y.max(0).view(np.ndarray)
+#     def normalize(self, Y):
+#         super(Normalize, self).normalize(Y)
+#         return (Y - self.ymin) / (self.ymax - self.ymin) - .5
+#     def inverse_mean(self, X):
+#         return (X + .5) * (self.ymax - self.ymin) + self.ymin
+#     def inverse_variance(self, var):
+#         
+#         return (var*(self.std**2))
+#     def scaled(self):
+#         return (self.ymin is not None) and (self.ymax is not None)
\ No newline at end of file
diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index 1617283c..3af28d43 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -4,6 +4,7 @@
 import numpy as np
 from ..core.parameterization import Parameterized, Param
 from paramz.transformations import Logexp
+import sys
 
 
 class WarpingFunction(Parameterized):
@@ -14,6 +15,7 @@ class WarpingFunction(Parameterized):
 
     def __init__(self, name):
         super(WarpingFunction, self).__init__(name=name)
+        self.rate = 0.1
 
     def f(self, y, psi):
         """function transformation
@@ -29,15 +31,32 @@ class WarpingFunction(Parameterized):
         """gradient of f w.r.t to y"""
         raise NotImplementedError
 
-    def f_inv(self, z, psi):
-        """inverse function transformation"""
-        raise NotImplementedError
+    def f_inv(self, z, max_iterations=250, y=None):
+        """
+        Calculate the numerical inverse of f. This should be
+        overwritten for specific warping functions where the
+        inverse can be found in closed form.
 
-    def _get_param_names(self):
-        raise NotImplementedError
+        :param max_iterations: maximum number of N.R. iterations
+        """
+
+        z = z.copy()
+        y = np.ones_like(z)
+
+        it = 0
+        update = np.inf
+        while np.abs(update).sum() > 1e-10 and it < max_iterations:
+            fy = self.f(y)
+            fgrady = self.fgrad_y(y)
+            update = (fy - z) / fgrady
+            y -= self.rate * update
+            it += 1
+        #if it == max_iterations:
+        #    print("WARNING!!! Maximum number of iterations reached in f_inv ")
+        #    print("Sum of roots: %.4f" % np.sum(fy - z))
+        return y
 
     def plot(self, xmin, xmax):
-        psi = self.psi
         y = np.arange(xmin, xmax, 0.01)
         f_y = self.f(y)
         from matplotlib import pyplot as plt
@@ -49,183 +68,59 @@ class WarpingFunction(Parameterized):
         plt.show()
 
 
-class TanhWarpingFunction(WarpingFunction):
-
-    def __init__(self, n_terms=3):
-        """n_terms specifies the number of tanh terms to be used"""
-        self.n_terms = n_terms
-        self.num_parameters = 3 * self.n_terms
-        super(TanhWarpingFunction, self).__init__(name='warp_tanh')
-
-    def f(self, y, psi):
+class TanhFunction(WarpingFunction):
+    """
+    This is the function proposed in Snelson et al.:
+    A sum of tanh functions with linear trends outside
+    the range. Notice the term 'd', which scales the
+    linear trend.
+    """
+    def __init__(self, n_terms=3, initial_y=None):
         """
-        transform y with f using parameter vector psi
-        psi = [[a,b,c]]
-        ::math::`f = \\sum_{terms} a * tanh(b*(y+c))`
+        n_terms specifies the number of tanh terms to be used
         """
-        #1. check that number of params is consistent
-        assert psi.shape[0] == self.n_terms, 'inconsistent parameter dimensions'
-        assert psi.shape[1] == 3, 'inconsistent parameter dimensions'
-
-        #2. exponentiate the a and b (positive!)
-        mpsi = psi.copy()
-
-        #3. transform data
-        z = y.copy()
-        for i in range(len(mpsi)):
-            a,b,c = mpsi[i]
-            z += a*np.tanh(b*(y+c))
-        return z
-
-    def f_inv(self, y, psi, iterations=10):
-        """
-        calculate the numerical inverse of f
-        :param iterations: number of N.R. iterations
-        """
-
-        y = y.copy()
-        z = np.ones_like(y)
-        for i in range(iterations):
-            z -= (self.f(z, psi) - y)/self.fgrad_y(z,psi)
-        return z
-
-    def fgrad_y(self, y, psi, return_precalc=False):
-        """
-        gradient of f w.r.t to y ([N x 1])
-        returns: Nx1 vector of derivatives, unless return_precalc is true,
-        then it also returns the precomputed stuff
-        """
-
-        mpsi = psi.copy()
-
-        # vectorized version
-
-        # S = (mpsi[:,1]*(y + mpsi[:,2])).T
-        S = (mpsi[:,1]*(y[:,:,None] + mpsi[:,2])).T
-        R = np.tanh(S)
-        D = 1-R**2
-
-        # GRAD = (1+(mpsi[:,0:1]*mpsi[:,1:2]*D).sum(axis=0))[:,np.newaxis]
-        GRAD = (1+(mpsi[:,0:1][:,:,None]*mpsi[:,1:2][:,:,None]*D).sum(axis=0)).T
-
-        if return_precalc:
-            # return GRAD,S.sum(axis=1),R.sum(axis=1),D.sum(axis=1)
-            return GRAD, S, R, D
-
-        return GRAD
-
-    def fgrad_y_psi(self, y, psi, return_covar_chain=False):
-        """
-        gradient of f w.r.t to y and psi
-        returns: NxIx3 tensor of partial derivatives
-        """
-
-        # 1. exponentiate the a and b (positive!)
-        mpsi = psi.copy()
-        w, s, r, d = self.fgrad_y(y, psi, return_precalc = True)
-
-        gradients = np.zeros((y.shape[0], y.shape[1], len(mpsi), 3))
-        for i in range(len(mpsi)):
-            a,b,c = mpsi[i]
-            gradients[:,:,i,0] = (b*(1.0/np.cosh(s[i]))**2).T
-            gradients[:,:,i,1] = a*(d[i] - 2.0*s[i]*r[i]*(1.0/np.cosh(s[i]))**2).T
-            gradients[:,:,i,2] = (-2.0*a*(b**2)*r[i]*((1.0/np.cosh(s[i]))**2)).T
-
-        if return_covar_chain:
-            covar_grad_chain = np.zeros((y.shape[0], y.shape[1], len(mpsi), 3))
-
-            for i in range(len(mpsi)):
-                a,b,c = mpsi[i]
-                covar_grad_chain[:, :, i, 0] = (r[i]).T
-                covar_grad_chain[:, :, i, 1] = (a*(y + c) * ((1.0/np.cosh(s[i]))**2).T)
-                covar_grad_chain[:, :, i, 2] = a*b*((1.0/np.cosh(s[i]))**2).T
-
-            return gradients, covar_grad_chain
-
-        return gradients
-
-    def _get_param_names(self):
-        variables = ['a', 'b', 'c']
-        names = sum([['warp_tanh_%s_t%i' % (variables[n],q) for n in range(3)] for q in range(self.n_terms)],[])
-        return names
-
-
-class TanhWarpingFunction_d(WarpingFunction):
-
-    def __init__(self, n_terms=3):
-        """n_terms specifies the number of tanh terms to be used"""
         self.n_terms = n_terms
         self.num_parameters = 3 * self.n_terms + 1
         self.psi = np.ones((self.n_terms, 3))
-
-        super(TanhWarpingFunction_d, self).__init__(name='warp_tanh')
+        super(TanhFunction, self).__init__(name='warp_tanh')
         self.psi = Param('psi', self.psi)
         self.psi[:, :2].constrain_positive()
-
         self.d = Param('%s' % ('d'), 1.0, Logexp())
         self.link_parameter(self.psi)
         self.link_parameter(self.d)
+        self.initial_y = initial_y
 
     def f(self, y):
         """
         Transform y with f using parameter vector psi
         psi = [[a,b,c]]
 
-        :math:`f = \\sum_{terms} a * tanh(b*(y+c))`
+        :math:`f = (y * d) + \\sum_{terms} a * tanh(b *(y + c))`
         """
-        #1. check that number of params is consistent
-        # assert psi.shape[0] == self.n_terms, 'inconsistent parameter dimensions'
-        # assert psi.shape[1] == 4, 'inconsistent parameter dimensions'
-
         d = self.d
         mpsi = self.psi
-
-        #3. transform data
-        z = d*y.copy()
+        z = d * y.copy()
         for i in range(len(mpsi)):
-            a,b,c = mpsi[i]
-            z += a*np.tanh(b*(y+c))
+            a, b, c = mpsi[i]
+            z += a * np.tanh(b * (y + c))
         return z
 
-    def f_inv(self, z, max_iterations=1000, y=None):
-        """
-        calculate the numerical inverse of f
-
-        :param max_iterations: maximum number of N.R. iterations
-        """
-
-        z = z.copy()
-        if y is None:
-            y = np.ones_like(z)
-
-        it = 0
-        update = np.inf
-
-        while it == 0 or (np.abs(update).sum() > 1e-10 and it < max_iterations):
-            fy = self.f(y)
-            fgrady = self.fgrad_y(y)
-            update = (fy - z)/fgrady
-            y -= update
-            it += 1
-        if it == max_iterations:
-            print("WARNING!!! Maximum number of iterations reached in f_inv ")
-        return y
-
     def fgrad_y(self, y, return_precalc=False):
         """
         gradient of f w.r.t to y ([N x 1])
 
-        :returns: Nx1 vector of derivatives, unless return_precalc is true, then it also returns the precomputed stuff
+        :returns: Nx1 vector of derivatives, unless return_precalc is true, 
+        then it also returns the precomputed stuff
         """
         d = self.d
         mpsi = self.psi
 
         # vectorized version
-        S = (mpsi[:,1]*(y[:,:,None] + mpsi[:,2])).T
+        S = (mpsi[:,1] * (y[:,:,None] + mpsi[:,2])).T
         R = np.tanh(S)
-        D = 1-R**2
+        D = 1 - (R ** 2)
 
-        GRAD = (d + (mpsi[:,0:1][:,:,None]*mpsi[:,1:2][:,:,None]*D).sum(axis=0)).T
+        GRAD = (d + (mpsi[:,0:1][:,:,None] * mpsi[:,1:2][:,:,None] * D).sum(axis=0)).T
 
         if return_precalc:
             return GRAD, S, R, D
@@ -244,45 +139,79 @@ class TanhWarpingFunction_d(WarpingFunction):
         gradients = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4))
         for i in range(len(mpsi)):
             a,b,c  = mpsi[i]
-            gradients[:,:,i,0] = (b*(1.0/np.cosh(s[i]))**2).T
-            gradients[:,:,i,1] = a*(d[i] - 2.0*s[i]*r[i]*(1.0/np.cosh(s[i]))**2).T
-            gradients[:,:,i,2] = (-2.0*a*(b**2)*r[i]*((1.0/np.cosh(s[i]))**2)).T
-        gradients[:,:,0,3] = 1.0
+            gradients[:, :, i, 0] = (b * (1.0/np.cosh(s[i])) ** 2).T
+            gradients[:, :, i, 1] = a * (d[i] - 2.0 * s[i] * r[i] * (1.0/np.cosh(s[i])) ** 2).T
+            gradients[:, :, i, 2] = (-2.0 * a * (b ** 2) * r[i] * ((1.0 / np.cosh(s[i])) ** 2)).T
+        gradients[:, :, 0, 3] = 1.0
 
         if return_covar_chain:
             covar_grad_chain = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4))
-
             for i in range(len(mpsi)):
                 a,b,c = mpsi[i]
                 covar_grad_chain[:, :, i, 0] = (r[i]).T
-                covar_grad_chain[:, :, i, 1] = (a*(y + c) * ((1.0/np.cosh(s[i]))**2).T)
-                covar_grad_chain[:, :, i, 2] = a*b*((1.0/np.cosh(s[i]))**2).T
+                covar_grad_chain[:, :, i, 1] = (a * (y + c) * ((1.0 / np.cosh(s[i])) ** 2).T)
+                covar_grad_chain[:, :, i, 2] = a * b * ((1.0 / np.cosh(s[i])) ** 2).T
             covar_grad_chain[:, :, 0, 3] = y
-
             return gradients, covar_grad_chain
 
         return gradients
 
-    def _get_param_names(self):
-        variables = ['a', 'b', 'c', 'd']
-        names = sum([['warp_tanh_%s_t%i' % (variables[n],q) for n in range(3)] for q in range(self.n_terms)],[])
-        names.append('warp_tanh_d')
-        return names
+    def update_grads(self, Y_untransformed, Kiy):
+        grad_y = self.fgrad_y(Y_untransformed)
+        grad_y_psi, grad_psi = self.fgrad_y_psi(Y_untransformed,
+                                                return_covar_chain=True)
+        djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
+        dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
+
+        warping_grads = -dquad_dpsi + djac_dpsi
+
+        self.psi.gradient[:] = warping_grads[:, :-1]
+        self.d.gradient[:] = warping_grads[0, -1]
+
+
+class LogFunction(WarpingFunction):
+    """
+    Easy wrapper for applying a fixed log warping function to
+    positive-only values.
+    The closed_inverse flag should only be set to False for
+    debugging and testing purposes.
+    """
+    def __init__(self, closed_inverse=True):
+        self.num_parameters = 0
+        super(LogFunction, self).__init__(name='log')
+        if closed_inverse:
+            self.f_inv = self._f_inv
+
+    def f(self, y):
+        return np.log(y)
+
+    def fgrad_y(self, y):
+        return 1. / y
+
+    def update_grads(self, Y_untransformed, Kiy):
+        pass
+
+    def fgrad_y_psi(self, y, return_covar_chain=False):
+        if return_covar_chain:
+            return 0, 0
+        return 0
+
+    def _f_inv(self, z, y=None):
+        return np.exp(z)
 
 
 class IdentityFunction(WarpingFunction):
     """
     Identity warping function. This is for testing and sanity check purposes
     and should not be used in practice.
+    The closed_inverse flag should only be set to False for
+    debugging and testing purposes.
     """
-    def __init__(self):
-        self.num_parameters = 4
-        self.psi = Param('psi', np.zeros((1,3)))
-        self.d = Param('%s' % ('d'), 1.0, Logexp())
+    def __init__(self, closed_inverse=True):
+        self.num_parameters = 0
         super(IdentityFunction, self).__init__(name='identity')
-        self.link_parameter(self.psi)
-        self.link_parameter(self.d)
-
+        if closed_inverse:
+            self.f_inv = self._f_inv
         
     def f(self, y):
         return y
@@ -290,11 +219,14 @@ class IdentityFunction(WarpingFunction):
     def fgrad_y(self, y):
         return np.ones(y.shape)
 
+    def update_grads(self, Y_untransformed, Kiy):
+        pass
+
     def fgrad_y_psi(self, y, return_covar_chain=False):
-        gradients = np.zeros((y.shape[0], y.shape[1], len(self.psi), 4))
         if return_covar_chain:
-            return gradients, gradients
-        return gradients
+            return 0, 0
+        return 0
         
-    def f_inv(self, z, y=None):
+    def _f_inv(self, z, y=None):
         return z
+
diff --git a/README.md b/README.md
index e0e715a9..513f2704 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ The Gaussian processes framework in Python.
 * Travis-CI [unit-tests](https://travis-ci.org/SheffieldML/GPy)
 * [![licence](https://img.shields.io/badge/licence-BSD-blue.svg)](http://opensource.org/licenses/BSD-3-Clause)
 
-[![develstat](https://travis-ci.org/SheffieldML/GPy.svg?branch=devel)](https://travis-ci.org/SheffieldML/GPy) [![appveyor](https://ci.appveyor.com/api/projects/status/662o6tha09m2jix3/branch/deploy?svg=true)](https://ci.appveyor.com/project/mzwiessele/gpy/branch/deploy) [![coverallsdevel](https://coveralls.io/repos/github/SheffieldML/GPy/badge.svg?branch=devel)](https://coveralls.io/github/SheffieldML/GPy?branch=devel) [![covdevel](http://codecov.io/github/SheffieldML/GPy/coverage.svg?branch=devel)](http://codecov.io/github/SheffieldML/GPy?branch=devel) [![Research software impact](http://depsy.org/api/package/pypi/GPy/badge.svg)](http://depsy.org/package/python/GPy) [![Code Health](https://landscape.io/github/SheffieldML/GPy/devel/landscape.svg?style=flat)](https://landscape.io/github/SheffieldML/GPy/devel)
+[![deploystat](https://travis-ci.org/SheffieldML/GPy.svg?branch=deploy)](https://travis-ci.org/SheffieldML/GPy) [![appveyor](https://ci.appveyor.com/api/projects/status/662o6tha09m2jix3/branch/deploy?svg=true)](https://ci.appveyor.com/project/mzwiessele/gpy/branch/deploy) [![coverallsdevel](https://coveralls.io/repos/github/SheffieldML/GPy/badge.svg?branch=devel)](https://coveralls.io/github/SheffieldML/GPy?branch=devel) [![covdevel](http://codecov.io/github/SheffieldML/GPy/coverage.svg?branch=devel)](http://codecov.io/github/SheffieldML/GPy?branch=devel) [![Research software impact](http://depsy.org/api/package/pypi/GPy/badge.svg)](http://depsy.org/package/python/GPy) [![Code Health](https://landscape.io/github/SheffieldML/GPy/devel/landscape.svg?style=flat)](https://landscape.io/github/SheffieldML/GPy/devel)
 
 ## Updated Structure
 
diff --git a/appveyor.yml b/appveyor.yml
index f7f7b7ab..ed0ec0d6 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -3,7 +3,7 @@ environment:
     secure: 8/ZjXFwtd1S7ixd7PJOpptupKKEDhm2da/q3unabJ00=
   COVERALLS_REPO_TOKEN:
     secure: d3Luic/ESkGaWnZrvWZTKrzO+xaVwJWaRCEP0F+K/9DQGPSRZsJ/Du5g3s4XF+tS
-  gpy_version: 1.2.1
+  gpy_version: 1.4.0
   matrix:
     - PYTHON_VERSION: 2.7
       MINICONDA: C:\Miniconda-x64
@@ -71,10 +71,10 @@ deploy_script:
 - echo password:%pip_access% >> %USERPROFILE%\\.pypirc
 - ps: >-
     if ($env:APPVEYOR_REPO_BRANCH -eq 'devel') { 
-        twine upload -r test dist/*
+        twine upload --skip-existing -r test dist/*
     }
     elseif ($env:APPVEYOR_REPO_BRANCH -eq 'deploy') {  
-        twine upload dist/*
+        twine upload --skip-existing dist/*
     }
     else {
         echo not deploying on other branches
diff --git a/setup.cfg b/setup.cfg
index 57fddfaf..9546c53a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.2.1
+current_version = 1.4.0
 tag = False
 commit = True
 
diff --git a/setup.py b/setup.py
index c8d20add..1cf2321d 100644
--- a/setup.py
+++ b/setup.py
@@ -148,7 +148,7 @@ setup(name = 'GPy',
       include_package_data = True,
       py_modules = ['GPy.__init__'],
       test_suite = 'GPy.testing',
-      install_requires = ['numpy>=1.7', 'scipy>=0.16', 'six', 'paramz>=0.5.2'],
+      install_requires = ['numpy>=1.7', 'scipy>=0.16', 'six', 'paramz>=0.6.6'],
       extras_require = {'docs':['sphinx'],
                         'optional':['mpi4py',
                                     'ipython>=4.0.0',