From bef114eabd592fc2b44bb823c0e8ec779f25306d Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Tue, 8 Dec 2015 13:59:46 +0000
Subject: [PATCH 01/53] (wpgs) fixing newton-raphson for f_inv and fixing
 plotting stuff

---
 GPy/models/warped_gp.py            | 37 +++++++++++++++++++++---------
 GPy/plotting/gpy_plot/plot_util.py |  8 ++++++-
 GPy/testing/model_tests.py         | 29 ++++++++++-------------
 GPy/util/warping_functions.py      |  6 ++++-
 4 files changed, 50 insertions(+), 30 deletions(-)

diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index df947d3e..998c0ed6 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -96,11 +96,14 @@ class WarpedGP(GP):
         return arg1 - (arg2 ** 2)
 
     def predict(self, Xnew, which_parts='all', pred_init=None, full_cov=False, Y_metadata=None,
-                median=False, deg_gauss_hermite=100):
-        # normalize X values
-        # Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
+                median=False, deg_gauss_hermite=100, likelihood=None):
+        """
+        Prediction results depend on:
+        - The value of the self.predict_in_warped_space flag
+        - The median flag passed as argument
+        The likelihood keyword is never used, it is just to follow the plotting API.
+        """
         mu, var = GP._raw_predict(self, Xnew)
-
         # now push through likelihood
         mean, var = self.likelihood.predictive_values(mu, var)
 
@@ -116,13 +119,11 @@ class WarpedGP(GP):
         else:
             wmean = mean
             wvar = var
-
         if self.scale_data:
             pred = self._unscale_data(pred)
-
         return wmean, wvar
 
-    def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None):
+    def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None, likelihood=None, median=False):
         """
         Get the predictive quantiles around the prediction at X
 
@@ -137,15 +138,29 @@ class WarpedGP(GP):
         if self.normalizer is not None:
             m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
         a, b = self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata)
-        #return [a, b]
         if not self.predict_in_warped_space:
             return [a, b]
-        #print a.shape
         new_a = self.warping_function.f_inv(a)
         new_b = self.warping_function.f_inv(b)
-
         return [new_a, new_b]
-        #return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata)
+
+    def log_predictive_density(self, x_test, y_test, Y_metadata=None):
+        """
+        Calculation of the log predictive density
+
+        .. math:
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+
+        :param x_test: test locations (x_{*})
+        :type x_test: (Nx1) array
+        :param y_test: test observations (y_{*})
+        :type y_test: (Nx1) array
+        :param Y_metadata: metadata associated with the test points
+        """
+        mu_star, var_star = self._raw_predict(x_test)
+        fy = self.warping_function.f(y_test)
+        ll_lpd = self.likelihood.log_predictive_density(fy, mu_star, var_star, Y_metadata=Y_metadata)
+        return ll_lpd * self.warping_function.fgrad_y(y_test)
 
 
 if __name__ == '__main__':
diff --git a/GPy/plotting/gpy_plot/plot_util.py b/GPy/plotting/gpy_plot/plot_util.py
index 254886a2..8c5c04f1 100644
--- a/GPy/plotting/gpy_plot/plot_util.py
+++ b/GPy/plotting/gpy_plot/plot_util.py
@@ -31,6 +31,7 @@
 import numpy as np
 from scipy import sparse
 import itertools
+from GPy.models import WarpedGP
 
 def in_ipynb():
     try:
@@ -73,6 +74,9 @@ def helper_predict_with_model(self, Xgrid, plot_raw, apply_link, percentiles, wh
     if 'output_index' not in predict_kw['Y_metadata']:
         predict_kw['Y_metadata']['output_index'] = Xgrid[:,-1:].astype(np.int)
 
+    if isinstance(self, WarpedGP) and self.predict_in_warped_space:
+        predict_kw['median'] = True
+
     mu, _ = self.predict(Xgrid, **predict_kw)
 
     if percentiles is not None:
@@ -291,6 +295,8 @@ def get_x_y_var(model):
         Y = model.Y.values
     except AttributeError:
         Y = model.Y
+    if isinstance(model, WarpedGP) and model.predict_in_warped_space:
+        Y = model.Y_untransformed
     if sparse.issparse(Y): Y = Y.todense().view(np.ndarray)
     return X, X_variance, Y
 
@@ -377,4 +383,4 @@ def x_frame2D(X,plot_limits=None,resolution=None):
     resolution = resolution or 50
     xx, yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution]
     Xnew = np.vstack((xx.flatten(),yy.flatten())).T
-    return Xnew, xx, yy, xmin, xmax
\ No newline at end of file
+    return Xnew, xx, yy, xmin, xmax
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 1212d746..988bde13 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -307,42 +307,37 @@ class MiscTests(unittest.TestCase):
  
         np.testing.assert_almost_equal(preds, warp_preds)
 
-    @unittest.skip('Comment this to plot the modified sine function')
+    #@unittest.skip('Comment this to plot the modified sine function')
     def test_warped_gp_sine(self):
         """
         A test replicating the sine regression problem from
         Snelson's paper.
         """
         X = (2 * np.pi) * np.random.random(151) - np.pi
-        Y = np.sin(X) + np.random.normal(0,0.1,151)
-        Y = np.exp(Y) - 5
-        #Y = np.array([np.power(abs(y),float(1)/3) * (1,-1)[y<0] for y in Y]) + 0
+        Y = np.sin(X) + np.random.normal(0,0.2,151)
+        Y = np.array([np.power(abs(y),float(1)/3) * (1,-1)[y<0] for y in Y])
         
-        #np.seterr(over='raise')
         import matplotlib.pyplot as plt
         warp_k = GPy.kern.RBF(1)
         warp_f = GPy.util.warping_functions.TanhWarpingFunction_d(n_terms=2)
         warp_m = GPy.models.WarpedGP(X[:, None], Y[:, None], kernel=warp_k, warping_function=warp_f)
-        #warp_m['.*variance.*'].constrain_fixed(0.25)
-        #warp_m['.*lengthscale.*'].constrain_fixed(1)
-        #warp_m['warp_tanh.d'].constrain_fixed(1)
-        #warp_m.randomize()
-        #warp_m['.*warp_tanh.psi*'][:,0:2].constrain_bounded(0,100)
-        #warp_m['.*warp_tanh.psi*'][:,0:1].constrain_fixed(1)
-        
-        #print(warp_m.checkgrad())
-        #warp_m.plot()
-        #plt.show()
+        warp_m['.*noise.variance.*'].constrain_fixed(0.1)
 
-        warp_m.optimize_restarts(parallel=True, robust=True)
-        #print(warp_m.checkgrad())
+        m = GPy.models.GPRegression(X[:, None], Y[:, None])
+        m.optimize_restarts(parallel=False, robust=True, num_restarts=5)
+        warp_m.optimize_restarts(parallel=False, robust=True, num_restarts=5)
         print(warp_m)
         print(warp_m['.*warp.*'])
         warp_m.predict_in_warped_space = False
         warp_m.plot()
         warp_m.predict_in_warped_space = True
         warp_m.plot()
+        m.plot()
         warp_f.plot(X.min()-10, X.max()+10)
+
+
+
+
         plt.show()
 
         
diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index 1617283c..516103bf 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -196,7 +196,11 @@ class TanhWarpingFunction_d(WarpingFunction):
 
         z = z.copy()
         if y is None:
-            y = np.ones_like(z)
+            # The idea here is to initialize y with +1 where
+            # z is positive and -1 where it is negative.
+            # For negative z, Newton-Raphson diverges 
+            # if we initialize y with a positive value (and vice-versa).
+            y = ((z > 0) * 1.) - (z <= 0)
 
         it = 0
         update = np.inf

From 4c843adc4cbd05c8095ccdcf85b127cc12c856c4 Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Tue, 8 Dec 2015 14:14:59 +0000
Subject: [PATCH 02/53] skipping the wgps Snelson's test (comment the skip line
 to see the plots)

---
 GPy/testing/model_tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 988bde13..7f983411 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -307,7 +307,7 @@ class MiscTests(unittest.TestCase):
  
         np.testing.assert_almost_equal(preds, warp_preds)
 
-    #@unittest.skip('Comment this to plot the modified sine function')
+    @unittest.skip('Comment this to plot the modified sine function')
     def test_warped_gp_sine(self):
         """
         A test replicating the sine regression problem from

From 24e9d68a19560033d040d860fb044e5d6bc18480 Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Thu, 14 Jan 2016 15:28:48 +0000
Subject: [PATCH 03/53] stuff

---
 GPy/models/warped_gp.py       | 2 +-
 GPy/testing/model_tests.py    | 4 ++--
 GPy/util/warping_functions.py | 9 +++++++--
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index 998c0ed6..443454a1 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -160,7 +160,7 @@ class WarpedGP(GP):
         mu_star, var_star = self._raw_predict(x_test)
         fy = self.warping_function.f(y_test)
         ll_lpd = self.likelihood.log_predictive_density(fy, mu_star, var_star, Y_metadata=Y_metadata)
-        return ll_lpd * self.warping_function.fgrad_y(y_test)
+        return ll_lpd - np.log(self.warping_function.fgrad_y(y_test))
 
 
 if __name__ == '__main__':
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 7f983411..d637a8c6 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -307,7 +307,7 @@ class MiscTests(unittest.TestCase):
  
         np.testing.assert_almost_equal(preds, warp_preds)
 
-    @unittest.skip('Comment this to plot the modified sine function')
+    #@unittest.skip('Comment this to plot the modified sine function')
     def test_warped_gp_sine(self):
         """
         A test replicating the sine regression problem from
@@ -321,7 +321,6 @@ class MiscTests(unittest.TestCase):
         warp_k = GPy.kern.RBF(1)
         warp_f = GPy.util.warping_functions.TanhWarpingFunction_d(n_terms=2)
         warp_m = GPy.models.WarpedGP(X[:, None], Y[:, None], kernel=warp_k, warping_function=warp_f)
-        warp_m['.*noise.variance.*'].constrain_fixed(0.1)
 
         m = GPy.models.GPRegression(X[:, None], Y[:, None])
         m.optimize_restarts(parallel=False, robust=True, num_restarts=5)
@@ -330,6 +329,7 @@ class MiscTests(unittest.TestCase):
         print(warp_m['.*warp.*'])
         warp_m.predict_in_warped_space = False
         warp_m.plot()
+        import ipdb; ipdb.set_trace()
         warp_m.predict_in_warped_space = True
         warp_m.plot()
         m.plot()
diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index 516103bf..ce99a940 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -152,7 +152,7 @@ class TanhWarpingFunction(WarpingFunction):
 
 class TanhWarpingFunction_d(WarpingFunction):
 
-    def __init__(self, n_terms=3):
+    def __init__(self, n_terms=3, initial_y=None):
         """n_terms specifies the number of tanh terms to be used"""
         self.n_terms = n_terms
         self.num_parameters = 3 * self.n_terms + 1
@@ -165,6 +165,7 @@ class TanhWarpingFunction_d(WarpingFunction):
         self.d = Param('%s' % ('d'), 1.0, Logexp())
         self.link_parameter(self.psi)
         self.link_parameter(self.d)
+        self.initial_y = initial_y
 
     def f(self, y):
         """
@@ -187,7 +188,7 @@ class TanhWarpingFunction_d(WarpingFunction):
             z += a*np.tanh(b*(y+c))
         return z
 
-    def f_inv(self, z, max_iterations=1000, y=None):
+    def f_inv(self, z, max_iterations=100, y=None):
         """
         calculate the numerical inverse of f
 
@@ -195,12 +196,15 @@ class TanhWarpingFunction_d(WarpingFunction):
         """
 
         z = z.copy()
+        
         if y is None:
             # The idea here is to initialize y with +1 where
             # z is positive and -1 where it is negative.
             # For negative z, Newton-Raphson diverges 
             # if we initialize y with a positive value (and vice-versa).
             y = ((z > 0) * 1.) - (z <= 0)
+            if self.initial_y is not None:
+                y *= self.initial_y
 
         it = 0
         update = np.inf
@@ -213,6 +217,7 @@ class TanhWarpingFunction_d(WarpingFunction):
             it += 1
         if it == max_iterations:
             print("WARNING!!! Maximum number of iterations reached in f_inv ")
+            print("Sum of updates: %.4f" % np.sum(update))
         return y
 
     def fgrad_y(self, y, return_precalc=False):

From c129900768eb7e363c4095dbad7603bb606f92c2 Mon Sep 17 00:00:00 2001
From: Daniel Beck <daniel.e.beck@gmail.com>
Date: Mon, 22 Feb 2016 19:57:54 +0000
Subject: [PATCH 04/53] added a log warping function

---
 GPy/models/warped_gp.py       | 20 ++++++------
 GPy/util/warping_functions.py | 60 ++++++++++++++++++++++++++++++++---
 2 files changed, 66 insertions(+), 14 deletions(-)

diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index 443454a1..d040f2d4 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -44,17 +44,19 @@ class WarpedGP(GP):
         super(WarpedGP, self).parameters_changed()
 
         Kiy = self.posterior.woodbury_vector.flatten()
+    
+        self.warping_function.update_grads(self.Y_untransformed, Kiy)
 
-        grad_y = self.warping_function.fgrad_y(self.Y_untransformed)
-        grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed,
-                                                                 return_covar_chain=True)
-        djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
-        dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
+        #grad_y = self.warping_function.fgrad_y(self.Y_untransformed)
+        #grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed,
+        #                                                         return_covar_chain=True)
+        #djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
+        #dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
 
-        warping_grads = -dquad_dpsi + djac_dpsi
+        #warping_grads = -dquad_dpsi + djac_dpsi
 
-        self.warping_function.psi.gradient[:] = warping_grads[:, :-1]
-        self.warping_function.d.gradient[:] = warping_grads[0, -1]
+        #self.warping_function.psi.gradient[:] = warping_grads[:, :-1]
+        #self.warping_function.d.gradient[:] = warping_grads[0, -1]
 
     def transform_data(self):
         Y = self.warping_function.f(self.Y_untransformed.copy()).copy()
@@ -160,7 +162,7 @@ class WarpedGP(GP):
         mu_star, var_star = self._raw_predict(x_test)
         fy = self.warping_function.f(y_test)
         ll_lpd = self.likelihood.log_predictive_density(fy, mu_star, var_star, Y_metadata=Y_metadata)
-        return ll_lpd - np.log(self.warping_function.fgrad_y(y_test))
+        return ll_lpd + np.log(self.warping_function.fgrad_y(y_test))
 
 
 if __name__ == '__main__':
diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index ce99a940..88625757 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -278,6 +278,52 @@ class TanhWarpingFunction_d(WarpingFunction):
         names.append('warp_tanh_d')
         return names
 
+    def update_grads(self, Y_untransformed, Kiy):
+        grad_y = self.fgrad_y(Y_untransformed)
+        grad_y_psi, grad_psi = self.fgrad_y_psi(Y_untransformed,
+                                                return_covar_chain=True)
+        djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
+        dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
+
+        warping_grads = -dquad_dpsi + djac_dpsi
+
+        self.psi.gradient[:] = warping_grads[:, :-1]
+        self.d.gradient[:] = warping_grads[0, -1]
+
+
+class LogFunction(WarpingFunction):
+    """
+    Easy wrapper for applying a fixed warping function to
+    positive-only values.
+    """
+    def __init__(self):
+        self.num_parameters = 0
+        #self.psi = Param('psi', np.zeros((1,3)))
+        #self.d = Param('%s' % ('d'), 0.0, Logexp())
+        super(LogFunction, self).__init__(name='log')
+        #self.link_parameter(self.psi)
+        #self.link_parameter(self.d)
+
+
+    def f(self, y):
+        return np.log(y)
+
+    def fgrad_y(self, y):
+        return 1. / y
+
+    def update_grads(self, Y_untransformed, Kiy):
+        pass
+
+    def fgrad_y_psi(self, y, return_covar_chain=False):
+        gradients = np.zeros((y.shape[0], y.shape[1], len(self.psi), 4))
+        gradients = 0
+        if return_covar_chain:
+            return gradients, gradients
+        return gradients
+
+    def f_inv(self, z, y=None):
+        return np.exp(z)
+
 
 class IdentityFunction(WarpingFunction):
     """
@@ -285,12 +331,12 @@ class IdentityFunction(WarpingFunction):
     and should not be used in practice.
     """
     def __init__(self):
-        self.num_parameters = 4
-        self.psi = Param('psi', np.zeros((1,3)))
-        self.d = Param('%s' % ('d'), 1.0, Logexp())
+        self.num_parameters = 0
+        #self.psi = Param('psi', np.zeros((1,3)))
+        #self.d = Param('%s' % ('d'), 0.0, Logexp())
         super(IdentityFunction, self).__init__(name='identity')
-        self.link_parameter(self.psi)
-        self.link_parameter(self.d)
+        #self.link_parameter(self.psi)
+        #self.link_parameter(self.d)
 
         
     def f(self, y):
@@ -299,8 +345,12 @@ class IdentityFunction(WarpingFunction):
     def fgrad_y(self, y):
         return np.ones(y.shape)
 
+    def update_grads(self, Y_untransformed, Kiy):
+        pass
+
     def fgrad_y_psi(self, y, return_covar_chain=False):
         gradients = np.zeros((y.shape[0], y.shape[1], len(self.psi), 4))
+        gradients = 0
         if return_covar_chain:
             return gradients, gradients
         return gradients

From 03e59b764d7cdfbb9a629ea25949dabbf10fa664 Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Wed, 24 Feb 2016 11:07:31 +0000
Subject: [PATCH 05/53] cleaning on warping functions

---
 GPy/util/warping_functions.py | 26 +++++++-------------------
 1 file changed, 7 insertions(+), 19 deletions(-)

diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index 88625757..147ffda4 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -153,7 +153,9 @@ class TanhWarpingFunction(WarpingFunction):
 class TanhWarpingFunction_d(WarpingFunction):
 
     def __init__(self, n_terms=3, initial_y=None):
-        """n_terms specifies the number of tanh terms to be used"""
+        """
+        n_terms specifies the number of tanh terms to be used
+        """
         self.n_terms = n_terms
         self.num_parameters = 3 * self.n_terms + 1
         self.psi = np.ones((self.n_terms, 3))
@@ -298,12 +300,7 @@ class LogFunction(WarpingFunction):
     """
     def __init__(self):
         self.num_parameters = 0
-        #self.psi = Param('psi', np.zeros((1,3)))
-        #self.d = Param('%s' % ('d'), 0.0, Logexp())
         super(LogFunction, self).__init__(name='log')
-        #self.link_parameter(self.psi)
-        #self.link_parameter(self.d)
-
 
     def f(self, y):
         return np.log(y)
@@ -315,11 +312,9 @@ class LogFunction(WarpingFunction):
         pass
 
     def fgrad_y_psi(self, y, return_covar_chain=False):
-        gradients = np.zeros((y.shape[0], y.shape[1], len(self.psi), 4))
-        gradients = 0
         if return_covar_chain:
-            return gradients, gradients
-        return gradients
+            return 0, 0
+        return 0
 
     def f_inv(self, z, y=None):
         return np.exp(z)
@@ -332,12 +327,7 @@ class IdentityFunction(WarpingFunction):
     """
     def __init__(self):
         self.num_parameters = 0
-        #self.psi = Param('psi', np.zeros((1,3)))
-        #self.d = Param('%s' % ('d'), 0.0, Logexp())
         super(IdentityFunction, self).__init__(name='identity')
-        #self.link_parameter(self.psi)
-        #self.link_parameter(self.d)
-
         
     def f(self, y):
         return y
@@ -349,11 +339,9 @@ class IdentityFunction(WarpingFunction):
         pass
 
     def fgrad_y_psi(self, y, return_covar_chain=False):
-        gradients = np.zeros((y.shape[0], y.shape[1], len(self.psi), 4))
-        gradients = 0
         if return_covar_chain:
-            return gradients, gradients
-        return gradients
+            return 0, 0
+        return 0
         
     def f_inv(self, z, y=None):
         return z

From 8deac1d05869661ede6d9325b30ef70dfd586f27 Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Wed, 24 Feb 2016 11:17:22 +0000
Subject: [PATCH 06/53] cleaning on warping functions

---
 GPy/util/warping_functions.py | 42 +++++++++++++++--------------------
 1 file changed, 18 insertions(+), 24 deletions(-)

diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index 147ffda4..a2f0dfa8 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -174,20 +174,14 @@ class TanhWarpingFunction_d(WarpingFunction):
         Transform y with f using parameter vector psi
         psi = [[a,b,c]]
 
-        :math:`f = \\sum_{terms} a * tanh(b*(y+c))`
+        :math:`f = (y * d) + \\sum_{terms} a * tanh(b *(y + c))`
         """
-        #1. check that number of params is consistent
-        # assert psi.shape[0] == self.n_terms, 'inconsistent parameter dimensions'
-        # assert psi.shape[1] == 4, 'inconsistent parameter dimensions'
-
         d = self.d
         mpsi = self.psi
-
-        #3. transform data
-        z = d*y.copy()
+        z = d * y.copy()
         for i in range(len(mpsi)):
-            a,b,c = mpsi[i]
-            z += a*np.tanh(b*(y+c))
+            a, b, c = mpsi[i]
+            z += a * np.tanh(b * (y + c))
         return z
 
     def f_inv(self, z, max_iterations=100, y=None):
@@ -214,7 +208,7 @@ class TanhWarpingFunction_d(WarpingFunction):
         while it == 0 or (np.abs(update).sum() > 1e-10 and it < max_iterations):
             fy = self.f(y)
             fgrady = self.fgrad_y(y)
-            update = (fy - z)/fgrady
+            update = (fy - z) / fgrady
             y -= update
             it += 1
         if it == max_iterations:
@@ -226,17 +220,18 @@ class TanhWarpingFunction_d(WarpingFunction):
         """
         gradient of f w.r.t to y ([N x 1])
 
-        :returns: Nx1 vector of derivatives, unless return_precalc is true, then it also returns the precomputed stuff
+        :returns: Nx1 vector of derivatives, unless return_precalc is true, 
+        then it also returns the precomputed stuff
         """
         d = self.d
         mpsi = self.psi
 
         # vectorized version
-        S = (mpsi[:,1]*(y[:,:,None] + mpsi[:,2])).T
+        S = (mpsi[:,1] * (y[:,:,None] + mpsi[:,2])).T
         R = np.tanh(S)
-        D = 1-R**2
+        D = 1 - (R ** 2)
 
-        GRAD = (d + (mpsi[:,0:1][:,:,None]*mpsi[:,1:2][:,:,None]*D).sum(axis=0)).T
+        GRAD = (d + (mpsi[:,0:1][:,:,None] * mpsi[:,1:2][:,:,None] * D).sum(axis=0)).T
 
         if return_precalc:
             return GRAD, S, R, D
@@ -255,28 +250,27 @@ class TanhWarpingFunction_d(WarpingFunction):
         gradients = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4))
         for i in range(len(mpsi)):
             a,b,c  = mpsi[i]
-            gradients[:,:,i,0] = (b*(1.0/np.cosh(s[i]))**2).T
-            gradients[:,:,i,1] = a*(d[i] - 2.0*s[i]*r[i]*(1.0/np.cosh(s[i]))**2).T
-            gradients[:,:,i,2] = (-2.0*a*(b**2)*r[i]*((1.0/np.cosh(s[i]))**2)).T
-        gradients[:,:,0,3] = 1.0
+            gradients[:, :, i, 0] = (b * (1.0/np.cosh(s[i])) ** 2).T
+            gradients[:, :, i, 1] = a * (d[i] - 2.0 * s[i] * r[i] * (1.0/np.cosh(s[i])) ** 2).T
+            gradients[:, :, i, 2] = (-2.0 * a * (b ** 2) * r[i] * ((1.0 / np.cosh(s[i])) ** 2)).T
+        gradients[:, :, 0, 3] = 1.0
 
         if return_covar_chain:
             covar_grad_chain = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4))
-
             for i in range(len(mpsi)):
                 a,b,c = mpsi[i]
                 covar_grad_chain[:, :, i, 0] = (r[i]).T
-                covar_grad_chain[:, :, i, 1] = (a*(y + c) * ((1.0/np.cosh(s[i]))**2).T)
-                covar_grad_chain[:, :, i, 2] = a*b*((1.0/np.cosh(s[i]))**2).T
+                covar_grad_chain[:, :, i, 1] = (a * (y + c) * ((1.0 / np.cosh(s[i])) ** 2).T)
+                covar_grad_chain[:, :, i, 2] = a * b * ((1.0 / np.cosh(s[i])) ** 2).T
             covar_grad_chain[:, :, 0, 3] = y
-
             return gradients, covar_grad_chain
 
         return gradients
 
     def _get_param_names(self):
         variables = ['a', 'b', 'c', 'd']
-        names = sum([['warp_tanh_%s_t%i' % (variables[n],q) for n in range(3)] for q in range(self.n_terms)],[])
+        names = sum([['warp_tanh_%s_t%i' % (variables[n],q) for n in range(3)] 
+                     for q in range(self.n_terms)],[])
         names.append('warp_tanh_d')
         return names
 

From 9039fae29e5fce00c5fd49beedc0fc041f2a7ef6 Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Wed, 24 Feb 2016 11:24:31 +0000
Subject: [PATCH 07/53] deleted old tanh_warp and renamed warp_tanh_d to
 warp_tanh

---
 GPy/models/warped_gp.py       |   4 +-
 GPy/testing/model_tests.py    |   2 +-
 GPy/util/warping_functions.py | 105 +---------------------------------
 3 files changed, 5 insertions(+), 106 deletions(-)

diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index d040f2d4..50be6467 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -5,7 +5,7 @@ import numpy as np
 from ..util.warping_functions import *
 from ..core import GP
 from .. import likelihoods
-from GPy.util.warping_functions import TanhWarpingFunction_d
+from GPy.util.warping_functions import TanhWarpingFunction
 from GPy import kern
 
 class WarpedGP(GP):
@@ -15,7 +15,7 @@ class WarpedGP(GP):
             kernel = kern.RBF(X.shape[1])
 
         if warping_function == None:
-            self.warping_function = TanhWarpingFunction_d(warping_terms)
+            self.warping_function = TanhWarpingFunction(warping_terms)
             self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1) * 1)
         else:
             self.warping_function = warping_function
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index d637a8c6..3ced78f2 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -319,7 +319,7 @@ class MiscTests(unittest.TestCase):
         
         import matplotlib.pyplot as plt
         warp_k = GPy.kern.RBF(1)
-        warp_f = GPy.util.warping_functions.TanhWarpingFunction_d(n_terms=2)
+        warp_f = GPy.util.warping_functions.TanhWarpingFunction(n_terms=2)
         warp_m = GPy.models.WarpedGP(X[:, None], Y[:, None], kernel=warp_k, warping_function=warp_f)
 
         m = GPy.models.GPRegression(X[:, None], Y[:, None])
diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index a2f0dfa8..dd41e5ee 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -51,107 +51,6 @@ class WarpingFunction(Parameterized):
 
 class TanhWarpingFunction(WarpingFunction):
 
-    def __init__(self, n_terms=3):
-        """n_terms specifies the number of tanh terms to be used"""
-        self.n_terms = n_terms
-        self.num_parameters = 3 * self.n_terms
-        super(TanhWarpingFunction, self).__init__(name='warp_tanh')
-
-    def f(self, y, psi):
-        """
-        transform y with f using parameter vector psi
-        psi = [[a,b,c]]
-        ::math::`f = \\sum_{terms} a * tanh(b*(y+c))`
-        """
-        #1. check that number of params is consistent
-        assert psi.shape[0] == self.n_terms, 'inconsistent parameter dimensions'
-        assert psi.shape[1] == 3, 'inconsistent parameter dimensions'
-
-        #2. exponentiate the a and b (positive!)
-        mpsi = psi.copy()
-
-        #3. transform data
-        z = y.copy()
-        for i in range(len(mpsi)):
-            a,b,c = mpsi[i]
-            z += a*np.tanh(b*(y+c))
-        return z
-
-    def f_inv(self, y, psi, iterations=10):
-        """
-        calculate the numerical inverse of f
-        :param iterations: number of N.R. iterations
-        """
-
-        y = y.copy()
-        z = np.ones_like(y)
-        for i in range(iterations):
-            z -= (self.f(z, psi) - y)/self.fgrad_y(z,psi)
-        return z
-
-    def fgrad_y(self, y, psi, return_precalc=False):
-        """
-        gradient of f w.r.t to y ([N x 1])
-        returns: Nx1 vector of derivatives, unless return_precalc is true,
-        then it also returns the precomputed stuff
-        """
-
-        mpsi = psi.copy()
-
-        # vectorized version
-
-        # S = (mpsi[:,1]*(y + mpsi[:,2])).T
-        S = (mpsi[:,1]*(y[:,:,None] + mpsi[:,2])).T
-        R = np.tanh(S)
-        D = 1-R**2
-
-        # GRAD = (1+(mpsi[:,0:1]*mpsi[:,1:2]*D).sum(axis=0))[:,np.newaxis]
-        GRAD = (1+(mpsi[:,0:1][:,:,None]*mpsi[:,1:2][:,:,None]*D).sum(axis=0)).T
-
-        if return_precalc:
-            # return GRAD,S.sum(axis=1),R.sum(axis=1),D.sum(axis=1)
-            return GRAD, S, R, D
-
-        return GRAD
-
-    def fgrad_y_psi(self, y, psi, return_covar_chain=False):
-        """
-        gradient of f w.r.t to y and psi
-        returns: NxIx3 tensor of partial derivatives
-        """
-
-        # 1. exponentiate the a and b (positive!)
-        mpsi = psi.copy()
-        w, s, r, d = self.fgrad_y(y, psi, return_precalc = True)
-
-        gradients = np.zeros((y.shape[0], y.shape[1], len(mpsi), 3))
-        for i in range(len(mpsi)):
-            a,b,c = mpsi[i]
-            gradients[:,:,i,0] = (b*(1.0/np.cosh(s[i]))**2).T
-            gradients[:,:,i,1] = a*(d[i] - 2.0*s[i]*r[i]*(1.0/np.cosh(s[i]))**2).T
-            gradients[:,:,i,2] = (-2.0*a*(b**2)*r[i]*((1.0/np.cosh(s[i]))**2)).T
-
-        if return_covar_chain:
-            covar_grad_chain = np.zeros((y.shape[0], y.shape[1], len(mpsi), 3))
-
-            for i in range(len(mpsi)):
-                a,b,c = mpsi[i]
-                covar_grad_chain[:, :, i, 0] = (r[i]).T
-                covar_grad_chain[:, :, i, 1] = (a*(y + c) * ((1.0/np.cosh(s[i]))**2).T)
-                covar_grad_chain[:, :, i, 2] = a*b*((1.0/np.cosh(s[i]))**2).T
-
-            return gradients, covar_grad_chain
-
-        return gradients
-
-    def _get_param_names(self):
-        variables = ['a', 'b', 'c']
-        names = sum([['warp_tanh_%s_t%i' % (variables[n],q) for n in range(3)] for q in range(self.n_terms)],[])
-        return names
-
-
-class TanhWarpingFunction_d(WarpingFunction):
-
     def __init__(self, n_terms=3, initial_y=None):
         """
         n_terms specifies the number of tanh terms to be used
@@ -160,7 +59,7 @@ class TanhWarpingFunction_d(WarpingFunction):
         self.num_parameters = 3 * self.n_terms + 1
         self.psi = np.ones((self.n_terms, 3))
 
-        super(TanhWarpingFunction_d, self).__init__(name='warp_tanh')
+        super(TanhWarpingFunction, self).__init__(name='warp_tanh')
         self.psi = Param('psi', self.psi)
         self.psi[:, :2].constrain_positive()
 
@@ -271,7 +170,7 @@ class TanhWarpingFunction_d(WarpingFunction):
         variables = ['a', 'b', 'c', 'd']
         names = sum([['warp_tanh_%s_t%i' % (variables[n],q) for n in range(3)] 
                      for q in range(self.n_terms)],[])
-        names.append('warp_tanh_d')
+        names.append('warp_tanh')
         return names
 
     def update_grads(self, Y_untransformed, Kiy):

From 50f72f02943f9ee476c746cdbc7d23725ff0f319 Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Wed, 24 Feb 2016 11:28:05 +0000
Subject: [PATCH 08/53] some cleaning on warped gp

---
 GPy/models/warped_gp.py | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index 50be6467..3e5733eb 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -9,17 +9,18 @@ from GPy.util.warping_functions import TanhWarpingFunction
 from GPy import kern
 
 class WarpedGP(GP):
+    """
+    This defines a GP Regression model that applies a 
+    warping function to the output.
+    """
     def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3):
-
         if kernel is None:
             kernel = kern.RBF(X.shape[1])
-
         if warping_function == None:
             self.warping_function = TanhWarpingFunction(warping_terms)
             self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1) * 1)
         else:
             self.warping_function = warping_function
-
         self.scale_data = False
         if self.scale_data:
             Y = self._scale_data(Y)
@@ -27,7 +28,6 @@ class WarpedGP(GP):
         self.Y_untransformed = Y.copy()
         self.predict_in_warped_space = True
         likelihood = likelihoods.Gaussian()
-
         GP.__init__(self, X, self.transform_data(), likelihood=likelihood, kernel=kernel)
         self.link_parameter(self.warping_function)
 
@@ -40,29 +40,22 @@ class WarpedGP(GP):
         return (Y + 0.5) * (self._Ymax - self._Ymin) + self._Ymin
 
     def parameters_changed(self):
+        """
+        Notice that we update the warping function gradients here.
+        """
         self.Y[:] = self.transform_data()
         super(WarpedGP, self).parameters_changed()
-
         Kiy = self.posterior.woodbury_vector.flatten()
-    
         self.warping_function.update_grads(self.Y_untransformed, Kiy)
 
-        #grad_y = self.warping_function.fgrad_y(self.Y_untransformed)
-        #grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed,
-        #                                                         return_covar_chain=True)
-        #djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
-        #dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
-
-        #warping_grads = -dquad_dpsi + djac_dpsi
-
-        #self.warping_function.psi.gradient[:] = warping_grads[:, :-1]
-        #self.warping_function.d.gradient[:] = warping_grads[0, -1]
-
     def transform_data(self):
         Y = self.warping_function.f(self.Y_untransformed.copy()).copy()
         return Y
 
     def log_likelihood(self):
+        """
+        Notice we add the jacobian of the warping function here.
+        """
         ll = GP.log_likelihood(self)
         jacobian = self.warping_function.fgrad_y(self.Y_untransformed)
         return ll + np.log(jacobian).sum()
@@ -148,7 +141,8 @@ class WarpedGP(GP):
 
     def log_predictive_density(self, x_test, y_test, Y_metadata=None):
         """
-        Calculation of the log predictive density
+        Calculation of the log predictive density. Notice we add
+        the jacobian of the warping function here.
 
         .. math:
             p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})

From b266b578cd18b538363b362991f5503894ce4d21 Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Wed, 24 Feb 2016 11:29:41 +0000
Subject: [PATCH 09/53] renamed TanhWarpingFunction to TanhFunction

---
 GPy/models/warped_gp.py       | 4 ++--
 GPy/testing/model_tests.py    | 2 +-
 GPy/util/warping_functions.py | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index 3e5733eb..d35c1a15 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -5,7 +5,7 @@ import numpy as np
 from ..util.warping_functions import *
 from ..core import GP
 from .. import likelihoods
-from GPy.util.warping_functions import TanhWarpingFunction
+from GPy.util.warping_functions import TanhFunction
 from GPy import kern
 
 class WarpedGP(GP):
@@ -17,7 +17,7 @@ class WarpedGP(GP):
         if kernel is None:
             kernel = kern.RBF(X.shape[1])
         if warping_function == None:
-            self.warping_function = TanhWarpingFunction(warping_terms)
+            self.warping_function = TanhFunction(warping_terms)
             self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1) * 1)
         else:
             self.warping_function = warping_function
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 3ced78f2..ab88d011 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -319,7 +319,7 @@ class MiscTests(unittest.TestCase):
         
         import matplotlib.pyplot as plt
         warp_k = GPy.kern.RBF(1)
-        warp_f = GPy.util.warping_functions.TanhWarpingFunction(n_terms=2)
+        warp_f = GPy.util.warping_functions.TanhFunction(n_terms=2)
         warp_m = GPy.models.WarpedGP(X[:, None], Y[:, None], kernel=warp_k, warping_function=warp_f)
 
         m = GPy.models.GPRegression(X[:, None], Y[:, None])
diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index dd41e5ee..5bccbd91 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -49,7 +49,7 @@ class WarpingFunction(Parameterized):
         plt.show()
 
 
-class TanhWarpingFunction(WarpingFunction):
+class TanhFunction(WarpingFunction):
 
     def __init__(self, n_terms=3, initial_y=None):
         """
@@ -59,7 +59,7 @@ class TanhWarpingFunction(WarpingFunction):
         self.num_parameters = 3 * self.n_terms + 1
         self.psi = np.ones((self.n_terms, 3))
 
-        super(TanhWarpingFunction, self).__init__(name='warp_tanh')
+        super(TanhFunction, self).__init__(name='warp_tanh')
         self.psi = Param('psi', self.psi)
         self.psi[:, :2].constrain_positive()
 

From b3f8f263dac0e07ec09f4f3f229a5bdcb50d7b35 Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Wed, 24 Feb 2016 11:32:32 +0000
Subject: [PATCH 10/53] doctest on TanhFunction

---
 GPy/util/warping_functions.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index 5bccbd91..e8c99540 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -50,7 +50,12 @@ class WarpingFunction(Parameterized):
 
 
 class TanhFunction(WarpingFunction):
-
+    """
+    This is the function proposed in Snelson et al.:
+    A sum of tanh functions with linear trends outside
+    the range. Notice the term 'd', which scales the
+    linear trend.
+    """
     def __init__(self, n_terms=3, initial_y=None):
         """
         n_terms specifies the number of tanh terms to be used
@@ -58,11 +63,9 @@ class TanhFunction(WarpingFunction):
         self.n_terms = n_terms
         self.num_parameters = 3 * self.n_terms + 1
         self.psi = np.ones((self.n_terms, 3))
-
         super(TanhFunction, self).__init__(name='warp_tanh')
         self.psi = Param('psi', self.psi)
         self.psi[:, :2].constrain_positive()
-
         self.d = Param('%s' % ('d'), 1.0, Logexp())
         self.link_parameter(self.psi)
         self.link_parameter(self.d)

From 4dc6f0ec6c5e025642910dc5a9597a2b3d14577b Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Wed, 24 Feb 2016 13:08:52 +0000
Subject: [PATCH 11/53] first try on logistic function

---
 GPy/util/warping_functions.py | 150 ++++++++++++++++++++++++++++++++++
 1 file changed, 150 insertions(+)

diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index e8c99540..10fddb50 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -241,3 +241,153 @@ class IdentityFunction(WarpingFunction):
         
     def f_inv(self, z, y=None):
         return z
+
+
+class LogisticFunction(WarpingFunction):
+    """
+    A sum of logistic functions.
+    """
+    def __init__(self, n_terms=3, initial_y=None):
+        """
+        n_terms specifies the number of logistic terms to be used
+        """
+        self.n_terms = n_terms
+        self.num_parameters = 3 * self.n_terms + 1
+        self.psi = np.ones((self.n_terms, 3))
+        super(LogisticFunction, self).__init__(name='warp_logistic')
+        self.psi = Param('psi', self.psi)
+        self.psi[:, :2].constrain_positive()
+        self.d = Param('%s' % ('d'), 1.0, Logexp())
+        self.link_parameter(self.psi)
+        self.link_parameter(self.d)
+        self.initial_y = initial_y
+
+    def _logistic(self, x):
+        return 1. / (1 + np.exp(-x))
+
+    def f(self, y):
+        """
+        Transform y with f using parameter vector psi
+        psi = [[a,b,c]]
+
+        :math:`f = (y * d) + \\sum_{terms} a * logistic(b *(y + c))`
+        """
+        d = self.d
+        mpsi = self.psi
+        z = d * y.copy()
+        for i in xrange(self.n_terms):
+            a, b, c = mpsi[i]
+            z += a * self._logistic(b * (y + c))
+        return z
+
+    def f_inv(self, z, max_iterations=100, y=None):
+        """
+        calculate the numerical inverse of f
+
+        :param max_iterations: maximum number of N.R. iterations
+        """
+        z = z.copy()
+        if y is None:
+            # The idea here is to initialize y with +1 where
+            # z is positive and -1 where it is negative.
+            # For negative z, Newton-Raphson diverges 
+            # if we initialize y with a positive value (and vice-versa).
+            y = ((z > 0) * 1.) - (z <= 0)
+            if self.initial_y is not None:
+                y *= self.initial_y
+        it = 0
+        update = np.inf
+        while it == 0 or (np.abs(update).sum() > 1e-10 and it < max_iterations):
+            fy = self.f(y)
+            fgrady = self.fgrad_y(y)
+            update = (fy - z) / fgrady
+            y -= update
+            it += 1
+        if it == max_iterations:
+            print("WARNING!!! Maximum number of iterations reached in f_inv ")
+            print("Sum of updates: %.4f" % np.sum(update))
+        return y
+
+    def fgrad_y(self, y, return_precalc=False):
+        """
+        Gradient of f w.r.t to y ([N x 1])
+        This vectorized version calculates all summation terms
+        at the same time (since the grad of a sum is the sum of the grads).
+
+        :returns: Nx1 vector of derivatives, unless return_precalc is true, 
+        then it also returns the precomputed stuff
+        """
+        d = self.d
+        mpsi = self.psi
+
+        # vectorized version
+        # term = b * (y + c)
+        term = (mpsi[:, 1] * (y[:, :, None] + mpsi[:, 2])).T
+        logistic_term = self._logistic(term)
+        logistic_grad = logistic_term - (logistic_term ** 2)
+        
+        #S = (mpsi[:,1] * (y[:,:,None] + mpsi[:,2])).T
+        #R = np.tanh(S)
+        #D = 1 - (R ** 2)
+
+        #GRAD = (d + (mpsi[:,0:1][:,:,None] * mpsi[:,1:2][:,:,None] * D).sum(axis=0)).T
+        grad = (d + (mpsi[:, :1][:, : ,None] * mpsi[:, 1:2][:, :, None] * 
+                     logistic_grad).sum(axis=0)).T
+
+        if return_precalc:
+            return grad, logistic_term, logistic_grad
+        return grad
+
+    def fgrad_y_psi(self, y, psi):
+        """
+        gradient of f w.r.t to y and psi
+
+        :returns: NxIx4 tensor of partial derivatives
+        """
+        mpsi = self.psi
+
+        grad, l_term, l_grad = self.fgrad_y(y, return_precalc=True)
+        gradients = np.zeros((y.shape[0], y.shape[1], self.n_terms, 4))
+        for i in xrange(self.n_terms):
+            a, b, c  = mpsi[i]
+            gradients[:, :, i, 0] = b * l_grad[i].T
+            b2l_term = b - (2 * l_term[i])
+            al_grad = a * l_grad[i].T
+            #gradients[:, :, i, 1] = a * (d[i] - 2.0 * s[i] * r[i] * (1.0/np.cosh(s[i])) ** 2).T
+            gradients[:, :, i, 1] = (1 + ((y + c) * b2l_term)) * al_grad
+            #gradients[:, :, i, 2] = (-2.0 * a * (b ** 2) * r[i] * ((1.0 / np.cosh(s[i])) ** 2)).T
+            gradients[:, :, i, 2] = b * b2l_term * al_grad
+        gradients[:, :, 0, 3] = 1.0
+
+        if return_covar_chain:
+            covar_grad_chain = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4))
+            for i in xrange(self.n_terms):
+                a, b, c = mpsi[i]
+                covar_grad_chain[:, :, i, 0] = l_term[i].T
+                al_grad = a * l_grad[i].T
+                covar_grad_chain[:, :, i, 1] = (y + c) * al_grad
+                covar_grad_chain[:, :, i, 2] = b * al_grad
+            covar_grad_chain[:, :, 0, 3] = y
+            return gradients, covar_grad_chain
+
+        return gradients
+
+    def _get_param_names(self):
+        variables = ['a', 'b', 'c', 'd']
+        names = sum([['warp_logistic_%s_t%i' % (variables[n],q) for n in range(3)] 
+                     for q in range(self.n_terms)],[])
+        names.append('warp_logistic')
+        return names
+
+    def update_grads(self, Y_untransformed, Kiy):
+        grad_y = self.fgrad_y(Y_untransformed)
+        grad_y_psi, grad_psi = self.fgrad_y_psi(Y_untransformed,
+                                                return_covar_chain=True)
+        djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
+        dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
+
+        warping_grads = -dquad_dpsi + djac_dpsi
+
+        self.psi.gradient[:] = warping_grads[:, :-1]
+        self.d.gradient[:] = warping_grads[0, -1]
+

From 91e625a9bdf549934dcac75b8ab4820a37a9dcdf Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Wed, 24 Feb 2016 13:27:19 +0000
Subject: [PATCH 12/53] logistic seems working but more tests are needed

---
 GPy/testing/model_tests.py    | 4 +++-
 GPy/util/warping_functions.py | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index ab88d011..4b9d11ee 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -316,10 +316,12 @@ class MiscTests(unittest.TestCase):
         X = (2 * np.pi) * np.random.random(151) - np.pi
         Y = np.sin(X) + np.random.normal(0,0.2,151)
         Y = np.array([np.power(abs(y),float(1)/3) * (1,-1)[y<0] for y in Y])
+        Y = np.abs(Y)
         
         import matplotlib.pyplot as plt
         warp_k = GPy.kern.RBF(1)
-        warp_f = GPy.util.warping_functions.TanhFunction(n_terms=2)
+        #warp_f = GPy.util.warping_functions.TanhFunction(n_terms=2)
+        warp_f = GPy.util.warping_functions.LogisticFunction(n_terms=2)
         warp_m = GPy.models.WarpedGP(X[:, None], Y[:, None], kernel=warp_k, warping_function=warp_f)
 
         m = GPy.models.GPRegression(X[:, None], Y[:, None])
diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index 10fddb50..c62ed08e 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -338,7 +338,7 @@ class LogisticFunction(WarpingFunction):
             return grad, logistic_term, logistic_grad
         return grad
 
-    def fgrad_y_psi(self, y, psi):
+    def fgrad_y_psi(self, y, return_covar_chain=False):
         """
         gradient of f w.r.t to y and psi
 
@@ -351,7 +351,7 @@ class LogisticFunction(WarpingFunction):
         for i in xrange(self.n_terms):
             a, b, c  = mpsi[i]
             gradients[:, :, i, 0] = b * l_grad[i].T
-            b2l_term = b - (2 * l_term[i])
+            b2l_term = b - (2 * l_term[i].T)
             al_grad = a * l_grad[i].T
             #gradients[:, :, i, 1] = a * (d[i] - 2.0 * s[i] * r[i] * (1.0/np.cosh(s[i])) ** 2).T
             gradients[:, :, i, 1] = (1 + ((y + c) * b2l_term)) * al_grad

From 5534c45b0a3b7cab3ea45e7801510ad73db53cb1 Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Wed, 2 Mar 2016 16:54:03 +0000
Subject: [PATCH 13/53] added a rate to inverse calculation

---
 GPy/models/warped_gp.py       |  14 ++--
 GPy/testing/model_tests.py    |  32 +++++++-
 GPy/util/warping_functions.py | 150 +++++++++++++++++++++++++++++++---
 3 files changed, 173 insertions(+), 23 deletions(-)

diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index d35c1a15..419a4104 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -68,22 +68,22 @@ class WarpedGP(GP):
         arg2 = np.ones(shape=gh_samples.shape).dot(mean.T)
         return self.warping_function.f_inv(arg1 + arg2, y=pred_init)
 
-    def _get_warped_mean(self, mean, std, pred_init=None, deg_gauss_hermite=100):
+    def _get_warped_mean(self, mean, std, pred_init=None, deg_gauss_hermite=20):
         """
         Calculate the warped mean by using Gauss-Hermite quadrature.
         """
         gh_samples, gh_weights = np.polynomial.hermite.hermgauss(deg_gauss_hermite)
-        gh_samples = gh_samples[:,None]
-        gh_weights = gh_weights[None,:]
+        gh_samples = gh_samples[:, None]
+        gh_weights = gh_weights[None, :]
         return gh_weights.dot(self._get_warped_term(mean, std, gh_samples)) / np.sqrt(np.pi)
 
-    def _get_warped_variance(self, mean, std, pred_init=None, deg_gauss_hermite=100):
+    def _get_warped_variance(self, mean, std, pred_init=None, deg_gauss_hermite=20):
         """
         Calculate the warped variance by using Gauss-Hermite quadrature.
         """
         gh_samples, gh_weights = np.polynomial.hermite.hermgauss(deg_gauss_hermite)
-        gh_samples = gh_samples[:,None]
-        gh_weights = gh_weights[None,:]
+        gh_samples = gh_samples[:, None]
+        gh_weights = gh_weights[None, :]
         arg1 = gh_weights.dot(self._get_warped_term(mean, std, gh_samples, 
                                                     pred_init=pred_init) ** 2) / np.sqrt(np.pi)
         arg2 = self._get_warped_mean(mean, std, pred_init=pred_init,
@@ -91,7 +91,7 @@ class WarpedGP(GP):
         return arg1 - (arg2 ** 2)
 
     def predict(self, Xnew, which_parts='all', pred_init=None, full_cov=False, Y_metadata=None,
-                median=False, deg_gauss_hermite=100, likelihood=None):
+                median=False, deg_gauss_hermite=20, likelihood=None):
         """
         Prediction results depend on:
         - The value of the self.predict_in_warped_space flag
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 4b9d11ee..2b9add02 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -307,6 +307,31 @@ class MiscTests(unittest.TestCase):
  
         np.testing.assert_almost_equal(preds, warp_preds)
 
+    def test_warped_gp_log(self):
+        """
+        A WarpedGP with the log warping function should be
+        equal to a standard GP with log labels.
+        """
+        k = GPy.kern.RBF(1)
+        Y = np.abs(self.Y)
+        logY = np.log(Y)
+        m = GPy.models.GPRegression(self.X, logY, kernel=k)
+        #m.optimize()
+        m['Gaussian_noise.variance'] = 1e-4
+        preds = m.predict(self.X)[0]
+
+        warp_k = GPy.kern.RBF(1)
+        warp_f = GPy.util.warping_functions.LogFunction()
+        warp_m = GPy.models.WarpedGP(self.X, Y, kernel=warp_k, warping_function=warp_f)
+        warp_m.optimize()
+        warp_m['.*'] = 1.0
+        warp_m['Gaussian_noise.variance'] = 1e-4
+        warp_preds = warp_m.predict(self.X, median=True)[0]
+        #print np.exp(preds)
+        #print warp_preds
+ 
+        np.testing.assert_almost_equal(np.exp(preds), warp_preds)
+
     #@unittest.skip('Comment this to plot the modified sine function')
     def test_warped_gp_sine(self):
         """
@@ -316,12 +341,13 @@ class MiscTests(unittest.TestCase):
         X = (2 * np.pi) * np.random.random(151) - np.pi
         Y = np.sin(X) + np.random.normal(0,0.2,151)
         Y = np.array([np.power(abs(y),float(1)/3) * (1,-1)[y<0] for y in Y])
-        Y = np.abs(Y)
+        #Y = np.abs(Y)
         
         import matplotlib.pyplot as plt
         warp_k = GPy.kern.RBF(1)
-        #warp_f = GPy.util.warping_functions.TanhFunction(n_terms=2)
-        warp_f = GPy.util.warping_functions.LogisticFunction(n_terms=2)
+        warp_f = GPy.util.warping_functions.TanhFunction(n_terms=2)
+        #warp_f = GPy.util.warping_functions.LogisticFunction(n_terms=2)
+        #warp_f = GPy.util.warping_functions.LogitFunction(n_terms=1)
         warp_m = GPy.models.WarpedGP(X[:, None], Y[:, None], kernel=warp_k, warping_function=warp_f)
 
         m = GPy.models.GPRegression(X[:, None], Y[:, None])
diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index c62ed08e..548877e5 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -70,6 +70,7 @@ class TanhFunction(WarpingFunction):
         self.link_parameter(self.psi)
         self.link_parameter(self.d)
         self.initial_y = initial_y
+        self.rate = 0.1
 
     def f(self, y):
         """
@@ -94,28 +95,19 @@ class TanhFunction(WarpingFunction):
         """
 
         z = z.copy()
-        
-        if y is None:
-            # The idea here is to initialize y with +1 where
-            # z is positive and -1 where it is negative.
-            # For negative z, Newton-Raphson diverges 
-            # if we initialize y with a positive value (and vice-versa).
-            y = ((z > 0) * 1.) - (z <= 0)
-            if self.initial_y is not None:
-                y *= self.initial_y
+        y = np.ones_like(z)
 
         it = 0
         update = np.inf
-
-        while it == 0 or (np.abs(update).sum() > 1e-10 and it < max_iterations):
+        while np.abs(update).sum() > 1e-10 and it < max_iterations:
             fy = self.f(y)
             fgrady = self.fgrad_y(y)
             update = (fy - z) / fgrady
-            y -= update
+            y -= self.rate * update
             it += 1
         if it == max_iterations:
             print("WARNING!!! Maximum number of iterations reached in f_inv ")
-            print("Sum of updates: %.4f" % np.sum(update))
+            print("Sum of roots: %.4f" % np.sum(fy - z))
         return y
 
     def fgrad_y(self, y, return_precalc=False):
@@ -391,3 +383,135 @@ class LogisticFunction(WarpingFunction):
         self.psi.gradient[:] = warping_grads[:, :-1]
         self.d.gradient[:] = warping_grads[0, -1]
 
+
+class LogitFunction(WarpingFunction):
+    """
+    A sum of logit functions.
+    """
+    def __init__(self, n_terms=1, initial_y=None):
+        """
+        n_terms specifies the number of logistic terms to be used
+        """
+        self.n_terms = n_terms
+        self.num_parameters = 3 * self.n_terms
+        self.psi = np.ones((self.n_terms, 3))
+        super(LogitFunction, self).__init__(name='warp_logit')
+        self.psi = Param('psi', self.psi)
+        self.psi[:, :2].constrain_positive()
+        self.link_parameter(self.psi)
+        self.initial_y = initial_y
+        self.e = 1e-5
+
+    def _logit(self, y):
+        a, b, c = self.psi[0]
+        y += self.e
+        return ((np.log(y) - np.log(a - y)) / b) + c
+
+    def f(self, y):
+        """
+        Transform y with f using parameter vector psi
+        psi = [[a,b,c]]
+
+        :math:`f = (y * d) + \\sum_{terms} a * logistic(b *(y + c))`
+        """
+        z = np.zeros_like(y)
+        for i in xrange(self.n_terms):
+            a, b, c = self.psi[i]
+            z += self._logit(y)
+        return z
+
+    def f_inv(self, z, max_iterations=100, y=None):
+        """
+        calculate the numerical inverse of f
+
+        :param max_iterations: maximum number of N.R. iterations
+        """
+        z = z.copy()
+        if y is None:
+            # The idea here is to initialize y with +1 where
+            # z is positive and -1 where it is negative.
+            # For negative z, Newton-Raphson diverges 
+            # if we initialize y with a positive value (and vice-versa).
+            y = ((z > 0) * 1.) - (z <= 0)
+            if self.initial_y is not None:
+                y *= self.initial_y
+        it = 0
+        update = np.inf
+        while it == 0 or (np.abs(update).sum() > 1e-10 and it < max_iterations):
+            fy = self.f(y)
+            fgrady = self.fgrad_y(y)
+            update = (fy - z) / fgrady
+            y -= update
+            it += 1
+        if it == max_iterations:
+            print("WARNING!!! Maximum number of iterations reached in f_inv ")
+            print("Sum of updates: %.4f" % np.sum(update))
+        return y
+
+    def fgrad_y(self, y, return_precalc=False):
+        """
+        Gradient of f w.r.t to y ([N x 1])
+        This vectorized version calculates all summation terms
+        at the same time (since the grad of a sum is the sum of the grads).
+
+        :returns: Nx1 vector of derivatives, unless return_precalc is true, 
+        then it also returns the precomputed stuff
+        """
+        a, b, c = self.psi[0]
+
+        # vectorized version
+        # term = b * (y + c)
+        y += self.e
+        yb = y * b
+        ab = a * b
+        grad = (1. / yb) + (1. / (ab - yb))
+
+        if return_precalc:
+            return grad, yb, ab
+        return grad
+
+    def fgrad_y_psi(self, y, return_covar_chain=False):
+        """
+        gradient of f w.r.t to y and psi
+
+        :returns: NxIx4 tensor of partial derivatives
+        """
+        grad, yb, ab = self.fgrad_y(y, return_precalc=True)
+        gradients = np.zeros((y.shape[0], y.shape[1], self.n_terms, 3))
+        for i in xrange(self.n_terms):
+            a, b, c  = self.psi[i]
+            gradients[:, :, i, 0] = -b / ((ab - yb) ** 2)
+            yb2 = y * (b ** 2)
+            ab2 = a * (b ** 2)
+            gradients[:, :, i, 1] = - (1. / yb2) - (1. / (ab2 - yb2))
+            #gradients[:, :, i, 2] = 0.0
+
+        if return_covar_chain:
+            covar_grad_chain = np.zeros((y.shape[0], y.shape[1], self.n_terms, 3))
+            for i in xrange(self.n_terms):
+                a, b, c = self.psi[i]
+                covar_grad_chain[:, :, i, 0] = - (1. / (ab - yb))
+                covar_grad_chain[:, :, i, 1] = - (np.log(y) - np.log(a - y)) / (b ** 2)
+                covar_grad_chain[:, :, i, 2] = 1.0
+            return gradients, covar_grad_chain
+
+        return gradients
+
+    def _get_param_names(self):
+        variables = ['a', 'b', 'c']
+        names = sum([['warp_logit_%s_t%i' % (variables[n],q) for n in range(3)] 
+                     for q in range(self.n_terms)],[])
+        names.append('warp_logit')
+        return names
+
+    def update_grads(self, Y_untransformed, Kiy):
+        grad_y = self.fgrad_y(Y_untransformed)
+        grad_y_psi, grad_psi = self.fgrad_y_psi(Y_untransformed,
+                                                return_covar_chain=True)
+        djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
+        dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
+
+        warping_grads = -dquad_dpsi + djac_dpsi
+
+        self.psi.gradient[:] = warping_grads[:, :]
+        

From 7bee3daac83ef91001cfc007959ee787e0b961c7 Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Wed, 2 Mar 2016 16:55:26 +0000
Subject: [PATCH 14/53] cleaning

---
 GPy/testing/model_tests.py    |   5 +-
 GPy/util/warping_functions.py | 281 ----------------------------------
 2 files changed, 1 insertion(+), 285 deletions(-)

diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 2b9add02..83487373 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -332,7 +332,7 @@ class MiscTests(unittest.TestCase):
  
         np.testing.assert_almost_equal(np.exp(preds), warp_preds)
 
-    #@unittest.skip('Comment this to plot the modified sine function')
+    @unittest.skip('Comment this to plot the modified sine function')
     def test_warped_gp_sine(self):
         """
         A test replicating the sine regression problem from
@@ -341,13 +341,10 @@ class MiscTests(unittest.TestCase):
         X = (2 * np.pi) * np.random.random(151) - np.pi
         Y = np.sin(X) + np.random.normal(0,0.2,151)
         Y = np.array([np.power(abs(y),float(1)/3) * (1,-1)[y<0] for y in Y])
-        #Y = np.abs(Y)
         
         import matplotlib.pyplot as plt
         warp_k = GPy.kern.RBF(1)
         warp_f = GPy.util.warping_functions.TanhFunction(n_terms=2)
-        #warp_f = GPy.util.warping_functions.LogisticFunction(n_terms=2)
-        #warp_f = GPy.util.warping_functions.LogitFunction(n_terms=1)
         warp_m = GPy.models.WarpedGP(X[:, None], Y[:, None], kernel=warp_k, warping_function=warp_f)
 
         m = GPy.models.GPRegression(X[:, None], Y[:, None])
diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index 548877e5..f98f0f08 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -234,284 +234,3 @@ class IdentityFunction(WarpingFunction):
     def f_inv(self, z, y=None):
         return z
 
-
-class LogisticFunction(WarpingFunction):
-    """
-    A sum of logistic functions.
-    """
-    def __init__(self, n_terms=3, initial_y=None):
-        """
-        n_terms specifies the number of logistic terms to be used
-        """
-        self.n_terms = n_terms
-        self.num_parameters = 3 * self.n_terms + 1
-        self.psi = np.ones((self.n_terms, 3))
-        super(LogisticFunction, self).__init__(name='warp_logistic')
-        self.psi = Param('psi', self.psi)
-        self.psi[:, :2].constrain_positive()
-        self.d = Param('%s' % ('d'), 1.0, Logexp())
-        self.link_parameter(self.psi)
-        self.link_parameter(self.d)
-        self.initial_y = initial_y
-
-    def _logistic(self, x):
-        return 1. / (1 + np.exp(-x))
-
-    def f(self, y):
-        """
-        Transform y with f using parameter vector psi
-        psi = [[a,b,c]]
-
-        :math:`f = (y * d) + \\sum_{terms} a * logistic(b *(y + c))`
-        """
-        d = self.d
-        mpsi = self.psi
-        z = d * y.copy()
-        for i in xrange(self.n_terms):
-            a, b, c = mpsi[i]
-            z += a * self._logistic(b * (y + c))
-        return z
-
-    def f_inv(self, z, max_iterations=100, y=None):
-        """
-        calculate the numerical inverse of f
-
-        :param max_iterations: maximum number of N.R. iterations
-        """
-        z = z.copy()
-        if y is None:
-            # The idea here is to initialize y with +1 where
-            # z is positive and -1 where it is negative.
-            # For negative z, Newton-Raphson diverges 
-            # if we initialize y with a positive value (and vice-versa).
-            y = ((z > 0) * 1.) - (z <= 0)
-            if self.initial_y is not None:
-                y *= self.initial_y
-        it = 0
-        update = np.inf
-        while it == 0 or (np.abs(update).sum() > 1e-10 and it < max_iterations):
-            fy = self.f(y)
-            fgrady = self.fgrad_y(y)
-            update = (fy - z) / fgrady
-            y -= update
-            it += 1
-        if it == max_iterations:
-            print("WARNING!!! Maximum number of iterations reached in f_inv ")
-            print("Sum of updates: %.4f" % np.sum(update))
-        return y
-
-    def fgrad_y(self, y, return_precalc=False):
-        """
-        Gradient of f w.r.t to y ([N x 1])
-        This vectorized version calculates all summation terms
-        at the same time (since the grad of a sum is the sum of the grads).
-
-        :returns: Nx1 vector of derivatives, unless return_precalc is true, 
-        then it also returns the precomputed stuff
-        """
-        d = self.d
-        mpsi = self.psi
-
-        # vectorized version
-        # term = b * (y + c)
-        term = (mpsi[:, 1] * (y[:, :, None] + mpsi[:, 2])).T
-        logistic_term = self._logistic(term)
-        logistic_grad = logistic_term - (logistic_term ** 2)
-        
-        #S = (mpsi[:,1] * (y[:,:,None] + mpsi[:,2])).T
-        #R = np.tanh(S)
-        #D = 1 - (R ** 2)
-
-        #GRAD = (d + (mpsi[:,0:1][:,:,None] * mpsi[:,1:2][:,:,None] * D).sum(axis=0)).T
-        grad = (d + (mpsi[:, :1][:, : ,None] * mpsi[:, 1:2][:, :, None] * 
-                     logistic_grad).sum(axis=0)).T
-
-        if return_precalc:
-            return grad, logistic_term, logistic_grad
-        return grad
-
-    def fgrad_y_psi(self, y, return_covar_chain=False):
-        """
-        gradient of f w.r.t to y and psi
-
-        :returns: NxIx4 tensor of partial derivatives
-        """
-        mpsi = self.psi
-
-        grad, l_term, l_grad = self.fgrad_y(y, return_precalc=True)
-        gradients = np.zeros((y.shape[0], y.shape[1], self.n_terms, 4))
-        for i in xrange(self.n_terms):
-            a, b, c  = mpsi[i]
-            gradients[:, :, i, 0] = b * l_grad[i].T
-            b2l_term = b - (2 * l_term[i].T)
-            al_grad = a * l_grad[i].T
-            #gradients[:, :, i, 1] = a * (d[i] - 2.0 * s[i] * r[i] * (1.0/np.cosh(s[i])) ** 2).T
-            gradients[:, :, i, 1] = (1 + ((y + c) * b2l_term)) * al_grad
-            #gradients[:, :, i, 2] = (-2.0 * a * (b ** 2) * r[i] * ((1.0 / np.cosh(s[i])) ** 2)).T
-            gradients[:, :, i, 2] = b * b2l_term * al_grad
-        gradients[:, :, 0, 3] = 1.0
-
-        if return_covar_chain:
-            covar_grad_chain = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4))
-            for i in xrange(self.n_terms):
-                a, b, c = mpsi[i]
-                covar_grad_chain[:, :, i, 0] = l_term[i].T
-                al_grad = a * l_grad[i].T
-                covar_grad_chain[:, :, i, 1] = (y + c) * al_grad
-                covar_grad_chain[:, :, i, 2] = b * al_grad
-            covar_grad_chain[:, :, 0, 3] = y
-            return gradients, covar_grad_chain
-
-        return gradients
-
-    def _get_param_names(self):
-        variables = ['a', 'b', 'c', 'd']
-        names = sum([['warp_logistic_%s_t%i' % (variables[n],q) for n in range(3)] 
-                     for q in range(self.n_terms)],[])
-        names.append('warp_logistic')
-        return names
-
-    def update_grads(self, Y_untransformed, Kiy):
-        grad_y = self.fgrad_y(Y_untransformed)
-        grad_y_psi, grad_psi = self.fgrad_y_psi(Y_untransformed,
-                                                return_covar_chain=True)
-        djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
-        dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
-
-        warping_grads = -dquad_dpsi + djac_dpsi
-
-        self.psi.gradient[:] = warping_grads[:, :-1]
-        self.d.gradient[:] = warping_grads[0, -1]
-
-
-class LogitFunction(WarpingFunction):
-    """
-    A sum of logit functions.
-    """
-    def __init__(self, n_terms=1, initial_y=None):
-        """
-        n_terms specifies the number of logistic terms to be used
-        """
-        self.n_terms = n_terms
-        self.num_parameters = 3 * self.n_terms
-        self.psi = np.ones((self.n_terms, 3))
-        super(LogitFunction, self).__init__(name='warp_logit')
-        self.psi = Param('psi', self.psi)
-        self.psi[:, :2].constrain_positive()
-        self.link_parameter(self.psi)
-        self.initial_y = initial_y
-        self.e = 1e-5
-
-    def _logit(self, y):
-        a, b, c = self.psi[0]
-        y += self.e
-        return ((np.log(y) - np.log(a - y)) / b) + c
-
-    def f(self, y):
-        """
-        Transform y with f using parameter vector psi
-        psi = [[a,b,c]]
-
-        :math:`f = (y * d) + \\sum_{terms} a * logistic(b *(y + c))`
-        """
-        z = np.zeros_like(y)
-        for i in xrange(self.n_terms):
-            a, b, c = self.psi[i]
-            z += self._logit(y)
-        return z
-
-    def f_inv(self, z, max_iterations=100, y=None):
-        """
-        calculate the numerical inverse of f
-
-        :param max_iterations: maximum number of N.R. iterations
-        """
-        z = z.copy()
-        if y is None:
-            # The idea here is to initialize y with +1 where
-            # z is positive and -1 where it is negative.
-            # For negative z, Newton-Raphson diverges 
-            # if we initialize y with a positive value (and vice-versa).
-            y = ((z > 0) * 1.) - (z <= 0)
-            if self.initial_y is not None:
-                y *= self.initial_y
-        it = 0
-        update = np.inf
-        while it == 0 or (np.abs(update).sum() > 1e-10 and it < max_iterations):
-            fy = self.f(y)
-            fgrady = self.fgrad_y(y)
-            update = (fy - z) / fgrady
-            y -= update
-            it += 1
-        if it == max_iterations:
-            print("WARNING!!! Maximum number of iterations reached in f_inv ")
-            print("Sum of updates: %.4f" % np.sum(update))
-        return y
-
-    def fgrad_y(self, y, return_precalc=False):
-        """
-        Gradient of f w.r.t to y ([N x 1])
-        This vectorized version calculates all summation terms
-        at the same time (since the grad of a sum is the sum of the grads).
-
-        :returns: Nx1 vector of derivatives, unless return_precalc is true, 
-        then it also returns the precomputed stuff
-        """
-        a, b, c = self.psi[0]
-
-        # vectorized version
-        # term = b * (y + c)
-        y += self.e
-        yb = y * b
-        ab = a * b
-        grad = (1. / yb) + (1. / (ab - yb))
-
-        if return_precalc:
-            return grad, yb, ab
-        return grad
-
-    def fgrad_y_psi(self, y, return_covar_chain=False):
-        """
-        gradient of f w.r.t to y and psi
-
-        :returns: NxIx4 tensor of partial derivatives
-        """
-        grad, yb, ab = self.fgrad_y(y, return_precalc=True)
-        gradients = np.zeros((y.shape[0], y.shape[1], self.n_terms, 3))
-        for i in xrange(self.n_terms):
-            a, b, c  = self.psi[i]
-            gradients[:, :, i, 0] = -b / ((ab - yb) ** 2)
-            yb2 = y * (b ** 2)
-            ab2 = a * (b ** 2)
-            gradients[:, :, i, 1] = - (1. / yb2) - (1. / (ab2 - yb2))
-            #gradients[:, :, i, 2] = 0.0
-
-        if return_covar_chain:
-            covar_grad_chain = np.zeros((y.shape[0], y.shape[1], self.n_terms, 3))
-            for i in xrange(self.n_terms):
-                a, b, c = self.psi[i]
-                covar_grad_chain[:, :, i, 0] = - (1. / (ab - yb))
-                covar_grad_chain[:, :, i, 1] = - (np.log(y) - np.log(a - y)) / (b ** 2)
-                covar_grad_chain[:, :, i, 2] = 1.0
-            return gradients, covar_grad_chain
-
-        return gradients
-
-    def _get_param_names(self):
-        variables = ['a', 'b', 'c']
-        names = sum([['warp_logit_%s_t%i' % (variables[n],q) for n in range(3)] 
-                     for q in range(self.n_terms)],[])
-        names.append('warp_logit')
-        return names
-
-    def update_grads(self, Y_untransformed, Kiy):
-        grad_y = self.fgrad_y(Y_untransformed)
-        grad_y_psi, grad_psi = self.fgrad_y_psi(Y_untransformed,
-                                                return_covar_chain=True)
-        djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
-        dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
-
-        warping_grads = -dquad_dpsi + djac_dpsi
-
-        self.psi.gradient[:] = warping_grads[:, :]
-        

From 6bea908234058b3fe57fd1dc61fa16e19ce74d1a Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Wed, 2 Mar 2016 17:05:22 +0000
Subject: [PATCH 15/53] refactored the numeric inverse into the mother class,
 to test Identity and Log

---
 GPy/testing/model_tests.py    | 21 +++--------
 GPy/util/warping_functions.py | 70 +++++++++++++++++++----------------
 2 files changed, 43 insertions(+), 48 deletions(-)

diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 83487373..351aa210 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -300,12 +300,12 @@ class MiscTests(unittest.TestCase):
         preds = m.predict(self.X)
 
         warp_k = GPy.kern.RBF(1)
-        warp_f = GPy.util.warping_functions.IdentityFunction()
+        warp_f = GPy.util.warping_functions.IdentityFunction(closed_inverse=False)
         warp_m = GPy.models.WarpedGP(self.X, self.Y, kernel=warp_k, warping_function=warp_f)
         warp_m.optimize()
         warp_preds = warp_m.predict(self.X)
  
-        np.testing.assert_almost_equal(preds, warp_preds)
+        np.testing.assert_almost_equal(preds, warp_preds, decimal=4)
 
     def test_warped_gp_log(self):
         """
@@ -316,21 +316,16 @@ class MiscTests(unittest.TestCase):
         Y = np.abs(self.Y)
         logY = np.log(Y)
         m = GPy.models.GPRegression(self.X, logY, kernel=k)
-        #m.optimize()
-        m['Gaussian_noise.variance'] = 1e-4
+        m.optimize()
         preds = m.predict(self.X)[0]
 
         warp_k = GPy.kern.RBF(1)
-        warp_f = GPy.util.warping_functions.LogFunction()
+        warp_f = GPy.util.warping_functions.LogFunction(closed_inverse=False)
         warp_m = GPy.models.WarpedGP(self.X, Y, kernel=warp_k, warping_function=warp_f)
         warp_m.optimize()
-        warp_m['.*'] = 1.0
-        warp_m['Gaussian_noise.variance'] = 1e-4
         warp_preds = warp_m.predict(self.X, median=True)[0]
-        #print np.exp(preds)
-        #print warp_preds
  
-        np.testing.assert_almost_equal(np.exp(preds), warp_preds)
+        np.testing.assert_almost_equal(np.exp(preds), warp_preds, decimal=4)
 
     @unittest.skip('Comment this to plot the modified sine function')
     def test_warped_gp_sine(self):
@@ -354,17 +349,11 @@ class MiscTests(unittest.TestCase):
         print(warp_m['.*warp.*'])
         warp_m.predict_in_warped_space = False
         warp_m.plot()
-        import ipdb; ipdb.set_trace()
         warp_m.predict_in_warped_space = True
         warp_m.plot()
         m.plot()
         warp_f.plot(X.min()-10, X.max()+10)
-
-
-
-
         plt.show()
-
         
 
 class GradientTests(np.testing.TestCase):
diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index f98f0f08..13cda810 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -14,6 +14,7 @@ class WarpingFunction(Parameterized):
 
     def __init__(self, name):
         super(WarpingFunction, self).__init__(name=name)
+        self.rate = 0.1
 
     def f(self, y, psi):
         """function transformation
@@ -29,9 +30,30 @@ class WarpingFunction(Parameterized):
         """gradient of f w.r.t to y"""
         raise NotImplementedError
 
-    def f_inv(self, z, psi):
-        """inverse function transformation"""
-        raise NotImplementedError
+    def f_inv(self, z, max_iterations=100, y=None):
+        """
+        Calculate the numerical inverse of f. This should be
+        overwritten for specific warping functions where the
+        inverse can be found in closed form.
+
+        :param max_iterations: maximum number of N.R. iterations
+        """
+
+        z = z.copy()
+        y = np.ones_like(z)
+
+        it = 0
+        update = np.inf
+        while np.abs(update).sum() > 1e-10 and it < max_iterations:
+            fy = self.f(y)
+            fgrady = self.fgrad_y(y)
+            update = (fy - z) / fgrady
+            y -= self.rate * update
+            it += 1
+        if it == max_iterations:
+            print("WARNING!!! Maximum number of iterations reached in f_inv ")
+            print("Sum of roots: %.4f" % np.sum(fy - z))
+        return y
 
     def _get_param_names(self):
         raise NotImplementedError
@@ -70,7 +92,6 @@ class TanhFunction(WarpingFunction):
         self.link_parameter(self.psi)
         self.link_parameter(self.d)
         self.initial_y = initial_y
-        self.rate = 0.1
 
     def f(self, y):
         """
@@ -87,29 +108,6 @@ class TanhFunction(WarpingFunction):
             z += a * np.tanh(b * (y + c))
         return z
 
-    def f_inv(self, z, max_iterations=100, y=None):
-        """
-        calculate the numerical inverse of f
-
-        :param max_iterations: maximum number of N.R. iterations
-        """
-
-        z = z.copy()
-        y = np.ones_like(z)
-
-        it = 0
-        update = np.inf
-        while np.abs(update).sum() > 1e-10 and it < max_iterations:
-            fy = self.f(y)
-            fgrady = self.fgrad_y(y)
-            update = (fy - z) / fgrady
-            y -= self.rate * update
-            it += 1
-        if it == max_iterations:
-            print("WARNING!!! Maximum number of iterations reached in f_inv ")
-            print("Sum of roots: %.4f" % np.sum(fy - z))
-        return y
-
     def fgrad_y(self, y, return_precalc=False):
         """
         gradient of f w.r.t to y ([N x 1])
@@ -183,12 +181,16 @@ class TanhFunction(WarpingFunction):
 
 class LogFunction(WarpingFunction):
     """
-    Easy wrapper for applying a fixed warping function to
+    Easy wrapper for applying a fixed log warping function to
     positive-only values.
+    The closed_inverse flag should only be set to False for
+    debugging and testing purposes.
     """
-    def __init__(self):
+    def __init__(self, closed_inverse=True):
         self.num_parameters = 0
         super(LogFunction, self).__init__(name='log')
+        if closed_inverse:
+            self.f_inv = self._f_inv
 
     def f(self, y):
         return np.log(y)
@@ -204,7 +206,7 @@ class LogFunction(WarpingFunction):
             return 0, 0
         return 0
 
-    def f_inv(self, z, y=None):
+    def _f_inv(self, z, y=None):
         return np.exp(z)
 
 
@@ -212,10 +214,14 @@ class IdentityFunction(WarpingFunction):
     """
     Identity warping function. This is for testing and sanity check purposes
     and should not be used in practice.
+    The closed_inverse flag should only be set to False for
+    debugging and testing purposes.
     """
-    def __init__(self):
+    def __init__(self, closed_inverse=True):
         self.num_parameters = 0
         super(IdentityFunction, self).__init__(name='identity')
+        if closed_inverse:
+            self.f_inv = self._f_inv
         
     def f(self, y):
         return y
@@ -231,6 +237,6 @@ class IdentityFunction(WarpingFunction):
             return 0, 0
         return 0
         
-    def f_inv(self, z, y=None):
+    def _f_inv(self, z, y=None):
         return z
 

From 2ecd13b08d93448167fc66e5cd3cfff58916083d Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Mon, 7 Mar 2016 10:46:42 +0000
Subject: [PATCH 16/53] moved cubic sine from tests to examples

---
 GPy/examples/regression.py | 26 ++++++++++++++++++++++++++
 GPy/testing/model_tests.py | 29 +----------------------------
 2 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py
index 11734564..b6bc07a0 100644
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@@ -550,3 +550,29 @@ def parametric_mean_function(max_iters=100, optimize=True, plot=True):
     return m
 
 
+def warped_gp_cubic_sine(max_iters=100):
+    """
+    A test replicating the sine regression problem from
+    Snelson's paper.
+    """
+    X = (2 * np.pi) * np.random.random(151) - np.pi
+    Y = np.sin(X) + np.random.normal(0,0.2,151)
+    Y = np.array([np.power(abs(y),float(1)/3) * (1,-1)[y<0] for y in Y])
+        
+    warp_k = GPy.kern.RBF(1)
+    warp_f = GPy.util.warping_functions.TanhFunction(n_terms=2)
+    warp_m = GPy.models.WarpedGP(X[:, None], Y[:, None], kernel=warp_k, warping_function=warp_f)
+
+    m = GPy.models.GPRegression(X[:, None], Y[:, None])
+    m.optimize_restarts(parallel=False, robust=True, num_restarts=5, max_iters=max_iters)
+    warp_m.optimize_restarts(parallel=False, robust=True, num_restarts=5, max_iters=max_iters)
+    print(warp_m)
+    print(warp_m['.*warp.*'])
+    warp_m.predict_in_warped_space = False
+    warp_m.plot(title="Warped GP - Latent space")
+    warp_m.predict_in_warped_space = True
+    warp_m.plot(title="Warped GP - Warped space")
+    m.plot(title="Standard GP")
+    #warp_f.plot(X.min()-10, X.max()+10)
+    warp_m.plot_warping()
+    pb.show()
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 351aa210..5ec64e15 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -311,6 +311,7 @@ class MiscTests(unittest.TestCase):
         """
         A WarpedGP with the log warping function should be
         equal to a standard GP with log labels.
+        Note that we predict the median here.
         """
         k = GPy.kern.RBF(1)
         Y = np.abs(self.Y)
@@ -327,34 +328,6 @@ class MiscTests(unittest.TestCase):
  
         np.testing.assert_almost_equal(np.exp(preds), warp_preds, decimal=4)
 
-    @unittest.skip('Comment this to plot the modified sine function')
-    def test_warped_gp_sine(self):
-        """
-        A test replicating the sine regression problem from
-        Snelson's paper.
-        """
-        X = (2 * np.pi) * np.random.random(151) - np.pi
-        Y = np.sin(X) + np.random.normal(0,0.2,151)
-        Y = np.array([np.power(abs(y),float(1)/3) * (1,-1)[y<0] for y in Y])
-        
-        import matplotlib.pyplot as plt
-        warp_k = GPy.kern.RBF(1)
-        warp_f = GPy.util.warping_functions.TanhFunction(n_terms=2)
-        warp_m = GPy.models.WarpedGP(X[:, None], Y[:, None], kernel=warp_k, warping_function=warp_f)
-
-        m = GPy.models.GPRegression(X[:, None], Y[:, None])
-        m.optimize_restarts(parallel=False, robust=True, num_restarts=5)
-        warp_m.optimize_restarts(parallel=False, robust=True, num_restarts=5)
-        print(warp_m)
-        print(warp_m['.*warp.*'])
-        warp_m.predict_in_warped_space = False
-        warp_m.plot()
-        warp_m.predict_in_warped_space = True
-        warp_m.plot()
-        m.plot()
-        warp_f.plot(X.min()-10, X.max()+10)
-        plt.show()
-        
 
 class GradientTests(np.testing.TestCase):
     def setUp(self):

From 419f2fb284c987cfa19cf3306c6578143430b6f4 Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Mon, 7 Mar 2016 10:53:17 +0000
Subject: [PATCH 17/53] cleaning

---
 GPy/examples/regression.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py
index b6bc07a0..261d5a49 100644
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@@ -552,27 +552,28 @@ def parametric_mean_function(max_iters=100, optimize=True, plot=True):
 
 def warped_gp_cubic_sine(max_iters=100):
     """
-    A test replicating the sine regression problem from
+    A test replicating the cubic sine regression problem from
     Snelson's paper.
     """
     X = (2 * np.pi) * np.random.random(151) - np.pi
     Y = np.sin(X) + np.random.normal(0,0.2,151)
     Y = np.array([np.power(abs(y),float(1)/3) * (1,-1)[y<0] for y in Y])
-        
+    X = X[:, None]
+    Y = Y[:, None]
+
     warp_k = GPy.kern.RBF(1)
     warp_f = GPy.util.warping_functions.TanhFunction(n_terms=2)
-    warp_m = GPy.models.WarpedGP(X[:, None], Y[:, None], kernel=warp_k, warping_function=warp_f)
-
-    m = GPy.models.GPRegression(X[:, None], Y[:, None])
+    warp_m = GPy.models.WarpedGP(X, Y, kernel=warp_k, warping_function=warp_f)
+    m = GPy.models.GPRegression(X, Y)
     m.optimize_restarts(parallel=False, robust=True, num_restarts=5, max_iters=max_iters)
     warp_m.optimize_restarts(parallel=False, robust=True, num_restarts=5, max_iters=max_iters)
     print(warp_m)
     print(warp_m['.*warp.*'])
+
     warp_m.predict_in_warped_space = False
     warp_m.plot(title="Warped GP - Latent space")
     warp_m.predict_in_warped_space = True
     warp_m.plot(title="Warped GP - Warped space")
     m.plot(title="Standard GP")
-    #warp_f.plot(X.min()-10, X.max()+10)
     warp_m.plot_warping()
     pb.show()

From 0d943ac98ec3b4a68e69a9f73acf5370b2445e7b Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Mon, 14 Mar 2016 17:22:23 +0000
Subject: [PATCH 18/53] added a log-tanh function

---
 GPy/models/warped_gp.py       |   5 ++
 GPy/util/warping_functions.py | 106 +++++++++++++++++++++++++++++++++-
 2 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index 419a4104..2e93ddd5 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -5,6 +5,7 @@ import numpy as np
 from ..util.warping_functions import *
 from ..core import GP
 from .. import likelihoods
+from paramz import ObsAr
 from GPy.util.warping_functions import TanhFunction
 from GPy import kern
 
@@ -31,6 +32,10 @@ class WarpedGP(GP):
         GP.__init__(self, X, self.transform_data(), likelihood=likelihood, kernel=kernel)
         self.link_parameter(self.warping_function)
 
+    def set_XY(self, X=None, Y=None):
+        self.Y_untransformed = Y.copy()
+        GP.set_XY(self, X, self.transform_data())
+
     def _scale_data(self, Y):
         self._Ymax = Y.max()
         self._Ymin = Y.min()
diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index 13cda810..ab7e2547 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -59,7 +59,6 @@ class WarpingFunction(Parameterized):
         raise NotImplementedError
 
     def plot(self, xmin, xmax):
-        psi = self.psi
         y = np.arange(xmin, xmax, 0.01)
         f_y = self.f(y)
         from matplotlib import pyplot as plt
@@ -179,6 +178,111 @@ class TanhFunction(WarpingFunction):
         self.d.gradient[:] = warping_grads[0, -1]
 
 
+class LogTanhFunction(WarpingFunction):
+    """
+    Trying to get the best of both worlds...
+    """
+    def __init__(self, n_terms=3, initial_y=None):
+        """
+        n_terms specifies the number of tanh terms to be used
+        """
+        self.n_terms = n_terms
+        self.num_parameters = 3 * self.n_terms + 1
+        self.psi = np.ones((self.n_terms, 3))
+        super(LogTanhFunction, self).__init__(name='warp_tanh')
+        self.psi = Param('psi', self.psi)
+        self.psi[:, :2].constrain_positive()
+        self.d = Param('%s' % ('d'), 1.0, Logexp())
+        self.link_parameter(self.psi)
+        self.link_parameter(self.d)
+        self.initial_y = initial_y
+
+    def f(self, y):
+        """
+        Transform y with f using parameter vector psi
+        psi = [[a,b,c]]
+
+        :math:`f = (y * d) + \\sum_{terms} a * tanh(b *(y + c))`
+        """
+        d = self.d
+        mpsi = self.psi
+        z = d * np.log(y)
+        for i in range(len(mpsi)):
+            a, b, c = mpsi[i]
+            z += a * np.tanh(b * (np.log(y) + c))
+        return z
+
+    def fgrad_y(self, y, return_precalc=False):
+        """
+        gradient of f w.r.t to y ([N x 1])
+
+        :returns: Nx1 vector of derivatives, unless return_precalc is true, 
+        then it also returns the precomputed stuff
+        """
+        d = self.d
+        mpsi = self.psi
+
+        # vectorized version
+        S = (mpsi[:,1] * (np.log(y)[:,:,None] + mpsi[:,2])).T
+        R = np.tanh(S)
+        D = 1 - (R ** 2)
+
+        GRAD = (d + (mpsi[:,0:1][:,:,None] * mpsi[:,1:2][:,:,None] * D).sum(axis=0)).T / y
+
+        if return_precalc:
+            return GRAD, S, R, D
+
+        return GRAD
+
+    def fgrad_y_psi(self, y, return_covar_chain=False):
+        """
+        gradient of f w.r.t to y and psi
+
+        :returns: NxIx4 tensor of partial derivatives
+        """
+        mpsi = self.psi
+
+        w, s, r, d = self.fgrad_y(y, return_precalc=True)
+        gradients = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4))
+        for i in range(len(mpsi)):
+            a,b,c  = mpsi[i]
+            gradients[:, :, i, 0] = (b * (1.0/np.cosh(s[i])) ** 2).T / y
+            gradients[:, :, i, 1] = a * (d[i] - 2.0 * s[i] * r[i] * (1.0/np.cosh(s[i])) ** 2).T / y
+            gradients[:, :, i, 2] = (-2.0 * a * (b ** 2) * r[i] * ((1.0 / np.cosh(s[i])) ** 2)).T / y
+        gradients[:, :, 0, 3] = 1.0 / y
+
+        if return_covar_chain:
+            covar_grad_chain = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4))
+            for i in range(len(mpsi)):
+                a,b,c = mpsi[i]
+                covar_grad_chain[:, :, i, 0] = (r[i]).T
+                covar_grad_chain[:, :, i, 1] = (a * (np.log(y) + c) * ((1.0 / np.cosh(s[i])) ** 2).T)
+                covar_grad_chain[:, :, i, 2] = a * b * ((1.0 / np.cosh(s[i])) ** 2).T
+            covar_grad_chain[:, :, 0, 3] = np.log(y)
+            return gradients, covar_grad_chain
+
+        return gradients
+
+    def _get_param_names(self):
+        variables = ['a', 'b', 'c', 'd']
+        names = sum([['warp_tanh_%s_t%i' % (variables[n],q) for n in range(3)] 
+                     for q in range(self.n_terms)],[])
+        names.append('warp_tanh')
+        return names
+
+    def update_grads(self, Y_untransformed, Kiy):
+        grad_y = self.fgrad_y(Y_untransformed)
+        grad_y_psi, grad_psi = self.fgrad_y_psi(Y_untransformed,
+                                                return_covar_chain=True)
+        djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
+        dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
+
+        warping_grads = -dquad_dpsi + djac_dpsi
+
+        self.psi.gradient[:] = warping_grads[:, :-1]
+        self.d.gradient[:] = warping_grads[0, -1]
+
+
 class LogFunction(WarpingFunction):
     """
     Easy wrapper for applying a fixed log warping function to

From 347bcc963a68242385f1e99bd6668093fbebaf5d Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Mon, 14 Mar 2016 17:40:43 +0000
Subject: [PATCH 19/53] added an exception when you input 0 or negative values
 to logtanh function

---
 GPy/util/warping_functions.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index ab7e2547..c4e8f04c 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -4,6 +4,7 @@
 import numpy as np
 from ..core.parameterization import Parameterized, Param
 from paramz.transformations import Logexp
+import sys
 
 
 class WarpingFunction(Parameterized):
@@ -45,7 +46,11 @@ class WarpingFunction(Parameterized):
         it = 0
         update = np.inf
         while np.abs(update).sum() > 1e-10 and it < max_iterations:
-            fy = self.f(y)
+            try:
+                fy = self.f(y)
+            except ValueError:
+                sys.stderr.write("One of y's is negative! Stopping inverse calculation.\n")
+                break
             fgrady = self.fgrad_y(y)
             update = (fy - z) / fgrady
             y -= self.rate * update
@@ -204,6 +209,9 @@ class LogTanhFunction(WarpingFunction):
 
         :math:`f = (y * d) + \\sum_{terms} a * tanh(b *(y + c))`
         """
+        if (y <= 0.0).any():
+            sys.stderr.write("Negative y found!, raising exception.\n")
+            raise ValueError
         d = self.d
         mpsi = self.psi
         z = d * np.log(y)

From 0b32bdf9a67eeb1c8650b63638cd08ab4bb12fc9 Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Thu, 7 Apr 2016 17:32:35 +0100
Subject: [PATCH 20/53] removed logtanh, put into a new branch

---
 GPy/util/warping_functions.py | 108 ----------------------------------
 1 file changed, 108 deletions(-)

diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index c4e8f04c..7bc0050f 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -183,114 +183,6 @@ class TanhFunction(WarpingFunction):
         self.d.gradient[:] = warping_grads[0, -1]
 
 
-class LogTanhFunction(WarpingFunction):
-    """
-    Trying to get the best of both worlds...
-    """
-    def __init__(self, n_terms=3, initial_y=None):
-        """
-        n_terms specifies the number of tanh terms to be used
-        """
-        self.n_terms = n_terms
-        self.num_parameters = 3 * self.n_terms + 1
-        self.psi = np.ones((self.n_terms, 3))
-        super(LogTanhFunction, self).__init__(name='warp_tanh')
-        self.psi = Param('psi', self.psi)
-        self.psi[:, :2].constrain_positive()
-        self.d = Param('%s' % ('d'), 1.0, Logexp())
-        self.link_parameter(self.psi)
-        self.link_parameter(self.d)
-        self.initial_y = initial_y
-
-    def f(self, y):
-        """
-        Transform y with f using parameter vector psi
-        psi = [[a,b,c]]
-
-        :math:`f = (y * d) + \\sum_{terms} a * tanh(b *(y + c))`
-        """
-        if (y <= 0.0).any():
-            sys.stderr.write("Negative y found!, raising exception.\n")
-            raise ValueError
-        d = self.d
-        mpsi = self.psi
-        z = d * np.log(y)
-        for i in range(len(mpsi)):
-            a, b, c = mpsi[i]
-            z += a * np.tanh(b * (np.log(y) + c))
-        return z
-
-    def fgrad_y(self, y, return_precalc=False):
-        """
-        gradient of f w.r.t to y ([N x 1])
-
-        :returns: Nx1 vector of derivatives, unless return_precalc is true, 
-        then it also returns the precomputed stuff
-        """
-        d = self.d
-        mpsi = self.psi
-
-        # vectorized version
-        S = (mpsi[:,1] * (np.log(y)[:,:,None] + mpsi[:,2])).T
-        R = np.tanh(S)
-        D = 1 - (R ** 2)
-
-        GRAD = (d + (mpsi[:,0:1][:,:,None] * mpsi[:,1:2][:,:,None] * D).sum(axis=0)).T / y
-
-        if return_precalc:
-            return GRAD, S, R, D
-
-        return GRAD
-
-    def fgrad_y_psi(self, y, return_covar_chain=False):
-        """
-        gradient of f w.r.t to y and psi
-
-        :returns: NxIx4 tensor of partial derivatives
-        """
-        mpsi = self.psi
-
-        w, s, r, d = self.fgrad_y(y, return_precalc=True)
-        gradients = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4))
-        for i in range(len(mpsi)):
-            a,b,c  = mpsi[i]
-            gradients[:, :, i, 0] = (b * (1.0/np.cosh(s[i])) ** 2).T / y
-            gradients[:, :, i, 1] = a * (d[i] - 2.0 * s[i] * r[i] * (1.0/np.cosh(s[i])) ** 2).T / y
-            gradients[:, :, i, 2] = (-2.0 * a * (b ** 2) * r[i] * ((1.0 / np.cosh(s[i])) ** 2)).T / y
-        gradients[:, :, 0, 3] = 1.0 / y
-
-        if return_covar_chain:
-            covar_grad_chain = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4))
-            for i in range(len(mpsi)):
-                a,b,c = mpsi[i]
-                covar_grad_chain[:, :, i, 0] = (r[i]).T
-                covar_grad_chain[:, :, i, 1] = (a * (np.log(y) + c) * ((1.0 / np.cosh(s[i])) ** 2).T)
-                covar_grad_chain[:, :, i, 2] = a * b * ((1.0 / np.cosh(s[i])) ** 2).T
-            covar_grad_chain[:, :, 0, 3] = np.log(y)
-            return gradients, covar_grad_chain
-
-        return gradients
-
-    def _get_param_names(self):
-        variables = ['a', 'b', 'c', 'd']
-        names = sum([['warp_tanh_%s_t%i' % (variables[n],q) for n in range(3)] 
-                     for q in range(self.n_terms)],[])
-        names.append('warp_tanh')
-        return names
-
-    def update_grads(self, Y_untransformed, Kiy):
-        grad_y = self.fgrad_y(Y_untransformed)
-        grad_y_psi, grad_psi = self.fgrad_y_psi(Y_untransformed,
-                                                return_covar_chain=True)
-        djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
-        dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
-
-        warping_grads = -dquad_dpsi + djac_dpsi
-
-        self.psi.gradient[:] = warping_grads[:, :-1]
-        self.d.gradient[:] = warping_grads[0, -1]
-
-
 class LogFunction(WarpingFunction):
     """
     Easy wrapper for applying a fixed log warping function to

From 3e27748b429448ab429b6279749d80a85da9f9ca Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Thu, 7 Apr 2016 17:52:59 +0100
Subject: [PATCH 21/53] replicated the cubic sine example into warped_gp tests
 for code coverage

---
 GPy/examples/regression.py |  4 ++++
 GPy/testing/model_tests.py | 22 ++++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py
index 261d5a49..19919f97 100644
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@@ -564,9 +564,13 @@ def warped_gp_cubic_sine(max_iters=100):
     warp_k = GPy.kern.RBF(1)
     warp_f = GPy.util.warping_functions.TanhFunction(n_terms=2)
     warp_m = GPy.models.WarpedGP(X, Y, kernel=warp_k, warping_function=warp_f)
+    warp_m['.*\.d'].constrain_fixed(1.0)
     m = GPy.models.GPRegression(X, Y)
     m.optimize_restarts(parallel=False, robust=True, num_restarts=5, max_iters=max_iters)
     warp_m.optimize_restarts(parallel=False, robust=True, num_restarts=5, max_iters=max_iters)
+    #m.optimize(max_iters=max_iters)
+    #warp_m.optimize(max_iters=max_iters)
+
     print(warp_m)
     print(warp_m['.*warp.*'])
 
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 5ec64e15..c0316cf1 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -328,6 +328,28 @@ class MiscTests(unittest.TestCase):
  
         np.testing.assert_almost_equal(np.exp(preds), warp_preds, decimal=4)
 
+    def test_warped_gp_cubic_sine(self, max_iters=100):
+        """
+        A test replicating the cubic sine regression problem from
+        Snelson's paper. This test doesn't have any assertions, it's
+        just to ensure coverage of the tanh warping function code.
+        """
+        X = (2 * np.pi) * np.random.random(151) - np.pi
+        Y = np.sin(X) + np.random.normal(0,0.2,151)
+        Y = np.array([np.power(abs(y),float(1)/3) * (1,-1)[y<0] for y in Y])
+        X = X[:, None]
+        Y = Y[:, None]
+
+        warp_k = GPy.kern.RBF(1)
+        warp_f = GPy.util.warping_functions.TanhFunction(n_terms=2)
+        warp_m = GPy.models.WarpedGP(X, Y, kernel=warp_k, warping_function=warp_f)
+        warp_m['.*\.d'].constrain_fixed(1.0)
+        warp_m.optimize_restarts(parallel=False, robust=False, num_restarts=5, 
+                                 max_iters=max_iters)
+        print warp_m
+        warp_m.predict(X)
+
+
 
 class GradientTests(np.testing.TestCase):
     def setUp(self):

From ff700bbebf055bb842c8fc890085e591f465461b Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Thu, 7 Apr 2016 18:07:13 +0100
Subject: [PATCH 22/53] changed imports to relative ones

---
 GPy/models/warped_gp.py            | 5 +++--
 GPy/plotting/gpy_plot/plot_util.py | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index 2e93ddd5..aa9e8c90 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -2,11 +2,12 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
-from ..util.warping_functions import *
+#from ..util.warping_functions import *
 from ..core import GP
 from .. import likelihoods
 from paramz import ObsAr
-from GPy.util.warping_functions import TanhFunction
+#from GPy.util.warping_functions import TanhFunction
+from ..util.warping_functions import TanhFunction
 from GPy import kern
 
 class WarpedGP(GP):
diff --git a/GPy/plotting/gpy_plot/plot_util.py b/GPy/plotting/gpy_plot/plot_util.py
index 237e87cd..74e190d9 100644
--- a/GPy/plotting/gpy_plot/plot_util.py
+++ b/GPy/plotting/gpy_plot/plot_util.py
@@ -31,7 +31,7 @@
 import numpy as np
 from scipy import sparse
 import itertools
-from GPy.models import WarpedGP
+from ...models import WarpedGP
 
 def in_ipynb():
     try:

From 2d9bdfdbfb9eaa552134c8bf79a84e5b2b1027da Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Thu, 7 Apr 2016 18:41:48 +0100
Subject: [PATCH 23/53] added tests for closed inverse in identity and log

---
 GPy/testing/model_tests.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index c0316cf1..144c6adf 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -301,11 +301,20 @@ class MiscTests(unittest.TestCase):
 
         warp_k = GPy.kern.RBF(1)
         warp_f = GPy.util.warping_functions.IdentityFunction(closed_inverse=False)
-        warp_m = GPy.models.WarpedGP(self.X, self.Y, kernel=warp_k, warping_function=warp_f)
+        warp_m = GPy.models.WarpedGP(self.X, self.Y, kernel=warp_k, 
+                                     warping_function=warp_f)
         warp_m.optimize()
         warp_preds = warp_m.predict(self.X)
+
+        warp_k_exact = GPy.kern.RBF(1)
+        warp_f_exact = GPy.util.warping_functions.IdentityFunction()
+        warp_m_exact = GPy.models.WarpedGP(self.X, self.Y, kernel=warp_k_exact, 
+                                           warping_function=warp_f_exact)
+        warp_m_exact.optimize()
+        warp_preds_exact = warp_m_exact.predict(self.X)
  
         np.testing.assert_almost_equal(preds, warp_preds, decimal=4)
+        np.testing.assert_almost_equal(preds, warp_preds_exact, decimal=4)
 
     def test_warped_gp_log(self):
         """
@@ -322,11 +331,20 @@ class MiscTests(unittest.TestCase):
 
         warp_k = GPy.kern.RBF(1)
         warp_f = GPy.util.warping_functions.LogFunction(closed_inverse=False)
-        warp_m = GPy.models.WarpedGP(self.X, Y, kernel=warp_k, warping_function=warp_f)
+        warp_m = GPy.models.WarpedGP(self.X, Y, kernel=warp_k, 
+                                     warping_function=warp_f)
         warp_m.optimize()
         warp_preds = warp_m.predict(self.X, median=True)[0]
+
+        warp_k_exact = GPy.kern.RBF(1)
+        warp_f_exact = GPy.util.warping_functions.LogFunction()
+        warp_m_exact = GPy.models.WarpedGP(self.X, Y, kernel=warp_k_exact, 
+                                           warping_function=warp_f_exact)
+        warp_m_exact.optimize(messages=True)
+        warp_preds_exact = warp_m_exact.predict(self.X, median=True)[0]
  
         np.testing.assert_almost_equal(np.exp(preds), warp_preds, decimal=4)
+        np.testing.assert_almost_equal(np.exp(preds), warp_preds_exact, decimal=4)
 
     def test_warped_gp_cubic_sine(self, max_iters=100):
         """

From e17ee1078bf0b08f159164ee8461a1c4ffdb1e25 Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Thu, 7 Apr 2016 18:44:55 +0100
Subject: [PATCH 24/53] removed the check on f(y), it was only useful in
 logtanh

---
 GPy/util/warping_functions.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index 7bc0050f..6b8b73e9 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -46,11 +46,7 @@ class WarpingFunction(Parameterized):
         it = 0
         update = np.inf
         while np.abs(update).sum() > 1e-10 and it < max_iterations:
-            try:
-                fy = self.f(y)
-            except ValueError:
-                sys.stderr.write("One of y's is negative! Stopping inverse calculation.\n")
-                break
+            fy = self.f(y)
             fgrady = self.fgrad_y(y)
             update = (fy - z) / fgrady
             y -= self.rate * update

From 28ee0f7aca87ecdb37714ffaa8ad71945099ddd8 Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Mon, 11 Apr 2016 16:22:19 +0100
Subject: [PATCH 25/53] improving coverage and removing py2 print

---
 GPy/testing/model_tests.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 9803c62f..ef6e34cb 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -419,14 +419,16 @@ class MiscTests(unittest.TestCase):
         X = X[:, None]
         Y = Y[:, None]
 
-        warp_k = GPy.kern.RBF(1)
-        warp_f = GPy.util.warping_functions.TanhFunction(n_terms=2)
-        warp_m = GPy.models.WarpedGP(X, Y, kernel=warp_k, warping_function=warp_f)
+        warp_m = GPy.models.WarpedGP(X, Y)#, kernel=warp_k)#, warping_function=warp_f)
         warp_m['.*\.d'].constrain_fixed(1.0)
         warp_m.optimize_restarts(parallel=False, robust=False, num_restarts=5, 
                                  max_iters=max_iters)
-        print warp_m
         warp_m.predict(X)
+        warp_m.predict_quantiles(X)
+        warp_m.log_predictive_density(X, Y)
+        warp_m.predict_in_warped_space = False
+        warp_m.predict(X)
+        warp_m.predict_quantiles(X)
 
 
 class GradientTests(np.testing.TestCase):

From c701fb9f093f47e319b37ae3c34d729c6c7b2437 Mon Sep 17 00:00:00 2001
From: Michael T Smith <lionfishy@gmail.com>
Date: Wed, 3 Aug 2016 14:55:23 +0100
Subject: [PATCH 26/53] Adding refs to new clustering and offset model and
 utility

---
 GPy/models/__init__.py | 2 ++
 GPy/util/__init__.py   | 1 +
 2 files changed, 3 insertions(+)

diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py
index 654b1938..6d19d8a5 100644
--- a/GPy/models/__init__.py
+++ b/GPy/models/__init__.py
@@ -26,3 +26,5 @@ from .dpgplvm import DPBayesianGPLVM
 from .state_space_model import StateSpace
 
 from .ibp_lfm import IBPLFM
+
+from .gp_offset_regression import GPOffsetRegression
diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py
index fb1eb0d6..685551fd 100644
--- a/GPy/util/__init__.py
+++ b/GPy/util/__init__.py
@@ -16,3 +16,4 @@ from . import initialization
 from . import multioutput
 from . import parallel
 from . import functions
+from . import cluster_with_offset

From 132c8d92b2badabeac3e6167f433e9bfb726641c Mon Sep 17 00:00:00 2001
From: Michael T Smith <lionfishy@gmail.com>
Date: Wed, 3 Aug 2016 15:27:49 +0100
Subject: [PATCH 27/53] Offset model and clustering utility

---
 GPy/models/gp_offset_regression.py |  92 +++++++++++++++
 GPy/util/cluster_with_offset.py    | 180 +++++++++++++++++++++++++++++
 2 files changed, 272 insertions(+)
 create mode 100644 GPy/models/gp_offset_regression.py
 create mode 100644 GPy/util/cluster_with_offset.py

diff --git a/GPy/models/gp_offset_regression.py b/GPy/models/gp_offset_regression.py
new file mode 100644
index 00000000..3699223e
--- /dev/null
+++ b/GPy/models/gp_offset_regression.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2012 - 2014 the GPy Austhors (see AUTHORS.txt)
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+# Written by Mike Smith. michaeltsmith.org.uk
+
+import numpy as np
+from ..core import GP
+from .. import likelihoods
+from .. import kern
+from ..core import Param
+
+class GPOffsetRegression(GP):
+    """
+    Gaussian Process model for offset regression
+
+    :param X: input observations, we assume for this class that this has one dimension of actual inputs and the last dimension should be the index of the cluster (so X should be Nx2)
+    :param Y: observed values (Nx1?)
+    :param kernel: a GPy kernel, defaults to rbf
+    :param Norm normalizer: [False]
+    :param noise_var: the noise variance for Gaussian likelhood, defaults to 1.
+
+        Normalize Y with the norm given.
+        If normalizer is False, no normalization will be done
+        If it is None, we use GaussianNorm(alization)
+
+    .. Note:: Multiple independent outputs are allowed using columns of Y
+
+    """
+
+    def __init__(self, X, Y, kernel=None, Y_metadata=None, normalizer=None, noise_var=1., mean_function=None):
+
+        assert X.shape[1]>1, "Need at least two input dimensions, as last dimension is the label of the cluster"
+        if kernel is None:
+            kernel = kern.RBF(X.shape[1]-1)
+
+        #self._log_marginal_likelihood = np.nan #todo
+        
+        likelihood = likelihoods.Gaussian(variance=noise_var)
+        self.X_fixed = X[:,:-1]
+        self.selected = np.array([int(x) for x in X[:,-1]])
+        
+        
+        super(GPOffsetRegression, self).__init__(X, Y, kernel, likelihood, name='GP offset regression', Y_metadata=Y_metadata, normalizer=normalizer, mean_function=mean_function)
+        maxcluster = np.max(self.selected)
+        self.offset = Param('offset', np.zeros(maxcluster))
+        #self.offset.set_prior(...)
+        self.link_parameter(self.offset)
+        
+    #def dr_doffset(self, X, sel): #how much r changes wrt the offset hyperparameters
+        
+    #def dL_doffset(self, X, sel):
+    #    dL_dr = self.dK_dr_via_X(X, X) * dL_dK
+        
+        
+        
+    def dr_doffset(self,X,sel,delta):
+        #given an input matrix, X and the offsets (delta)
+        #finds dr/dDelta
+        #returns them as a list, one for each offset (delta).        
+        #get the input values
+
+        #a matrix G represents the effect of increasing the offset on the radius passed to the kernel for each input. For example
+        #what effect will increasing offset 4 have on the kernel output of inputs 5 and 8? Answer: Gs[4][5,8]... (positive or negative)
+        Gs = []
+        for i,d in enumerate(delta):
+            #X[sel==(i+1)]-=d
+            G = np.repeat(np.array(sel==(i+1))[:,None]*1,len(X),axis=1) - np.repeat(np.array(sel==(i+1))[None,:]*1,len(X),axis=0)
+            Gs.append(G)
+        #does subtracting the two Xs end up positive or negative (if negative we need to flip the sign in G).
+        w = np.repeat(X,len(X),axis=1) - np.repeat(X.T,len(X),axis=0)
+        dr_doffsets = []
+        for i,d in enumerate(delta):
+            dr_doffset = np.sign(w * Gs[i])
+            #print "dr_doffset %d" % i
+            #print dr_doffset
+            #print Gs[i]
+            #print w
+            dr_doffsets.append(dr_doffset)
+        return dr_doffsets
+        
+    def parameters_changed(self):
+        offsets = np.hstack([0.0,self.offset.values])[:,None]
+        
+        self.X = self.X_fixed - offsets[self.selected]
+        super(GPOffsetRegression, self).parameters_changed()
+        
+        dL_dr = self.kern.dK_dr_via_X(self.X, self.X) * self.grad_dict['dL_dK']
+        
+        dr_doff = self.dr_doffset(self.X,self.selected,self.offset.values)
+        for i in range(len(dr_doff)):
+            dL_doff = dL_dr * dr_doff[i]
+            self.offset.gradient[i] = -np.sum(dL_doff)
+        
diff --git a/GPy/util/cluster_with_offset.py b/GPy/util/cluster_with_offset.py
new file mode 100644
index 00000000..103bdaca
--- /dev/null
+++ b/GPy/util/cluster_with_offset.py
@@ -0,0 +1,180 @@
+# Copyright (c) 2016, Mike Smith
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import GPy
+import numpy as np
+
+def get_log_likelihood(inputs,data,clust):
+    """Get the LL of a combined set of clusters, ignoring time series offsets.
+    
+    Get the log likelihood of a cluster without worrying about the fact
+    different time series are offset. We're using it here really for those
+    cases in which we only have one cluster to get the loglikelihood of.
+    
+    arguments:
+    inputs -- the 'X's in a list, one item per cluster
+    data -- the 'Y's in a list, one item per cluster
+    clust -- list of clusters to use
+    
+    returns a tuple:
+    log likelihood and the offset (which is always zero for this model)
+    """
+ 
+    S = data[0].shape[0] #number of time series
+    
+    #build a new dataset from the clusters, by combining all clusters together
+    X = np.zeros([0,1])
+    Y = np.zeros([0,S])
+    
+    #for each person in the cluster,
+    #add their inputs and data to the new dataset
+    for p in clust:
+        X = np.vstack([X,inputs[p]])
+        Y = np.vstack([Y,data[p].T])
+        
+    #find the loglikelihood. We just add together the LL for each time series.
+    #ll=0
+    #for s in range(S):
+    #    m = GPy.models.GPRegression(X,Y[:,s][:,None])
+    #    m.optimize()
+    #    ll+=m.log_likelihood()
+
+    m = GPy.models.GPRegression(X,Y)
+    m.optimize()
+    ll=m.log_likelihood()    
+    return ll,0
+
+def get_log_likelihood_offset(inputs,data,clust):
+    """Get the log likelihood of a combined set of clusters, fitting the offsets
+    
+    arguments:
+    inputs -- the 'X's in a list, one item per cluster
+    data -- the 'Y's in a list, one item per cluster
+    clust -- list of clusters to use
+    
+    returns a tuple:
+    log likelihood and the offset
+    """    
+        
+    #if we've only got one cluster, the model has an error, so we want to just
+    #use normal GPRegression.
+    if len(clust)==1: 
+        return get_log_likelihood(inputs,data,clust)
+                
+    S = data[0].shape[0] #number of time series
+        
+    X = np.zeros([0,2]) #notice the extra column, this is for the cluster index
+    Y = np.zeros([0,S])
+    
+    #for each person in the cluster, add their inputs and data to the new
+    #dataset. Note we add an index identifying which person is which data point.
+    #This is for the offset model to use, to allow it to know which data points
+    #to shift.
+    for i,p in enumerate(clust):
+        idx = i*np.ones([inputs[p].shape[0],1])
+        X = np.vstack([X,np.hstack([inputs[p],idx])])
+        Y = np.vstack([Y,data[p].T])
+        
+    m = GPy.models.GPOffsetRegression(X,Y)
+    #TODO: How to select a sensible prior?
+    m.offset.set_prior(GPy.priors.Gaussian(0,20)) 
+    #TODO: Set a sensible start value for the length scale,
+    #make it long to help the offset fit.
+    
+    m.optimize()
+    
+    ll = m.log_likelihood()
+    offset = m.offset.values[0]
+    return ll,offset
+
+def cluster(data,inputs,verbose=False):
+    """Clusters data
+    
+    Using the new offset model, this method uses a greedy algorithm to cluster
+    the data. It starts with all the data points in separate clusters and tests
+    whether combining them increases the overall log-likelihood (LL). It then
+    iteratively joins pairs of clusters which cause the greatest increase in
+    the LL, until no join increases the LL.
+    
+    arguments:
+    inputs -- the 'X's in a list, one item per cluster
+    data -- the 'Y's in a list, one item per cluster
+    
+    returns a list of the clusters.    
+    """
+    N=len(data)
+    
+    
+    #Define a set of N active cluster
+    active = []
+    for p in range(0,N):
+        active.append([p])
+
+    loglikes = np.zeros(len(active))
+    loglikes[:] = None
+
+    pairloglikes = np.zeros([len(active),len(active)])
+    pairloglikes[:] = None
+    pairoffset = np.zeros([len(active),len(active)])
+
+    it = 0
+    while True:
+    
+        if verbose:
+            it +=1
+            print "Iteration %d" % it
+        
+        #Compute the log-likelihood of each cluster (add them together)
+        for clusti in range(len(active)):
+            if np.isnan(loglikes[clusti]):
+                loglikes[clusti], unused_offset = get_log_likelihood_offset(inputs,data,[clusti])
+
+            #try combining with each other cluster...
+            for clustj in range(clusti): #count from 0 to clustj-1
+                temp = [clusti,clustj]
+                if np.isnan(pairloglikes[clusti,clustj]):
+                    pairloglikes[clusti,clustj],pairoffset[clusti,clustj] = get_log_likelihood_offset(inputs,data,temp)
+
+        seploglikes = np.repeat(loglikes[:,None].T,len(loglikes),0)+np.repeat(loglikes[:,None],len(loglikes),1)
+        loglikeimprovement = pairloglikes - seploglikes #how much likelihood improves with clustering
+        top = np.unravel_index(np.nanargmax(pairloglikes-seploglikes), pairloglikes.shape)
+
+        #if loglikeimprovement.shape[0]<3:
+        # #no more clustering to do - this shouldn't happen really unless
+        # #we've set the threshold to apply clustering to less than 0
+        #    break 
+        
+        #if theres further clustering to be done...
+        if loglikeimprovement[top[0],top[1]]>0:
+            active[top[0]].extend(active[top[1]])
+            offset=pairoffset[top[0],top[1]]
+            inputs[top[0]] = np.vstack([inputs[top[0]],inputs[top[1]]-offset])
+            data[top[0]] = np.hstack([data[top[0]],data[top[1]]])
+            del inputs[top[1]]
+            del data[top[1]]
+            del active[top[1]]
+
+            #None = message to say we need to recalculate
+            pairloglikes[:,top[0]] = None 
+            pairloglikes[top[0],:] = None 
+            pairloglikes = np.delete(pairloglikes,top[1],0)
+            pairloglikes = np.delete(pairloglikes,top[1],1)
+            loglikes[top[0]] = None
+            loglikes = np.delete(loglikes,top[1])
+        else:
+            break
+            
+        #if loglikeimprovement[top[0],top[1]]>0:
+        #    print "joined"
+        #    print top
+        #    print offset
+        #    print offsets
+        #    print offsets[top[1]]-offsets[top[0]]
+
+    #TODO Add a way to return the offsets applied to all the time series
+    return active 
+
+#starttime = time.time()
+#active = cluster(data,inputs)
+#endtime = time.time()
+#print "TOTAL TIME %0.4f" % (endtime-starttime)

From ea8b7321815ea07ef0a1688668ed03c6b55514f0 Mon Sep 17 00:00:00 2001
From: Michael T Smith <lionfishy@gmail.com>
Date: Wed, 3 Aug 2016 17:49:01 +0100
Subject: [PATCH 28/53] Corrected v2 missing print brackets. Added test code
 for new model and util

---
 GPy/testing/model_tests.py      | 18 ++++++++++++
 GPy/testing/util_tests.py       | 49 +++++++++++++++++++++++++++++++++
 GPy/util/cluster_with_offset.py |  2 +-
 3 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index e4411e23..ff137299 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -405,6 +405,24 @@ class MiscTests(unittest.TestCase):
         warp_m.plot()
         warp_f.plot(X.min()-10, X.max()+10)
         plt.show()
+        
+    def test_offset_regression(self):
+        #Tests GPy.models.GPOffsetRegression. Using two small time series
+        #from a sine wave, we confirm the algorithm determines that the
+        #likelihood is maximised when the offset hyperparameter is approximately
+        #equal to the actual offset in X between the two time series.
+        offset = 3
+        X1 = np.arange(0,50,5.0)[:,None]
+        X2 = np.arange(0+offset,50+offset,5.0)[:,None]
+        X = np.vstack([X1,X2])
+        ind = np.vstack([np.zeros([10,1]),np.ones([10,1])])
+        X = np.hstack([X,ind])
+        Y = np.sin((X[0:10,0])/30.0)[:,None]
+        Y = np.vstack([Y,Y])
+
+        m = GPy.models.GPOffsetRegression(X,Y)
+        m.optimize()
+        assert np.abs(m.offset[0]-offset)<0.1, "GPOffsetRegression model failing to estimate correct offset."
 
 
 
diff --git a/GPy/testing/util_tests.py b/GPy/testing/util_tests.py
index 3c6241f3..e82f7c1a 100644
--- a/GPy/testing/util_tests.py
+++ b/GPy/testing/util_tests.py
@@ -107,3 +107,52 @@ class TestDebug(unittest.TestCase):
         self.assertTrue(d[tuple(X[:,4])] == d[tuple(X[:,0])] == [0, 4, 5])
         self.assertTrue(d[tuple(X[:,1])] == [1])
 
+    def test_offset_cluster(self):
+        #Tests the GPy.util.cluster_with_offset.cluster utility with a small
+        #test data set. Not using random noise just in case it occasionally
+        #causes it not to cluster correctly.
+        #groundtruth cluster identifiers are: [0,1,1,0]
+        
+        #data contains a list of the four sets of time series (3 per data point)      
+
+        data = [np.array([[ 2.18094245,  1.96529789,  2.00265523,  2.18218742,  2.06795428],
+                [ 1.62254829,  1.75748448,  1.83879347,  1.87531326,  1.52503496],
+                [ 1.54589609,  1.61607914,  2.00463192,  1.48771394,  1.63339218]]),
+         np.array([[ 2.86766106,  2.97953437,  2.91958876,  2.92510506,  3.03239241],
+                [ 2.57368423,  2.59954886,  3.10000395,  2.75806125,  2.89865704],
+                [ 2.58916318,  2.53698259,  2.63858411,  2.63102504,  2.51853901]]),
+         np.array([[ 2.77834168,  2.9618564 ,  2.88482141,  3.24259745,  2.9716821 ],
+                [ 2.60675576,  2.67095624,  2.94824436,  2.80520631,  2.87247516],
+                [ 2.49543562,  2.5492281 ,  2.6505866 ,  2.65015308,  2.59738616]]),
+         np.array([[ 1.76783086,  2.21666738,  2.07939706,  1.9268263 ,  2.23360121],
+                [ 1.94305547,  1.94648592,  2.1278921 ,  2.09481457,  2.08575238],
+                [ 1.69336013,  1.72285186,  1.6339506 ,  1.61212022,  1.39198698]])]
+
+        #inputs contains their associated X values
+        
+        inputs = [np.array([[ 0.        ],
+                [ 0.68040097],
+                [ 1.20316795],
+                [ 1.798749  ],
+                [ 2.14891733]]), np.array([[ 0.        ],
+                [ 0.51910637],
+                [ 0.98259352],
+                [ 1.57442965],
+                [ 1.82515098]]), np.array([[ 0.        ],
+                [ 0.66645478],
+                [ 1.59464591],
+                [ 1.69769551],
+                [ 1.80932752]]), np.array([[ 0.        ],
+                [ 0.87512108],
+                [ 1.71881079],
+                [ 2.67162871],
+                [ 3.23761907]])]
+            
+        #try doing the clustering
+        active = GPy.util.cluster_with_offset.cluster(data,inputs)
+
+        #check to see that the clustering has correctly clustered the time series.
+        from sets import Set
+        clusters = Set([Set(cluster) for cluster in active])
+        assert Set([1,2]) in clusters, "Offset Clustering algorithm failed"
+        assert Set([0,3]) in clusters, "Offset Clustering algoirthm failed"
diff --git a/GPy/util/cluster_with_offset.py b/GPy/util/cluster_with_offset.py
index 103bdaca..fa8f1be8 100644
--- a/GPy/util/cluster_with_offset.py
+++ b/GPy/util/cluster_with_offset.py
@@ -122,7 +122,7 @@ def cluster(data,inputs,verbose=False):
     
         if verbose:
             it +=1
-            print "Iteration %d" % it
+            print("Iteration %d" % it)
         
         #Compute the log-likelihood of each cluster (add them together)
         for clusti in range(len(active)):

From 16bc77a1edeeadeb377f794707e4752a4827630e Mon Sep 17 00:00:00 2001
From: Michael T Smith <lionfishy@gmail.com>
Date: Wed, 3 Aug 2016 18:12:56 +0100
Subject: [PATCH 29/53] More useful message from testing re offset estimate

---
 GPy/testing/model_tests.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index ff137299..e2c05aec 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -421,8 +421,9 @@ class MiscTests(unittest.TestCase):
         Y = np.vstack([Y,Y])
 
         m = GPy.models.GPOffsetRegression(X,Y)
+        assert m.checkgrad(), "Gradients of offset parameters don't match numerical approximations."
         m.optimize()
-        assert np.abs(m.offset[0]-offset)<0.1, "GPOffsetRegression model failing to estimate correct offset."
+        assert np.abs(m.offset[0]-offset)<0.1, ("GPOffsetRegression model failing to estimate correct offset (value estimated = %0.2f instead of %0.2f)" % (m.offset[0], offset))
 
 
 

From 04bcf56de12ed0da215a12e4d4edb450ac216dea Mon Sep 17 00:00:00 2001
From: Michael T Smith <lionfishy@gmail.com>
Date: Wed, 3 Aug 2016 18:28:08 +0100
Subject: [PATCH 30/53] Added GPy import

---
 GPy/testing/util_tests.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/GPy/testing/util_tests.py b/GPy/testing/util_tests.py
index e82f7c1a..ce0beaaa 100644
--- a/GPy/testing/util_tests.py
+++ b/GPy/testing/util_tests.py
@@ -29,6 +29,7 @@
 #===============================================================================
 
 import unittest, numpy as np
+import GPy
 
 class TestDebug(unittest.TestCase):
     def test_checkFinite(self):

From ebe4a496a65d27ea8cb0289e1008ffac17ab3c98 Mon Sep 17 00:00:00 2001
From: Michael T Smith <lionfishy@gmail.com>
Date: Thu, 4 Aug 2016 09:01:27 +0100
Subject: [PATCH 31/53] Corrected mistake in gradients: Was finding
 d(Xi-Xj)/dOffset instead of dr/dOffset. Fixed by scaling by kernel
 lengthscale.

---
 GPy/models/gp_offset_regression.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/GPy/models/gp_offset_regression.py b/GPy/models/gp_offset_regression.py
index 3699223e..2fd264f4 100644
--- a/GPy/models/gp_offset_regression.py
+++ b/GPy/models/gp_offset_regression.py
@@ -75,6 +75,10 @@ class GPOffsetRegression(GP):
             #print Gs[i]
             #print w
             dr_doffsets.append(dr_doffset)
+            
+        #lastly we need to divide by the lengthscale: So far we've found d(X_i - X_j)/dOffsets
+        #we want dr/dOffsets. (X_i - X_j)/lengthscale = r
+        dr_doffsets /= self.kern.lengthscale 
         return dr_doffsets
         
     def parameters_changed(self):

From 97f5ca6b8439fca8f365e4d9e5f23717f5cd4624 Mon Sep 17 00:00:00 2001
From: Michael T Smith <lionfishy@gmail.com>
Date: Thu, 4 Aug 2016 09:26:40 +0100
Subject: [PATCH 32/53] Modified set code in test to work with python 2 and
 python 3.

---
 GPy/testing/util_tests.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/GPy/testing/util_tests.py b/GPy/testing/util_tests.py
index ce0beaaa..ba3d7ddf 100644
--- a/GPy/testing/util_tests.py
+++ b/GPy/testing/util_tests.py
@@ -148,12 +148,10 @@ class TestDebug(unittest.TestCase):
                 [ 1.71881079],
                 [ 2.67162871],
                 [ 3.23761907]])]
-            
+                    
         #try doing the clustering
         active = GPy.util.cluster_with_offset.cluster(data,inputs)
-
         #check to see that the clustering has correctly clustered the time series.
-        from sets import Set
-        clusters = Set([Set(cluster) for cluster in active])
-        assert Set([1,2]) in clusters, "Offset Clustering algorithm failed"
-        assert Set([0,3]) in clusters, "Offset Clustering algoirthm failed"
+        clusters = set([frozenset(cluster) for cluster in active])
+        assert set([1,2]) in clusters, "Offset Clustering algorithm failed"
+        assert set([0,3]) in clusters, "Offset Clustering algoirthm failed"

From ad2779bdf3547b13656daed67b66284cb5652614 Mon Sep 17 00:00:00 2001
From: Michael T Smith <lionfishy@gmail.com>
Date: Thu, 4 Aug 2016 10:11:30 +0100
Subject: [PATCH 33/53] made initial lengthscale!=1 to ensure we're properly
 testing gradients

---
 GPy/testing/model_tests.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index e2c05aec..5d6de7ae 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -421,6 +421,7 @@ class MiscTests(unittest.TestCase):
         Y = np.vstack([Y,Y])
 
         m = GPy.models.GPOffsetRegression(X,Y)
+        m.rbf.lengthscale=5.0 #make it something other than one to check our gradients properly!
         assert m.checkgrad(), "Gradients of offset parameters don't match numerical approximations."
         m.optimize()
         assert np.abs(m.offset[0]-offset)<0.1, ("GPOffsetRegression model failing to estimate correct offset (value estimated = %0.2f instead of %0.2f)" % (m.offset[0], offset))

From e019b17cf965a7f673326140767dbc48c4332e32 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-18-106.eu-west-1.compute.internal>
Date: Thu, 4 Aug 2016 12:31:36 +0000
Subject: [PATCH 34/53] Added threaded option - but this doesn't work due to
 the global interpreter lock

---
 GPy/util/__init__.py                     |   1 +
 GPy/util/cluster_with_offset.py          |   4 +
 GPy/util/threaded_cluster_with_offset.py | 206 +++++++++++++++++++++++
 3 files changed, 211 insertions(+)
 create mode 100644 GPy/util/threaded_cluster_with_offset.py

diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py
index 685551fd..136e0d3b 100644
--- a/GPy/util/__init__.py
+++ b/GPy/util/__init__.py
@@ -17,3 +17,4 @@ from . import multioutput
 from . import parallel
 from . import functions
 from . import cluster_with_offset
+from . import threaded_cluster_with_offset
diff --git a/GPy/util/cluster_with_offset.py b/GPy/util/cluster_with_offset.py
index fa8f1be8..d1e896f1 100644
--- a/GPy/util/cluster_with_offset.py
+++ b/GPy/util/cluster_with_offset.py
@@ -3,6 +3,7 @@
 
 import GPy
 import numpy as np
+import sys #so I can print dots
 
 def get_log_likelihood(inputs,data,clust):
     """Get the LL of a combined set of clusters, ignoring time series offsets.
@@ -126,6 +127,9 @@ def cluster(data,inputs,verbose=False):
         
         #Compute the log-likelihood of each cluster (add them together)
         for clusti in range(len(active)):
+            if verbose:
+                sys.stdout.write('.')
+                sys.stdout.flush()
             if np.isnan(loglikes[clusti]):
                 loglikes[clusti], unused_offset = get_log_likelihood_offset(inputs,data,[clusti])
 
diff --git a/GPy/util/threaded_cluster_with_offset.py b/GPy/util/threaded_cluster_with_offset.py
new file mode 100644
index 00000000..66192c9f
--- /dev/null
+++ b/GPy/util/threaded_cluster_with_offset.py
@@ -0,0 +1,206 @@
+# Copyright (c) 2016, Mike Smith
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import GPy
+import numpy as np
+import time
+import sys #so I can print dots
+from threading import Thread
+maxthreads = 40
+
+    
+def get_log_likelihood(inputs,data,clust,loglikesarray):
+    """Get the LL of a combined set of clusters, ignoring time series offsets.
+    
+    Get the log likelihood of a cluster without worrying about the fact
+    different time series are offset. We're using it here really for those
+    cases in which we only have one cluster to get the loglikelihood of.
+    
+    arguments:
+    inputs -- the 'X's in a list, one item per cluster
+    data -- the 'Y's in a list, one item per cluster
+    clust -- list of clusters to use
+    
+    returns a tuple:
+    log likelihood and the offset (which is always zero for this model)
+    """
+ 
+    S = data[0].shape[0] #number of time series
+    
+    #build a new dataset from the clusters, by combining all clusters together
+    X = np.zeros([0,1])
+    Y = np.zeros([0,S])
+    
+    #for each person in the cluster,
+    #add their inputs and data to the new dataset
+    for p in clust:
+        X = np.vstack([X,inputs[p]])
+        Y = np.vstack([Y,data[p].T])
+        
+    #find the loglikelihood. We just add together the LL for each time series.
+    #ll=0
+    #for s in range(S):
+    #    m = GPy.models.GPRegression(X,Y[:,s][:,None])
+    #    m.optimize()
+    #    ll+=m.log_likelihood()
+
+    m = GPy.models.GPRegression(X,Y)
+    m.optimize()
+    ll=m.log_likelihood()  
+    
+    loglikesarray[clust[0]] = ll
+    return
+
+def get_log_likelihood_offset(inputs,data,clust,loglikesarray,offsetarray):
+    """Get the log likelihood of a combined set of clusters, fitting the offsets
+    
+    arguments:
+    inputs -- the 'X's in a list, one item per cluster
+    data -- the 'Y's in a list, one item per cluster
+    clust -- list of clusters to use
+    
+    returns a tuple:
+    log likelihood and the offset
+    """    
+        
+    assert len(clust)>1, "Use get_log_likelihood if you only have one cluster"
+        
+      
+                
+    S = data[0].shape[0] #number of time series
+        
+    X = np.zeros([0,2]) #notice the extra column, this is for the cluster index
+    Y = np.zeros([0,S])
+    
+    #for each person in the cluster, add their inputs and data to the new
+    #dataset. Note we add an index identifying which person is which data point.
+    #This is for the offset model to use, to allow it to know which data points
+    #to shift.
+    for i,p in enumerate(clust):
+        idx = i*np.ones([inputs[p].shape[0],1])
+        X = np.vstack([X,np.hstack([inputs[p],idx])])
+        Y = np.vstack([Y,data[p].T])
+        
+    m = GPy.models.GPOffsetRegression(X,Y)
+    #TODO: How to select a sensible prior?
+    m.offset.set_prior(GPy.priors.Gaussian(0,20)) 
+    #TODO: Set a sensible start value for the length scale,
+    #make it long to help the offset fit.
+    
+    m.optimize()
+    
+    ll = m.log_likelihood()
+    offset = m.offset.values[0]
+    #return ll,offset
+    loglikesarray[clust[0],clust[1]] = ll
+    offsetarray[clust[0],clust[1]] = offset
+    return
+
+def cluster(data,inputs,verbose=False):
+    """Clusters data
+    
+    Using the new offset model, this method uses a greedy algorithm to cluster
+    the data. It starts with all the data points in separate clusters and tests
+    whether combining them increases the overall log-likelihood (LL). It then
+    iteratively joins pairs of clusters which cause the greatest increase in
+    the LL, until no join increases the LL.
+    
+    arguments:
+    inputs -- the 'X's in a list, one item per cluster
+    data -- the 'Y's in a list, one item per cluster
+    
+    returns a list of the clusters.    
+    """
+    N=len(data)
+    
+    
+    #Define a set of N active cluster
+    active = []
+    for p in range(0,N):
+        active.append([p])
+
+    loglikes = np.zeros(len(active))
+    loglikes[:] = None
+
+    pairloglikes = np.zeros([len(active),len(active)])
+    pairloglikes[:] = None
+    pairoffset = np.zeros([len(active),len(active)])
+
+    it = 0
+    while True:
+    
+        if verbose:
+            it +=1
+            print("Iteration %d" % it)
+        
+        threads = []
+        #Compute the log-likelihood of each cluster (add them together)
+        for clusti in range(len(active)):
+            if verbose:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+            if np.isnan(loglikes[clusti]):
+                t = Thread(target=get_log_likelihood,args=(inputs,data,[clusti],loglikes))
+                threads.append(t)
+                #loglikes[clusti], unused_offset = get_log_likelihood_offset(inputs,data,[clusti])
+
+            #try combining with each other cluster...
+            for clustj in range(clusti): #count from 0 to clustj-1
+                temp = [clusti,clustj]
+                if np.isnan(pairloglikes[clusti,clustj]):
+                    #pairloglikes[clusti,clustj],pairoffset[clusti,clustj] = get_log_likelihood_offset(inputs,data,temp)
+                    t = Thread(target=get_log_likelihood_offset,args=(inputs,data,temp,pairloglikes,pairoffset))
+                    threads.append(t)
+   
+        if len(threads)<=maxthreads:
+            for t in threads:
+                t.start()
+            for t in threads:
+                t.join()
+
+        else: #should use a queue, as the method here assumes all threads take about same time, etc.
+            for i,t in enumerate(threads):
+                t.start()
+                if (i>=maxthreads):
+                    threads[i-maxthreads].join()
+            for i in range(len(threads)-maxthreads,len(threads)):
+                threads[i].join()
+
+        seploglikes = np.repeat(loglikes[:,None].T,len(loglikes),0)+np.repeat(loglikes[:,None],len(loglikes),1)
+        loglikeimprovement = pairloglikes - seploglikes #how much likelihood improves with clustering
+        top = np.unravel_index(np.nanargmax(pairloglikes-seploglikes), pairloglikes.shape)
+
+        #if loglikeimprovement.shape[0]<3:
+        # #no more clustering to do - this shouldn't happen really unless
+        # #we've set the threshold to apply clustering to less than 0
+        #    break 
+        
+        #if theres further clustering to be done...
+        if loglikeimprovement[top[0],top[1]]>0:
+            active[top[0]].extend(active[top[1]])
+            offset=pairoffset[top[0],top[1]]
+            inputs[top[0]] = np.vstack([inputs[top[0]],inputs[top[1]]-offset])
+            data[top[0]] = np.hstack([data[top[0]],data[top[1]]])
+            del inputs[top[1]]
+            del data[top[1]]
+            del active[top[1]]
+
+            #None = message to say we need to recalculate
+            pairloglikes[:,top[0]] = None 
+            pairloglikes[top[0],:] = None 
+            pairloglikes = np.delete(pairloglikes,top[1],0)
+            pairloglikes = np.delete(pairloglikes,top[1],1)
+            loglikes[top[0]] = None
+            loglikes = np.delete(loglikes,top[1])
+        else:
+            break
+            
+        #if loglikeimprovement[top[0],top[1]]>0:
+        #    print "joined"
+        #    print top
+        #    print offset
+        #    print offsets
+        #    print offsets[top[1]]-offsets[top[0]]
+
+    #TODO Add a way to return the offsets applied to all the time series
+    return active 

From 6f4d03178ebdc1967d5aeefbb1ca7ca5ab3923b3 Mon Sep 17 00:00:00 2001
From: Michael T Smith <lionfishy@gmail.com>
Date: Thu, 4 Aug 2016 14:47:08 +0100
Subject: [PATCH 35/53] Don't use message added to cluster code

---
 GPy/util/threaded_cluster_with_offset.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/GPy/util/threaded_cluster_with_offset.py b/GPy/util/threaded_cluster_with_offset.py
index 66192c9f..6818f5ec 100644
--- a/GPy/util/threaded_cluster_with_offset.py
+++ b/GPy/util/threaded_cluster_with_offset.py
@@ -1,6 +1,9 @@
 # Copyright (c) 2016, Mike Smith
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
+# Not recommended for use at the moment - the threading here doesn't
+# work due to the global interpreter lock.
+
 import GPy
 import numpy as np
 import time

From 81dc680a8acb72b46f16bc12d599f847680a3a98 Mon Sep 17 00:00:00 2001
From: Michael T Smith <lionfishy@gmail.com>
Date: Tue, 9 Aug 2016 13:55:37 +0100
Subject: [PATCH 36/53] Push just to rerun testing

---
 GPy/models/gp_offset_regression.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/GPy/models/gp_offset_regression.py b/GPy/models/gp_offset_regression.py
index 2fd264f4..b843e08d 100644
--- a/GPy/models/gp_offset_regression.py
+++ b/GPy/models/gp_offset_regression.py
@@ -49,8 +49,7 @@ class GPOffsetRegression(GP):
         
     #def dL_doffset(self, X, sel):
     #    dL_dr = self.dK_dr_via_X(X, X) * dL_dK
-        
-        
+                
         
     def dr_doffset(self,X,sel,delta):
         #given an input matrix, X and the offsets (delta)

From bd758ac849eff3f13f58c61b7a41f0fc1f1a395f Mon Sep 17 00:00:00 2001
From: Michael T Smith <lionfishy@gmail.com>
Date: Tue, 9 Aug 2016 14:54:29 +0100
Subject: [PATCH 37/53] Removing 'threaded' version

---
 GPy/util/__init__.py                     |   1 -
 GPy/util/threaded_cluster_with_offset.py | 209 -----------------------
 2 files changed, 210 deletions(-)
 delete mode 100644 GPy/util/threaded_cluster_with_offset.py

diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py
index 136e0d3b..685551fd 100644
--- a/GPy/util/__init__.py
+++ b/GPy/util/__init__.py
@@ -17,4 +17,3 @@ from . import multioutput
 from . import parallel
 from . import functions
 from . import cluster_with_offset
-from . import threaded_cluster_with_offset
diff --git a/GPy/util/threaded_cluster_with_offset.py b/GPy/util/threaded_cluster_with_offset.py
deleted file mode 100644
index 6818f5ec..00000000
--- a/GPy/util/threaded_cluster_with_offset.py
+++ /dev/null
@@ -1,209 +0,0 @@
-# Copyright (c) 2016, Mike Smith
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-# Not recommended for use at the moment - the threading here doesn't
-# work due to the global interpreter lock.
-
-import GPy
-import numpy as np
-import time
-import sys #so I can print dots
-from threading import Thread
-maxthreads = 40
-
-    
-def get_log_likelihood(inputs,data,clust,loglikesarray):
-    """Get the LL of a combined set of clusters, ignoring time series offsets.
-    
-    Get the log likelihood of a cluster without worrying about the fact
-    different time series are offset. We're using it here really for those
-    cases in which we only have one cluster to get the loglikelihood of.
-    
-    arguments:
-    inputs -- the 'X's in a list, one item per cluster
-    data -- the 'Y's in a list, one item per cluster
-    clust -- list of clusters to use
-    
-    returns a tuple:
-    log likelihood and the offset (which is always zero for this model)
-    """
- 
-    S = data[0].shape[0] #number of time series
-    
-    #build a new dataset from the clusters, by combining all clusters together
-    X = np.zeros([0,1])
-    Y = np.zeros([0,S])
-    
-    #for each person in the cluster,
-    #add their inputs and data to the new dataset
-    for p in clust:
-        X = np.vstack([X,inputs[p]])
-        Y = np.vstack([Y,data[p].T])
-        
-    #find the loglikelihood. We just add together the LL for each time series.
-    #ll=0
-    #for s in range(S):
-    #    m = GPy.models.GPRegression(X,Y[:,s][:,None])
-    #    m.optimize()
-    #    ll+=m.log_likelihood()
-
-    m = GPy.models.GPRegression(X,Y)
-    m.optimize()
-    ll=m.log_likelihood()  
-    
-    loglikesarray[clust[0]] = ll
-    return
-
-def get_log_likelihood_offset(inputs,data,clust,loglikesarray,offsetarray):
-    """Get the log likelihood of a combined set of clusters, fitting the offsets
-    
-    arguments:
-    inputs -- the 'X's in a list, one item per cluster
-    data -- the 'Y's in a list, one item per cluster
-    clust -- list of clusters to use
-    
-    returns a tuple:
-    log likelihood and the offset
-    """    
-        
-    assert len(clust)>1, "Use get_log_likelihood if you only have one cluster"
-        
-      
-                
-    S = data[0].shape[0] #number of time series
-        
-    X = np.zeros([0,2]) #notice the extra column, this is for the cluster index
-    Y = np.zeros([0,S])
-    
-    #for each person in the cluster, add their inputs and data to the new
-    #dataset. Note we add an index identifying which person is which data point.
-    #This is for the offset model to use, to allow it to know which data points
-    #to shift.
-    for i,p in enumerate(clust):
-        idx = i*np.ones([inputs[p].shape[0],1])
-        X = np.vstack([X,np.hstack([inputs[p],idx])])
-        Y = np.vstack([Y,data[p].T])
-        
-    m = GPy.models.GPOffsetRegression(X,Y)
-    #TODO: How to select a sensible prior?
-    m.offset.set_prior(GPy.priors.Gaussian(0,20)) 
-    #TODO: Set a sensible start value for the length scale,
-    #make it long to help the offset fit.
-    
-    m.optimize()
-    
-    ll = m.log_likelihood()
-    offset = m.offset.values[0]
-    #return ll,offset
-    loglikesarray[clust[0],clust[1]] = ll
-    offsetarray[clust[0],clust[1]] = offset
-    return
-
-def cluster(data,inputs,verbose=False):
-    """Clusters data
-    
-    Using the new offset model, this method uses a greedy algorithm to cluster
-    the data. It starts with all the data points in separate clusters and tests
-    whether combining them increases the overall log-likelihood (LL). It then
-    iteratively joins pairs of clusters which cause the greatest increase in
-    the LL, until no join increases the LL.
-    
-    arguments:
-    inputs -- the 'X's in a list, one item per cluster
-    data -- the 'Y's in a list, one item per cluster
-    
-    returns a list of the clusters.    
-    """
-    N=len(data)
-    
-    
-    #Define a set of N active cluster
-    active = []
-    for p in range(0,N):
-        active.append([p])
-
-    loglikes = np.zeros(len(active))
-    loglikes[:] = None
-
-    pairloglikes = np.zeros([len(active),len(active)])
-    pairloglikes[:] = None
-    pairoffset = np.zeros([len(active),len(active)])
-
-    it = 0
-    while True:
-    
-        if verbose:
-            it +=1
-            print("Iteration %d" % it)
-        
-        threads = []
-        #Compute the log-likelihood of each cluster (add them together)
-        for clusti in range(len(active)):
-            if verbose:
-                sys.stdout.write('.')
-                sys.stdout.flush()
-            if np.isnan(loglikes[clusti]):
-                t = Thread(target=get_log_likelihood,args=(inputs,data,[clusti],loglikes))
-                threads.append(t)
-                #loglikes[clusti], unused_offset = get_log_likelihood_offset(inputs,data,[clusti])
-
-            #try combining with each other cluster...
-            for clustj in range(clusti): #count from 0 to clustj-1
-                temp = [clusti,clustj]
-                if np.isnan(pairloglikes[clusti,clustj]):
-                    #pairloglikes[clusti,clustj],pairoffset[clusti,clustj] = get_log_likelihood_offset(inputs,data,temp)
-                    t = Thread(target=get_log_likelihood_offset,args=(inputs,data,temp,pairloglikes,pairoffset))
-                    threads.append(t)
-   
-        if len(threads)<=maxthreads:
-            for t in threads:
-                t.start()
-            for t in threads:
-                t.join()
-
-        else: #should use a queue, as the method here assumes all threads take about same time, etc.
-            for i,t in enumerate(threads):
-                t.start()
-                if (i>=maxthreads):
-                    threads[i-maxthreads].join()
-            for i in range(len(threads)-maxthreads,len(threads)):
-                threads[i].join()
-
-        seploglikes = np.repeat(loglikes[:,None].T,len(loglikes),0)+np.repeat(loglikes[:,None],len(loglikes),1)
-        loglikeimprovement = pairloglikes - seploglikes #how much likelihood improves with clustering
-        top = np.unravel_index(np.nanargmax(pairloglikes-seploglikes), pairloglikes.shape)
-
-        #if loglikeimprovement.shape[0]<3:
-        # #no more clustering to do - this shouldn't happen really unless
-        # #we've set the threshold to apply clustering to less than 0
-        #    break 
-        
-        #if theres further clustering to be done...
-        if loglikeimprovement[top[0],top[1]]>0:
-            active[top[0]].extend(active[top[1]])
-            offset=pairoffset[top[0],top[1]]
-            inputs[top[0]] = np.vstack([inputs[top[0]],inputs[top[1]]-offset])
-            data[top[0]] = np.hstack([data[top[0]],data[top[1]]])
-            del inputs[top[1]]
-            del data[top[1]]
-            del active[top[1]]
-
-            #None = message to say we need to recalculate
-            pairloglikes[:,top[0]] = None 
-            pairloglikes[top[0],:] = None 
-            pairloglikes = np.delete(pairloglikes,top[1],0)
-            pairloglikes = np.delete(pairloglikes,top[1],1)
-            loglikes[top[0]] = None
-            loglikes = np.delete(loglikes,top[1])
-        else:
-            break
-            
-        #if loglikeimprovement[top[0],top[1]]>0:
-        #    print "joined"
-        #    print top
-        #    print offset
-        #    print offsets
-        #    print offsets[top[1]]-offsets[top[0]]
-
-    #TODO Add a way to return the offsets applied to all the time series
-    return active 

From b8867f1552c05244dcd5ba38a7a57b6f1056312c Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Tue, 16 Aug 2016 11:12:52 +0100
Subject: [PATCH 38/53] [kern] Add kernel was swallowing parts #fix #412

---
 GPy/kern/src/add.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/GPy/kern/src/add.py b/GPy/kern/src/add.py
index 99fe809b..ca20e5a9 100644
--- a/GPy/kern/src/add.py
+++ b/GPy/kern/src/add.py
@@ -13,15 +13,21 @@ class Add(CombinationKernel):
     propagates gradients through.
 
     This kernel will take over the active dims of it's subkernels passed in.
+    
+    NOTE: The subkernels will be copies of the original kernels, to prevent 
+    unexpected behavior. 
     """
     def __init__(self, subkerns, name='sum'):
-        for i, kern in enumerate(subkerns[:]):
+        _newkerns = []
+        for kern in subkerns:
             if isinstance(kern, Add):
-                del subkerns[i]
-                for part in kern.parts[::-1]:
+                for part in kern.parts:
                     #kern.unlink_parameter(part)
-                    subkerns.insert(i, part.copy())
-        super(Add, self).__init__(subkerns, name)
+                    _newkerns.append(part.copy())
+            else:
+                _newkerns.append(kern.copy())
+                    
+        super(Add, self).__init__(_newkerns, name)
         self._exact_psicomp = self._check_exact_psicomp()
 
     def _check_exact_psicomp(self):

From 0abb9b835ffeb020410bdf9a1e0532139ffa5cfc Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Tue, 16 Aug 2016 11:13:32 +0100
Subject: [PATCH 39/53] Revert "[kern] Add kernel was swallowing parts #fix
 #412"

This reverts commit b8867f1552c05244dcd5ba38a7a57b6f1056312c.
---
 GPy/kern/src/add.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/GPy/kern/src/add.py b/GPy/kern/src/add.py
index ca20e5a9..99fe809b 100644
--- a/GPy/kern/src/add.py
+++ b/GPy/kern/src/add.py
@@ -13,21 +13,15 @@ class Add(CombinationKernel):
     propagates gradients through.
 
     This kernel will take over the active dims of it's subkernels passed in.
-    
-    NOTE: The subkernels will be copies of the original kernels, to prevent 
-    unexpected behavior. 
     """
     def __init__(self, subkerns, name='sum'):
-        _newkerns = []
-        for kern in subkerns:
+        for i, kern in enumerate(subkerns[:]):
             if isinstance(kern, Add):
-                for part in kern.parts:
+                del subkerns[i]
+                for part in kern.parts[::-1]:
                     #kern.unlink_parameter(part)
-                    _newkerns.append(part.copy())
-            else:
-                _newkerns.append(kern.copy())
-                    
-        super(Add, self).__init__(_newkerns, name)
+                    subkerns.insert(i, part.copy())
+        super(Add, self).__init__(subkerns, name)
         self._exact_psicomp = self._check_exact_psicomp()
 
     def _check_exact_psicomp(self):

From 27d49bbe015435b77b791384407c4dd012491ee8 Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Tue, 16 Aug 2016 11:14:54 +0100
Subject: [PATCH 40/53] Revert "Revert "[kern] Add kernel was swallowing parts
 #fix #412""

This reverts commit 0abb9b835ffeb020410bdf9a1e0532139ffa5cfc.
---
 GPy/kern/src/add.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/GPy/kern/src/add.py b/GPy/kern/src/add.py
index 99fe809b..ca20e5a9 100644
--- a/GPy/kern/src/add.py
+++ b/GPy/kern/src/add.py
@@ -13,15 +13,21 @@ class Add(CombinationKernel):
     propagates gradients through.
 
     This kernel will take over the active dims of it's subkernels passed in.
+    
+    NOTE: The subkernels will be copies of the original kernels, to prevent 
+    unexpected behavior. 
     """
     def __init__(self, subkerns, name='sum'):
-        for i, kern in enumerate(subkerns[:]):
+        _newkerns = []
+        for kern in subkerns:
             if isinstance(kern, Add):
-                del subkerns[i]
-                for part in kern.parts[::-1]:
+                for part in kern.parts:
                     #kern.unlink_parameter(part)
-                    subkerns.insert(i, part.copy())
-        super(Add, self).__init__(subkerns, name)
+                    _newkerns.append(part.copy())
+            else:
+                _newkerns.append(kern.copy())
+                    
+        super(Add, self).__init__(_newkerns, name)
         self._exact_psicomp = self._check_exact_psicomp()
 
     def _check_exact_psicomp(self):

From 7135527fd00f0a385b65bfb32f9535c3859d1f8a Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Tue, 16 Aug 2016 11:17:15 +0100
Subject: [PATCH 41/53] [Add] add kernel swallowed parts fix #412

---
 GPy/kern/src/add.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/GPy/kern/src/add.py b/GPy/kern/src/add.py
index 99fe809b..ca20e5a9 100644
--- a/GPy/kern/src/add.py
+++ b/GPy/kern/src/add.py
@@ -13,15 +13,21 @@ class Add(CombinationKernel):
     propagates gradients through.
 
     This kernel will take over the active dims of it's subkernels passed in.
+    
+    NOTE: The subkernels will be copies of the original kernels, to prevent 
+    unexpected behavior. 
     """
     def __init__(self, subkerns, name='sum'):
-        for i, kern in enumerate(subkerns[:]):
+        _newkerns = []
+        for kern in subkerns:
             if isinstance(kern, Add):
-                del subkerns[i]
-                for part in kern.parts[::-1]:
+                for part in kern.parts:
                     #kern.unlink_parameter(part)
-                    subkerns.insert(i, part.copy())
-        super(Add, self).__init__(subkerns, name)
+                    _newkerns.append(part.copy())
+            else:
+                _newkerns.append(kern.copy())
+                    
+        super(Add, self).__init__(_newkerns, name)
         self._exact_psicomp = self._check_exact_psicomp()
 
     def _check_exact_psicomp(self):

From 9aa19b662d7d2f4ca52df428b4b060a1d12425d5 Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Tue, 16 Aug 2016 11:19:04 +0100
Subject: [PATCH 42/53] [readme] added deploy status to readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e0e715a9..513f2704 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ The Gaussian processes framework in Python.
 * Travis-CI [unit-tests](https://travis-ci.org/SheffieldML/GPy)
 * [![licence](https://img.shields.io/badge/licence-BSD-blue.svg)](http://opensource.org/licenses/BSD-3-Clause)
 
-[![develstat](https://travis-ci.org/SheffieldML/GPy.svg?branch=devel)](https://travis-ci.org/SheffieldML/GPy) [![appveyor](https://ci.appveyor.com/api/projects/status/662o6tha09m2jix3/branch/deploy?svg=true)](https://ci.appveyor.com/project/mzwiessele/gpy/branch/deploy) [![coverallsdevel](https://coveralls.io/repos/github/SheffieldML/GPy/badge.svg?branch=devel)](https://coveralls.io/github/SheffieldML/GPy?branch=devel) [![covdevel](http://codecov.io/github/SheffieldML/GPy/coverage.svg?branch=devel)](http://codecov.io/github/SheffieldML/GPy?branch=devel) [![Research software impact](http://depsy.org/api/package/pypi/GPy/badge.svg)](http://depsy.org/package/python/GPy) [![Code Health](https://landscape.io/github/SheffieldML/GPy/devel/landscape.svg?style=flat)](https://landscape.io/github/SheffieldML/GPy/devel)
+[![deploystat](https://travis-ci.org/SheffieldML/GPy.svg?branch=deploy)](https://travis-ci.org/SheffieldML/GPy) [![appveyor](https://ci.appveyor.com/api/projects/status/662o6tha09m2jix3/branch/deploy?svg=true)](https://ci.appveyor.com/project/mzwiessele/gpy/branch/deploy) [![coverallsdevel](https://coveralls.io/repos/github/SheffieldML/GPy/badge.svg?branch=devel)](https://coveralls.io/github/SheffieldML/GPy?branch=devel) [![covdevel](http://codecov.io/github/SheffieldML/GPy/coverage.svg?branch=devel)](http://codecov.io/github/SheffieldML/GPy?branch=devel) [![Research software impact](http://depsy.org/api/package/pypi/GPy/badge.svg)](http://depsy.org/package/python/GPy) [![Code Health](https://landscape.io/github/SheffieldML/GPy/devel/landscape.svg?style=flat)](https://landscape.io/github/SheffieldML/GPy/devel)
 
 ## Updated Structure
 

From b6847366120ef1b4ad7232c8fb80ed26ad448802 Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Tue, 16 Aug 2016 12:01:17 +0100
Subject: [PATCH 43/53] [plotting] small edit

---
 GPy/plotting/abstract_plotting_library.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/GPy/plotting/abstract_plotting_library.py b/GPy/plotting/abstract_plotting_library.py
index 64061b99..95377ed7 100644
--- a/GPy/plotting/abstract_plotting_library.py
+++ b/GPy/plotting/abstract_plotting_library.py
@@ -61,6 +61,8 @@ class AbstractPlottingLibrary(object):
         """
         Get a new figure with nrows and ncolumns subplots.
         Does not initialize the canvases yet.
+        
+        There is individual kwargs for the individual plotting libraries to use.
         """
         raise NotImplementedError("Implement all plot functions in AbstractPlottingLibrary in order to use your own plotting library")
     

From 171dc446c420f7590118e94b465d4e7adfd7af59 Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Tue, 16 Aug 2016 12:07:17 +0100
Subject: [PATCH 44/53] [appveyor] warning on existing whls

---
 appveyor.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/appveyor.yml b/appveyor.yml
index f7f7b7ab..fb35b745 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -71,10 +71,10 @@ deploy_script:
 - echo password:%pip_access% >> %USERPROFILE%\\.pypirc
 - ps: >-
     if ($env:APPVEYOR_REPO_BRANCH -eq 'devel') { 
-        twine upload -r test dist/*
+        twine upload --exists warning -r test dist/*
     }
     elseif ($env:APPVEYOR_REPO_BRANCH -eq 'deploy') {  
-        twine upload dist/*
+        twine upload --exists warning dist/*
     }
     else {
         echo not deploying on other branches

From 8d348ec7c664f605b0ccc3f397b08d23f97dbac2 Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Tue, 16 Aug 2016 13:41:33 +0100
Subject: [PATCH 45/53] [priorizable] small edit to be usable with paramz 0.6.1
 and greater

---
 GPy/core/parameterization/priorizable.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/core/parameterization/priorizable.py b/GPy/core/parameterization/priorizable.py
index 1d1153c7..eb82e862 100644
--- a/GPy/core/parameterization/priorizable.py
+++ b/GPy/core/parameterization/priorizable.py
@@ -16,7 +16,7 @@ class Priorizable(Parameterizable):
 
     def __setstate__(self, state):
         super(Priorizable, self).__setstate__(state)
-        self._index_operations['priors'] = self.priors
+        #self._index_operations['priors'] = self.priors
 
 
     #===========================================================================

From feed4c6b8d075bc47ee29d69a2f01dc6bda97abf Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Tue, 16 Aug 2016 13:47:12 +0100
Subject: [PATCH 46/53] [priorizable] small edit to be usable with paramz 0.6.1
 and greater

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index c8d20add..4cb71d90 100644
--- a/setup.py
+++ b/setup.py
@@ -148,7 +148,7 @@ setup(name = 'GPy',
       include_package_data = True,
       py_modules = ['GPy.__init__'],
       test_suite = 'GPy.testing',
-      install_requires = ['numpy>=1.7', 'scipy>=0.16', 'six', 'paramz>=0.5.2'],
+      install_requires = ['numpy>=1.7', 'scipy>=0.16', 'six', 'paramz>=0.6.1'],
       extras_require = {'docs':['sphinx'],
                         'optional':['mpi4py',
                                     'ipython>=4.0.0',

From a70c6c9c7230334bb7ef3051d1f7107c16cc1010 Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Tue, 16 Aug 2016 13:48:39 +0100
Subject: [PATCH 47/53] =?UTF-8?q?Bump=20version:=201.2.1=20=E2=86=92=201.3?=
 =?UTF-8?q?.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 GPy/__version__.py | 2 +-
 appveyor.yml       | 2 +-
 setup.cfg          | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/GPy/__version__.py b/GPy/__version__.py
index a955fdae..67bc602a 100644
--- a/GPy/__version__.py
+++ b/GPy/__version__.py
@@ -1 +1 @@
-__version__ = "1.2.1"
+__version__ = "1.3.0"
diff --git a/appveyor.yml b/appveyor.yml
index fb35b745..4b7bcde4 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -3,7 +3,7 @@ environment:
     secure: 8/ZjXFwtd1S7ixd7PJOpptupKKEDhm2da/q3unabJ00=
   COVERALLS_REPO_TOKEN:
     secure: d3Luic/ESkGaWnZrvWZTKrzO+xaVwJWaRCEP0F+K/9DQGPSRZsJ/Du5g3s4XF+tS
-  gpy_version: 1.2.1
+  gpy_version: 1.3.0
   matrix:
     - PYTHON_VERSION: 2.7
       MINICONDA: C:\Miniconda-x64
diff --git a/setup.cfg b/setup.cfg
index 57fddfaf..f4fba600 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.2.1
+current_version = 1.3.0
 tag = False
 commit = True
 

From 1a37a06e5d569a9ae3a1027f3cd6675b3f6058a8 Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Tue, 16 Aug 2016 13:55:35 +0100
Subject: [PATCH 48/53] [appveyor] skip existing twine upload

---
 appveyor.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/appveyor.yml b/appveyor.yml
index 4b7bcde4..49801d75 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -71,10 +71,10 @@ deploy_script:
 - echo password:%pip_access% >> %USERPROFILE%\\.pypirc
 - ps: >-
     if ($env:APPVEYOR_REPO_BRANCH -eq 'devel') { 
-        twine upload --exists warning -r test dist/*
+        twine upload --skip-existing -r test dist/*
     }
     elseif ($env:APPVEYOR_REPO_BRANCH -eq 'deploy') {  
-        twine upload --exists warning dist/*
+        twine upload --skip-existing dist/*
     }
     else {
         echo not deploying on other branches

From cb6e22de130d5c424d8f39018f074f5c2e7799ac Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Tue, 16 Aug 2016 13:55:42 +0100
Subject: [PATCH 49/53] =?UTF-8?q?Bump=20version:=201.3.0=20=E2=86=92=201.3?=
 =?UTF-8?q?.1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 GPy/__version__.py | 2 +-
 appveyor.yml       | 2 +-
 setup.cfg          | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/GPy/__version__.py b/GPy/__version__.py
index 67bc602a..9c73af26 100644
--- a/GPy/__version__.py
+++ b/GPy/__version__.py
@@ -1 +1 @@
-__version__ = "1.3.0"
+__version__ = "1.3.1"
diff --git a/appveyor.yml b/appveyor.yml
index 49801d75..6a7e5635 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -3,7 +3,7 @@ environment:
     secure: 8/ZjXFwtd1S7ixd7PJOpptupKKEDhm2da/q3unabJ00=
   COVERALLS_REPO_TOKEN:
     secure: d3Luic/ESkGaWnZrvWZTKrzO+xaVwJWaRCEP0F+K/9DQGPSRZsJ/Du5g3s4XF+tS
-  gpy_version: 1.3.0
+  gpy_version: 1.3.1
   matrix:
     - PYTHON_VERSION: 2.7
       MINICONDA: C:\Miniconda-x64
diff --git a/setup.cfg b/setup.cfg
index f4fba600..a79f77a3 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.3.0
+current_version = 1.3.1
 tag = False
 commit = True
 

From e529c5a7e0a37de96ac1523220c23613b815ce6f Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Tue, 16 Aug 2016 15:08:41 +0100
Subject: [PATCH 50/53] [paramz] update included

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 4cb71d90..1cf2321d 100644
--- a/setup.py
+++ b/setup.py
@@ -148,7 +148,7 @@ setup(name = 'GPy',
       include_package_data = True,
       py_modules = ['GPy.__init__'],
       test_suite = 'GPy.testing',
-      install_requires = ['numpy>=1.7', 'scipy>=0.16', 'six', 'paramz>=0.6.1'],
+      install_requires = ['numpy>=1.7', 'scipy>=0.16', 'six', 'paramz>=0.6.6'],
       extras_require = {'docs':['sphinx'],
                         'optional':['mpi4py',
                                     'ipython>=4.0.0',

From f50b691ec66aaed21d5fc9940a5c8197cf7391ea Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Tue, 16 Aug 2016 15:08:45 +0100
Subject: [PATCH 51/53] =?UTF-8?q?Bump=20version:=201.3.1=20=E2=86=92=201.3?=
 =?UTF-8?q?.2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 GPy/__version__.py | 2 +-
 appveyor.yml       | 2 +-
 setup.cfg          | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/GPy/__version__.py b/GPy/__version__.py
index 9c73af26..f708a9b2 100644
--- a/GPy/__version__.py
+++ b/GPy/__version__.py
@@ -1 +1 @@
-__version__ = "1.3.1"
+__version__ = "1.3.2"
diff --git a/appveyor.yml b/appveyor.yml
index 6a7e5635..1f7a7dd9 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -3,7 +3,7 @@ environment:
     secure: 8/ZjXFwtd1S7ixd7PJOpptupKKEDhm2da/q3unabJ00=
   COVERALLS_REPO_TOKEN:
     secure: d3Luic/ESkGaWnZrvWZTKrzO+xaVwJWaRCEP0F+K/9DQGPSRZsJ/Du5g3s4XF+tS
-  gpy_version: 1.3.1
+  gpy_version: 1.3.2
   matrix:
     - PYTHON_VERSION: 2.7
       MINICONDA: C:\Miniconda-x64
diff --git a/setup.cfg b/setup.cfg
index a79f77a3..6651958c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.3.1
+current_version = 1.3.2
 tag = False
 commit = True
 

From d343ec8b41d43c4edc94f7bfbd5defeed0da2586 Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Wed, 17 Aug 2016 14:51:29 +0100
Subject: [PATCH 52/53] [warped stuff] plotting and normalizer in warped gps

---
 GPy/__init__.py                    |  2 +
 GPy/core/gp.py                     | 30 ++++++++------
 GPy/kern/src/static.py             |  8 +---
 GPy/models/warped_gp.py            | 63 ++++++++++++++----------------
 GPy/plotting/gpy_plot/plot_util.py |  9 ++---
 GPy/testing/model_tests.py         | 18 ++++++++-
 GPy/util/normalizer.py             | 42 +++++++++++++++++---
 GPy/util/warping_functions.py      | 18 ++-------
 8 files changed, 112 insertions(+), 78 deletions(-)

diff --git a/GPy/__init__.py b/GPy/__init__.py
index 1dc3e6b1..9ee3d935 100644
--- a/GPy/__init__.py
+++ b/GPy/__init__.py
@@ -14,6 +14,8 @@ from . import testing
 from . import kern
 from . import plotting
 
+from .util import normalizer
+
 # backwards compatibility
 import sys
 backwards_compatibility = ['lists_and_dicts', 'observable_array', 'ties_and_remappings', 'index_operations']
diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 9cbe3daf..f0910d9d 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -2,17 +2,17 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
-from .. import kern
-from GPy.core.model import Model
-from paramz import ObsAr
+from .model import Model
+from .parameterization.variational import VariationalPosterior
 from .mapping import Mapping
 from .. import likelihoods
+from .. import kern
 from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
-from GPy.core.parameterization.variational import VariationalPosterior
+from ..util.normalizer import Standardize
+from paramz import ObsAr
 
 import logging
 import warnings
-from GPy.util.normalizer import MeanNorm
 logger = logging.getLogger("GP")
 
 class GP(Model):
@@ -28,7 +28,7 @@ class GP(Model):
     :param Norm normalizer:
         normalize the outputs Y.
         Prediction will be un-normalized using this normalizer.
-        If normalizer is None, we will normalize using MeanNorm.
+        If normalizer is None, we will normalize using Standardize.
         If normalizer is False, no normalization will be done.
 
     .. Note:: Multiple independent outputs are allowed using columns of Y
@@ -49,7 +49,7 @@ class GP(Model):
         logger.info("initializing Y")
 
         if normalizer is True:
-            self.normalizer = MeanNorm()
+            self.normalizer = Standardize()
         elif normalizer is False:
             self.normalizer = None
         else:
@@ -64,6 +64,7 @@ class GP(Model):
             self.Y_normalized = self.Y
         else:
             self.Y = Y
+            self.Y_normalized = self.Y
 
         if Y.shape[0] != self.num_data:
             #There can be cases where we want inputs than outputs, for example if we have multiple latent
@@ -248,14 +249,16 @@ class GP(Model):
         """
         #predict the latent function values
         mu, var = self._raw_predict(Xnew, full_cov=full_cov, kern=kern)
-        if self.normalizer is not None:
-            mu, var = self.normalizer.inverse_mean(mu), self.normalizer.inverse_variance(var)
 
         if include_likelihood:
             # now push through likelihood
             if likelihood is None:
                 likelihood = self.likelihood
             mu, var = likelihood.predictive_values(mu, var, full_cov, Y_metadata=Y_metadata)
+
+        if self.normalizer is not None:
+            mu, var = self.normalizer.inverse_mean(mu), self.normalizer.inverse_variance(var)
+
         return mu, var
 
     def predict_noiseless(self,  Xnew, full_cov=False, Y_metadata=None, kern=None):
@@ -300,11 +303,14 @@ class GP(Model):
         :rtype: [np.ndarray (Xnew x self.output_dim), np.ndarray (Xnew x self.output_dim)]
         """
         m, v = self._raw_predict(X,  full_cov=False, kern=kern)
-        if self.normalizer is not None:
-            m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
         if likelihood is None:
             likelihood = self.likelihood
-        return likelihood.predictive_quantiles(m, v, quantiles, Y_metadata=Y_metadata)
+            
+        quantiles = likelihood.predictive_quantiles(m, v, quantiles, Y_metadata=Y_metadata)
+        
+        if self.normalizer is not None:
+            quantiles = [self.normalizer.inverse_mean(q) for q in quantiles]
+        return quantiles
 
     def predictive_gradients(self, Xnew, kern=None):
         """
diff --git a/GPy/kern/src/static.py b/GPy/kern/src/static.py
index 5cf4a1c9..0ffd8112 100644
--- a/GPy/kern/src/static.py
+++ b/GPy/kern/src/static.py
@@ -135,9 +135,7 @@ class Bias(Static):
 
     def K(self, X, X2=None):
         shape = (X.shape[0], X.shape[0] if X2 is None else X2.shape[0])
-        ret = np.empty(shape, dtype=np.float64)
-        ret[:] = self.variance
-        return ret
+        return np.full(shape, self.variance, dtype=np.float64)
 
     def update_gradients_full(self, dL_dK, X, X2=None):
         self.variance.gradient = dL_dK.sum()
@@ -146,9 +144,7 @@ class Bias(Static):
         self.variance.gradient = dL_dKdiag.sum()
 
     def psi2(self, Z, variational_posterior):
-        ret = np.empty((Z.shape[0], Z.shape[0]), dtype=np.float64)
-        ret[:] = self.variance*self.variance*variational_posterior.shape[0]
-        return ret
+        return np.full((Z.shape[0], Z.shape[0]), self.variance*self.variance*variational_posterior.shape[0], dtype=np.float64)
 
     def psi2n(self, Z, variational_posterior):
         ret = np.empty((variational_posterior.mean.shape[0], Z.shape[0], Z.shape[0]), dtype=np.float64)
diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index aa9e8c90..a24401fe 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -15,7 +15,7 @@ class WarpedGP(GP):
     This defines a GP Regression model that applies a 
     warping function to the output.
     """
-    def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3):
+    def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3, normalizer=False):
         if kernel is None:
             kernel = kern.RBF(X.shape[1])
         if warping_function == None:
@@ -23,33 +23,23 @@ class WarpedGP(GP):
             self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1) * 1)
         else:
             self.warping_function = warping_function
-        self.scale_data = False
-        if self.scale_data:
-            Y = self._scale_data(Y)
-        #self.has_uncertain_inputs = False
-        self.Y_untransformed = Y.copy()
-        self.predict_in_warped_space = True
         likelihood = likelihoods.Gaussian()
-        GP.__init__(self, X, self.transform_data(), likelihood=likelihood, kernel=kernel)
+        super(WarpedGP, self).__init__(X, Y.copy(), likelihood=likelihood, kernel=kernel, normalizer=normalizer)
+        self.Y_normalized = self.Y_normalized.copy()
+        self.Y_untransformed = self.Y_normalized.copy()
+        self.predict_in_warped_space = True
         self.link_parameter(self.warping_function)
 
     def set_XY(self, X=None, Y=None):
-        self.Y_untransformed = Y.copy()
-        GP.set_XY(self, X, self.transform_data())
-
-    def _scale_data(self, Y):
-        self._Ymax = Y.max()
-        self._Ymin = Y.min()
-        return (Y - self._Ymin) / (self._Ymax - self._Ymin) - 0.5
-
-    def _unscale_data(self, Y):
-        return (Y + 0.5) * (self._Ymax - self._Ymin) + self._Ymin
+        super(WarpedGP, self).set_XY(X, Y)
+        self.Y_untransformed = self.Y_normalized.copy()
+        self.update_model(True)
 
     def parameters_changed(self):
         """
         Notice that we update the warping function gradients here.
         """
-        self.Y[:] = self.transform_data()
+        self.Y_normalized[:] = self.transform_data()
         super(WarpedGP, self).parameters_changed()
         Kiy = self.posterior.woodbury_vector.flatten()
         self.warping_function.update_grads(self.Y_untransformed, Kiy)
@@ -96,7 +86,7 @@ class WarpedGP(GP):
                                      deg_gauss_hermite=deg_gauss_hermite)
         return arg1 - (arg2 ** 2)
 
-    def predict(self, Xnew, which_parts='all', pred_init=None, full_cov=False, Y_metadata=None,
+    def predict(self, Xnew, kern=None, pred_init=None, Y_metadata=None,
                 median=False, deg_gauss_hermite=20, likelihood=None):
         """
         Prediction results depend on:
@@ -104,9 +94,12 @@ class WarpedGP(GP):
         - The median flag passed as argument
         The likelihood keyword is never used, it is just to follow the plotting API.
         """
-        mu, var = GP._raw_predict(self, Xnew)
+        #mu, var = GP._raw_predict(self, Xnew)
         # now push through likelihood
-        mean, var = self.likelihood.predictive_values(mu, var)
+        #mean, var = self.likelihood.predictive_values(mu, var)
+        
+        mean, var = super(WarpedGP, self).predict(Xnew, kern=kern, full_cov=False, likelihood=likelihood)
+
 
         if self.predict_in_warped_space:
             std = np.sqrt(var)
@@ -120,11 +113,9 @@ class WarpedGP(GP):
         else:
             wmean = mean
             wvar = var
-        if self.scale_data:
-            pred = self._unscale_data(pred)
         return wmean, wvar
 
-    def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None, likelihood=None, median=False):
+    def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None, likelihood=None, kern=None):
         """
         Get the predictive quantiles around the prediction at X
 
@@ -135,15 +126,19 @@ class WarpedGP(GP):
         :returns: list of quantiles for each X and predictive quantiles for interval combination
         :rtype: [np.ndarray (Xnew x self.input_dim), np.ndarray (Xnew x self.input_dim)]
         """
-        m, v = self._raw_predict(X,  full_cov=False)
-        if self.normalizer is not None:
-            m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
-        a, b = self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata)
-        if not self.predict_in_warped_space:
-            return [a, b]
-        new_a = self.warping_function.f_inv(a)
-        new_b = self.warping_function.f_inv(b)
-        return [new_a, new_b]
+        qs = super(WarpedGP, self).predict_quantiles(X, quantiles, Y_metadata=Y_metadata, likelihood=likelihood, kern=kern)
+        if self.predict_in_warped_space:
+            return [self.warping_function.f_inv(q) for q in qs]
+        return qs
+        #m, v = self._raw_predict(X,  full_cov=False)
+        #if self.normalizer is not None:
+        #    m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
+        #a, b = self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata)
+        #if not self.predict_in_warped_space:
+        #    return [a, b]
+        #new_a = self.warping_function.f_inv(a)
+        #new_b = self.warping_function.f_inv(b)
+        #return [new_a, new_b]
 
     def log_predictive_density(self, x_test, y_test, Y_metadata=None):
         """
diff --git a/GPy/plotting/gpy_plot/plot_util.py b/GPy/plotting/gpy_plot/plot_util.py
index 937f35ab..7bd1723f 100644
--- a/GPy/plotting/gpy_plot/plot_util.py
+++ b/GPy/plotting/gpy_plot/plot_util.py
@@ -74,9 +74,6 @@ def helper_predict_with_model(self, Xgrid, plot_raw, apply_link, percentiles, wh
     if 'output_index' not in predict_kw['Y_metadata']:
         predict_kw['Y_metadata']['output_index'] = Xgrid[:,-1:].astype(np.int)
 
-    if isinstance(self, WarpedGP) and self.predict_in_warped_space:
-        predict_kw['median'] = True
-
     mu, _ = self.predict(Xgrid, **predict_kw)
 
     if percentiles is not None:
@@ -299,8 +296,10 @@ def get_x_y_var(model):
         Y = model.Y.values
     except AttributeError:
         Y = model.Y
-    if isinstance(model, WarpedGP) and model.predict_in_warped_space:
-        Y = model.Y_untransformed
+
+    if isinstance(model, WarpedGP) and not model.predict_in_warped_space:
+        Y = model.Y_normalized
+    
     if sparse.issparse(Y): Y = Y.todense().view(np.ndarray)
     return X, X_variance, Y
 
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 010befe4..1e0731ab 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -91,18 +91,32 @@ class MiscTests(unittest.TestCase):
         k = GPy.kern.RBF(1)
         m2 = GPy.models.GPRegression(self.X, (Y-mu)/std, kernel=k, normalizer=False)
         m2[:] = m[:]
+        
         mu1, var1 = m.predict(m.X, full_cov=True)
         mu2, var2 = m2.predict(m2.X, full_cov=True)
         np.testing.assert_allclose(mu1, (mu2*std)+mu)
-        np.testing.assert_allclose(var1, var2)
+        np.testing.assert_allclose(var1, var2*std**2)
+        
         mu1, var1 = m.predict(m.X, full_cov=False)
         mu2, var2 = m2.predict(m2.X, full_cov=False)
+        
         np.testing.assert_allclose(mu1, (mu2*std)+mu)
-        np.testing.assert_allclose(var1, var2)
+        np.testing.assert_allclose(var1, var2*std**2)
 
         q50n = m.predict_quantiles(m.X, (50,))
         q50 = m2.predict_quantiles(m2.X, (50,))
+        
         np.testing.assert_allclose(q50n[0], (q50[0]*std)+mu)
+        
+        # Test variance component:
+        qs = np.array([2.5, 97.5])
+        # The quantiles get computed before unormalization
+        # And transformed using the mean transformation:
+        c = np.random.choice(self.X.shape[0])
+        q95 = m2.predict_quantiles(self.X[[c]], qs)
+        mu, var = m2.predict(self.X[[c]])
+        from scipy.stats import norm
+        np.testing.assert_allclose((mu+(norm.ppf(qs/100.)*np.sqrt(var))).flatten(), np.array(q95).flatten())
 
     def check_jacobian(self):
         try:
diff --git a/GPy/util/normalizer.py b/GPy/util/normalizer.py
index 854726c4..78ad945d 100644
--- a/GPy/util/normalizer.py
+++ b/GPy/util/normalizer.py
@@ -6,7 +6,7 @@ Created on Aug 27, 2014
 import logging
 import numpy as np
 
-class Norm(object):
+class _Norm(object):
     def __init__(self):
         pass
     def scale_by(self, Y):
@@ -18,7 +18,8 @@ class Norm(object):
         """
         Project Y into normalized space
         """
-        raise NotImplementedError
+        if not self.scaled():
+            raise AttributeError("Norm object not initialized yet, try calling scale_by(data) first.")
     def inverse_mean(self, X):
         """
         Project the normalized object X into space of Y
@@ -31,15 +32,46 @@ class Norm(object):
         Whether this Norm object has been initialized.
         """
         raise NotImplementedError
-class MeanNorm(Norm):
+
+class Standardize(_Norm):
     def __init__(self):
         self.mean = None
     def scale_by(self, Y):
         Y = np.ma.masked_invalid(Y, copy=False)
         self.mean = Y.mean(0).view(np.ndarray)
+        self.std = Y.std(0).view(np.ndarray)
     def normalize(self, Y):
-        return Y-self.mean
+        super(Standardize, self).normalize(Y)
+        return (Y-self.mean)/self.std
     def inverse_mean(self, X):
-        return X+self.mean
+        return (X*self.std)+self.mean
+    def inverse_variance(self, var):
+        return (var*(self.std**2))
     def scaled(self):
         return self.mean is not None
+
+# Inverse variance to be implemented, disabling for now
+# If someone in the future want to implement this,
+# we need to implement the inverse variance for 
+# normalization. This means, we need to know the factor
+# for the variance to be multiplied to the variance in
+# normalized space. This is easy to compute for standardization
+# (see above) but gets tricky here.
+# class Normalize(_Norm):
+#     def __init__(self):
+#         self.ymin = None
+#         self.ymax = None
+#     def scale_by(self, Y):
+#         Y = np.ma.masked_invalid(Y, copy=False)
+#         self.ymin = Y.min(0).view(np.ndarray)
+#         self.ymax = Y.max(0).view(np.ndarray)
+#     def normalize(self, Y):
+#         super(Normalize, self).normalize(Y)
+#         return (Y - self.ymin) / (self.ymax - self.ymin) - .5
+#     def inverse_mean(self, X):
+#         return (X + .5) * (self.ymax - self.ymin) + self.ymin
+#     def inverse_variance(self, var):
+#         
+#         return (var*(self.std**2))
+#     def scaled(self):
+#         return (self.ymin is not None) and (self.ymax is not None)
\ No newline at end of file
diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index 6b8b73e9..3af28d43 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -31,7 +31,7 @@ class WarpingFunction(Parameterized):
         """gradient of f w.r.t to y"""
         raise NotImplementedError
 
-    def f_inv(self, z, max_iterations=100, y=None):
+    def f_inv(self, z, max_iterations=250, y=None):
         """
         Calculate the numerical inverse of f. This should be
         overwritten for specific warping functions where the
@@ -51,14 +51,11 @@ class WarpingFunction(Parameterized):
             update = (fy - z) / fgrady
             y -= self.rate * update
             it += 1
-        if it == max_iterations:
-            print("WARNING!!! Maximum number of iterations reached in f_inv ")
-            print("Sum of roots: %.4f" % np.sum(fy - z))
+        #if it == max_iterations:
+        #    print("WARNING!!! Maximum number of iterations reached in f_inv ")
+        #    print("Sum of roots: %.4f" % np.sum(fy - z))
         return y
 
-    def _get_param_names(self):
-        raise NotImplementedError
-
     def plot(self, xmin, xmax):
         y = np.arange(xmin, xmax, 0.01)
         f_y = self.f(y)
@@ -159,13 +156,6 @@ class TanhFunction(WarpingFunction):
 
         return gradients
 
-    def _get_param_names(self):
-        variables = ['a', 'b', 'c', 'd']
-        names = sum([['warp_tanh_%s_t%i' % (variables[n],q) for n in range(3)] 
-                     for q in range(self.n_terms)],[])
-        names.append('warp_tanh')
-        return names
-
     def update_grads(self, Y_untransformed, Kiy):
         grad_y = self.fgrad_y(Y_untransformed)
         grad_y_psi, grad_psi = self.fgrad_y_psi(Y_untransformed,

From a4777c705ef47a6755bcfcc88c87730784081996 Mon Sep 17 00:00:00 2001
From: mzwiessele <ibinbei@gmail.com>
Date: Wed, 17 Aug 2016 14:53:26 +0100
Subject: [PATCH 53/53] =?UTF-8?q?Bump=20version:=201.3.2=20=E2=86=92=201.4?=
 =?UTF-8?q?.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 GPy/__version__.py | 2 +-
 appveyor.yml       | 2 +-
 setup.cfg          | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/GPy/__version__.py b/GPy/__version__.py
index f708a9b2..3e8d9f94 100644
--- a/GPy/__version__.py
+++ b/GPy/__version__.py
@@ -1 +1 @@
-__version__ = "1.3.2"
+__version__ = "1.4.0"
diff --git a/appveyor.yml b/appveyor.yml
index 1f7a7dd9..ed0ec0d6 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -3,7 +3,7 @@ environment:
     secure: 8/ZjXFwtd1S7ixd7PJOpptupKKEDhm2da/q3unabJ00=
   COVERALLS_REPO_TOKEN:
     secure: d3Luic/ESkGaWnZrvWZTKrzO+xaVwJWaRCEP0F+K/9DQGPSRZsJ/Du5g3s4XF+tS
-  gpy_version: 1.3.2
+  gpy_version: 1.4.0
   matrix:
     - PYTHON_VERSION: 2.7
       MINICONDA: C:\Miniconda-x64
diff --git a/setup.cfg b/setup.cfg
index 6651958c..9546c53a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.3.2
+current_version = 1.4.0
 tag = False
 commit = True