From c22167153085b9534a8b335de6b28c06bbc4a84b Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Thu, 30 Jul 2015 17:05:03 +0100
Subject: [PATCH 01/14] added a small test for warped gps

---
 GPy/testing/model_tests.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index ce78ee88..f609f378 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -203,6 +203,13 @@ class MiscTests(unittest.TestCase):
         m.optimize()
         print(m)
 
+    def test_warped_gp(self):
+        k = GPy.kern.RBF(1)
+        m = GPy.models.WarpedGP(self.X, self.Y, kernel=k)
+        m.randomize()
+        m.optimize()
+        print(m)
+
 class GradientTests(np.testing.TestCase):
     def setUp(self):
         ######################################

From 7a7d563576c2d98fc33acee0ac8ba9c61fde6176 Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Thu, 30 Jul 2015 17:09:31 +0100
Subject: [PATCH 02/14] commented out has_uncertain_inputs in warped gps since
 it breaks plotting

---
 GPy/models/warped_gp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index 5bc9a417..b5654053 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -23,7 +23,7 @@ class WarpedGP(GP):
         self.scale_data = False
         if self.scale_data:
             Y = self._scale_data(Y)
-        self.has_uncertain_inputs = False
+        #self.has_uncertain_inputs = False
         self.Y_untransformed = Y.copy()
         self.predict_in_warped_space = False
         likelihood = likelihoods.Gaussian()

From 45321f536bc93b34524d0c5737fd25e73bfdcaba Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Thu, 30 Jul 2015 17:19:16 +0100
Subject: [PATCH 03/14] added keywords to predict in warped gps because they
 are used in the plotting method

---
 GPy/models/warped_gp.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index b5654053..55825be0 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -69,7 +69,7 @@ class WarpedGP(GP):
     def plot_warping(self):
         self.warping_function.plot(self.Y_untransformed.min(), self.Y_untransformed.max())
 
-    def predict(self, Xnew, which_parts='all', pred_init=None):
+    def predict(self, Xnew, which_parts='all', pred_init=None, Y_metadata=None, full_cov=False):
         # normalize X values
         # Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
         mu, var = GP._raw_predict(self, Xnew)

From 00a209f4cb91cd40557c78142443e43ef78ddbed Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Thu, 30 Jul 2015 17:33:42 +0100
Subject: [PATCH 04/14] added predict_quantiles method to warped gps

---
 GPy/models/warped_gp.py | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index 55825be0..eec37e48 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -69,7 +69,7 @@ class WarpedGP(GP):
     def plot_warping(self):
         self.warping_function.plot(self.Y_untransformed.min(), self.Y_untransformed.max())
 
-    def predict(self, Xnew, which_parts='all', pred_init=None, Y_metadata=None, full_cov=False):
+    def predict(self, Xnew, which_parts='all', pred_init=None, full_cov=False, Y_metadata=None):
         # normalize X values
         # Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
         mu, var = GP._raw_predict(self, Xnew)
@@ -86,6 +86,27 @@ class WarpedGP(GP):
 
         return mean, var
 
+    def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None):
+        """
+        Get the predictive quantiles around the prediction at X
+
+        :param X: The points at which to make a prediction
+        :type X: np.ndarray (Xnew x self.input_dim)
+        :param quantiles: tuple of quantiles, default is (2.5, 97.5) which is the 95% interval
+        :type quantiles: tuple
+        :returns: list of quantiles for each X and predictive quantiles for interval combination
+        :rtype: [np.ndarray (Xnew x self.input_dim), np.ndarray (Xnew x self.input_dim)]
+        """
+        m, v = self._raw_predict(X,  full_cov=False)
+        if self.normalizer is not None:
+            m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
+        a, b = self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata)
+        #print a.shape
+        new_a = self.warping_function.f_inv(a)
+        new_b = self.warping_function.f_inv(b)
+        return [new_a, new_b]
+        #return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata)
+
 if __name__ == '__main__':
     X = np.random.randn(100, 1)
     Y = np.sin(X) + np.random.randn(100, 1)*0.05

From cf4c5af48764762cf141b901fe4fef7f1b9c00b0 Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Thu, 30 Jul 2015 17:38:21 +0100
Subject: [PATCH 05/14] if the plot is a warped gp, then it plots Y before
 warping

---
 GPy/plotting/matplot_dep/models_plots.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/GPy/plotting/matplot_dep/models_plots.py b/GPy/plotting/matplot_dep/models_plots.py
index 9f841372..9ad166b5 100644
--- a/GPy/plotting/matplot_dep/models_plots.py
+++ b/GPy/plotting/matplot_dep/models_plots.py
@@ -10,6 +10,7 @@ import numpy as np
 from base_plots import gpplot, x_frame1D, x_frame2D
 from ...models.gp_coregionalized_regression import GPCoregionalizedRegression
 from ...models.sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression
+from ...models.warped_gp import WarpedGP
 from scipy import sparse
 from ...core.parameterization.variational import VariationalPosterior
 
@@ -73,6 +74,10 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
     else:
         X = model.X
     Y = model.Y
+
+    if isinstance(model, WarpedGP):
+        Y = model.Y_untransformed
+
     if sparse.issparse(Y): Y = Y.todense().view(np.ndarray)
 
     if hasattr(model, 'Z'): Z = model.Z

From 3aa563d5bad488d3e577ca3c457d2519e0255f21 Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Tue, 4 Aug 2015 14:22:46 +0100
Subject: [PATCH 06/14] first try in implementing warped mean

---
 GPy/models/warped_gp.py | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index eec37e48..540b6cb2 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -25,7 +25,7 @@ class WarpedGP(GP):
             Y = self._scale_data(Y)
         #self.has_uncertain_inputs = False
         self.Y_untransformed = Y.copy()
-        self.predict_in_warped_space = False
+        self.predict_in_warped_space = True
         likelihood = likelihoods.Gaussian()
 
         GP.__init__(self, X, self.transform_data(), likelihood=likelihood, kernel=kernel)
@@ -69,7 +69,19 @@ class WarpedGP(GP):
     def plot_warping(self):
         self.warping_function.plot(self.Y_untransformed.min(), self.Y_untransformed.max())
 
-    def predict(self, Xnew, which_parts='all', pred_init=None, full_cov=False, Y_metadata=None):
+    def _get_warped_mean(self, mean, var, pred_init=None, deg_gauss_hermite=100):
+        """
+        Calculate the warped mean by using Gauss-Hermite quadrature.
+        """
+        gh_samples, gh_weights = np.polynomial.hermite.hermgauss(deg_gauss_hermite)
+        gh_samples = gh_samples[:,None]
+        gh_weights = gh_weights[None,:]
+        arg1 = gh_samples.dot(var.T) * np.sqrt(2)
+        arg2 = np.ones(shape=gh_samples.shape).dot(mean.T)
+        return gh_weights.dot(self.warping_function.f_inv(arg1 + arg2, y=pred_init)) / np.sqrt(np.pi)
+
+    def predict(self, Xnew, which_parts='all', pred_init=None, full_cov=False, Y_metadata=None,
+                median=False, deg_gauss_hermite=100):
         # normalize X values
         # Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
         mu, var = GP._raw_predict(self, Xnew)
@@ -78,13 +90,17 @@ class WarpedGP(GP):
         mean, var = self.likelihood.predictive_values(mu, var)
 
         if self.predict_in_warped_space:
-            mean = self.warping_function.f_inv(mean,  y=pred_init)
+            if median:
+                pred = self.warping_function.f_inv(mean,  y=pred_init)
+            else:
+                pred = self._get_warped_mean(mean, var, pred_init=pred_init,
+                                             deg_gauss_hermite=deg_gauss_hermite).T
             var = self.warping_function.f_inv(var)
 
         if self.scale_data:
-            mean = self._unscale_data(mean)
+            pred = self._unscale_data(pred)
 
-        return mean, var
+        return pred, var
 
     def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None):
         """

From 2426580d4d2f98a0b8fcb92208ed91330acc466f Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Tue, 4 Aug 2015 18:46:10 +0100
Subject: [PATCH 07/14] implemented variance using gauss-hermite

---
 GPy/models/warped_gp.py       | 36 ++++++++++++++++++++++++++++-------
 GPy/util/warping_functions.py |  1 +
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index 540b6cb2..0be3d2fe 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -64,11 +64,17 @@ class WarpedGP(GP):
     def log_likelihood(self):
         ll = GP.log_likelihood(self)
         jacobian = self.warping_function.fgrad_y(self.Y_untransformed)
+        print np.log(jacobian)
         return ll + np.log(jacobian).sum()
 
     def plot_warping(self):
         self.warping_function.plot(self.Y_untransformed.min(), self.Y_untransformed.max())
 
+    def _get_warped_term(self, mean, var, gh_samples, pred_init=None):
+        arg1 = gh_samples.dot(var.T) * np.sqrt(2)
+        arg2 = np.ones(shape=gh_samples.shape).dot(mean.T)
+        return self.warping_function.f_inv(arg1 + arg2, y=pred_init)
+
     def _get_warped_mean(self, mean, var, pred_init=None, deg_gauss_hermite=100):
         """
         Calculate the warped mean by using Gauss-Hermite quadrature.
@@ -76,9 +82,17 @@ class WarpedGP(GP):
         gh_samples, gh_weights = np.polynomial.hermite.hermgauss(deg_gauss_hermite)
         gh_samples = gh_samples[:,None]
         gh_weights = gh_weights[None,:]
-        arg1 = gh_samples.dot(var.T) * np.sqrt(2)
-        arg2 = np.ones(shape=gh_samples.shape).dot(mean.T)
-        return gh_weights.dot(self.warping_function.f_inv(arg1 + arg2, y=pred_init)) / np.sqrt(np.pi)
+        return gh_weights.dot(self._get_warped_term(mean, var, gh_samples)) / np.sqrt(np.pi)
+
+    def _get_warped_variance(self, mean, var, pred_init=None, deg_gauss_hermite=100):
+        gh_samples, gh_weights = np.polynomial.hermite.hermgauss(deg_gauss_hermite)
+        gh_samples = gh_samples[:,None]
+        gh_weights = gh_weights[None,:]
+        arg1 = gh_weights.dot(self._get_warped_term(mean, var, gh_samples, 
+                                                    pred_init=pred_init) ** 2) / np.sqrt(np.pi)
+        arg2 = self._get_warped_mean(mean, var, pred_init=pred_init,
+                                     deg_gauss_hermite=deg_gauss_hermite)
+        return arg1 - (arg2 ** 2)
 
     def predict(self, Xnew, which_parts='all', pred_init=None, full_cov=False, Y_metadata=None,
                 median=False, deg_gauss_hermite=100):
@@ -91,16 +105,24 @@ class WarpedGP(GP):
 
         if self.predict_in_warped_space:
             if median:
-                pred = self.warping_function.f_inv(mean,  y=pred_init)
+                #print 'MEDIAN!'
+                wmean = self.warping_function.f_inv(mean,  y=pred_init)
             else:
-                pred = self._get_warped_mean(mean, var, pred_init=pred_init,
+                #print 'MEAN!'
+                wmean = self._get_warped_mean(mean, var, pred_init=pred_init,
+                                              deg_gauss_hermite=deg_gauss_hermite).T
+            #var = self.warping_function.f_inv(var)
+            wvar = self._get_warped_variance(mean, var, pred_init=pred_init,
                                              deg_gauss_hermite=deg_gauss_hermite).T
-            var = self.warping_function.f_inv(var)
+        else:
+            wmean = mean
+            #wvar = var
+            wvar = self.warping_function.f_inv(var)
 
         if self.scale_data:
             pred = self._unscale_data(pred)
 
-        return pred, var
+        return wmean, wvar
 
     def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None):
         """
diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index a56f8c27..37f4ef3e 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -45,6 +45,7 @@ class WarpingFunction(Parameterized):
         plt.xlabel('y')
         plt.ylabel('f(y)')
         plt.title('warping function')
+        plt.show()
 
 class TanhWarpingFunction(WarpingFunction):
 

From 7fd8c9c556d9bc42d2623433b4c3f454e0baaf1e Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Wed, 5 Aug 2015 10:35:15 +0100
Subject: [PATCH 08/14] added identity warp function and change np.argmin to
 np.nanargmin in optimize_restarts

---
 GPy/core/model.py             |  2 +-
 GPy/util/warping_functions.py | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/GPy/core/model.py b/GPy/core/model.py
index 9d8b89f4..cf45ee3c 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -104,7 +104,7 @@ class Model(Parameterized):
                     raise e
 
         if len(self.optimization_runs):
-            i = np.argmin([o.f_opt for o in self.optimization_runs])
+            i = np.nanargmin([o.f_opt for o in self.optimization_runs])
             self.optimizer_array = self.optimization_runs[i].x_opt
         else:
             self.optimizer_array = initial_parameters
diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index 37f4ef3e..00e24c4b 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -293,3 +293,25 @@ class TanhWarpingFunction_d(WarpingFunction):
         names = sum([['warp_tanh_%s_t%i' % (variables[n],q) for n in range(3)] for q in range(self.n_terms)],[])
         names.append('warp_tanh_d')
         return names
+
+
+class IdentityFunction(WarpingFunction):
+    """
+    Identity warping function. This is for testing and sanity check purposes
+    and should not be used in practice.
+    """
+    def __init__(self):
+        self.num_parameters = 0
+        super(IdentityFunction, self).__init__(name='identity')
+        
+    def f(self, y):
+        return y
+
+    def fgrad_y(self, y):
+        return 1.0
+
+    def fgrad_y_psi(self,y):
+        return 1.0
+
+    def f_inv(self,z):
+        return z

From 76bc0bec254f84aa7ffc5a4ef930b38b33aa385f Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Wed, 5 Aug 2015 10:50:59 +0100
Subject: [PATCH 09/14] added initial test for warped gps using identity
 function

---
 GPy/models/warped_gp.py       |  1 -
 GPy/testing/model_tests.py    |  3 ++-
 GPy/util/warping_functions.py | 17 +++++++++++++----
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index 0be3d2fe..1ce0e92b 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -64,7 +64,6 @@ class WarpedGP(GP):
     def log_likelihood(self):
         ll = GP.log_likelihood(self)
         jacobian = self.warping_function.fgrad_y(self.Y_untransformed)
-        print np.log(jacobian)
         return ll + np.log(jacobian).sum()
 
     def plot_warping(self):
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index f609f378..feec7211 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -205,7 +205,8 @@ class MiscTests(unittest.TestCase):
 
     def test_warped_gp(self):
         k = GPy.kern.RBF(1)
-        m = GPy.models.WarpedGP(self.X, self.Y, kernel=k)
+        warp = GPy.util.warping_functions.IdentityFunction()
+        m = GPy.models.WarpedGP(self.X, self.Y, kernel=k, warping_function=warp)
         m.randomize()
         m.optimize()
         print(m)
diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index 00e24c4b..a3a282c8 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -301,17 +301,26 @@ class IdentityFunction(WarpingFunction):
     and should not be used in practice.
     """
     def __init__(self):
-        self.num_parameters = 0
+        self.num_parameters = 4
+        self.psi = Param('psi', np.zeros((1,3)))
+        self.d = Param('%s' % ('d'), 1.0, Logexp())
         super(IdentityFunction, self).__init__(name='identity')
+        self.link_parameter(self.psi)
+        self.link_parameter(self.d)
+
         
     def f(self, y):
         return y
 
     def fgrad_y(self, y):
-        return 1.0
+        return np.ones(y.shape)
 
-    def fgrad_y_psi(self,y):
-        return 1.0
+    def fgrad_y_psi(self, y, return_covar_chain=False):
+        gradients = np.ones((y.shape[0], y.shape[1], len(self.psi), 4))
+        if return_covar_chain:
+            return gradients, gradients
+        return gradients
+        
 
     def f_inv(self,z):
         return z

From 995de0f3992eb2bbc2c31b289f8d02149068dcaa Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Wed, 5 Aug 2015 19:05:48 +0100
Subject: [PATCH 10/14] added a new test which tries to replicate Snelson's toy
 1D but NR seems to diverge...

---
 GPy/models/warped_gp.py                  | 27 ++++++-----
 GPy/plotting/matplot_dep/models_plots.py |  8 +++-
 GPy/testing/model_tests.py               | 60 ++++++++++++++++++++++--
 GPy/util/warping_functions.py            | 18 ++++---
 4 files changed, 88 insertions(+), 25 deletions(-)

diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index 1ce0e92b..77ded72d 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -69,27 +69,27 @@ class WarpedGP(GP):
     def plot_warping(self):
         self.warping_function.plot(self.Y_untransformed.min(), self.Y_untransformed.max())
 
-    def _get_warped_term(self, mean, var, gh_samples, pred_init=None):
-        arg1 = gh_samples.dot(var.T) * np.sqrt(2)
+    def _get_warped_term(self, mean, std, gh_samples, pred_init=None):
+        arg1 = gh_samples.dot(std.T) * np.sqrt(2)
         arg2 = np.ones(shape=gh_samples.shape).dot(mean.T)
         return self.warping_function.f_inv(arg1 + arg2, y=pred_init)
 
-    def _get_warped_mean(self, mean, var, pred_init=None, deg_gauss_hermite=100):
+    def _get_warped_mean(self, mean, std, pred_init=None, deg_gauss_hermite=100):
         """
         Calculate the warped mean by using Gauss-Hermite quadrature.
         """
         gh_samples, gh_weights = np.polynomial.hermite.hermgauss(deg_gauss_hermite)
         gh_samples = gh_samples[:,None]
         gh_weights = gh_weights[None,:]
-        return gh_weights.dot(self._get_warped_term(mean, var, gh_samples)) / np.sqrt(np.pi)
+        return gh_weights.dot(self._get_warped_term(mean, std, gh_samples)) / np.sqrt(np.pi)
 
-    def _get_warped_variance(self, mean, var, pred_init=None, deg_gauss_hermite=100):
+    def _get_warped_variance(self, mean, std, pred_init=None, deg_gauss_hermite=100):
         gh_samples, gh_weights = np.polynomial.hermite.hermgauss(deg_gauss_hermite)
         gh_samples = gh_samples[:,None]
         gh_weights = gh_weights[None,:]
-        arg1 = gh_weights.dot(self._get_warped_term(mean, var, gh_samples, 
+        arg1 = gh_weights.dot(self._get_warped_term(mean, std, gh_samples, 
                                                     pred_init=pred_init) ** 2) / np.sqrt(np.pi)
-        arg2 = self._get_warped_mean(mean, var, pred_init=pred_init,
+        arg2 = self._get_warped_mean(mean, std, pred_init=pred_init,
                                      deg_gauss_hermite=deg_gauss_hermite)
         return arg1 - (arg2 ** 2)
 
@@ -103,20 +103,20 @@ class WarpedGP(GP):
         mean, var = self.likelihood.predictive_values(mu, var)
 
         if self.predict_in_warped_space:
+            std = np.sqrt(var)
             if median:
                 #print 'MEDIAN!'
-                wmean = self.warping_function.f_inv(mean,  y=pred_init)
+                wmean = self.warping_function.f_inv(mean, y=pred_init)
             else:
                 #print 'MEAN!'
-                wmean = self._get_warped_mean(mean, var, pred_init=pred_init,
+                wmean = self._get_warped_mean(mean, std, pred_init=pred_init,
                                               deg_gauss_hermite=deg_gauss_hermite).T
             #var = self.warping_function.f_inv(var)
-            wvar = self._get_warped_variance(mean, var, pred_init=pred_init,
+            wvar = self._get_warped_variance(mean, std, pred_init=pred_init,
                                              deg_gauss_hermite=deg_gauss_hermite).T
         else:
             wmean = mean
-            #wvar = var
-            wvar = self.warping_function.f_inv(var)
+            wvar = var
 
         if self.scale_data:
             pred = self._unscale_data(pred)
@@ -138,9 +138,12 @@ class WarpedGP(GP):
         if self.normalizer is not None:
             m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
         a, b = self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata)
+        if not self.predict_in_warped_space:
+            return [a, b]
         #print a.shape
         new_a = self.warping_function.f_inv(a)
         new_b = self.warping_function.f_inv(b)
+
         return [new_a, new_b]
         #return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata)
 
diff --git a/GPy/plotting/matplot_dep/models_plots.py b/GPy/plotting/matplot_dep/models_plots.py
index 9ad166b5..e3bef675 100644
--- a/GPy/plotting/matplot_dep/models_plots.py
+++ b/GPy/plotting/matplot_dep/models_plots.py
@@ -75,7 +75,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
         X = model.X
     Y = model.Y
 
-    if isinstance(model, WarpedGP):
+    if isinstance(model, WarpedGP) and model.predict_in_warped_space:
         Y = model.Y_untransformed
 
     if sparse.issparse(Y): Y = Y.todense().view(np.ndarray)
@@ -117,7 +117,11 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
                     Y_metadata = {'output_index': extra_data}
                 else:
                     Y_metadata['output_index'] = extra_data
-            m, v = model.predict(Xgrid, full_cov=False, Y_metadata=Y_metadata, **predict_kw)
+            if isinstance(model, WarpedGP):
+                m, v = model.predict(Xgrid, full_cov=False, median=True, Y_metadata=Y_metadata, **predict_kw)
+                #print np.concatenate((Xgrid, m), axis=1)
+            else:
+                m, v = model.predict(Xgrid, full_cov=False, Y_metadata=Y_metadata, **predict_kw)
             lower, upper = model.predict_quantiles(Xgrid, Y_metadata=Y_metadata)
 
 
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index feec7211..ca2dce07 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -203,13 +203,63 @@ class MiscTests(unittest.TestCase):
         m.optimize()
         print(m)
 
-    def test_warped_gp(self):
+    def test_warped_gp_identity(self):
+        """
+        A WarpedGP with the identity warping function should be
+        equal to a standard GP.
+        """
         k = GPy.kern.RBF(1)
-        warp = GPy.util.warping_functions.IdentityFunction()
-        m = GPy.models.WarpedGP(self.X, self.Y, kernel=k, warping_function=warp)
-        m.randomize()
+        m = GPy.models.GPRegression(self.X, self.Y, kernel=k)
         m.optimize()
-        print(m)
+        preds = m.predict(self.X)
+
+        warp_k = GPy.kern.RBF(1)
+        warp_f = GPy.util.warping_functions.IdentityFunction()
+        warp_m = GPy.models.WarpedGP(self.X, self.Y, kernel=warp_k, warping_function=warp_f)
+        warp_m.optimize()
+        warp_preds = warp_m.predict(self.X)
+ 
+        np.testing.assert_almost_equal(preds, warp_preds)
+
+    @unittest.skip('Comment this to plot the modified sine function')
+    def test_warped_gp_sine(self):
+        """
+        A test replicating the sine regression problem from
+        Snelson's paper.
+        """
+        X = (2 * np.pi) * np.random.random(151) - np.pi
+        Y = np.sin(X) + np.random.normal(0,0.1,151)
+        Y = np.exp(Y) - 5
+        #Y = np.array([np.power(abs(y),float(1)/3) * (1,-1)[y<0] for y in Y]) + 0
+        
+        #np.seterr(over='raise')
+        import matplotlib.pyplot as plt
+        warp_k = GPy.kern.RBF(1)
+        warp_f = GPy.util.warping_functions.TanhWarpingFunction_d(n_terms=2)
+        warp_m = GPy.models.WarpedGP(X[:, None], Y[:, None], kernel=warp_k, warping_function=warp_f)
+        #warp_m['.*variance.*'].constrain_fixed(0.25)
+        #warp_m['.*lengthscale.*'].constrain_fixed(1)
+        #warp_m['warp_tanh.d'].constrain_fixed(1)
+        #warp_m.randomize()
+        #warp_m['.*warp_tanh.psi*'][:,0:2].constrain_bounded(0,100)
+        #warp_m['.*warp_tanh.psi*'][:,0:1].constrain_fixed(1)
+        
+        #print(warp_m.checkgrad())
+        #warp_m.plot()
+        #plt.show()
+
+        warp_m.optimize_restarts(parallel=True, robust=True)
+        #print(warp_m.checkgrad())
+        print(warp_m)
+        print(warp_m['.*warp.*'])
+        warp_m.predict_in_warped_space = False
+        warp_m.plot()
+        warp_m.predict_in_warped_space = True
+        warp_m.plot()
+        warp_f.plot(X.min()-10, X.max()+10)
+        plt.show()
+
+        
 
 class GradientTests(np.testing.TestCase):
     def setUp(self):
diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index a3a282c8..88e688b8 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -211,17 +211,24 @@ class TanhWarpingFunction_d(WarpingFunction):
 
         z = z.copy()
         if y is None:
-            y = np.ones_like(z)
+            y = np.ones_like(z) * 0.1
+            #y = np.zeros_like(z)
 
         it = 0
         update = np.inf
+        #import ipdb; ipdb.set_trace()
 
         while it == 0 or (np.abs(update).sum() > 1e-10 and it < max_iterations):
-            update = (self.f(y) - z)/self.fgrad_y(y)
+            fy = self.f(y)
+            fgrady = self.fgrad_y(y)
+            update = (fy - z)/fgrady
             y -= update
             it += 1
+            #print it
+            #print y
         if it == max_iterations:
             print("WARNING!!! Maximum number of iterations reached in f_inv ")
+            #print np.abs(update)
 
         return y
 
@@ -265,7 +272,7 @@ class TanhWarpingFunction_d(WarpingFunction):
         mpsi = self.psi
 
         w, s, r, d = self.fgrad_y(y, return_precalc = True)
-
+        #print s
         gradients = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4))
         for i in range(len(mpsi)):
             a,b,c  = mpsi[i]
@@ -316,11 +323,10 @@ class IdentityFunction(WarpingFunction):
         return np.ones(y.shape)
 
     def fgrad_y_psi(self, y, return_covar_chain=False):
-        gradients = np.ones((y.shape[0], y.shape[1], len(self.psi), 4))
+        gradients = np.zeros((y.shape[0], y.shape[1], len(self.psi), 4))
         if return_covar_chain:
             return gradients, gradients
         return gradients
         
-
-    def f_inv(self,z):
+    def f_inv(self, z, y=None):
         return z

From 96441382b95bab88c252328a717211494eabfc08 Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Wed, 5 Aug 2015 19:15:34 +0100
Subject: [PATCH 11/14] code cleaning on warping_functions

---
 GPy/util/warping_functions.py | 70 ++++++++++-------------------------
 1 file changed, 19 insertions(+), 51 deletions(-)

diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index 88e688b8..b8b9a261 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -5,6 +5,7 @@ import numpy as np
 from ..core.parameterization import Parameterized, Param
 from ..core.parameterization.transformations import Logexp
 
+
 class WarpingFunction(Parameterized):
     """
     abstract function for warping
@@ -14,28 +15,28 @@ class WarpingFunction(Parameterized):
     def __init__(self, name):
         super(WarpingFunction, self).__init__(name=name)
 
-    def f(self,y,psi):
+    def f(self, y, psi):
         """function transformation
-        y is a list of values (GP training data) of shpape [N,1]
+        y is a list of values (GP training data) of shape [N, 1]
         """
         raise NotImplementedError
 
-    def fgrad_y(self,y,psi):
+    def fgrad_y(self, y, psi):
         """gradient of f w.r.t to y"""
         raise NotImplementedError
 
-    def fgrad_y_psi(self,y,psi):
+    def fgrad_y_psi(self, y, psi):
         """gradient of f w.r.t to y"""
         raise NotImplementedError
 
-    def f_inv(self,z,psi):
+    def f_inv(self, z, psi):
         """inverse function transformation"""
         raise NotImplementedError
 
     def _get_param_names(self):
         raise NotImplementedError
 
-    def plot(self,  xmin, xmax):
+    def plot(self, xmin, xmax):
         psi = self.psi
         y = np.arange(xmin, xmax, 0.01)
         f_y = self.f(y)
@@ -47,22 +48,21 @@ class WarpingFunction(Parameterized):
         plt.title('warping function')
         plt.show()
 
+
 class TanhWarpingFunction(WarpingFunction):
 
-    def __init__(self,n_terms=3):
+    def __init__(self, n_terms=3):
         """n_terms specifies the number of tanh terms to be used"""
         self.n_terms = n_terms
         self.num_parameters = 3 * self.n_terms
         super(TanhWarpingFunction, self).__init__(name='warp_tanh')
 
-    def f(self,y,psi):
+    def f(self, y, psi):
         """
         transform y with f using parameter vector psi
         psi = [[a,b,c]]
         ::math::`f = \\sum_{terms} a * tanh(b*(y+c))`
-
         """
-
         #1. check that number of params is consistent
         assert psi.shape[0] == self.n_terms, 'inconsistent parameter dimensions'
         assert psi.shape[1] == 3, 'inconsistent parameter dimensions'
@@ -77,25 +77,19 @@ class TanhWarpingFunction(WarpingFunction):
             z += a*np.tanh(b*(y+c))
         return z
 
-
-    def f_inv(self, y, psi, iterations = 10):
+    def f_inv(self, y, psi, iterations=10):
         """
         calculate the numerical inverse of f
-
         :param iterations: number of N.R. iterations
-
         """
 
         y = y.copy()
         z = np.ones_like(y)
-
         for i in range(iterations):
             z -= (self.f(z, psi) - y)/self.fgrad_y(z,psi)
-
         return z
 
-
-    def fgrad_y(self, y, psi, return_precalc = False):
+    def fgrad_y(self, y, psi, return_precalc=False):
         """
         gradient of f w.r.t to y ([N x 1])
         returns: Nx1 vector of derivatives, unless return_precalc is true,
@@ -118,16 +112,12 @@ class TanhWarpingFunction(WarpingFunction):
             # return GRAD,S.sum(axis=1),R.sum(axis=1),D.sum(axis=1)
             return GRAD, S, R, D
 
-
         return GRAD
 
-
-    def fgrad_y_psi(self, y, psi, return_covar_chain = False):
+    def fgrad_y_psi(self, y, psi, return_covar_chain=False):
         """
         gradient of f w.r.t to y and psi
-
         returns: NxIx3 tensor of partial derivatives
-
         """
 
         # 1. exponentiate the a and b (positive!)
@@ -141,7 +131,6 @@ class TanhWarpingFunction(WarpingFunction):
             gradients[:,:,i,1] = a*(d[i] - 2.0*s[i]*r[i]*(1.0/np.cosh(s[i]))**2).T
             gradients[:,:,i,2] = (-2.0*a*(b**2)*r[i]*((1.0/np.cosh(s[i]))**2)).T
 
-
         if return_covar_chain:
             covar_grad_chain = np.zeros((y.shape[0], y.shape[1], len(mpsi), 3))
 
@@ -163,7 +152,7 @@ class TanhWarpingFunction(WarpingFunction):
 
 class TanhWarpingFunction_d(WarpingFunction):
 
-    def __init__(self,n_terms=3):
+    def __init__(self, n_terms=3):
         """n_terms specifies the number of tanh terms to be used"""
         self.n_terms = n_terms
         self.num_parameters = 3 * self.n_terms + 1
@@ -177,15 +166,13 @@ class TanhWarpingFunction_d(WarpingFunction):
         self.link_parameter(self.psi)
         self.link_parameter(self.d)
 
-
-    def f(self,y):
+    def f(self, y):
         """
         Transform y with f using parameter vector psi
         psi = [[a,b,c]]
 
         :math:`f = \\sum_{terms} a * tanh(b*(y+c))`
         """
-
         #1. check that number of params is consistent
         # assert psi.shape[0] == self.n_terms, 'inconsistent parameter dimensions'
         # assert psi.shape[1] == 4, 'inconsistent parameter dimensions'
@@ -200,23 +187,19 @@ class TanhWarpingFunction_d(WarpingFunction):
             z += a*np.tanh(b*(y+c))
         return z
 
-
     def f_inv(self, z, max_iterations=1000, y=None):
         """
         calculate the numerical inverse of f
 
         :param max_iterations: maximum number of N.R. iterations
-
         """
 
         z = z.copy()
         if y is None:
-            y = np.ones_like(z) * 0.1
-            #y = np.zeros_like(z)
+            y = np.ones_like(z)
 
         it = 0
         update = np.inf
-        #import ipdb; ipdb.set_trace()
 
         while it == 0 or (np.abs(update).sum() > 1e-10 and it < max_iterations):
             fy = self.f(y)
@@ -224,29 +207,20 @@ class TanhWarpingFunction_d(WarpingFunction):
             update = (fy - z)/fgrady
             y -= update
             it += 1
-            #print it
-            #print y
         if it == max_iterations:
             print("WARNING!!! Maximum number of iterations reached in f_inv ")
-            #print np.abs(update)
-
         return y
 
-
-    def fgrad_y(self, y,return_precalc = False):
+    def fgrad_y(self, y, return_precalc=False):
         """
         gradient of f w.r.t to y ([N x 1])
 
         :returns: Nx1 vector of derivatives, unless return_precalc is true, then it also returns the precomputed stuff
-
         """
-
-
         d = self.d
         mpsi = self.psi
 
         # vectorized version
-
         S = (mpsi[:,1]*(y[:,:,None] + mpsi[:,2])).T
         R = np.tanh(S)
         D = 1-R**2
@@ -256,23 +230,17 @@ class TanhWarpingFunction_d(WarpingFunction):
         if return_precalc:
             return GRAD, S, R, D
 
-
         return GRAD
 
-
-    def fgrad_y_psi(self, y, return_covar_chain = False):
+    def fgrad_y_psi(self, y, return_covar_chain=False):
         """
         gradient of f w.r.t to y and psi
 
         :returns: NxIx4 tensor of partial derivatives
-
         """
-
-
         mpsi = self.psi
 
-        w, s, r, d = self.fgrad_y(y, return_precalc = True)
-        #print s
+        w, s, r, d = self.fgrad_y(y, return_precalc=True)
         gradients = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4))
         for i in range(len(mpsi)):
             a,b,c  = mpsi[i]

From 20a1e8d3c7d9641a244159ab12867112046babcc Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Wed, 5 Aug 2015 19:18:25 +0100
Subject: [PATCH 12/14] some cleaning on WarpedGP code

---
 GPy/models/warped_gp.py    | 10 +++++-----
 GPy/testing/model_tests.py |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index 77ded72d..06bc85cb 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -16,7 +16,7 @@ class WarpedGP(GP):
 
         if warping_function == None:
             self.warping_function = TanhWarpingFunction_d(warping_terms)
-            self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1,) * 1)
+            self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1) * 1)
         else:
             self.warping_function = warping_function
 
@@ -56,7 +56,6 @@ class WarpedGP(GP):
         self.warping_function.psi.gradient[:] = warping_grads[:, :-1]
         self.warping_function.d.gradient[:] = warping_grads[0, -1]
 
-
     def transform_data(self):
         Y = self.warping_function.f(self.Y_untransformed.copy()).copy()
         return Y
@@ -84,6 +83,9 @@ class WarpedGP(GP):
         return gh_weights.dot(self._get_warped_term(mean, std, gh_samples)) / np.sqrt(np.pi)
 
     def _get_warped_variance(self, mean, std, pred_init=None, deg_gauss_hermite=100):
+        """
+        Calculate the warped variance by using Gauss-Hermite quadrature.
+        """
         gh_samples, gh_weights = np.polynomial.hermite.hermgauss(deg_gauss_hermite)
         gh_samples = gh_samples[:,None]
         gh_weights = gh_weights[None,:]
@@ -105,13 +107,10 @@ class WarpedGP(GP):
         if self.predict_in_warped_space:
             std = np.sqrt(var)
             if median:
-                #print 'MEDIAN!'
                 wmean = self.warping_function.f_inv(mean, y=pred_init)
             else:
-                #print 'MEAN!'
                 wmean = self._get_warped_mean(mean, std, pred_init=pred_init,
                                               deg_gauss_hermite=deg_gauss_hermite).T
-            #var = self.warping_function.f_inv(var)
             wvar = self._get_warped_variance(mean, std, pred_init=pred_init,
                                              deg_gauss_hermite=deg_gauss_hermite).T
         else:
@@ -147,6 +146,7 @@ class WarpedGP(GP):
         return [new_a, new_b]
         #return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata)
 
+
 if __name__ == '__main__':
     X = np.random.randn(100, 1)
     Y = np.sin(X) + np.random.randn(100, 1)*0.05
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index ca2dce07..03bb0bb5 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -221,7 +221,7 @@ class MiscTests(unittest.TestCase):
  
         np.testing.assert_almost_equal(preds, warp_preds)
 
-    @unittest.skip('Comment this to plot the modified sine function')
+    #@unittest.skip('Comment this to plot the modified sine function')
     def test_warped_gp_sine(self):
         """
         A test replicating the sine regression problem from

From 4dd2f4feb724a62f0482395e52be66e3ab7fffdf Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Wed, 5 Aug 2015 19:18:57 +0100
Subject: [PATCH 13/14] commenting plotting test

---
 GPy/testing/model_tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 03bb0bb5..ca2dce07 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -221,7 +221,7 @@ class MiscTests(unittest.TestCase):
  
         np.testing.assert_almost_equal(preds, warp_preds)
 
-    #@unittest.skip('Comment this to plot the modified sine function')
+    @unittest.skip('Comment this to plot the modified sine function')
     def test_warped_gp_sine(self):
         """
         A test replicating the sine regression problem from

From a350e74b317bae9c28467b2c42d952c71954c4da Mon Sep 17 00:00:00 2001
From: beckdaniel <daniel.e.beck@gmail.com>
Date: Sat, 26 Sep 2015 13:30:23 +0100
Subject: [PATCH 14/14] merged master

---
 GPy/models/warped_gp.py                  | 1 +
 GPy/plotting/matplot_dep/models_plots.py | 7 +++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index 06bc85cb..df947d3e 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -137,6 +137,7 @@ class WarpedGP(GP):
         if self.normalizer is not None:
             m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
         a, b = self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata)
+        #return [a, b]
         if not self.predict_in_warped_space:
             return [a, b]
         #print a.shape
diff --git a/GPy/plotting/matplot_dep/models_plots.py b/GPy/plotting/matplot_dep/models_plots.py
index 0986c20e..8fe0b2c2 100644
--- a/GPy/plotting/matplot_dep/models_plots.py
+++ b/GPy/plotting/matplot_dep/models_plots.py
@@ -46,6 +46,8 @@ def plot_data(model, which_data_rows='all',
     #data
     X = model.X
     Y = model.Y
+    if isinstance(model, WarpedGP) and model.predict_in_warped_space:
+        Y = model.Y_untransformed
 
     #work out what the inputs are for plotting (1D or 2D)
     if visible_dims is None:
@@ -184,11 +186,12 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
 
             if isinstance(model, WarpedGP):
                 m, v = model.predict(Xgrid, full_cov=False, median=True, Y_metadata=Y_metadata, **predict_kw)
+                lower, upper = model.predict_quantiles(Xgrid)
                 #print np.concatenate((Xgrid, m), axis=1)
             else:
                 m, v = model.predict(Xgrid, full_cov=False, Y_metadata=Y_metadata, **predict_kw)
-            fmu, fv = model._raw_predict(Xgrid, full_cov=False, **predict_kw)
-            lower, upper = model.likelihood.predictive_quantiles(fmu, fv, (2.5, 97.5), Y_metadata=Y_metadata)
+                fmu, fv = model._raw_predict(Xgrid, full_cov=False, **predict_kw)
+                lower, upper = model.likelihood.predictive_quantiles(fmu, fv, (2.5, 97.5), Y_metadata=Y_metadata)