diff --git a/GPy/__init__.py b/GPy/__init__.py
index 1dc3e6b1..9ee3d935 100644
--- a/GPy/__init__.py
+++ b/GPy/__init__.py
@@ -14,6 +14,8 @@
 from . import testing
 from . import kern
 from . import plotting
+from .util import normalizer
+
 # backwards compatibility
 import sys
 backwards_compatibility = ['lists_and_dicts', 'observable_array', 'ties_and_remappings', 'index_operations']
diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 9cbe3daf..f0910d9d 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -2,17 +2,17 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
-from .. import kern
-from GPy.core.model import Model
-from paramz import ObsAr
+from .model import Model
+from .parameterization.variational import VariationalPosterior
 from .mapping import Mapping
 from .. import likelihoods
+from .. import kern
 from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
-from GPy.core.parameterization.variational import VariationalPosterior
+from ..util.normalizer import Standardize
+from paramz import ObsAr
 
 import logging
 import warnings
-from GPy.util.normalizer import MeanNorm
 logger = logging.getLogger("GP")
 
 class GP(Model):
@@ -28,7 +28,7 @@ class GP(Model):
     :param Norm normalizer:
         normalize the outputs Y.
         Prediction will be un-normalized using this normalizer.
-        If normalizer is None, we will normalize using MeanNorm.
+        If normalizer is True, we will normalize using Standardize.
         If normalizer is False, no normalization will be done.
 
     .. Note:: Multiple independent outputs are allowed using columns of Y
@@ -49,7 +49,7 @@ class GP(Model):
         logger.info("initializing Y")
 
         if normalizer is True:
-            self.normalizer = MeanNorm()
+            self.normalizer = Standardize()
         elif normalizer is False:
             self.normalizer = None
         else:
@@ -64,6 +64,7 @@ class GP(Model):
             self.Y_normalized = self.Y
         else:
             self.Y = Y
+            self.Y_normalized = self.Y
 
         if Y.shape[0] != self.num_data:
             #There can be cases where we want more inputs than outputs, for example if we have multiple latent
@@ -248,14 +249,16 @@ class GP(Model):
         """
         #predict the latent function values
         mu, var = self._raw_predict(Xnew, full_cov=full_cov, kern=kern)
-        if self.normalizer is not None:
-            mu, var = self.normalizer.inverse_mean(mu), self.normalizer.inverse_variance(var)
         if include_likelihood:
             # now push through likelihood
             if likelihood is None:
                 likelihood = self.likelihood
             mu, var = likelihood.predictive_values(mu, var, full_cov, Y_metadata=Y_metadata)
+
+        if self.normalizer is not None:
+            mu, var = self.normalizer.inverse_mean(mu), self.normalizer.inverse_variance(var)
+
         return mu, var
 
     def predict_noiseless(self, Xnew, full_cov=False, Y_metadata=None, kern=None):
         """
@@ -300,11 +303,14 @@ class GP(Model):
         :rtype: [np.ndarray (Xnew x self.output_dim), np.ndarray (Xnew x self.output_dim)]
         """
         m, v = self._raw_predict(X, full_cov=False, kern=kern)
-        if self.normalizer is not None:
-            m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
         if likelihood is None:
             likelihood = self.likelihood
-        return likelihood.predictive_quantiles(m, v, quantiles, Y_metadata=Y_metadata)
+
+        quantiles = likelihood.predictive_quantiles(m, v, quantiles, Y_metadata=Y_metadata)
+
+        if self.normalizer is not None:
+            quantiles = [self.normalizer.inverse_mean(q) for q in quantiles]
+        return quantiles
 
     def predictive_gradients(self, Xnew, kern=None):
         """
diff --git a/GPy/kern/src/static.py b/GPy/kern/src/static.py
index 5cf4a1c9..0ffd8112 100644
--- a/GPy/kern/src/static.py
+++ b/GPy/kern/src/static.py
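Note on the gp.py hunks above: predictions are now pushed through the likelihood while still in normalized space and only un-normalized at the very end, and quantiles are likewise computed in normalized space and mapped back with the mean transformation alone. A minimal sketch of the intended behaviour (synthetic data; a plain Gaussian likelihood is assumed):

```python
import numpy as np
from scipy.stats import norm
import GPy

rng = np.random.RandomState(0)
X = np.linspace(0, 10, 50)[:, None]
Y = 100.0 + 5.0 * np.sin(X) + 0.5 * rng.randn(50, 1)  # large offset and scale

m = GPy.models.GPRegression(X, Y, normalizer=True)  # Standardize() under the hood
m.optimize()

# predict() returns moments on the original scale of Y, and for a Gaussian
# likelihood the un-normalized quantiles stay consistent with them:
# q = mu + z * sqrt(var) on the original scale.
mu, var = m.predict(X[:1])
q_lo, q_hi = m.predict_quantiles(X[:1], (2.5, 97.5))
np.testing.assert_allclose(q_lo, mu + norm.ppf(0.025) * np.sqrt(var))
np.testing.assert_allclose(q_hi, mu + norm.ppf(0.975) * np.sqrt(var))
```

This is the same invariant the new model_tests assertions below check against scipy.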
@@ -135,9 +135,7 @@ class Bias(Static):
 
     def K(self, X, X2=None):
         shape = (X.shape[0], X.shape[0] if X2 is None else X2.shape[0])
-        ret = np.empty(shape, dtype=np.float64)
-        ret[:] = self.variance
-        return ret
+        return np.full(shape, self.variance, dtype=np.float64)
 
     def update_gradients_full(self, dL_dK, X, X2=None):
         self.variance.gradient = dL_dK.sum()
@@ -146,9 +144,7 @@ class Bias(Static):
         self.variance.gradient = dL_dKdiag.sum()
 
     def psi2(self, Z, variational_posterior):
-        ret = np.empty((Z.shape[0], Z.shape[0]), dtype=np.float64)
-        ret[:] = self.variance*self.variance*variational_posterior.shape[0]
-        return ret
+        return np.full((Z.shape[0], Z.shape[0]), self.variance*self.variance*variational_posterior.shape[0], dtype=np.float64)
 
     def psi2n(self, Z, variational_posterior):
         ret = np.empty((variational_posterior.mean.shape[0], Z.shape[0], Z.shape[0]), dtype=np.float64)
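The Bias hunks are pure refactors: the allocate-then-fill idiom is replaced by a single np.full call. A quick standalone equivalence check (illustrative values only):

```python
import numpy as np

variance = 2.5   # stands in for self.variance
shape = (3, 4)

ret = np.empty(shape, dtype=np.float64)  # old pattern: allocate, then fill
ret[:] = variance

# new pattern produces an identical array in one call
assert np.array_equal(ret, np.full(shape, variance, dtype=np.float64))
```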
""" - mu, var = GP._raw_predict(self, Xnew) + #mu, var = GP._raw_predict(self, Xnew) # now push through likelihood - mean, var = self.likelihood.predictive_values(mu, var) + #mean, var = self.likelihood.predictive_values(mu, var) + + mean, var = super(WarpedGP, self).predict(Xnew, kern=kern, full_cov=False, likelihood=likelihood) + if self.predict_in_warped_space: std = np.sqrt(var) @@ -120,11 +113,9 @@ class WarpedGP(GP): else: wmean = mean wvar = var - if self.scale_data: - pred = self._unscale_data(pred) return wmean, wvar - def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None, likelihood=None, median=False): + def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None, likelihood=None, kern=None): """ Get the predictive quantiles around the prediction at X @@ -135,15 +126,19 @@ class WarpedGP(GP): :returns: list of quantiles for each X and predictive quantiles for interval combination :rtype: [np.ndarray (Xnew x self.input_dim), np.ndarray (Xnew x self.input_dim)] """ - m, v = self._raw_predict(X, full_cov=False) - if self.normalizer is not None: - m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v) - a, b = self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata) - if not self.predict_in_warped_space: - return [a, b] - new_a = self.warping_function.f_inv(a) - new_b = self.warping_function.f_inv(b) - return [new_a, new_b] + qs = super(WarpedGP, self).predict_quantiles(X, quantiles, Y_metadata=Y_metadata, likelihood=likelihood, kern=kern) + if self.predict_in_warped_space: + return [self.warping_function.f_inv(q) for q in qs] + return qs + #m, v = self._raw_predict(X, full_cov=False) + #if self.normalizer is not None: + # m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v) + #a, b = self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata) + #if not self.predict_in_warped_space: + # return [a, b] + #new_a = self.warping_function.f_inv(a) + #new_b = self.warping_function.f_inv(b) + #return [new_a, new_b] def log_predictive_density(self, x_test, y_test, Y_metadata=None): """ diff --git a/GPy/plotting/gpy_plot/plot_util.py b/GPy/plotting/gpy_plot/plot_util.py index 937f35ab..7bd1723f 100644 --- a/GPy/plotting/gpy_plot/plot_util.py +++ b/GPy/plotting/gpy_plot/plot_util.py @@ -74,9 +74,6 @@ def helper_predict_with_model(self, Xgrid, plot_raw, apply_link, percentiles, wh if 'output_index' not in predict_kw['Y_metadata']: predict_kw['Y_metadata']['output_index'] = Xgrid[:,-1:].astype(np.int) - if isinstance(self, WarpedGP) and self.predict_in_warped_space: - predict_kw['median'] = True - mu, _ = self.predict(Xgrid, **predict_kw) if percentiles is not None: @@ -299,8 +296,10 @@ def get_x_y_var(model): Y = model.Y.values except AttributeError: Y = model.Y - if isinstance(model, WarpedGP) and model.predict_in_warped_space: - Y = model.Y_untransformed + + if isinstance(model, WarpedGP) and not model.predict_in_warped_space: + Y = model.Y_normalized + if sparse.issparse(Y): Y = Y.todense().view(np.ndarray) return X, X_variance, Y diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py index 010befe4..1e0731ab 100644 --- a/GPy/testing/model_tests.py +++ b/GPy/testing/model_tests.py @@ -91,18 +91,32 @@ class MiscTests(unittest.TestCase): k = GPy.kern.RBF(1) m2 = GPy.models.GPRegression(self.X, (Y-mu)/std, kernel=k, normalizer=False) m2[:] = m[:] + mu1, var1 = m.predict(m.X, full_cov=True) mu2, var2 = m2.predict(m2.X, full_cov=True) np.testing.assert_allclose(mu1, (mu2*std)+mu) - 
diff --git a/GPy/plotting/gpy_plot/plot_util.py b/GPy/plotting/gpy_plot/plot_util.py
index 937f35ab..7bd1723f 100644
--- a/GPy/plotting/gpy_plot/plot_util.py
+++ b/GPy/plotting/gpy_plot/plot_util.py
@@ -74,9 +74,6 @@ def helper_predict_with_model(self, Xgrid, plot_raw, apply_link, percentiles, wh
         if 'output_index' not in predict_kw['Y_metadata']:
             predict_kw['Y_metadata']['output_index'] = Xgrid[:,-1:].astype(np.int)
 
-    if isinstance(self, WarpedGP) and self.predict_in_warped_space:
-        predict_kw['median'] = True
-
     mu, _ = self.predict(Xgrid, **predict_kw)
 
     if percentiles is not None:
@@ -299,8 +296,10 @@ def get_x_y_var(model):
         Y = model.Y.values
     except AttributeError:
         Y = model.Y
-    if isinstance(model, WarpedGP) and model.predict_in_warped_space:
-        Y = model.Y_untransformed
+
+    if isinstance(model, WarpedGP) and not model.predict_in_warped_space:
+        Y = model.Y_normalized
+
     if sparse.issparse(Y):
         Y = Y.todense().view(np.ndarray)
     return X, X_variance, Y
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 010befe4..1e0731ab 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -91,18 +91,32 @@ class MiscTests(unittest.TestCase):
         k = GPy.kern.RBF(1)
         m2 = GPy.models.GPRegression(self.X, (Y-mu)/std, kernel=k, normalizer=False)
         m2[:] = m[:]
+
         mu1, var1 = m.predict(m.X, full_cov=True)
         mu2, var2 = m2.predict(m2.X, full_cov=True)
         np.testing.assert_allclose(mu1, (mu2*std)+mu)
-        np.testing.assert_allclose(var1, var2)
+        np.testing.assert_allclose(var1, var2*std**2)
+
         mu1, var1 = m.predict(m.X, full_cov=False)
         mu2, var2 = m2.predict(m2.X, full_cov=False)
+
         np.testing.assert_allclose(mu1, (mu2*std)+mu)
-        np.testing.assert_allclose(var1, var2)
+        np.testing.assert_allclose(var1, var2*std**2)
 
         q50n = m.predict_quantiles(m.X, (50,))
         q50 = m2.predict_quantiles(m2.X, (50,))
+
         np.testing.assert_allclose(q50n[0], (q50[0]*std)+mu)
+
+        # Test variance component:
+        qs = np.array([2.5, 97.5])
+        # The quantiles get computed before un-normalization
+        # and transformed using the mean transformation:
+        c = np.random.choice(self.X.shape[0])
+        q95 = m2.predict_quantiles(self.X[[c]], qs)
+        mu, var = m2.predict(self.X[[c]])
+        from scipy.stats import norm
+        np.testing.assert_allclose((mu+(norm.ppf(qs/100.)*np.sqrt(var))).flatten(), np.array(q95).flatten())
 
     def check_jacobian(self):
         try:
diff --git a/GPy/util/normalizer.py b/GPy/util/normalizer.py
index 854726c4..78ad945d 100644
--- a/GPy/util/normalizer.py
+++ b/GPy/util/normalizer.py
@@ -6,7 +6,7 @@ Created on Aug 27, 2014
 import logging
 import numpy as np
 
-class Norm(object):
+class _Norm(object):
     def __init__(self):
         pass
     def scale_by(self, Y):
@@ -18,7 +18,8 @@ class Norm(object):
         """
         Project Y into normalized space
         """
-        raise NotImplementedError
+        if not self.scaled():
+            raise AttributeError("Norm object not initialized yet, try calling scale_by(data) first.")
     def inverse_mean(self, X):
         """
         Project the normalized object X into space of Y
@@ -31,15 +32,46 @@ class Norm(object):
         Whether this Norm object has been initialized.
         """
         raise NotImplementedError
-class MeanNorm(Norm):
+
+class Standardize(_Norm):
     def __init__(self):
         self.mean = None
     def scale_by(self, Y):
         Y = np.ma.masked_invalid(Y, copy=False)
         self.mean = Y.mean(0).view(np.ndarray)
+        self.std = Y.std(0).view(np.ndarray)
     def normalize(self, Y):
-        return Y-self.mean
+        super(Standardize, self).normalize(Y)
+        return (Y-self.mean)/self.std
     def inverse_mean(self, X):
-        return X+self.mean
+        return (X*self.std)+self.mean
+    def inverse_variance(self, var):
+        return (var*(self.std**2))
     def scaled(self):
         return self.mean is not None
+
+# Inverse variance to be implemented, disabling for now
+# If someone in the future wants to implement this,
+# we need to implement the inverse variance for
+# normalization. This means we need to know the factor
+# for the variance to be multiplied to the variance in
+# normalized space. This is easy to compute for standardization
+# (see above) but gets tricky here.
+# class Normalize(_Norm):
+#     def __init__(self):
+#         self.ymin = None
+#         self.ymax = None
+#     def scale_by(self, Y):
+#         Y = np.ma.masked_invalid(Y, copy=False)
+#         self.ymin = Y.min(0).view(np.ndarray)
+#         self.ymax = Y.max(0).view(np.ndarray)
+#     def normalize(self, Y):
+#         super(Normalize, self).normalize(Y)
+#         return (Y - self.ymin) / (self.ymax - self.ymin) - .5
+#     def inverse_mean(self, X):
+#         return (X + .5) * (self.ymax - self.ymin) + self.ymin
+#     def inverse_variance(self, var):
+#
+#         return (var*(self.std**2))
+#     def scaled(self):
+#         return (self.ymin is not None) and (self.ymax is not None)
\ No newline at end of file
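Standardize keeps a per-column mean and std, so both the predictive mean and the predictive variance have exact back-transformations; that is exactly why the min-max Normalize variant above is left commented out, since its variance back-transform is not defined. A small round-trip check of the new class:

```python
import numpy as np
from GPy.util.normalizer import Standardize

rng = np.random.RandomState(2)
Y = 3.0 + 7.0 * rng.randn(100, 2)

norm = Standardize()
norm.scale_by(Y)

Yn = norm.normalize(Y)  # ~zero mean, unit std per column
np.testing.assert_allclose(norm.inverse_mean(Yn), Y)
# variances scale by std**2 on the way back
np.testing.assert_allclose(norm.inverse_variance(1.0), norm.std ** 2)
```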
diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index 6b8b73e9..3af28d43 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -31,7 +31,7 @@ class WarpingFunction(Parameterized):
         """gradient of f w.r.t to y"""
         raise NotImplementedError
 
-    def f_inv(self, z, max_iterations=100, y=None):
+    def f_inv(self, z, max_iterations=250, y=None):
         """
         Calculate the numerical inverse of f. This should be
         overwritten for specific warping functions where the
@@ -51,14 +51,11 @@ class WarpingFunction(Parameterized):
             update = (fy - z) / fgrady
             y -= self.rate * update
             it += 1
-            if it == max_iterations:
-                print("WARNING!!! Maximum number of iterations reached in f_inv ")
-                print("Sum of roots: %.4f" % np.sum(fy - z))
+            #if it == max_iterations:
+            #    print("WARNING!!! Maximum number of iterations reached in f_inv ")
+            #    print("Sum of roots: %.4f" % np.sum(fy - z))
         return y
 
-    def _get_param_names(self):
-        raise NotImplementedError
-
     def plot(self, xmin, xmax):
         y = np.arange(xmin, xmax, 0.01)
         f_y = self.f(y)
@@ -159,13 +156,6 @@ class TanhFunction(WarpingFunction):
 
         return gradients
 
-    def _get_param_names(self):
-        variables = ['a', 'b', 'c', 'd']
-        names = sum([['warp_tanh_%s_t%i' % (variables[n],q) for n in range(3)]
-                     for q in range(self.n_terms)],[])
-        names.append('warp_tanh')
-        return names
-
     def update_grads(self, Y_untransformed, Kiy):
         grad_y = self.fgrad_y(Y_untransformed)
         grad_y_psi, grad_psi = self.fgrad_y_psi(Y_untransformed,