diff --git a/.travis.yml b/.travis.yml
index 63fa1c5e..6f96f1ec 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -71,3 +71,4 @@ deploy:
     branch: deploy
   distributions: $DIST
   skip_cleanup: true
+  skip_upload_docs: false
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7e0d5c81..46cb6f69 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,22 +1,311 @@
 # Changelog
 
-## v1.8.5 (2017-12-01)
-
-### New Features
-
-* Implement [Latent Variable Multiple Output Gaussian Processes (LVMOGP)](https://arxiv.org/abs/1705.09862) [Zhenwen Dai]
-
-* Add mean function functionality to dtc inference method [Mark Pullin]
-
-* Allow non-zero mean GP prior for EP [Pablo Moreno]
+## v1.9.2 (2018-02-22)
 
 ### Fix
 
-* Fix DSYR function interface (to support SciPy 1.0) [Pablo Moreno]
+* Rtd. [mzwiessele]
 
-* Fix scipy=1.0.0 incompatibility of lyapunov [Alan Saul]
+* Rtd. [mzwiessele]
+
+* Rtd. [mzwiessele]
+
+* Rtd. [mzwiessele]
+
+* Rtd. [mzwiessele]
+
+* Rtd. [mzwiessele]
+
+* Rtd. [mzwiessele]
+
+* Rtd. [mzwiessele]
+
+* Rtd. [mzwiessele]
+
+* Rtd. [mzwiessele]
+
+* Rtd. [mzwiessele]
+
+* Rtd. [mzwiessele]
+
+* Rtd. [mzwiessele]
+
+* Rtd. [mzwiessele]
+
+* Rtd. [mzwiessele]
+
+* Rtd. [mzwiessele]
+
+* Rtd. [mzwiessele]
+
+### Other
+
+* Bump version: 1.9.1 → 1.9.2. [mzwiessele]
+
+
+## v1.9.1 (2018-02-22)
+
+### Fix
+
+* Paramz newest version. [mzwiessele]
+
+### Other
+
+* Bump version: 1.9.0 → 1.9.1. [mzwiessele]
+
+
+## v1.9.0 (2018-02-22)
+
+### Other
+
+* Bump version: 1.8.7 → 1.9.0. [mzwiessele]
+
+
+## v1.8.7 (2018-02-22)
+
+### Fix
+
+* Merge deploy back into devel. [mzwiessele]
+
+### Other
+
+* Bump version: 1.8.6 → 1.8.7. [mzwiessele]
+
+* Deploy version 1.8.5. [Zhenwen Dai]
+
+  * added extended version of MLP function with multiple hidden layers and different activation functions
+
+  * Update mapping_tests.py
+
+  Make output of gradient check verbose to diagnose error
+
+  * Update mapping_tests.py
+
+  Remove verbosity again after gradient checks passed without problem with verbosity
+
+  * the implementation of SVI-MOGP
+
+  * Try to fix the issue with model_tests
+
+  * updated mapping test to pass gradient checks
+
+  * Fix random seed for reproducible results in tests
+
+  * Add mean function functionality to dtc inference method
+
+  * Fix DSYR function (See https://github.com/scipy/scipy/issues/8155)
+
+  * Updated sde_kern to work with scipy=1.0.0
+
+  * Trying to fix tests for Matplotlib plotting issue
+
+  * Testing Again #575
+
+  * Figured it must be a matplotlib import error #575
+
+  New import matplotlib must be missing a package
+
+  * Removed ImageComparisonFailure #575
+
+  ImageComparisonFailure no longer exists which causes issues with travis testing using the most recent matplotlib
+
+  * Fix EP for non-zero mean GP priors
+
+  * improve the documentation for LVMOGP
+
+  * remove non-ascii characters
+
+  * Small correction to doc
+
+  * add type into docstring
+
+  * update changelog for 1.8.5
+
+  * bump the version: 1.8.4 -> 1.8.5
+
+
+## v1.8.6 (2018-02-22)
+
+### Fix
+
+* Gamma prior no assignment after init. [mzwiessele]
+
+* #568, product kernel resolution. [mzwiessele]
+
+* #590. [Max Zwiessele]
+
+  Y_normalized was not used for running optimization
+
+* Appveyor comment missing. [mzwiessele]
+
+### Other
+
+* Bump version: 1.8.5 → 1.8.6. [mzwiessele]
+
+* Merge pull request #597 from marpulli/devel. [Max Zwiessele]
+
+  Allow calculation of full predictive covariance matrices with multiple outputs and normalization
+
+* Allow calculation of full predictive covariance matrices with multiple outputs and normalization. [Mark Pullin]
+
+* Merge pull request #600 from marpulli/plotting_fix. [Max Zwiessele]
+
+  Fix visible dimensions for plotting inducing points
+
+* Fix visible dimensions for plotting inducing points. [Mark Pullin]
+
+* Merge pull request #599 from marpulli/grads_efficiency. [Zhenwen Dai]
+
+  Make predictive_gradients more efficient
+
+* Make predictive_gradients more efficient. [Mark Pullin]
+
+* Merge pull request #587 from esiivola/feature-multioutput. [Zhenwen Dai]
+
+  Merge the implementation of Multioutput kernel
+
+* Changed two function names so that they follow the python naming convention. [Siivola Eero]
+
+* Merge remote-tracking branch 'origin' into feature-multioutput. [Eero Siivola]
+
+* Merge pull request #592 from SheffieldML/sparsegp-normalization. [Zhenwen Dai]
+
+  fix: #590
+
+* Merge pull request #589 from apaleyes/devel. [Zhenwen Dai]
+
+  Implemented utility function to compute covariance between points in GP Model
+
+* Moved posterior_covariance to Posterior class. [Andrei Paleyes]
+
+* Implemented utility function to compute covariance between points in GP Model. [Andrei Paleyes]
+
+* Changed the structure of multioutput kernel so that it doesn't change the API of Kernels + documented the class. [Eero Siivola]
+
+* Merge remote-tracking branch 'origin/devel' into feature-multioutput. [Eero Siivola]
+
+* Merge pull request #585 from YoshikawaMasashi/devel. [Zhenwen Dai]
+
+  modify the MLP kernel equation
+
+* Modify the MLP kernel equation. [masashi yoshikawa]
+
+* Added multioutput kern and tests. [Eero Siivola]
+
+* Multioutput kernel + initial test. [Siivola Eero]
+
+* Multioutput kernel + initial test. [Siivola Eero]
+
+* Change dtype for Python 3 in robot_wireless. [Neil Lawrence]
+
+* Bump the version: 1.8.4 -> 1.8.5. [Zhenwen Dai]
+
+* Update changelog for 1.8.5. [Zhenwen Dai]
+
+* Merge pull request #579 from SheffieldML/multi_out_doc. [Zhenwen Dai]
+
+  Improve the documentation for LVMOGP
+
+* Add type into docstring. [Zhenwen Dai]
+
+* Merge branch 'devel' of github.com:SheffieldML/GPy into multi_out_doc. [Zhenwen Dai]
+
+* Remove non-ascii characters. [Zhenwen Dai]
+
+* Improve the documentation for LVMOGP. [Zhenwen Dai]
+
+* Merge pull request #580 from marpulli/devel. [Zhenwen Dai]
+
+  Small correction to doc
+
+* Small correction to doc. [Mark Pullin]
+
+* Merge pull request #578 from pgmoren/devel. [Zhenwen Dai]
+
+  Fix EP for non-zero mean GP priors (binary classification)
+
+* Fix EP for non-zero mean GP priors. [Moreno]
+
+* Merge pull request #572 from marpulli/devel. [Alan Saul]
+
+  Add mean function functionality to dtc inference method
+
+* Add mean function functionality to dtc inference method. [Mark Pullin]
+
+* Merge pull request #573 from pgmoren/devel. [Zhenwen Dai]
+
+  Fix DSYR function (See https://github.com/scipy/scipy/issues/8155)
+
+* Fix DSYR function (See https://github.com/scipy/scipy/issues/8155) [Moreno]
+
+* Merge pull request #574 from alansaul/lyapunov_fix. [Alan Saul]
+
+  Fixing scipy=1.0.0 incompatibility of lyapunov discovered in PR #573. Coverage issue should be resolved by PR #575.
+
+* Updated sde_kern to work with scipy=1.0.0. [Alan Saul]
+
+* Merge pull request #575 from SheffieldML/matplotlib_testing. [Alan Saul]
+
+  Fixing tests for Matplotlib plotting issue
+
+* Removed ImageComparisonFailure #575. [Alan Saul]
+
+  ImageComparisonFailure no longer exists which causes issues with travis testing using the most recent matplotlib
+
+* Figured it must be a matplotlib import error #575. [Alan Saul]
+
+  New import matplotlib must be missing a package
+
+* Testing Again #575. [Alan Saul]
+
+* Trying to fix tests for Matplotlib plotting issue. [Alan Saul]
+
+* Merge pull request #526 from msbauer/mlp_extended. [Zhenwen Dai]
+
+  added extended version of MLP function
+
+* Fix random seed for reproducible results in tests. [msbauer]
+
+* Updated mapping test to pass gradient checks. [msbauer]
+
+* Update mapping_tests.py. [msbauer]
+
+  Remove verbosity again after gradient checks passed without problem with verbosity
+
+* Update mapping_tests.py. [msbauer]
+
+  Make output of gradient check verbose to diagnose error
+
+* Added extended version of MLP function with multiple hidden layers and different activation functions. [Bauer]
+
+* Merge pull request #562 from SheffieldML/external-mo. [Zhenwen Dai]
+
+  Release the implementation of LVMOGP
+
+* Try to fix the issue with model_tests. [Zhenwen Dai]
+
+* Merge with new changes from devel. [Zhenwen Dai]
+
+* Merge pull request #561 from SheffieldML/deploy. [Max Zwiessele]
+
+  Deploy
+
+* Merge pull request #560 from SheffieldML/devel. [Max Zwiessele]
+
+  appveyor twine upload error fix
+
+* Merge branch 'deploy' into devel. [Max Zwiessele]
+
+* Merge pull request #558 from SheffieldML/devel. [Max Zwiessele]
+
+  Uniform prior fix for other domains
+
+* Merge pull request #559 from SheffieldML/PS-upload-error. [Max Zwiessele]
+
+  Update appveyor.yml
+
+* The implementation of SVI-MOGP. [Zhenwen Dai]
 
-* Fix tests for Matplotlib plotting issue [Alan Saul]
 
 ## v1.8.4 (2017-10-06)
 
diff --git a/GPy/__version__.py b/GPy/__version__.py
index 89c6ad8e..2cbc28c3 100644
--- a/GPy/__version__.py
+++ b/GPy/__version__.py
@@ -1 +1 @@
-__version__ = "1.8.5"
+__version__ = "1.9.2"
diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 7bad7648..536b2ad4 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -282,10 +282,12 @@ class GP(Model):
             mu += self.mean_function.f(Xnew)
         return mu, var
 
-    def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None, likelihood=None, include_likelihood=True):
+    def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None, 
+                likelihood=None, include_likelihood=True):
         """
-        Predict the function(s) at the new point(s) Xnew. This includes the likelihood
-        variance added to the predicted underlying function (usually referred to as f).
+        Predict the function(s) at the new point(s) Xnew. This includes the
+        likelihood variance added to the predicted underlying function
+        (usually referred to as f).
 
         In order to predict without adding in the likelihood give
         `include_likelihood=False`, or refer to self.predict_noiseless().
@@ -295,33 +297,49 @@ class GP(Model):
         :param full_cov: whether to return the full covariance matrix, or just
                          the diagonal
         :type full_cov: bool
-        :param Y_metadata: metadata about the predicting point to pass to the likelihood
+        :param Y_metadata: metadata about the predicting point to pass to the
+                           likelihood
         :param kern: The kernel to use for prediction (defaults to the model
                      kern). this is useful for examining e.g. subprocesses.
-        :param bool include_likelihood: Whether or not to add likelihood noise to the predicted underlying latent function f.
+        :param include_likelihood: Whether or not to add likelihood noise to
+                                   the predicted underlying latent function f.
+        :type include_likelihood: bool
 
         :returns: (mean, var):
             mean: posterior mean, a Numpy array, Nnew x self.input_dim
-            var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
+            var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False,
+                 Nnew x Nnew otherwise
 
-           If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew.
-           This is to allow for different normalizations of the output dimensions.
+            If full_cov and self.input_dim > 1, the return shape of var is
+            Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return
+            shape is Nnew x Nnew. This is to allow for different normalizations
+            of the output dimensions.
 
-        Note: If you want the predictive quantiles (e.g. 95% confidence interval) use :py:func:"~GPy.core.gp.GP.predict_quantiles".
+        Note: If you want the predictive quantiles (e.g. 95% confidence
+        interval) use :py:func:"~GPy.core.gp.GP.predict_quantiles".
         """
-        #predict the latent function values
-        mu, var = self._raw_predict(Xnew, full_cov=full_cov, kern=kern)
+
+        # Predict the latent function values
+        mean, var = self._raw_predict(Xnew, full_cov=full_cov, kern=kern)
 
         if include_likelihood:
             # now push through likelihood
             if likelihood is None:
                 likelihood = self.likelihood
-            mu, var = likelihood.predictive_values(mu, var, full_cov, Y_metadata=Y_metadata)
+            mean, var = likelihood.predictive_values(mean, var, full_cov,
+                                                     Y_metadata=Y_metadata)
 
         if self.normalizer is not None:
-            mu, var = self.normalizer.inverse_mean(mu), self.normalizer.inverse_variance(var)
+            mean = self.normalizer.inverse_mean(mean)
 
-        return mu, var
+            # We need to create 3d array for the full covariance matrix with
+            # multiple outputs.
+            if full_cov & (mean.shape[1] > 1):
+                var = self.normalizer.inverse_covariance(var)
+            else:
+                var = self.normalizer.inverse_variance(var)
+
+        return mean, var
 
     def predict_noiseless(self,  Xnew, full_cov=False, Y_metadata=None, kern=None):
         """
@@ -376,13 +394,16 @@ class GP(Model):
 
     def predictive_gradients(self, Xnew, kern=None):
         """
-        Compute the derivatives of the predicted latent function with respect to X*
+        Compute the derivatives of the predicted latent function with respect
+        to X*
 
         Given a set of points at which to predict X* (size [N*,Q]), compute the
         derivatives of the mean and variance. Resulting arrays are sized:
-         dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP (usually one).
+            dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP
+            (usually one).
 
-        Note that this is not the same as computing the mean and variance of the derivative of the function!
+        Note that this is not the same as computing the mean and variance of
+        the derivative of the function!
 
          dv_dX*  -- [N*, Q],    (since all outputs have the same variance)
         :param X: The points at which to get the predictive gradients
@@ -393,25 +414,32 @@ class GP(Model):
         """
         if kern is None:
             kern = self.kern
-        mean_jac = np.empty((Xnew.shape[0],Xnew.shape[1],self.output_dim))
+        mean_jac = np.empty((Xnew.shape[0], Xnew.shape[1], self.output_dim))
 
         for i in range(self.output_dim):
-            mean_jac[:,:,i] = kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1].T, Xnew, self._predictive_variable)
+            mean_jac[:, :, i] = kern.gradients_X(
+                self.posterior.woodbury_vector[:, i:i+1].T, Xnew,
+                self._predictive_variable)
 
-        # gradients wrt the diagonal part k_{xx}
-        dv_dX = kern.gradients_X(np.eye(Xnew.shape[0]), Xnew)
-        #grads wrt 'Schur' part K_{xf}K_{ff}^{-1}K_{fx}
+        # Gradients wrt the diagonal part k_{xx}
+        dv_dX = kern.gradients_X_diag(np.ones(Xnew.shape[0]), Xnew)
+
+        # Grads wrt 'Schur' part K_{xf}K_{ff}^{-1}K_{fx}
         if self.posterior.woodbury_inv.ndim == 3:
-            tmp = np.empty(dv_dX.shape + (self.posterior.woodbury_inv.shape[2],))
-            tmp[:] = dv_dX[:,:,None]
+            var_jac = np.empty(dv_dX.shape +
+                               (self.posterior.woodbury_inv.shape[2],))
+            var_jac[:] = dv_dX[:, :, None]
             for i in range(self.posterior.woodbury_inv.shape[2]):
-                alpha = -2.*np.dot(kern.K(Xnew, self._predictive_variable), self.posterior.woodbury_inv[:, :, i])
-                tmp[:, :, i] += kern.gradients_X(alpha, Xnew, self._predictive_variable)
+                alpha = -2.*np.dot(kern.K(Xnew, self._predictive_variable),
+                                   self.posterior.woodbury_inv[:, :, i])
+                var_jac[:, :, i] += kern.gradients_X(alpha, Xnew,
+                                                     self._predictive_variable)
         else:
-            tmp = dv_dX
-            alpha = -2.*np.dot(kern.K(Xnew, self._predictive_variable), self.posterior.woodbury_inv)
-            tmp += kern.gradients_X(alpha, Xnew, self._predictive_variable)
-        return mean_jac, tmp
+            var_jac = dv_dX
+            alpha = -2.*np.dot(kern.K(Xnew, self._predictive_variable),
+                               self.posterior.woodbury_inv)
+            var_jac += kern.gradients_X(alpha, Xnew, self._predictive_variable)
+        return mean_jac, var_jac
 
     def predict_jacobian(self, Xnew, kern=None, full_cov=False):
         """
@@ -678,3 +706,12 @@ class GP(Model):
         """
         mu_star, var_star = self._raw_predict(x_test)
         return self.likelihood.log_predictive_density_sampling(y_test, mu_star, var_star, Y_metadata=Y_metadata, num_samples=num_samples)
+
+    def posterior_covariance_between_points(self, X1, X2):
+        """
+        Computes the posterior covariance between points.
+
+        :param X1: some input observations
+        :param X2: other input observations
+        """
+        return self.posterior.covariance_between_points(self.kern, self.X, X1, X2)
diff --git a/GPy/core/parameterization/priors.py b/GPy/core/parameterization/priors.py
index 3d69f39e..cbff4ca0 100644
--- a/GPy/core/parameterization/priors.py
+++ b/GPy/core/parameterization/priors.py
@@ -288,9 +288,17 @@ class Gamma(Prior):
         cls._instances.append(weakref.ref(o))
         return cls._instances[-1]()
 
+    @property
+    def a(self):
+        return self._a
+
+    @property
+    def b(self):
+        return self._b
+
     def __init__(self, a, b):
-        self.a = float(a)
-        self.b = float(b)
+        self._a = float(a)
+        self._b = float(b)
         self.constant = -gammaln(self.a) + a * np.log(b)
 
     def __str__(self):
@@ -333,8 +341,8 @@ class Gamma(Prior):
         return self.a, self.b
 
     def __setstate__(self, state):
-        self.a = state[0]
-        self.b = state[1]
+        self._a = state[0]
+        self._b = state[1]
         self.constant = -gammaln(self.a) + self.a * np.log(self.b)
 
 class InverseGamma(Gamma):
@@ -360,8 +368,8 @@ class InverseGamma(Gamma):
         return cls._instances[-1]()
 
     def __init__(self, a, b):
-        self.a = float(a)
-        self.b = float(b)
+        self._a = float(a)
+        self._b = float(b)
         self.constant = -gammaln(self.a) + a * np.log(b)
 
     def __str__(self):
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index 110f601e..4c7e98c4 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -76,7 +76,7 @@ class SparseGP(GP):
     def parameters_changed(self):
         self.posterior, self._log_marginal_likelihood, self.grad_dict = \
         self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood,
-                                        self.Y, Y_metadata=self.Y_metadata,
+                                        self.Y_normalized, Y_metadata=self.Y_metadata,
                                         mean_function=self.mean_function)
         self._update_gradients()
 
diff --git a/GPy/examples/state_space.py b/GPy/examples/state_space.py
index 5a213f45..898d1676 100644
--- a/GPy/examples/state_space.py
+++ b/GPy/examples/state_space.py
@@ -4,23 +4,26 @@ import matplotlib.pyplot as plt
 
 import GPy.models.state_space_model as SS_model
 
-X = np.linspace(0, 10, 2000)[:, None]
-Y = np.sin(X) + np.random.randn(*X.shape)*0.1
+def state_space_example():
+    X = np.linspace(0, 10, 2000)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*0.1
 
-kernel1 = GPy.kern.Matern32(X.shape[1])
-m1  = GPy.models.GPRegression(X,Y, kernel1)
+    kernel1 = GPy.kern.Matern32(X.shape[1])
+    m1  = GPy.models.GPRegression(X,Y, kernel1)
 
-print(m1)
-m1.optimize(optimizer='bfgs',messages=True)
+    print(m1)
+    m1.optimize(optimizer='bfgs',messages=True)
 
-print(m1)
+    print(m1)
 
-kernel2 = GPy.kern.sde_Matern32(X.shape[1])
-#m2  = SS_model.StateSpace(X,Y, kernel2)
-m2 = GPy.models.StateSpace(X,Y, kernel2)
-print(m2)
+    kernel2 = GPy.kern.sde_Matern32(X.shape[1])
+    #m2  = SS_model.StateSpace(X,Y, kernel2)
+    m2 = GPy.models.StateSpace(X,Y, kernel2)
+    print(m2)
 
-m2.optimize(optimizer='bfgs',messages=True)
+    m2.optimize(optimizer='bfgs',messages=True)
 
-print(m2)
+    print(m2)
+
+    return m1, m2
 
diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py
index 40ea5c73..4a8dea45 100644
--- a/GPy/inference/latent_function_inference/posterior.py
+++ b/GPy/inference/latent_function_inference/posterior.py
@@ -101,6 +101,29 @@ class Posterior(object):
             #self._covariance = self._K - self._K.dot(self.woodbury_inv).dot(self._K)
         return self._covariance
 
+    def covariance_between_points(self, kern, X, X1, X2):
+        """
+        Computes the posterior covariance between points.
+
+        :param kern: GP kernel
+        :param X: current input observations
+        :param X1: some input observations
+        :param X2: other input observations
+        """
+        # ndim == 3 is a model for missing data
+        if self.woodbury_chol.ndim != 2:
+            raise RuntimeError("This method does not support posterior for missing data models")
+
+        Kx1 = kern.K(X, X1)
+        Kx2 = kern.K(X, X2)
+        K12 = kern.K(X1, X2)
+
+        tmp1 = dtrtrs(self.woodbury_chol, Kx1)[0]
+        tmp2 = dtrtrs(self.woodbury_chol, Kx2)[0]
+        var = K12 - tmp1.T.dot(tmp2)
+
+        return var
+
     @property
     def precision(self):
         """
diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py
index d8239910..96abab39 100644
--- a/GPy/kern/__init__.py
+++ b/GPy/kern/__init__.py
@@ -42,3 +42,4 @@ from .src.sde_standard_periodic import sde_StdPeriodic
 from .src.sde_static import sde_White, sde_Bias
 from .src.sde_stationary import sde_RBF,sde_Exponential,sde_RatQuad
 from .src.sde_brownian import sde_Brownian
+from .src.multioutput_kern import MultioutputKern
\ No newline at end of file
diff --git a/GPy/kern/src/kern.py b/GPy/kern/src/kern.py
index b9971b8c..c08489e2 100644
--- a/GPy/kern/src/kern.py
+++ b/GPy/kern/src/kern.py
@@ -185,6 +185,9 @@ class Kern(Parameterized):
     def update_gradients_full(self, dL_dK, X, X2):
         """Set the gradients of all parameters when doing full (N) inference."""
         raise NotImplementedError
+    
+    def reset_gradients(self):
+        raise NotImplementedError
 
     def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         """
@@ -345,7 +348,7 @@ class CombinationKernel(Kern):
     A combination kernel combines (a list of) kernels and works on those.
     Examples are the HierarchicalKernel or Add and Prod kernels.
     """
-    def __init__(self, kernels, name, extra_dims=[]):
+    def __init__(self, kernels, name, extra_dims=[], link_parameters=True):
         """
         Abstract super class for combination kernels.
         A combination kernel combines (a list of) kernels and works on those.
@@ -369,7 +372,8 @@ class CombinationKernel(Kern):
         self._all_dims_active = np.array(np.concatenate((np.arange(effective_input_dim), extra_dims if extra_dims is not None else [])), dtype=int)
 
         self.extra_dims = extra_dims
-        self.link_parameters(*kernels)
+        if link_parameters:
+            self.link_parameters(*kernels)
 
     def _to_dict(self):
         input_dict = super(CombinationKernel, self)._to_dict()
diff --git a/GPy/kern/src/mlp.py b/GPy/kern/src/mlp.py
index 6c997881..dc69f5fd 100644
--- a/GPy/kern/src/mlp.py
+++ b/GPy/kern/src/mlp.py
@@ -15,7 +15,7 @@ class MLP(Kern):
 
     .. math::
 
-          k(x,y) = \\sigma^{2}\\frac{2}{\\pi }  \\text{asin} \\left ( \\frac{ \\sigma_w^2 x^\\top y+\\sigma_b^2}{\\sqrt{\\sigma_w^2x^\\top x + \\sigma_b^2 + 1}\\sqrt{\\sigma_w^2 y^\\top y \\sigma_b^2 +1}} \\right )
+          k(x,y) = \\sigma^{2}\\frac{2}{\\pi }  \\text{asin} \\left ( \\frac{ \\sigma_w^2 x^\\top y+\\sigma_b^2}{\\sqrt{\\sigma_w^2x^\\top x + \\sigma_b^2 + 1}\\sqrt{\\sigma_w^2 y^\\top y + \\sigma_b^2 +1}} \\right )
 
 
     :param input_dim: the number of input dimensions
diff --git a/GPy/kern/src/multioutput_kern.py b/GPy/kern/src/multioutput_kern.py
new file mode 100644
index 00000000..b7feaadb
--- /dev/null
+++ b/GPy/kern/src/multioutput_kern.py
@@ -0,0 +1,131 @@
+from .kern import Kern, CombinationKernel
+import numpy as np
+from functools import reduce, partial
+from .independent_outputs import index_to_slices
+from paramz.caching import Cache_this
+
+class ZeroKern(Kern):
+    def __init__(self):
+        super(ZeroKern, self).__init__(1, None, name='ZeroKern',useGPU=False)
+
+    def K(self, X ,X2=None):
+        if X2 is None:
+            X2 = X
+        return np.zeros((X.shape[0],X2.shape[0]))
+    
+    def update_gradients_full(self,dL_dK, X, X2=None):
+        return np.zeros(dL_dK.shape)
+    
+    def gradients_X(self,dL_dK, X, X2=None):
+        return np.zeros((X.shape[0],X.shape[1]))
+        
+class MultioutputKern(CombinationKernel):
+    """
+    Multioutput kernel is a meta class for combining different kernels for multioutput GPs. 
+
+    As an example let us have inputs x1 for output 1 with covariance k1 and x2 for output 2 with covariance k2.
+    In addition, we need to define the cross covariances k12(x1,x2) and k21(x2,x1). Then the kernel becomes:
+    k([x1,x2],[x1,x2]) = [k1(x1,x1) k12(x1, x2); k21(x2, x1), k2(x2,x2)]
+    
+    For  the kernel, the kernels of outputs are given as list in param "kernels" and cross covariances are
+    given in param "cross_covariances" as a dictionary of tuples (i,j) as keys. If no cross covariance is given,
+    it defaults to zero, as in k12(x1,x2)=0.
+    
+    In the cross covariance dictionary, the value needs to be a struct with elements 
+    -'kernel': a member of Kernel class that stores the hyper parameters to be updated when optimizing the GP
+    -'K': function defining the cross covariance
+    -'update_gradients_full': a function to be used for updating gradients
+    -'gradients_X': gives a gradient of the cross covariance with respect to the first input
+    """
+    def __init__(self, kernels, cross_covariances={}, name='MultioutputKern'):
+        #kernels contains a list of kernels as input, 
+        if not isinstance(kernels, list):
+            self.single_kern = True
+            self.kern = kernels
+            kernels = [kernels]
+        else:
+            self.single_kern = False
+            self.kern = kernels
+            
+        # The combination kernel ALLWAYS puts the extra dimension last.
+        # Thus, the index dimension of this kernel is always the last dimension
+        # after slicing. This is why the index_dim is just the last column:
+        self.index_dim = -1
+        
+        super(MultioutputKern, self).__init__(kernels=kernels, extra_dims=[self.index_dim], name=name, link_parameters=False)
+
+        nl = len(kernels)
+        #build covariance structure
+        covariance = [[None for i in range(nl)] for j in range(nl)]
+        linked = []
+        for i in range(0,nl):
+            unique=True
+            for j in range(0,nl):
+                if i==j or (kernels[i] is kernels[j]):
+                    covariance[i][j] = {'kern': kernels[i], 'K': kernels[i].K, 'update_gradients_full': kernels[i].update_gradients_full, 'gradients_X': kernels[i].gradients_X}
+                    if i>j:
+                        unique=False
+                elif cross_covariances.get((i,j)) is not None: #cross covariance is given
+                    covariance[i][j] = cross_covariances.get((i,j))
+                else: # zero covariance structure
+                    kern = ZeroKern()
+                    covariance[i][j] = {'kern': kern, 'K': kern.K, 'update_gradients_full': kern.update_gradients_full, 'gradients_X': kern.gradients_X}       
+            if unique is True:
+                linked.append(i)
+        self.covariance = covariance
+        self.link_parameters(*[kernels[i] for i in linked])
+        
+    @Cache_this(limit=3, ignore_args=())
+    def K(self, X ,X2=None):
+        if X2 is None:
+            X2 = X
+        slices = index_to_slices(X[:,self.index_dim])
+        slices2 = index_to_slices(X2[:,self.index_dim])
+        target =  np.zeros((X.shape[0], X2.shape[0]))
+        [[[[ target.__setitem__((slices[i][k],slices2[j][l]), self.covariance[i][j]['K'](X[slices[i][k],:],X2[slices2[j][l],:])) for k in range( len(slices[i]))] for l in range(len(slices2[j])) ] for i in range(len(slices))] for j in range(len(slices2))]  
+        return target
+
+    @Cache_this(limit=3, ignore_args=())
+    def Kdiag(self,X):
+        slices = index_to_slices(X[:,self.index_dim])
+        kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
+        target = np.zeros(X.shape[0])
+        [[np.copyto(target[s], kern.Kdiag(X[s])) for s in slices_i] for kern, slices_i in zip(kerns, slices)]
+        return target
+    
+    def _update_gradients_full_wrapper(self, cov_struct, dL_dK, X, X2):
+        gradient = cov_struct['kern'].gradient.copy()
+        cov_struct['update_gradients_full'](dL_dK, X, X2)
+        cov_struct['kern'].gradient += gradient
+    
+    def _update_gradients_diag_wrapper(self, kern, dL_dKdiag, X):
+        gradient = kern.gradient.copy()
+        kern.update_gradients_diag(dL_dKdiag, X)
+        kern.gradient += gradient
+        
+    def reset_gradients(self):
+        for kern in self.kern: kern.reset_gradients()
+
+    def update_gradients_full(self,dL_dK, X, X2=None):
+        self.reset_gradients()
+        slices = index_to_slices(X[:,self.index_dim])
+        if X2 is not None:
+            slices2 = index_to_slices(X2[:,self.index_dim])
+            [[[[ self._update_gradients_full_wrapper(self.covariance[i][j], dL_dK[slices[i][k],slices2[j][l]], X[slices[i][k],:], X2[slices2[j][l],:]) for k in range(len(slices[i]))] for l in range(len(slices2[j]))] for i in range(len(slices))] for j in range(len(slices2))]
+        else:
+            [[[[ self._update_gradients_full_wrapper(self.covariance[i][j], dL_dK[slices[i][k],slices[j][l]], X[slices[i][k],:], X[slices[j][l],:]) for k in range(len(slices[i]))] for l in range(len(slices[j]))] for i in range(len(slices))] for j in range(len(slices))]
+            
+    def update_gradients_diag(self, dL_dKdiag, X):
+        self.reset_gradients()
+        slices = index_to_slices(X[:,self.index_dim])
+        [[ self._update_gradients_diag_wrapper(self.covariance[i][i]['kern'], dL_dKdiag[slices[i][k]], X[slices[i][k],:]) for k in range(len(slices[i]))] for i in range(len(slices))]
+    
+    def gradients_X(self,dL_dK, X, X2=None):
+        slices = index_to_slices(X[:,self.index_dim])
+        target = np.zeros((X.shape[0], X.shape[1]) )
+        if X2 is not None:
+            slices2 = index_to_slices(X2[:,self.index_dim])
+            [[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j]['gradients_X'](dL_dK[slices[i][k],slices2[j][l]], X[slices[i][k],:], X2[slices2[j][l],:]) ) for k in range(len(slices[i]))] for l in range(len(slices2[j]))] for i in range(len(slices))] for j in range(len(slices2))]
+        else:
+            [[[[ target.__setitem__((slices[i][k]), target[slices[i][k],:] + self.covariance[i][j]['gradients_X'](dL_dK[slices[i][k],slices[j][l]], X[slices[i][k],:], (None if (i==j and k==l) else X[slices[j][l],:] )) ) for k in range(len(slices[i]))] for l in range(len(slices[j]))] for i in range(len(slices))] for j in range(len(slices))]
+        return target
\ No newline at end of file
diff --git a/GPy/kern/src/prod.py b/GPy/kern/src/prod.py
index 43314e7a..31e62392 100644
--- a/GPy/kern/src/prod.py
+++ b/GPy/kern/src/prod.py
@@ -31,13 +31,16 @@ class Prod(CombinationKernel):
 
     """
     def __init__(self, kernels, name='mul'):
-        for i, kern in enumerate(kernels[:]):
+        _newkerns = []
+        for kern in kernels:
             if isinstance(kern, Prod):
-                del kernels[i]
-                for part in kern.parts[::-1]:
-                    kern.unlink_parameter(part)
-                    kernels.insert(i, part)
-        super(Prod, self).__init__(kernels, name)
+                for part in kern.parts:
+                    #kern.unlink_parameter(part)
+                    _newkerns.append(part.copy())
+            else:
+                _newkerns.append(kern.copy())
+
+        super(Prod, self).__init__(_newkerns, name)
 
     def to_dict(self):
         input_dict = super(Prod, self)._to_dict()
diff --git a/GPy/kern/src/stationary.py b/GPy/kern/src/stationary.py
index 4e8ddb77..81129a75 100644
--- a/GPy/kern/src/stationary.py
+++ b/GPy/kern/src/stationary.py
@@ -171,6 +171,13 @@ class Stationary(Kern):
         ret[:] = self.variance
         return ret
 
+    def reset_gradients(self):
+        self.variance.gradient = 0.
+        if not self.ARD:
+            self.lengthscale.gradient = 0.
+        else:
+            self.lengthscale.gradient = np.zeros(self.input_dim)
+
     def update_gradients_diag(self, dL_dKdiag, X):
         """
         Given the derivative of the objective with respect to the diagonal of
@@ -182,7 +189,7 @@ class Stationary(Kern):
         self.variance.gradient = np.sum(dL_dKdiag)
         self.lengthscale.gradient = 0.
 
-    def update_gradients_full(self, dL_dK, X, X2=None):
+    def update_gradients_full(self, dL_dK, X, X2=None, reset=True):
         """
         Given the derivative of the objective wrt the covariance matrix
         (dL_dK), compute the gradient wrt the parameters of this kernel,
@@ -632,7 +639,7 @@ class RatQuad(Stationary):
     def dK_dr(self, r):
         r2 = np.square(r)
 #         return -self.variance*self.power*r*np.power(1. + r2/2., - self.power - 1.)
-        return-self.variance*self.power*r*np.exp(-(self.power+1)*np.log1p(r2/2.))
+        return -self.variance*self.power*r*np.exp(-(self.power+1)*np.log1p(r2/2.))
 
     def update_gradients_full(self, dL_dK, X, X2=None):
         super(RatQuad, self).update_gradients_full(dL_dK, X, X2)
diff --git a/GPy/plotting/gpy_plot/gp_plots.py b/GPy/plotting/gpy_plot/gp_plots.py
index 230d47f0..a12fc858 100644
--- a/GPy/plotting/gpy_plot/gp_plots.py
+++ b/GPy/plotting/gpy_plot/gp_plots.py
@@ -337,7 +337,7 @@ def plot(self, plot_limits=None, fixed_inputs=None,
         plot_data = False
     plots = {}
     if hasattr(self, 'Z') and plot_inducing:
-        plots.update(_plot_inducing(self, canvas, visible_dims, projection, 'Inducing'))
+        plots.update(_plot_inducing(self, canvas, free_dims, projection, 'Inducing'))
     if plot_data:
         plots.update(_plot_data(self, canvas, which_data_rows, which_data_ycols, free_dims, projection, "Data"))
         plots.update(_plot_data_error(self, canvas, which_data_rows, which_data_ycols, free_dims, projection, "Data Error"))
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index 053fce35..e1c9d934 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -482,6 +482,17 @@ class KernelGradientTestsContinuous(unittest.TestCase):
         k = GPy.kern.StdPeriodic(self.D)
         k.randomize()
         self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+    
+    def test_MultioutputKern(self):
+        k1 = GPy.kern.RBF(self.D, ARD=True)
+        k1.randomize()
+        k2 = GPy.kern.RBF(self.D, ARD=True)
+        k2.randomize()
+
+        k = GPy.kern.MultioutputKern([k1, k2])
+        Xt,_,_ = GPy.util.multioutput.build_XY([self.X, self.X])
+        X2t,_,_ = GPy.util.multioutput.build_XY([self.X2, self.X2])
+        self.assertTrue(check_kernel_gradient_functions(k, X=Xt, X2=X2t, verbose=verbose, fixed_X_dims=-1))
 
     def test_Precomputed(self):
         Xall = np.concatenate([self.X, self.X2])
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 68e95ec0..3558b9bb 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -118,6 +118,51 @@ class MiscTests(unittest.TestCase):
         from scipy.stats import norm
         np.testing.assert_allclose((mu+(norm.ppf(qs/100.)*np.sqrt(var))).flatten(), np.array(q95).flatten())
 
+    def test_multioutput_regression_with_normalizer(self):
+        """
+        Test that normalizing works in multi-output case
+        """
+
+        # Create test inputs
+        X = self.X
+        Y1 = np.sin(X) + np.random.randn(*X.shape) * 0.2
+        Y2 = -np.sin(X) + np.random.randn(*X.shape) * 0.05
+        Y = np.hstack((Y1, Y2))
+
+        mu, std = Y.mean(0), Y.std(0)
+        m = GPy.models.GPRegression(X, Y, normalizer=True)
+        m.optimize(messages=True)
+        assert(m.checkgrad())
+        k = GPy.kern.RBF(1)
+        m2 = GPy.models.GPRegression(X, (Y-mu)/std, normalizer=False)
+        m2[:] = m[:]
+
+        mu1, var1 = m.predict(m.X, full_cov=True)
+        mu2, var2 = m2.predict(m2.X, full_cov=True)
+        np.testing.assert_allclose(mu1, (mu2*std)+mu)
+        np.testing.assert_allclose(var1, var2[:, :, None]*std[None, None, :]**2)
+
+        mu1, var1 = m.predict(m.X, full_cov=False)
+        mu2, var2 = m2.predict(m2.X, full_cov=False)
+
+        np.testing.assert_allclose(mu1, (mu2*std)+mu)
+        np.testing.assert_allclose(var1, var2*std[None, :]**2)
+
+        q50n = m.predict_quantiles(m.X, (50,))
+        q50 = m2.predict_quantiles(m2.X, (50,))
+
+        np.testing.assert_allclose(q50n[0], (q50[0]*std)+mu)
+
+        # Test variance component:
+        qs = np.array([2.5, 97.5])
+        # The quantiles get computed before unnormalization
+        # And transformed using the mean transformation:
+        c = np.random.choice(X.shape[0])
+        q95 = m2.predict_quantiles(X[[c]], qs)
+        mu, var = m2.predict(X[[c]])
+        from scipy.stats import norm
+        np.testing.assert_allclose((mu.T+(norm.ppf(qs/100.)*np.sqrt(var))).T.flatten(), np.array(q95).flatten())
+
     def check_jacobian(self):
         try:
             import autograd.numpy as np, autograd as ag, GPy, matplotlib.pyplot as plt
@@ -259,29 +304,18 @@ class MiscTests(unittest.TestCase):
         np.testing.assert_equal(m.log_likelihood(), m2.log_likelihood())
 
     def test_missing_data(self):
-        from GPy import kern
-        from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch
-        from GPy.examples.dimensionality_reduction import _simulate_matern
+        Q = 4
 
-        D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 400, 3, 4
-        _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, False)
-        Y = Ylist[0]
-
-        inan = np.random.binomial(1, .9, size=Y.shape).astype(bool) # 80% missing data
-        Ymissing = Y.copy()
-        Ymissing[inan] = np.nan
-
-        k = kern.Linear(Q, ARD=True) + kern.White(Q, np.exp(-2)) # + kern.bias(Q)
-        m = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
-                          kernel=k, missing_data=True)
+        k = GPy.kern.Linear(Q, ARD=True) + GPy.kern.White(Q, np.exp(-2)) # + kern.bias(Q)
+        m = _create_missing_data_model(k, Q)
         assert(m.checkgrad())
         mul, varl = m.predict(m.X)
 
-        k = kern.RBF(Q, ARD=True) + kern.White(Q, np.exp(-2)) # + kern.bias(Q)
-        m2 = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
-                          kernel=k, missing_data=True)
+        k = GPy.kern.RBF(Q, ARD=True) + GPy.kern.White(Q, np.exp(-2)) # + kern.bias(Q)
+        m2 = _create_missing_data_model(k, Q)
         assert(m.checkgrad())
         m2.kern.rbf.lengthscale[:] = 1e6
+
         m2.X[:] = m.X.param_array
         m2.likelihood[:] = m.likelihood[:]
         m2.kern.white[:] = m.kern.white[:]
@@ -1082,6 +1116,46 @@ class GradientTests(np.testing.TestCase):
         m.randomize()
         self.assertTrue(m.checkgrad())
 
+    def test_posterior_covariance(self):
+        k = GPy.kern.Poly(2, order=1)
+        X1 = np.array([
+                 [-2, 2],
+                 [-1, 1]
+             ])
+        X2 = np.array([
+                 [2, 3],
+                 [-1, 3]
+             ])
+        Y = np.array([[1], [2]])
+        m = GPy.models.GPRegression(X1, Y, kernel=k)
+
+        result = m.posterior_covariance_between_points(X1, X2)
+        expected = np.array([[0.4, 2.2], [1.0, 1.0]]) / 3.0
+
+        self.assertTrue(np.allclose(result, expected))
+
+    def test_posterior_covariance_missing_data(self):
+        Q = 4
+        k = GPy.kern.Linear(Q, ARD=True)
+        m = _create_missing_data_model(k, Q)
+
+        with self.assertRaises(RuntimeError):
+            m.posterior_covariance_between_points(np.array([[1], [2]]), np.array([[3], [4]]))
+
+def _create_missing_data_model(kernel, Q):
+    D1, D2, D3, N, num_inducing = 13, 5, 8, 400, 3
+    _, _, Ylist = GPy.examples.dimensionality_reduction._simulate_matern(D1, D2, D3, N, num_inducing, False)
+    Y = Ylist[0]
+
+    inan = np.random.binomial(1, .9, size=Y.shape).astype(bool) # 90% missing data
+    Ymissing = Y.copy()
+    Ymissing[inan] = np.nan
+
+    m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
+                      kernel=kernel, missing_data=True)
+
+    return m
+
 if __name__ == "__main__":
     print("Running unit tests, please be (very) patient...")
     unittest.main()
diff --git a/GPy/testing/util_tests.py b/GPy/testing/util_tests.py
index 5cd275c2..bdab63e8 100644
--- a/GPy/testing/util_tests.py
+++ b/GPy/testing/util_tests.py
@@ -28,7 +28,8 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #===============================================================================
 
-import unittest, numpy as np
+import unittest
+import numpy as np
 import GPy
 
 class TestDebug(unittest.TestCase):
@@ -225,3 +226,17 @@ class TestUnivariateGaussian(unittest.TestCase):
         for i in range(len(pySols)):
           diff += abs(derivLogCdfNormal(self.zz[i]) - pySols[i])
         self.assertTrue(diff  < 1e-8)
+
+class TestStandardize(unittest.TestCase):
+    def setUp(self):
+        self.normalizer = GPy.util.normalizer.Standardize()
+        y = np.stack([np.random.randn(10), 2*np.random.randn(10)], axis=1)
+        self.normalizer.scale_by(y)
+    
+    def test_inverse_covariance(self):
+        """
+        Test inverse covariance outputs correct size
+        """
+        covariance = np.random.rand(100, 100)
+        output = self.normalizer.inverse_covariance(covariance)
+        self.assertTrue(output.shape == (100, 100, 2))
\ No newline at end of file
diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index f8fa8239..035f7b75 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -622,7 +622,7 @@ def robot_wireless(data_set='robot_wireless'):
         download_data(data_set)
     file_name = os.path.join(data_path, data_set, 'uw-floor.txt')
     all_time = np.genfromtxt(file_name, usecols=(0))
-    macaddress = np.genfromtxt(file_name, usecols=(1), dtype='string')
+    macaddress = np.genfromtxt(file_name, usecols=(1), dtype=str)
     x = np.genfromtxt(file_name, usecols=(2))
     y = np.genfromtxt(file_name, usecols=(3))
     strength = np.genfromtxt(file_name, usecols=(4))
diff --git a/GPy/util/normalizer.py b/GPy/util/normalizer.py
index b62ac35b..7a3ee020 100644
--- a/GPy/util/normalizer.py
+++ b/GPy/util/normalizer.py
@@ -3,30 +3,46 @@ Created on Aug 27, 2014
 
 @author: Max Zwiessele
 '''
-import logging
 import numpy as np
 
+
 class _Norm(object):
     def __init__(self):
         pass
+
     def scale_by(self, Y):
         """
         Use data matrix Y as normalization space to work in.
         """
         raise NotImplementedError
+
     def normalize(self, Y):
         """
         Project Y into normalized space
         """
         if not self.scaled():
             raise AttributeError("Norm object not initialized yet, try calling scale_by(data) first.")
+
     def inverse_mean(self, X):
         """
         Project the normalized object X into space of Y
         """
         raise NotImplementedError
+
     def inverse_variance(self, var):
         return var
+
+    def inverse_covariance(self, covariance):
+        """
+        Convert scaled covariance to unscaled.
+        Args:
+            covariance - numpy array of shape (n, n)
+        Returns:
+            covariance - numpy array of shape (n, n, m) where m is number of
+                         outputs
+        """
+        raise NotImplementedError
+
     def scaled(self):
         """
         Whether this Norm object has been initialized.
@@ -57,17 +73,25 @@ class _Norm(object):
 class Standardize(_Norm):
     def __init__(self):
         self.mean = None
+
     def scale_by(self, Y):
         Y = np.ma.masked_invalid(Y, copy=False)
         self.mean = Y.mean(0).view(np.ndarray)
         self.std = Y.std(0).view(np.ndarray)
+
     def normalize(self, Y):
         super(Standardize, self).normalize(Y)
         return (Y-self.mean)/self.std
+
     def inverse_mean(self, X):
         return (X*self.std)+self.mean
+
     def inverse_variance(self, var):
         return (var*(self.std**2))
+
+    def inverse_covariance(self, covariance):
+        return (covariance[..., np.newaxis]*(self.std**2))
+
     def scaled(self):
         return self.mean is not None
 
@@ -87,29 +111,3 @@ class Standardize(_Norm):
         if "std" in input_dict:
             s.std = np.array(input_dict["std"])
         return s
-
-# Inverse variance to be implemented, disabling for now
-# If someone in the future want to implement this,
-# we need to implement the inverse variance for
-# normalization. This means, we need to know the factor
-# for the variance to be multiplied to the variance in
-# normalized space. This is easy to compute for standardization
-# (see above) but gets tricky here.
-# class Normalize(_Norm):
-#     def __init__(self):
-#         self.ymin = None
-#         self.ymax = None
-#     def scale_by(self, Y):
-#         Y = np.ma.masked_invalid(Y, copy=False)
-#         self.ymin = Y.min(0).view(np.ndarray)
-#         self.ymax = Y.max(0).view(np.ndarray)
-#     def normalize(self, Y):
-#         super(Normalize, self).normalize(Y)
-#         return (Y - self.ymin) / (self.ymax - self.ymin) - .5
-#     def inverse_mean(self, X):
-#         return (X + .5) * (self.ymax - self.ymin) + self.ymin
-#     def inverse_variance(self, var):
-#
-#         return (var*(self.std**2))
-#     def scaled(self):
-#         return (self.ymin is not None) and (self.ymax is not None)
diff --git a/README.md b/README.md
index ffbf6a34..f03213b0 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ The Gaussian processes framework in Python.
 * GPy [homepage](http://sheffieldml.github.io/GPy/)
 * Tutorial [notebooks](http://nbviewer.ipython.org/github/SheffieldML/notebook/blob/master/GPy/index.ipynb)
 * User [mailing-list](https://lists.shef.ac.uk/sympa/subscribe/gpy-users)
-* Developer [documentation](http://pythonhosted.org/GPy/)
+* Developer [documentation](http://gpy.readthedocs.io/)
 * Travis-CI [unit-tests](https://travis-ci.org/SheffieldML/GPy)
 * [![licence](https://img.shields.io/badge/licence-BSD-blue.svg)](http://opensource.org/licenses/BSD-3-Clause)
 
diff --git a/appveyor.yml b/appveyor.yml
index dd315e9d..b736d6b4 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -3,7 +3,7 @@ environment:
     secure: 8/ZjXFwtd1S7ixd7PJOpptupKKEDhm2da/q3unabJ00=
   COVERALLS_REPO_TOKEN:
     secure: d3Luic/ESkGaWnZrvWZTKrzO+xaVwJWaRCEP0F+K/9DQGPSRZsJ/Du5g3s4XF+tS
-  gpy_version: 1.8.5
+  gpy_version: 1.9.2
   matrix:
     - PYTHON_VERSION: 2.7
       MINICONDA: C:\Miniconda-x64
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 1f9c98b6..b968fe48 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -22,32 +22,52 @@ import shlex
 #for p in os.walk('../../GPy'):
 #    sys.path.append(p[0])
 sys.path.insert(0, os.path.abspath('../../'))
-#sys.path.insert(0, os.path.abspath('../../GPy/'))
+sys.path.insert(0, os.path.abspath('../../GPy/'))
 
 on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
 
+import sys
+from unittest.mock import MagicMock
+
+class Mock(MagicMock):
+    @classmethod
+    def __getattr__(cls, name):
+        return MagicMock()
+
+MOCK_MODULES = [
+    "GPy.util.linalg.linalg_cython",
+    "GPy.util.linalg_cython",
+    "sympy",
+    'GPy.kern.stationary_cython',
+    "sympy.utilities",
+    "sympy.utilities.lambdify",
+]
+
+sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
+
 #on_rtd = True
 if on_rtd:
     # sys.path.append(os.path.abspath('../GPy'))
 
     import subprocess
 
+    # build extensions:
+    # proc = subprocess.Popen("cd ../../; python setup.py build_ext install", stdout=subprocess.PIPE, shell=True)
+    # (out, err) = proc.communicate()
+    # print("build_ext develop:")
+    # print(out)
+
+    # print current folder:
     proc = subprocess.Popen("pwd", stdout=subprocess.PIPE, shell=True)
     (out, err) = proc.communicate()
-    print "program output:", out
-    proc = subprocess.Popen("ls ../../", stdout=subprocess.PIPE, shell=True)
-    (out, err) = proc.communicate()
-    print "program output:", out
+    print("$ pwd: ")
+    print(out)
+
     #Lets regenerate our rst files from the source, -P adds private modules (i.e kern._src)
     proc = subprocess.Popen("sphinx-apidoc -P -f -o . ../../GPy", stdout=subprocess.PIPE, shell=True)
     (out, err) = proc.communicate()
-    print "program output:", out
-    #proc = subprocess.Popen("whereis numpy", stdout=subprocess.PIPE, shell=True)
-    #(out, err) = proc.communicate()
-    #print "program output:", out
-    #proc = subprocess.Popen("whereis matplotlib", stdout=subprocess.PIPE, shell=True)
-    #(out, err) = proc.communicate()
-    #print "program output:", out
+    print("$ Apidoc:")
+    print(out)
 
 
 # -- General configuration ------------------------------------------------
@@ -77,15 +97,6 @@ extensions = [
 #    def __getattr__(cls, name):
 #            return Mock()
 #
-MOCK_MODULES = ['scipy.linalg.blas', 'blas', 'scipy.optimize', 'scipy.optimize.linesearch', 'scipy.linalg',
-                'scipy', 'scipy.special', 'scipy.integrate', 'scipy.io', 'scipy.stats',
-                'sympy', 'sympy.utilities.iterables', 'sympy.utilities.lambdify',
-                'sympy.utilities', 'sympy.utilities.codegen', 'sympy.core.cache',
-                'sympy.core', 'sympy.parsing', 'sympy.parsing.sympy_parser',
-                'nose', 'nose.tools'
-                ]
-
-autodoc_mock_imports = MOCK_MODULES
 #
 #sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
 #
@@ -97,6 +108,7 @@ autodoc_default_flags = ['members',
                          #'special-members',
                          #'inherited-members',
                          'show-inheritance']
+
 autodoc_member_order = 'groupwise'
 add_function_parentheses = False
 add_module_names = False
@@ -144,7 +156,21 @@ print version
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-language = None
+language = 'python'
+
+# autodoc:
+autoclass_content = 'both'
+autodoc_default_flags = ['members',
+                         #'undoc-members',
+                         #'private-members',
+                         #'special-members',
+                         #'inherited-members',
+                         'show-inheritance']
+autodoc_member_order = 'groupwise'
+add_function_parentheses = False
+add_module_names = False
+modindex_common_prefix = ['paramz']
+show_authors = True
 
 # There are two options for replacing |today|: either, you set today to some
 # non-false value, then it is used:
@@ -172,7 +198,7 @@ exclude_patterns = []
 #show_authors = False
 
 # The name of the Pygments (syntax highlighting) style to use.
-#pygments_style = 'sphinx'
+pygments_style = 'sphinx'
 
 # A list of ignored prefixes for module index sorting.
 #modindex_common_prefix = []
@@ -217,7 +243,7 @@ html_theme = 'sphinx_rtd_theme'
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-#html_static_path = ['_static']
+html_static_path = ['_static']
 
 # Add any extra paths that contain custom files (such as robots.txt or
 # .htaccess) here, relative to this directory. These files are copied
@@ -242,16 +268,16 @@ html_theme = 'sphinx_rtd_theme'
 #html_additional_pages = {}
 
 # If false, no module index is generated.
-#html_domain_indices = False
+html_domain_indices = False
 
 # If false, no index is generated.
-#html_use_index = False
+html_use_index = False
 
 # If true, the index is split into individual pages for each letter.
 html_split_index = True
 
 # If true, links to the reST sources are added to the pages.
-#html_show_sourcelink = True
+html_show_sourcelink = True
 
 # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
 #html_show_sphinx = True
@@ -286,9 +312,9 @@ htmlhelp_basename = 'GPydoc'
 
 # -- Options for LaTeX output ---------------------------------------------
 
-#latex_elements = {
+latex_elements = {
 # The paper size ('letterpaper' or 'a4paper').
-#'papersize': 'letterpaper',
+'papersize': 'a4paper',
 
 # The font size ('10pt', '11pt' or '12pt').
 #'pointsize': '10pt',
@@ -297,8 +323,8 @@ htmlhelp_basename = 'GPydoc'
 #'preamble': '',
 
 # Latex figure (float) alignment
-#'figure_align': 'htbp',
-#}
+'figure_align': 'htbp',
+}
 
 # Grouping the document tree into LaTeX files. List of tuples
 # (source start file, target name, title,
diff --git a/doc/source/requirements.txt b/doc/source/requirements.txt
index dd3ba36f..5ae1e857 100644
--- a/doc/source/requirements.txt
+++ b/doc/source/requirements.txt
@@ -1 +1,10 @@
-paramz
\ No newline at end of file
+numpy
+scipy
+six
+decorator
+matplotlib
+paramz
+cython
+mock
+sympy
+nose
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index 21160939..0f1d4075 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 1.8.5
+current_version = 1.9.2
 tag = True
 commit = True
 
@@ -12,3 +12,4 @@ upload-dir = doc/build/html
 
 [medatdata]
 description-file = README.rst
+
diff --git a/setup.py b/setup.py
index 7d3a5355..5e4357b5 100644
--- a/setup.py
+++ b/setup.py
@@ -94,15 +94,18 @@ ext_mods = [Extension(name='GPy.kern.src.stationary_cython',
             Extension(name='GPy.util.linalg_cython',
                       sources=['GPy/util/linalg_cython.c'],
                       include_dirs=[np.get_include(),'.'],
-                      extra_compile_args=compile_flags),
+                      extra_compile_args=compile_flags,
+                      extra_link_args = link_args),
             Extension(name='GPy.kern.src.coregionalize_cython',
                       sources=['GPy/kern/src/coregionalize_cython.c'],
                       include_dirs=[np.get_include(),'.'],
-                      extra_compile_args=compile_flags),
+                      extra_compile_args=compile_flags,
+                      extra_link_args = link_args),
             Extension(name='GPy.models.state_space_cython',
                       sources=['GPy/models/state_space_cython.c'],
                       include_dirs=[np.get_include(),'.'],
-                      extra_compile_args=compile_flags)]
+                      extra_compile_args=compile_flags,
+                      extra_link_args = link_args)]
 
 setup(name = 'GPy',
       version = __version__,
@@ -150,7 +153,7 @@ setup(name = 'GPy',
       py_modules = ['GPy.__init__'],
       test_suite = 'GPy.testing',
       setup_requires = ['numpy>=1.7'],
-      install_requires = ['numpy>=1.7', 'scipy>=0.16', 'six', 'paramz>=0.8.5'],
+      install_requires = ['numpy>=1.7', 'scipy>=0.16', 'six', 'paramz>=0.9.0'],
       extras_require = {'docs':['sphinx'],
                         'optional':['mpi4py',
                                     'ipython>=4.0.0',