From ba2ea3eb7338d3b304bc760f50058f169c020e6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20Men=C3=A9ndez=20Hurtado?= Date: Mon, 24 Aug 2015 12:56:18 +0200 Subject: [PATCH 01/18] FIX: now Scipy 0.16 is required, removing fixes for older versions. Accessing blas through the scipy interface --- GPy/util/linalg.py | 65 +++++++++------------------------------------- 1 file changed, 12 insertions(+), 53 deletions(-) diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py index ed73d133..606a9d88 100644 --- a/GPy/util/linalg.py +++ b/GPy/util/linalg.py @@ -7,48 +7,16 @@ import numpy as np from scipy import linalg -import types +from scipy.linalg import lapack, blas + import ctypes from ctypes import byref, c_char, c_int, c_double # TODO -import scipy -import warnings -import os + from .config import config import logging from . import linalg_cython -_scipyversion = np.float64((scipy.__version__).split('.')[:2]) -_fix_dpotri_scipy_bug = True -if np.all(_scipyversion >= np.array([0, 14])): - from scipy.linalg import lapack - _fix_dpotri_scipy_bug = False -elif np.all(_scipyversion >= np.array([0, 12])): - #import scipy.linalg.lapack.clapack as lapack - from scipy.linalg import lapack -else: - from scipy.linalg.lapack import flapack as lapack - -if config.getboolean('anaconda', 'installed') and config.getboolean('anaconda', 'MKL'): - try: - anaconda_path = str(config.get('anaconda', 'location')) - mkl_rt = ctypes.cdll.LoadLibrary(os.path.join(anaconda_path, 'DLLs', 'mkl_rt.dll')) - dsyrk = mkl_rt.dsyrk - dsyr = mkl_rt.dsyr - _blas_available = True - print('anaconda installed and mkl is loaded') - except: - _blas_available = False -else: - try: - _blaslib = ctypes.cdll.LoadLibrary(np.core._dotblas.__file__) # @UndefinedVariable - dsyrk = _blaslib.dsyrk_ - dsyr = _blaslib.dsyr_ - _blas_available = True - except AttributeError as e: - _blas_available = False - warnings.warn("warning: caught this exception:" + str(e)) - def force_F_ordered_symmetric(A): """ return a F ordered version of A, assuming A is symmetric @@ -169,9 +137,6 @@ def dpotri(A, lower=1): :returns: A inverse """ - if _fix_dpotri_scipy_bug: - assert lower==1, "scipy linalg behaviour is very weird. 
please use lower, fortran ordered arrays"
-        lower = 0
     A = force_F_ordered(A)
     R, info = lapack.dpotri(A, lower=lower) #needs to be zero here, seems to be a scipy bug
@@ -300,8 +265,8 @@ def pca(Y, input_dim):
     Z = linalg.svd(Y - Y.mean(axis=0), full_matrices=False)
     [X, W] = [Z[0][:, 0:input_dim], np.dot(np.diag(Z[1]), Z[2]).T[:, 0:input_dim]]
     v = X.std(axis=0)
-    X /= v;
-    W *= v;
+    X /= v
+    W *= v
     return X, W.T

 def ppca(Y, Q, iterations=100):
@@ -362,19 +327,15 @@ def tdot_blas(mat, out=None):
     BETA = c_double(0.0)
     C = out.ctypes.data_as(ctypes.c_void_p)
     LDC = c_int(np.max(out.strides) // 8)
-    dsyrk(byref(UPLO), byref(TRANS), byref(N), byref(K),
-          byref(ALPHA), A, byref(LDA), byref(BETA), C, byref(LDC))
+    blas.dsyrk(byref(UPLO), byref(TRANS), byref(N), byref(K),
+               byref(ALPHA), A, byref(LDA), byref(BETA), C, byref(LDC))

     symmetrify(out, upper=True)
-
     return np.ascontiguousarray(out)


 def tdot(*args, **kwargs):
-    if _blas_available:
-        return tdot_blas(*args, **kwargs)
-    else:
-        return tdot_numpy(*args, **kwargs)
+    return tdot_blas(*args, **kwargs)

 def DSYR_blas(A, x, alpha=1.):
     """
@@ -393,8 +354,8 @@ def DSYR_blas(A, x, alpha=1.):
     A_ = A.ctypes.data_as(ctypes.c_void_p)
     x_ = x.ctypes.data_as(ctypes.c_void_p)
     INCX = c_int(1)
-    dsyr(byref(UPLO), byref(N), byref(ALPHA),
-         x_, byref(INCX), A_, byref(LDA))
+    blas.dsyr(byref(UPLO), byref(N), byref(ALPHA),
+              x_, byref(INCX), A_, byref(LDA))

     symmetrify(A, upper=True)

 def DSYR_numpy(A, x, alpha=1.):
@@ -411,10 +372,8 @@ def DSYR_numpy(A, x, alpha=1.):


 def DSYR(*args, **kwargs):
-    if _blas_available:
-        return DSYR_blas(*args, **kwargs)
-    else:
-        return DSYR_numpy(*args, **kwargs)
+    return DSYR_blas(*args, **kwargs)
+

 def symmetrify(A, upper=False):
     """
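
[Editorial note: patch 01 replaces the hand-loaded ctypes BLAS symbols with
SciPy's bundled LAPACK/BLAS wrappers. A minimal sketch of the dpotri pattern
that the patched linalg.py builds on -- an illustration only, not part of the
patch series; it assumes SciPy >= 0.16 and uses an arbitrary symmetric
positive-definite test matrix:

    import numpy as np
    from scipy.linalg import lapack

    A = np.asfortranarray([[4., 1.], [1., 3.]])
    L, info = lapack.dpotrf(A, lower=1)  # Cholesky factor of A
    assert info == 0, "A must be positive definite"
    R, info = lapack.dpotri(L, lower=1)  # inverse of A, computed from its factor
    assert info == 0
    R = np.tril(R) + np.tril(R, -1).T    # dpotri fills only one triangle
    np.testing.assert_allclose(R, np.linalg.inv(A))
]
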
From 3746af772921126ad36448379613badaabac6598 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?David=20Men=C3=A9ndez=20Hurtado?=
Date: Mon, 24 Aug 2015 13:29:40 +0200
Subject: [PATCH 02/18] Fixing confusion between the lapack and ctypes
 interfaces

---
 GPy/util/linalg.py | 32 +++-----------------------------
 1 file changed, 3 insertions(+), 29 deletions(-)

diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index 606a9d88..4c5f1b79 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -9,9 +9,6 @@
 import numpy as np
 from scipy import linalg
 from scipy.linalg import lapack, blas

-import ctypes
-from ctypes import byref, c_char, c_int, c_double # TODO
-
 from .config import config
 import logging
 from . import linalg_cython
@@ -312,26 +309,11 @@ def tdot_blas(mat, out=None):
         out[:] = 0.0
     #
     # Call to DSYRK from BLAS
-    # If already in Fortran order (rare), and has the right sorts of strides I
-    # could avoid the copy. I also thought swapping to cblas API would allow use
-    # of C order. However, I tried that and had errors with large matrices:
-    # http://homepages.inf.ed.ac.uk/imurray2/code/tdot/tdot_broken.py
     mat = np.asfortranarray(mat)
-    TRANS = c_char('n'.encode('ascii'))
-    N = c_int(mat.shape[0])
-    K = c_int(mat.shape[1])
-    LDA = c_int(mat.shape[0])
-    UPLO = c_char('l'.encode('ascii'))
-    ALPHA = c_double(1.0)
-    A = mat.ctypes.data_as(ctypes.c_void_p)
-    BETA = c_double(0.0)
-    C = out.ctypes.data_as(ctypes.c_void_p)
-    LDC = c_int(np.max(out.strides) // 8)
-    blas.dsyrk(byref(UPLO), byref(TRANS), byref(N), byref(K),
-               byref(ALPHA), A, byref(LDA), byref(BETA), C, byref(LDC))
+    out = blas.dsyrk(alpha=1.0, a=mat, beta=0.0, c=out, overwrite_c=1,
+                     trans=0, lower=1, trans=0)

     symmetrify(out, upper=True)
-
     return np.ascontiguousarray(out)

 def tdot(*args, **kwargs):
@@ -347,15 +329,7 @@ def DSYR_blas(A, x, alpha=1.):
     :param alpha: scalar

     """
-    N = c_int(A.shape[0])
-    LDA = c_int(A.shape[0])
-    UPLO = c_char('l'.encode('ascii'))
-    ALPHA = c_double(alpha)
-    A_ = A.ctypes.data_as(ctypes.c_void_p)
-    x_ = x.ctypes.data_as(ctypes.c_void_p)
-    INCX = c_int(1)
-    blas.dsyr(byref(UPLO), byref(N), byref(ALPHA),
-              x_, byref(INCX), A_, byref(LDA))
+    blas.dsyr(lower=1, x=x, a=A, alpha=alpha)

     symmetrify(A, upper=True)

 def DSYR_numpy(A, x, alpha=1.):

From 13438f1c10dbabca6c4218a23364e13d29a4685b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?David=20Men=C3=A9ndez=20Hurtado?=
Date: Mon, 24 Aug 2015 13:46:20 +0200
Subject: [PATCH 03/18] Fixing the blas arguments for DSYRK

---
 GPy/util/linalg.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index 4c5f1b79..acceeb16 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -311,7 +311,7 @@ def tdot_blas(mat, out=None):
     #
     # Call to DSYRK from BLAS
     mat = np.asfortranarray(mat)
     out = blas.dsyrk(alpha=1.0, a=mat, beta=0.0, c=out, overwrite_c=1,
-                     trans=0, lower=1, trans=0)
+                     trans=0, lower=0)
     symmetrify(out, upper=True)
     return np.ascontiguousarray(out)
@@ -329,7 +329,7 @@ def DSYR_blas(A, x, alpha=1.):
     :param alpha: scalar

     """
-    blas.dsyr(lower=1, x=x, a=A, alpha=alpha)
+    A = blas.dsyr(lower=0, x=x, a=A, alpha=alpha, overwrite_a=True)
     symmetrify(A, upper=True)

 def DSYR_numpy(A, x, alpha=1.):
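
[Editorial note: patches 02-03 port the two raw ctypes calls to SciPy's
keyword-argument BLAS interface and then repair the argument lists. A quick
sanity check of the final call signatures -- a hedged illustration, not part
of the patch series; X, A and x below are arbitrary test data:

    import numpy as np
    from scipy.linalg import blas

    X = np.asfortranarray(np.random.randn(5, 3))
    C = blas.dsyrk(alpha=1.0, a=X, trans=0, lower=0)  # upper triangle of X.X^T
    C = np.triu(C) + np.triu(C, 1).T                  # symmetrise, as tdot_blas does
    np.testing.assert_allclose(C, X.dot(X.T))

    A = np.zeros((5, 5))
    x = np.random.randn(5)
    A = blas.dsyr(alpha=2.0, x=x, a=A, lower=0)       # rank-1 update: A + 2*x*x^T
    A = np.triu(A) + np.triu(A, 1).T
    np.testing.assert_allclose(A, 2.0 * np.outer(x, x))
]
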
From 0d32652c8876d02aed274c69e976a00733a56265 Mon Sep 17 00:00:00 2001
From: Max Zwiessele
Date: Fri, 28 Aug 2015 16:26:55 +0100
Subject: [PATCH 04/18] [heteroscedastic gauss] Implemented heteroscedastic
 Gaussian likelihood with @ric70x7

---
 GPy/likelihoods/gaussian.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py
index 9abb8cde..ee1f2a5d 100644
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@@ -48,6 +48,7 @@ class Gaussian(Likelihood):

     def betaY(self,Y,Y_metadata=None):
         #TODO: ~Ricardo this does not live here
+        print "Iam Here"
         return Y/self.gaussian_variance(Y_metadata)

     def gaussian_variance(self, Y_metadata=None):
@@ -321,3 +322,30 @@ class Gaussian(Likelihood):
         dF_dv = np.ones_like(v)*(-0.5/lik_var)
         dF_dtheta = -0.5/lik_var + 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/(lik_var**2)
         return F, dF_dmu, dF_dv, dF_dtheta.reshape(1, Y.shape[0], Y.shape[1])
+
+class Heteroscedastic_Gaussian(Gaussian):
+    def __init__(self, Y_metadata, gp_link=None, variance=1., name='het_Gauss'):
+        if gp_link is None:
+            gp_link = link_functions.Identity()
+
+        if not isinstance(gp_link, link_functions.Identity):
+            print("Warning, Exact inference is not implemeted for non-identity link functions,\
+            if you are not already, ensure Laplace inference_method is used")
+
+        super(Heteroscedastic_Gaussian, self).__init__(gp_link, np.ones(Y_metadata['output_index'].shape[0])*variance, name)
+
+    def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None):
+        return dL_dKdiag[Y_metadata['output_index']][:,0]
+
+    def gaussian_variance(self, Y_metadata=None):
+        return self.variance[Y_metadata['output_index']]
+
+    def predictive_values(self, mu, var, full_cov=False, Y_metadata=None):
+        if full_cov:
+            if var.ndim == 2:
+                var += np.eye(var.shape[0])*self.variance
+            if var.ndim == 3:
+                var += np.atleast_3d(np.eye(var.shape[0])*self.variance)
+        else:
+            var += self.variance
+        return mu, var
\ No newline at end of file

From 4bd99c674fdb53237019386b8c93c456b96e9e6f Mon Sep 17 00:00:00 2001
From: Max Zwiessele
Date: Fri, 28 Aug 2015 16:28:01 +0100
Subject: [PATCH 05/18] [heteroscedastic gauss] Implemented heteroscedastic
 Gaussian likelihood with @ric70x7

---
 GPy/likelihoods/gaussian.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py
index ee1f2a5d..4fd80ef5 100644
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@@ -48,7 +48,7 @@ class Gaussian(Likelihood):

     def betaY(self,Y,Y_metadata=None):
         #TODO: ~Ricardo this does not live here
-        print "Iam Here"
+        raise RuntimeError, "Please notify the GPy developers, this should not happen"
         return Y/self.gaussian_variance(Y_metadata)

     def gaussian_variance(self, Y_metadata=None):

From ce9ee6c7585d4030f6a70d65e981b0f5a7ed492b Mon Sep 17 00:00:00 2001
From: James Hensman
Date: Tue, 1 Sep 2015 22:38:49 +0100
Subject: [PATCH 06/18] ensuring the shape of the mean vector at predict time;
 fixes a bug in EP prediction

---
 GPy/core/gp.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index ddef0647..a87ac66e 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -201,6 +201,8 @@ class GP(Model):

         Kx = kern.K(self.X, Xnew)
         mu = np.dot(Kx.T, self.posterior.woodbury_vector)
+        if len(mu.shape)==1:
+            mu = mu.reshape(-1,1)
         if full_cov:
             Kxx = kern.K(Xnew)
             if self.posterior.woodbury_inv.ndim == 2:

From 70a9a26d7e12861b3bacc833e17b416c65d53122 Mon Sep 17 00:00:00 2001
From: Max Zwiessele
Date: Wed, 2 Sep 2015 09:06:17 +0100
Subject: [PATCH 07/18] [core] updating system, security branching

---
 GPy/core/gp.py                                 | 105 ++++++++++++++++++
 GPy/core/mapping.py                            |   2 +-
 GPy/core/sparse_gp.py                          |  26 +++--
 GPy/core/verbose_optimization.py               |   2 +-
 GPy/kern/_src/basis_funcs.py                   |  52 ++++-----
 GPy/kern/_src/kern.py                          |   2 +
 GPy/kern/_src/kernel_slice_operations.py       |  27 ++++-
 GPy/kern/_src/linear.py                        |   6 +
 GPy/kern/_src/rbf.py                           |   3 +
 GPy/kern/_src/stationary.py                    |  64 ++++++++++-
 GPy/likelihoods/__init__.py                    |   2 +-
 GPy/models/bayesian_gplvm.py                   |  14 +++
 GPy/models/gplvm.py                            |  40 ++++---
 .../matplot_dep/dim_reduction_plots.py         |  84 +++++++++++++-
 GPy/util/diag.py                               |   2 +
 15 files changed, 366 insertions(+), 65 deletions(-)

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index ddef0647..aba065f9 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -106,6 +106,13 @@ class GP(Model):
         self.link_parameter(self.likelihood)
         self.posterior = None

+        # The predictive variable to be used to predict using the posterior object's
+        # woodbury_vector and woodbury_inv is defined as predictive_variable
+        # This is usually just a link to self.X (full GP) or self.Z (sparse GP).
+        # Make sure to name this variable and the predict functions will "just work"
+        # as long as the posterior has the right woodbury entries.
+ self._predictive_variable = self.X + def set_XY(self, X=None, Y=None, trigger_update=True): """ @@ -207,6 +214,7 @@ class GP(Model): var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx)) elif self.posterior.woodbury_inv.ndim == 3: var = np.empty((Kxx.shape[0],Kxx.shape[1],self.posterior.woodbury_inv.shape[2])) + from ..util.linalg import mdot for i in range(var.shape[2]): var[:, :, i] = (Kxx - mdot(Kx.T, self.posterior.woodbury_inv[:, :, i], Kx)) var = var @@ -302,6 +310,103 @@ class GP(Model): return dmu_dX, dv_dX + def predict_jacobian(self, Xnew, kern=None, full_cov=True): + """ + Compute the derivatives of the posterior of the GP. + + Given a set of points at which to predict X* (size [N*,Q]), compute the + mean and variance of the derivative. Resulting arrays are sized: + + dL_dX* -- [N*, Q ,D], where D is the number of output in this GP (usually one). + Note that this is the mean and variance of the derivative, + not the derivative of the mean and variance! (See predictive_gradients for that) + + dv_dX* -- [N*, Q], (since all outputs have the same variance) + If there is missing data, it is not implemented for now, but + there will be one output variance per output dimension. + + :param X: The points at which to get the predictive gradients. + :type X: np.ndarray (Xnew x self.input_dim) + :param kern: The kernel to compute the jacobian for. + :param boolean full_cov: whether to return the full covariance of the jacobian. + + :returns: dmu_dX, dv_dX + :rtype: [np.ndarray (N*, Q ,D), np.ndarray (N*,Q,(D)) ] + + Note: We always return sum in input_dim gradients, as the off-diagonals + in the input_dim are not needed for further calculations. + This is a compromise for increase in speed. Mathematically the jacobian would + have another dimension in Q. + """ + if kern is None: + kern = self.kern + + mean_jac = np.empty((Xnew.shape[0],Xnew.shape[1],self.output_dim)) + + for i in range(self.output_dim): + mean_jac[:,:,i] = kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1].T, Xnew, self._predictive_variable) + + dK_dXnew_full = np.empty((self._predictive_variable.shape[0], Xnew.shape[0], Xnew.shape[1])) + for i in range(self._predictive_variable.shape[0]): + dK_dXnew_full[i] = kern.gradients_X([[1.]], Xnew, self._predictive_variable[[i]]) + + def compute_cov_inner(wi): + if full_cov: + # full covariance gradients: + dK2_dXdX = kern.gradients_XX([[1.]], Xnew) + var_jac = dK2_dXdX - np.einsum('qnm,miq->niq', dK_dXnew_full.T.dot(wi), dK_dXnew_full) + else: + dK2_dXdX = kern.gradients_XX_diag([[1.]], Xnew) + var_jac = dK2_dXdX - np.einsum('qim,miq->iq', dK_dXnew_full.T.dot(wi), dK_dXnew_full) + return var_jac + + if self.posterior.woodbury_inv.ndim == 3: + var_jac = [] + for d in range(self.posterior.woodbury_inv.shape[2]): + var_jac.append(compute_cov_inner(self.posterior.woodbury_inv[:, :, d])) + var_jac = np.concatenate(var_jac) + else: + var_jac = compute_cov_inner(self.posterior.woodbury_inv) + return mean_jac, var_jac + + def predict_wishard_embedding(self, Xnew, kern=None): + """ + Predict the wishard embedding G of the GP. This is the density of the + input of the GP defined by the probabilistic function mapping f. + G = J_mean.T*J_mean + output_dim*J_cov. + + :param array-like Xnew: The points at which to evaluate the magnification. + :param :py:class:`~GPy.kern.Kern` kern: The kernel to use for the magnification. + + Supplying only a part of the learning kernel gives insights into the density + of the specific kernel part of the input function. E.g. 
one can see how dense the + linear part of a kernel is compared to the non-linear part etc. + """ + if kern is None: + kern = self.kern + + mu_jac, var_jac = self.predict_jacobian(Xnew, kern, full_cov=False) + mumuT = np.einsum('iqd,ipd->iqp', mu_jac, mu_jac) + if var_jac.ndim == 3: + Sigma = np.einsum('iqd,ipd->iqp', var_jac, var_jac) + G = mumuT + Sigma + else: + Sigma = np.einsum('iq,ip->iqp', var_jac, var_jac) + G = mumuT + self.output_dim*Sigma + return G + + def predict_magnification(self, Xnew, kern=None): + """ + Predict the magnification factor as + + sqrt(det(G)) + + for each point N in Xnew + """ + from ..util.linalg import jitchol + G = self.predict_wishard_embedding(Xnew, kern) + return np.array([2*np.sqrt(np.exp(np.sum(np.log(np.diag(jitchol(G[n, :, :])))))) for n in range(Xnew.shape[0])]) + def posterior_samples_f(self,X,size=10, full_cov=True): """ Samples the posterior GP at the points X. diff --git a/GPy/core/mapping.py b/GPy/core/mapping.py index 30614384..9853ea8a 100644 --- a/GPy/core/mapping.py +++ b/GPy/core/mapping.py @@ -32,7 +32,7 @@ class Bijective_mapping(Mapping): also back from f to X. The inverse mapping is called g(). """ def __init__(self, input_dim, output_dim, name='bijective_mapping'): - super(Bijective_apping, self).__init__(name=name) + super(Bijective_mapping, self).__init__(name=name) def g(self, f): """Inverse mapping from output domain of the function to the inputs.""" diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index c7c99be6..ac5fb62b 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -59,6 +59,8 @@ class SparseGP(GP): logger.info("Adding Z as parameter") self.link_parameter(self.Z, index=0) self.posterior = None + self._predictive_variable = self.Z + def has_uncertain_inputs(self): return isinstance(self.X, VariationalPosterior) @@ -114,10 +116,10 @@ class SparseGP(GP): Make a prediction for the latent function values. For certain inputs we give back a full_cov of shape NxN, - if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is of, + if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is of, we take only the diagonal elements across N. - - For uncertain inputs, the SparseGP bound produces a full covariance structure across D, so for full_cov we + + For uncertain inputs, the SparseGP bound produces a full covariance structure across D, so for full_cov we return a NxDxD matrix and in the not full_cov case, we return the diagonal elements across D (NxD). This is for both with and without missing data. See for missing data SparseGP implementation py:class:'~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch'. """ @@ -125,7 +127,7 @@ class SparseGP(GP): if kern is None: kern = self.kern if not isinstance(Xnew, VariationalPosterior): - Kx = kern.K(self.Z, Xnew) + Kx = kern.K(self._predictive_variable, Xnew) mu = np.dot(Kx.T, self.posterior.woodbury_vector) if full_cov: Kxx = kern.K(Xnew) @@ -149,28 +151,28 @@ class SparseGP(GP): if self.mean_function is not None: mu += self.mean_function.f(Xnew) else: - psi0_star = kern.psi0(self.Z, Xnew) - psi1_star = kern.psi1(self.Z, Xnew) + psi0_star = kern.psi0(self._predictive_variable, Xnew) + psi1_star = kern.psi1(self._predictive_variable, Xnew) #psi2_star = kern.psi2(self.Z, Xnew) # Only possible if we get NxMxM psi2 out of the code. la = self.posterior.woodbury_vector mu = np.dot(psi1_star, la) # TODO: dimensions? 
- - if full_cov: + + if full_cov: var = np.empty((Xnew.shape[0], la.shape[1], la.shape[1])) di = np.diag_indices(la.shape[1]) - else: + else: var = np.empty((Xnew.shape[0], la.shape[1])) - + for i in range(Xnew.shape[0]): _mu, _var = Xnew.mean.values[[i]], Xnew.variance.values[[i]] - psi2_star = kern.psi2(self.Z, NormalPosterior(_mu, _var)) + psi2_star = kern.psi2(self._predictive_variable, NormalPosterior(_mu, _var)) tmp = (psi2_star[:, :] - psi1_star[[i]].T.dot(psi1_star[[i]])) var_ = mdot(la.T, tmp, la) p0 = psi0_star[i] t = np.atleast_3d(self.posterior.woodbury_inv) t2 = np.trace(t.T.dot(psi2_star), axis1=1, axis2=2) - + if full_cov: var_[di] += p0 var_[di] += -t2 diff --git a/GPy/core/verbose_optimization.py b/GPy/core/verbose_optimization.py index 08c5e2dd..57f0c9e6 100644 --- a/GPy/core/verbose_optimization.py +++ b/GPy/core/verbose_optimization.py @@ -146,7 +146,7 @@ class VerboseOptimization(object): seconds = time.time()-self.start #sys.stdout.write(" "*len(self.message)) self.deltat += seconds - if self.deltat > .2: + if self.deltat > .3 or seconds < .3: self.print_out(seconds) self.deltat = 0 diff --git a/GPy/kern/_src/basis_funcs.py b/GPy/kern/_src/basis_funcs.py index a6c1f36c..dc21a687 100644 --- a/GPy/kern/_src/basis_funcs.py +++ b/GPy/kern/_src/basis_funcs.py @@ -11,7 +11,7 @@ class BasisFuncKernel(Kern): def __init__(self, input_dim, variance=1., active_dims=None, ARD=False, name='basis func kernel'): """ Abstract superclass for kernels with explicit basis functions for use in GPy. - + This class does NOT automatically add an offset to the design matrix phi! """ super(BasisFuncKernel, self).__init__(input_dim, active_dims, name) @@ -23,24 +23,24 @@ class BasisFuncKernel(Kern): variance = np.array(variance) self.variance = Param('variance', variance, Logexp()) self.link_parameter(self.variance) - + def parameters_changed(self): self.alpha = np.sqrt(self.variance) self.beta = 1./self.variance - + @Cache_this(limit=3, ignore_args=()) def phi(self, X): return self._phi(X) def _phi(self, X): raise NotImplementedError('Overwrite this _phi function, which maps the input X into the higher dimensional space and returns the design matrix Phi') - + def K(self, X, X2=None): return self._K(X, X2) def Kdiag(self, X, X2=None): return np.diag(self._K(X, X2)) - + def update_gradients_full(self, dL_dK, X, X2=None): if self.ARD: phi1 = self.phi(X) @@ -51,22 +51,22 @@ class BasisFuncKernel(Kern): self.variance.gradient = np.einsum('ij,iq,jq->q', dL_dK, phi1, phi2) else: self.variance.gradient = np.einsum('ij,ij', dL_dK, self._K(X, X2)) * self.beta - + def update_gradients_diag(self, dL_dKdiag, X): if self.ARD: phi1 = self.phi(X) self.variance.gradient = np.einsum('i,iq,iq->q', dL_dKdiag, phi1, phi1) else: self.variance.gradient = np.einsum('i,i', dL_dKdiag, self.Kdiag(X)) * self.beta - + def concatenate_offset(self, X): return np.c_[np.ones((X.shape[0], 1)), X] - + def posterior_inf(self, X=None, posterior=None): """ - Do the posterior inference on the parameters given this kernels functions - and the model posterior, which has to be a GPy posterior, usually found at m.posterior, if m is a GPy model. - If not given we search for the the highest parent to be a model, containing the posterior, and for X accordingly. + Do the posterior inference on the parameters given this kernels functions + and the model posterior, which has to be a GPy posterior, usually found at m.posterior, if m is a GPy model. 
+ If not given we search for the the highest parent to be a model, containing the posterior, and for X accordingly. """ if X is None: try: @@ -80,7 +80,7 @@ class BasisFuncKernel(Kern): raise RuntimeError("This kernel is not part of a model and cannot be used for posterior inference") phi_alpha = self.phi(X) * self.variance return (phi_alpha).T.dot(posterior.woodbury_vector), (np.eye(phi_alpha.shape[1])*self.variance - mdot(phi_alpha.T, posterior.woodbury_inv, phi_alpha)) - + @Cache_this(limit=3, ignore_args=()) def _K(self, X, X2): if X2 is None or X is X2: @@ -95,35 +95,35 @@ class BasisFuncKernel(Kern): phi1 = phi1[:, None] phi2 = phi2[:, None] return phi1.dot(phi2.T) - - + + class LinearSlopeBasisFuncKernel(BasisFuncKernel): def __init__(self, input_dim, start, stop, variance=1., active_dims=None, ARD=False, name='linear_segment'): """ A linear segment transformation. The segments start at start, \ - are then linear to stop and constant again. The segments are - normalized, so that they have exactly as much mass above - as below the origin. - - Start and stop can be tuples or lists of starts and stops. + are then linear to stop and constant again. The segments are + normalized, so that they have exactly as much mass above + as below the origin. + + Start and stop can be tuples or lists of starts and stops. Behaviour of start stop is as np.where(X self.stop, self.stop, phi) return ((phi-(self.stop+self.start)/2.))#/(.5*(self.stop-self.start)))-1. - + class ChangePointBasisFuncKernel(BasisFuncKernel): def __init__(self, input_dim, changepoint, variance=1., active_dims=None, ARD=False, name='changepoint'): self.changepoint = np.array(changepoint) super(ChangePointBasisFuncKernel, self).__init__(input_dim, variance, active_dims, ARD, name) - + @Cache_this(limit=3, ignore_args=()) def _phi(self, X): return np.where((X < self.changepoint), -1, 1) @@ -131,7 +131,7 @@ class ChangePointBasisFuncKernel(BasisFuncKernel): class DomainKernel(LinearSlopeBasisFuncKernel): def __init__(self, input_dim, start, stop, variance=1., active_dims=None, ARD=False, name='constant_domain'): super(DomainKernel, self).__init__(input_dim, start, stop, variance, active_dims, ARD, name) - + @Cache_this(limit=3, ignore_args=()) def _phi(self, X): phi = np.where((X>self.start)*(Xiq', X2, self.variances, dL_dK) + def gradients_XX(self, dL_dK, X, X2=None): + if X2 is None: + return 2*np.ones(X.shape)*self.variances + else: + return np.ones(X.shape)*self.variances + def gradients_X_diag(self, dL_dKdiag, X): return 2.*self.variances*dL_dKdiag[:,None]*X diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index 73f2d0a4..ba7843c6 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -31,6 +31,9 @@ class RBF(Stationary): def dK_dr(self, r): return -r*self.K_of_r(r) + def dK2_drdr(self, r): + return (r**2-1)*self.K_of_r(r) + def __getstate__(self): dc = super(RBF, self).__getstate__() if self.useGPU: diff --git a/GPy/kern/_src/stationary.py b/GPy/kern/_src/stationary.py index 9c4f1436..ab1ec282 100644 --- a/GPy/kern/_src/stationary.py +++ b/GPy/kern/_src/stationary.py @@ -15,7 +15,7 @@ from ...util.caching import Cache_this try: from . 
import stationary_cython except ImportError: - print('warning in sationary: failed to import cython module: falling back to numpy') + print('warning in stationary: failed to import cython module: falling back to numpy') config.set('cython', 'working', 'false') @@ -78,6 +78,10 @@ class Stationary(Kern): raise NotImplementedError("implement derivative of the covariance function wrt r to use this class") @Cache_this(limit=20, ignore_args=()) + def dK2_drdr(self, r): + raise NotImplementedError("implement second derivative of covariance wrt r to use this method") + + @Cache_this(limit=5, ignore_args=()) def K(self, X, X2=None): """ Kernel function applied on inputs X and X2. @@ -94,6 +98,11 @@ class Stationary(Kern): #a convenience function, so we can cache dK_dr return self.dK_dr(self._scaled_dist(X, X2)) + @Cache_this(limit=3, ignore_args=()) + def dK2_drdr_via_X(self, X, X2): + #a convenience function, so we can cache dK_dr + return self.dK2_drdr(self._scaled_dist(X, X2)) + def _unscaled_dist(self, X, X2=None): """ Compute the Euclidean distance between each row of X and X2, or between @@ -201,6 +210,59 @@ class Stationary(Kern): else: return self._gradients_X_pure(dL_dK, X, X2) + def gradients_XX(self, dL_dK, X, X2=None): + """ + Given the derivative of the objective K(dL_dK), compute the second derivative of K wrt X and X2: + + ..math: + \frac{\partial^2 K}{\partial X\partial X2} + + ..returns: + dL2_dXdX2: NxMxQ, for X [NxQ] and X2[MxQ] (X2 is X if, X2 is None) + Thus, we return the second derivative in X2. + """ + # The off diagonals in Q are always zero, this should also be true for the Linear kernel... + # According to multivariable chain rule, we can chain the second derivative through r: + # d2K_dXdX2 = dK_dr*d2r_dXdX2 + d2K_drdr * dr_dX * dr_dX2: + invdist = self._inv_dist(X, X2) + invdist2 = invdist**2 + + dL_dr = self.dK_dr_via_X(X, X2) * dL_dK + tmp1 = dL_dr * invdist + + dL_drdr = self.dK2_drdr_via_X(X, X2) * dL_dK + tmp2 = dL_drdr * invdist2 + + l2 = np.ones(X.shape[1]) * self.lengthscale**2 + + if X2 is None: + X2 = X + tmp1 -= np.eye(X.shape[0])*self.variance + else: + tmp1[X==X2.T] -= self.variance + + grad = np.empty((X.shape[0], X2.shape[0], X.shape[1]), dtype=np.float64) + #grad = np.empty(X.shape, dtype=np.float64) + for q in range(self.input_dim): + tmpdist2 = (X[:,[q]]-X2[:,[q]].T) ** 2 + grad[:, :, q] = ((tmp1*invdist2 - tmp2)*tmpdist2/l2[q] - tmp1)/l2[q] + #grad[:, :, q] = ((tmp1*(((tmpdist2)*invdist2/l2[q])-1)) - (tmp2*(tmpdist2))/l2[q])/l2[q] + #np.sum(((tmp1*(((tmpdist2)*invdist2/l2[q])-1)) - (tmp2*(tmpdist2))/l2[q])/l2[q], axis=1, out=grad[:,q]) + #np.sum( - (tmp2*(tmpdist**2)), axis=1, out=grad[:,q]) + return grad + + def gradients_XX_diag(self, dL_dK, X): + """ + Given the derivative of the objective K(dL_dK), compute the second derivative of K wrt X and X2: + + ..math: + \frac{\partial^2 K}{\partial X\partial X2} + + ..returns: + dL2_dXdX2: NxMxQ, for X [NxQ] and X2[MxQ] + """ + return np.ones(X.shape) * self.variance/self.lengthscale**2 + def _gradients_X_pure(self, dL_dK, X, X2=None): invdist = self._inv_dist(X, X2) dL_dr = self.dK_dr_via_X(X, X2) * dL_dK diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py index 3157bd5b..20fe43d7 100644 --- a/GPy/likelihoods/__init__.py +++ b/GPy/likelihoods/__init__.py @@ -1,6 +1,6 @@ from .bernoulli import Bernoulli from .exponential import Exponential -from .gaussian import Gaussian +from .gaussian import Gaussian, Heteroscedastic_Gaussian from .gamma import Gamma from .poisson import Poisson from 
.student_t import StudentT diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py index 3ac703fe..ff0d6855 100644 --- a/GPy/models/bayesian_gplvm.py +++ b/GPy/models/bayesian_gplvm.py @@ -137,6 +137,20 @@ class BayesianGPLVM(SparseGP_MPI): fignum, plot_inducing, legend, plot_limits, aspect, updates, predict_kwargs, imshow_kwargs) + def plot_magnification(self, labels=None, which_indices=None, + resolution=50, ax=None, marker='o', s=40, + fignum=None, legend=True, + plot_limits=None, + aspect='auto', updates=False, **kwargs): + import sys + assert "matplotlib" in sys.modules, "matplotlib package has not been imported." + from ..plotting.matplot_dep import dim_reduction_plots + + return dim_reduction_plots.plot_magnification(self, labels, which_indices, + resolution, ax, marker, s, + fignum, False, legend, + plot_limits, aspect, updates, **kwargs) + def do_test_latents(self, Y): """ Compute the latent representation for a set of new points Y diff --git a/GPy/models/gplvm.py b/GPy/models/gplvm.py index d6f29907..c95c9fa0 100644 --- a/GPy/models/gplvm.py +++ b/GPy/models/gplvm.py @@ -43,19 +43,19 @@ class GPLVM(GP): super(GPLVM, self).parameters_changed() self.X.gradient = self.kern.gradients_X(self.grad_dict['dL_dK'], self.X, None) - def jacobian(self,X): - J = np.zeros((X.shape[0],X.shape[1],self.output_dim)) - for i in range(self.output_dim): - J[:,:,i] = self.kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1], X, self.X) - return J + #def jacobian(self,X): + # J = np.zeros((X.shape[0],X.shape[1],self.output_dim)) + # for i in range(self.output_dim): + # J[:,:,i] = self.kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1], X, self.X) + # return J - def magnification(self,X): - target=np.zeros(X.shape[0]) - #J = np.zeros((X.shape[0],X.shape[1],self.output_dim)) - J = self.jacobian(X) - for i in range(X.shape[0]): - target[i]=np.sqrt(np.linalg.det(np.dot(J[i,:,:],np.transpose(J[i,:,:])))) - return target + #def magnification(self,X): + # target=np.zeros(X.shape[0]) + # #J = np.zeros((X.shape[0],X.shape[1],self.output_dim)) + ## J = self.jacobian(X) + # for i in range(X.shape[0]): + # target[i]=np.sqrt(np.linalg.det(np.dot(J[i,:,:],np.transpose(J[i,:,:])))) + # return target def plot(self): assert self.Y.shape[1] == 2, "too high dimensional to plot. Try plot_latent" @@ -82,5 +82,17 @@ class GPLVM(GP): fignum, False, legend, plot_limits, aspect, updates, **kwargs) - def plot_magnification(self, *args, **kwargs): - return util.plot_latent.plot_magnification(self, *args, **kwargs) + def plot_magnification(self, labels=None, which_indices=None, + resolution=50, ax=None, marker='o', s=40, + fignum=None, legend=True, + plot_limits=None, + aspect='auto', updates=False, **kwargs): + + import sys + assert "matplotlib" in sys.modules, "matplotlib package has not been imported." 
+ from ..plotting.matplot_dep import dim_reduction_plots + + return dim_reduction_plots.plot_magnification(self, labels, which_indices, + resolution, ax, marker, s, + fignum, False, legend, + plot_limits, aspect, updates, **kwargs) diff --git a/GPy/plotting/matplot_dep/dim_reduction_plots.py b/GPy/plotting/matplot_dep/dim_reduction_plots.py index 2c243e13..7d13ef04 100644 --- a/GPy/plotting/matplot_dep/dim_reduction_plots.py +++ b/GPy/plotting/matplot_dep/dim_reduction_plots.py @@ -114,7 +114,7 @@ def plot_latent(model, labels=None, which_indices=None, # create a function which computes the shading of latent space according to the output variance def plot_function(x): - Xtest_full = np.zeros((x.shape[0], model.X.shape[1])) + Xtest_full = np.zeros((x.shape[0], X.shape[1])) Xtest_full[:, [input_1, input_2]] = x _, var = model.predict(Xtest_full, **predict_kwargs) var = var[:, :1] @@ -202,6 +202,7 @@ def plot_latent(model, labels=None, which_indices=None, def plot_magnification(model, labels=None, which_indices=None, resolution=60, ax=None, marker='o', s=40, fignum=None, plot_inducing=False, legend=True, + plot_limits=None, aspect='auto', updates=False): """ :param labels: a np.array of size model.num_data containing labels for the points (can be number, strings, etc) @@ -217,17 +218,88 @@ def plot_magnification(model, labels=None, which_indices=None, input_1, input_2 = most_significant_input_dimensions(model, which_indices) - # first, plot the output variance as a function of the latent space - Xtest, xx, yy, xmin, xmax = x_frame2D(model.X[:, [input_1, input_2]], resolution=resolution) - Xtest_full = np.zeros((Xtest.shape[0], model.X.shape[1])) + #fethch the data points X that we'd like to plot + X = model.X + if isinstance(X, VariationalPosterior): + X = X.mean + else: + X = X + + if X.shape[0] > 1000: + print("Warning: subsampling X, as it has more samples then 1000. 
X.shape={!s}".format(X.shape)) + subsample = np.random.choice(X.shape[0], size=1000, replace=False) + X = X[subsample] + labels = labels[subsample] + #======================================================================= + # <<>> + # <<>> + # plt.close('all') + # fig, ax = plt.subplots(1,1) + # from GPy.plotting.matplot_dep.dim_reduction_plots import most_significant_input_dimensions + # import matplotlib.patches as mpatches + # i1, i2 = most_significant_input_dimensions(m, None) + # xmin, xmax = 100, -100 + # ymin, ymax = 100, -100 + # legend_handles = [] + # + # X = m.X.mean[:, [i1, i2]] + # X = m.X.variance[:, [i1, i2]] + # + # xmin = X[:,0].min(); xmax = X[:,0].max() + # ymin = X[:,1].min(); ymax = X[:,1].max() + # range_ = [[xmin, xmax], [ymin, ymax]] + # ul = np.unique(labels) + # + # for i, l in enumerate(ul): + # #cdict = dict(red =[(0., colors[i][0], colors[i][0]), (1., colors[i][0], colors[i][0])], + # # green=[(0., colors[i][0], colors[i][1]), (1., colors[i][1], colors[i][1])], + # # blue =[(0., colors[i][0], colors[i][2]), (1., colors[i][2], colors[i][2])], + # # alpha=[(0., 0., .0), (.5, .5, .5), (1., .5, .5)]) + # #cmap = LinearSegmentedColormap('{}'.format(l), cdict) + # cmap = LinearSegmentedColormap.from_list('cmap_{}'.format(str(l)), [colors[i], colors[i]], 255) + # cmap._init() + # #alphas = .5*(1+scipy.special.erf(np.linspace(-2,2, cmap.N+3)))#np.log(np.linspace(np.exp(0), np.exp(1.), cmap.N+3)) + # alphas = (scipy.special.erf(np.linspace(0,2.4, cmap.N+3)))#np.log(np.linspace(np.exp(0), np.exp(1.), cmap.N+3)) + # cmap._lut[:, -1] = alphas + # print l + # x, y = X[labels==l].T + # + # heatmap, xedges, yedges = np.histogram2d(x, y, bins=300, range=range_) + # #heatmap, xedges, yedges = np.histogram2d(x, y, bins=100) + # + # im = ax.imshow(heatmap, extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]], cmap=cmap, aspect='auto', interpolation='nearest', label=str(l)) + # legend_handles.append(mpatches.Patch(color=colors[i], label=l)) + # ax.set_xlim(xmin, xmax) + # ax.set_ylim(ymin, ymax) + # plt.legend(legend_handles, [l.get_label() for l in legend_handles]) + # plt.draw() + # plt.show() + #======================================================================= + + #Create an IMshow controller that can re-plot the latent space shading at a good resolution + if plot_limits is None: + xmin, ymin = X[:, [input_1, input_2]].min(0) + xmax, ymax = X[:, [input_1, input_2]].max(0) + x_r, y_r = xmax-xmin, ymax-ymin + xmin -= .1*x_r + xmax += .1*x_r + ymin -= .1*y_r + ymax += .1*y_r + else: + try: + xmin, xmax, ymin, ymax = plot_limits + except (TypeError, ValueError) as e: + raise e.__class__("Wrong plot limits: {} given -> need (xmin, xmax, ymin, ymax)".format(plot_limits)) + def plot_function(x): + Xtest_full = np.zeros((x.shape[0], X.shape[1])) Xtest_full[:, [input_1, input_2]] = x - mf=model.magnification(Xtest_full) + mf = model.predict_magnification(Xtest_full) return mf view = ImshowController(ax, plot_function, - tuple(model.X.min(0)[:, [input_1, input_2]]) + tuple(model.X.max(0)[:, [input_1, input_2]]), + (xmin, ymin, xmax, ymax), resolution, aspect=aspect, interpolation='bilinear', cmap=pb.cm.gray) diff --git a/GPy/util/diag.py b/GPy/util/diag.py index e7c332e2..9a3343f0 100644 --- a/GPy/util/diag.py +++ b/GPy/util/diag.py @@ -46,6 +46,8 @@ def offdiag_view(A, offset=0): return as_strided(Af[(1+offset):], shape=(A.shape[0]-1, A.shape[1]), strides=(A.strides[0] + A.itemsize, A.strides[1])) def _diag_ufunc(A,b,offset,func): + b = np.squeeze(b) + assert b.ndim <= 1, "only 
implemented for one dimensional arrays" dA = view(A, offset); func(dA,b,dA) return A From 7a5872e11d1ba4410ecf997400963039bd1e26db Mon Sep 17 00:00:00 2001 From: Ricardo Date: Wed, 2 Sep 2015 12:15:06 +0100 Subject: [PATCH 08/18] Model uses the new HeteroscedasticGaussian likelihood --- GPy/models/gp_heteroscedastic_regression.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/GPy/models/gp_heteroscedastic_regression.py b/GPy/models/gp_heteroscedastic_regression.py index 19cb18d8..2fe3b788 100644 --- a/GPy/models/gp_heteroscedastic_regression.py +++ b/GPy/models/gp_heteroscedastic_regression.py @@ -30,10 +30,7 @@ class GPHeteroscedasticRegression(GP): kernel = kern.RBF(X.shape[1]) #Likelihood - #likelihoods_list = [likelihoods.Gaussian(name="Gaussian_noise_%s" %j) for j in range(Ny)] - noise_terms = np.unique(Y_metadata['output_index'].flatten()) - likelihoods_list = [likelihoods.Gaussian(name="Gaussian_noise_%s" %j) for j in noise_terms] - likelihood = likelihoods.MixedNoise(likelihoods_list=likelihoods_list) + likelihood = likelihoods.HeteroscedasticGaussian(Y_metadata) super(GPHeteroscedasticRegression, self).__init__(X,Y,kernel,likelihood, Y_metadata=Y_metadata) From f780693577d0b52555a60061df5dd294b7afee9f Mon Sep 17 00:00:00 2001 From: Ricardo Date: Wed, 2 Sep 2015 13:29:33 +0100 Subject: [PATCH 09/18] New likelihood: HeteroscedasticGaussian --- GPy/likelihoods/__init__.py | 1 + GPy/likelihoods/gaussian.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py index 3157bd5b..3c39157d 100644 --- a/GPy/likelihoods/__init__.py +++ b/GPy/likelihoods/__init__.py @@ -1,6 +1,7 @@ from .bernoulli import Bernoulli from .exponential import Exponential from .gaussian import Gaussian +from .gaussian import HeteroscedasticGaussian from .gamma import Gamma from .poisson import Poisson from .student_t import StudentT diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py index 7b001e17..ef4b26b1 100644 --- a/GPy/likelihoods/gaussian.py +++ b/GPy/likelihoods/gaussian.py @@ -326,7 +326,7 @@ class Gaussian(Likelihood): dF_dtheta = -0.5/lik_var + 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/(lik_var**2) return F, dF_dmu, dF_dv, dF_dtheta.reshape(1, Y.shape[0], Y.shape[1]) -class Heteroscedastic_Gaussian(Gaussian): +class HeteroscedasticGaussian(Gaussian): def __init__(self, Y_metadata, gp_link=None, variance=1., name='het_Gauss'): if gp_link is None: gp_link = link_functions.Identity() @@ -335,7 +335,7 @@ class Heteroscedastic_Gaussian(Gaussian): print("Warning, Exact inference is not implemeted for non-identity link functions,\ if you are not already, ensure Laplace inference_method is used") - super(Heteroscedastic_Gaussian, self).__init__(gp_link, np.ones(Y_metadata['output_index'].shape[0])*variance, name) + super(HeteroscedasticGaussian, self).__init__(gp_link, np.ones(Y_metadata['output_index'].shape[0])*variance, name) def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None): return dL_dKdiag[Y_metadata['output_index']][:,0] @@ -351,4 +351,4 @@ class Heteroscedastic_Gaussian(Gaussian): var += np.atleast_3d(np.eye(var.shape[0])*self.variance) else: var += self.variance - return mu, var \ No newline at end of file + return mu, var From 21f6b92475b05c7caee3e32389c63b655790f25d Mon Sep 17 00:00:00 2001 From: Ricardo Date: Wed, 2 Sep 2015 13:32:06 +0100 Subject: [PATCH 10/18] Change in _diag_ufunc with @mzwiessele --- GPy/util/diag.py | 2 ++ 1 file changed, 2 insertions(+) diff 
--git a/GPy/util/diag.py b/GPy/util/diag.py index e7c332e2..9a3343f0 100644 --- a/GPy/util/diag.py +++ b/GPy/util/diag.py @@ -46,6 +46,8 @@ def offdiag_view(A, offset=0): return as_strided(Af[(1+offset):], shape=(A.shape[0]-1, A.shape[1]), strides=(A.strides[0] + A.itemsize, A.strides[1])) def _diag_ufunc(A,b,offset,func): + b = np.squeeze(b) + assert b.ndim <= 1, "only implemented for one dimensional arrays" dA = view(A, offset); func(dA,b,dA) return A From 4bec78704b4fd37cd51578f9c846010f90cd67c2 Mon Sep 17 00:00:00 2001 From: Ricardo Date: Wed, 2 Sep 2015 13:32:39 +0100 Subject: [PATCH 11/18] Note added to the docs --- GPy/models/gp_heteroscedastic_regression.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/GPy/models/gp_heteroscedastic_regression.py b/GPy/models/gp_heteroscedastic_regression.py index 2fe3b788..63c6352a 100644 --- a/GPy/models/gp_heteroscedastic_regression.py +++ b/GPy/models/gp_heteroscedastic_regression.py @@ -16,6 +16,8 @@ class GPHeteroscedasticRegression(GP): :param X: input observations :param Y: observed values :param kernel: a GPy kernel, defaults to rbf + + NB: This model does not make inference on the noise outside the training set """ def __init__(self, X, Y, kernel=None, Y_metadata=None): From 26bdcfa82e6540098ae8c0592313cf65ab77d048 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Wed, 2 Sep 2015 15:45:55 +0100 Subject: [PATCH 12/18] [hetnoise] import correction --- GPy/likelihoods/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py index 20fe43d7..627ef39f 100644 --- a/GPy/likelihoods/__init__.py +++ b/GPy/likelihoods/__init__.py @@ -1,6 +1,6 @@ from .bernoulli import Bernoulli from .exponential import Exponential -from .gaussian import Gaussian, Heteroscedastic_Gaussian +from .gaussian import Gaussian, HeteroscedasticGaussian from .gamma import Gamma from .poisson import Poisson from .student_t import StudentT From ca60ad3195641ed5c48b00b565f1a4fb97c57431 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Wed, 2 Sep 2015 15:46:40 +0100 Subject: [PATCH 13/18] [magnification] mostly plotting and some model corrections for _predictive_variable --- GPy/core/gp.py | 21 +++++++++++++++++-- GPy/core/sparse_gp.py | 15 +++++++++++++ GPy/core/verbose_optimization.py | 10 ++++----- GPy/kern/_src/kern.py | 4 +++- GPy/models/gplvm.py | 15 ------------- GPy/models/sparse_gp_minibatch.py | 3 ++- .../matplot_dep/dim_reduction_plots.py | 19 +++++++++-------- 7 files changed, 54 insertions(+), 33 deletions(-) diff --git a/GPy/core/gp.py b/GPy/core/gp.py index 8267a569..1e285092 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -405,9 +405,10 @@ class GP(Model): for each point N in Xnew """ - from ..util.linalg import jitchol G = self.predict_wishard_embedding(Xnew, kern) - return np.array([2*np.sqrt(np.exp(np.sum(np.log(np.diag(jitchol(G[n, :, :])))))) for n in range(Xnew.shape[0])]) + from ..util.linalg import jitchol + return np.array([np.sqrt(np.exp(2*np.sum(np.log(np.diag(jitchol(G[n, :, :])))))) for n in range(Xnew.shape[0])]) + #return np.array([np.sqrt(np.linalg.det(G[n, :, :])) for n in range(Xnew.shape[0])]) def posterior_samples_f(self,X,size=10, full_cov=True): """ @@ -564,6 +565,22 @@ class GP(Model): plot_raw=plot_raw, Y_metadata=Y_metadata, data_symbol=data_symbol, predict_kw=predict_kw, **kw) + def plot_magnification(self, labels=None, which_indices=None, + resolution=50, ax=None, marker='o', s=40, + fignum=None, legend=True, + plot_limits=None, + aspect='auto', 
updates=False, **kwargs): + + import sys + assert "matplotlib" in sys.modules, "matplotlib package has not been imported." + from ..plotting.matplot_dep import dim_reduction_plots + + return dim_reduction_plots.plot_magnification(self, labels, which_indices, + resolution, ax, marker, s, + fignum, False, legend, + plot_limits, aspect, updates, **kwargs) + + def input_sensitivity(self, summarize=True): """ Returns the sensitivity for each dimension of this model diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index ac5fb62b..5ab302db 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -181,3 +181,18 @@ class SparseGP(GP): var[i] = np.diag(var_)+p0-t2 return mu, var + + def plot_magnification(self, labels=None, which_indices=None, + resolution=50, ax=None, marker='o', s=40, + fignum=None, legend=True, + plot_limits=None, + aspect='auto', updates=False, plot_inducing=True, **kwargs): + + import sys + assert "matplotlib" in sys.modules, "matplotlib package has not been imported." + from ..plotting.matplot_dep import dim_reduction_plots + + return dim_reduction_plots.plot_magnification(self, labels, which_indices, + resolution, ax, marker, s, + fignum, plot_inducing, legend, + plot_limits, aspect, updates, **kwargs) \ No newline at end of file diff --git a/GPy/core/verbose_optimization.py b/GPy/core/verbose_optimization.py index 57f0c9e6..c4539736 100644 --- a/GPy/core/verbose_optimization.py +++ b/GPy/core/verbose_optimization.py @@ -24,7 +24,6 @@ class VerboseOptimization(object): self.model.add_observer(self, self.print_status) self.status = 'running' self.clear = clear_after_finish - self.deltat = .2 self.update() @@ -80,6 +79,7 @@ class VerboseOptimization(object): def __enter__(self): self.start = time.time() + self._time = self.start return self def print_out(self, seconds): @@ -143,12 +143,12 @@ class VerboseOptimization(object): def print_status(self, me, which=None): self.update() - seconds = time.time()-self.start + t = time.time() + seconds = t-self.start #sys.stdout.write(" "*len(self.message)) - self.deltat += seconds - if self.deltat > .3 or seconds < .3: + if t-self._time > .3 or seconds < .3: self.print_out(seconds) - self.deltat = 0 + self._time = t self.iteration += 1 diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index 62be679c..aae4460a 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -59,7 +59,7 @@ class Kern(Parameterized): self._sliced_X = 0 self.useGPU = self._support_GPU and useGPU self._return_psi2_n_flag = ObsAr(np.zeros(1)).astype(bool) - + from .psi_comp import PSICOMP_GH self.psicomp = PSICOMP_GH() @@ -102,6 +102,8 @@ class Kern(Parameterized): raise NotImplementedError def gradients_XX(self, dL_dK, X, X2): raise(NotImplementedError, "This is the second derivative of K wrt X and X2, and not implemented for this kernel") + def gradients_XX_diag(self, dL_dK, X, X2): + raise(NotImplementedError, "This is the diagonal of the second derivative of K wrt X and X2, and not implemented for this kernel") def gradients_X_diag(self, dL_dKdiag, X): raise NotImplementedError diff --git a/GPy/models/gplvm.py b/GPy/models/gplvm.py index c95c9fa0..d4f4f564 100644 --- a/GPy/models/gplvm.py +++ b/GPy/models/gplvm.py @@ -81,18 +81,3 @@ class GPLVM(GP): resolution, ax, marker, s, fignum, False, legend, plot_limits, aspect, updates, **kwargs) - - def plot_magnification(self, labels=None, which_indices=None, - resolution=50, ax=None, marker='o', s=40, - fignum=None, legend=True, - plot_limits=None, - aspect='auto', updates=False, 
**kwargs): - - import sys - assert "matplotlib" in sys.modules, "matplotlib package has not been imported." - from ..plotting.matplot_dep import dim_reduction_plots - - return dim_reduction_plots.plot_magnification(self, labels, which_indices, - resolution, ax, marker, s, - fignum, False, legend, - plot_limits, aspect, updates, **kwargs) diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py index 07295255..81bd22a4 100644 --- a/GPy/models/sparse_gp_minibatch.py +++ b/GPy/models/sparse_gp_minibatch.py @@ -44,7 +44,7 @@ class SparseGPMiniBatch(SparseGP): def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None, name='sparse gp', Y_metadata=None, normalizer=False, missing_data=False, stochastic=False, batchsize=1): - + # pick a sensible inference method if inference_method is None: if isinstance(likelihood, likelihoods.Gaussian): @@ -76,6 +76,7 @@ class SparseGPMiniBatch(SparseGP): logger.info("Adding Z as parameter") self.link_parameter(self.Z, index=0) self.posterior = None + self._predictive_variable = self.Z def has_uncertain_inputs(self): return isinstance(self.X, VariationalPosterior) diff --git a/GPy/plotting/matplot_dep/dim_reduction_plots.py b/GPy/plotting/matplot_dep/dim_reduction_plots.py index 7d13ef04..6db46a81 100644 --- a/GPy/plotting/matplot_dep/dim_reduction_plots.py +++ b/GPy/plotting/matplot_dep/dim_reduction_plots.py @@ -322,11 +322,11 @@ def plot_magnification(model, labels=None, which_indices=None, index = np.nonzero(labels == ul)[0] if model.input_dim == 1: - x = model.X[index, input_1] + x = X[index, input_1] y = np.zeros(index.size) else: - x = model.X[index, input_1] - y = model.X[index, input_2] + x = X[index, input_1] + y = X[index, input_2] ax.scatter(x, y, marker=m, s=s, color=Tango.nextMedium(), label=this_label) ax.set_xlabel('latent dimension %i' % input_1) @@ -335,13 +335,14 @@ def plot_magnification(model, labels=None, which_indices=None, if not np.all(labels == 1.) 
and legend: ax.legend(loc=0, numpoints=1) - ax.set_xlim(xmin[0], xmax[0]) - ax.set_ylim(xmin[1], xmax[1]) + ax.set_xlim((xmin, xmax)) + ax.set_ylim((ymin, ymax)) ax.grid(b=False) # remove the grid if present, it doesn't look good ax.set_aspect('auto') # set a nice aspect ratio if plot_inducing: - ax.plot(model.Z[:, input_1], model.Z[:, input_2], '^w') + Z = model.Z + ax.scatter(Z[:, input_1], Z[:, input_2], c='w', s=18, marker="^", edgecolor='k', linewidth=.3, alpha=.7) if updates: fig.canvas.show() @@ -386,8 +387,8 @@ def plot_steepest_gradient_map(model, fignum=None, ax=None, which_indices=None, this_label = 'class %i' % i m = marker.next() index = np.nonzero(data_labels == ul)[0] - x = model.X[index, input_1] - y = model.X[index, input_2] + x = X[index, input_1] + y = X[index, input_2] ax.scatter(x, y, marker=m, s=data_s, color=Tango.nextMedium(), label=this_label) ax.set_xlabel('latent dimension %i' % input_1) @@ -395,7 +396,7 @@ def plot_steepest_gradient_map(model, fignum=None, ax=None, which_indices=None, controller = ImAnnotateController(ax, plot_function, - tuple(model.X.min(0)[:, significant_dims]) + tuple(model.X.max(0)[:, significant_dims]), + tuple(X.min(0)[:, significant_dims]) + tuple(X.max(0)[:, significant_dims]), resolution=resolution, aspect=aspect, cmap=get_cmap('jet'), From 8e437e4bda653cc2dabd511a2be49b9e14c7848b Mon Sep 17 00:00:00 2001 From: Ricardo Date: Wed, 2 Sep 2015 19:40:11 +0100 Subject: [PATCH 14/18] New function to plot just the errorbars of the training data --- GPy/core/gp.py | 86 ++++++++++++- GPy/plotting/matplot_dep/base_plots.py | 21 +++ GPy/plotting/matplot_dep/models_plots.py | 157 ++++++++++++++++++++++- 3 files changed, 256 insertions(+), 8 deletions(-) diff --git a/GPy/core/gp.py b/GPy/core/gp.py index a87ac66e..1f5bc136 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -407,8 +407,8 @@ class GP(Model): def plot(self, plot_limits=None, which_data_rows='all', which_data_ycols='all', fixed_inputs=[], levels=20, samples=0, fignum=None, ax=None, resolution=None, - plot_raw=False, - linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx', predict_kw=None): + plot_raw=False, linecol=None,fillcol=None, Y_metadata=None, + data_symbol='kx', predict_kw=None, plot_training_data=True): """ Plot the posterior of the GP. - In one dimension, the function is plotted with a shaded region identifying two standard deviations. @@ -445,6 +445,8 @@ class GP(Model): :type Y_metadata: dict :param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx') :type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib. + :param plot_training_data: whether or not to plot the training points + :type plot_training_data: boolean """ assert "matplotlib" in sys.modules, "matplotlib package has not been imported." from ..plotting.matplot_dep import models_plots @@ -457,7 +459,85 @@ class GP(Model): which_data_ycols, fixed_inputs, levels, samples, fignum, ax, resolution, plot_raw=plot_raw, Y_metadata=Y_metadata, - data_symbol=data_symbol, predict_kw=predict_kw, **kw) + data_symbol=data_symbol, predict_kw=predict_kw, + plot_training_data=plot_training_data, **kw) + + + def plot_data(self, which_data_rows='all', + which_data_ycols='all', visible_dims=None, + fignum=None, ax=None, data_symbol='kx'): + """ + Plot the training data + - For higher dimensions than two, use fixed_inputs to plot the data points with some of the inputs fixed. 
+ + Can plot only part of the data + using which_data_rows and which_data_ycols. + + :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits + :type plot_limits: np.array + :param which_data_rows: which of the training data to plot (default all) + :type which_data_rows: 'all' or a slice object to slice model.X, model.Y + :param which_data_ycols: when the data has several columns (independant outputs), only plot these + :type which_data_ycols: 'all' or a list of integers + :param visible_dims: an array specifying the input dimensions to plot (maximum two) + :type visible_dims: a numpy array + :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D + :type resolution: int + :param levels: number of levels to plot in a contour plot. + :param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure + :type levels: int + :param samples: the number of a posteriori samples to plot + :type samples: int + :param fignum: figure to plot on. + :type fignum: figure number + :param ax: axes to plot on. + :type ax: axes handle + :param linecol: color of line to plot [Tango.colorsHex['darkBlue']] + :type linecol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib + :param fillcol: color of fill [Tango.colorsHex['lightBlue']] + :type fillcol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib + :param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx') + :type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib. + """ + assert "matplotlib" in sys.modules, "matplotlib package has not been imported." + from ..plotting.matplot_dep import models_plots + kw = {} + return models_plots.plot_data(self, which_data_rows, + which_data_ycols, visible_dims, + fignum, ax, data_symbol, **kw) + + + def plot_fit_errorbars(self, which_data_rows='all', + which_data_ycols='all', fixed_inputs=[], fignum=None, ax=None, + linecol=None, data_symbol='kx', predict_kw=None, plot_training_data=True): + + """ + Plot the posterior error bars corresponding to the training data + - For higher dimensions than two, use fixed_inputs to plot the data points with some of the inputs fixed. + + Can plot only part of the data + using which_data_rows and which_data_ycols. + + :param which_data_rows: which of the training data to plot (default all) + :type which_data_rows: 'all' or a slice object to slice model.X, model.Y + :param which_data_ycols: when the data has several columns (independant outputs), only plot these + :type which_data_rows: 'all' or a list of integers + :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v. + :type fixed_inputs: a list of tuples + :param fignum: figure to plot on. + :type fignum: figure number + :param ax: axes to plot on. + :type ax: axes handle + :param plot_training_data: whether or not to plot the training points + :type plot_training_data: boolean + """ + assert "matplotlib" in sys.modules, "matplotlib package has not been imported." 
+        from ..plotting.matplot_dep import models_plots
+        kw = {}
+        return models_plots.plot_fit_errorbars(self, which_data_rows, which_data_ycols, fixed_inputs,
+                                               fignum, ax, linecol, data_symbol,
+                                               predict_kw, plot_training_data, **kw)
+
     def input_sensitivity(self, summarize=True):
         """
diff --git a/GPy/plotting/matplot_dep/base_plots.py b/GPy/plotting/matplot_dep/base_plots.py
index dade87cf..80f114b1 100644
--- a/GPy/plotting/matplot_dep/base_plots.py
+++ b/GPy/plotting/matplot_dep/base_plots.py
@@ -47,6 +47,27 @@ def gpplot(x, mu, lower, upper, edgecol='#3300FF', fillcol='#33CCFF', ax=None, f
     return plots
 
+def gperrors(x, mu, lower, upper, edgecol=None, ax=None, fignum=None, **kwargs):
+    # plot asymmetric error bars (mu - lower, upper - mu) around the mean
+    _, axes = ax_default(fignum, ax)
+
+    mu = mu.flatten()
+    x = x.flatten()
+    lower = lower.flatten()
+    upper = upper.flatten()
+
+    plots = []
+
+    if edgecol is None:
+        edgecol = '#3300FF'
+
+    if 'alpha' not in kwargs:
+        kwargs['alpha'] = 0.3
+
+    plots.append(axes.errorbar(x, mu, yerr=np.vstack([mu - lower, upper - mu]), color=edgecol, **kwargs))
+    plots[-1][0].remove()  # drop the centre line, keeping only the bars
+    return plots
+
+
 def removeRightTicks(ax=None):
     ax = ax or pb.gca()
     for i, line in enumerate(ax.get_yticklines()):
diff --git a/GPy/plotting/matplot_dep/models_plots.py b/GPy/plotting/matplot_dep/models_plots.py
index d0f6c952..e1ba327e 100644
--- a/GPy/plotting/matplot_dep/models_plots.py
+++ b/GPy/plotting/matplot_dep/models_plots.py
@@ -3,19 +3,79 @@
 import numpy as np
 from . import Tango
-from base_plots import gpplot, x_frame1D, x_frame2D
+from base_plots import gpplot, x_frame1D, x_frame2D, gperrors
 from ...models.gp_coregionalized_regression import GPCoregionalizedRegression
 from ...models.sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression
 from scipy import sparse
 from ...core.parameterization.variational import VariationalPosterior
 from matplotlib import pyplot as plt
+
+
+def plot_data(model, which_data_rows='all',
+              which_data_ycols='all', visible_dims=None,
+              fignum=None, ax=None, data_symbol='kx', mew=1.5):
+    """
+    Plot the training data
+      - For inputs with more than two dimensions, use visible_dims to select the (at most two) dimensions to plot.
+
+    Can plot only part of the data
+    using which_data_rows and which_data_ycols.
+
+    :param which_data_rows: which of the training data to plot (default all)
+    :type which_data_rows: 'all' or a slice object to slice model.X, model.Y
+    :param which_data_ycols: when the data has several columns (independent outputs), only plot these
+    :type which_data_ycols: 'all' or a list of integers
+    :param visible_dims: an array specifying the input dimensions to plot (maximum two)
+    :type visible_dims: a numpy array
+    :param fignum: figure to plot on.
+    :type fignum: figure number
+    :param ax: axes to plot on.
+    :type ax: axes handle
+    :param data_symbol: matplotlib format string for the data points, by default a black cross ('kx')
+    :param mew: marker edge width for the data symbols
+    """
+    #deal with optional arguments
+    if which_data_rows == 'all':
+        which_data_rows = slice(None)
+    if which_data_ycols == 'all':
+        which_data_ycols = np.arange(model.output_dim)
+
+    if ax is None:
+        fig = plt.figure(num=fignum)
+        ax = fig.add_subplot(111)
+
+    #data
+    X = model.X
+    Y = model.Y
+
+    #work out what the inputs are for plotting (1D or 2D)
+    if visible_dims is None:
+        visible_dims = np.arange(model.input_dim)
+    assert visible_dims.size <= 2, "Cannot plot more than two visible input dimensions"
+    free_dims = visible_dims
+    plots = {}
+    #one dimensional plotting
+    if len(free_dims) == 1:
+        for d in which_data_ycols:
+            plots['dataplot'] = ax.plot(X[which_data_rows, free_dims], Y[which_data_rows, d], data_symbol, mew=mew)
+
+    #2D plotting
+    elif len(free_dims) == 2:
+        for d in which_data_ycols:
+            plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40,
+                                           Y[which_data_rows, d], cmap=plt.cm.jet, vmin=Y.min(), vmax=Y.max(), linewidth=0.)
+
+    else:
+        raise NotImplementedError("Cannot define a frame with more than two input dimensions")
+    return plots
+
+
 def plot_fit(model, plot_limits=None, which_data_rows='all',
         which_data_ycols='all', fixed_inputs=[],
         levels=20, samples=0, fignum=None, ax=None, resolution=None,
         plot_raw=False,
         linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue'], Y_metadata=None, data_symbol='kx',
-        apply_link=False, samples_f=0, plot_uncertain_inputs=True, predict_kw=None):
+        apply_link=False, samples_f=0, plot_uncertain_inputs=True, predict_kw=None, plot_training_data=True):
     """
     Plot the posterior of the GP.
       - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
@@ -43,7 +103,6 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
     :type fignum: figure number
     :param ax: axes to plot on.
     :type ax: axes handle
-    :type output: integer (first output is 0)
     :param linecol: color of line to plot.
:type linecol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green), as is standard in matplotlib
     :param fillcol: color of the shaded fill
@@ -52,6 +111,8 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
     :type apply_link: boolean
     :param samples_f: the number of a posteriori f samples to plot from p(f*|y)
     :type samples_f: int
+    :param plot_training_data: whether or not to plot the training points
+    :type plot_training_data: boolean
     """
     #deal with optional arguments
     if which_data_rows == 'all':
@@ -116,7 +177,11 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
         for d in which_data_ycols:
             plots['gpplot'] = gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], ax=ax, edgecol=linecol, fillcol=fillcol)
-            if not plot_raw: plots['dataplot'] = ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], data_symbol, mew=1.5)
+            if not plot_raw and plot_training_data:
+                plots['dataplot'] = plot_data(model=model, which_data_rows=which_data_rows,
+                                              visible_dims=free_dims, data_symbol=data_symbol, mew=1.5, ax=ax, fignum=fignum)
+
         #optionally plot some samples
         if samples: #NOTE not tested with fixed_inputs
@@ -196,7 +261,9 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
         for d in which_data_ycols:
             m_d = m[:,d].reshape(resolution, resolution).T
             plots['contour'] = ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=plt.cm.jet)
-            if not plot_raw: plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=plt.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
+            if not plot_raw and plot_training_data:
+                plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=plt.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
 
     #set the limits of the plot to some sensible values
     ax.set_xlim(xmin[0], xmax[0])
@@ -261,3 +328,83 @@ def fixed_inputs(model, non_fixed_inputs, fix_routine='median', as_list=True, X_
         return f_inputs
     else:
         return X
+
+
+def plot_fit_errorbars(model, which_data_rows='all',
+                       which_data_ycols='all', fixed_inputs=[],
+                       fignum=None, ax=None,
+                       linecol='red', data_symbol='kx',
+                       predict_kw=None, plot_training_data=True):
+    """
+    Plot the posterior error bars corresponding to the training data
+      - For inputs with more than two dimensions, use fixed_inputs to fix some of the inputs when plotting.
+
+    Can plot only part of the data
+    using which_data_rows and which_data_ycols.
+
+    :param which_data_rows: which of the training data to plot (default all)
+    :type which_data_rows: 'all' or a slice object to slice model.X, model.Y
+    :param which_data_ycols: when the data has several columns (independent outputs), only plot these
+    :type which_data_ycols: 'all' or a list of integers
+    :param fixed_inputs: a list of tuples [(i,v), (i,v)...], specifying that input dimension i should be set to value v.
+    :type fixed_inputs: a list of tuples
+    :param fignum: figure to plot on.
+    :type fignum: figure number
+    :param ax: axes to plot on.
+    :type ax: axes handle
+    :param linecol: color of the error bars
+    :param data_symbol: marker symbol for the data points, by default a black cross ('kx')
+    :param predict_kw: keyword arguments passed through to the model's predict call
+    :param plot_training_data: whether or not to plot the training points
+    :type plot_training_data: boolean
+    """
+    #deal with optional arguments
+    if which_data_rows == 'all':
+        which_data_rows = slice(None)
+    if which_data_ycols == 'all':
+        which_data_ycols = np.arange(model.output_dim)
+
+    if ax is None:
+        fig = plt.figure(num=fignum)
+        ax = fig.add_subplot(111)
+
+    X = model.X
+    Y = model.Y
+
+    if predict_kw is None:
+        predict_kw = {}
+
+    #work out what the inputs are for plotting (1D or 2D)
+    fixed_dims = np.array([i for i, v in fixed_inputs])
+    free_dims = np.setdiff1d(np.arange(model.input_dim), fixed_dims)
+    plots = {}
+
+    #one dimensional plotting
+    if len(free_dims) == 1:
+        m, v = model.predict(X, full_cov=False, Y_metadata=model.Y_metadata, **predict_kw)
+        fmu, fv = model._raw_predict(X, full_cov=False, **predict_kw)
+        lower, upper = model.likelihood.predictive_quantiles(fmu, fv, (2.5, 97.5), Y_metadata=model.Y_metadata)
+
+        for d in which_data_ycols:
+            plots['gperrors'] = gperrors(X, m[:, d], lower[:, d], upper[:, d], edgecol=linecol, ax=ax, fignum=fignum)
+        if plot_training_data:
+            plots['dataplot'] = plot_data(model=model, which_data_rows=which_data_rows,
+                                          visible_dims=free_dims, data_symbol=data_symbol, mew=1.5, ax=ax, fignum=fignum)
+
+        #set the limits of the plot to some sensible values
+        ymin, ymax = min(np.append(Y[which_data_rows, which_data_ycols].flatten(), lower)), max(np.append(Y[which_data_rows, which_data_ycols].flatten(), upper))
+        ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
+        ax.set_xlim(X[:, free_dims].min(), X[:, free_dims].max())
+        ax.set_ylim(ymin, ymax)
+
+    elif len(free_dims) == 2:
+        raise NotImplementedError("Error bars for two free input dimensions are not implemented yet")
+
+    else:
+        raise NotImplementedError("Cannot define a frame with more than two input dimensions")
+    return plots

From 839e3dc6f03444f916cd4ffd75b1ce3f272b4fc5 Mon Sep 17 00:00:00 2001
From: Max Zwiessele
Date: Thu, 3 Sep 2015 09:33:07 +0100
Subject: [PATCH 15/18] [magnification] plot_magnification expanded

---
 GPy/core/gp.py                                     | 19 ++++----
 GPy/core/sparse_gp.py                              | 15 ------
 GPy/kern/_src/add.py                               | 16 +++++--
 GPy/kern/_src/kern.py                              |  2 +-
 GPy/kern/_src/static.py                            |  5 ++
 GPy/models/bayesian_gplvm.py                       | 14 ------
 .../matplot_dep/dim_reduction_plots.py             | 46 ++++++++++++-------
 .../controllers/axis_event_controller.py           |  7 ++-
 8 files changed, 64 insertions(+), 60 deletions(-)

diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 1e285092..e932fabd 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -371,7 +371,7 @@ class GP(Model):
             var_jac = compute_cov_inner(self.posterior.woodbury_inv)
         return mean_jac, var_jac
 
-    def predict_wishard_embedding(self, Xnew, kern=None):
+    def predict_wishard_embedding(self, Xnew, kern=None, mean=True, covariance=True):
         """
         Predict the Wishart embedding G of the GP. This is the metric that the probabilistic mapping f induces on the input space.
@@ -391,13 +391,16 @@ class GP(Model):
         mumuT = np.einsum('iqd,ipd->iqp', mu_jac, mu_jac)
         if var_jac.ndim == 3:
             Sigma = np.einsum('iqd,ipd->iqp', var_jac, var_jac)
-            G = mumuT + Sigma
         else:
-            Sigma = np.einsum('iq,ip->iqp', var_jac, var_jac)
-            G = mumuT + self.output_dim*Sigma
+            Sigma = self.output_dim*np.einsum('iq,ip->iqp', var_jac, var_jac)
+        G = 0.
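+        # G accumulates only the requested pieces: the outer product of the
+        # predictive-mean Jacobian (mumuT) and/or the variance term (Sigma)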
+        if mean:
+            G += mumuT
+        if covariance:
+            G += Sigma
         return G
 
-    def predict_magnification(self, Xnew, kern=None):
+    def predict_magnification(self, Xnew, kern=None, mean=True, covariance=True):
         """
         Predict the magnification factor as
 
             sqrt(det(G))
 
         for each point n in Xnew, where G is the Wishart embedding at that point.
         """
-        G = self.predict_wishard_embedding(Xnew, kern)
+        G = self.predict_wishard_embedding(Xnew, kern, mean, covariance)
         from ..util.linalg import jitchol
         return np.array([np.sqrt(np.exp(2*np.sum(np.log(np.diag(jitchol(G[n, :, :])))))) for n in range(Xnew.shape[0])])
         #return np.array([np.sqrt(np.linalg.det(G[n, :, :])) for n in range(Xnew.shape[0])])
@@ -569,7 +572,7 @@ class GP(Model):
     def plot_magnification(self, labels=None, which_indices=None,
                 resolution=50, ax=None, marker='o', s=40,
                 fignum=None, legend=True,
                 plot_limits=None,
-                aspect='auto', updates=False, **kwargs):
+                aspect='auto', updates=False, plot_inducing=True, kern=None, **kwargs):
         import sys
         assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
         from ..plotting.matplot_dep import dim_reduction_plots
 
         return dim_reduction_plots.plot_magnification(self, labels, which_indices,
                                 resolution, ax, marker, s,
-                                fignum, False, legend,
+                                fignum, plot_inducing, legend,
                                 plot_limits, aspect, updates, **kwargs)
 
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index 5ab302db..ac5fb62b 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -181,18 +181,3 @@ class SparseGP(GP):
                 var[i] = np.diag(var_)+p0-t2
 
         return mu, var
-
-    def plot_magnification(self, labels=None, which_indices=None,
-                resolution=50, ax=None, marker='o', s=40,
-                fignum=None, legend=True,
-                plot_limits=None,
-                aspect='auto', updates=False, plot_inducing=True, **kwargs):
-
-        import sys
-        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
-        from ..plotting.matplot_dep import dim_reduction_plots
-
-        return dim_reduction_plots.plot_magnification(self, labels, which_indices,
-                                resolution, ax, marker, s,
-                                fignum, plot_inducing, legend,
-                                plot_limits, aspect, updates, **kwargs)
\ No newline at end of file
diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py
index 696a8b04..132c53d2 100644
--- a/GPy/kern/_src/add.py
+++ b/GPy/kern/_src/add.py
@@ -14,7 +14,7 @@ class Add(CombinationKernel):
 
     This kernel will take over the active dims of its subkernels passed in.
     """
-    def __init__(self, subkerns, name='add'):
+    def __init__(self, subkerns, name='sum'):
         for i, kern in enumerate(subkerns[:]):
             if isinstance(kern, Add):
                 del subkerns[i]
@@ -71,11 +71,21 @@ class Add(CombinationKernel):
         target = np.zeros(X.shape)
         [target.__iadd__(p.gradients_X_diag(dL_dKdiag, X)) for p in self.parts]
         return target
-
+
+    def gradients_XX(self, dL_dK, X, X2):
+        target = 0.
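+        # accumulate the second derivatives contributed by every summand kernel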
+ [target.__iadd__(p.gradients_XX(dL_dK, X, X2)) for p in self.parts] + return target + + def gradients_XX_diag(self, dL_dKdiag, X): + target = np.zeros(X.shape) + [target.__iadd__(p.gradients_XX_diag(dL_dKdiag, X)) for p in self.parts] + return target + @Cache_this(limit=2, force_kwargs=['which_parts']) def psi0(self, Z, variational_posterior): return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts)) - + @Cache_this(limit=2, force_kwargs=['which_parts']) def psi1(self, Z, variational_posterior): return reduce(np.add, (p.psi1(Z, variational_posterior) for p in self.parts)) diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index aae4460a..52745ddf 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -102,7 +102,7 @@ class Kern(Parameterized): raise NotImplementedError def gradients_XX(self, dL_dK, X, X2): raise(NotImplementedError, "This is the second derivative of K wrt X and X2, and not implemented for this kernel") - def gradients_XX_diag(self, dL_dK, X, X2): + def gradients_XX_diag(self, dL_dKdiag, X): raise(NotImplementedError, "This is the diagonal of the second derivative of K wrt X and X2, and not implemented for this kernel") def gradients_X_diag(self, dL_dKdiag, X): raise NotImplementedError diff --git a/GPy/kern/_src/static.py b/GPy/kern/_src/static.py index 64d14018..b32077e9 100644 --- a/GPy/kern/_src/static.py +++ b/GPy/kern/_src/static.py @@ -24,6 +24,11 @@ class Static(Kern): def gradients_X_diag(self, dL_dKdiag, X): return np.zeros(X.shape) + def gradients_XX(self, dL_dK, X, X2): + return np.zeros((X.shape[0], X2.shape[0], X.shape[1]), dtype=np.float64) + def gradients_XX_diag(self, dL_dKdiag, X): + return np.zeros(X.shape) + def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): return np.zeros(Z.shape) diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py index ff0d6855..3ac703fe 100644 --- a/GPy/models/bayesian_gplvm.py +++ b/GPy/models/bayesian_gplvm.py @@ -137,20 +137,6 @@ class BayesianGPLVM(SparseGP_MPI): fignum, plot_inducing, legend, plot_limits, aspect, updates, predict_kwargs, imshow_kwargs) - def plot_magnification(self, labels=None, which_indices=None, - resolution=50, ax=None, marker='o', s=40, - fignum=None, legend=True, - plot_limits=None, - aspect='auto', updates=False, **kwargs): - import sys - assert "matplotlib" in sys.modules, "matplotlib package has not been imported." 
- from ..plotting.matplot_dep import dim_reduction_plots - - return dim_reduction_plots.plot_magnification(self, labels, which_indices, - resolution, ax, marker, s, - fignum, False, legend, - plot_limits, aspect, updates, **kwargs) - def do_test_latents(self, Y): """ Compute the latent representation for a set of new points Y diff --git a/GPy/plotting/matplot_dep/dim_reduction_plots.py b/GPy/plotting/matplot_dep/dim_reduction_plots.py index 6db46a81..a36f168d 100644 --- a/GPy/plotting/matplot_dep/dim_reduction_plots.py +++ b/GPy/plotting/matplot_dep/dim_reduction_plots.py @@ -9,7 +9,8 @@ import itertools try: import Tango from matplotlib.cm import get_cmap - import pylab as pb + from matplotlib import pyplot as pb + from matplotlib import cm except: pass @@ -137,7 +138,7 @@ def plot_latent(model, labels=None, which_indices=None, view = ImshowController(ax, plot_function, (xmin, ymin, xmax, ymax), resolution, aspect=aspect, interpolation='bilinear', - cmap=pb.cm.binary, **imshow_kwargs) + cmap=cm.binary, **imshow_kwargs) # make sure labels are in order of input: labels = np.asarray(labels) @@ -192,18 +193,18 @@ def plot_latent(model, labels=None, which_indices=None, if updates: try: - ax.figure.canvas.show() + fig.canvas.show() except Exception as e: print("Could not invoke show: {}".format(e)) - raw_input('Enter to continue') - view.deactivate() + #raw_input('Enter to continue') + return view return ax def plot_magnification(model, labels=None, which_indices=None, resolution=60, ax=None, marker='o', s=40, fignum=None, plot_inducing=False, legend=True, plot_limits=None, - aspect='auto', updates=False): + aspect='auto', updates=False, mean=True, covariance=True, kern=None): """ :param labels: a np.array of size model.num_data containing labels for the points (can be number, strings, etc) :param resolution: the resolution of the grid on which to evaluate the predictive variance @@ -211,6 +212,8 @@ def plot_magnification(model, labels=None, which_indices=None, if ax is None: fig = pb.figure(num=fignum) ax = fig.add_subplot(111) + else: + fig = ax.figure Tango.reset() if labels is None: @@ -295,13 +298,13 @@ def plot_magnification(model, labels=None, which_indices=None, def plot_function(x): Xtest_full = np.zeros((x.shape[0], X.shape[1])) Xtest_full[:, [input_1, input_2]] = x - mf = model.predict_magnification(Xtest_full) + mf = model.predict_magnification(Xtest_full, kern=kern, mean=mean, covariance=covariance) return mf view = ImshowController(ax, plot_function, (xmin, ymin, xmax, ymax), resolution, aspect=aspect, interpolation='bilinear', - cmap=pb.cm.gray) + cmap=cm.gray) # make sure labels are in order of input: ulabels = [] @@ -317,7 +320,7 @@ def plot_magnification(model, labels=None, which_indices=None, elif type(ul) is np.int64: this_label = 'class %i' % ul else: - this_label = 'class %i' % i + this_label = unicode(ul) m = marker.next() index = np.nonzero(labels == ul)[0] @@ -327,7 +330,7 @@ def plot_magnification(model, labels=None, which_indices=None, else: x = X[index, input_1] y = X[index, input_2] - ax.scatter(x, y, marker=m, s=s, color=Tango.nextMedium(), label=this_label) + ax.scatter(x, y, marker=m, s=s, c=Tango.nextMedium(), label=this_label, linewidth=.2, edgecolor='k', alpha=.9) ax.set_xlabel('latent dimension %i' % input_1) ax.set_ylabel('latent dimension %i' % input_2) @@ -337,18 +340,27 @@ def plot_magnification(model, labels=None, which_indices=None, ax.set_xlim((xmin, xmax)) ax.set_ylim((ymin, ymax)) - ax.grid(b=False) # remove the grid if present, it doesn't look good 
-    ax.set_aspect('auto') # set a nice aspect ratio
-    if plot_inducing:
+    if plot_inducing and hasattr(model, 'Z'):
         Z = model.Z
         ax.scatter(Z[:, input_1], Z[:, input_2], c='w', s=18, marker="^", edgecolor='k', linewidth=.3, alpha=.7)
-    if updates:
-        fig.canvas.show()
-        raw_input('Enter to continue')
+    try:
+        fig.canvas.draw()
+        fig.tight_layout()
+        fig.canvas.draw()
+    except Exception as e:
+        print("Could not invoke tight layout: {}".format(e))
 
-    pb.title('Magnification Factor')
+    if updates:
+        try:
+            fig.canvas.draw()
+            fig.canvas.show()
+        except Exception as e:
+            print("Could not invoke show: {}".format(e))
+    return view
     return ax
 
diff --git a/GPy/plotting/matplot_dep/latent_space_visualizations/controllers/axis_event_controller.py b/GPy/plotting/matplot_dep/latent_space_visualizations/controllers/axis_event_controller.py
index 62b622c5..15fcd098 100644
--- a/GPy/plotting/matplot_dep/latent_space_visualizations/controllers/axis_event_controller.py
+++ b/GPy/plotting/matplot_dep/latent_space_visualizations/controllers/axis_event_controller.py
@@ -9,6 +9,9 @@ class AxisEventController(object):
     def __init__(self, ax):
         self.ax = ax
         self.activate()
+    def __del__(self):
+        self.deactivate()
     def deactivate(self):
         for cb_class in self.ax.callbacks.callbacks.values():
             for cb_num in cb_class.keys():
@@ -81,9 +84,9 @@ class BufferedAxisChangedController(AxisChangedController):
     def __init__(self, ax, plot_function, plot_limits, resolution=50, update_lim=None, **kwargs):
         """
         Buffered axis changed controller. Controls the buffer and handles update events for when the axes change.
-
+        The first plot is drawn within the given plot limits; subsequent redraws reuse the buffered limits.
-
+        :param plot_function: function to use for creating the image for plotting (returns ndarray-like);
             plot_function gets called with (2D!)
Xtest grid if replotting required From f6d07ff76acaab59950ff75630a3a5183bca38fe Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Thu, 3 Sep 2015 10:19:57 +0100 Subject: [PATCH 16/18] [magnification] added static kernel support and faster derivative computations --- GPy/core/gp.py | 9 ++++++--- GPy/kern/_src/add.py | 5 ++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/GPy/core/gp.py b/GPy/core/gp.py index 60038263..c278d509 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -352,13 +352,16 @@ class GP(Model): for i in range(self._predictive_variable.shape[0]): dK_dXnew_full[i] = kern.gradients_X([[1.]], Xnew, self._predictive_variable[[i]]) + if full_cov: + dK2_dXdX = kern.gradients_XX([[1.]], Xnew) + else: + dK2_dXdX = kern.gradients_XX_diag([[1.]], Xnew) + def compute_cov_inner(wi): if full_cov: # full covariance gradients: - dK2_dXdX = kern.gradients_XX([[1.]], Xnew) var_jac = dK2_dXdX - np.einsum('qnm,miq->niq', dK_dXnew_full.T.dot(wi), dK_dXnew_full) else: - dK2_dXdX = kern.gradients_XX_diag([[1.]], Xnew) var_jac = dK2_dXdX - np.einsum('qim,miq->iq', dK_dXnew_full.T.dot(wi), dK_dXnew_full) return var_jac @@ -568,7 +571,7 @@ class GP(Model): which_data_ycols, fixed_inputs, levels, samples, fignum, ax, resolution, plot_raw=plot_raw, Y_metadata=Y_metadata, - data_symbol=data_symbol, predict_kw=predict_kw, + data_symbol=data_symbol, predict_kw=predict_kw, plot_training_data=plot_training_data, **kw) diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py index 132c53d2..3c97a08a 100644 --- a/GPy/kern/_src/add.py +++ b/GPy/kern/_src/add.py @@ -73,7 +73,10 @@ class Add(CombinationKernel): return target def gradients_XX(self, dL_dK, X, X2): - target = 0. + if X2 is None: + target = np.zeros((X.shape[0], X.shape[0], X.shape[1])) + else: + target = np.zeros((X.shape[0], X2.shape[0], X.shape[1])) [target.__iadd__(p.gradients_XX(dL_dK, X, X2)) for p in self.parts] return target From 7376627895996d33ba32e2ea3a38151f2227f06f Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Thu, 3 Sep 2015 10:23:09 +0100 Subject: [PATCH 17/18] [magnification] static corrections --- GPy/kern/_src/static.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/GPy/kern/_src/static.py b/GPy/kern/_src/static.py index b32077e9..e7bee6c2 100644 --- a/GPy/kern/_src/static.py +++ b/GPy/kern/_src/static.py @@ -25,6 +25,8 @@ class Static(Kern): return np.zeros(X.shape) def gradients_XX(self, dL_dK, X, X2): + if X2 is None: + X2 = X return np.zeros((X.shape[0], X2.shape[0], X.shape[1]), dtype=np.float64) def gradients_XX_diag(self, dL_dKdiag, X): return np.zeros(X.shape) From 3dbc32f4b19c6487018ea567d74bf0922fb2e32d Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Thu, 3 Sep 2015 12:59:51 +0100 Subject: [PATCH 18/18] [add] renamed>sum --- GPy/kern/_src/kern.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index 52745ddf..d5482b80 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -180,7 +180,7 @@ class Kern(Parameterized): def __iadd__(self, other): return self.add(other) - def add(self, other, name='add'): + def add(self, other, name='sum'): """ Add another kernel to this one.
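
A minimal usage sketch of the new entry points in this series (untested; the toy
data, the kernel choice and the model m below are illustrative assumptions, not
part of the patches):

    import numpy as np
    import GPy

    X = np.random.rand(50, 1)
    Y = np.sin(6 * X) + 0.05 * np.random.randn(50, 1)

    # Add kernels are now constructed with name='sum' (patch 18)
    k = GPy.kern.RBF(1) + GPy.kern.White(1)

    m = GPy.models.GPRegression(X, Y, k)
    m.optimize()

    m.plot(plot_training_data=False)  # new switch from patch 14
    m.plot_data()                     # training points only (patch 14)
    m.plot_fit_errorbars()            # posterior error bars at the training inputs (patch 14)

The magnification utilities from patches 15-17 follow the same pattern for latent
variable models, e.g. model.plot_magnification(plot_inducing=True) on a Bayesian
GPLVM; the per-class overrides have been removed, so every model class now routes
through the single implementation in dim_reduction_plots.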