Merge branch 'devel' of github.com:SheffieldML/GPy into devel

2026-05-12 13:32:39 +02:00 · 2015-09-03 16:32:15 +01:00 · 2015-09-03 16:32:15 +01:00 · 5d2ef2c086
commit 5d2ef2c086
parent ded0b3366d 3dbc32f4b1
23 changed files with 736 additions and 192 deletions
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@ -106,6 +106,13 @@ class GP(Model):
        self.link_parameter(self.likelihood)
        self.posterior = None
        # The predictive variable to be used to predict using the posterior object's
        # woodbury_vector and woodbury_inv is defined as predictive_variable
        # This is usually just a link to self.X (full GP) or self.Z (sparse GP).
        # Make sure to name this variable and the predict functions will "just work"
        # as long as the posterior has the right woodbury entries.
        self._predictive_variable = self.X
    def set_XY(self, X=None, Y=None, trigger_update=True):
        """
@ -201,12 +208,15 @@ class GP(Model):
        Kx = kern.K(self.X, Xnew)
        mu = np.dot(Kx.T, self.posterior.woodbury_vector)
        if len(mu.shape)==1:
            mu = mu.reshape(-1,1)
        if full_cov:
            Kxx = kern.K(Xnew)
            if self.posterior.woodbury_inv.ndim == 2:
                var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
            elif self.posterior.woodbury_inv.ndim == 3:
                var = np.empty((Kxx.shape[0],Kxx.shape[1],self.posterior.woodbury_inv.shape[2]))
                from ..util.linalg import mdot
                for i in range(var.shape[2]):
                    var[:, :, i] = (Kxx - mdot(Kx.T, self.posterior.woodbury_inv[:, :, i], Kx))
            var = var
@ -302,6 +312,110 @@ class GP(Model):
        return dmu_dX, dv_dX
    def predict_jacobian(self, Xnew, kern=None, full_cov=True):
        """
        Compute the derivatives of the posterior of the GP.
        Given a set of points at which to predict X* (size [N*,Q]), compute the
        mean and variance of the derivative. Resulting arrays are sized:
         dL_dX* -- [N*, Q ,D], where D is the number of output in this GP (usually one).
          Note that this is the mean and variance of the derivative,
          not the derivative of the mean and variance! (See predictive_gradients for that)
         dv_dX*  -- [N*, Q],    (since all outputs have the same variance)
          If there is missing data, it is not implemented for now, but
          there will be one output variance per output dimension.
        :param X: The points at which to get the predictive gradients.
        :type X: np.ndarray (Xnew x self.input_dim)
        :param kern: The kernel to compute the jacobian for.
        :param boolean full_cov: whether to return the full covariance of the jacobian.
        :returns: dmu_dX, dv_dX
        :rtype: [np.ndarray (N*, Q ,D), np.ndarray (N*,Q,(D)) ]
        Note: We always return sum in input_dim gradients, as the off-diagonals
        in the input_dim are not needed for further calculations.
        This is a compromise for increase in speed. Mathematically the jacobian would
        have another dimension in Q.
        """
        if kern is None:
            kern = self.kern
        mean_jac = np.empty((Xnew.shape[0],Xnew.shape[1],self.output_dim))
        for i in range(self.output_dim):
            mean_jac[:,:,i] = kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1].T, Xnew, self._predictive_variable)
        dK_dXnew_full = np.empty((self._predictive_variable.shape[0], Xnew.shape[0], Xnew.shape[1]))
        for i in range(self._predictive_variable.shape[0]):
            dK_dXnew_full[i] = kern.gradients_X([[1.]], Xnew, self._predictive_variable[[i]])
        if full_cov:
            dK2_dXdX = kern.gradients_XX([[1.]], Xnew)
        else:
            dK2_dXdX = kern.gradients_XX_diag([[1.]], Xnew)
        def compute_cov_inner(wi):
            if full_cov:
                # full covariance gradients:
                var_jac = dK2_dXdX - np.einsum('qnm,miq->niq', dK_dXnew_full.T.dot(wi), dK_dXnew_full)
            else:
                var_jac = dK2_dXdX - np.einsum('qim,miq->iq', dK_dXnew_full.T.dot(wi), dK_dXnew_full)
            return var_jac
        if self.posterior.woodbury_inv.ndim == 3:
            var_jac = []
            for d in range(self.posterior.woodbury_inv.shape[2]):
                var_jac.append(compute_cov_inner(self.posterior.woodbury_inv[:, :, d]))
            var_jac = np.concatenate(var_jac)
        else:
            var_jac = compute_cov_inner(self.posterior.woodbury_inv)
        return mean_jac, var_jac
    def predict_wishard_embedding(self, Xnew, kern=None, mean=True, covariance=True):
        """
        Predict the wishard embedding G of the GP. This is the density of the
        input of the GP defined by the probabilistic function mapping f.
        G = J_mean.T*J_mean + output_dim*J_cov.
        :param array-like Xnew: The points at which to evaluate the magnification.
        :param :py:class:`~GPy.kern.Kern` kern: The kernel to use for the magnification.
        Supplying only a part of the learning kernel gives insights into the density
        of the specific kernel part of the input function. E.g. one can see how dense the
        linear part of a kernel is compared to the non-linear part etc.
        """
        if kern is None:
            kern = self.kern
        mu_jac, var_jac = self.predict_jacobian(Xnew, kern, full_cov=False)
        mumuT = np.einsum('iqd,ipd->iqp', mu_jac, mu_jac)
        if var_jac.ndim == 3:
            Sigma = np.einsum('iqd,ipd->iqp', var_jac, var_jac)
        else:
            Sigma = self.output_dim*np.einsum('iq,ip->iqp', var_jac, var_jac)
        G = 0.
        if mean:
            G += mumuT
        if covariance:
            G += Sigma
        return G
    def predict_magnification(self, Xnew, kern=None, mean=True, covariance=True):
        """
        Predict the magnification factor as
        sqrt(det(G))
        for each point N in Xnew
        """
        G = self.predict_wishard_embedding(Xnew, kern, mean, covariance)
        from ..util.linalg import jitchol
        return np.array([np.sqrt(np.exp(2*np.sum(np.log(np.diag(jitchol(G[n, :, :])))))) for n in range(Xnew.shape[0])])
        #return np.array([np.sqrt(np.linalg.det(G[n, :, :])) for n in range(Xnew.shape[0])])
    def posterior_samples_f(self,X,size=10, full_cov=True):
        """
        Samples the posterior GP at the points X.
@ -405,8 +519,8 @@ class GP(Model):
    def plot(self, plot_limits=None, which_data_rows='all',
        which_data_ycols='all', fixed_inputs=[],
        levels=20, samples=0, fignum=None, ax=None, resolution=None,
-        plot_raw=False,
+        plot_raw=False, linecol=None,fillcol=None, Y_metadata=None,
-        linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx', predict_kw=None):
+        data_symbol='kx', predict_kw=None, plot_training_data=True):
        """
        Plot the posterior of the GP.
          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
@ -443,6 +557,8 @@ class GP(Model):
        :type Y_metadata: dict
        :param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx')
        :type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib.
        :param plot_training_data: whether or not to plot the training points
        :type plot_training_data: boolean
        """
        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
        from ..plotting.matplot_dep import models_plots
@ -455,7 +571,101 @@ class GP(Model):
                                     which_data_ycols, fixed_inputs,
                                     levels, samples, fignum, ax, resolution,
                                     plot_raw=plot_raw, Y_metadata=Y_metadata,
-                                     data_symbol=data_symbol, predict_kw=predict_kw, **kw)
+                                     data_symbol=data_symbol, predict_kw=predict_kw,
                                     plot_training_data=plot_training_data, **kw)
    def plot_data(self, which_data_rows='all',
        which_data_ycols='all', visible_dims=None,
        fignum=None, ax=None, data_symbol='kx'):
        """
        Plot the training data
          - For higher dimensions than two, use fixed_inputs to plot the data points with some of the inputs fixed.
        Can plot only part of the data
        using which_data_rows and which_data_ycols.
        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
        :type plot_limits: np.array
        :param which_data_rows: which of the training data to plot (default all)
        :type which_data_rows: 'all' or a slice object to slice model.X, model.Y
        :param which_data_ycols: when the data has several columns (independant outputs), only plot these
        :type which_data_ycols: 'all' or a list of integers
        :param visible_dims: an array specifying the input dimensions to plot (maximum two)
        :type visible_dims: a numpy array
        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
        :type resolution: int
        :param levels: number of levels to plot in a contour plot.
        :param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
        :type levels: int
        :param samples: the number of a posteriori samples to plot
        :type samples: int
        :param fignum: figure to plot on.
        :type fignum: figure number
        :param ax: axes to plot on.
        :type ax: axes handle
        :param linecol: color of line to plot [Tango.colorsHex['darkBlue']]
        :type linecol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib
        :param fillcol: color of fill [Tango.colorsHex['lightBlue']]
        :type fillcol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib
        :param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx')
        :type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib.
        """
        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
        from ..plotting.matplot_dep import models_plots
        kw = {}
        return models_plots.plot_data(self, which_data_rows,
                                     which_data_ycols, visible_dims,
                                     fignum, ax, data_symbol, **kw)
    def plot_fit_errorbars(self, which_data_rows='all',
            which_data_ycols='all', fixed_inputs=[], fignum=None, ax=None,
            linecol=None, data_symbol='kx', predict_kw=None, plot_training_data=True):
        """
        Plot the posterior error bars corresponding to the training data
          - For higher dimensions than two, use fixed_inputs to plot the data points with some of the inputs fixed.
        Can plot only part of the data
        using which_data_rows and which_data_ycols.
        :param which_data_rows: which of the training data to plot (default all)
        :type which_data_rows: 'all' or a slice object to slice model.X, model.Y
        :param which_data_ycols: when the data has several columns (independant outputs), only plot these
        :type which_data_rows: 'all' or a list of integers
        :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
        :type fixed_inputs: a list of tuples
        :param fignum: figure to plot on.
        :type fignum: figure number
        :param ax: axes to plot on.
        :type ax: axes handle
        :param plot_training_data: whether or not to plot the training points
        :type plot_training_data: boolean
        """
        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
        from ..plotting.matplot_dep import models_plots
        kw = {}
        return models_plots.plot_fit_errorbars(self, which_data_rows, which_data_ycols, fixed_inputs,
                                    fignum, ax, linecol, data_symbol,
                                    predict_kw, plot_training_data, **kw)
    def plot_magnification(self, labels=None, which_indices=None,
                resolution=50, ax=None, marker='o', s=40,
                fignum=None, legend=True,
                plot_limits=None,
                aspect='auto', updates=False, plot_inducing=True, kern=None, **kwargs):
        import sys
        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
        from ..plotting.matplot_dep import dim_reduction_plots
        return dim_reduction_plots.plot_magnification(self, labels, which_indices,
                resolution, ax, marker, s,
                fignum, plot_inducing, legend,
                plot_limits, aspect, updates, **kwargs)
    def input_sensitivity(self, summarize=True):
        """
--- a/GPy/core/mapping.py
+++ b/GPy/core/mapping.py
@ -32,7 +32,7 @@ class Bijective_mapping(Mapping):
    also back from f to X. The inverse mapping is called g().
    """
    def __init__(self, input_dim, output_dim, name='bijective_mapping'):
-        super(Bijective_apping, self).__init__(name=name)
+        super(Bijective_mapping, self).__init__(name=name)
    def g(self, f):
        """Inverse mapping from output domain of the function to the inputs."""
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@ -59,6 +59,8 @@ class SparseGP(GP):
        logger.info("Adding Z as parameter")
        self.link_parameter(self.Z, index=0)
        self.posterior = None
        self._predictive_variable = self.Z
    def has_uncertain_inputs(self):
        return isinstance(self.X, VariationalPosterior)
@ -125,7 +127,7 @@ class SparseGP(GP):
        if kern is None: kern = self.kern
        if not isinstance(Xnew, VariationalPosterior):
-            Kx = kern.K(self.Z, Xnew)
+            Kx = kern.K(self._predictive_variable, Xnew)
            mu = np.dot(Kx.T, self.posterior.woodbury_vector)
            if full_cov:
                Kxx = kern.K(Xnew)
@ -149,8 +151,8 @@ class SparseGP(GP):
            if self.mean_function is not None:
                mu += self.mean_function.f(Xnew)
        else:
-            psi0_star = kern.psi0(self.Z, Xnew)
+            psi0_star = kern.psi0(self._predictive_variable, Xnew)
-            psi1_star = kern.psi1(self.Z, Xnew)
+            psi1_star = kern.psi1(self._predictive_variable, Xnew)
            #psi2_star = kern.psi2(self.Z, Xnew) # Only possible if we get NxMxM psi2 out of the code.
            la = self.posterior.woodbury_vector
            mu = np.dot(psi1_star, la) # TODO: dimensions?
@ -163,7 +165,7 @@ class SparseGP(GP):
            for i in range(Xnew.shape[0]):
                _mu, _var = Xnew.mean.values[[i]], Xnew.variance.values[[i]]
-                psi2_star = kern.psi2(self.Z, NormalPosterior(_mu, _var))
+                psi2_star = kern.psi2(self._predictive_variable, NormalPosterior(_mu, _var))
                tmp = (psi2_star[:, :] - psi1_star[[i]].T.dot(psi1_star[[i]]))
                var_ = mdot(la.T, tmp, la)
--- a/GPy/core/verbose_optimization.py
+++ b/GPy/core/verbose_optimization.py
@ -24,7 +24,6 @@ class VerboseOptimization(object):
            self.model.add_observer(self, self.print_status)
            self.status = 'running'
            self.clear = clear_after_finish
            self.deltat = .2
            self.update()
@ -80,6 +79,7 @@ class VerboseOptimization(object):
    def __enter__(self):
        self.start = time.time()
        self._time = self.start
        return self
    def print_out(self, seconds):
@ -143,12 +143,12 @@ class VerboseOptimization(object):
    def print_status(self, me, which=None):
        self.update()
-        seconds = time.time()-self.start
+        t = time.time()
        seconds = t-self.start
        #sys.stdout.write(" "*len(self.message))
-        self.deltat += seconds
+        if t-self._time > .3 or seconds < .3:
        if self.deltat > .2:
            self.print_out(seconds)
-            self.deltat = 0
+            self._time = t
        self.iteration += 1
--- a/GPy/kern/_src/add.py
+++ b/GPy/kern/_src/add.py
@ -14,7 +14,7 @@ class Add(CombinationKernel):
    This kernel will take over the active dims of it's subkernels passed in.
    """
-    def __init__(self, subkerns, name='add'):
+    def __init__(self, subkerns, name='sum'):
        for i, kern in enumerate(subkerns[:]):
            if isinstance(kern, Add):
                del subkerns[i]
@ -72,6 +72,19 @@ class Add(CombinationKernel):
        [target.__iadd__(p.gradients_X_diag(dL_dKdiag, X)) for p in self.parts]
        return target
    def gradients_XX(self, dL_dK, X, X2):
        if X2 is None:
            target = np.zeros((X.shape[0], X.shape[0], X.shape[1]))
        else:
            target = np.zeros((X.shape[0], X2.shape[0], X.shape[1]))
        [target.__iadd__(p.gradients_XX(dL_dK, X, X2)) for p in self.parts]
        return target
    def gradients_XX_diag(self, dL_dKdiag, X):
        target = np.zeros(X.shape)
        [target.__iadd__(p.gradients_XX_diag(dL_dKdiag, X)) for p in self.parts]
        return target
    @Cache_this(limit=2, force_kwargs=['which_parts'])
    def psi0(self, Z, variational_posterior):
        return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts))
--- a/GPy/kern/_src/kern.py
+++ b/GPy/kern/_src/kern.py
@ -100,6 +100,10 @@ class Kern(Parameterized):
        return self.psicomp.psicomputations(self, Z, variational_posterior)[2]
    def gradients_X(self, dL_dK, X, X2):
        raise NotImplementedError
    def gradients_XX(self, dL_dK, X, X2):
        raise(NotImplementedError, "This is the second derivative of K wrt X and X2, and not implemented for this kernel")
    def gradients_XX_diag(self, dL_dKdiag, X):
        raise(NotImplementedError, "This is the diagonal of the second derivative of K wrt X and X2, and not implemented for this kernel")
    def gradients_X_diag(self, dL_dKdiag, X):
        raise NotImplementedError
@ -176,7 +180,7 @@ class Kern(Parameterized):
    def __iadd__(self, other):
        return self.add(other)
-    def add(self, other, name='add'):
+    def add(self, other, name='sum'):
        """
        Add another kernel to this one.
--- a/GPy/kern/_src/kernel_slice_operations.py
+++ b/GPy/kern/_src/kernel_slice_operations.py
@ -19,6 +19,8 @@ class KernCallsViaSlicerMeta(ParametersChangedMeta):
        put_clean(dct, 'update_gradients_full', _slice_update_gradients_full)
        put_clean(dct, 'update_gradients_diag', _slice_update_gradients_diag)
        put_clean(dct, 'gradients_X', _slice_gradients_X)
        put_clean(dct, 'gradients_XX', _slice_gradients_XX)
        put_clean(dct, 'gradients_XX_diag', _slice_gradients_X_diag)
        put_clean(dct, 'gradients_X_diag', _slice_gradients_X_diag)
        put_clean(dct, 'psi0', _slice_psi)
@ -30,9 +32,12 @@ class KernCallsViaSlicerMeta(ParametersChangedMeta):
        return super(KernCallsViaSlicerMeta, cls).__new__(cls, name, bases, dct)
 class _Slice_wrap(object):
-    def __init__(self, k, X, X2=None):
+    def __init__(self, k, X, X2=None, ret_shape=None):
        self.k = k
-        self.shape = X.shape
+        if ret_shape is None:
            self.shape = X.shape
        else:
            self.shape = ret_shape
        assert X.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X.shape={!s}".format(X.shape)
        if X2 is not None:
            assert X2.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X2.shape={!s}".format(X2.shape)
@ -54,7 +59,10 @@ class _Slice_wrap(object):
    def handle_return_array(self, return_val):
        if self.ret:
            ret = np.zeros(self.shape)
-            ret[:, self.k.active_dims] = return_val
+            if len(self.shape) == 2:
                ret[:, self.k.active_dims] = return_val
            elif len(self.shape) == 3:
                ret[:, :, self.k.active_dims] = return_val
            return ret
        return return_val
@ -98,6 +106,19 @@ def _slice_gradients_X(f):
        return ret
    return wrap
 def _slice_gradients_XX(f):
    @wraps(f)
    def wrap(self, dL_dK, X, X2=None):
        if X2 is None:
            N, M = X.shape[0], X.shape[0]
        else:
            N, M = X.shape[0], X2.shape[0]
        with _Slice_wrap(self, X, X2, ret_shape=(N, M, X.shape[1])) as s:
        #with _Slice_wrap(self, X, X2, ret_shape=None) as s:
            ret = s.handle_return_array(f(self, dL_dK, s.X, s.X2))
        return ret
    return wrap
 def _slice_gradients_X_diag(f):
    @wraps(f)
    def wrap(self, dL_dKdiag, X):
--- a/GPy/kern/_src/linear.py
+++ b/GPy/kern/_src/linear.py
@ -100,6 +100,12 @@ class Linear(Kern):
            #return (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
            return np.einsum('jq,q,ij->iq', X2, self.variances, dL_dK)
    def gradients_XX(self, dL_dK, X, X2=None):
        if X2 is None:
            return 2*np.ones(X.shape)*self.variances
        else:
            return np.ones(X.shape)*self.variances
    def gradients_X_diag(self, dL_dKdiag, X):
        return 2.*self.variances*dL_dKdiag[:,None]*X
--- a/GPy/kern/_src/rbf.py
+++ b/GPy/kern/_src/rbf.py
@ -31,6 +31,9 @@ class RBF(Stationary):
    def dK_dr(self, r):
        return -r*self.K_of_r(r)
    def dK2_drdr(self, r):
        return (r**2-1)*self.K_of_r(r)
    def __getstate__(self):
        dc = super(RBF, self).__getstate__()
        if self.useGPU:
--- a/GPy/kern/_src/static.py
+++ b/GPy/kern/_src/static.py
@ -24,6 +24,13 @@ class Static(Kern):
    def gradients_X_diag(self, dL_dKdiag, X):
        return np.zeros(X.shape)
    def gradients_XX(self, dL_dK, X, X2):
        if X2 is None:
            X2 = X
        return np.zeros((X.shape[0], X2.shape[0], X.shape[1]), dtype=np.float64)
    def gradients_XX_diag(self, dL_dKdiag, X):
        return np.zeros(X.shape)
    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        return np.zeros(Z.shape)
--- a/GPy/kern/_src/stationary.py
+++ b/GPy/kern/_src/stationary.py
@ -15,7 +15,7 @@ from ...util.caching import Cache_this
 try:
    from . import stationary_cython
 except ImportError:
-    print('warning in sationary: failed to import cython module: falling back to numpy')
+    print('warning in stationary: failed to import cython module: falling back to numpy')
    config.set('cython', 'working', 'false')
@ -78,6 +78,10 @@ class Stationary(Kern):
        raise NotImplementedError("implement derivative of the covariance function wrt r to use this class")
    @Cache_this(limit=20, ignore_args=())
    def dK2_drdr(self, r):
        raise NotImplementedError("implement second derivative of covariance wrt r to use this method")
    @Cache_this(limit=5, ignore_args=())
    def K(self, X, X2=None):
        """
        Kernel function applied on inputs X and X2.
@ -94,6 +98,11 @@ class Stationary(Kern):
        #a convenience function, so we can cache dK_dr
        return self.dK_dr(self._scaled_dist(X, X2))
    @Cache_this(limit=3, ignore_args=())
    def dK2_drdr_via_X(self, X, X2):
        #a convenience function, so we can cache dK_dr
        return self.dK2_drdr(self._scaled_dist(X, X2))
    def _unscaled_dist(self, X, X2=None):
        """
        Compute the Euclidean distance between each row of X and X2, or between
@ -201,6 +210,59 @@ class Stationary(Kern):
        else:
            return self._gradients_X_pure(dL_dK, X, X2)
    def gradients_XX(self, dL_dK, X, X2=None):
        """
        Given the derivative of the objective K(dL_dK), compute the second derivative of K wrt X and X2:
        ..math:
          \frac{\partial^2 K}{\partial X\partial X2}
        ..returns:
            dL2_dXdX2: NxMxQ, for X [NxQ] and X2[MxQ] (X2 is X if, X2 is None)
            Thus, we return the second derivative in X2.
        """
        # The off diagonals in Q are always zero, this should also be true for the Linear kernel...
        # According to multivariable chain rule, we can chain the second derivative through r:
        # d2K_dXdX2 = dK_dr*d2r_dXdX2 + d2K_drdr * dr_dX * dr_dX2:
        invdist = self._inv_dist(X, X2)
        invdist2 = invdist**2
        dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
        tmp1 = dL_dr * invdist
        dL_drdr = self.dK2_drdr_via_X(X, X2) * dL_dK
        tmp2 = dL_drdr * invdist2
        l2 = np.ones(X.shape[1]) * self.lengthscale**2
        if X2 is None:
            X2 = X
            tmp1 -= np.eye(X.shape[0])*self.variance
        else:
            tmp1[X==X2.T] -= self.variance
        grad = np.empty((X.shape[0], X2.shape[0], X.shape[1]), dtype=np.float64)
        #grad = np.empty(X.shape, dtype=np.float64)
        for q in range(self.input_dim):
            tmpdist2 = (X[:,[q]]-X2[:,[q]].T) ** 2
            grad[:, :, q] = ((tmp1*invdist2 - tmp2)*tmpdist2/l2[q] - tmp1)/l2[q]
            #grad[:, :, q] = ((tmp1*(((tmpdist2)*invdist2/l2[q])-1)) - (tmp2*(tmpdist2))/l2[q])/l2[q]
            #np.sum(((tmp1*(((tmpdist2)*invdist2/l2[q])-1)) - (tmp2*(tmpdist2))/l2[q])/l2[q], axis=1, out=grad[:,q])
            #np.sum( - (tmp2*(tmpdist**2)), axis=1, out=grad[:,q])
        return grad
    def gradients_XX_diag(self, dL_dK, X):
        """
        Given the derivative of the objective K(dL_dK), compute the second derivative of K wrt X and X2:
        ..math:
          \frac{\partial^2 K}{\partial X\partial X2}
        ..returns:
            dL2_dXdX2: NxMxQ, for X [NxQ] and X2[MxQ]
        """
        return np.ones(X.shape) * self.variance/self.lengthscale**2
    def _gradients_X_pure(self, dL_dK, X, X2=None):
        invdist = self._inv_dist(X, X2)
        dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
--- a/GPy/likelihoods/init.py
+++ b/GPy/likelihoods/init.py
@ -1,6 +1,6 @@
 from .bernoulli import Bernoulli
 from .exponential import Exponential
-from .gaussian import Gaussian
+from .gaussian import Gaussian, HeteroscedasticGaussian
 from .gamma import Gamma
 from .poisson import Poisson
 from .student_t import StudentT
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@ -48,6 +48,7 @@ class Gaussian(Likelihood):
    def betaY(self,Y,Y_metadata=None):
        #TODO: ~Ricardo this does not live here
        raise RuntimeError, "Please notify the GPy developers, this should not happen"
        return Y/self.gaussian_variance(Y_metadata)
    def gaussian_variance(self, Y_metadata=None):
@ -324,3 +325,30 @@ class Gaussian(Likelihood):
        dF_dv = np.ones_like(v)*(-0.5/lik_var)
        dF_dtheta = -0.5/lik_var + 0.5*(np.square(Y) + np.square(m) + v - 2*m*Y)/(lik_var**2)
        return F, dF_dmu, dF_dv, dF_dtheta.reshape(1, Y.shape[0], Y.shape[1])
 class HeteroscedasticGaussian(Gaussian):
    def __init__(self, Y_metadata, gp_link=None, variance=1., name='het_Gauss'):
        if gp_link is None:
            gp_link = link_functions.Identity()
        if not isinstance(gp_link, link_functions.Identity):
            print("Warning, Exact inference is not implemeted for non-identity link functions,\
            if you are not already, ensure Laplace inference_method is used")
        super(HeteroscedasticGaussian, self).__init__(gp_link, np.ones(Y_metadata['output_index'].shape[0])*variance, name)
    def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None):
        return dL_dKdiag[Y_metadata['output_index']][:,0]
    def gaussian_variance(self, Y_metadata=None):
        return self.variance[Y_metadata['output_index']]
    def predictive_values(self, mu, var, full_cov=False, Y_metadata=None):
        if full_cov:
            if var.ndim == 2:
                var += np.eye(var.shape[0])*self.variance
            if var.ndim == 3:
                var += np.atleast_3d(np.eye(var.shape[0])*self.variance)
        else:
            var += self.variance
        return mu, var
--- a/GPy/models/gp_heteroscedastic_regression.py
+++ b/GPy/models/gp_heteroscedastic_regression.py
@ -16,6 +16,8 @@ class GPHeteroscedasticRegression(GP):
    :param X: input observations
    :param Y: observed values
    :param kernel: a GPy kernel, defaults to rbf
    NB: This model does not make inference on the noise outside the training set
    """
    def __init__(self, X, Y, kernel=None, Y_metadata=None):
@ -30,10 +32,7 @@ class GPHeteroscedasticRegression(GP):
            kernel = kern.RBF(X.shape[1])
        #Likelihood
-        #likelihoods_list = [likelihoods.Gaussian(name="Gaussian_noise_%s" %j) for j in range(Ny)]
+        likelihood = likelihoods.HeteroscedasticGaussian(Y_metadata)
        noise_terms = np.unique(Y_metadata['output_index'].flatten())
        likelihoods_list = [likelihoods.Gaussian(name="Gaussian_noise_%s" %j) for j in noise_terms]
        likelihood = likelihoods.MixedNoise(likelihoods_list=likelihoods_list)
        super(GPHeteroscedasticRegression, self).__init__(X,Y,kernel,likelihood, Y_metadata=Y_metadata)
--- a/GPy/models/gplvm.py
+++ b/GPy/models/gplvm.py
@ -43,19 +43,19 @@ class GPLVM(GP):
        super(GPLVM, self).parameters_changed()
        self.X.gradient = self.kern.gradients_X(self.grad_dict['dL_dK'], self.X, None)
-    def jacobian(self,X):
+    #def jacobian(self,X):
-        J = np.zeros((X.shape[0],X.shape[1],self.output_dim))
+    #    J = np.zeros((X.shape[0],X.shape[1],self.output_dim))
-        for i in range(self.output_dim):
+    #    for i in range(self.output_dim):
-            J[:,:,i] = self.kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1], X, self.X)
+    #        J[:,:,i] = self.kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1], X, self.X)
-        return J
+    #    return J
-    def magnification(self,X):
+    #def magnification(self,X):
-        target=np.zeros(X.shape[0])
+    #    target=np.zeros(X.shape[0])
-        #J = np.zeros((X.shape[0],X.shape[1],self.output_dim))
+    #    #J = np.zeros((X.shape[0],X.shape[1],self.output_dim))
-        J = self.jacobian(X)
+    ##    J = self.jacobian(X)
-        for i in range(X.shape[0]):
+    #    for i in range(X.shape[0]):
-            target[i]=np.sqrt(np.linalg.det(np.dot(J[i,:,:],np.transpose(J[i,:,:]))))
+    #        target[i]=np.sqrt(np.linalg.det(np.dot(J[i,:,:],np.transpose(J[i,:,:]))))
-        return target
+    #    return target
    def plot(self):
        assert self.Y.shape[1] == 2, "too high dimensional to plot. Try plot_latent"
@ -81,6 +81,3 @@ class GPLVM(GP):
                resolution, ax, marker, s,
                fignum, False, legend,
                plot_limits, aspect, updates, **kwargs)
    def plot_magnification(self, *args, **kwargs):
        return util.plot_latent.plot_magnification(self, *args, **kwargs)
--- a/GPy/models/sparse_gp_minibatch.py
+++ b/GPy/models/sparse_gp_minibatch.py
@ -76,6 +76,7 @@ class SparseGPMiniBatch(SparseGP):
        logger.info("Adding Z as parameter")
        self.link_parameter(self.Z, index=0)
        self.posterior = None
        self._predictive_variable = self.Z
    def has_uncertain_inputs(self):
        return isinstance(self.X, VariationalPosterior)
--- a/GPy/plotting/matplot_dep/base_plots.py
+++ b/GPy/plotting/matplot_dep/base_plots.py
@ -47,6 +47,27 @@ def gpplot(x, mu, lower, upper, edgecol='#3300FF', fillcol='#33CCFF', ax=None, f
    return plots
 def gperrors(x, mu, lower, upper, edgecol=None, ax=None, fignum=None, **kwargs):
    _, axes = ax_default(fignum, ax)
    mu = mu.flatten()
    x = x.flatten()
    lower = lower.flatten()
    upper = upper.flatten()
    plots = []
    if edgecol is None:
        edgecol='#3300FF'
    if not 'alpha' in kwargs.keys():
        kwargs['alpha'] = 0.3
    plots.append(axes.errorbar(x,mu,yerr=np.vstack([mu-lower,upper-mu]),color=edgecol,**kwargs))
    plots[-1][0].remove()
    return plots
 def removeRightTicks(ax=None):
    ax = ax or pb.gca()
    for i, line in enumerate(ax.get_yticklines()):
--- a/GPy/plotting/matplot_dep/dim_reduction_plots.py
+++ b/GPy/plotting/matplot_dep/dim_reduction_plots.py
@ -9,7 +9,8 @@ import itertools
 try:
    import Tango
    from matplotlib.cm import get_cmap
-    import pylab as pb
+    from matplotlib import pyplot as pb
    from matplotlib import cm
 except:
    pass
@ -114,7 +115,7 @@ def plot_latent(model, labels=None, which_indices=None,
    # create a function which computes the shading of latent space according to the output variance
    def plot_function(x):
-        Xtest_full = np.zeros((x.shape[0], model.X.shape[1]))
+        Xtest_full = np.zeros((x.shape[0], X.shape[1]))
        Xtest_full[:, [input_1, input_2]] = x
        _, var = model.predict(Xtest_full, **predict_kwargs)
        var = var[:, :1]
@ -137,7 +138,7 @@ def plot_latent(model, labels=None, which_indices=None,
    view = ImshowController(ax, plot_function,
                            (xmin, ymin, xmax, ymax),
                            resolution, aspect=aspect, interpolation='bilinear',
-                            cmap=pb.cm.binary, **imshow_kwargs)
+                            cmap=cm.binary, **imshow_kwargs)
    # make sure labels are in order of input:
    labels = np.asarray(labels)
@ -192,17 +193,18 @@ def plot_latent(model, labels=None, which_indices=None,
    if updates:
        try:
-            ax.figure.canvas.show()
+            fig.canvas.show()
        except Exception as e:
            print("Could not invoke show: {}".format(e))
-        raw_input('Enter to continue')
+        #raw_input('Enter to continue')
-        view.deactivate()
+        return view
    return ax
 def plot_magnification(model, labels=None, which_indices=None,
                resolution=60, ax=None, marker='o', s=40,
                fignum=None, plot_inducing=False, legend=True,
-                aspect='auto', updates=False):
+                plot_limits=None,
                aspect='auto', updates=False, mean=True, covariance=True, kern=None):
    """
    :param labels: a np.array of size model.num_data containing labels for the points (can be number, strings, etc)
    :param resolution: the resolution of the grid on which to evaluate the predictive variance
@ -210,6 +212,8 @@ def plot_magnification(model, labels=None, which_indices=None,
    if ax is None:
        fig = pb.figure(num=fignum)
        ax = fig.add_subplot(111)
    else:
        fig = ax.figure
    Tango.reset()
    if labels is None:
@ -217,19 +221,90 @@ def plot_magnification(model, labels=None, which_indices=None,
    input_1, input_2 = most_significant_input_dimensions(model, which_indices)
-    # first, plot the output variance as a function of the latent space
+    #fethch the data points X that we'd like to plot
-    Xtest, xx, yy, xmin, xmax = x_frame2D(model.X[:, [input_1, input_2]], resolution=resolution)
+    X = model.X
-    Xtest_full = np.zeros((Xtest.shape[0], model.X.shape[1]))
+    if isinstance(X, VariationalPosterior):
        X = X.mean
    else:
        X = X
    if X.shape[0] > 1000:
        print("Warning: subsampling X, as it has more samples then 1000. X.shape={!s}".format(X.shape))
        subsample = np.random.choice(X.shape[0], size=1000, replace=False)
        X = X[subsample]
        labels = labels[subsample]
        #=======================================================================
        #     <<<WORK IN PROGRESS>>>
        #     <<<DO NOT DELETE>>>
        #     plt.close('all')
        #     fig, ax = plt.subplots(1,1)
        #     from GPy.plotting.matplot_dep.dim_reduction_plots import most_significant_input_dimensions
        #     import matplotlib.patches as mpatches
        #     i1, i2 = most_significant_input_dimensions(m, None)
        #     xmin, xmax = 100, -100
        #     ymin, ymax = 100, -100
        #     legend_handles = []
        #
        #     X = m.X.mean[:, [i1, i2]]
        #     X = m.X.variance[:, [i1, i2]]
        #
        #     xmin = X[:,0].min(); xmax = X[:,0].max()
        #     ymin = X[:,1].min(); ymax = X[:,1].max()
        #     range_ = [[xmin, xmax], [ymin, ymax]]
        #     ul = np.unique(labels)
        #
        #     for i, l in enumerate(ul):
        #         #cdict = dict(red  =[(0., colors[i][0], colors[i][0]), (1., colors[i][0], colors[i][0])],
        #         #             green=[(0., colors[i][0], colors[i][1]), (1., colors[i][1], colors[i][1])],
        #         #             blue =[(0., colors[i][0], colors[i][2]), (1., colors[i][2], colors[i][2])],
        #         #             alpha=[(0., 0., .0), (.5, .5, .5), (1., .5, .5)])
        #         #cmap = LinearSegmentedColormap('{}'.format(l), cdict)
        #         cmap = LinearSegmentedColormap.from_list('cmap_{}'.format(str(l)), [colors[i], colors[i]], 255)
        #         cmap._init()
        #         #alphas = .5*(1+scipy.special.erf(np.linspace(-2,2, cmap.N+3)))#np.log(np.linspace(np.exp(0), np.exp(1.), cmap.N+3))
        #         alphas = (scipy.special.erf(np.linspace(0,2.4, cmap.N+3)))#np.log(np.linspace(np.exp(0), np.exp(1.), cmap.N+3))
        #         cmap._lut[:, -1] = alphas
        #         print l
        #         x, y = X[labels==l].T
        #
        #         heatmap, xedges, yedges = np.histogram2d(x, y, bins=300, range=range_)
        #         #heatmap, xedges, yedges = np.histogram2d(x, y, bins=100)
        #
        #         im = ax.imshow(heatmap, extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]], cmap=cmap, aspect='auto', interpolation='nearest', label=str(l))
        #         legend_handles.append(mpatches.Patch(color=colors[i], label=l))
        #     ax.set_xlim(xmin, xmax)
        #     ax.set_ylim(ymin, ymax)
        #     plt.legend(legend_handles, [l.get_label() for l in legend_handles])
        #     plt.draw()
        #     plt.show()
        #=======================================================================
    #Create an IMshow controller that can re-plot the latent space shading at a good resolution
    if plot_limits is None:
        xmin, ymin = X[:, [input_1, input_2]].min(0)
        xmax, ymax = X[:, [input_1, input_2]].max(0)
        x_r, y_r = xmax-xmin, ymax-ymin
        xmin -= .1*x_r
        xmax += .1*x_r
        ymin -= .1*y_r
        ymax += .1*y_r
    else:
        try:
            xmin, xmax, ymin, ymax = plot_limits
        except (TypeError, ValueError) as e:
            raise e.__class__("Wrong plot limits: {} given -> need (xmin, xmax, ymin, ymax)".format(plot_limits))
    def plot_function(x):
        Xtest_full = np.zeros((x.shape[0], X.shape[1]))
        Xtest_full[:, [input_1, input_2]] = x
-        mf=model.magnification(Xtest_full)
+        mf = model.predict_magnification(Xtest_full, kern=kern, mean=mean, covariance=covariance)
        return mf
    view = ImshowController(ax, plot_function,
-                            tuple(model.X.min(0)[:, [input_1, input_2]]) + tuple(model.X.max(0)[:, [input_1, input_2]]),
+                            (xmin, ymin, xmax, ymax),
                            resolution, aspect=aspect, interpolation='bilinear',
-                            cmap=pb.cm.gray)
+                            cmap=cm.gray)
    # make sure labels are in order of input:
    ulabels = []
@ -245,17 +320,17 @@ def plot_magnification(model, labels=None, which_indices=None,
        elif type(ul) is np.int64:
            this_label = 'class %i' % ul
        else:
-            this_label = 'class %i' % i
+            this_label = unicode(ul)
        m = marker.next()
        index = np.nonzero(labels == ul)[0]
        if model.input_dim == 1:
-            x = model.X[index, input_1]
+            x = X[index, input_1]
            y = np.zeros(index.size)
        else:
-            x = model.X[index, input_1]
+            x = X[index, input_1]
-            y = model.X[index, input_2]
+            y = X[index, input_2]
-        ax.scatter(x, y, marker=m, s=s, color=Tango.nextMedium(), label=this_label)
+        ax.scatter(x, y, marker=m, s=s, c=Tango.nextMedium(), label=this_label, linewidth=.2, edgecolor='k', alpha=.9)
    ax.set_xlabel('latent dimension %i' % input_1)
    ax.set_ylabel('latent dimension %i' % input_2)
@ -263,19 +338,29 @@ def plot_magnification(model, labels=None, which_indices=None,
    if not np.all(labels == 1.) and legend:
        ax.legend(loc=0, numpoints=1)
-    ax.set_xlim(xmin[0], xmax[0])
+    ax.set_xlim((xmin, xmax))
-    ax.set_ylim(xmin[1], xmax[1])
+    ax.set_ylim((ymin, ymax))
    ax.grid(b=False) # remove the grid if present, it doesn't look good
    ax.set_aspect('auto') # set a nice aspect ratio
-    if plot_inducing:
+    if plot_inducing and hasattr(model, 'Z'):
-        ax.plot(model.Z[:, input_1], model.Z[:, input_2], '^w')
+        Z = model.Z
        ax.scatter(Z[:, input_1], Z[:, input_2], c='w', s=18, marker="^", edgecolor='k', linewidth=.3, alpha=.7)
    try:
        fig.canvas.draw()
        fig.tight_layout()
        fig.canvas.draw()
    except Exception as e:
        print("Could not invoke tight layout: {}".format(e))
        pass
    if updates:
-        fig.canvas.show()
+        try:
-        raw_input('Enter to continue')
+            fig.canvas.draw()
-
+            fig.canvas.show()
-    pb.title('Magnification Factor')
+        except Exception as e:
            print("Could not invoke show: {}".format(e))
        #raw_input('Enter to continue')
        return view
    return ax
@ -314,8 +399,8 @@ def plot_steepest_gradient_map(model, fignum=None, ax=None, which_indices=None,
            this_label = 'class %i' % i
        m = marker.next()
        index = np.nonzero(data_labels == ul)[0]
-        x = model.X[index, input_1]
+        x = X[index, input_1]
-        y = model.X[index, input_2]
+        y = X[index, input_2]
        ax.scatter(x, y, marker=m, s=data_s, color=Tango.nextMedium(), label=this_label)
    ax.set_xlabel('latent dimension %i' % input_1)
@ -323,7 +408,7 @@ def plot_steepest_gradient_map(model, fignum=None, ax=None, which_indices=None,
    controller = ImAnnotateController(ax,
                                  plot_function,
-                                  tuple(model.X.min(0)[:, significant_dims]) + tuple(model.X.max(0)[:, significant_dims]),
+                                  tuple(X.min(0)[:, significant_dims]) + tuple(X.max(0)[:, significant_dims]),
                                  resolution=resolution,
                                  aspect=aspect,
                                  cmap=get_cmap('jet'),
--- a/GPy/plotting/matplot_dep/latent_space_visualizations/controllers/axis_event_controller.py
+++ b/GPy/plotting/matplot_dep/latent_space_visualizations/controllers/axis_event_controller.py
@ -9,6 +9,9 @@ class AxisEventController(object):
    def __init__(self, ax):
        self.ax = ax
        self.activate()
    def __del__(self):
        self.deactivate()
        return self
    def deactivate(self):
        for cb_class in self.ax.callbacks.callbacks.values():
            for cb_num in cb_class.keys():
--- a/GPy/plotting/matplot_dep/models_plots.py
+++ b/GPy/plotting/matplot_dep/models_plots.py
@ -3,19 +3,79 @@
 import numpy as np
 from . import Tango
-from base_plots import gpplot, x_frame1D, x_frame2D
+from base_plots import gpplot, x_frame1D, x_frame2D,gperrors
 from ...models.gp_coregionalized_regression import GPCoregionalizedRegression
 from ...models.sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression
 from scipy import sparse
 from ...core.parameterization.variational import VariationalPosterior
 from matplotlib import pyplot as plt
 def plot_data(model, which_data_rows='all',
        which_data_ycols='all', visible_dims=None,
        fignum=None, ax=None, data_symbol='kx',mew=1.5):
    """
    Plot the training data
      - For higher dimensions than two, use fixed_inputs to plot the data points with some of the inputs fixed.
    Can plot only part of the data
    using which_data_rows and which_data_ycols.
    :param which_data_rows: which of the training data to plot (default all)
    :type which_data_rows: 'all' or a slice object to slice model.X, model.Y
    :param which_data_ycols: when the data has several columns (independant outputs), only plot these
    :type which_data_rows: 'all' or a list of integers
    :param visible_dims: an array specifying the input dimensions to plot (maximum two)
    :type visible_dims: a numpy array
    :param fignum: figure to plot on.
    :type fignum: figure number
    :param ax: axes to plot on.
    :type ax: axes handle
    """
    #deal with optional arguments
    if which_data_rows == 'all':
        which_data_rows = slice(None)
    if which_data_ycols == 'all':
        which_data_ycols = np.arange(model.output_dim)
    if ax is None:
        fig = plt.figure(num=fignum)
        ax = fig.add_subplot(111)
    #data
    X = model.X
    Y = model.Y
    #work out what the inputs are for plotting (1D or 2D)
    if visible_dims is None:
        visible_dims = np.arange(model.input_dim)
    assert visible_dims.size <= 2, "Visible inputs cannot be larger than two"
    free_dims = visible_dims
    plots = {}
    #one dimensional plotting
    if len(free_dims) == 1:
        for d in which_data_ycols:
            plots['dataplot'] = ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], data_symbol, mew=mew)
    #2D plotting
    elif len(free_dims) == 2:
        for d in which_data_ycols:
            plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40,
            Y[which_data_rows, d], cmap=plt.cm.jet, vmin=Y.min(), vmax=Y.max(), linewidth=0.)
    else:
        raise NotImplementedError("Cannot define a frame with more than two input dimensions")
    return plots
 def plot_fit(model, plot_limits=None, which_data_rows='all',
        which_data_ycols='all', fixed_inputs=[],
        levels=20, samples=0, fignum=None, ax=None, resolution=None,
        plot_raw=False,
        linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue'], Y_metadata=None, data_symbol='kx',
-        apply_link=False, samples_f=0, plot_uncertain_inputs=True, predict_kw=None):
+        apply_link=False, samples_f=0, plot_uncertain_inputs=True, predict_kw=None, plot_training_data=True):
    """
    Plot the posterior of the GP.
      - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
@ -43,7 +103,6 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
    :type fignum: figure number
    :param ax: axes to plot on.
    :type ax: axes handle
    :type output: integer (first output is 0)
    :param linecol: color of line to plot.
    :type linecol:
    :param fillcol: color of fill
@ -52,6 +111,8 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
    :type apply_link: boolean
    :param samples_f: the number of posteriori f samples to plot p(f*|y)
    :type samples_f: int
    :param plot_training_data: whether or not to plot the training points
    :type plot_training_data: boolean
    """
    #deal with optional arguments
    if which_data_rows == 'all':
@ -116,7 +177,11 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
        for d in which_data_ycols:
            plots['gpplot'] = gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], ax=ax, edgecol=linecol, fillcol=fillcol)
-            if not plot_raw: plots['dataplot'] = ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], data_symbol, mew=1.5)
+            #if not plot_raw: plots['dataplot'] = ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], data_symbol, mew=1.5)
            if not plot_raw and plot_training_data:
                plots['dataplot'] = plot_data(model=model, which_data_rows=which_data_rows,
                visible_dims=free_dims, data_symbol=data_symbol, mew=1.5, ax=ax, fignum=fignum)
        #optionally plot some samples
        if samples: #NOTE not tested with fixed_inputs
@ -196,7 +261,9 @@ def plot_fit(model, plot_limits=None, which_data_rows='all',
        for d in which_data_ycols:
            m_d = m[:,d].reshape(resolution, resolution).T
            plots['contour'] = ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=plt.cm.jet)
-            if not plot_raw: plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=plt.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
+            #if not plot_raw: plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=plt.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
            if not plot_raw and plot_training_data:
                plots['dataplot'] = ax.scatter(X[which_data_rows, free_dims[0]], X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=plt.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
        #set the limits of the plot to some sensible values
        ax.set_xlim(xmin[0], xmax[0])
@ -261,3 +328,83 @@ def fixed_inputs(model, non_fixed_inputs, fix_routine='median', as_list=True, X_
        return f_inputs
    else:
        return X
 def plot_fit_errorbars(model, which_data_rows='all',
        which_data_ycols='all', fixed_inputs=[],
        fignum=None, ax=None,
        linecol='red', data_symbol='kx',
        predict_kw=None, plot_training_data=True):
    """
    Plot the posterior error bars corresponding to the training data
      - For higher dimensions than two, use fixed_inputs to plot the data points with some of the inputs fixed.
    Can plot only part of the data
    using which_data_rows and which_data_ycols.
    :param which_data_rows: which of the training data to plot (default all)
    :type which_data_rows: 'all' or a slice object to slice model.X, model.Y
    :param which_data_ycols: when the data has several columns (independant outputs), only plot these
    :type which_data_rows: 'all' or a list of integers
    :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
    :type fixed_inputs: a list of tuples
    :param fignum: figure to plot on.
    :type fignum: figure number
    :param ax: axes to plot on.
    :type ax: axes handle
    :param plot_training_data: whether or not to plot the training points
    :type plot_training_data: boolean
    """
    #deal with optional arguments
    if which_data_rows == 'all':
        which_data_rows = slice(None)
    if which_data_ycols == 'all':
        which_data_ycols = np.arange(model.output_dim)
    if ax is None:
        fig = plt.figure(num=fignum)
        ax = fig.add_subplot(111)
    X = model.X
    Y = model.Y
    if predict_kw is None:
        predict_kw = {}
    #work out what the inputs are for plotting (1D or 2D)
    fixed_dims = np.array([i for i,v in fixed_inputs])
    free_dims = np.setdiff1d(np.arange(model.input_dim),fixed_dims)
    plots = {}
    #one dimensional plotting
    if len(free_dims) == 1:
        m, v = model.predict(X, full_cov=False, Y_metadata=model.Y_metadata, **predict_kw)
        fmu, fv = model._raw_predict(X, full_cov=False, **predict_kw)
        lower, upper = model.likelihood.predictive_quantiles(fmu, fv, (2.5, 97.5), Y_metadata=model.Y_metadata)
        for d in which_data_ycols:
            plots['gperrors'] = gperrors(X, m[:, d], lower[:, d], upper[:, d], edgecol=linecol, ax=ax, fignum=fignum )
            if plot_training_data:
                plots['dataplot'] = plot_data(model=model, which_data_rows=which_data_rows,
                visible_dims=free_dims, data_symbol=data_symbol, mew=1.5, ax=ax, fignum=fignum)
        #set the limits of the plot to some sensible values
        ymin, ymax = min(np.append(Y[which_data_rows, which_data_ycols].flatten(), lower)), max(np.append(Y[which_data_rows, which_data_ycols].flatten(), upper))
        ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
        ax.set_xlim(X[:,free_dims].min(), X[:,free_dims].max())
        ax.set_ylim(ymin, ymax)
    elif len(free_dims) == 2:
        raise NotImplementedError("Not implemented yet")
    else:
        raise NotImplementedError("Cannot define a frame with more than two input dimensions")
    return plots
--- a/GPy/util/diag.py
+++ b/GPy/util/diag.py
@ -46,6 +46,8 @@ def offdiag_view(A, offset=0):
    return as_strided(Af[(1+offset):], shape=(A.shape[0]-1, A.shape[1]), strides=(A.strides[0] + A.itemsize, A.strides[1]))
 def _diag_ufunc(A,b,offset,func):
    b = np.squeeze(b)
    assert b.ndim <= 1, "only implemented for one dimensional arrays"
    dA = view(A, offset); func(dA,b,dA)
    return A
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@ -7,48 +7,13 @@
 import numpy as np
 from scipy import linalg
-import types
+from scipy.linalg import lapack, blas
-import ctypes
+
 from ctypes import byref, c_char, c_int, c_double # TODO
 import scipy
 import warnings
 import os
 from .config import config
 import logging
 from . import linalg_cython
 _scipyversion = np.float64((scipy.__version__).split('.')[:2])
 _fix_dpotri_scipy_bug = True
 if np.all(_scipyversion >= np.array([0, 14])):
    from scipy.linalg import lapack
    _fix_dpotri_scipy_bug = False
 elif np.all(_scipyversion >= np.array([0, 12])):
    #import scipy.linalg.lapack.clapack as lapack
    from scipy.linalg import lapack
 else:
    from scipy.linalg.lapack import flapack as lapack
 if config.getboolean('anaconda', 'installed') and config.getboolean('anaconda', 'MKL'):
    try:
        anaconda_path = str(config.get('anaconda', 'location'))
        mkl_rt = ctypes.cdll.LoadLibrary(os.path.join(anaconda_path, 'DLLs', 'mkl_rt.dll'))
        dsyrk = mkl_rt.dsyrk
        dsyr = mkl_rt.dsyr
        _blas_available = True
        print('anaconda installed and mkl is loaded')
    except:
        _blas_available = False
 else:
    try:
        _blaslib = ctypes.cdll.LoadLibrary(np.core._dotblas.__file__) # @UndefinedVariable
        dsyrk = _blaslib.dsyrk_
        dsyr = _blaslib.dsyr_
        _blas_available = True
    except AttributeError as e:
        _blas_available = False
        warnings.warn("warning: caught this exception:" + str(e))
 def force_F_ordered_symmetric(A):
    """
    return a F ordered version of A, assuming A is symmetric
@ -169,9 +134,6 @@ def dpotri(A, lower=1):
    :returns: A inverse
    """
    if _fix_dpotri_scipy_bug:
        assert lower==1, "scipy linalg behaviour is very weird. please use lower, fortran ordered arrays"
        lower = 0
    A = force_F_ordered(A)
    R, info = lapack.dpotri(A, lower=lower) #needs to be zero here, seems to be a scipy bug
@ -300,8 +262,8 @@ def pca(Y, input_dim):
    Z = linalg.svd(Y - Y.mean(axis=0), full_matrices=False)
    [X, W] = [Z[0][:, 0:input_dim], np.dot(np.diag(Z[1]), Z[2]).T[:, 0:input_dim]]
    v = X.std(axis=0)
-    X /= v;
+    X /= v
-    W *= v;
+    W *= v
    return X, W.T
 def ppca(Y, Q, iterations=100):
@ -347,34 +309,15 @@ def tdot_blas(mat, out=None):
        out[:] = 0.0
    # # Call to DSYRK from BLAS
    # If already in Fortran order (rare), and has the right sorts of strides I
    # could avoid the copy. I also thought swapping to cblas API would allow use
    # of C order. However, I tried that and had errors with large matrices:
    # http://homepages.inf.ed.ac.uk/imurray2/code/tdot/tdot_broken.py
    mat = np.asfortranarray(mat)
-    TRANS = c_char('n'.encode('ascii'))
+    out = blas.dsyrk(alpha=1.0, a=mat, beta=0.0, c=out, overwrite_c=1,
-    N = c_int(mat.shape[0])
+                     trans=0, lower=0)
    K = c_int(mat.shape[1])
    LDA = c_int(mat.shape[0])
    UPLO = c_char('l'.encode('ascii'))
    ALPHA = c_double(1.0)
    A = mat.ctypes.data_as(ctypes.c_void_p)
    BETA = c_double(0.0)
    C = out.ctypes.data_as(ctypes.c_void_p)
    LDC = c_int(np.max(out.strides) // 8)
    dsyrk(byref(UPLO), byref(TRANS), byref(N), byref(K),
            byref(ALPHA), A, byref(LDA), byref(BETA), C, byref(LDC))
    symmetrify(out, upper=True)
    return np.ascontiguousarray(out)
 def tdot(*args, **kwargs):
-    if _blas_available:
+    return tdot_blas(*args, **kwargs)
        return tdot_blas(*args, **kwargs)
    else:
        return tdot_numpy(*args, **kwargs)
 def DSYR_blas(A, x, alpha=1.):
    """
@ -386,15 +329,7 @@ def DSYR_blas(A, x, alpha=1.):
    :param alpha: scalar
    """
-    N = c_int(A.shape[0])
+    A = blas.dsyr(lower=0, x=x, a=A, alpha=alpha, overwrite_a=True)
    LDA = c_int(A.shape[0])
    UPLO = c_char('l'.encode('ascii'))
    ALPHA = c_double(alpha)
    A_ = A.ctypes.data_as(ctypes.c_void_p)
    x_ = x.ctypes.data_as(ctypes.c_void_p)
    INCX = c_int(1)
    dsyr(byref(UPLO), byref(N), byref(ALPHA),
            x_, byref(INCX), A_, byref(LDA))
    symmetrify(A, upper=True)
 def DSYR_numpy(A, x, alpha=1.):
@ -411,10 +346,8 @@ def DSYR_numpy(A, x, alpha=1.):
 def DSYR(*args, **kwargs):
-    if _blas_available:
+    return DSYR_blas(*args, **kwargs)
-        return DSYR_blas(*args, **kwargs)
+
    else:
        return DSYR_numpy(*args, **kwargs)
 def symmetrify(A, upper=False):
    """