merge with upstream

Zhenwen Dai 2016-03-10 18:17:35 +00:00
commit ba74e29aee
115 changed files with 1178 additions and 531 deletions

View file

@@ -2,7 +2,7 @@
 [run]
 branch = True
 source = GPy
-omit = ./GPy/testing/*.py, travis_tests.py, setup.py, ./GPy/__version__.py
+omit = ./GPy/testing/*.py, travis_tests.py, setup.py, ./GPy/__version__.py, ./GPy/plotting/*
 [report]
 # Regexes for lines to exclude from consideration

View file

@@ -20,33 +20,19 @@ env:
   - PYTHON_VERSION=3.5
 before_install:
-  - export CONDA_CACHED=1
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]];
-    then export OS=Linux;
-    elif [[ "$TRAVIS_OS_NAME" == "osx" ]];
-    then export OS=MacOSX;
-    brew install pandoc;
-    else
-    echo "OS not supported yet";
-    exit 1; fi;
-  - if [[ $PYTHON_VERSION == "2.7" ]];
-    then export MINICONDA=Miniconda;
-    elif [[ $PYTHON_VERSION == 3* ]];
-    then export MINICONDA=Miniconda3;
-    else echo "Could not find python version";exit 1; fi;
-  - if [ ! -d $HOME/download/ ]; then mkdir $HOME/download/; fi;
-  - if [ ! -d $HOME/install/ ]; then mkdir $HOME/install/; fi;
-  - export MINICONDA_FILE=$MINICONDA-latest-$OS-x86_64-$PYTHON_VERSION
-  - export MINCONDA_CACHE_FILE=$HOME/download/$MINICONDA_FILE.sh
-  - export MINICONDA_INSTALL=$HOME/install/$MINICONDA_FILE
-  - if [ ! -f $MINCONDA_CACHE_FILE ]; then export CONDA_CACHED=0; wget http://repo.continuum.io/miniconda/$MINICONDA-latest-$OS-x86_64.sh -O $MINCONDA_CACHE_FILE; bash $MINCONDA_CACHE_FILE -b -p $MINICONDA_INSTALL; fi;
-  - export PATH="$MINICONDA_INSTALL/bin:$PATH";
+  - wget https://github.com/mzwiessele/travis_scripts/raw/master/download_miniconda.sh
+  - wget https://github.com/mzwiessele/travis_scripts/raw/master/install_retry.sh
+  - source download_miniconda.sh
+  - echo $PATH
 install:
-  - conda install --yes python=$PYTHON_VERSION numpy=1.9 scipy=0.16 nose pip six matplotlib sphinx;
+  - echo $PATH
+  - source install_retry.sh
   - pip install codecov
   - pip install pypandoc
+  - pip install git+git://github.com/BRML/climin.git
+  - pip install autograd
   - python setup.py develop
 script:
   - coverage run travis_tests.py

View file

@@ -1 +1 @@
-__version__ = "0.9.6"
+__version__ = "0.9.7"

View file

@@ -401,9 +401,9 @@ class GP(Model):
         var_jac = compute_cov_inner(self.posterior.woodbury_inv)
         return mean_jac, var_jac

-    def predict_wishard_embedding(self, Xnew, kern=None, mean=True, covariance=True):
+    def predict_wishart_embedding(self, Xnew, kern=None, mean=True, covariance=True):
         """
-        Predict the wishard embedding G of the GP. This is the density of the
+        Predict the wishart embedding G of the GP. This is the density of the
         input of the GP defined by the probabilistic function mapping f.

         G = J_mean.T*J_mean + output_dim*J_cov.
@@ -431,6 +431,10 @@ class GP(Model):
             G += Sigma
         return G

+    def predict_wishard_embedding(self, Xnew, kern=None, mean=True, covariance=True):
+        warnings.warn("Wrong naming, use predict_wishart_embedding instead. Will be removed in future versions!", DeprecationWarning)
+        return self.predict_wishart_embedding(Xnew, kern, mean, covariance)
+
     def predict_magnification(self, Xnew, kern=None, mean=True, covariance=True):
         """
         Predict the magnification factor as
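The rename above keeps old call sites working through a deprecation shim. A hedged usage sketch (assumes this GPy version; DeprecationWarnings are silenced by default, so the snippet surfaces them explicitly):

import warnings
import numpy as np
import GPy

X = np.random.randn(20, 2)
Y = np.sin(X[:, :1]) + np.random.randn(20, 1) * 0.05
m = GPy.models.GPRegression(X, Y)

G = m.predict_wishart_embedding(X[:5])            # new, correctly spelled name
with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter("always")
    G_old = m.predict_wishard_embedding(X[:5])    # old name still works...
    print(w[-1].category.__name__)                # ...but emits DeprecationWarning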

View file

@@ -3,7 +3,7 @@
 from .param import Param
 from .parameterized import Parameterized
-from paramz import transformations
+from . import transformations
 from paramz.core import lists_and_dicts, index_operations, observable_array, observable
 from paramz import ties_and_remappings, ObsAr

View file

@@ -44,7 +44,7 @@ class SparseGP(GP):
         #pick a sensible inference method
         if inference_method is None:
             if isinstance(likelihood, likelihoods.Gaussian):
-                inference_method = var_dtc.VarDTC(limit=1)
+                inference_method = var_dtc.VarDTC(limit=3)
             else:
                 #inference_method = ??
                 raise NotImplementedError("what to do what to do?")

View file

@@ -89,7 +89,7 @@ class SVGP(SparseGP):
         """
         Return a new batch of X and Y by taking a chunk of data from the complete X and Y
         """
-        i = self.slicer.next()
+        i = next(self.slicer)
         return self.X_all[i], self.Y_all[i]

     def stochastic_grad(self, parameters):
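The one-line change above is a Python 3 compatibility fix: generators lost their .next() method in Python 3, while the next() builtin works under both interpreters. A minimal standalone illustration (plain Python, not GPy code):

def batches():
    i = 0
    while True:
        yield i
        i += 1

slicer = batches()
print(next(slicer))       # 0 -- portable under Python 2 and 3
print(slicer.__next__())  # 1 -- the Python 3 spelling of the old slicer.next()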

View file

@@ -459,7 +459,7 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
     D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
     _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim)

-    k = kern.Linear(Q) + kern.White(Q, variance=1e-4)
+    k = kern.Linear(Q, ARD=True) + kern.White(Q, variance=1e-4)
     m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing, kernel=k, initx="PCA_concat", initz='permute', **kw)

     m['.*noise'] = [Y.var() / 40. for Y in Ylist]
@@ -479,7 +479,7 @@ def mrd_simulation_missing_data(optimize=True, verbose=True, plot=True, plot_sim
     D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
     _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)

-    k = kern.Linear(Q) + kern.White(Q, variance=1e-4)
+    k = kern.Linear(Q, ARD=True) + kern.White(Q, variance=1e-4)
     inanlist = []

     for Y in Ylist:

View file

@@ -22,7 +22,7 @@ class VarDTC(LatentFunctionInference):
     """
     const_jitter = 1e-8
-    def __init__(self, limit=1):
+    def __init__(self, limit=3):
         from paramz.caching import Cacher
         self.limit = limit
         self.get_trYYT = Cacher(self._get_trYYT, limit)
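This merge raises the default cache size from 1 to 3 throughout (see the many Cache_this/Cacher changes below). A hedged sketch of what the limit means, assuming paramz's Cacher keeps results for the `limit` most recently seen argument combinations and that inputs are paramz observables (GPy passes ObsAr arrays):

import numpy as np
from paramz import ObsAr
from paramz.caching import Cacher

def trYYT(Y):
    print('computing')               # visible only on a cache miss
    return np.sum(np.square(Y))

get_trYYT = Cacher(trYYT, limit=3)   # mirrors VarDTC.__init__ above
Y = ObsAr(np.random.randn(10, 2))
get_trYYT(Y)                         # computes
get_trYYT(Y)                         # expected to be served from the cache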

View file

@@ -21,7 +21,7 @@ class VarDTC_minibatch(LatentFunctionInference):
     """
     const_jitter = 1e-8
-    def __init__(self, batchsize=None, limit=1, mpi_comm=None):
+    def __init__(self, batchsize=None, limit=3, mpi_comm=None):
         self.batchsize = batchsize
         self.mpi_comm = mpi_comm

View file

@@ -1,5 +1,8 @@
-from paramz.optimization import stochastics, Optimizer
+from paramz.optimization import Optimizer
+from . import stochastics
 from paramz.optimization import *

 import sys
 sys.modules['GPy.inference.optimization.stochastics'] = stochastics
 sys.modules['GPy.inference.optimization.Optimizer'] = Optimizer

View file

@@ -0,0 +1,119 @@
+#===============================================================================
+# Copyright (c) 2015, Max Zwiessele
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of paramax nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#===============================================================================
+
+class StochasticStorage(object):
+    '''
+    This is a container for holding the stochastic parameters,
+    such as subset indices or step length and so on.
+
+    self.d has to be a list of lists:
+    [dimension indices, nan indices for those dimensions]
+    so that the minibatches can be used as efficiently as possible.
+    '''
+    def __init__(self, model):
+        """
+        Initialize this stochastic container using the given model
+        """
+
+    def do_stochastics(self):
+        """
+        Update the internal state to the next batch of the stochastic
+        descent algorithm.
+        """
+        pass
+
+    def reset(self):
+        """
+        Reset the state of this stochastics generator.
+        """
+
+class SparseGPMissing(StochasticStorage):
+    def __init__(self, model, batchsize=1):
+        """
+        Here we want to loop over all dimensions everytime.
+        Thus, we can just make sure the loop goes over self.d every
+        time. We will try to get batches which look the same together
+        which speeds up calculations significantly.
+        """
+        import numpy as np
+        self.Y = model.Y_normalized
+        bdict = {}
+        # For N > 1000 array2string default crops
+        opt = np.get_printoptions()
+        np.set_printoptions(threshold=np.inf)
+        for d in range(self.Y.shape[1]):
+            inan = np.isnan(self.Y)[:, d]
+            arr_str = np.array2string(inan, np.inf, 0, True, '', formatter={'bool': lambda x: '1' if x else '0'})
+            try:
+                bdict[arr_str][0].append(d)
+            except:
+                bdict[arr_str] = [[d], ~inan]
+        np.set_printoptions(**opt)
+        self.d = bdict.values()
+
+class SparseGPStochastics(StochasticStorage):
+    """
+    For the sparse gp we need to store the dimension we are in,
+    and the indices corresponding to those
+    """
+    def __init__(self, model, batchsize=1, missing_data=True):
+        self.batchsize = batchsize
+        self.output_dim = model.Y.shape[1]
+        self.Y = model.Y_normalized
+        self.missing_data = missing_data
+        self.reset()
+        self.do_stochastics()
+
+    def do_stochastics(self):
+        import numpy as np
+        if self.batchsize == 1:
+            self.current_dim = (self.current_dim + 1) % self.output_dim
+            self.d = [[[self.current_dim], np.isnan(self.Y[:, self.current_dim]) if self.missing_data else None]]
+        else:
+            self.d = np.random.choice(self.output_dim, size=self.batchsize, replace=False)
+            bdict = {}
+            if self.missing_data:
+                opt = np.get_printoptions()
+                np.set_printoptions(threshold=np.inf)
+                for d in self.d:
+                    inan = np.isnan(self.Y[:, d])
+                    arr_str = np.array2string(inan, np.inf, 0, True, '', formatter={'bool': lambda x: '1' if x else '0'})
+                    try:
+                        bdict[arr_str][0].append(d)
+                    except:
+                        bdict[arr_str] = [[d], ~inan]
+                np.set_printoptions(**opt)
+                self.d = bdict.values()
+            else:
+                self.d = [[self.d, None]]
+
+    def reset(self):
+        self.current_dim = -1
+        self.d = None
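The string-keyed grouping in the new file above batches output dimensions whose missing-data (NaN) patterns match, so identical masks are handled in one pass. A standalone sketch of the trick in plain numpy (simplified key construction, same grouping idea):

import numpy as np

Y = np.array([[1., np.nan, 2., np.nan],
              [3., 4.,     5., np.nan]])
bdict = {}
for d in range(Y.shape[1]):
    inan = np.isnan(Y[:, d])
    key = ''.join('1' if b else '0' for b in inan)  # stands in for array2string
    if key in bdict:
        bdict[key][0].append(d)
    else:
        bdict[key] = [[d], ~inan]
print(list(bdict.values()))  # columns 0 and 2 share the all-observed pattern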

View file

@@ -15,4 +15,4 @@
 # [plotting]
-# library = matplotlib # plotly
+# library = matplotlib # plotly, none

View file

@@ -10,7 +10,7 @@ from .src.add import Add
 from .src.prod import Prod
 from .src.rbf import RBF
 from .src.linear import Linear, LinearFull
-from .src.static import Bias, White, Fixed
+from .src.static import Bias, White, Fixed, WhiteHeteroscedastic
 from .src.brownian import Brownian
 from .src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine
 from .src.mlp import MLP

View file

@@ -19,8 +19,8 @@ class Add(CombinationKernel):
             if isinstance(kern, Add):
                 del subkerns[i]
                 for part in kern.parts[::-1]:
-                    kern.unlink_parameter(part)
-                    subkerns.insert(i, part)
+                    #kern.unlink_parameter(part)
+                    subkerns.insert(i, part.copy())
         super(Add, self).__init__(subkerns, name)
         self._exact_psicomp = self._check_exact_psicomp()
@@ -37,7 +37,7 @@ class Add(CombinationKernel):
         else:
             return False

-    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def K(self, X, X2=None, which_parts=None):
         """
         Add all kernels together.
@@ -51,7 +51,7 @@ class Add(CombinationKernel):
             which_parts = [which_parts]
         return reduce(np.add, (p.K(X, X2) for p in which_parts))

-    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def Kdiag(self, X, which_parts=None):
         if which_parts is None:
             which_parts = self.parts
@@ -98,17 +98,17 @@ class Add(CombinationKernel):
         [target.__iadd__(p.gradients_XX_diag(dL_dKdiag, X)) for p in self.parts]
         return target

-    @Cache_this(limit=1, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def psi0(self, Z, variational_posterior):
         if not self._exact_psicomp: return Kern.psi0(self,Z,variational_posterior)
         return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts))

-    @Cache_this(limit=1, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def psi1(self, Z, variational_posterior):
         if not self._exact_psicomp: return Kern.psi1(self,Z,variational_posterior)
         return reduce(np.add, (p.psi1(Z, variational_posterior) for p in self.parts))

-    @Cache_this(limit=1, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def psi2(self, Z, variational_posterior):
         if not self._exact_psicomp: return Kern.psi2(self,Z,variational_posterior)
         psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts))
@@ -144,7 +144,7 @@ class Add(CombinationKernel):
             raise NotImplementedError("psi2 cannot be computed for this kernel")
         return psi2

-    @Cache_this(limit=1, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def psi2n(self, Z, variational_posterior):
         if not self._exact_psicomp: return Kern.psi2n(self, Z, variational_posterior)
         psi2 = reduce(np.add, (p.psi2n(Z, variational_posterior) for p in self.parts))
@@ -241,16 +241,20 @@ class Add(CombinationKernel):
         [np.add(target_grads[i],grads[i],target_grads[i]) for i in range(len(grads))]
         return target_grads

-    def add(self, other):
-        if isinstance(other, Add):
-            other_params = other.parameters[:]
-            for p in other_params:
-                other.unlink_parameter(p)
-            self.link_parameters(*other_params)
-        else:
-            self.link_parameter(other)
-        self.input_dim, self._all_dims_active = self.get_input_dim_active_dims(self.parts)
-        return self
+    #def add(self, other):
+    #    parts = self.parts
+    #    if 0:#isinstance(other, Add):
+    #        #other_params = other.parameters[:]
+    #        for p in other.parts[:]:
+    #            other.unlink_parameter(p)
+    #        parts.extend(other.parts)
+    #        #self.link_parameters(*other_params)
+    #
+    #    else:
+    #        #self.link_parameter(other)
+    #        parts.append(other)
+    #    #self.input_dim, self._all_dims_active = self.get_input_dim_active_dims(parts)
+    #    return Add([p for p in parts], self.name)

     def input_sensitivity(self, summarize=True):
         if summarize:

View file

@@ -64,7 +64,7 @@ class EQ_ODE2(Kern):
         self.W = Param('W', W)
         self.link_parameters(self.lengthscale, self.C, self.B, self.W)

-    @Cache_this(limit=2)
+    @Cache_this(limit=3)
     def K(self, X, X2=None):
         #This way is not working, indexes are lost after using k._slice_X
         #index = np.asarray(X, dtype=np.int)

View file

@@ -49,10 +49,11 @@ class Kern(Parameterized):
         if active_dims is None:
             active_dims = np.arange(input_dim)

-        self.active_dims = active_dims
-        self._all_dims_active = np.atleast_1d(active_dims).astype(int)
-        assert self._all_dims_active.size == self.input_dim, "input_dim={} does not match len(active_dim)={}, _all_dims_active={}".format(self.input_dim, self._all_dims_active.size, self._all_dims_active)
+        self.active_dims = np.asarray(active_dims, np.int_)
+        self._all_dims_active = np.atleast_1d(self.active_dims).astype(int)
+        assert self.active_dims.size == self.input_dim, "input_dim={} does not match len(active_dim)={}".format(self.input_dim, self._all_dims_active.size)

         self._sliced_X = 0
         self.useGPU = self._support_GPU and useGPU
@@ -68,8 +69,11 @@ class Kern(Parameterized):
     def _effective_input_dim(self):
         return np.size(self._all_dims_active)

-    @Cache_this(limit=20)
+    @Cache_this(limit=3)
     def _slice_X(self, X):
-        return X[:, self._all_dims_active]
+        try:
+            return X[:, self._all_dims_active].astype('float')
+        except:
+            return X[:, self._all_dims_active]

     def K(self, X, X2):
@@ -319,10 +323,20 @@ class CombinationKernel(Kern):
         :param array-like extra_dims: if needed extra dimensions for the combination kernel to work on
         """
         assert all([isinstance(k, Kern) for k in kernels])
-        extra_dims = np.array(extra_dims, dtype=int)
-        input_dim, active_dims = self.get_input_dim_active_dims(kernels, extra_dims)
+        extra_dims = np.asarray(extra_dims, dtype=int)
+
+        active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int))
+        input_dim = active_dims.size
+        if extra_dims is not None:
+            input_dim += extra_dims.size
+
         # initialize the kernel with the full input_dim
         super(CombinationKernel, self).__init__(input_dim, active_dims, name)
+
+        effective_input_dim = reduce(max, (k._all_dims_active.max() for k in kernels)) + 1
+        self._all_dims_active = np.array(np.concatenate((np.arange(effective_input_dim), extra_dims if extra_dims is not None else [])), dtype=int)
+
         self.extra_dims = extra_dims
         self.link_parameters(*kernels)
@@ -330,16 +344,8 @@ class CombinationKernel(Kern):
     def parts(self):
         return self.parameters

-    def get_input_dim_active_dims(self, kernels, extra_dims = None):
-        self.active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int))
-        #_all_dims_active = np.array(np.concatenate((_all_dims_active, extra_dims if extra_dims is not None else [])), dtype=int)
-        input_dim = reduce(max, (k._all_dims_active.max() for k in kernels)) + 1
-        if extra_dims is not None:
-            input_dim += extra_dims.size
-        _all_dims_active = np.arange(input_dim)
-        return input_dim, _all_dims_active
+    def _set_all_dims_ative(self):
+        self._all_dims_active = np.atleast_1d(self.active_dims).astype(int)

     def input_sensitivity(self, summarize=True):
         """

View file

@@ -51,7 +51,7 @@ class Linear(Kern):
             self.link_parameter(self.variances)
         self.psicomp = PSICOMP_Linear()

-    @Cache_this(limit=2)
+    @Cache_this(limit=3)
     def K(self, X, X2=None):
         if self.ARD:
             if X2 is None:
@@ -62,7 +62,7 @@ class Linear(Kern):
         else:
             return self._dot_product(X, X2) * self.variances

-    @Cache_this(limit=1, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def _dot_product(self, X, X2=None):
         if X2 is None:
             return tdot(X)

View file

@@ -45,7 +45,7 @@ class MLP(Kern):
         self.link_parameters(self.variance, self.weight_variance, self.bias_variance)

-    @Cache_this(limit=20, ignore_args=())
+    @Cache_this(limit=3, ignore_args=())
     def K(self, X, X2=None):
         if X2 is None:
             X_denom = np.sqrt(self._comp_prod(X)+1.)
@@ -57,7 +57,7 @@ class MLP(Kern):
             XTX = self._comp_prod(X,X2)/X_denom[:,None]/X2_denom[None,:]
         return self.variance*four_over_tau*np.arcsin(XTX)

-    @Cache_this(limit=20, ignore_args=())
+    @Cache_this(limit=3, ignore_args=())
     def Kdiag(self, X):
         """Compute the diagonal of the covariance matrix for X."""
         X_prod = self._comp_prod(X)
@@ -88,14 +88,14 @@ class MLP(Kern):
         """Gradient of diagonal of covariance with respect to X"""
         return self._comp_grads_diag(dL_dKdiag, X)[3]

-    @Cache_this(limit=50, ignore_args=())
+    @Cache_this(limit=3, ignore_args=())
     def _comp_prod(self, X, X2=None):
         if X2 is None:
             return (np.square(X)*self.weight_variance).sum(axis=1)+self.bias_variance
         else:
             return (X*self.weight_variance).dot(X2.T)+self.bias_variance

-    @Cache_this(limit=20, ignore_args=(1,))
+    @Cache_this(limit=3, ignore_args=(1,))
     def _comp_grads(self, dL_dK, X, X2=None):
         var,w,b = self.variance, self.weight_variance, self.bias_variance
         K = self.K(X, X2)
@@ -130,7 +130,7 @@ class MLP(Kern):
         dX2 = common.T.dot(X)*w-((common*XTX).sum(axis=0)/(X2_prod+1.))[:,None]*X2*w
         return dvar, dw, db, dX, dX2

-    @Cache_this(limit=20, ignore_args=(1,))
+    @Cache_this(limit=3, ignore_args=(1,))
     def _comp_grads_diag(self, dL_dKdiag, X):
         var,w,b = self.variance, self.weight_variance, self.bias_variance
         K = self.Kdiag(X)

View file

@@ -5,32 +5,49 @@ import numpy as np
 from .kern import Kern
 from ...core.parameterization import Param
 from paramz.transformations import Logexp
+from paramz.caching import Cache_this

 class Poly(Kern):
     """
     Polynomial kernel
     """
-    def __init__(self, input_dim, variance=1., order=3., active_dims=None, name='poly'):
+    def __init__(self, input_dim, variance=1., scale=1., bias=1., order=3., active_dims=None, name='poly'):
         super(Poly, self).__init__(input_dim, active_dims, name)
         self.variance = Param('variance', variance, Logexp())
-        self.link_parameter(self.variance)
+        self.scale = Param('scale', scale, Logexp())
+        self.bias = Param('bias', bias, Logexp())
+        self.link_parameters(self.variance, self.scale, self.bias)
+        assert order >= 1, 'The order of the polynomial has to be at least 1.'
         self.order=order

-    def K(self, X, X2=None):
-        return (self._dot_product(X, X2) + 1.)**self.order * self.variance
-
-    def _dot_product(self, X, X2=None):
+    def K(self, X, X2=None):
+        _, _, B = self._AB(X, X2)
+        return B * self.variance
+
+    @Cache_this(limit=3)
+    def _AB(self, X, X2=None):
         if X2 is None:
-            return np.dot(X, X.T)
+            dot_prod = np.dot(X, X.T)
         else:
-            return np.dot(X, X2.T)
+            dot_prod = np.dot(X, X2.T)
+        A = (self.scale * dot_prod) + self.bias
+        B = A ** self.order
+        return dot_prod, A, B

     def Kdiag(self, X):
-        return self.variance*(np.square(X).sum(1) + 1.)**self.order
+        return self.K(X).diagonal() #self.variance*(np.square(X).sum(1) + 1.)**self.order

     def update_gradients_full(self, dL_dK, X, X2=None):
-        self.variance.gradient = np.sum(dL_dK * (self._dot_product(X, X2) + 1.)**self.order)
+        dot_prod, A, B = self._AB(X, X2)
+        dK_dA = self.variance * self.order * A ** (self.order-1.)
+        dL_dA = dL_dK * (dK_dA)
+        self.scale.gradient = (dL_dA * dot_prod).sum()
+        self.bias.gradient = dL_dA.sum()
+        self.variance.gradient = np.sum(dL_dK * B)
+        #import ipdb;ipdb.set_trace()

     def update_gradients_diag(self, dL_dKdiag, X):
         raise NotImplementedError
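The reworked Poly kernel computes K(X, X2) = variance * (scale * <x, x2> + bias) ** order, and the defaults scale=1, bias=1 reduce to the old (dot + 1)**order * variance. A quick numeric check of that equivalence in plain numpy:

import numpy as np

variance, scale, bias, order = 1., 1., 1., 3.
X = np.random.randn(4, 2)

dot_prod = X.dot(X.T)
K = variance * (scale * dot_prod + bias) ** order

# defaults reproduce the previous behaviour, so existing models are unchanged
assert np.allclose(K, (dot_prod + 1.) ** order * variance)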

View file

@@ -39,7 +39,7 @@ class Prod(CombinationKernel):
                 kernels.insert(i, part)
         super(Prod, self).__init__(kernels, name)

-    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def K(self, X, X2=None, which_parts=None):
         if which_parts is None:
             which_parts = self.parts
@@ -48,7 +48,7 @@ class Prod(CombinationKernel):
             which_parts = [which_parts]
         return reduce(np.multiply, (p.K(X, X2) for p in which_parts))

-    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def Kdiag(self, X, which_parts=None):
         if which_parts is None:
             which_parts = self.parts

View file

@@ -21,7 +21,7 @@ from .gaussherm import PSICOMP_GH
 from . import rbf_psi_comp, linear_psi_comp, ssrbf_psi_comp, sslinear_psi_comp

 class PSICOMP_RBF(PSICOMP):

-    @Cache_this(limit=10, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
         variance, lengthscale = kern.variance, kern.lengthscale
         if isinstance(variational_posterior, variational.NormalPosterior):
@@ -31,7 +31,7 @@ class PSICOMP_RBF(PSICOMP):
         else:
             raise ValueError("unknown distriubtion received for psi-statistics")

-    @Cache_this(limit=10, ignore_args=(0,2,3,4))
+    @Cache_this(limit=3, ignore_args=(0,2,3,4))
     def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         variance, lengthscale = kern.variance, kern.lengthscale
         if isinstance(variational_posterior, variational.NormalPosterior):
@@ -43,7 +43,7 @@ class PSICOMP_RBF(PSICOMP):

 class PSICOMP_Linear(PSICOMP):

-    @Cache_this(limit=10, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
         variances = kern.variances
         if isinstance(variational_posterior, variational.NormalPosterior):
@@ -53,7 +53,7 @@ class PSICOMP_Linear(PSICOMP):
         else:
             raise ValueError("unknown distriubtion received for psi-statistics")

-    @Cache_this(limit=10, ignore_args=(0,2,3,4))
+    @Cache_this(limit=3, ignore_args=(0,2,3,4))
     def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         variances = kern.variances
         if isinstance(variational_posterior, variational.NormalPosterior):

View file

@@ -27,7 +27,7 @@ class PSICOMP_GH(PSICOMP):
     def _setup_observers(self):
         pass

-    @Cache_this(limit=10, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def comp_K(self, Z, qX):
         if self.Xs is None or self.Xs.shape != qX.mean.shape:
             from paramz import ObsAr
@@ -38,7 +38,7 @@ class PSICOMP_GH(PSICOMP):
             self.Xs[i] = self.locs[i]*S_sq+mu
         return self.Xs

-    @Cache_this(limit=10, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def psicomputations(self, kern, Z, qX, return_psi2_n=False):
         mu, S = qX.mean.values, qX.variance.values
         N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
@@ -62,7 +62,7 @@ class PSICOMP_GH(PSICOMP):
             psi2 += self.weights[i]* tdot(Kfu.T)
         return psi0, psi1, psi2

-    @Cache_this(limit=10, ignore_args=(0, 2,3,4))
+    @Cache_this(limit=3, ignore_args=(0, 2,3,4))
     def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, qX):
         mu, S = qX.mean.values, qX.variance.values
         if self.cache_K: Xs = self.comp_K(Z, qX)

View file

@@ -132,5 +132,5 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
     return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS

-_psi1computations = Cacher(__psi1computations, limit=5)
-_psi2computations = Cacher(__psi2computations, limit=5)
+_psi1computations = Cacher(__psi1computations, limit=3)
+_psi2computations = Cacher(__psi2computations, limit=3)

View file

@@ -324,7 +324,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
         except:
             return self.fall_back.psicomputations(kern, Z, variational_posterior, return_psi2_n)

-    @Cache_this(limit=10, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def _psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
         """
         Z - MxQ
@@ -369,7 +369,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
         except:
             return self.fall_back.psiDerivativecomputations(kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)

-    @Cache_this(limit=10, ignore_args=(0,2,3,4))
+    @Cache_this(limit=3, ignore_args=(0,2,3,4))
     def _psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         # resolve the requirement of dL_dpsi2 to be symmetric
         if len(dL_dpsi2.shape)==2: dL_dpsi2 = (dL_dpsi2+dL_dpsi2.T)/2

View file

@@ -88,7 +88,7 @@ try:
             return psi0,psi1,psi2,psi2n

         from GPy.util.caching import Cacher
-        psicomputations = Cacher(_psicomputations, limit=1)
+        psicomputations = Cacher(_psicomputations, limit=3)

         def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
             ARD = (len(lengthscale)!=1)

View file

@@ -373,7 +373,7 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
     def get_dimensions(self, Z, variational_posterior):
         return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1]

-    @Cache_this(limit=1, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
         """
         Z - MxQ
@@ -407,7 +407,7 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
         else:
             return psi0, psi1_gpu.get(), psi2_gpu.get()

-    @Cache_this(limit=1, ignore_args=(0,2,3,4))
+    @Cache_this(limit=3, ignore_args=(0,2,3,4))
     def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         variance, lengthscale = kern.variance, kern.lengthscale
         from ....util.linalg_gpu import sum_axis

View file

@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
 # Copyright (c) 2015, Alex Grigorevskiy
-# Copyright (c) 2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 """
 The standard periodic kernel which mentioned in:
@@ -25,55 +24,55 @@ class StdPeriodic(Kern):
     .. math::

-       k(x,y) = \theta_1 \exp \left[ - \frac{1}{2} {}\sum_{i=1}^{input\_dim}
-       \left( \frac{\sin(\frac{\pi}{\lambda_i} (x_i - y_i) )}{l_i} \right)^2 \right] }
+       k(x,y) = \theta_1 \exp \left[ - \frac{1}{2} \sum_{i=1}^{input\_dim}
+       \left( \frac{\sin(\frac{\pi}{T_i} (x_i - y_i) )}{l_i} \right)^2 \right] }

     :param input_dim: the number of input dimensions
     :type input_dim: int
     :param variance: the variance :math:`\theta_1` in the formula above
     :type variance: float
-    :param wavelength: the vector of wavelengths :math:`\lambda_i`. If None then 1.0 is assumed.
-    :type wavelength: array or list of the appropriate size (or float if there is only one wavelength parameter)
+    :param period: the vector of periods :math:`\T_i`. If None then 1.0 is assumed.
+    :type period: array or list of the appropriate size (or float if there is only one period parameter)
     :param lengthscale: the vector of lengthscale :math:`\l_i`. If None then 1.0 is assumed.
     :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
-    :param ARD1: Auto Relevance Determination with respect to wavelength.
-        If equal to "False" one single wavelength parameter :math:`\lambda_i` for
+    :param ARD1: Auto Relevance Determination with respect to period.
+        If equal to "False" one single period parameter :math:`\T_i` for
         each dimension is assumed, otherwise there is one lengthscale
         parameter per dimension.
     :type ARD1: Boolean
     :param ARD2: Auto Relevance Determination with respect to lengthscale.
-        If equal to "False" one single wavelength parameter :math:`l_i` for
+        If equal to "False" one single lengthscale parameter :math:`l_i` for
         each dimension is assumed, otherwise there is one lengthscale
         parameter per dimension.
     :type ARD2: Boolean
     :param active_dims: indices of dimensions which are used in the computation of the kernel
-    :type wavelength: array or list of the appropriate size
+    :type active_dims: array or list of the appropriate size
     :param name: Name of the kernel for output
     :type String
     :param useGPU: whether of not use GPU
     :type Boolean
     """

-    def __init__(self, input_dim, variance=1., wavelength=None, lengthscale=None, ARD1=False, ARD2=False, active_dims=None, name='std_periodic',useGPU=False):
+    def __init__(self, input_dim, variance=1., period=None, lengthscale=None, ARD1=False, ARD2=False, active_dims=None, name='std_periodic',useGPU=False):
         super(StdPeriodic, self).__init__(input_dim, active_dims, name, useGPU=useGPU)
         self.input_dim = input_dim
-        self.ARD1 = ARD1 # correspond to wavelengths
+        self.ARD1 = ARD1 # correspond to periods
         self.ARD2 = ARD2 # correspond to lengthscales
         self.name = name

         if self.ARD1 == False:
-            if wavelength is not None:
-                wavelength = np.asarray(wavelength)
-                assert wavelength.size == 1, "Only one wavelength needed for non-ARD kernel"
+            if period is not None:
+                period = np.asarray(period)
+                assert period.size == 1, "Only one period needed for non-ARD kernel"
             else:
-                wavelength = np.ones(1)
+                period = np.ones(1)
         else:
-            if wavelength is not None:
-                wavelength = np.asarray(wavelength)
-                assert wavelength.size == input_dim, "bad number of wavelengths"
+            if period is not None:
+                period = np.asarray(period)
+                assert period.size == input_dim, "bad number of periods"
             else:
-                wavelength = np.ones(input_dim)
+                period = np.ones(input_dim)

         if self.ARD2 == False:
             if lengthscale is not None:
@@ -90,10 +89,10 @@ class StdPeriodic(Kern):
         self.variance = Param('variance', variance, Logexp())
         assert self.variance.size==1, "Variance size must be one"
-        self.wavelengths = Param('wavelengths', wavelength, Logexp())
-        self.lengthscales = Param('lengthscales', lengthscale, Logexp())
-        self.link_parameters(self.variance, self.wavelengths, self.lengthscales)
+        self.period = Param('period', period, Logexp())
+        self.lengthscale = Param('lengthscale', lengthscale, Logexp())
+        self.link_parameters(self.variance, self.period, self.lengthscale)

     def parameters_changed(self):
         """
@@ -111,8 +110,8 @@ class StdPeriodic(Kern):
         if X2 is None:
             X2 = X

-        base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths
-        exp_dist = np.exp( -0.5* np.sum( np.square( np.sin( base ) / self.lengthscales ), axis = -1 ) )
+        base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.period
+        exp_dist = np.exp( -0.5* np.sum( np.square( np.sin( base ) / self.lengthscale ), axis = -1 ) )

         return self.variance * exp_dist
@@ -128,33 +127,33 @@ class StdPeriodic(Kern):
         if X2 is None:
             X2 = X

-        base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths
+        base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.period

         sin_base = np.sin( base )
-        exp_dist = np.exp( -0.5* np.sum( np.square( sin_base / self.lengthscales ), axis = -1 ) )
+        exp_dist = np.exp( -0.5* np.sum( np.square( sin_base / self.lengthscale ), axis = -1 ) )

-        dwl = self.variance * (1.0/np.square(self.lengthscales)) * sin_base*np.cos(base) * (base / self.wavelengths)
-        dl = self.variance * np.square( sin_base) / np.power( self.lengthscales, 3)
+        dwl = self.variance * (1.0/np.square(self.lengthscale)) * sin_base*np.cos(base) * (base / self.period)
+        dl = self.variance * np.square( sin_base) / np.power( self.lengthscale, 3)

         self.variance.gradient = np.sum(exp_dist * dL_dK)
         #target[0] += np.sum( exp_dist * dL_dK)

-        if self.ARD1: # different wavelengths
-            self.wavelengths.gradient = (dwl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
-        else: # same wavelengths
-            self.wavelengths.gradient = np.sum(dwl.sum(-1) * exp_dist * dL_dK)
+        if self.ARD1: # different periods
+            self.period.gradient = (dwl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
+        else: # same period
+            self.period.gradient = np.sum(dwl.sum(-1) * exp_dist * dL_dK)

         if self.ARD2: # different lengthscales
-            self.lengthscales.gradient = (dl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
+            self.lengthscale.gradient = (dl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
         else: # same lengthscales
-            self.lengthscales.gradient = np.sum(dl.sum(-1) * exp_dist * dL_dK)
+            self.lengthscale.gradient = np.sum(dl.sum(-1) * exp_dist * dL_dK)

     def update_gradients_diag(self, dL_dKdiag, X):
         """derivative of the diagonal of the covariance matrix with respect to the parameters."""
         self.variance.gradient = np.sum(dL_dKdiag)
-        self.wavelengths.gradient = 0
-        self.lengthscales.gradient = 0
+        self.period.gradient = 0
+        self.lengthscale.gradient = 0

 #    def gradients_X(self, dL_dK, X, X2=None):
 #        """derivative of the covariance matrix with respect to X."""

View file

@@ -81,6 +81,52 @@ class White(Static):
     def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         self.variance.gradient = dL_dpsi0.sum()

+class WhiteHeteroscedastic(Static):
+    def __init__(self, input_dim, num_data, variance=1., active_dims=None, name='white_hetero'):
+        """
+        A heteroscedastic White kernel (nugget/noise).
+        It defines one variance (nugget) per input sample.
+
+        Prediction excludes any noise learnt by this Kernel, so be careful using this kernel.
+
+        You can plot the errors learnt by this kernel by something similar as:
+        plt.errorbar(m.X, m.Y, yerr=2*np.sqrt(m.kern.white.variance))
+        """
+        super(Static, self).__init__(input_dim, active_dims, name)
+        self.variance = Param('variance', np.ones(num_data) * variance, Logexp())
+        self.link_parameters(self.variance)
+
+    def Kdiag(self, X):
+        if X.shape[0] == self.variance.shape[0]:
+            # If the input has the same number of samples as
+            # the number of variances, we return the variances
+            return self.variance
+        return 0.
+
+    def K(self, X, X2=None):
+        if X2 is None and X.shape[0] == self.variance.shape[0]:
+            return np.eye(X.shape[0]) * self.variance
+        else:
+            return 0.
+
+    def psi2(self, Z, variational_posterior):
+        return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)
+
+    def psi2n(self, Z, variational_posterior):
+        return np.zeros((1, Z.shape[0], Z.shape[0]), dtype=np.float64)
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        if X2 is None:
+            self.variance.gradient = np.diagonal(dL_dK)
+        else:
+            self.variance.gradient = 0.
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        self.variance.gradient = dL_dKdiag
+
+    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        self.variance.gradient = dL_dpsi0
+
 class Bias(Static):
     def __init__(self, input_dim, variance=1., active_dims=None, name='bias'):
         super(Bias, self).__init__(input_dim, variance, active_dims, name)
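A hedged usage sketch for the new WhiteHeteroscedastic kernel (exported via GPy.kern in this merge; the attribute path m.kern.white_hetero follows from the default kernel name and is an assumption here):

import numpy as np
import GPy

N = 30
X = np.linspace(0, 1, N)[:, None]
Y = np.sin(6 * X) + np.random.randn(N, 1) * np.linspace(0.05, 0.5, N)[:, None]

# one noise variance per data point, on top of an RBF
k = GPy.kern.RBF(1) + GPy.kern.WhiteHeteroscedastic(1, num_data=N)
m = GPy.models.GPRegression(X, Y, k)
m.optimize()

# per-point error bars learnt by the kernel, as its docstring suggests
yerr = 2 * np.sqrt(m.kern.white_hetero.variance)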

View file

@@ -81,11 +81,11 @@ class Stationary(Kern):
     def dK_dr(self, r):
         raise NotImplementedError("implement derivative of the covariance function wrt r to use this class")

-    @Cache_this(limit=20, ignore_args=())
+    @Cache_this(limit=3, ignore_args=())
     def dK2_drdr(self, r):
         raise NotImplementedError("implement second derivative of covariance wrt r to use this method")

-    @Cache_this(limit=5, ignore_args=())
+    @Cache_this(limit=3, ignore_args=())
     def K(self, X, X2=None):
         """
         Kernel function applied on inputs X and X2.
@@ -99,6 +99,9 @@ class Stationary(Kern):
     @Cache_this(limit=3, ignore_args=())
     def dK_dr_via_X(self, X, X2):
+        """
+        compute the derivative of K wrt X going through X
+        """
         #a convenience function, so we can cache dK_dr
         return self.dK_dr(self._scaled_dist(X, X2))

View file

@@ -54,12 +54,12 @@ class TruncLinear(Kern):
         self.add_parameter(self.variances)
         self.add_parameter(self.delta)

-    @Cache_this(limit=2)
+    @Cache_this(limit=3)
     def K(self, X, X2=None):
         XX = self.variances*self._product(X, X2)
         return XX.sum(axis=-1)

-    @Cache_this(limit=2)
+    @Cache_this(limit=3)
     def _product(self, X, X2=None):
         if X2 is None:
             X2 = X
@@ -149,12 +149,12 @@ class TruncLinear_inf(Kern):
         self.add_parameter(self.variances)

-    # @Cache_this(limit=2)
+    # @Cache_this(limit=3)
     def K(self, X, X2=None):
         tmp = self._product(X, X2)
         return (self.variances*tmp).sum(axis=-1)

-    # @Cache_this(limit=2)
+    # @Cache_this(limit=3)
     def _product(self, X, X2=None):
         if X2 is None:
             X2 = X

View file

@@ -61,7 +61,7 @@ class BayesianGPLVM(SparseGP_MPI):
         else:
             from ..inference.latent_function_inference.var_dtc import VarDTC
             self.logger.debug("creating inference_method var_dtc")
-            inference_method = VarDTC(limit=1 if not missing_data else Y.shape[1])
+            inference_method = VarDTC(limit=3 if not missing_data else Y.shape[1])

         if isinstance(inference_method,VarDTC_minibatch):
             inference_method.mpi_comm = mpi_comm

View file

@@ -40,10 +40,11 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
             Z = np.random.permutation(X.copy())[:num_inducing]
         assert Z.shape[1] == X.shape[1]

-        if X_variance == False:
+        if X_variance is False:
             self.logger.info('no variance on X, activating sparse GPLVM')
             X = Param("latent space", X)
-        elif X_variance is None:
-            self.logger.info("initializing latent space variance ~ uniform(0,.1)")
-            X_variance = np.random.uniform(0,.1,X.shape)
+        else:
+            if X_variance is None:
+                self.logger.info("initializing latent space variance ~ uniform(0,.1)")
+                X_variance = np.random.uniform(0,.1,X.shape)
             self.variational_prior = NormalPrior()
@@ -61,7 +62,7 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
         if inference_method is None:
             from ..inference.latent_function_inference.var_dtc import VarDTC
             self.logger.debug("creating inference_method var_dtc")
-            inference_method = VarDTC(limit=1 if not missing_data else Y.shape[1])
+            inference_method = VarDTC(limit=3 if not missing_data else Y.shape[1])

         super(BayesianGPLVMMiniBatch,self).__init__(X, Y, Z, kernel, likelihood=likelihood,
                                                     name=name, inference_method=inference_method,
@@ -71,13 +72,13 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
         self.X = X
         self.link_parameter(self.X, 0)

-    def set_X_gradients(self, X, X_grad):
-        """Set the gradients of the posterior distribution of X in its specific form."""
-        X.mean.gradient, X.variance.gradient = X_grad
+    #def set_X_gradients(self, X, X_grad):
+    #    """Set the gradients of the posterior distribution of X in its specific form."""
+    #    X.mean.gradient, X.variance.gradient = X_grad

-    def get_X_gradients(self, X):
-        """Get the gradients of the posterior distribution of X in its specific form."""
-        return X.mean.gradient, X.variance.gradient
+    #def get_X_gradients(self, X):
+    #    """Get the gradients of the posterior distribution of X in its specific form."""
+    #    return X.mean.gradient, X.variance.gradient

     def _outer_values_update(self, full_values):
         """
@@ -106,7 +107,7 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
         super(BayesianGPLVMMiniBatch,self).parameters_changed()

         kl_fctr = self.kl_factr
-        if kl_fctr > 0:
+        if kl_fctr > 0 and self.has_uncertain_inputs():
             Xgrad = self.X.gradient.copy()
             self.X.gradient[:] = 0
             self.variational_prior.update_gradients_KL(self.X)
@@ -122,7 +123,7 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
             if self.missing_data or not self.stochastics:
                 self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)
-            elif self.stochastics:
+            else: #self.stochastics is given:
                 d = self.output_dim
                 self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)*self.stochastics.batchsize/d
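The 'X_variance == False' to 'X_variance is False' change in the first hunk above matters because X_variance may be a numpy array: comparing an array to False broadcasts elementwise instead of yielding one boolean. A plain-numpy illustration (not GPy code):

import numpy as np

X_variance = np.zeros((3, 2))
print(X_variance == False)   # elementwise boolean array; its truth value is ambiguous
print(X_variance is False)   # False: identity check, never broadcasts
if X_variance is False:      # safe: only triggers for the literal False
    pass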

View file

@@ -5,14 +5,14 @@ import numpy as np
 import itertools, logging

 from ..kern import Kern
-from GPy.core.parameterization.variational import NormalPrior
+from ..core.parameterization.variational import NormalPrior
 from ..core.parameterization import Param
 from paramz import ObsAr
 from ..inference.latent_function_inference.var_dtc import VarDTC
 from ..inference.latent_function_inference import InferenceMethodList
 from ..likelihoods import Gaussian
 from ..util.initialization import initialize_latent
-from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch
+from ..models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch

 class MRD(BayesianGPLVMMiniBatch):
     """
@@ -215,40 +215,6 @@ class MRD(BayesianGPLVMMiniBatch):
         Z = np.random.randn(self.num_inducing, self.input_dim) * X.var()
         return Z

-    def _handle_plotting(self, fignum, axes, plotf, sharex=False, sharey=False):
-        import matplotlib.pyplot as plt
-        if axes is None:
-            fig = plt.figure(num=fignum)
-        sharex_ax = None
-        sharey_ax = None
-        plots = []
-        for i, g in enumerate(self.bgplvms):
-            try:
-                if sharex:
-                    sharex_ax = ax # @UndefinedVariable
-                    sharex = False # dont set twice
-                if sharey:
-                    sharey_ax = ax # @UndefinedVariable
-                    sharey = False # dont set twice
-            except:
-                pass
-            if axes is None:
-                ax = fig.add_subplot(1, len(self.bgplvms), i + 1, sharex=sharex_ax, sharey=sharey_ax)
-            elif isinstance(axes, (tuple, list, np.ndarray)):
-                ax = axes[i]
-            else:
-                raise ValueError("Need one axes per latent dimension input_dim")
-            plots.append(plotf(i, g, ax))
-            if sharey_ax is not None:
-                plt.setp(ax.get_yticklabels(), visible=False)
-        plt.draw()
-        if axes is None:
-            try:
-                fig.tight_layout()
-            except:
-                pass
-        return plots

     def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None, Yindex=0):
         """
         Prediction for data set Yindex[default=0].
@@ -270,59 +236,50 @@ class MRD(BayesianGPLVMMiniBatch):
 #                                sharex=sharex, sharey=sharey)
 #        return fig

-    def plot_scales(self, fignum=None, ax=None, titles=None, sharex=False, sharey=True, *args, **kwargs):
+    def plot_scales(self, titles=None, fig_kwargs={}, **kwargs):
         """
-        TODO: Explain other parameters
+        Plot input sensitivity for all datasets, to see which input dimensions are
+        significant for which dataset.

         :param titles: titles for axes of datasets
+
+        kwargs go into plot_ARD for each kernel.
         """
+        from ..plotting import plotting_library as pl
         if titles is None:
             titles = [r'${}$'.format(name) for name in self.names]
-        ymax = reduce(max, [np.ceil(max(g.kern.input_sensitivity())) for g in self.bgplvms])
-        def plotf(i, g, ax):
-            #ax.set_ylim([0,ymax])
-            return g.kern.plot_ARD(ax=ax, title=titles[i], *args, **kwargs)
-        fig = self._handle_plotting(fignum, ax, plotf, sharex=sharex, sharey=sharey)
-        return fig
+        M = len(self.bgplvms)
+        fig = pl().figure(rows=1, cols=M, **fig_kwargs)
+        for c in range(M):
+            canvas = self.bgplvms[c].kern.plot_ARD(title=titles[c], figure=fig, col=c+1, **kwargs)
+        return canvas

     def plot_latent(self, labels=None, which_indices=None,
-                    resolution=50, ax=None, marker='o', s=40,
-                    fignum=None, plot_inducing=True, legend=True,
+                    resolution=60, legend=True,
                     plot_limits=None,
-                    aspect='auto', updates=False, predict_kwargs={}, imshow_kwargs={}):
+                    updates=False,
+                    kern=None, marker='<>^vsd',
+                    num_samples=1000, projection='2d',
+                    predict_kwargs={},
+                    scatter_kwargs=None, **imshow_kwargs):
         """
         see plotting.matplot_dep.dim_reduction_plots.plot_latent
         if predict_kwargs is None, will plot latent spaces for 0th dataset (and kernel), otherwise give
         predict_kwargs=dict(Yindex='index') for plotting only the latent space of dataset with 'index'.
         """
-        import sys
-        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
-        from matplotlib import pyplot as plt
-        from ..plotting.matplot_dep import dim_reduction_plots
-
+        from ..plotting.gpy_plot.latent_plots import plot_latent
         if "Yindex" not in predict_kwargs:
             predict_kwargs['Yindex'] = 0
         Yindex = predict_kwargs['Yindex']
-        if ax is None:
-            fig = plt.figure(num=fignum)
-            ax = fig.add_subplot(111)
-        else:
-            fig = ax.figure
         self.kern = self.bgplvms[Yindex].kern
         self.likelihood = self.bgplvms[Yindex].likelihood
-        plot = dim_reduction_plots.plot_latent(self, labels, which_indices,
-                                               resolution, ax, marker, s,
-                                               fignum, plot_inducing, legend,
-                                               plot_limits, aspect, updates, predict_kwargs, imshow_kwargs)
-        ax.set_title(self.bgplvms[Yindex].name)
-        try:
-            fig.tight_layout()
-        except:
-            pass
-        return plot
+        return plot_latent(self, labels, which_indices, resolution, legend, plot_limits, updates, kern, marker, num_samples, projection, scatter_kwargs)

     def __getstate__(self):
         state = super(MRD, self).__getstate__()

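A minimal usage sketch of the reworked MRD plotting API above (illustrative only, not part of the commit; the toy data and optimizer settings are assumptions):

import numpy as np
import GPy

# Two toy views sharing a common 1-D latent signal
t = np.linspace(0, 2 * np.pi, 100)[:, None]
Y1 = np.hstack([np.sin(t), np.cos(t)]) + 0.05 * np.random.randn(100, 2)
Y2 = np.hstack([np.sin(t), np.sin(2 * t), np.cos(t)]) + 0.05 * np.random.randn(100, 3)

m = GPy.models.MRD([Y1, Y2], input_dim=2, num_inducing=15)
m.optimize(messages=False, max_iters=100)

m.plot_scales()                               # one plot_ARD panel per dataset, in a single figure
m.plot_latent(predict_kwargs=dict(Yindex=1))  # latent space w.r.t. the second dataset's kernel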
View file

@@ -41,11 +41,12 @@ class SparseGPMiniBatch(SparseGP):
    def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
                 name='sparse gp', Y_metadata=None, normalizer=False,
                 missing_data=False, stochastic=False, batchsize=1):
+        self._update_stochastics = False

        # pick a sensible inference method
        if inference_method is None:
            if isinstance(likelihood, likelihoods.Gaussian):
-                inference_method = var_dtc.VarDTC(limit=1 if not missing_data else Y.shape[1])
+                inference_method = var_dtc.VarDTC(limit=3 if not missing_data else Y.shape[1])
            else:
                #inference_method = ??
                raise NotImplementedError("what to do what to do?")
@@ -74,6 +75,13 @@ class SparseGPMiniBatch(SparseGP):
        self.link_parameter(self.Z, index=0)
        self.posterior = None

+    def optimize(self, optimizer=None, start=None, **kwargs):
+        try:
+            self._update_stochastics = True
+            SparseGP.optimize(self, optimizer=optimizer, start=start, **kwargs)
+        finally:
+            self._update_stochastics = False

    def has_uncertain_inputs(self):
        return isinstance(self.X, VariationalPosterior)
@@ -226,16 +234,16 @@ class SparseGPMiniBatch(SparseGP):
        woodbury_inv = self.posterior._woodbury_inv
        woodbury_vector = self.posterior._woodbury_vector

-        if not self.stochastics:
-            m_f = lambda i: "Inference with missing_data: {: >7.2%}".format(float(i+1)/self.output_dim)
-            message = m_f(-1)
-            print(message, end=' ')
+        #if not self.stochastics:
+        #    m_f = lambda i: "Inference with missing_data: {: >7.2%}".format(float(i+1)/self.output_dim)
+        #    message = m_f(-1)
+        #    print(message, end=' ')

        for d, ninan in self.stochastics.d:
-            if not self.stochastics:
-                print(' '*(len(message)) + '\r', end=' ')
-                message = m_f(d)
-                print(message, end=' ')
+            #if not self.stochastics:
+            #    print(' '*(len(message)) + '\r', end=' ')
+            #    message = m_f(d)
+            #    print(message, end=' ')
            psi0ni = self.psi0[ninan]
            psi1ni = self.psi1[ninan]
@@ -262,8 +270,8 @@ class SparseGPMiniBatch(SparseGP):
            woodbury_vector[:, d] = posterior.woodbury_vector
            self._log_marginal_likelihood += log_marginal_likelihood

-        if not self.stochastics:
-            print('')
+        #if not self.stochastics:
+        #    print('')

        if self.posterior is None:
            self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
@@ -314,6 +322,8 @@ class SparseGPMiniBatch(SparseGP):
        if self.missing_data:
            self._outer_loop_for_missing_data()
        elif self.stochastics:
+            if self._update_stochastics:
+                self.stochastics.do_stochastics()
            self._outer_loop_without_missing_data()
        else:
            self.posterior, self._log_marginal_likelihood, self.grad_dict = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata)

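A sketch of what the new _update_stochastics flag buys (toy data assumed, not from the commit): minibatches are only re-drawn while optimize runs, so gradient checks outside optimization stay deterministic.

import numpy as np
import GPy

X = np.random.rand(50, 2)
Y = np.sin(X.sum(1))[:, None] + 0.05 * np.random.randn(50, 1)
Z = X[np.random.choice(50, 10, replace=False)].copy()

m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(
    X, Y, Z, GPy.kern.RBF(2), GPy.likelihoods.Gaussian(),
    stochastic=True, batchsize=2)

m.checkgrad()             # the batch stays fixed here, so the check is deterministic
m.optimize(max_iters=10)  # do_stochastics() re-draws the batch at each iteration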
View file

@@ -4,6 +4,7 @@

import sys
from .sparse_gp_regression import SparseGPRegression
+from ..core import Param

class SparseGPLVM(SparseGPRegression):
    """
@@ -21,7 +22,9 @@ class SparseGPLVM(SparseGPRegression):
        if X is None:
            from ..util.initialization import initialize_latent
            X, fracs = initialize_latent(init, input_dim, Y)
+        X = Param('latent space', X)
        SparseGPRegression.__init__(self, X, Y, kernel=kernel, num_inducing=num_inducing)
+        self.link_parameter(self.X, 0)

    def parameters_changed(self):
        super(SparseGPLVM, self).parameters_changed()

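With the change above, the latent coordinates become a linked Param, so they appear in the parameter hierarchy and receive gradients; a sketch under assumed toy data:

import numpy as np
import GPy

Y = np.random.randn(40, 5)
m = GPy.models.SparseGPLVM(Y, input_dim=2, num_inducing=10)
print(m)                  # 'latent space' is now listed as an optimizable parameter
m.optimize(max_iters=50)  # X is moved along with the kernel and inducing parameters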
View file

@@ -25,35 +25,16 @@ def change_plotting_library(lib):
            current_lib[0] = PlotlyPlots()
        if lib == 'none':
            current_lib[0] = None
+        inject_plotting()
        #===========================================================================
    except (ImportError, NameError):
        config.set('plotting', 'library', 'none')
-        raise
        import warnings
        warnings.warn(ImportWarning("You specified {} in your configuration, but it is not available. Install the newest version of {} for plotting".format(lib, lib)))

-from ..util.config import config, NoOptionError
-try:
-    lib = config.get('plotting', 'library')
-    change_plotting_library(lib)
-except NoOptionError:
-    print("No plotting library was specified in config file. \n{}".format(error_suggestion))
-
-def plotting_library():
-    if current_lib[0] is None:
-        raise RuntimeError("No plotting library was loaded. \n{}".format(error_suggestion))
-    return current_lib[0]
-
-def show(figure, **kwargs):
-    """
-    Show the specific plotting library figure, returned by
-    add_to_canvas().
-
-    kwargs are the plotting library specific options
-    for showing/drawing a figure.
-    """
-    return plotting_library().show_canvas(figure, **kwargs)
-
-if config.get('plotting', 'library') is not 'none':
+def inject_plotting():
+    if current_lib[0] is not None:
    # Inject the plots into classes here:

    # Already converted to new style:
@@ -104,3 +85,26 @@ if config.get('plotting', 'library') is not 'none':
        from ..inference.optimization import Optimizer
        Optimizer.plot = gpy_plot.inference_plots.plot_optimizer
        # Variational plot!

+def plotting_library():
+    if current_lib[0] is None:
+        raise RuntimeError("No plotting library was loaded. \n{}".format(error_suggestion))
+    return current_lib[0]
+
+def show(figure, **kwargs):
+    """
+    Show the specific plotting library figure, returned by
+    add_to_canvas().
+
+    kwargs are the plotting library specific options
+    for showing/drawing a figure.
+    """
+    return plotting_library().show_canvas(figure, **kwargs)
+
+from ..util.config import config, NoOptionError
+try:
+    lib = config.get('plotting', 'library')
+    change_plotting_library(lib)
+except NoOptionError:
+    print("No plotting library was specified in config file. \n{}".format(error_suggestion))

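How the reordered module is meant to be used, as a sketch (the model m and its data are assumed): change_plotting_library now calls inject_plotting() itself, and plotting_library/show are defined before the config-driven startup at the bottom of the file.

import GPy

GPy.plotting.change_plotting_library('matplotlib')  # selects the backend and injects plot methods
# m = GPy.models.GPRegression(X, Y); m.optimize()
# fig = m.plot()
# GPy.plotting.show(fig)  # backend-specific draw/show of the returned figure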
View file

@@ -91,7 +91,7 @@ def _plot_mean(self, canvas, helper_data, helper_prediction,
    if projection == '2d':
        update_not_existing_kwargs(kwargs, pl().defaults.meanplot_2d)  # @UndefinedVariable
        plots = dict(gpmean=[pl().contour(canvas, x[:,0], y[0,:],
-                                          mu.reshape(resolution, resolution),
+                                          mu.reshape(resolution, resolution).T,
                                          levels=levels, label=label, **kwargs)])
    elif projection == '3d':
        update_not_existing_kwargs(kwargs, pl().defaults.meanplot_3d)  # @UndefinedVariable

View file

@@ -33,7 +33,7 @@ from .. import Tango
from .plot_util import update_not_existing_kwargs, helper_for_plot_data
from ...kern.src.kern import Kern, CombinationKernel

-def plot_ARD(kernel, filtering=None, legend=False, **kwargs):
+def plot_ARD(kernel, filtering=None, legend=False, canvas=None, **kwargs):
    """
    If an ARD kernel is present, plot a bar representation using matplotlib
@@ -62,7 +62,11 @@ def plot_ARD(kernel, filtering=None, legend=False, **kwargs):
    bars = []
    kwargs = update_not_existing_kwargs(kwargs, pl().defaults.ard)

-    canvas, kwargs = pl().new_canvas(xlim=(-.5, kernel._effective_input_dim-.5), xlabel='input dimension', ylabel='sensitivity', **kwargs)
+    if canvas is None:
+        canvas, kwargs = pl().new_canvas(xlim=(-.5, kernel._effective_input_dim-.5), xlabel='input dimension', ylabel='sensitivity', **kwargs)

    for i in range(ard_params.shape[0]):
        if parts[i].name in filtering:
            c = Tango.nextMedium()

View file

@@ -147,6 +147,7 @@ def _plot_magnification(self, canvas, which_indices, Xgrid,
    def plot_function(x):
        Xtest_full = np.zeros((x.shape[0], Xgrid.shape[1]))
        Xtest_full[:, which_indices] = x
        mf = self.predict_magnification(Xtest_full, kern=kern, mean=mean, covariance=covariance)
        return mf.reshape(resolution, resolution).T
    imshow_kwargs = update_not_existing_kwargs(imshow_kwargs, pl().defaults.magnification)
@@ -163,7 +164,8 @@ def plot_magnification(self, labels=None, which_indices=None,
                       updates=False,
                       mean=True, covariance=True,
                       kern=None, num_samples=1000,
-                       scatter_kwargs=None, **imshow_kwargs):
+                       scatter_kwargs=None, plot_scatter=True,
+                       **imshow_kwargs):
    """
    Plot the magnification factor of the GP on the inputs. This is the
    density of the GP as a gray scale.
@@ -188,17 +190,20 @@ def plot_magnification(self, labels=None, which_indices=None,
    _, _, Xgrid, _, _, xmin, xmax, resolution = helper_for_plot_data(self, X, plot_limits, which_indices, None, resolution)
    canvas, imshow_kwargs = pl().new_canvas(xlim=(xmin[0], xmax[0]), ylim=(xmin[1], xmax[1]),
                                            xlabel='latent dimension %i' % input_1, ylabel='latent dimension %i' % input_2, **imshow_kwargs)
+    plots = {}
+    if legend and plot_scatter:
        if (labels is not None):
            legend = find_best_layout_for_subplots(len(np.unique(labels)))[1]
        else:
            labels = np.ones(self.num_data)
            legend = False
-    scatters = _plot_latent_scatter(canvas, X, which_indices, labels, marker, num_samples, projection='2d', **scatter_kwargs or {})
-    view = _plot_magnification(self, canvas, which_indices, Xgrid, xmin, xmax, resolution, updates, mean, covariance, kern, **imshow_kwargs)
-    retval = pl().add_to_canvas(canvas, dict(scatter=scatters, imshow=view),
+    if plot_scatter:
+        plots['scatters'] = _plot_latent_scatter(canvas, X, which_indices, labels, marker, num_samples, projection='2d', **scatter_kwargs or {})
+    plots['view'] = _plot_magnification(self, canvas, which_indices, Xgrid, xmin, xmax, resolution, updates, mean, covariance, kern, **imshow_kwargs)
+    retval = pl().add_to_canvas(canvas, plots,
                                legend=legend,
                                )
-    _wait_for_updates(view, updates)
+    _wait_for_updates(plots['view'], updates)
    return retval
@@ -211,7 +216,12 @@ def _plot_latent(self, canvas, which_indices, Xgrid,
    def plot_function(x):
        Xtest_full = np.zeros((x.shape[0], Xgrid.shape[1]))
        Xtest_full[:, which_indices] = x
-        mf = np.log(self.predict(Xtest_full, kern=kern)[1])
+        mf = self.predict(Xtest_full, kern=kern)[1]
+        if mf.shape[1]==self.output_dim:
+            mf = mf.sum(-1)
+        else:
+            mf *= self.output_dim
+        mf = np.log(mf)
        return mf.reshape(resolution, resolution).T
    imshow_kwargs = update_not_existing_kwargs(imshow_kwargs, pl().defaults.latent)
@@ -254,6 +264,7 @@ def plot_latent(self, labels=None, which_indices=None,
    _, _, Xgrid, _, _, xmin, xmax, resolution = helper_for_plot_data(self, X, plot_limits, which_indices, None, resolution)
    canvas, imshow_kwargs = pl().new_canvas(xlim=(xmin[0], xmax[0]), ylim=(xmin[1], xmax[1]),
                                            xlabel='latent dimension %i' % input_1, ylabel='latent dimension %i' % input_2, **imshow_kwargs)
+    if legend:
        if (labels is not None):
            legend = find_best_layout_for_subplots(len(np.unique(labels)))[1]
        else:

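A sketch of the new plot_scatter switch (a fitted latent-variable model m and a label array are assumed):

m.plot_magnification(plot_scatter=False)      # magnification surface only, no data scatter
m.plot_magnification(labels=training_labels)  # scatter colored by labels, legend laid out automatically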
View file

@@ -18,4 +18,4 @@

from .util import align_subplot_array, align_subplots, fewerXticks, removeRightTicks, removeUpperTicks
-from . import controllers
+from . import controllers, base_plots

View file

@@ -0,0 +1,265 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)

from matplotlib import pyplot as plt
import numpy as np
from functools import reduce  # reduce is not a builtin on Python 3; used in gradient_fill below
def ax_default(fignum, ax):
if ax is None:
fig = plt.figure(fignum)
ax = fig.add_subplot(111)
else:
fig = ax.figure
return fig, ax
def meanplot(x, mu, color='#3300FF', ax=None, fignum=None, linewidth=2,**kw):
_, axes = ax_default(fignum, ax)
return axes.plot(x,mu,color=color,linewidth=linewidth,**kw)
def gpplot(x, mu, lower, upper, edgecol='#3300FF', fillcol='#33CCFF', ax=None, fignum=None, **kwargs):
_, axes = ax_default(fignum, ax)
mu = mu.flatten()
x = x.flatten()
lower = lower.flatten()
upper = upper.flatten()
plots = []
#here's the mean
plots.append(meanplot(x, mu, edgecol, axes))
#here's the box
kwargs['linewidth']=0.5
if not 'alpha' in kwargs.keys():
kwargs['alpha'] = 0.3
plots.append(axes.fill(np.hstack((x,x[::-1])),np.hstack((upper,lower[::-1])),color=fillcol,**kwargs))
#this is the edge:
plots.append(meanplot(x, upper,color=edgecol, linewidth=0.2, ax=axes))
plots.append(meanplot(x, lower,color=edgecol, linewidth=0.2, ax=axes))
return plots
def gradient_fill(x, percentiles, ax=None, fignum=None, **kwargs):
_, ax = ax_default(fignum, ax)
plots = []
#here's the box
if 'linewidth' not in kwargs:
kwargs['linewidth'] = 0.5
if not 'alpha' in kwargs.keys():
kwargs['alpha'] = 1./(len(percentiles))
# pop where from kwargs
where = kwargs.pop('where') if 'where' in kwargs else None
# pop interpolate, which we actually do not do here!
if 'interpolate' in kwargs: kwargs.pop('interpolate')
def pairwise(inlist):
l = len(inlist)
for i in range(int(np.ceil(l/2.))):
yield inlist[:][i], inlist[:][(l-1)-i]
polycol = []
for y1, y2 in pairwise(percentiles):
import matplotlib.mlab as mlab
# Handle united data, such as dates
ax._process_unit_info(xdata=x, ydata=y1)
ax._process_unit_info(ydata=y2)
# Convert the arrays so we can work with them
from numpy import ma
x = ma.masked_invalid(ax.convert_xunits(x))
y1 = ma.masked_invalid(ax.convert_yunits(y1))
y2 = ma.masked_invalid(ax.convert_yunits(y2))
if y1.ndim == 0:
y1 = np.ones_like(x) * y1
if y2.ndim == 0:
y2 = np.ones_like(x) * y2
if where is None:
where = np.ones(len(x), np.bool)
else:
where = np.asarray(where, np.bool)
if not (x.shape == y1.shape == y2.shape == where.shape):
raise ValueError("Argument dimensions are incompatible")
mask = reduce(ma.mask_or, [ma.getmask(a) for a in (x, y1, y2)])
if mask is not ma.nomask:
where &= ~mask
polys = []
for ind0, ind1 in mlab.contiguous_regions(where):
xslice = x[ind0:ind1]
y1slice = y1[ind0:ind1]
y2slice = y2[ind0:ind1]
if not len(xslice):
continue
N = len(xslice)
X = np.zeros((2 * N + 2, 2), np.float)
# the purpose of the next two lines is for when y2 is a
# scalar like 0 and we want the fill to go all the way
# down to 0 even if none of the y1 sample points do
start = xslice[0], y2slice[0]
end = xslice[-1], y2slice[-1]
X[0] = start
X[N + 1] = end
X[1:N + 1, 0] = xslice
X[1:N + 1, 1] = y1slice
X[N + 2:, 0] = xslice[::-1]
X[N + 2:, 1] = y2slice[::-1]
polys.append(X)
polycol.extend(polys)
from matplotlib.collections import PolyCollection
plots.append(PolyCollection(polycol, **kwargs))
ax.add_collection(plots[-1], autolim=True)
ax.autoscale_view()
return plots
def gperrors(x, mu, lower, upper, edgecol=None, ax=None, fignum=None, **kwargs):
_, axes = ax_default(fignum, ax)
mu = mu.flatten()
x = x.flatten()
lower = lower.flatten()
upper = upper.flatten()
plots = []
if edgecol is None:
edgecol='#3300FF'
if not 'alpha' in kwargs.keys():
kwargs['alpha'] = 1.
if not 'lw' in kwargs.keys():
kwargs['lw'] = 1.
plots.append(axes.errorbar(x,mu,yerr=np.vstack([mu-lower,upper-mu]),color=edgecol,**kwargs))
plots[-1][0].remove()
return plots
def removeRightTicks(ax=None):
ax = ax or plt.gca()
for i, line in enumerate(ax.get_yticklines()):
if i%2 == 1: # odd indices
line.set_visible(False)
def removeUpperTicks(ax=None):
ax = ax or plt.gca()
for i, line in enumerate(ax.get_xticklines()):
if i%2 == 1: # odd indices
line.set_visible(False)
def fewerXticks(ax=None,divideby=2):
ax = ax or plt.gca()
ax.set_xticks(ax.get_xticks()[::divideby])
def align_subplots(N,M,xlim=None, ylim=None):
"""make all of the subplots have the same limits, turn off unnecessary ticks"""
#find sensible xlim,ylim
if xlim is None:
xlim = [np.inf,-np.inf]
for i in range(N*M):
plt.subplot(N,M,i+1)
xlim[0] = min(xlim[0],plt.xlim()[0])
xlim[1] = max(xlim[1],plt.xlim()[1])
if ylim is None:
ylim = [np.inf,-np.inf]
for i in range(N*M):
plt.subplot(N,M,i+1)
ylim[0] = min(ylim[0],plt.ylim()[0])
ylim[1] = max(ylim[1],plt.ylim()[1])
for i in range(N*M):
plt.subplot(N,M,i+1)
plt.xlim(xlim)
plt.ylim(ylim)
if (i)%M:
plt.yticks([])
else:
removeRightTicks()
if i<(M*(N-1)):
plt.xticks([])
else:
removeUpperTicks()
def align_subplot_array(axes,xlim=None, ylim=None):
"""
Make all of the axes in the array hae the same limits, turn off unnecessary ticks
use plt.subplots() to get an array of axes
"""
#find sensible xlim,ylim
if xlim is None:
xlim = [np.inf,-np.inf]
for ax in axes.flatten():
xlim[0] = min(xlim[0],ax.get_xlim()[0])
xlim[1] = max(xlim[1],ax.get_xlim()[1])
if ylim is None:
ylim = [np.inf,-np.inf]
for ax in axes.flatten():
ylim[0] = min(ylim[0],ax.get_ylim()[0])
ylim[1] = max(ylim[1],ax.get_ylim()[1])
N,M = axes.shape
for i,ax in enumerate(axes.flatten()):
ax.set_xlim(xlim)
ax.set_ylim(ylim)
if (i)%M:
ax.set_yticks([])
else:
removeRightTicks(ax)
if i<(M*(N-1)):
ax.set_xticks([])
else:
removeUpperTicks(ax)
def x_frame1D(X,plot_limits=None,resolution=None):
"""
Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits
"""
assert X.shape[1] ==1, "x_frame1D is defined for one-dimensional inputs"
if plot_limits is None:
from ...core.parameterization.variational import VariationalPosterior
if isinstance(X, VariationalPosterior):
xmin,xmax = X.mean.min(0),X.mean.max(0)
else:
xmin,xmax = X.min(0),X.max(0)
xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
elif len(plot_limits)==2:
xmin, xmax = plot_limits
else:
raise ValueError("Bad limits for plotting")
Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None]
return Xnew, xmin, xmax
def x_frame2D(X,plot_limits=None,resolution=None):
"""
Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits
"""
assert X.shape[1] ==2, "x_frame2D is defined for two-dimensional inputs"
if plot_limits is None:
xmin,xmax = X.min(0),X.max(0)
xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
elif len(plot_limits)==2:
xmin, xmax = plot_limits
else:
raise ValueError("Bad limits for plotting")
resolution = resolution or 50
xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution]
Xnew = np.vstack((xx.flatten(),yy.flatten())).T
return Xnew, xx, yy, xmin, xmax

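A sketch of how the relocated helpers compose (a fitted 1-D model m and its inputs X are assumed; the 2-sigma band is an illustrative choice):

import numpy as np
from GPy.plotting.matplot_dep.base_plots import gpplot, x_frame1D

# Xnew spans the data range with 20% padding on each side
Xnew, xmin, xmax = x_frame1D(X, resolution=200)
mu, var = m.predict(Xnew)
gpplot(Xnew, mu, mu - 2 * np.sqrt(var), mu + 2 * np.sqrt(var))  # mean line plus shaded band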
View file

@@ -42,10 +42,11 @@ class MatplotlibPlots(AbstractPlottingLibrary):
        super(MatplotlibPlots, self).__init__()
        self._defaults = defaults.__dict__

-    def figure(self, rows=1, cols=1, **kwargs):
-        fig = plt.figure(**kwargs)
+    def figure(self, rows=1, cols=1, gridspec_kwargs={}, tight_layout=True, **kwargs):
+        fig = plt.figure(tight_layout=tight_layout, **kwargs)
        fig.rows = rows
        fig.cols = cols
+        fig.gridspec = plt.GridSpec(rows, cols, **gridspec_kwargs)
        return fig

    def new_canvas(self, figure=None, row=1, col=1, projection='2d', xlabel=None, ylabel=None, zlabel=None, title=None, xlim=None, ylim=None, zlim=None, **kwargs):
@@ -56,7 +57,9 @@ class MatplotlibPlots(AbstractPlottingLibrary):
        if 'ax' in kwargs:
            ax = kwargs.pop('ax')
        else:
-            if 'num' in kwargs and 'figsize' in kwargs:
+            if figure is not None:
+                fig = figure
+            elif 'num' in kwargs and 'figsize' in kwargs:
                fig = self.figure(num=kwargs.pop('num'), figsize=kwargs.pop('figsize'))
            elif 'num' in kwargs:
                fig = self.figure(num=kwargs.pop('num'))
@@ -66,7 +69,7 @@ class MatplotlibPlots(AbstractPlottingLibrary):
                fig = self.figure()
        #if hasattr(fig, 'rows') and hasattr(fig, 'cols'):
-        ax = fig.add_subplot(fig.rows, fig.cols, (col,row), projection=projection)
+        ax = fig.add_subplot(fig.gridspec[row-1, col-1], projection=projection)

        if xlim is not None: ax.set_xlim(xlim)
        if ylim is not None: ax.set_ylim(ylim)
@@ -79,7 +82,7 @@ class MatplotlibPlots(AbstractPlottingLibrary):
        return ax, kwargs

    def add_to_canvas(self, ax, plots, legend=False, title=None, **kwargs):
-        ax.autoscale_view()
+        #ax.autoscale_view()
        fontdict=dict(family='sans-serif', weight='light', size=9)
        if legend is True:
            ax.legend(*ax.get_legend_handles_labels())
@@ -89,9 +92,7 @@ class MatplotlibPlots(AbstractPlottingLibrary):
        if title is not None: ax.figure.suptitle(title)
        return ax

-    def show_canvas(self, ax, tight_layout=False, **kwargs):
-        if tight_layout:
-            ax.figure.tight_layout()
+    def show_canvas(self, ax):
        ax.figure.canvas.draw()
        return ax.figure

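With figure() now carrying a GridSpec, multi-panel layouts address cells by row and column; a sketch:

from GPy.plotting import plotting_library as pl

fig = pl().figure(rows=1, cols=2)                   # fig.gridspec is a 1x2 GridSpec
ax1, _ = pl().new_canvas(figure=fig, row=1, col=1)  # placed at gridspec[0, 0]
ax2, _ = pl().new_canvas(figure=fig, row=1, col=2)  # placed at gridspec[0, 1]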
View file

@@ -254,7 +254,7 @@ class PlotlyPlots(AbstractPlottingLibrary):
                font=dict(color='white' if np.abs(var) > 0.8 else 'black', size=10),
                opacity=.5,
                showarrow=False,
-                hoverinfo='x'))
+                ))
        return imshow, annotations

    def annotation_heatmap_interact(self, ax, plot_function, extent, label=None, resolution=15, imshow_kwargs=None, **annotation_kwargs):

View file

@@ -1,14 +1,9 @@
-# Copyright (c) 2014, Max Zwiessele
+# Copyright (c) 2014, Max Zwiessele, GPy Authors
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-"""
-MaxZ
-"""

import unittest
import sys

def deepTest(reason):
    if reason:
        return lambda x: x
-    return unittest.skip("Not deep scanning, enable deepscan by adding 'deep' argument")
+    return unittest.skip("Not deep scanning, enable deepscan by adding 'deep' argument to unittest call")

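Usage sketch for the deepTest decorator (the import path and the argv convention are assumptions, not from the commit):

import sys
import unittest
from GPy.testing import deepTest  # assumed export location

@deepTest('deep' in sys.argv)  # a truthy reason runs the test; otherwise it is skipped
class ExpensiveScan(unittest.TestCase):
    def test_exhaustive_grid(self):
        self.assertTrue(True)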
[40 binary files changed (images); before/after previews not shown in this view.]
View file

@@ -1,109 +0,0 @@
'''
Created on 4 Sep 2015
@author: maxz
'''
import unittest
import numpy as np
import GPy
class BGPLVMTest(unittest.TestCase):
def setUp(self):
np.random.seed(12345)
X, W = np.random.normal(0,1,(100,6)), np.random.normal(0,1,(6,13))
Y = X.dot(W) + np.random.normal(0, .1, (X.shape[0], W.shape[1]))
self.inan = np.random.binomial(1, .1, Y.shape).astype(bool)
self.X, self.W, self.Y = X,W,Y
self.Q = 3
self.m_full = GPy.models.BayesianGPLVM(Y, self.Q)
def test_lik_comparisons_m1_s0(self):
# Test if the different implementations give the exact same likelihood as the full model.
# All of the following settings should give the same likelihood and gradients as the full model:
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=False)
m[:] = self.m_full[:]
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
assert(m.checkgrad())
def test_predict_missing_data(self):
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
m[:] = self.m_full[:]
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
self.assertRaises(NotImplementedError, m.predict, m.X, full_cov=True)
mu1, var1 = m.predict(m.X, full_cov=False)
mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=False)
np.testing.assert_allclose(mu1, mu2)
np.testing.assert_allclose(var1, var2)
mu1, var1 = m.predict(m.X.mean, full_cov=True)
mu2, var2 = self.m_full.predict(self.m_full.X.mean, full_cov=True)
np.testing.assert_allclose(mu1, mu2)
np.testing.assert_allclose(var1[:,:,0], var2)
mu1, var1 = m.predict(m.X.mean, full_cov=False)
mu2, var2 = self.m_full.predict(self.m_full.X.mean, full_cov=False)
np.testing.assert_allclose(mu1, mu2)
np.testing.assert_allclose(var1[:,[0]], var2)
def test_lik_comparisons_m0_s0(self):
# Test if the different implementations give the exact same likelihood as the full model.
# All of the following settings should give the same likelihood and gradients as the full model:
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=False)
m[:] = self.m_full[:]
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
assert(m.checkgrad())
def test_lik_comparisons_m1_s1(self):
# Test if the different implementations give the exact same likelihood as the full model.
# All of the following settings should give the same likelihood and gradients as the full model:
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
m[:] = self.m_full[:]
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
assert(m.checkgrad())
def test_lik_comparisons_m0_s1(self):
# Test if the different implementations give the exact same likelihood as the full model.
# All of the following settings should give the same likelihood and gradients as the full model:
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=self.Y.shape[1])
m[:] = self.m_full[:]
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
assert(m.checkgrad())
def test_gradients_missingdata(self):
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=False, batchsize=self.Y.shape[1])
assert(m.checkgrad())
def test_gradients_missingdata_stochastics(self):
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=1)
assert(m.checkgrad())
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=4)
assert(m.checkgrad())
def test_gradients_stochastics(self):
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=1)
assert(m.checkgrad())
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=4)
assert(m.checkgrad())
def test_predict(self):
# Test if the different implementations give the exact same likelihood as the full model.
# All of the following settings should give the same likelihood and gradients as the full model:
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
m[:] = self.m_full[:]
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
assert(m.checkgrad())
if __name__ == "__main__":
#import sys;sys.argv = ['', 'Test.testName']
unittest.main()

View file

@@ -324,8 +324,18 @@ class KernelGradientTestsContinuous(unittest.TestCase):
        k.randomize()
        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))

+    def test_Poly(self):
+        k = GPy.kern.Poly(self.D, order=5)
+        k.randomize()
+        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+
+    def test_WhiteHeteroscedastic(self):
+        k = GPy.kern.WhiteHeteroscedastic(self.D, self.X.shape[0])
+        k.randomize()
+        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+
    def test_standard_periodic(self):
-        k = GPy.kern.StdPeriodic(self.D, self.D-1)
+        k = GPy.kern.StdPeriodic(self.D)
        k.randomize()
        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
@@ -334,11 +344,14 @@ class KernelTestsMiscellaneous(unittest.TestCase):
        N, D = 100, 10
        self.X = np.linspace(-np.pi, +np.pi, N)[:,None] * np.random.uniform(-10,10,D)
        self.rbf = GPy.kern.RBF(2, active_dims=np.arange(0,4,2))
+        self.rbf.randomize()
        self.linear = GPy.kern.Linear(2, active_dims=(3,9))
+        self.linear.randomize()
        self.matern = GPy.kern.Matern32(3, active_dims=np.array([1,7,9]))
+        self.matern.randomize()
        self.sumkern = self.rbf + self.linear
        self.sumkern += self.matern
-        self.sumkern.randomize()
+        #self.sumkern.randomize()

    def test_which_parts(self):
        self.assertTrue(np.allclose(self.sumkern.K(self.X, which_parts=[self.linear, self.matern]), self.linear.K(self.X)+self.matern.K(self.X)))
@@ -348,6 +361,21 @@ class KernelTestsMiscellaneous(unittest.TestCase):
    def test_active_dims(self):
        np.testing.assert_array_equal(self.sumkern.active_dims, [0,1,2,3,7,9])
        np.testing.assert_array_equal(self.sumkern._all_dims_active, range(10))
+        tmp = self.linear+self.rbf
+        np.testing.assert_array_equal(tmp.active_dims, [0,2,3,9])
+        np.testing.assert_array_equal(tmp._all_dims_active, range(10))
+        tmp = self.matern+self.rbf
+        np.testing.assert_array_equal(tmp.active_dims, [0,1,2,7,9])
+        np.testing.assert_array_equal(tmp._all_dims_active, range(10))
+        tmp = self.matern+self.rbf*self.linear
+        np.testing.assert_array_equal(tmp.active_dims, [0,1,2,3,7,9])
+        np.testing.assert_array_equal(tmp._all_dims_active, range(10))
+        tmp = self.matern+self.rbf+self.linear
+        np.testing.assert_array_equal(tmp.active_dims, [0,1,2,3,7,9])
+        np.testing.assert_array_equal(tmp._all_dims_active, range(10))
+        tmp = self.matern*self.rbf*self.linear
+        np.testing.assert_array_equal(tmp.active_dims, [0,1,2,3,7,9])
+        np.testing.assert_array_equal(tmp._all_dims_active, range(10))

class KernelTestsNonContinuous(unittest.TestCase):
    def setUp(self):

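What the extended test_active_dims assertions pin down, as a standalone sketch:

import numpy as np
import GPy

rbf = GPy.kern.RBF(2, active_dims=np.arange(0, 4, 2))  # dims 0 and 2
linear = GPy.kern.Linear(2, active_dims=(3, 9))        # dims 3 and 9
k = rbf + linear
print(k.active_dims)       # union of the parts' dims: [0 2 3 9]
print(k._all_dims_active)  # [0 1 2 ... 9]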
View file

@@ -0,0 +1,226 @@
'''
Created on 4 Sep 2015
@author: maxz
'''
import unittest
import numpy as np
import GPy
class BGPLVMTest(unittest.TestCase):
def setUp(self):
np.random.seed(12345)
X, W = np.random.normal(0,1,(100,6)), np.random.normal(0,1,(6,13))
Y = X.dot(W) + np.random.normal(0, .1, (X.shape[0], W.shape[1]))
self.inan = np.random.binomial(1, .1, Y.shape).astype(bool)
self.X, self.W, self.Y = X,W,Y
self.Q = 3
self.m_full = GPy.models.BayesianGPLVM(Y, self.Q)
def test_lik_comparisons_m1_s0(self):
# Test if the different implementations give the exact same likelihood as the full model.
# All of the following settings should give the same likelihood and gradients as the full model:
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=False)
m[:] = self.m_full[:]
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
assert(m.checkgrad())
def test_predict_missing_data(self):
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
m[:] = self.m_full[:]
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
self.assertRaises(NotImplementedError, m.predict, m.X, full_cov=True)
mu1, var1 = m.predict(m.X, full_cov=False)
mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=False)
np.testing.assert_allclose(mu1, mu2)
np.testing.assert_allclose(var1, var2)
mu1, var1 = m.predict(m.X.mean, full_cov=True)
mu2, var2 = self.m_full.predict(self.m_full.X.mean, full_cov=True)
np.testing.assert_allclose(mu1, mu2)
np.testing.assert_allclose(var1[:,:,0], var2)
mu1, var1 = m.predict(m.X.mean, full_cov=False)
mu2, var2 = self.m_full.predict(self.m_full.X.mean, full_cov=False)
np.testing.assert_allclose(mu1, mu2)
np.testing.assert_allclose(var1[:,[0]], var2)
def test_lik_comparisons_m0_s0(self):
# Test if the different implementations give the exact same likelihood as the full model.
# All of the following settings should give the same likelihood and gradients as the full model:
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=self.m_full.X.variance.values, missing_data=False, stochastic=False)
m[:] = self.m_full[:]
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
assert(m.checkgrad())
def test_lik_comparisons_m1_s1(self):
# Test if the different implementations give the exact same likelihood as the full model.
# All of the following settings should give the same likelihood and gradients as the full model:
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
m[:] = self.m_full[:]
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
assert(m.checkgrad())
def test_lik_comparisons_m0_s1(self):
# Test if the different implementations give the exact same likelihood as the full model.
# All of the following settings should give the same likelihood and gradients as the full model:
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=self.Y.shape[1])
m[:] = self.m_full[:]
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
assert(m.checkgrad())
def test_gradients_missingdata(self):
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=False, batchsize=self.Y.shape[1])
assert(m.checkgrad())
def test_gradients_missingdata_stochastics(self):
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=1)
assert(m.checkgrad())
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=4)
assert(m.checkgrad())
def test_gradients_stochastics(self):
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=1)
assert(m.checkgrad())
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=4)
assert(m.checkgrad())
def test_predict(self):
# Test if the different implementations give the exact same likelihood as the full model.
# All of the following settings should give the same likelihood and gradients as the full model:
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
m[:] = self.m_full[:]
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
assert(m.checkgrad())
class SparseGPMinibatchTest(unittest.TestCase):
def setUp(self):
np.random.seed(12345)
X, W = np.random.normal(0,1,(100,6)), np.random.normal(0,1,(6,13))
Y = X.dot(W) + np.random.normal(0, .1, (X.shape[0], W.shape[1]))
self.inan = np.random.binomial(1, .1, Y.shape).astype(bool)
self.X, self.W, self.Y = X,W,Y
self.Q = 3
self.m_full = GPy.models.SparseGPLVM(Y, self.Q, kernel=GPy.kern.RBF(self.Q, ARD=True))
def test_lik_comparisons_m1_s0(self):
# Test if the different implementations give the exact same likelihood as the full model.
# All of the following settings should give the same likelihood and gradients as the full model:
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=False)
m[:] = self.m_full[:]
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
assert(m.checkgrad())
def test_sparsegp_init(self):
# Test if the different implementations give the exact same likelihood as the full model.
# All of the following settings should give the same likelihood and gradients as the full model:
np.random.seed(1234)
Z = self.X[np.random.choice(self.X.shape[0], replace=False, size=10)].copy()
Q = Z.shape[1]
m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(self.X, self.Y, Z, GPy.kern.RBF(Q)+GPy.kern.Matern32(Q)+GPy.kern.Bias(Q), GPy.likelihoods.Gaussian(), missing_data=True, stochastic=False)
assert(m.checkgrad())
m.optimize('adadelta', max_iters=10)
assert(m.checkgrad())
m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(self.X, self.Y, Z, GPy.kern.RBF(Q)+GPy.kern.Matern32(Q)+GPy.kern.Bias(Q), GPy.likelihoods.Gaussian(), missing_data=True, stochastic=True)
assert(m.checkgrad())
m.optimize('rprop', max_iters=10)
assert(m.checkgrad())
m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(self.X, self.Y, Z, GPy.kern.RBF(Q)+GPy.kern.Matern32(Q)+GPy.kern.Bias(Q), GPy.likelihoods.Gaussian(), missing_data=False, stochastic=False)
assert(m.checkgrad())
m.optimize('rprop', max_iters=10)
assert(m.checkgrad())
m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(self.X, self.Y, Z, GPy.kern.RBF(Q)+GPy.kern.Matern32(Q)+GPy.kern.Bias(Q), GPy.likelihoods.Gaussian(), missing_data=False, stochastic=True)
assert(m.checkgrad())
m.optimize('adadelta', max_iters=10)
assert(m.checkgrad())
def test_predict_missing_data(self):
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
m[:] = self.m_full[:]
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
mu1, var1 = m.predict(m.X, full_cov=False)
mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=False)
np.testing.assert_allclose(mu1, mu2)
for i in range(var1.shape[1]):
np.testing.assert_allclose(var1[:,[i]], var2)
mu1, var1 = m.predict(m.X, full_cov=True)
mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=True)
np.testing.assert_allclose(mu1, mu2)
for i in range(var1.shape[2]):
np.testing.assert_allclose(var1[:,:,i], var2)
def test_lik_comparisons_m0_s0(self):
# Test if the different implementations give the exact same likelihood as the full model.
# All of the following settings should give the same likelihood and gradients as the full model:
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=False, stochastic=False)
m[:] = self.m_full[:]
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
assert(m.checkgrad())
def test_lik_comparisons_m1_s1(self):
# Test if the different implementations give the exact same likelihood as the full model.
# All of the following settings should give the same likelihood and gradients as the full model:
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
m[:] = self.m_full[:]
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
assert(m.checkgrad())
def test_lik_comparisons_m0_s1(self):
# Test if the different implementations give the exact same likelihood as the full model.
# All of the following settings should give the same likelihood and gradients as the full model:
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=False, stochastic=True, batchsize=self.Y.shape[1])
m[:] = self.m_full[:]
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
assert(m.checkgrad())
def test_gradients_missingdata(self):
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=False, batchsize=self.Y.shape[1])
assert(m.checkgrad())
def test_gradients_missingdata_stochastics(self):
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=1)
assert(m.checkgrad())
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=4)
assert(m.checkgrad())
def test_gradients_stochastics(self):
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=False, stochastic=True, batchsize=1)
assert(m.checkgrad())
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=False, stochastic=True, batchsize=4)
assert(m.checkgrad())
def test_predict(self):
# Test if the different implementations give the exact same likelihood as the full model.
# All of the following settings should give the same likelihood and gradients as the full model:
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
m[:] = self.m_full[:]
np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
np.testing.assert_allclose(m.gradient, self.m_full.gradient)
assert(m.checkgrad())
if __name__ == "__main__":
#import sys;sys.argv = ['', 'Test.testName']
unittest.main()

View file

@@ -553,16 +553,27 @@ class GradientTests(np.testing.TestCase):
        rbflin = GPy.kern.RBF(1) + GPy.kern.White(1)
        self.check_model(rbflin, model_type='SparseGPRegression', dimension=1, uncertain_inputs=1)

    def test_GPLVM_rbf_bias_white_kern_2D(self):
        """ Testing GPLVM with rbf + bias kernel """
        N, input_dim, D = 50, 1, 2
        X = np.random.rand(N, input_dim)
-        k = GPy.kern.RBF(input_dim, 0.5, 0.9 * np.ones((1,))) + GPy.kern.Bias(input_dim, 0.1) + GPy.kern.White(input_dim, 0.05)
+        k = GPy.kern.RBF(input_dim, 0.5, 0.9 * np.ones((1,))) + GPy.kern.Bias(input_dim, 0.1) + GPy.kern.White(input_dim, 0.05) + GPy.kern.Matern32(input_dim) + GPy.kern.Matern52(input_dim)
        K = k.K(X)
        Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
        m = GPy.models.GPLVM(Y, input_dim, kernel=k)
        self.assertTrue(m.checkgrad())

+    def test_SparseGPLVM_rbf_bias_white_kern_2D(self):
+        """ Testing SparseGPLVM with rbf + bias kernel """
+        N, input_dim, D = 50, 1, 2
+        X = np.random.rand(N, input_dim)
+        k = GPy.kern.RBF(input_dim, 0.5, 0.9 * np.ones((1,))) + GPy.kern.Bias(input_dim, 0.1) + GPy.kern.White(input_dim, 0.05) + GPy.kern.Matern32(input_dim) + GPy.kern.Matern52(input_dim)
+        K = k.K(X)
+        Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
+        m = GPy.models.SparseGPLVM(Y, input_dim, kernel=k)
+        self.assertTrue(m.checkgrad())

    def test_BCGPLVM_rbf_bias_white_kern_2D(self):
        """ Testing GPLVM with rbf + bias kernel """
        N, input_dim, D = 50, 1, 2
View file

@@ -100,7 +100,7 @@ def _image_comparison(baseline_images, extensions=['pdf','svg','png'], tol=11):
        fig.axes[0].set_axis_off()
        fig.set_frameon(False)
        fig.canvas.draw()
-        fig.savefig(os.path.join(result_dir, "{}.{}".format(base, ext)), transparent=True, edgecolor='none', facecolor='none')
+        fig.savefig(os.path.join(result_dir, "{}.{}".format(base, ext)), transparent=True, edgecolor='none', facecolor='none', bbox='tight')
    for num, base in zip(plt.get_fignums(), baseline_images):
        for ext in extensions:
            #plt.close(num)
@@ -116,7 +116,7 @@ def _image_comparison(baseline_images, extensions=['pdf','svg','png'], tol=11):
def test_figure():
    np.random.seed(1239847)
    from GPy.plotting import plotting_library as pl
-    import matplotlib
+    #import matplotlib
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams[u'text.usetex'] = False
@@ -160,7 +160,7 @@ def test_figure():
def test_kernel():
    np.random.seed(1239847)
-    import matplotlib
+    #import matplotlib
    matplotlib.rcParams.update(matplotlib.rcParamsDefault)
    matplotlib.rcParams[u'figure.figsize'] = (4,3)
    matplotlib.rcParams[u'text.usetex'] = False

GPy/testing/util_tests.py Normal file (49 lines)
View file

@ -0,0 +1,49 @@
#===============================================================================
# Copyright (c) 2016, Max Zwiessele
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of GPy.testing.util_tests nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#===============================================================================
import unittest, numpy as np

class TestDebug(unittest.TestCase):
    def test_checkFinite(self):
        from GPy.util.debug import checkFinite
        array = np.random.normal(0, 1, 100).reshape(25,4)
        self.assertTrue(checkFinite(array, name='test'))
        array[np.random.binomial(1, .3, array.shape).astype(bool)] = np.nan
        self.assertFalse(checkFinite(array))

    def test_checkFullRank(self):
        from GPy.util.debug import checkFullRank
        from GPy.util.linalg import tdot
        array = np.random.normal(0, 1, 100).reshape(25,4)
        self.assertFalse(checkFullRank(tdot(array), name='test'))
        array = np.random.normal(0, 1, (25,25))
        self.assertTrue(checkFullRank(tdot(array)))
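
A short sketch of how these helpers behave, using only what the tests above pin down (illustrative):

    import numpy as np
    from GPy.util.debug import checkFinite, checkFullRank
    from GPy.util.linalg import tdot

    A = np.random.normal(0, 1, (25, 4))
    print(checkFinite(A, name='A'))    # True: every entry is finite
    A[0, 0] = np.nan
    print(checkFinite(A, name='A'))    # False: a NaN slipped in

    B = np.random.normal(0, 1, (25, 4))
    G = tdot(B)                        # B.dot(B.T): 25x25 Gram matrix, rank <= 4
    print(checkFullRank(G, name='G'))  # False: rank-deficient by construction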

View file

@ -15,4 +15,4 @@ from . import diag
from . import initialization
from . import multioutput
from . import parallel
+from . import functions

View file

@ -22,7 +22,7 @@ def checkFullRank(m, tol=1e-10, name=None, force_check=False):
        name = 'Matrix with ID['+str(id(m))+']'
    assert len(m.shape)==2 and m.shape[0]==m.shape[1], 'The input of checkFullRank has to be a square matrix!'
-    if not force_check and m.shape[0]>=10000:
+    if not force_check and m.shape[0]>=10000: # pragma: no cover
        print('The size of '+name+' is too big to check (>=10000)!')
        return True

View file

@ -1,27 +1,31 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)

import numpy as np
-from scipy.special import erf, erfc, erfcx
+from scipy import special
+from scipy.special import erfcx
import sys

epsilon = sys.float_info.epsilon
lim_val = -np.log(epsilon)

-def logisticln(x):
+def logisticln(x): # pragma: no cover
    return np.where(x<lim_val, np.where(x>-lim_val, -np.log(1+np.exp(-x)), -x), -np.log(1+epsilon))

-def logistic(x):
-    return np.where(x<lim_val, np.where(x>-lim_val, 1/(1+np.exp(-x)), epsilon/(epsilon+1)), 1/(1+epsilon))
+def logistic(x): # pragma: no cover
+    return special.expit(x)
+    #return np.where(x<lim_val, np.where(x>-lim_val, 1/(1+np.exp(-x)), epsilon/(epsilon+1)), 1/(1+epsilon))

-def normcdf(x):
-    g=0.5*erfc(-x/np.sqrt(2))
-    return np.where(g==0, epsilon, np.where(g==1, 1-epsilon, g))
+def normcdf(x): # pragma: no cover
+    return special.ndtr(x)
+    #g=0.5*erfc(-x/np.sqrt(2))
+    #return np.where(g==0, epsilon, np.where(g==1, 1-epsilon, g))

-def normcdfln(x):
-    return np.where(x < 0, -.5*x*x + np.log(.5) + np.log(erfcx(-x/np.sqrt(2))), np.log(normcdf(x)))
+def normcdfln(x): # pragma: no cover
+    return special.log_ndtr(x)
+    #return np.where(x < 0, -.5*x*x + np.log(.5) + np.log(erfcx(-x/np.sqrt(2))), np.log(normcdf(x)))

-def clip_exp(x):
+def clip_exp(x): # pragma: no cover
    return np.where(x<lim_val, np.where(x>-lim_val, np.exp(x), epsilon), 1/epsilon)

-def differfln(x0, x1):
+def differfln(x0, x1): # pragma: no cover
    # this is, hopefully, a numerically more stable variant of log(erf(x0)-erf(x1)) = log(erfc(x1)-erfc(x0)).
    return np.where(x0>x1, -x1*x1 + np.log(erfcx(x1)-np.exp(-x0**2+x1**2)*erfcx(x0)), -x0*x0 + np.log(np.exp(-x1**2+x0**2)*erfcx(x1) - erfcx(x0)))
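
The switch to scipy.special pays off at extreme arguments, where the hand-rolled formulas underflow. A small illustration, relying only on scipy's documented erfc and log_ndtr (values in the comments are approximate):

    import numpy as np
    from scipy import special

    x = np.array([-40.0, 0.0, 40.0])

    # Naive log-CDF: erfc underflows at x=-40, so log(0) gives -inf.
    naive = np.log(0.5 * special.erfc(-x / np.sqrt(2)))
    # log_ndtr evaluates the log-CDF directly and stays finite.
    stable = special.log_ndtr(x)

    print(naive)   # approx [-inf, -0.693, 0.0]
    print(stable)  # approx [-804.6, -0.693, -0.0]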

View file

@ -78,7 +78,7 @@ def jitchol(A, maxtries=5):
                try: raise
                except:
                    logging.warning('\n'.join(['Added jitter of {:.10e}'.format(jitter),
-                        ' in '+traceback.format_list(traceback.extract_stack(limit=2)[-2:-1])[0][2:]]))
+                        ' in '+traceback.format_list(traceback.extract_stack(limit=3)[-2:-1])[0][2:]]))
    return L
# def dtrtri(L, lower=1):
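
For context, jitchol rescues the Cholesky factorization of a nearly positive-definite matrix by adding escalating diagonal "jitter" until the factorization succeeds; the warning patched above reports how much jitter was needed. A minimal sketch of the idea (not GPy's exact implementation):

    import numpy as np

    def jitter_chol(A, maxtries=5):
        """Cholesky with escalating diagonal jitter (sketch of the jitchol idea)."""
        try:
            return np.linalg.cholesky(A)
        except np.linalg.LinAlgError:
            jitter = np.abs(np.mean(np.diag(A))) * 1e-6
            for _ in range(maxtries):
                try:
                    # Retry with a progressively larger diagonal offset.
                    return np.linalg.cholesky(A + jitter * np.eye(A.shape[0]))
                except np.linalg.LinAlgError:
                    jitter *= 10
        raise np.linalg.LinAlgError('not positive definite, even with jitter')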

View file

@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.9.6
+current_version = 0.9.7
tag = True
commit = True