diff --git a/.coveragerc b/.coveragerc
index 84dfe227..eac8d014 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -2,7 +2,7 @@
 [run]
 branch = True
 source = GPy
-omit = ./GPy/testing/*.py, travis_tests.py, setup.py, ./GPy/__version__.py
+omit = ./GPy/testing/*.py, travis_tests.py, setup.py, ./GPy/__version__.py, ./GPy/plotting/*
 
 [report]
 # Regexes for lines to exclude from consideration
diff --git a/.travis.yml b/.travis.yml
index 98806750..b236d515 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -20,33 +20,19 @@ env:
   - PYTHON_VERSION=3.5
 
 before_install:
-  - export CONDA_CACHED=1
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]];
-    then export OS=Linux;
-    elif [[ "$TRAVIS_OS_NAME" == "osx" ]];
-    then export OS=MacOSX;
-    brew install pandoc;
-    else
-    echo "OS not supported yet";
-    exit 1; fi;
-  - if [[ $PYTHON_VERSION == "2.7" ]];
-    then export MINICONDA=Miniconda;
-    elif [[ $PYTHON_VERSION == 3* ]];
-    then export MINICONDA=Miniconda3;
-    else echo "Could not find python version";exit 1; fi;
-  - if [ ! -d $HOME/download/ ]; then mkdir $HOME/download/; fi;
-  - if [ ! -d $HOME/install/ ]; then mkdir $HOME/install/; fi;
-  - export MINICONDA_FILE=$MINICONDA-latest-$OS-x86_64-$PYTHON_VERSION
-  - export MINCONDA_CACHE_FILE=$HOME/download/$MINICONDA_FILE.sh
-  - export MINICONDA_INSTALL=$HOME/install/$MINICONDA_FILE
-  - if [ ! -f $MINCONDA_CACHE_FILE ]; then export CONDA_CACHED=0; wget http://repo.continuum.io/miniconda/$MINICONDA-latest-$OS-x86_64.sh -O $MINCONDA_CACHE_FILE; bash $MINCONDA_CACHE_FILE -b -p $MINICONDA_INSTALL; fi;
-  - export PATH="$MINICONDA_INSTALL/bin:$PATH";
+- wget https://github.com/mzwiessele/travis_scripts/raw/master/download_miniconda.sh
+- wget https://github.com/mzwiessele/travis_scripts/raw/master/install_retry.sh
+- source download_miniconda.sh
+- echo $PATH
 
 install:
-  - conda install --yes python=$PYTHON_VERSION numpy=1.9 scipy=0.16 nose pip six matplotlib sphinx;
-  - pip install codecov
-  - pip install pypandoc
-  - python setup.py develop
+- echo $PATH
+- source install_retry.sh
+- pip install codecov
+- pip install pypandoc
+- pip install git+git://github.com/BRML/climin.git
+- pip install autograd
+- python setup.py develop
 
 script:
   - coverage run travis_tests.py
@@ -60,16 +46,16 @@ before_deploy:
   - sphinx-apidoc -o source/ ../GPy
   - make html
  - cd ../
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]];
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]];
     then export DIST='sdist';
-    elif [[ "$TRAVIS_OS_NAME" == "osx" ]];
+    elif [[ "$TRAVIS_OS_NAME" == "osx" ]];
     then export DIST='bdist_wheel'; fi;
 
 deploy:
   provider: pypi
   user: maxz
-  password:
+  password:
     secure: "vMEOlP7DQhFJ7hQAKtKC5hrJXFl5BkUt4nXdosWWiw//Kg8E+PPLg88XPI2gqIosir9wwgtbSBBbbwCxkM6uxRNMpoNR8Ixyv9fmSXp4rLl7bbBY768W7IRXKIBjpuEy2brQjoT+CwDDSzUkckHvuUjJDNRvUv8ab4P/qYO1LG4="
   on:
     tags: false
diff --git a/GPy/__version__.py b/GPy/__version__.py
index 50533e30..f5b77301 100644
--- a/GPy/__version__.py
+++ b/GPy/__version__.py
@@ -1 +1 @@
-__version__ = "0.9.6"
+__version__ = "0.9.7"
diff --git a/GPy/core/__init__.py b/GPy/core/__init__.py
index b3a29859..b0743916 100644
--- a/GPy/core/__init__.py
+++ b/GPy/core/__init__.py
@@ -43,4 +43,4 @@ def randomize(self, rand_gen=None, *args, **kwargs):
 
 Model.randomize = randomize
 Param.randomize = randomize
-Parameterized.randomize = randomize
\ No newline at end of file
+Parameterized.randomize = randomize
diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 8d5daed2..0b87b1d5 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -401,9 +401,9 @@ class GP(Model):
             var_jac = compute_cov_inner(self.posterior.woodbury_inv)
         return mean_jac, var_jac
 
-    def predict_wishard_embedding(self, Xnew, kern=None, mean=True, covariance=True):
+    def predict_wishart_embedding(self, Xnew, kern=None, mean=True, covariance=True):
         """
-        Predict the wishard embedding G of the GP. This is the density of the
+        Predict the wishart embedding G of the GP. This is the density of the
         input of the GP defined by the probabilistic function mapping f.
         G = J_mean.T*J_mean + output_dim*J_cov.
 
@@ -431,6 +431,10 @@ class GP(Model):
             G += Sigma
         return G
 
+    def predict_wishard_embedding(self, Xnew, kern=None, mean=True, covariance=True):
+        warnings.warn("Wrong naming, use predict_wishart_embedding instead. Will be removed in future versions!", DeprecationWarning)
+        return self.predict_wishart_embedding(Xnew, kern, mean, covariance)
+
     def predict_magnification(self, Xnew, kern=None, mean=True, covariance=True):
         """
         Predict the magnification factor as
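Usage note for the rename above: the misspelled method is kept as a deprecation shim that forwards to the new name. A minimal sketch of the resulting behaviour (the toy data and the GPRegression model below are illustrative assumptions, not part of the patch):

    import warnings
    import numpy as np
    import GPy

    X = np.random.uniform(-3., 3., (20, 1))
    Y = np.sin(X) + 0.05 * np.random.randn(20, 1)
    m = GPy.models.GPRegression(X, Y)  # any model built on the GP base class

    G = m.predict_wishart_embedding(X[:5])          # new, correctly spelled name
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        G_old = m.predict_wishard_embedding(X[:5])  # deprecated alias still works
        assert issubclass(w[-1].category, DeprecationWarning)
    assert np.allclose(G, G_old)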
""" - pass \ No newline at end of file + pass diff --git a/GPy/core/parameterization/transformations.py b/GPy/core/parameterization/transformations.py index 936c7a64..0efc18c1 100644 --- a/GPy/core/parameterization/transformations.py +++ b/GPy/core/parameterization/transformations.py @@ -2,4 +2,4 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) from paramz.transformations import * -from paramz.transformations import __fixed__ \ No newline at end of file +from paramz.transformations import __fixed__ diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index a0f50ce9..488b7026 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -44,7 +44,7 @@ class SparseGP(GP): #pick a sensible inference method if inference_method is None: if isinstance(likelihood, likelihoods.Gaussian): - inference_method = var_dtc.VarDTC(limit=1) + inference_method = var_dtc.VarDTC(limit=3) else: #inference_method = ?? raise NotImplementedError("what to do what to do?") diff --git a/GPy/core/svgp.py b/GPy/core/svgp.py index a678a1fd..916952f2 100644 --- a/GPy/core/svgp.py +++ b/GPy/core/svgp.py @@ -89,7 +89,7 @@ class SVGP(SparseGP): """ Return a new batch of X and Y by taking a chunk of data from the complete X and Y """ - i = self.slicer.next() + i = next(self.slicer) return self.X_all[i], self.Y_all[i] def stochastic_grad(self, parameters): diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index 024b12ee..ce1c89e8 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -459,7 +459,7 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw): D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5 _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim) - k = kern.Linear(Q) + kern.White(Q, variance=1e-4) + k = kern.Linear(Q, ARD=True) + kern.White(Q, variance=1e-4) m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing, kernel=k, initx="PCA_concat", initz='permute', **kw) m['.*noise'] = [Y.var() / 40. 
diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index 024b12ee..ce1c89e8 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -459,7 +459,7 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
     D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
     _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim)
 
-    k = kern.Linear(Q) + kern.White(Q, variance=1e-4)
+    k = kern.Linear(Q, ARD=True) + kern.White(Q, variance=1e-4)
     m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing, kernel=k, initx="PCA_concat", initz='permute', **kw)
 
     m['.*noise'] = [Y.var() / 40. for Y in Ylist]
@@ -479,7 +479,7 @@ def mrd_simulation_missing_data(optimize=True, verbose=True, plot=True, plot_sim
     D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
     _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
 
-    k = kern.Linear(Q) + kern.White(Q, variance=1e-4)
+    k = kern.Linear(Q, ARD=True) + kern.White(Q, variance=1e-4)
 
     inanlist = []
     for Y in Ylist:
diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py
index ec055120..dc334059 100644
--- a/GPy/inference/latent_function_inference/var_dtc.py
+++ b/GPy/inference/latent_function_inference/var_dtc.py
@@ -22,7 +22,7 @@ class VarDTC(LatentFunctionInference):
     """
     const_jitter = 1e-8
 
-    def __init__(self, limit=1):
+    def __init__(self, limit=3):
         from paramz.caching import Cacher
         self.limit = limit
         self.get_trYYT = Cacher(self._get_trYYT, limit)
diff --git a/GPy/inference/latent_function_inference/var_dtc_parallel.py b/GPy/inference/latent_function_inference/var_dtc_parallel.py
index b72e4fd2..603623a7 100644
--- a/GPy/inference/latent_function_inference/var_dtc_parallel.py
+++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py
@@ -21,7 +21,7 @@ class VarDTC_minibatch(LatentFunctionInference):
     """
     const_jitter = 1e-8
 
-    def __init__(self, batchsize=None, limit=1, mpi_comm=None):
+    def __init__(self, batchsize=None, limit=3, mpi_comm=None):
         self.batchsize = batchsize
         self.mpi_comm = mpi_comm
diff --git a/GPy/inference/optimization/__init__.py b/GPy/inference/optimization/__init__.py
index 24ca752a..2fa96960 100644
--- a/GPy/inference/optimization/__init__.py
+++ b/GPy/inference/optimization/__init__.py
@@ -1,5 +1,8 @@
-from paramz.optimization import stochastics, Optimizer
+from paramz.optimization import Optimizer
+from . import stochastics
+
 from paramz.optimization import *
 import sys
+
 sys.modules['GPy.inference.optimization.stochastics'] = stochastics
-sys.modules['GPy.inference.optimization.Optimizer'] = Optimizer
\ No newline at end of file
+sys.modules['GPy.inference.optimization.Optimizer'] = Optimizer
diff --git a/GPy/inference/optimization/stochastics.py b/GPy/inference/optimization/stochastics.py
new file mode 100644
index 00000000..41f5320b
--- /dev/null
+++ b/GPy/inference/optimization/stochastics.py
@@ -0,0 +1,119 @@
+#===============================================================================
+# Copyright (c) 2015, Max Zwiessele
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of paramax nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#===============================================================================
+
+class StochasticStorage(object):
+    '''
+    This is a container for holding the stochastic parameters,
+    such as subset indices or step length and so on.
+
+    self.d has to be a list of lists:
+    [dimension indices, nan indices for those dimensions]
+    so that the minibatches can be used as efficiently as possible.
+    '''
+    def __init__(self, model):
+        """
+        Initialize this stochastic container using the given model
+        """
+
+    def do_stochastics(self):
+        """
+        Update the internal state to the next batch of the stochastic
+        descent algorithm.
+        """
+        pass
+
+    def reset(self):
+        """
+        Reset the state of this stochastics generator.
+        """
+
+class SparseGPMissing(StochasticStorage):
+    def __init__(self, model, batchsize=1):
+        """
+        Here we want to loop over all dimensions every time.
+        Thus, we can just make sure the loop goes over self.d every
+        time. We will try to get batches which look the same together,
+        which speeds up calculations significantly.
+        """
+        import numpy as np
+        self.Y = model.Y_normalized
+        bdict = {}
+        #For N > 1000 array2string default crops
+        opt = np.get_printoptions()
+        np.set_printoptions(threshold=np.inf)
+        for d in range(self.Y.shape[1]):
+            inan = np.isnan(self.Y)[:, d]
+            arr_str = np.array2string(inan, np.inf, 0, True, '', formatter={'bool':lambda x: '1' if x else '0'})
+            try:
+                bdict[arr_str][0].append(d)
+            except:
+                bdict[arr_str] = [[d], ~inan]
+        np.set_printoptions(**opt)
+        self.d = bdict.values()
+
+class SparseGPStochastics(StochasticStorage):
+    """
+    For the sparse gp we need to store the dimension we are in,
+    and the indices corresponding to those
+    """
+    def __init__(self, model, batchsize=1, missing_data=True):
+        self.batchsize = batchsize
+        self.output_dim = model.Y.shape[1]
+        self.Y = model.Y_normalized
+        self.missing_data = missing_data
+        self.reset()
+        self.do_stochastics()
+
+    def do_stochastics(self):
+        import numpy as np
+        if self.batchsize == 1:
+            self.current_dim = (self.current_dim+1)%self.output_dim
+            self.d = [[[self.current_dim], np.isnan(self.Y[:, self.current_dim]) if self.missing_data else None]]
+        else:
+            self.d = np.random.choice(self.output_dim, size=self.batchsize, replace=False)
+            bdict = {}
+            if self.missing_data:
+                opt = np.get_printoptions()
+                np.set_printoptions(threshold=np.inf)
+                for d in self.d:
+                    inan = np.isnan(self.Y[:, d])
+                    arr_str = np.array2string(inan,np.inf, 0,True, '',formatter={'bool':lambda x: '1' if x else '0'})
+                    try:
+                        bdict[arr_str][0].append(d)
+                    except:
+                        bdict[arr_str] = [[d], ~inan]
+                np.set_printoptions(**opt)
+                self.d = bdict.values()
+            else:
+                self.d = [[self.d, None]]
+
+    def reset(self):
+        self.current_dim = -1
+        self.d = None
diff --git a/GPy/installation.cfg b/GPy/installation.cfg
index 8458a86b..841bf608 100644
--- a/GPy/installation.cfg
+++ b/GPy/installation.cfg
@@ -15,4 +15,4 @@
 
 
 # [plotting]
-# library = matplotlib # plotly
+# library = matplotlib # plotly, none
diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py
index e2990f99..62796d93 100644
--- a/GPy/kern/__init__.py
+++ b/GPy/kern/__init__.py
@@ -10,7 +10,7 @@ from .src.add import Add
 from .src.prod import Prod
 from .src.rbf import RBF
 from .src.linear import Linear, LinearFull
-from .src.static import Bias, White, Fixed
+from .src.static import Bias, White, Fixed, WhiteHeteroscedastic
 from .src.brownian import Brownian
 from .src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine
 from .src.mlp import MLP
@@ -28,4 +28,4 @@ from .src.trunclinear import TruncLinear,TruncLinear_inf
 from .src.splitKern import SplitKern,DEtime
 from .src.splitKern import DEtime as DiffGenomeKern
 from .src.spline import Spline
-from .src.basis_funcs import LogisticBasisFuncKernel, LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel
\ No newline at end of file
+from .src.basis_funcs import LogisticBasisFuncKernel, LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel
diff --git a/GPy/kern/src/ODE_t.py b/GPy/kern/src/ODE_t.py
index d5dae665..f09ab77d 100644
--- a/GPy/kern/src/ODE_t.py
+++ b/GPy/kern/src/ODE_t.py
@@ -162,4 +162,4 @@ class ODE_t(Kern):
 
         self.lengthscale_Yt.gradient = np.sum(dkYdlent*(-0.5*self.lengthscale_Yt**(-2)) * dL_dK)
 
-        self.ubias.gradient = np.sum(dkdubias * dL_dK)
\ No newline at end of file
+        self.ubias.gradient = np.sum(dkdubias * dL_dK)
diff --git a/GPy/kern/src/__init__.py b/GPy/kern/src/__init__.py
index d90842ca..69522e32 100644
--- a/GPy/kern/src/__init__.py
+++ b/GPy/kern/src/__init__.py
@@ -1 +1 @@
-from . import psi_comp
\ No newline at end of file
+from . import psi_comp
diff --git a/GPy/kern/src/add.py b/GPy/kern/src/add.py
index 86bceac7..8f91d639 100644
--- a/GPy/kern/src/add.py
+++ b/GPy/kern/src/add.py
@@ -19,8 +19,8 @@ class Add(CombinationKernel):
             if isinstance(kern, Add):
                 del subkerns[i]
                 for part in kern.parts[::-1]:
-                    kern.unlink_parameter(part)
-                    subkerns.insert(i, part)
+                    #kern.unlink_parameter(part)
+                    subkerns.insert(i, part.copy())
         super(Add, self).__init__(subkerns, name)
         self._exact_psicomp = self._check_exact_psicomp()
@@ -37,7 +37,7 @@ class Add(CombinationKernel):
         else:
             return False
 
-    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def K(self, X, X2=None, which_parts=None):
         """
         Add all kernels together.
@@ -51,7 +51,7 @@ class Add(CombinationKernel):
             which_parts = [which_parts]
         return reduce(np.add, (p.K(X, X2) for p in which_parts))
 
-    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def Kdiag(self, X, which_parts=None):
         if which_parts is None:
             which_parts = self.parts
@@ -98,17 +98,17 @@ class Add(CombinationKernel):
         [target.__iadd__(p.gradients_XX_diag(dL_dKdiag, X)) for p in self.parts]
         return target
 
-    @Cache_this(limit=1, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def psi0(self, Z, variational_posterior):
         if not self._exact_psicomp: return Kern.psi0(self,Z,variational_posterior)
         return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts))
 
-    @Cache_this(limit=1, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def psi1(self, Z, variational_posterior):
         if not self._exact_psicomp: return Kern.psi1(self,Z,variational_posterior)
         return reduce(np.add, (p.psi1(Z, variational_posterior) for p in self.parts))
 
-    @Cache_this(limit=1, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def psi2(self, Z, variational_posterior):
         if not self._exact_psicomp: return Kern.psi2(self,Z,variational_posterior)
         psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts))
@@ -144,7 +144,7 @@ class Add(CombinationKernel):
                     raise NotImplementedError("psi2 cannot be computed for this kernel")
         return psi2
 
-    @Cache_this(limit=1, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def psi2n(self, Z, variational_posterior):
         if not self._exact_psicomp: return Kern.psi2n(self, Z, variational_posterior)
         psi2 = reduce(np.add, (p.psi2n(Z, variational_posterior) for p in self.parts))
@@ -241,16 +241,20 @@ class Add(CombinationKernel):
         [np.add(target_grads[i],grads[i],target_grads[i]) for i in range(len(grads))]
         return target_grads
 
-    def add(self, other):
-        if isinstance(other, Add):
-            other_params = other.parameters[:]
-            for p in other_params:
-                other.unlink_parameter(p)
-            self.link_parameters(*other_params)
-        else:
-            self.link_parameter(other)
-        self.input_dim, self._all_dims_active = self.get_input_dim_active_dims(self.parts)
-        return self
+    #def add(self, other):
+    #    parts = self.parts
+    #    if 0:#isinstance(other, Add):
+    #        #other_params = other.parameters[:]
+    #        for p in other.parts[:]:
+    #            other.unlink_parameter(p)
+    #        parts.extend(other.parts)
+    #        #self.link_parameters(*other_params)
+    #
+    #    else:
+    #        #self.link_parameter(other)
+    #        parts.append(other)
+    #    #self.input_dim, self._all_dims_active = self.get_input_dim_active_dims(parts)
+    #    return Add([p for p in parts], self.name)
 
     def input_sensitivity(self, summarize=True):
         if summarize:
diff --git a/GPy/kern/src/eq_ode2.py b/GPy/kern/src/eq_ode2.py
index ef71ffe0..8e735248 100644
--- a/GPy/kern/src/eq_ode2.py
+++ b/GPy/kern/src/eq_ode2.py
@@ -64,7 +64,7 @@ class EQ_ODE2(Kern):
         self.W = Param('W', W)
         self.link_parameters(self.lengthscale, self.C, self.B, self.W)
 
-    @Cache_this(limit=2)
+    @Cache_this(limit=3)
     def K(self, X, X2=None):
         #This way is not working, indexes are lost after using k._slice_X
         #index = np.asarray(X, dtype=np.int)
diff --git a/GPy/kern/src/kern.py b/GPy/kern/src/kern.py
index 6a746092..ad6ed7db 100644
--- a/GPy/kern/src/kern.py
+++ b/GPy/kern/src/kern.py
@@ -48,11 +48,12 @@ class Kern(Parameterized):
 
         if active_dims is None:
             active_dims = np.arange(input_dim)
-
-        self.active_dims = active_dims
-        self._all_dims_active = np.atleast_1d(active_dims).astype(int)
-
-        assert self._all_dims_active.size == self.input_dim, "input_dim={} does not match len(active_dim)={}, _all_dims_active={}".format(self.input_dim, self._all_dims_active.size, self._all_dims_active)
+
+        self.active_dims = np.asarray(active_dims, np.int_)
+
+        self._all_dims_active = np.atleast_1d(self.active_dims).astype(int)
+
+        assert self.active_dims.size == self.input_dim, "input_dim={} does not match len(active_dim)={}".format(self.input_dim, self._all_dims_active.size)
 
         self._sliced_X = 0
         self.useGPU = self._support_GPU and useGPU
@@ -68,9 +69,12 @@ class Kern(Parameterized):
     def _effective_input_dim(self):
         return np.size(self._all_dims_active)
 
-    @Cache_this(limit=20)
+    @Cache_this(limit=3)
     def _slice_X(self, X):
-        return X[:, self._all_dims_active]
+        try:
+            return X[:, self._all_dims_active].astype('float')
+        except:
+            return X[:, self._all_dims_active]
 
     def K(self, X, X2):
         """
@@ -319,10 +323,20 @@ class CombinationKernel(Kern):
         :param array-like extra_dims: if needed extra dimensions for the combination kernel to work on
         """
         assert all([isinstance(k, Kern) for k in kernels])
-        extra_dims = np.array(extra_dims, dtype=int)
-        input_dim, active_dims = self.get_input_dim_active_dims(kernels, extra_dims)
+        extra_dims = np.asarray(extra_dims, dtype=int)
+
+        active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int))
+
+        input_dim = active_dims.size
+        if extra_dims is not None:
+            input_dim += extra_dims.size
+
         # initialize the kernel with the full input_dim
         super(CombinationKernel, self).__init__(input_dim, active_dims, name)
+
+        effective_input_dim = reduce(max, (k._all_dims_active.max() for k in kernels)) + 1
+        self._all_dims_active = np.array(np.concatenate((np.arange(effective_input_dim), extra_dims if extra_dims is not None else [])), dtype=int)
+
         self.extra_dims = extra_dims
         self.link_parameters(*kernels)
 
@@ -330,16 +344,8 @@ class CombinationKernel(Kern):
     def parts(self):
         return self.parameters
 
-    def get_input_dim_active_dims(self, kernels, extra_dims = None):
-        self.active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int))
-        #_all_dims_active = np.array(np.concatenate((_all_dims_active, extra_dims if extra_dims is not None else [])), dtype=int)
-        input_dim = reduce(max, (k._all_dims_active.max() for k in kernels)) + 1
-
-        if extra_dims is not None:
-            input_dim += extra_dims.size
-
-        _all_dims_active = np.arange(input_dim)
-        return input_dim, _all_dims_active
+    def _set_all_dims_ative(self):
+        self._all_dims_active = np.atleast_1d(self.active_dims).astype(int)
 
     def input_sensitivity(self, summarize=True):
         """
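A small usage sketch of what the reworked active_dims bookkeeping above supports — a combination kernel whose parts act on different input columns, with the combined active_dims computed as their union (standard GPy usage; the kernels and data below are illustrative):

    import numpy as np
    import GPy

    # each part only sees its own input column
    k = GPy.kern.RBF(1, active_dims=[0]) + GPy.kern.Matern32(1, active_dims=[1])
    X = np.random.randn(10, 2)
    K = k.K(X)             # 10x10; each part slices X via _slice_X
    print(k.active_dims)   # union of the parts' active dims: [0 1]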
diff --git a/GPy/kern/src/linear.py b/GPy/kern/src/linear.py
index 59595fea..fa412c1d 100644
--- a/GPy/kern/src/linear.py
+++ b/GPy/kern/src/linear.py
@@ -51,7 +51,7 @@ class Linear(Kern):
             self.link_parameter(self.variances)
         self.psicomp = PSICOMP_Linear()
 
-    @Cache_this(limit=2)
+    @Cache_this(limit=3)
     def K(self, X, X2=None):
         if self.ARD:
             if X2 is None:
@@ -62,7 +62,7 @@ class Linear(Kern):
         else:
             return self._dot_product(X, X2) * self.variances
 
-    @Cache_this(limit=1, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def _dot_product(self, X, X2=None):
         if X2 is None:
             return tdot(X)
diff --git a/GPy/kern/src/mlp.py b/GPy/kern/src/mlp.py
index d86e5b15..6c997881 100644
--- a/GPy/kern/src/mlp.py
+++ b/GPy/kern/src/mlp.py
@@ -45,7 +45,7 @@ class MLP(Kern):
 
         self.link_parameters(self.variance, self.weight_variance, self.bias_variance)
 
-    @Cache_this(limit=20, ignore_args=())
+    @Cache_this(limit=3, ignore_args=())
    def K(self, X, X2=None):
         if X2 is None:
             X_denom = np.sqrt(self._comp_prod(X)+1.)
@@ -57,7 +57,7 @@ class MLP(Kern):
             XTX = self._comp_prod(X,X2)/X_denom[:,None]/X2_denom[None,:]
         return self.variance*four_over_tau*np.arcsin(XTX)
 
-    @Cache_this(limit=20, ignore_args=())
+    @Cache_this(limit=3, ignore_args=())
     def Kdiag(self, X):
         """Compute the diagonal of the covariance matrix for X."""
         X_prod = self._comp_prod(X)
@@ -88,14 +88,14 @@ class MLP(Kern):
         """Gradient of diagonal of covariance with respect to X"""
         return self._comp_grads_diag(dL_dKdiag, X)[3]
 
-    @Cache_this(limit=50, ignore_args=())
+    @Cache_this(limit=3, ignore_args=())
     def _comp_prod(self, X, X2=None):
         if X2 is None:
             return (np.square(X)*self.weight_variance).sum(axis=1)+self.bias_variance
         else:
             return (X*self.weight_variance).dot(X2.T)+self.bias_variance
 
-    @Cache_this(limit=20, ignore_args=(1,))
+    @Cache_this(limit=3, ignore_args=(1,))
     def _comp_grads(self, dL_dK, X, X2=None):
         var,w,b = self.variance, self.weight_variance, self.bias_variance
         K = self.K(X, X2)
@@ -130,7 +130,7 @@ class MLP(Kern):
             dX2 = common.T.dot(X)*w-((common*XTX).sum(axis=0)/(X2_prod+1.))[:,None]*X2*w
         return dvar, dw, db, dX, dX2
 
-    @Cache_this(limit=20, ignore_args=(1,))
+    @Cache_this(limit=3, ignore_args=(1,))
     def _comp_grads_diag(self, dL_dKdiag, X):
         var,w,b = self.variance, self.weight_variance, self.bias_variance
         K = self.Kdiag(X)
diff --git a/GPy/kern/src/poly.py b/GPy/kern/src/poly.py
index 216e3a00..57cb8800 100644
--- a/GPy/kern/src/poly.py
+++ b/GPy/kern/src/poly.py
@@ -5,32 +5,49 @@ import numpy as np
 from .kern import Kern
 from ...core.parameterization import Param
 from paramz.transformations import Logexp
+from paramz.caching import Cache_this
 
 class Poly(Kern):
     """
     Polynomial kernel
     """
-    def __init__(self, input_dim, variance=1., order=3., active_dims=None, name='poly'):
+    def __init__(self, input_dim, variance=1., scale=1., bias=1., order=3., active_dims=None, name='poly'):
         super(Poly, self).__init__(input_dim, active_dims, name)
         self.variance = Param('variance', variance, Logexp())
-        self.link_parameter(self.variance)
+        self.scale = Param('scale', scale, Logexp())
+        self.bias = Param('bias', bias, Logexp())
+
+        self.link_parameters(self.variance, self.scale, self.bias)
+
         assert order >= 1, 'The order of the polynomial has to be at least 1.'
         self.order=order
 
-    def K(self, X, X2=None):
-        return (self._dot_product(X, X2) + 1.)**self.order * self.variance
-    def _dot_product(self, X, X2=None):
+    def K(self, X, X2=None):
+        _, _, B = self._AB(X, X2)
+        return B * self.variance
+
+    @Cache_this(limit=3)
+    def _AB(self, X, X2=None):
         if X2 is None:
-            return np.dot(X, X.T)
+            dot_prod = np.dot(X, X.T)
         else:
-            return np.dot(X, X2.T)
+            dot_prod = np.dot(X, X2.T)
+        A = (self.scale * dot_prod) + self.bias
+        B = A ** self.order
+        return dot_prod, A, B
 
     def Kdiag(self, X):
-        return self.variance*(np.square(X).sum(1) + 1.)**self.order
+        return self.K(X).diagonal()#self.variance*(np.square(X).sum(1) + 1.)**self.order
 
     def update_gradients_full(self, dL_dK, X, X2=None):
-        self.variance.gradient = np.sum(dL_dK * (self._dot_product(X, X2) + 1.)**self.order)
+        dot_prod, A, B = self._AB(X, X2)
+        dK_dA = self.variance * self.order * A ** (self.order-1.)
+        dL_dA = dL_dK * (dK_dA)
+        self.scale.gradient = (dL_dA * dot_prod).sum()
+        self.bias.gradient = dL_dA.sum()
+        self.variance.gradient = np.sum(dL_dK * B)
+        #import ipdb;ipdb.set_trace()
 
     def update_gradients_diag(self, dL_dKdiag, X):
         raise NotImplementedError
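With the new scale and bias parameters above, the Poly kernel computes K(X, X2) = variance * (scale * <x, x'> + bias) ** order, and update_gradients_full now differentiates through all three parameters. A quick numerical check of the parameterization (the parameter values are illustrative):

    import numpy as np
    import GPy

    k = GPy.kern.Poly(input_dim=2, variance=2., scale=0.5, bias=1., order=3.)
    X = np.random.randn(8, 2)
    assert np.allclose(k.K(X), 2. * (0.5 * X.dot(X.T) + 1.) ** 3)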
diff --git a/GPy/kern/src/prod.py b/GPy/kern/src/prod.py
index ae00a949..1e18b405 100644
--- a/GPy/kern/src/prod.py
+++ b/GPy/kern/src/prod.py
@@ -39,7 +39,7 @@ class Prod(CombinationKernel):
                 kernels.insert(i, part)
         super(Prod, self).__init__(kernels, name)
 
-    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def K(self, X, X2=None, which_parts=None):
         if which_parts is None:
             which_parts = self.parts
@@ -48,7 +48,7 @@ class Prod(CombinationKernel):
             which_parts = [which_parts]
         return reduce(np.multiply, (p.K(X, X2) for p in which_parts))
 
-    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def Kdiag(self, X, which_parts=None):
         if which_parts is None:
             which_parts = self.parts
diff --git a/GPy/kern/src/psi_comp/__init__.py b/GPy/kern/src/psi_comp/__init__.py
index 9afa8e8c..0edf4b72 100644
--- a/GPy/kern/src/psi_comp/__init__.py
+++ b/GPy/kern/src/psi_comp/__init__.py
@@ -21,7 +21,7 @@ from .gaussherm import PSICOMP_GH
 from . import rbf_psi_comp, linear_psi_comp, ssrbf_psi_comp, sslinear_psi_comp
 
 class PSICOMP_RBF(PSICOMP):
 
-    @Cache_this(limit=10, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
         variance, lengthscale = kern.variance, kern.lengthscale
         if isinstance(variational_posterior, variational.NormalPosterior):
@@ -31,7 +31,7 @@ class PSICOMP_RBF(PSICOMP):
         else:
             raise ValueError("unknown distriubtion received for psi-statistics")
 
-    @Cache_this(limit=10, ignore_args=(0,2,3,4))
+    @Cache_this(limit=3, ignore_args=(0,2,3,4))
     def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         variance, lengthscale = kern.variance, kern.lengthscale
         if isinstance(variational_posterior, variational.NormalPosterior):
@@ -43,7 +43,7 @@ class PSICOMP_RBF(PSICOMP):
 
 class PSICOMP_Linear(PSICOMP):
 
-    @Cache_this(limit=10, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
         variances = kern.variances
         if isinstance(variational_posterior, variational.NormalPosterior):
@@ -53,7 +53,7 @@ class PSICOMP_Linear(PSICOMP):
         else:
             raise ValueError("unknown distriubtion received for psi-statistics")
 
-    @Cache_this(limit=10, ignore_args=(0,2,3,4))
+    @Cache_this(limit=3, ignore_args=(0,2,3,4))
     def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         variances = kern.variances
         if isinstance(variational_posterior, variational.NormalPosterior):
diff --git a/GPy/kern/src/psi_comp/gaussherm.py b/GPy/kern/src/psi_comp/gaussherm.py
index 5fac6619..fe343aff 100644
--- a/GPy/kern/src/psi_comp/gaussherm.py
+++ b/GPy/kern/src/psi_comp/gaussherm.py
@@ -27,7 +27,7 @@ class PSICOMP_GH(PSICOMP):
     def _setup_observers(self):
         pass
 
-    @Cache_this(limit=10, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def comp_K(self, Z, qX):
         if self.Xs is None or self.Xs.shape != qX.mean.shape:
             from paramz import ObsAr
@@ -38,7 +38,7 @@ class PSICOMP_GH(PSICOMP):
             self.Xs[i] = self.locs[i]*S_sq+mu
         return self.Xs
 
-    @Cache_this(limit=10, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def psicomputations(self, kern, Z, qX, return_psi2_n=False):
         mu, S = qX.mean.values, qX.variance.values
         N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
@@ -62,7 +62,7 @@ class PSICOMP_GH(PSICOMP):
             psi2 += self.weights[i]* tdot(Kfu.T)
         return psi0, psi1, psi2
 
-    @Cache_this(limit=10, ignore_args=(0, 2,3,4))
+    @Cache_this(limit=3, ignore_args=(0, 2,3,4))
     def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, qX):
         mu, S = qX.mean.values, qX.variance.values
         if self.cache_K: Xs = self.comp_K(Z, qX)
diff --git a/GPy/kern/src/psi_comp/rbf_psi_comp.py b/GPy/kern/src/psi_comp/rbf_psi_comp.py
index bf954717..670f24de 100644
--- a/GPy/kern/src/psi_comp/rbf_psi_comp.py
+++ b/GPy/kern/src/psi_comp/rbf_psi_comp.py
@@ -132,5 +132,5 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
     return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
 
-_psi1computations = Cacher(__psi1computations, limit=5)
-_psi2computations = Cacher(__psi2computations, limit=5)
+_psi1computations = Cacher(__psi1computations, limit=3)
+_psi2computations = Cacher(__psi2computations, limit=3)
diff --git a/GPy/kern/src/psi_comp/rbf_psi_gpucomp.py b/GPy/kern/src/psi_comp/rbf_psi_gpucomp.py
index 4eda1b32..6ffd414b 100644
--- a/GPy/kern/src/psi_comp/rbf_psi_gpucomp.py
+++ b/GPy/kern/src/psi_comp/rbf_psi_gpucomp.py
@@ -324,7 +324,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
         except:
             return self.fall_back.psicomputations(kern, Z, variational_posterior, return_psi2_n)
 
-    @Cache_this(limit=10, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def _psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
         """
         Z - MxQ
@@ -369,7 +369,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
         except:
             return self.fall_back.psiDerivativecomputations(kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)
 
-    @Cache_this(limit=10, ignore_args=(0,2,3,4))
+    @Cache_this(limit=3, ignore_args=(0,2,3,4))
     def _psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         # resolve the requirement of dL_dpsi2 to be symmetric
         if len(dL_dpsi2.shape)==2: dL_dpsi2 = (dL_dpsi2+dL_dpsi2.T)/2
diff --git a/GPy/kern/src/psi_comp/ssrbf_psi_comp.py b/GPy/kern/src/psi_comp/ssrbf_psi_comp.py
index 10ea95e4..ba057d4f 100644
--- a/GPy/kern/src/psi_comp/ssrbf_psi_comp.py
+++ b/GPy/kern/src/psi_comp/ssrbf_psi_comp.py
@@ -88,7 +88,7 @@ try:
         return psi0,psi1,psi2,psi2n
 
     from GPy.util.caching import Cacher
-    psicomputations = Cacher(_psicomputations, limit=1)
+    psicomputations = Cacher(_psicomputations, limit=3)
 
     def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
         ARD = (len(lengthscale)!=1)
diff --git a/GPy/kern/src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/src/psi_comp/ssrbf_psi_gpucomp.py
index 04063dcc..488e9e22 100644
--- a/GPy/kern/src/psi_comp/ssrbf_psi_gpucomp.py
+++ b/GPy/kern/src/psi_comp/ssrbf_psi_gpucomp.py
@@ -373,7 +373,7 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
     def get_dimensions(self, Z, variational_posterior):
         return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1]
 
-    @Cache_this(limit=1, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
         """
         Z - MxQ
@@ -407,7 +407,7 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
         else:
             return psi0, psi1_gpu.get(), psi2_gpu.get()
 
-    @Cache_this(limit=1, ignore_args=(0,2,3,4))
+    @Cache_this(limit=3, ignore_args=(0,2,3,4))
     def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         variance, lengthscale = kern.variance, kern.lengthscale
         from ....util.linalg_gpu import sum_axis
diff --git a/GPy/kern/src/standard_periodic.py b/GPy/kern/src/standard_periodic.py
index bc27107e..e0c76d0c 100644
--- a/GPy/kern/src/standard_periodic.py
+++ b/GPy/kern/src/standard_periodic.py
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
-
-# Copyright (c) 2014, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2015, Alex Grigorevskiy
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 """
 The standard periodic kernel which mentioned in:
@@ -9,7 +8,7 @@ The standard periodic kernel which mentioned in:
     The MIT Press, 2005.
 
-[2] Introduction to Gaussian processes. D. J. C. MacKay. In C. M. Bishop, editor,
+[2] Introduction to Gaussian processes. D. J. C. MacKay. In C. M. Bishop, editor,
     Neural Networks and Machine Learning, pages 133-165. Springer, 1998.
 """
@@ -25,56 +24,56 @@ class StdPeriodic(Kern):
 
     .. math::
 
-       k(x,y) = \theta_1 \exp \left[ - \frac{1}{2} {}\sum_{i=1}^{input\_dim}
-       \left( \frac{\sin(\frac{\pi}{\lambda_i} (x_i - y_i) )}{l_i} \right)^2 \right] }
+       k(x,y) = \theta_1 \exp \left[ - \frac{1}{2} \sum_{i=1}^{input\_dim}
+       \left( \frac{\sin(\frac{\pi}{T_i} (x_i - y_i) )}{l_i} \right)^2 \right] }
 
     :param input_dim: the number of input dimensions
     :type input_dim: int
     :param variance: the variance :math:`\theta_1` in the formula above
     :type variance: float
-    :param wavelength: the vector of wavelengths :math:`\lambda_i`. If None then 1.0 is assumed.
-    :type wavelength: array or list of the appropriate size (or float if there is only one wavelength parameter)
+    :param period: the vector of periods :math:`T_i`. If None then 1.0 is assumed.
+    :type period: array or list of the appropriate size (or float if there is only one period parameter)
     :param lengthscale: the vector of lengthscale :math:`\l_i`. If None then 1.0 is assumed.
     :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
-    :param ARD1: Auto Relevance Determination with respect to wavelength.
-        If equal to "False" one single wavelength parameter :math:`\lambda_i` for
-        each dimension is assumed, otherwise there is one lengthscale
-        parameter per dimension.
+    :param ARD1: Auto Relevance Determination with respect to period.
+        If equal to "False" one single period parameter :math:`T_i` for
+        each dimension is assumed, otherwise there is one period
+        parameter per dimension.
     :type ARD1: Boolean
-    :param ARD2: Auto Relevance Determination with respect to lengthscale.
-        If equal to "False" one single wavelength parameter :math:`l_i` for
-        each dimension is assumed, otherwise there is one lengthscale
-        parameter per dimension.
+    :param ARD2: Auto Relevance Determination with respect to lengthscale.
+        If equal to "False" one single lengthscale parameter :math:`l_i` for
+        each dimension is assumed, otherwise there is one lengthscale
+        parameter per dimension.
     :type ARD2: Boolean
     :param active_dims: indices of dimensions which are used in the computation of the kernel
-    :type wavelength: array or list of the appropriate size
+    :type active_dims: array or list of the appropriate size
     :param name: Name of the kernel for output
     :type String
     :param useGPU: whether of not use GPU
     :type Boolean
     """
-
-    def __init__(self, input_dim, variance=1., wavelength=None, lengthscale=None, ARD1=False, ARD2=False, active_dims=None, name='std_periodic',useGPU=False):
+
+    def __init__(self, input_dim, variance=1., period=None, lengthscale=None, ARD1=False, ARD2=False, active_dims=None, name='std_periodic',useGPU=False):
         super(StdPeriodic, self).__init__(input_dim, active_dims, name, useGPU=useGPU)
         self.input_dim = input_dim
-        self.ARD1 = ARD1 # correspond to wavelengths
+        self.ARD1 = ARD1 # correspond to periods
         self.ARD2 = ARD2 # correspond to lengthscales
-
+
         self.name = name
-
+
         if self.ARD1 == False:
-            if wavelength is not None:
-                wavelength = np.asarray(wavelength)
-                assert wavelength.size == 1, "Only one wavelength needed for non-ARD kernel"
+            if period is not None:
+                period = np.asarray(period)
+                assert period.size == 1, "Only one period needed for non-ARD kernel"
             else:
-                wavelength = np.ones(1)
+                period = np.ones(1)
         else:
-            if wavelength is not None:
-                wavelength = np.asarray(wavelength)
-                assert wavelength.size == input_dim, "bad number of wavelengths"
+            if period is not None:
+                period = np.asarray(period)
+                assert period.size == input_dim, "bad number of periods"
             else:
-                wavelength = np.ones(input_dim)
-
+                period = np.ones(input_dim)
+
         if self.ARD2 == False:
             if lengthscale is not None:
                 lengthscale = np.asarray(lengthscale)
@@ -87,33 +86,33 @@ class StdPeriodic(Kern):
                 assert lengthscale.size == input_dim, "bad number of lengthscales"
             else:
                 lengthscale = np.ones(input_dim)
-
+
         self.variance = Param('variance', variance, Logexp())
         assert self.variance.size==1, "Variance size must be one"
-        self.wavelengths = Param('wavelengths', wavelength, Logexp())
-        self.lengthscales = Param('lengthscales', lengthscale, Logexp())
-
-        self.link_parameters(self.variance, self.wavelengths, self.lengthscales)
+        self.period = Param('period', period, Logexp())
+        self.lengthscale = Param('lengthscale', lengthscale, Logexp())
+
+        self.link_parameters(self.variance, self.period, self.lengthscale)
 
     def parameters_changed(self):
         """
-        This functions deals as a callback for each optimization iteration.
+        This function acts as a callback for each optimization iteration.
         If one optimization step was successfull and the parameters
-        this callback function will be called to be able to update any
+        this callback function will be called to be able to update any
         precomputations for the kernel.
""" - + pass - - + + def K(self, X, X2=None): """Compute the covariance matrix between X and X2.""" - if X2 is None: + if X2 is None: X2 = X - - base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths - exp_dist = np.exp( -0.5* np.sum( np.square( np.sin( base ) / self.lengthscales ), axis = -1 ) ) - + + base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.period + exp_dist = np.exp( -0.5* np.sum( np.square( np.sin( base ) / self.lengthscale ), axis = -1 ) ) + return self.variance * exp_dist @@ -125,42 +124,42 @@ class StdPeriodic(Kern): def update_gradients_full(self, dL_dK, X, X2=None): """derivative of the covariance matrix with respect to the parameters.""" - if X2 is None: + if X2 is None: X2 = X - - base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths - - sin_base = np.sin( base ) - exp_dist = np.exp( -0.5* np.sum( np.square( sin_base / self.lengthscales ), axis = -1 ) ) - - dwl = self.variance * (1.0/np.square(self.lengthscales)) * sin_base*np.cos(base) * (base / self.wavelengths) - - dl = self.variance * np.square( sin_base) / np.power( self.lengthscales, 3) - - self.variance.gradient = np.sum(exp_dist * dL_dK) - #target[0] += np.sum( exp_dist * dL_dK) - - if self.ARD1: # different wavelengths - self.wavelengths.gradient = (dwl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0) - else: # same wavelengths - self.wavelengths.gradient = np.sum(dwl.sum(-1) * exp_dist * dL_dK) - + + base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.period + + sin_base = np.sin( base ) + exp_dist = np.exp( -0.5* np.sum( np.square( sin_base / self.lengthscale ), axis = -1 ) ) + + dwl = self.variance * (1.0/np.square(self.lengthscale)) * sin_base*np.cos(base) * (base / self.period) + + dl = self.variance * np.square( sin_base) / np.power( self.lengthscale, 3) + + self.variance.gradient = np.sum(exp_dist * dL_dK) + #target[0] += np.sum( exp_dist * dL_dK) + + if self.ARD1: # different periods + self.period.gradient = (dwl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0) + else: # same period + self.period.gradient = np.sum(dwl.sum(-1) * exp_dist * dL_dK) + if self.ARD2: # different lengthscales - self.lengthscales.gradient = (dl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0) + self.lengthscale.gradient = (dl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0) else: # same lengthscales - self.lengthscales.gradient = np.sum(dl.sum(-1) * exp_dist * dL_dK) - + self.lengthscale.gradient = np.sum(dl.sum(-1) * exp_dist * dL_dK) + def update_gradients_diag(self, dL_dKdiag, X): """derivative of the diagonal of the covariance matrix with respect to the parameters.""" self.variance.gradient = np.sum(dL_dKdiag) - self.wavelengths.gradient = 0 - self.lengthscales.gradient = 0 + self.period.gradient = 0 + self.lengthscale.gradient = 0 # def gradients_X(self, dL_dK, X, X2=None): # """derivative of the covariance matrix with respect to X.""" -# +# # raise NotImplemented("Periodic kernel: dK_dX not implemented") # # def gradients_X_diag(self, dL_dKdiag, X): -# +# # raise NotImplemented("Periodic kernel: dKdiag_dX not implemented") diff --git a/GPy/kern/src/static.py b/GPy/kern/src/static.py index dc6fe7a0..18f7605f 100644 --- a/GPy/kern/src/static.py +++ b/GPy/kern/src/static.py @@ -81,6 +81,52 @@ class White(Static): def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): self.variance.gradient = dL_dpsi0.sum() +class WhiteHeteroscedastic(Static): + def __init__(self, input_dim, num_data, variance=1., active_dims=None, 
diff --git a/GPy/kern/src/static.py b/GPy/kern/src/static.py
index dc6fe7a0..18f7605f 100644
--- a/GPy/kern/src/static.py
+++ b/GPy/kern/src/static.py
@@ -81,6 +81,52 @@ class White(Static):
     def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         self.variance.gradient = dL_dpsi0.sum()
 
+class WhiteHeteroscedastic(Static):
+    def __init__(self, input_dim, num_data, variance=1., active_dims=None, name='white_hetero'):
+        """
+        A heteroscedastic White kernel (nugget/noise).
+        It defines one variance (nugget) per input sample.
+
+        Prediction excludes any noise learnt by this Kernel, so be careful using this kernel.
+
+        You can plot the errors learnt by this kernel with something like:
+            plt.errorbar(m.X, m.Y, yerr=2*np.sqrt(m.kern.white.variance))
+        """
+        super(Static, self).__init__(input_dim, active_dims, name)
+        self.variance = Param('variance', np.ones(num_data) * variance, Logexp())
+        self.link_parameters(self.variance)
+
+    def Kdiag(self, X):
+        if X.shape[0] == self.variance.shape[0]:
+            # If the input has the same number of samples as
+            # the number of variances, we return the variances
+            return self.variance
+        return 0.
+
+    def K(self, X, X2=None):
+        if X2 is None and X.shape[0] == self.variance.shape[0]:
+            return np.eye(X.shape[0]) * self.variance
+        else:
+            return 0.
+
+    def psi2(self, Z, variational_posterior):
+        return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)
+
+    def psi2n(self, Z, variational_posterior):
+        return np.zeros((1, Z.shape[0], Z.shape[0]), dtype=np.float64)
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        if X2 is None:
+            self.variance.gradient = np.diagonal(dL_dK)
+        else:
+            self.variance.gradient = 0.
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        self.variance.gradient = dL_dKdiag
+
+    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        self.variance.gradient = dL_dpsi0
+
 class Bias(Static):
     def __init__(self, input_dim, variance=1., active_dims=None, name='bias'):
         super(Bias, self).__init__(input_dim, variance, active_dims, name)
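The new WhiteHeteroscedastic kernel learns one noise variance per training point and contributes nothing at test time (K and Kdiag return 0 when the input size differs from num_data). A minimal regression sketch — fixing the Gaussian likelihood so the kernel carries the noise is an assumption about usage, not part of the patch:

    import numpy as np
    import GPy

    N = 30
    X = np.linspace(0., 10., N)[:, None]
    Y = np.sin(X) + np.random.randn(N, 1) * np.linspace(0.05, 0.5, N)[:, None]

    k = GPy.kern.RBF(1) + GPy.kern.WhiteHeteroscedastic(1, num_data=N)
    m = GPy.models.GPRegression(X, Y, kernel=k)
    m.likelihood.variance.fix(1e-6)  # let the kernel carry the per-point noise
    m.optimize()
    print(m.kern.white_hetero.variance)  # learnt per-point noise levels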
diff --git a/GPy/kern/src/stationary.py b/GPy/kern/src/stationary.py
index 7b4c3625..286edcc2 100644
--- a/GPy/kern/src/stationary.py
+++ b/GPy/kern/src/stationary.py
@@ -81,11 +81,11 @@ class Stationary(Kern):
     def dK_dr(self, r):
         raise NotImplementedError("implement derivative of the covariance function wrt r to use this class")
 
-    @Cache_this(limit=20, ignore_args=())
+    @Cache_this(limit=3, ignore_args=())
     def dK2_drdr(self, r):
         raise NotImplementedError("implement second derivative of covariance wrt r to use this method")
 
-    @Cache_this(limit=5, ignore_args=())
+    @Cache_this(limit=3, ignore_args=())
     def K(self, X, X2=None):
         """
         Kernel function applied on inputs X and X2.
@@ -99,6 +99,9 @@ class Stationary(Kern):
 
     @Cache_this(limit=3, ignore_args=())
     def dK_dr_via_X(self, X, X2):
+        """
+        compute the derivative of K wrt r, evaluated at the scaled distance r(X, X2)
+        """
         #a convenience function, so we can cache dK_dr
         return self.dK_dr(self._scaled_dist(X, X2))
diff --git a/GPy/kern/src/trunclinear.py b/GPy/kern/src/trunclinear.py
index 3a35744f..bb94ae73 100644
--- a/GPy/kern/src/trunclinear.py
+++ b/GPy/kern/src/trunclinear.py
@@ -54,12 +54,12 @@ class TruncLinear(Kern):
         self.add_parameter(self.variances)
         self.add_parameter(self.delta)
 
-    @Cache_this(limit=2)
+    @Cache_this(limit=3)
     def K(self, X, X2=None):
         XX = self.variances*self._product(X, X2)
         return XX.sum(axis=-1)
 
-    @Cache_this(limit=2)
+    @Cache_this(limit=3)
     def _product(self, X, X2=None):
         if X2 is None:
             X2 = X
@@ -149,12 +149,12 @@ class TruncLinear_inf(Kern):
         self.add_parameter(self.variances)
 
-#    @Cache_this(limit=2)
+#    @Cache_this(limit=3)
     def K(self, X, X2=None):
         tmp = self._product(X, X2)
         return (self.variances*tmp).sum(axis=-1)
 
-#    @Cache_this(limit=2)
+#    @Cache_this(limit=3)
     def _product(self, X, X2=None):
         if X2 is None:
             X2 = X
diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py
index 86638eb9..aa8ac4ea 100644
--- a/GPy/models/bayesian_gplvm.py
+++ b/GPy/models/bayesian_gplvm.py
@@ -61,7 +61,7 @@ class BayesianGPLVM(SparseGP_MPI):
             else:
                 from ..inference.latent_function_inference.var_dtc import VarDTC
                 self.logger.debug("creating inference_method var_dtc")
-                inference_method = VarDTC(limit=1 if not missing_data else Y.shape[1])
+                inference_method = VarDTC(limit=3 if not missing_data else Y.shape[1])
         if isinstance(inference_method,VarDTC_minibatch):
             inference_method.mpi_comm = mpi_comm
diff --git a/GPy/models/bayesian_gplvm_minibatch.py b/GPy/models/bayesian_gplvm_minibatch.py
index 73324386..2a457a21 100644
--- a/GPy/models/bayesian_gplvm_minibatch.py
+++ b/GPy/models/bayesian_gplvm_minibatch.py
@@ -40,12 +40,13 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
             Z = np.random.permutation(X.copy())[:num_inducing]
         assert Z.shape[1] == X.shape[1]
 
-        if X_variance == False:
+        if X_variance is False:
             self.logger.info('no variance on X, activating sparse GPLVM')
             X = Param("latent space", X)
-        elif X_variance is None:
-            self.logger.info("initializing latent space variance ~ uniform(0,.1)")
-            X_variance = np.random.uniform(0,.1,X.shape)
+        else:
+            if X_variance is None:
+                self.logger.info("initializing latent space variance ~ uniform(0,.1)")
+                X_variance = np.random.uniform(0,.1,X.shape)
             self.variational_prior = NormalPrior()
             X = NormalPosterior(X, X_variance)
 
@@ -61,7 +62,7 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
         if inference_method is None:
             from ..inference.latent_function_inference.var_dtc import VarDTC
             self.logger.debug("creating inference_method var_dtc")
-            inference_method = VarDTC(limit=1 if not missing_data else Y.shape[1])
+            inference_method = VarDTC(limit=3 if not missing_data else Y.shape[1])
 
         super(BayesianGPLVMMiniBatch,self).__init__(X, Y, Z, kernel, likelihood=likelihood,
                                            name=name, inference_method=inference_method,
@@ -71,13 +72,13 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
         self.X = X
         self.link_parameter(self.X, 0)
 
-    def set_X_gradients(self, X, X_grad):
-        """Set the gradients of the posterior distribution of X in its specific form."""
-        X.mean.gradient, X.variance.gradient = X_grad
+    #def set_X_gradients(self, X, X_grad):
+    #    """Set the gradients of the posterior distribution of X in its specific form."""
+    #    X.mean.gradient, X.variance.gradient = X_grad
 
-    def get_X_gradients(self, X):
-        """Get the gradients of the posterior distribution of X in its specific form."""
-        return X.mean.gradient, X.variance.gradient
+    #def get_X_gradients(self, X):
+    #    """Get the gradients of the posterior distribution of X in its specific form."""
+    #    return X.mean.gradient, X.variance.gradient
 
     def _outer_values_update(self, full_values):
         """
@@ -106,7 +107,7 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
         super(BayesianGPLVMMiniBatch,self).parameters_changed()
 
         kl_fctr = self.kl_factr
-        if kl_fctr > 0:
+        if kl_fctr > 0 and self.has_uncertain_inputs():
             Xgrad = self.X.gradient.copy()
             self.X.gradient[:] = 0
             self.variational_prior.update_gradients_KL(self.X)
@@ -122,8 +123,8 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
 
             if self.missing_data or not self.stochastics:
                 self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)
-            elif self.stochastics:
+            else: #self.stochastics is given:
                 d = self.output_dim
                 self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)*self.stochastics.batchsize/d
 
-        self._Xgrad = self.X.gradient.copy()
\ No newline at end of file
+        self._Xgrad = self.X.gradient.copy()
diff --git a/GPy/models/gplvm.py b/GPy/models/gplvm.py
index 6416847c..cdc0ab47 100644
--- a/GPy/models/gplvm.py
+++ b/GPy/models/gplvm.py
@@ -41,4 +41,4 @@ class GPLVM(GP):
 
     def parameters_changed(self):
         super(GPLVM, self).parameters_changed()
-        self.X.gradient = self.kern.gradients_X(self.grad_dict['dL_dK'], self.X, None)
\ No newline at end of file
+        self.X.gradient = self.kern.gradients_X(self.grad_dict['dL_dK'], self.X, None)
diff --git a/GPy/models/mrd.py b/GPy/models/mrd.py
index be28d1a5..547f096f 100644
--- a/GPy/models/mrd.py
+++ b/GPy/models/mrd.py
@@ -5,14 +5,14 @@ import numpy as np
 import itertools, logging
 
 from ..kern import Kern
-from GPy.core.parameterization.variational import NormalPrior
+from ..core.parameterization.variational import NormalPrior
 from ..core.parameterization import Param
 from paramz import ObsAr
 from ..inference.latent_function_inference.var_dtc import VarDTC
 from ..inference.latent_function_inference import InferenceMethodList
 from ..likelihoods import Gaussian
 from ..util.initialization import initialize_latent
-from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch
+from ..models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch
 
 class MRD(BayesianGPLVMMiniBatch):
     """
@@ -215,40 +215,6 @@ class MRD(BayesianGPLVMMiniBatch):
             Z = np.random.randn(self.num_inducing, self.input_dim) * X.var()
         return Z
 
-    def _handle_plotting(self, fignum, axes, plotf, sharex=False, sharey=False):
-        import matplotlib.pyplot as plt
-        if axes is None:
-            fig = plt.figure(num=fignum)
-        sharex_ax = None
-        sharey_ax = None
-        plots = []
-        for i, g in enumerate(self.bgplvms):
-            try:
-                if sharex:
-                    sharex_ax = ax # @UndefinedVariable
-                    sharex = False # dont set twice
-                if sharey:
-                    sharey_ax = ax # @UndefinedVariable
-                    sharey = False # dont set twice
-            except:
-                pass
-            if axes is None:
-                ax = fig.add_subplot(1, len(self.bgplvms), i + 1, sharex=sharex_ax, sharey=sharey_ax)
-            elif isinstance(axes, (tuple, list, np.ndarray)):
-                ax = axes[i]
-            else:
-                raise ValueError("Need one axes per latent dimension input_dim")
-            plots.append(plotf(i, g, ax))
-            if sharey_ax is not None:
-                plt.setp(ax.get_yticklabels(), visible=False)
-        plt.draw()
-        if axes is None:
-            try:
-                fig.tight_layout()
-            except:
-                pass
-        return plots
-
     def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None, Yindex=0):
         """
         Prediction for data set Yindex[default=0].
@@ -270,59 +236,50 @@ class MRD(BayesianGPLVMMiniBatch):
 #                                  sharex=sharex, sharey=sharey)
 #        return fig
 
-    def plot_scales(self, fignum=None, ax=None, titles=None, sharex=False, sharey=True, *args, **kwargs):
+    def plot_scales(self, titles=None, fig_kwargs={}, **kwargs):
         """
-
-        TODO: Explain other parameters
+        Plot input sensitivity for all datasets, to see which input dimensions are
+        significant for which dataset.
 
         :param titles: titles for axes of datasets
+
+        kwargs go into plot_ARD for each kernel.
         """
+        from ..plotting import plotting_library as pl
+
         if titles is None:
             titles = [r'${}$'.format(name) for name in self.names]
-        ymax = reduce(max, [np.ceil(max(g.kern.input_sensitivity())) for g in self.bgplvms])
-        def plotf(i, g, ax):
-            #ax.set_ylim([0,ymax])
-            return g.kern.plot_ARD(ax=ax, title=titles[i], *args, **kwargs)
-        fig = self._handle_plotting(fignum, ax, plotf, sharex=sharex, sharey=sharey)
-        return fig
+
+        M = len(self.bgplvms)
+        fig = pl().figure(rows=1, cols=M, **fig_kwargs)
+        for c in range(M):
+            canvas = self.bgplvms[c].kern.plot_ARD(title=titles[c], figure=fig, col=c+1, **kwargs)
+        return canvas
 
     def plot_latent(self, labels=None, which_indices=None,
-                    resolution=50, ax=None, marker='o', s=40,
-                    fignum=None, plot_inducing=True, legend=True,
+                    resolution=60, legend=True,
                     plot_limits=None,
-                    aspect='auto', updates=False, predict_kwargs={}, imshow_kwargs={}):
+                    updates=False,
+                    kern=None, marker='<>^vsd',
+                    num_samples=1000, projection='2d',
+                    predict_kwargs={},
+                    scatter_kwargs=None, **imshow_kwargs):
         """
         see plotting.matplot_dep.dim_reduction_plots.plot_latent
         if predict_kwargs is None, will plot latent spaces for 0th dataset (and kernel), otherwise give
         predict_kwargs=dict(Yindex='index') for plotting only the latent space of dataset with 'index'.
         """
-        import sys
-        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
-        from matplotlib import pyplot as plt
-        from ..plotting.matplot_dep import dim_reduction_plots
+        from ..plotting.gpy_plot.latent_plots import plot_latent
+
         if "Yindex" not in predict_kwargs:
             predict_kwargs['Yindex'] = 0
+
         Yindex = predict_kwargs['Yindex']
-        if ax is None:
-            fig = plt.figure(num=fignum)
-            ax = fig.add_subplot(111)
-        else:
-            fig = ax.figure
+
         self.kern = self.bgplvms[Yindex].kern
         self.likelihood = self.bgplvms[Yindex].likelihood
-        plot = dim_reduction_plots.plot_latent(self, labels, which_indices,
-                                        resolution, ax, marker, s,
-                                        fignum, plot_inducing, legend,
-                                        plot_limits, aspect, updates, predict_kwargs, imshow_kwargs)
-        ax.set_title(self.bgplvms[Yindex].name)
-        try:
-            fig.tight_layout()
-        except:
-            pass
-        return plot
+        return plot_latent(self, labels, which_indices, resolution, legend, plot_limits, updates, kern, marker, num_samples, projection, scatter_kwargs)
 
     def __getstate__(self):
         state = super(MRD, self).__getstate__()
diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py
index 73393d85..d1c252f8 100644
--- a/GPy/models/sparse_gp_minibatch.py
+++ b/GPy/models/sparse_gp_minibatch.py
@@ -41,11 +41,12 @@ class SparseGPMiniBatch(SparseGP):
     def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
                  name='sparse gp', Y_metadata=None, normalizer=False,
                  missing_data=False, stochastic=False, batchsize=1):
+        self._update_stochastics = False
 
         # pick a sensible inference method
         if inference_method is None:
             if isinstance(likelihood, likelihoods.Gaussian):
-                inference_method = var_dtc.VarDTC(limit=1 if not missing_data else Y.shape[1])
+                inference_method = var_dtc.VarDTC(limit=3 if not missing_data else Y.shape[1])
             else:
                 #inference_method = ??
                 raise NotImplementedError("what to do what to do?")
@@ -73,7 +74,14 @@ class SparseGPMiniBatch(SparseGP):
             logger.info("Adding Z as parameter")
             self.link_parameter(self.Z, index=0)
         self.posterior = None
-
+
+    def optimize(self, optimizer=None, start=None, **kwargs):
+        try:
+            self._update_stochastics = True
+            SparseGP.optimize(self, optimizer=optimizer, start=start, **kwargs)
+        finally:
+            self._update_stochastics = False
+
     def has_uncertain_inputs(self):
         return isinstance(self.X, VariationalPosterior)
@@ -226,16 +234,16 @@ class SparseGPMiniBatch(SparseGP):
         woodbury_inv = self.posterior._woodbury_inv
         woodbury_vector = self.posterior._woodbury_vector
 
-        if not self.stochastics:
-            m_f = lambda i: "Inference with missing_data: {: >7.2%}".format(float(i+1)/self.output_dim)
-            message = m_f(-1)
-            print(message, end=' ')
+        #if not self.stochastics:
+        #    m_f = lambda i: "Inference with missing_data: {: >7.2%}".format(float(i+1)/self.output_dim)
+        #    message = m_f(-1)
+        #    print(message, end=' ')
 
         for d, ninan in self.stochastics.d:
-            if not self.stochastics:
-                print(' '*(len(message)) + '\r', end=' ')
-                message = m_f(d)
-                print(message, end=' ')
+            #if not self.stochastics:
+            #    print(' '*(len(message)) + '\r', end=' ')
+            #    message = m_f(d)
+            #    print(message, end=' ')
 
             psi0ni = self.psi0[ninan]
             psi1ni = self.psi1[ninan]
self._update_stochastics: + self.stochastics.do_stochastics() self._outer_loop_without_missing_data() else: self.posterior, self._log_marginal_likelihood, self.grad_dict = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata) diff --git a/GPy/models/sparse_gp_regression.py b/GPy/models/sparse_gp_regression.py index 31bde23d..b1180511 100644 --- a/GPy/models/sparse_gp_regression.py +++ b/GPy/models/sparse_gp_regression.py @@ -62,4 +62,4 @@ class SparseGPRegression(SparseGP_MPI): if isinstance(self.inference_method,VarDTC_minibatch): update_gradients_sparsegp(self, mpi_comm=self.mpi_comm) else: - super(SparseGPRegression, self).parameters_changed() \ No newline at end of file + super(SparseGPRegression, self).parameters_changed() diff --git a/GPy/models/sparse_gplvm.py b/GPy/models/sparse_gplvm.py index 22852d93..53696b45 100644 --- a/GPy/models/sparse_gplvm.py +++ b/GPy/models/sparse_gplvm.py @@ -4,6 +4,7 @@ import sys from .sparse_gp_regression import SparseGPRegression +from ..core import Param class SparseGPLVM(SparseGPRegression): """ @@ -21,7 +22,9 @@ class SparseGPLVM(SparseGPRegression): if X is None: from ..util.initialization import initialize_latent X, fracs = initialize_latent(init, input_dim, Y) + X = Param('latent space', X) SparseGPRegression.__init__(self, X, Y, kernel=kernel, num_inducing=num_inducing) + self.link_parameter(self.X, 0) def parameters_changed(self): super(SparseGPLVM, self).parameters_changed() diff --git a/GPy/plotting/Tango.py b/GPy/plotting/Tango.py index eb943962..50460410 100644 --- a/GPy/plotting/Tango.py +++ b/GPy/plotting/Tango.py @@ -104,4 +104,4 @@ cdict_Alu = {'red' :((0./5,colorsRGB['Aluminium1'][0]/256.,colorsRGB['Aluminium1 (2./5,colorsRGB['Aluminium3'][2]/256.,colorsRGB['Aluminium3'][2]/256.), (3./5,colorsRGB['Aluminium4'][2]/256.,colorsRGB['Aluminium4'][2]/256.), (4./5,colorsRGB['Aluminium5'][2]/256.,colorsRGB['Aluminium5'][2]/256.), - (5./5,colorsRGB['Aluminium6'][2]/256.,colorsRGB['Aluminium6'][2]/256.))} \ No newline at end of file + (5./5,colorsRGB['Aluminium6'][2]/256.,colorsRGB['Aluminium6'][2]/256.))} diff --git a/GPy/plotting/__init__.py b/GPy/plotting/__init__.py index c46d5281..0bb91254 100644 --- a/GPy/plotting/__init__.py +++ b/GPy/plotting/__init__.py @@ -25,18 +25,66 @@ def change_plotting_library(lib): current_lib[0] = PlotlyPlots() if lib == 'none': current_lib[0] = None + inject_plotting() #=========================================================================== except (ImportError, NameError): config.set('plotting', 'library', 'none') + raise import warnings warnings.warn(ImportWarning("You spevified {} in your configuration, but is not available. Install newest version of {} for plotting".format(lib, lib))) -from ..util.config import config, NoOptionError -try: - lib = config.get('plotting', 'library') - change_plotting_library(lib) -except NoOptionError: - print("No plotting library was specified in config file. \n{}".format(error_suggestion)) +def inject_plotting(): + if current_lib[0] is not None: + # Inject the plots into classes here: + + # Already converted to new style: + from . 
import gpy_plot + + from ..core import GP + GP.plot_data = gpy_plot.data_plots.plot_data + GP.plot_data_error = gpy_plot.data_plots.plot_data_error + GP.plot_errorbars_trainset = gpy_plot.data_plots.plot_errorbars_trainset + GP.plot_mean = gpy_plot.gp_plots.plot_mean + GP.plot_confidence = gpy_plot.gp_plots.plot_confidence + GP.plot_density = gpy_plot.gp_plots.plot_density + GP.plot_samples = gpy_plot.gp_plots.plot_samples + GP.plot = gpy_plot.gp_plots.plot + GP.plot_f = gpy_plot.gp_plots.plot_f + GP.plot_magnification = gpy_plot.latent_plots.plot_magnification + + from ..core import SparseGP + SparseGP.plot_inducing = gpy_plot.data_plots.plot_inducing + + from ..models import GPLVM, BayesianGPLVM, bayesian_gplvm_minibatch, SSGPLVM, SSMRD + GPLVM.plot_latent = gpy_plot.latent_plots.plot_latent + GPLVM.plot_scatter = gpy_plot.latent_plots.plot_latent_scatter + GPLVM.plot_inducing = gpy_plot.latent_plots.plot_latent_inducing + GPLVM.plot_steepest_gradient_map = gpy_plot.latent_plots.plot_steepest_gradient_map + BayesianGPLVM.plot_latent = gpy_plot.latent_plots.plot_latent + BayesianGPLVM.plot_scatter = gpy_plot.latent_plots.plot_latent_scatter + BayesianGPLVM.plot_inducing = gpy_plot.latent_plots.plot_latent_inducing + BayesianGPLVM.plot_steepest_gradient_map = gpy_plot.latent_plots.plot_steepest_gradient_map + bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch.plot_latent = gpy_plot.latent_plots.plot_latent + bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch.plot_scatter = gpy_plot.latent_plots.plot_latent_scatter + bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch.plot_inducing = gpy_plot.latent_plots.plot_latent_inducing + bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch.plot_steepest_gradient_map = gpy_plot.latent_plots.plot_steepest_gradient_map + SSGPLVM.plot_latent = gpy_plot.latent_plots.plot_latent + SSGPLVM.plot_scatter = gpy_plot.latent_plots.plot_latent_scatter + SSGPLVM.plot_inducing = gpy_plot.latent_plots.plot_latent_inducing + SSGPLVM.plot_steepest_gradient_map = gpy_plot.latent_plots.plot_steepest_gradient_map + + from ..kern import Kern + Kern.plot_covariance = gpy_plot.kernel_plots.plot_covariance + def deprecate_plot(self, *args, **kwargs): + import warnings + warnings.warn(DeprecationWarning('Kern.plot is being deprecated and will not be available in the 1.0 release. Use Kern.plot_covariance instead')) + return self.plot_covariance(*args, **kwargs) + Kern.plot = deprecate_plot + Kern.plot_ARD = gpy_plot.kernel_plots.plot_ARD + + from ..inference.optimization import Optimizer + Optimizer.plot = gpy_plot.inference_plots.plot_optimizer + # Variational plot! def plotting_library(): if current_lib[0] is None: @@ -53,54 +101,10 @@ def show(figure, **kwargs): """ return plotting_library().show_canvas(figure, **kwargs) -if config.get('plotting', 'library') is not 'none': - # Inject the plots into classes here: - # Already converted to new style: - from . 
import gpy_plot - - from ..core import GP - GP.plot_data = gpy_plot.data_plots.plot_data - GP.plot_data_error = gpy_plot.data_plots.plot_data_error - GP.plot_errorbars_trainset = gpy_plot.data_plots.plot_errorbars_trainset - GP.plot_mean = gpy_plot.gp_plots.plot_mean - GP.plot_confidence = gpy_plot.gp_plots.plot_confidence - GP.plot_density = gpy_plot.gp_plots.plot_density - GP.plot_samples = gpy_plot.gp_plots.plot_samples - GP.plot = gpy_plot.gp_plots.plot - GP.plot_f = gpy_plot.gp_plots.plot_f - GP.plot_magnification = gpy_plot.latent_plots.plot_magnification - - from ..core import SparseGP - SparseGP.plot_inducing = gpy_plot.data_plots.plot_inducing - - from ..models import GPLVM, BayesianGPLVM, bayesian_gplvm_minibatch, SSGPLVM, SSMRD - GPLVM.plot_latent = gpy_plot.latent_plots.plot_latent - GPLVM.plot_scatter = gpy_plot.latent_plots.plot_latent_scatter - GPLVM.plot_inducing = gpy_plot.latent_plots.plot_latent_inducing - GPLVM.plot_steepest_gradient_map = gpy_plot.latent_plots.plot_steepest_gradient_map - BayesianGPLVM.plot_latent = gpy_plot.latent_plots.plot_latent - BayesianGPLVM.plot_scatter = gpy_plot.latent_plots.plot_latent_scatter - BayesianGPLVM.plot_inducing = gpy_plot.latent_plots.plot_latent_inducing - BayesianGPLVM.plot_steepest_gradient_map = gpy_plot.latent_plots.plot_steepest_gradient_map - bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch.plot_latent = gpy_plot.latent_plots.plot_latent - bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch.plot_scatter = gpy_plot.latent_plots.plot_latent_scatter - bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch.plot_inducing = gpy_plot.latent_plots.plot_latent_inducing - bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch.plot_steepest_gradient_map = gpy_plot.latent_plots.plot_steepest_gradient_map - SSGPLVM.plot_latent = gpy_plot.latent_plots.plot_latent - SSGPLVM.plot_scatter = gpy_plot.latent_plots.plot_latent_scatter - SSGPLVM.plot_inducing = gpy_plot.latent_plots.plot_latent_inducing - SSGPLVM.plot_steepest_gradient_map = gpy_plot.latent_plots.plot_steepest_gradient_map - - from ..kern import Kern - Kern.plot_covariance = gpy_plot.kernel_plots.plot_covariance - def deprecate_plot(self, *args, **kwargs): - import warnings - warnings.warn(DeprecationWarning('Kern.plot is being deprecated and will not be available in the 1.0 release. Use Kern.plot_covariance instead')) - return self.plot_covariance(*args, **kwargs) - Kern.plot = deprecate_plot - Kern.plot_ARD = gpy_plot.kernel_plots.plot_ARD - - from ..inference.optimization import Optimizer - Optimizer.plot = gpy_plot.inference_plots.plot_optimizer - # Variational plot! +from ..util.config import config, NoOptionError +try: + lib = config.get('plotting', 'library') + change_plotting_library(lib) +except NoOptionError: + print("No plotting library was specified in config file. 
\n{}".format(error_suggestion)) diff --git a/GPy/plotting/gpy_plot/gp_plots.py b/GPy/plotting/gpy_plot/gp_plots.py index 7439bd9d..c6975326 100644 --- a/GPy/plotting/gpy_plot/gp_plots.py +++ b/GPy/plotting/gpy_plot/gp_plots.py @@ -91,7 +91,7 @@ def _plot_mean(self, canvas, helper_data, helper_prediction, if projection == '2d': update_not_existing_kwargs(kwargs, pl().defaults.meanplot_2d) # @UndefinedVariable plots = dict(gpmean=[pl().contour(canvas, x[:,0], y[0,:], - mu.reshape(resolution, resolution), + mu.reshape(resolution, resolution).T, levels=levels, label=label, **kwargs)]) elif projection == '3d': update_not_existing_kwargs(kwargs, pl().defaults.meanplot_3d) # @UndefinedVariable @@ -420,4 +420,4 @@ def _plot(self, canvas, plots, helper_data, helper_prediction, levels, plot_indu if helper_prediction[2] is not None: plots.update(_plot_samples(self, canvas, helper_data, helper_prediction, projection, "Samples")) - return plots \ No newline at end of file + return plots diff --git a/GPy/plotting/gpy_plot/kernel_plots.py b/GPy/plotting/gpy_plot/kernel_plots.py index 492754b2..1e80a8e8 100644 --- a/GPy/plotting/gpy_plot/kernel_plots.py +++ b/GPy/plotting/gpy_plot/kernel_plots.py @@ -33,7 +33,7 @@ from .. import Tango from .plot_util import update_not_existing_kwargs, helper_for_plot_data from ...kern.src.kern import Kern, CombinationKernel -def plot_ARD(kernel, filtering=None, legend=False, **kwargs): +def plot_ARD(kernel, filtering=None, legend=False, canvas=None, **kwargs): """ If an ARD kernel is present, plot a bar representation using matplotlib @@ -62,7 +62,11 @@ def plot_ARD(kernel, filtering=None, legend=False, **kwargs): bars = [] kwargs = update_not_existing_kwargs(kwargs, pl().defaults.ard) - canvas, kwargs = pl().new_canvas(xlim=(-.5, kernel._effective_input_dim-.5), xlabel='input dimension', ylabel='sensitivity', **kwargs) + + + if canvas is None: + canvas, kwargs = pl().new_canvas(xlim=(-.5, kernel._effective_input_dim-.5), xlabel='input dimension', ylabel='sensitivity', **kwargs) + for i in range(ard_params.shape[0]): if parts[i].name in filtering: c = Tango.nextMedium() @@ -96,7 +100,7 @@ def plot_covariance(kernel, x=None, label=None, """ X = np.ones((2, kernel._effective_input_dim)) * [[-3], [3]] _, free_dims, Xgrid, xx, yy, _, _, resolution = helper_for_plot_data(kernel, X, plot_limits, visible_dims, None, resolution) - + from numbers import Number if x is None: from ...kern.src.stationary import Stationary @@ -104,7 +108,7 @@ def plot_covariance(kernel, x=None, label=None, elif isinstance(x, Number): x = np.ones((1, kernel._effective_input_dim))*x K = kernel.K(Xgrid, x) - + if projection == '3d': xlabel = 'X[:,0]' ylabel = 'X[:,1]' @@ -136,4 +140,4 @@ def plot_covariance(kernel, x=None, label=None, return pl().add_to_canvas(canvas, plots) else: - raise NotImplementedError("Cannot plot a kernel with more than two input dimensions") \ No newline at end of file + raise NotImplementedError("Cannot plot a kernel with more than two input dimensions") diff --git a/GPy/plotting/gpy_plot/latent_plots.py b/GPy/plotting/gpy_plot/latent_plots.py index ed12ad9a..5427e013 100644 --- a/GPy/plotting/gpy_plot/latent_plots.py +++ b/GPy/plotting/gpy_plot/latent_plots.py @@ -147,6 +147,7 @@ def _plot_magnification(self, canvas, which_indices, Xgrid, def plot_function(x): Xtest_full = np.zeros((x.shape[0], Xgrid.shape[1])) Xtest_full[:, which_indices] = x + mf = self.predict_magnification(Xtest_full, kern=kern, mean=mean, covariance=covariance) return mf.reshape(resolution, resolution).T 
imshow_kwargs = update_not_existing_kwargs(imshow_kwargs, pl().defaults.magnification) @@ -163,7 +164,8 @@ def plot_magnification(self, labels=None, which_indices=None, updates=False, mean=True, covariance=True, kern=None, num_samples=1000, - scatter_kwargs=None, **imshow_kwargs): + scatter_kwargs=None, plot_scatter=True, + **imshow_kwargs): """ Plot the magnification factor of the GP on the inputs. This is the density of the GP as a gray scale. @@ -188,17 +190,20 @@ def plot_magnification(self, labels=None, which_indices=None, _, _, Xgrid, _, _, xmin, xmax, resolution = helper_for_plot_data(self, X, plot_limits, which_indices, None, resolution) canvas, imshow_kwargs = pl().new_canvas(xlim=(xmin[0], xmax[0]), ylim=(xmin[1], xmax[1]), xlabel='latent dimension %i' % input_1, ylabel='latent dimension %i' % input_2, **imshow_kwargs) - if (labels is not None): - legend = find_best_layout_for_subplots(len(np.unique(labels)))[1] - else: - labels = np.ones(self.num_data) - legend = False - scatters = _plot_latent_scatter(canvas, X, which_indices, labels, marker, num_samples, projection='2d', **scatter_kwargs or {}) - view = _plot_magnification(self, canvas, which_indices, Xgrid, xmin, xmax, resolution, updates, mean, covariance, kern, **imshow_kwargs) - retval = pl().add_to_canvas(canvas, dict(scatter=scatters, imshow=view), + plots = {} + if legend and plot_scatter: + if (labels is not None): + legend = find_best_layout_for_subplots(len(np.unique(labels)))[1] + else: + labels = np.ones(self.num_data) + legend = False + if plot_scatter: + plots['scatters'] = _plot_latent_scatter(canvas, X, which_indices, labels, marker, num_samples, projection='2d', **scatter_kwargs or {}) + plots['view'] = _plot_magnification(self, canvas, which_indices, Xgrid, xmin, xmax, resolution, updates, mean, covariance, kern, **imshow_kwargs) + retval = pl().add_to_canvas(canvas, plots, legend=legend, ) - _wait_for_updates(view, updates) + _wait_for_updates(plots['view'], updates) return retval @@ -211,7 +216,12 @@ def _plot_latent(self, canvas, which_indices, Xgrid, def plot_function(x): Xtest_full = np.zeros((x.shape[0], Xgrid.shape[1])) Xtest_full[:, which_indices] = x - mf = np.log(self.predict(Xtest_full, kern=kern)[1]) + mf = self.predict(Xtest_full, kern=kern)[1] + if mf.shape[1]==self.output_dim: + mf = mf.sum(-1) + else: + mf *= self.output_dim + mf = np.log(mf) return mf.reshape(resolution, resolution).T imshow_kwargs = update_not_existing_kwargs(imshow_kwargs, pl().defaults.latent) @@ -254,11 +264,12 @@ def plot_latent(self, labels=None, which_indices=None, _, _, Xgrid, _, _, xmin, xmax, resolution = helper_for_plot_data(self, X, plot_limits, which_indices, None, resolution) canvas, imshow_kwargs = pl().new_canvas(xlim=(xmin[0], xmax[0]), ylim=(xmin[1], xmax[1]), xlabel='latent dimension %i' % input_1, ylabel='latent dimension %i' % input_2, **imshow_kwargs) - if (labels is not None): - legend = find_best_layout_for_subplots(len(np.unique(labels)))[1] - else: - labels = np.ones(self.num_data) - legend = False + if legend: + if (labels is not None): + legend = find_best_layout_for_subplots(len(np.unique(labels)))[1] + else: + labels = np.ones(self.num_data) + legend = False scatters = _plot_latent_scatter(canvas, X, which_indices, labels, marker, num_samples, projection='2d', **scatter_kwargs or {}) view = _plot_latent(self, canvas, which_indices, Xgrid, xmin, xmax, resolution, updates, kern, **imshow_kwargs) retval = pl().add_to_canvas(canvas, dict(scatter=scatters, imshow=view), legend=legend) diff --git 
a/GPy/plotting/gpy_plot/plot_util.py b/GPy/plotting/gpy_plot/plot_util.py index 4e71a3bc..3089af20 100644 --- a/GPy/plotting/gpy_plot/plot_util.py +++ b/GPy/plotting/gpy_plot/plot_util.py @@ -380,4 +380,4 @@ def x_frame2D(X,plot_limits=None,resolution=None): resolution = resolution or 50 xx, yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution] Xnew = np.vstack((xx.flatten(),yy.flatten())).T - return Xnew, xx, yy, xmin, xmax \ No newline at end of file + return Xnew, xx, yy, xmin, xmax diff --git a/GPy/plotting/matplot_dep/__init__.py b/GPy/plotting/matplot_dep/__init__.py index e9d2395d..dbdbd7d5 100644 --- a/GPy/plotting/matplot_dep/__init__.py +++ b/GPy/plotting/matplot_dep/__init__.py @@ -18,4 +18,4 @@ from .util import align_subplot_array, align_subplots, fewerXticks, removeRightTicks, removeUpperTicks -from . import controllers \ No newline at end of file +from . import controllers, base_plots diff --git a/GPy/plotting/matplot_dep/base_plots.py b/GPy/plotting/matplot_dep/base_plots.py new file mode 100644 index 00000000..d9910f59 --- /dev/null +++ b/GPy/plotting/matplot_dep/base_plots.py @@ -0,0 +1,265 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) +from matplotlib import pyplot as plt +import numpy as np + +def ax_default(fignum, ax): + if ax is None: + fig = plt.figure(fignum) + ax = fig.add_subplot(111) + else: + fig = ax.figure + return fig, ax + +def meanplot(x, mu, color='#3300FF', ax=None, fignum=None, linewidth=2,**kw): + _, axes = ax_default(fignum, ax) + return axes.plot(x,mu,color=color,linewidth=linewidth,**kw) + +def gpplot(x, mu, lower, upper, edgecol='#3300FF', fillcol='#33CCFF', ax=None, fignum=None, **kwargs): + _, axes = ax_default(fignum, ax) + + mu = mu.flatten() + x = x.flatten() + lower = lower.flatten() + upper = upper.flatten() + + plots = [] + + #here's the mean + plots.append(meanplot(x, mu, edgecol, axes)) + + #here's the box + kwargs['linewidth']=0.5 + if not 'alpha' in kwargs.keys(): + kwargs['alpha'] = 0.3 + plots.append(axes.fill(np.hstack((x,x[::-1])),np.hstack((upper,lower[::-1])),color=fillcol,**kwargs)) + + #this is the edge: + plots.append(meanplot(x, upper,color=edgecol, linewidth=0.2, ax=axes)) + plots.append(meanplot(x, lower,color=edgecol, linewidth=0.2, ax=axes)) + + return plots + +def gradient_fill(x, percentiles, ax=None, fignum=None, **kwargs): + _, ax = ax_default(fignum, ax) + + plots = [] + + #here's the box + if 'linewidth' not in kwargs: + kwargs['linewidth'] = 0.5 + if not 'alpha' in kwargs.keys(): + kwargs['alpha'] = 1./(len(percentiles)) + + # pop where from kwargs + where = kwargs.pop('where') if 'where' in kwargs else None + # pop interpolate, which we actually do not do here!
+ if 'interpolate' in kwargs: kwargs.pop('interpolate') + + def pairwise(inlist): + l = len(inlist) + for i in range(int(np.ceil(l/2.))): + yield inlist[:][i], inlist[:][(l-1)-i] + + polycol = [] + for y1, y2 in pairwise(percentiles): + import matplotlib.mlab as mlab + # Handle united data, such as dates + ax._process_unit_info(xdata=x, ydata=y1) + ax._process_unit_info(ydata=y2) + + # Convert the arrays so we can work with them + from numpy import ma + x = ma.masked_invalid(ax.convert_xunits(x)) + y1 = ma.masked_invalid(ax.convert_yunits(y1)) + y2 = ma.masked_invalid(ax.convert_yunits(y2)) + + if y1.ndim == 0: + y1 = np.ones_like(x) * y1 + if y2.ndim == 0: + y2 = np.ones_like(x) * y2 + + if where is None: + where = np.ones(len(x), np.bool) + else: + where = np.asarray(where, np.bool) + + if not (x.shape == y1.shape == y2.shape == where.shape): + raise ValueError("Argument dimensions are incompatible") + + mask = reduce(ma.mask_or, [ma.getmask(a) for a in (x, y1, y2)]) + if mask is not ma.nomask: + where &= ~mask + + polys = [] + for ind0, ind1 in mlab.contiguous_regions(where): + xslice = x[ind0:ind1] + y1slice = y1[ind0:ind1] + y2slice = y2[ind0:ind1] + + if not len(xslice): + continue + + N = len(xslice) + X = np.zeros((2 * N + 2, 2), np.float) + + # the purpose of the next two lines is for when y2 is a + # scalar like 0 and we want the fill to go all the way + # down to 0 even if none of the y1 sample points do + start = xslice[0], y2slice[0] + end = xslice[-1], y2slice[-1] + + X[0] = start + X[N + 1] = end + + X[1:N + 1, 0] = xslice + X[1:N + 1, 1] = y1slice + X[N + 2:, 0] = xslice[::-1] + X[N + 2:, 1] = y2slice[::-1] + + polys.append(X) + polycol.extend(polys) + from matplotlib.collections import PolyCollection + plots.append(PolyCollection(polycol, **kwargs)) + ax.add_collection(plots[-1], autolim=True) + ax.autoscale_view() + return plots + +def gperrors(x, mu, lower, upper, edgecol=None, ax=None, fignum=None, **kwargs): + _, axes = ax_default(fignum, ax) + + mu = mu.flatten() + x = x.flatten() + lower = lower.flatten() + upper = upper.flatten() + + plots = [] + + if edgecol is None: + edgecol='#3300FF' + + if not 'alpha' in kwargs.keys(): + kwargs['alpha'] = 1. + + + if not 'lw' in kwargs.keys(): + kwargs['lw'] = 1. 
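# Two small standalone sketches for the helpers above (toy values, illustrative
# only). First, gpplot composes a GP band from a mean line, a filled credible
# region and thin edge lines; second, gradient_fill pairs percentile curves
# from the outside in, so the overlapping fills (alpha = 1/len(percentiles))
# stack into a density-like gradient. Note gradient_fill's mask handling calls
# reduce(), which on Python 3 needs `from functools import reduce`.
import numpy as np
from GPy.plotting.matplot_dep.base_plots import gpplot

x = np.linspace(0, 1, 50)
mu = np.sin(2 * np.pi * x)
sd = 0.1 + 0.05 * x
gpplot(x, mu, mu - 2 * sd, mu + 2 * sd)  # mean line, then fill between lower/upper

def pairwise(inlist):  # copied from gradient_fill above
    l = len(inlist)
    for i in range(int(np.ceil(l / 2.))):
        yield inlist[:][i], inlist[:][(l - 1) - i]

percentiles = [2.5, 16., 50., 84., 97.5]
print(list(pairwise(percentiles)))  # [(2.5, 97.5), (16.0, 84.0), (50.0, 50.0)]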
+ + + plots.append(axes.errorbar(x,mu,yerr=np.vstack([mu-lower,upper-mu]),color=edgecol,**kwargs)) + plots[-1][0].remove() + return plots + + +def removeRightTicks(ax=None): + ax = ax or plt.gca() + for i, line in enumerate(ax.get_yticklines()): + if i%2 == 1: # odd indices + line.set_visible(False) + +def removeUpperTicks(ax=None): + ax = ax or plt.gca() + for i, line in enumerate(ax.get_xticklines()): + if i%2 == 1: # odd indices + line.set_visible(False) + +def fewerXticks(ax=None,divideby=2): + ax = ax or plt.gca() + ax.set_xticks(ax.get_xticks()[::divideby]) + +def align_subplots(N,M,xlim=None, ylim=None): + """make all of the subplots have the same limits, turn off unnecessary ticks""" + #find sensible xlim,ylim + if xlim is None: + xlim = [np.inf,-np.inf] + for i in range(N*M): + plt.subplot(N,M,i+1) + xlim[0] = min(xlim[0],plt.xlim()[0]) + xlim[1] = max(xlim[1],plt.xlim()[1]) + if ylim is None: + ylim = [np.inf,-np.inf] + for i in range(N*M): + plt.subplot(N,M,i+1) + ylim[0] = min(ylim[0],plt.ylim()[0]) + ylim[1] = max(ylim[1],plt.ylim()[1]) + + for i in range(N*M): + plt.subplot(N,M,i+1) + plt.xlim(xlim) + plt.ylim(ylim) + if (i)%M: + plt.yticks([]) + else: + removeRightTicks() + if i<(M*(N-1)): + plt.xticks([]) + else: + removeUpperTicks() + +def align_subplot_array(axes,xlim=None, ylim=None): + """ + Make all of the axes in the array have the same limits, turn off unnecessary ticks + use plt.subplots() to get an array of axes + """ + #find sensible xlim,ylim + if xlim is None: + xlim = [np.inf,-np.inf] + for ax in axes.flatten(): + xlim[0] = min(xlim[0],ax.get_xlim()[0]) + xlim[1] = max(xlim[1],ax.get_xlim()[1]) + if ylim is None: + ylim = [np.inf,-np.inf] + for ax in axes.flatten(): + ylim[0] = min(ylim[0],ax.get_ylim()[0]) + ylim[1] = max(ylim[1],ax.get_ylim()[1]) + + N,M = axes.shape + for i,ax in enumerate(axes.flatten()): + ax.set_xlim(xlim) + ax.set_ylim(ylim) + if (i)%M: + ax.set_yticks([]) + else: + removeRightTicks(ax) + if i<(M*(N-1)): + ax.set_xticks([]) + else: + removeUpperTicks(ax) + +def x_frame1D(X,plot_limits=None,resolution=None): + """ + Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits + """ + assert X.shape[1] ==1, "x_frame1D is defined for one-dimensional inputs" + if plot_limits is None: + from ...core.parameterization.variational import VariationalPosterior + if isinstance(X, VariationalPosterior): + xmin,xmax = X.mean.min(0),X.mean.max(0) + else: + xmin,xmax = X.min(0),X.max(0) + xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin) + elif len(plot_limits)==2: + xmin, xmax = plot_limits + else: + raise ValueError("Bad limits for plotting") + + Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None] + return Xnew, xmin, xmax + +def x_frame2D(X,plot_limits=None,resolution=None): + """ + Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits + """ + assert X.shape[1] ==2, "x_frame2D is defined for two-dimensional inputs" + if plot_limits is None: + xmin,xmax = X.min(0),X.max(0) + xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin) + elif len(plot_limits)==2: + xmin, xmax = plot_limits + else: + raise ValueError("Bad limits for plotting") + + resolution = resolution or 50 + xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution] + Xnew = np.vstack((xx.flatten(),yy.flatten())).T + return Xnew, xx, yy, xmin, xmax diff --git a/GPy/plotting/matplot_dep/controllers/__init__.py
b/GPy/plotting/matplot_dep/controllers/__init__.py index 61cfb73b..a7e897e8 100644 --- a/GPy/plotting/matplot_dep/controllers/__init__.py +++ b/GPy/plotting/matplot_dep/controllers/__init__.py @@ -1 +1 @@ -from .imshow_controller import ImshowController, ImAnnotateController \ No newline at end of file +from .imshow_controller import ImshowController, ImAnnotateController diff --git a/GPy/plotting/matplot_dep/controllers/imshow_controller.py b/GPy/plotting/matplot_dep/controllers/imshow_controller.py index d67c9b4b..de64ed23 100644 --- a/GPy/plotting/matplot_dep/controllers/imshow_controller.py +++ b/GPy/plotting/matplot_dep/controllers/imshow_controller.py @@ -72,4 +72,4 @@ class ImAnnotateController(ImshowController): text.set_x(x+xoffset) text.set_y(y+yoffset) text.set_text("{}".format(X[1][j, i])) - return view \ No newline at end of file + return view diff --git a/GPy/plotting/matplot_dep/defaults.py b/GPy/plotting/matplot_dep/defaults.py index eab98298..69257c8c 100644 --- a/GPy/plotting/matplot_dep/defaults.py +++ b/GPy/plotting/matplot_dep/defaults.py @@ -72,4 +72,4 @@ latent = dict(aspect='auto', cmap='Greys', interpolation='bicubic') gradient = dict(aspect='auto', cmap='RdBu', interpolation='nearest', alpha=.7) magnification = dict(aspect='auto', cmap='Greys', interpolation='bicubic') latent_scatter = dict(s=40, linewidth=.2, edgecolor='k', alpha=.9) -annotation = dict(fontdict=dict(family='sans-serif', weight='light', fontsize=9), zorder=.3, alpha=.7) \ No newline at end of file +annotation = dict(fontdict=dict(family='sans-serif', weight='light', fontsize=9), zorder=.3, alpha=.7) diff --git a/GPy/plotting/matplot_dep/plot_definitions.py b/GPy/plotting/matplot_dep/plot_definitions.py index 9eb9efb0..52100ea3 100644 --- a/GPy/plotting/matplot_dep/plot_definitions.py +++ b/GPy/plotting/matplot_dep/plot_definitions.py @@ -42,10 +42,11 @@ class MatplotlibPlots(AbstractPlottingLibrary): super(MatplotlibPlots, self).__init__() self._defaults = defaults.__dict__ - def figure(self, rows=1, cols=1, **kwargs): - fig = plt.figure(**kwargs) + def figure(self, rows=1, cols=1, gridspec_kwargs={}, tight_layout=True, **kwargs): + fig = plt.figure(tight_layout=tight_layout, **kwargs) fig.rows = rows fig.cols = cols + fig.gridspec = plt.GridSpec(rows, cols, **gridspec_kwargs) return fig def new_canvas(self, figure=None, row=1, col=1, projection='2d', xlabel=None, ylabel=None, zlabel=None, title=None, xlim=None, ylim=None, zlim=None, **kwargs): @@ -56,7 +57,9 @@ class MatplotlibPlots(AbstractPlottingLibrary): if 'ax' in kwargs: ax = kwargs.pop('ax') else: - if 'num' in kwargs and 'figsize' in kwargs: + if figure is not None: + fig = figure + elif 'num' in kwargs and 'figsize' in kwargs: fig = self.figure(num=kwargs.pop('num'), figsize=kwargs.pop('figsize')) elif 'num' in kwargs: fig = self.figure(num=kwargs.pop('num')) @@ -66,7 +69,7 @@ class MatplotlibPlots(AbstractPlottingLibrary): fig = self.figure() #if hasattr(fig, 'rows') and hasattr(fig, 'cols'): - ax = fig.add_subplot(fig.rows, fig.cols, (col,row), projection=projection) + ax = fig.add_subplot(fig.gridspec[row-1, col-1], projection=projection) if xlim is not None: ax.set_xlim(xlim) if ylim is not None: ax.set_ylim(ylim) @@ -79,7 +82,7 @@ class MatplotlibPlots(AbstractPlottingLibrary): return ax, kwargs def add_to_canvas(self, ax, plots, legend=False, title=None, **kwargs): - ax.autoscale_view() + #ax.autoscale_view() fontdict=dict(family='sans-serif', weight='light', size=9) if legend is True: ax.legend(*ax.get_legend_handles_labels()) @@ 
-89,9 +92,7 @@ class MatplotlibPlots(AbstractPlottingLibrary): if title is not None: ax.figure.suptitle(title) return ax - def show_canvas(self, ax, tight_layout=False, **kwargs): - if tight_layout: - ax.figure.tight_layout() + def show_canvas(self, ax): ax.figure.canvas.draw() return ax.figure diff --git a/GPy/plotting/matplot_dep/ssgplvm.py b/GPy/plotting/matplot_dep/ssgplvm.py index b741bc5d..0ed8a043 100644 --- a/GPy/plotting/matplot_dep/ssgplvm.py +++ b/GPy/plotting/matplot_dep/ssgplvm.py @@ -13,16 +13,16 @@ class SSGPLVM_plot(object): self.model = model self.imgsize= imgsize assert model.Y.shape[1] == imgsize[0]*imgsize[1] - + def plot_inducing(self): fig1 = pylab.figure() mean = self.model.posterior.mean arr = mean.reshape(*(mean.shape[0],self.imgsize[1],self.imgsize[0])) plot_2D_images(fig1, arr) fig1.gca().set_title('The mean of inducing points') - + fig2 = pylab.figure() covar = self.model.posterior.covariance plot_2D_images(fig2, covar) fig2.gca().set_title('The variance of inducing points') - + diff --git a/GPy/plotting/matplot_dep/util.py b/GPy/plotting/matplot_dep/util.py index 2dd6af85..ca129bc9 100644 --- a/GPy/plotting/matplot_dep/util.py +++ b/GPy/plotting/matplot_dep/util.py @@ -116,4 +116,4 @@ def align_subplot_array(axes,xlim=None, ylim=None): if i<(M*(N-1)): ax.set_xticks([]) else: - removeUpperTicks(ax) \ No newline at end of file + removeUpperTicks(ax) diff --git a/GPy/plotting/plotly_dep/defaults.py b/GPy/plotting/plotly_dep/defaults.py index 24170b95..121e0b37 100644 --- a/GPy/plotting/plotly_dep/defaults.py +++ b/GPy/plotting/plotly_dep/defaults.py @@ -73,4 +73,4 @@ latent = dict(colorscale='Greys', reversescale=True, zsmooth='best') gradient = dict(colorscale='RdBu', opacity=.7) magnification = dict(colorscale='Greys', zsmooth='best', reversescale=True) latent_scatter = dict(marker_kwargs=dict(size='5', opacity=.7)) -# annotation = dict(fontdict=dict(family='sans-serif', weight='light', fontsize=9), zorder=.3, alpha=.7) \ No newline at end of file +# annotation = dict(fontdict=dict(family='sans-serif', weight='light', fontsize=9), zorder=.3, alpha=.7) diff --git a/GPy/plotting/plotly_dep/plot_definitions.py b/GPy/plotting/plotly_dep/plot_definitions.py index 54f04a75..eaa70f32 100644 --- a/GPy/plotting/plotly_dep/plot_definitions.py +++ b/GPy/plotting/plotly_dep/plot_definitions.py @@ -254,7 +254,7 @@ class PlotlyPlots(AbstractPlottingLibrary): font=dict(color='white' if np.abs(var) > 0.8 else 'black', size=10), opacity=.5, showarrow=False, - hoverinfo='x')) + )) return imshow, annotations def annotation_heatmap_interact(self, ax, plot_function, extent, label=None, resolution=15, imshow_kwargs=None, **annotation_kwargs): diff --git a/GPy/testing/__init__.py b/GPy/testing/__init__.py index 2e64d90e..abad1fa3 100644 --- a/GPy/testing/__init__.py +++ b/GPy/testing/__init__.py @@ -1,14 +1,9 @@ -# Copyright (c) 2014, Max Zwiessele +# Copyright (c) 2014, Max Zwiessele, GPy Authors # Licensed under the BSD 3-clause license (see LICENSE.txt) -""" - -MaxZ - -""" import unittest import sys def deepTest(reason): if reason: return lambda x:x - return unittest.skip("Not deep scanning, enable deepscan by adding 'deep' argument") + return unittest.skip("Not deep scanning, enable deepscan by adding 'deep' argument to unittest call") diff --git a/GPy/testing/baseline/bayesian_gplvm_gradient.png b/GPy/testing/baseline/bayesian_gplvm_gradient.png index 9ceec5df..5bf1286d 100644 Binary files a/GPy/testing/baseline/bayesian_gplvm_gradient.png and 
b/GPy/testing/baseline/bayesian_gplvm_gradient.png differ diff --git a/GPy/testing/baseline/bayesian_gplvm_inducing.png b/GPy/testing/baseline/bayesian_gplvm_inducing.png index cbf7c344..b3b154a8 100644 Binary files a/GPy/testing/baseline/bayesian_gplvm_inducing.png and b/GPy/testing/baseline/bayesian_gplvm_inducing.png differ diff --git a/GPy/testing/baseline/bayesian_gplvm_inducing_3d.png b/GPy/testing/baseline/bayesian_gplvm_inducing_3d.png index edff93ef..e8c73aca 100644 Binary files a/GPy/testing/baseline/bayesian_gplvm_inducing_3d.png and b/GPy/testing/baseline/bayesian_gplvm_inducing_3d.png differ diff --git a/GPy/testing/baseline/bayesian_gplvm_latent.png b/GPy/testing/baseline/bayesian_gplvm_latent.png index 626bcb8b..9349f6db 100644 Binary files a/GPy/testing/baseline/bayesian_gplvm_latent.png and b/GPy/testing/baseline/bayesian_gplvm_latent.png differ diff --git a/GPy/testing/baseline/bayesian_gplvm_latent_3d.png b/GPy/testing/baseline/bayesian_gplvm_latent_3d.png index ea0009f2..f2ac36de 100644 Binary files a/GPy/testing/baseline/bayesian_gplvm_latent_3d.png and b/GPy/testing/baseline/bayesian_gplvm_latent_3d.png differ diff --git a/GPy/testing/baseline/bayesian_gplvm_magnification.png b/GPy/testing/baseline/bayesian_gplvm_magnification.png index 85c3eb7f..0ffaa918 100644 Binary files a/GPy/testing/baseline/bayesian_gplvm_magnification.png and b/GPy/testing/baseline/bayesian_gplvm_magnification.png differ diff --git a/GPy/testing/baseline/coverage_3d_plot.png b/GPy/testing/baseline/coverage_3d_plot.png index c5469dd0..9b0d9d8f 100644 Binary files a/GPy/testing/baseline/coverage_3d_plot.png and b/GPy/testing/baseline/coverage_3d_plot.png differ diff --git a/GPy/testing/baseline/coverage_annotation_interact.png b/GPy/testing/baseline/coverage_annotation_interact.png index 7555a872..c5367f34 100644 Binary files a/GPy/testing/baseline/coverage_annotation_interact.png and b/GPy/testing/baseline/coverage_annotation_interact.png differ diff --git a/GPy/testing/baseline/coverage_gradient.png b/GPy/testing/baseline/coverage_gradient.png index 60bd7fb9..13ee8bf4 100644 Binary files a/GPy/testing/baseline/coverage_gradient.png and b/GPy/testing/baseline/coverage_gradient.png differ diff --git a/GPy/testing/baseline/coverage_imshow_interact.png b/GPy/testing/baseline/coverage_imshow_interact.png index 70c59276..b8f52c2f 100644 Binary files a/GPy/testing/baseline/coverage_imshow_interact.png and b/GPy/testing/baseline/coverage_imshow_interact.png differ diff --git a/GPy/testing/baseline/gp_2d_data.png b/GPy/testing/baseline/gp_2d_data.png index e16283d4..b4c04b6c 100644 Binary files a/GPy/testing/baseline/gp_2d_data.png and b/GPy/testing/baseline/gp_2d_data.png differ diff --git a/GPy/testing/baseline/gp_2d_in_error.png b/GPy/testing/baseline/gp_2d_in_error.png index 9f0652c2..7fed0154 100644 Binary files a/GPy/testing/baseline/gp_2d_in_error.png and b/GPy/testing/baseline/gp_2d_in_error.png differ diff --git a/GPy/testing/baseline/gp_2d_inducing.png b/GPy/testing/baseline/gp_2d_inducing.png index 3f3c153d..970c9132 100644 Binary files a/GPy/testing/baseline/gp_2d_inducing.png and b/GPy/testing/baseline/gp_2d_inducing.png differ diff --git a/GPy/testing/baseline/gp_2d_mean.png b/GPy/testing/baseline/gp_2d_mean.png index 9f376fe6..3a66f4d7 100644 Binary files a/GPy/testing/baseline/gp_2d_mean.png and b/GPy/testing/baseline/gp_2d_mean.png differ diff --git a/GPy/testing/baseline/gp_3d_data.png b/GPy/testing/baseline/gp_3d_data.png index 1fa42efb..e3030a88 100644 Binary files 
a/GPy/testing/baseline/gp_3d_data.png and b/GPy/testing/baseline/gp_3d_data.png differ diff --git a/GPy/testing/baseline/gp_3d_inducing.png b/GPy/testing/baseline/gp_3d_inducing.png index 00feec6e..42a87969 100644 Binary files a/GPy/testing/baseline/gp_3d_inducing.png and b/GPy/testing/baseline/gp_3d_inducing.png differ diff --git a/GPy/testing/baseline/gp_3d_mean.png b/GPy/testing/baseline/gp_3d_mean.png index 87a2c282..fd91fd1d 100644 Binary files a/GPy/testing/baseline/gp_3d_mean.png and b/GPy/testing/baseline/gp_3d_mean.png differ diff --git a/GPy/testing/baseline/gp_class_likelihood.png b/GPy/testing/baseline/gp_class_likelihood.png index 9faaeee7..e925b78c 100644 Binary files a/GPy/testing/baseline/gp_class_likelihood.png and b/GPy/testing/baseline/gp_class_likelihood.png differ diff --git a/GPy/testing/baseline/gp_class_raw.png b/GPy/testing/baseline/gp_class_raw.png index a0e04b66..ebe62e14 100644 Binary files a/GPy/testing/baseline/gp_class_raw.png and b/GPy/testing/baseline/gp_class_raw.png differ diff --git a/GPy/testing/baseline/gp_class_raw_link.png b/GPy/testing/baseline/gp_class_raw_link.png index 41d23556..307aee26 100644 Binary files a/GPy/testing/baseline/gp_class_raw_link.png and b/GPy/testing/baseline/gp_class_raw_link.png differ diff --git a/GPy/testing/baseline/gp_conf.png b/GPy/testing/baseline/gp_conf.png index 4dcd5919..47491ec4 100644 Binary files a/GPy/testing/baseline/gp_conf.png and b/GPy/testing/baseline/gp_conf.png differ diff --git a/GPy/testing/baseline/gp_data.png b/GPy/testing/baseline/gp_data.png index 08c3845c..8bede27f 100644 Binary files a/GPy/testing/baseline/gp_data.png and b/GPy/testing/baseline/gp_data.png differ diff --git a/GPy/testing/baseline/gp_density.png b/GPy/testing/baseline/gp_density.png index 953d571b..cf5a28f5 100644 Binary files a/GPy/testing/baseline/gp_density.png and b/GPy/testing/baseline/gp_density.png differ diff --git a/GPy/testing/baseline/gp_in_error.png b/GPy/testing/baseline/gp_in_error.png index 54d45479..e44fa087 100644 Binary files a/GPy/testing/baseline/gp_in_error.png and b/GPy/testing/baseline/gp_in_error.png differ diff --git a/GPy/testing/baseline/gp_mean.png b/GPy/testing/baseline/gp_mean.png index 735e3cc6..e955872e 100644 Binary files a/GPy/testing/baseline/gp_mean.png and b/GPy/testing/baseline/gp_mean.png differ diff --git a/GPy/testing/baseline/gp_out_error.png b/GPy/testing/baseline/gp_out_error.png index 2d7c1968..4ec4ef83 100644 Binary files a/GPy/testing/baseline/gp_out_error.png and b/GPy/testing/baseline/gp_out_error.png differ diff --git a/GPy/testing/baseline/gp_samples.png b/GPy/testing/baseline/gp_samples.png index e9641a23..b5a0b90e 100644 Binary files a/GPy/testing/baseline/gp_samples.png and b/GPy/testing/baseline/gp_samples.png differ diff --git a/GPy/testing/baseline/gplvm_gradient.png b/GPy/testing/baseline/gplvm_gradient.png index 338326f6..d21b2777 100644 Binary files a/GPy/testing/baseline/gplvm_gradient.png and b/GPy/testing/baseline/gplvm_gradient.png differ diff --git a/GPy/testing/baseline/gplvm_latent.png b/GPy/testing/baseline/gplvm_latent.png index 305ec046..bdb0c27d 100644 Binary files a/GPy/testing/baseline/gplvm_latent.png and b/GPy/testing/baseline/gplvm_latent.png differ diff --git a/GPy/testing/baseline/gplvm_latent_3d.png b/GPy/testing/baseline/gplvm_latent_3d.png index ea0009f2..f2ac36de 100644 Binary files a/GPy/testing/baseline/gplvm_latent_3d.png and b/GPy/testing/baseline/gplvm_latent_3d.png differ diff --git a/GPy/testing/baseline/gplvm_magnification.png 
b/GPy/testing/baseline/gplvm_magnification.png index dc7d7101..2c430a37 100644 Binary files a/GPy/testing/baseline/gplvm_magnification.png and b/GPy/testing/baseline/gplvm_magnification.png differ diff --git a/GPy/testing/baseline/kern_ARD.png b/GPy/testing/baseline/kern_ARD.png index 7b917abd..26090ed9 100644 Binary files a/GPy/testing/baseline/kern_ARD.png and b/GPy/testing/baseline/kern_ARD.png differ diff --git a/GPy/testing/baseline/kern_cov_1d.png b/GPy/testing/baseline/kern_cov_1d.png index 449a686d..6ab80ec6 100644 Binary files a/GPy/testing/baseline/kern_cov_1d.png and b/GPy/testing/baseline/kern_cov_1d.png differ diff --git a/GPy/testing/baseline/kern_cov_2d.png b/GPy/testing/baseline/kern_cov_2d.png index db76f5b6..89eb6ea7 100644 Binary files a/GPy/testing/baseline/kern_cov_2d.png and b/GPy/testing/baseline/kern_cov_2d.png differ diff --git a/GPy/testing/baseline/kern_cov_3d.png b/GPy/testing/baseline/kern_cov_3d.png index 31b32b5e..e536dd0c 100644 Binary files a/GPy/testing/baseline/kern_cov_3d.png and b/GPy/testing/baseline/kern_cov_3d.png differ diff --git a/GPy/testing/baseline/kern_cov_no_lim.png b/GPy/testing/baseline/kern_cov_no_lim.png index ed9960f9..686c8724 100644 Binary files a/GPy/testing/baseline/kern_cov_no_lim.png and b/GPy/testing/baseline/kern_cov_no_lim.png differ diff --git a/GPy/testing/baseline/sparse_gp_class_likelihood.png b/GPy/testing/baseline/sparse_gp_class_likelihood.png index a29e8eba..fb212db4 100644 Binary files a/GPy/testing/baseline/sparse_gp_class_likelihood.png and b/GPy/testing/baseline/sparse_gp_class_likelihood.png differ diff --git a/GPy/testing/baseline/sparse_gp_class_raw.png b/GPy/testing/baseline/sparse_gp_class_raw.png index 9fc027f0..6e9cda9c 100644 Binary files a/GPy/testing/baseline/sparse_gp_class_raw.png and b/GPy/testing/baseline/sparse_gp_class_raw.png differ diff --git a/GPy/testing/baseline/sparse_gp_class_raw_link.png b/GPy/testing/baseline/sparse_gp_class_raw_link.png index c24d2d66..3db9362f 100644 Binary files a/GPy/testing/baseline/sparse_gp_class_raw_link.png and b/GPy/testing/baseline/sparse_gp_class_raw_link.png differ diff --git a/GPy/testing/baseline/sparse_gp_data_error.png b/GPy/testing/baseline/sparse_gp_data_error.png index c78a8df1..2d253bb8 100644 Binary files a/GPy/testing/baseline/sparse_gp_data_error.png and b/GPy/testing/baseline/sparse_gp_data_error.png differ diff --git a/GPy/testing/bgplvm_minibatch_tests.py b/GPy/testing/bgplvm_minibatch_tests.py deleted file mode 100644 index 4a824368..00000000 --- a/GPy/testing/bgplvm_minibatch_tests.py +++ /dev/null @@ -1,109 +0,0 @@ -''' -Created on 4 Sep 2015 - -@author: maxz -''' -import unittest -import numpy as np -import GPy - -class BGPLVMTest(unittest.TestCase): - - - def setUp(self): - np.random.seed(12345) - X, W = np.random.normal(0,1,(100,6)), np.random.normal(0,1,(6,13)) - Y = X.dot(W) + np.random.normal(0, .1, (X.shape[0], W.shape[1])) - self.inan = np.random.binomial(1, .1, Y.shape).astype(bool) - self.X, self.W, self.Y = X,W,Y - self.Q = 3 - self.m_full = GPy.models.BayesianGPLVM(Y, self.Q) - - def test_lik_comparisons_m1_s0(self): - # Test if the different implementations give the exact same likelihood as the full model. 
- # All of the following settings should give the same likelihood and gradients as the full model: - m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=False) - m[:] = self.m_full[:] - np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7) - np.testing.assert_allclose(m.gradient, self.m_full.gradient) - assert(m.checkgrad()) - - def test_predict_missing_data(self): - m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1]) - m[:] = self.m_full[:] - np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7) - np.testing.assert_allclose(m.gradient, self.m_full.gradient) - - self.assertRaises(NotImplementedError, m.predict, m.X, full_cov=True) - - mu1, var1 = m.predict(m.X, full_cov=False) - mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=False) - np.testing.assert_allclose(mu1, mu2) - np.testing.assert_allclose(var1, var2) - - mu1, var1 = m.predict(m.X.mean, full_cov=True) - mu2, var2 = self.m_full.predict(self.m_full.X.mean, full_cov=True) - np.testing.assert_allclose(mu1, mu2) - np.testing.assert_allclose(var1[:,:,0], var2) - - mu1, var1 = m.predict(m.X.mean, full_cov=False) - mu2, var2 = self.m_full.predict(self.m_full.X.mean, full_cov=False) - np.testing.assert_allclose(mu1, mu2) - np.testing.assert_allclose(var1[:,[0]], var2) - - def test_lik_comparisons_m0_s0(self): - # Test if the different implementations give the exact same likelihood as the full model. - # All of the following settings should give the same likelihood and gradients as the full model: - m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=False) - m[:] = self.m_full[:] - np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7) - np.testing.assert_allclose(m.gradient, self.m_full.gradient) - assert(m.checkgrad()) - - def test_lik_comparisons_m1_s1(self): - # Test if the different implementations give the exact same likelihood as the full model. - # All of the following settings should give the same likelihood and gradients as the full model: - m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1]) - m[:] = self.m_full[:] - np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7) - np.testing.assert_allclose(m.gradient, self.m_full.gradient) - assert(m.checkgrad()) - - def test_lik_comparisons_m0_s1(self): - # Test if the different implementations give the exact same likelihood as the full model. 
- # All of the following settings should give the same likelihood and gradients as the full model: - m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=self.Y.shape[1]) - m[:] = self.m_full[:] - np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7) - np.testing.assert_allclose(m.gradient, self.m_full.gradient) - assert(m.checkgrad()) - - def test_gradients_missingdata(self): - m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=False, batchsize=self.Y.shape[1]) - assert(m.checkgrad()) - - def test_gradients_missingdata_stochastics(self): - m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=1) - assert(m.checkgrad()) - m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=4) - assert(m.checkgrad()) - - def test_gradients_stochastics(self): - m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=1) - assert(m.checkgrad()) - m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=4) - assert(m.checkgrad()) - - def test_predict(self): - # Test if the different implementations give the exact same likelihood as the full model. - # All of the following settings should give the same likelihood and gradients as the full model: - m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1]) - m[:] = self.m_full[:] - np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7) - np.testing.assert_allclose(m.gradient, self.m_full.gradient) - assert(m.checkgrad()) - - -if __name__ == "__main__": - #import sys;sys.argv = ['', 'Test.testName'] - unittest.main() \ No newline at end of file diff --git a/GPy/testing/gp_tests.py b/GPy/testing/gp_tests.py index b8cd89e2..3ce3ffc4 100644 --- a/GPy/testing/gp_tests.py +++ b/GPy/testing/gp_tests.py @@ -97,4 +97,4 @@ class Test(unittest.TestCase): if __name__ == "__main__": #import sys;sys.argv = ['', 'Test.testName'] - unittest.main() \ No newline at end of file + unittest.main() diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py index bae7b2e4..57805eef 100644 --- a/GPy/testing/kernel_tests.py +++ b/GPy/testing/kernel_tests.py @@ -324,8 +324,18 @@ class KernelGradientTestsContinuous(unittest.TestCase): k.randomize() self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) + def test_Poly(self): + k = GPy.kern.Poly(self.D, order=5) + k.randomize() + self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) + + def test_WhiteHeteroscedastic(self): + k = GPy.kern.WhiteHeteroscedastic(self.D, self.X.shape[0]) + k.randomize() + self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) + def test_standard_periodic(self): - k = GPy.kern.StdPeriodic(self.D, self.D-1) + k = GPy.kern.StdPeriodic(self.D) k.randomize() self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) @@ -334,11 +344,14 @@ class KernelTestsMiscellaneous(unittest.TestCase): N, D = 100, 10 self.X = np.linspace(-np.pi, +np.pi, N)[:,None] * np.random.uniform(-10,10,D) self.rbf = GPy.kern.RBF(2, 
active_dims=np.arange(0,4,2)) + self.rbf.randomize() self.linear = GPy.kern.Linear(2, active_dims=(3,9)) + self.linear.randomize() self.matern = GPy.kern.Matern32(3, active_dims=np.array([1,7,9])) + self.matern.randomize() self.sumkern = self.rbf + self.linear self.sumkern += self.matern - self.sumkern.randomize() + #self.sumkern.randomize() def test_which_parts(self): self.assertTrue(np.allclose(self.sumkern.K(self.X, which_parts=[self.linear, self.matern]), self.linear.K(self.X)+self.matern.K(self.X))) @@ -348,6 +361,21 @@ class KernelTestsMiscellaneous(unittest.TestCase): def test_active_dims(self): np.testing.assert_array_equal(self.sumkern.active_dims, [0,1,2,3,7,9]) np.testing.assert_array_equal(self.sumkern._all_dims_active, range(10)) + tmp = self.linear+self.rbf + np.testing.assert_array_equal(tmp.active_dims, [0,2,3,9]) + np.testing.assert_array_equal(tmp._all_dims_active, range(10)) + tmp = self.matern+self.rbf + np.testing.assert_array_equal(tmp.active_dims, [0,1,2,7,9]) + np.testing.assert_array_equal(tmp._all_dims_active, range(10)) + tmp = self.matern+self.rbf*self.linear + np.testing.assert_array_equal(tmp.active_dims, [0,1,2,3,7,9]) + np.testing.assert_array_equal(tmp._all_dims_active, range(10)) + tmp = self.matern+self.rbf+self.linear + np.testing.assert_array_equal(tmp.active_dims, [0,1,2,3,7,9]) + np.testing.assert_array_equal(tmp._all_dims_active, range(10)) + tmp = self.matern*self.rbf*self.linear + np.testing.assert_array_equal(tmp.active_dims, [0,1,2,3,7,9]) + np.testing.assert_array_equal(tmp._all_dims_active, range(10)) class KernelTestsNonContinuous(unittest.TestCase): def setUp(self): diff --git a/GPy/testing/minibatch_tests.py b/GPy/testing/minibatch_tests.py new file mode 100644 index 00000000..fbf12939 --- /dev/null +++ b/GPy/testing/minibatch_tests.py @@ -0,0 +1,226 @@ +''' +Created on 4 Sep 2015 + +@author: maxz +''' +import unittest +import numpy as np +import GPy + +class BGPLVMTest(unittest.TestCase): + + + def setUp(self): + np.random.seed(12345) + X, W = np.random.normal(0,1,(100,6)), np.random.normal(0,1,(6,13)) + Y = X.dot(W) + np.random.normal(0, .1, (X.shape[0], W.shape[1])) + self.inan = np.random.binomial(1, .1, Y.shape).astype(bool) + self.X, self.W, self.Y = X,W,Y + self.Q = 3 + self.m_full = GPy.models.BayesianGPLVM(Y, self.Q) + + def test_lik_comparisons_m1_s0(self): + # Test if the different implementations give the exact same likelihood as the full model. 
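# The new assertions above pin down that combination kernels expose the sorted
# union of their parts' active dimensions. A minimal standalone sketch of the
# same behaviour:
import numpy as np
import GPy
rbf = GPy.kern.RBF(2, active_dims=[0, 2])
lin = GPy.kern.Linear(2, active_dims=[3, 9])
np.testing.assert_array_equal((rbf + lin).active_dims, [0, 2, 3, 9])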
+ # All of the following settings should give the same likelihood and gradients as the full model: + m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=False) + m[:] = self.m_full[:] + np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7) + np.testing.assert_allclose(m.gradient, self.m_full.gradient) + assert(m.checkgrad()) + + def test_predict_missing_data(self): + m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1]) + m[:] = self.m_full[:] + np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7) + np.testing.assert_allclose(m.gradient, self.m_full.gradient) + + self.assertRaises(NotImplementedError, m.predict, m.X, full_cov=True) + + mu1, var1 = m.predict(m.X, full_cov=False) + mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=False) + np.testing.assert_allclose(mu1, mu2) + np.testing.assert_allclose(var1, var2) + + mu1, var1 = m.predict(m.X.mean, full_cov=True) + mu2, var2 = self.m_full.predict(self.m_full.X.mean, full_cov=True) + np.testing.assert_allclose(mu1, mu2) + np.testing.assert_allclose(var1[:,:,0], var2) + + mu1, var1 = m.predict(m.X.mean, full_cov=False) + mu2, var2 = self.m_full.predict(self.m_full.X.mean, full_cov=False) + np.testing.assert_allclose(mu1, mu2) + np.testing.assert_allclose(var1[:,[0]], var2) + + def test_lik_comparisons_m0_s0(self): + # Test if the different implementations give the exact same likelihood as the full model. + # All of the following settings should give the same likelihood and gradients as the full model: + m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=self.m_full.X.variance.values, missing_data=False, stochastic=False) + m[:] = self.m_full[:] + np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7) + np.testing.assert_allclose(m.gradient, self.m_full.gradient) + assert(m.checkgrad()) + + def test_lik_comparisons_m1_s1(self): + # Test if the different implementations give the exact same likelihood as the full model. + # All of the following settings should give the same likelihood and gradients as the full model: + m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1]) + m[:] = self.m_full[:] + np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7) + np.testing.assert_allclose(m.gradient, self.m_full.gradient) + assert(m.checkgrad()) + + def test_lik_comparisons_m0_s1(self): + # Test if the different implementations give the exact same likelihood as the full model. 
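# The comparison pattern used throughout this new test file: construct the
# minibatch variant, copy the reference model's full parameter vector across
# via paramz slicing, then require identical likelihood and gradients. A
# condensed sketch with toy sizes (illustrative only):
import numpy as np
import GPy
Y = np.random.randn(30, 5)
m_full = GPy.models.BayesianGPLVM(Y, 2)
m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(Y, 2, missing_data=False, stochastic=False)
m[:] = m_full[:]  # copy all parameters across
np.testing.assert_almost_equal(m.log_likelihood(), m_full.log_likelihood(), 7)
np.testing.assert_allclose(m.gradient, m_full.gradient)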
+ # All of the following settings should give the same likelihood and gradients as the full model: + m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=self.Y.shape[1]) + m[:] = self.m_full[:] + np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7) + np.testing.assert_allclose(m.gradient, self.m_full.gradient) + assert(m.checkgrad()) + + def test_gradients_missingdata(self): + m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=False, batchsize=self.Y.shape[1]) + assert(m.checkgrad()) + + def test_gradients_missingdata_stochastics(self): + m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=1) + assert(m.checkgrad()) + m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=4) + assert(m.checkgrad()) + + def test_gradients_stochastics(self): + m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=1) + assert(m.checkgrad()) + m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=False, stochastic=True, batchsize=4) + assert(m.checkgrad()) + + def test_predict(self): + # Test if the different implementations give the exact same likelihood as the full model. + # All of the following settings should give the same likelihood and gradients as the full model: + m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, missing_data=True, stochastic=True, batchsize=self.Y.shape[1]) + m[:] = self.m_full[:] + np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7) + np.testing.assert_allclose(m.gradient, self.m_full.gradient) + assert(m.checkgrad()) + +class SparseGPMinibatchTest(unittest.TestCase): + + + def setUp(self): + np.random.seed(12345) + X, W = np.random.normal(0,1,(100,6)), np.random.normal(0,1,(6,13)) + Y = X.dot(W) + np.random.normal(0, .1, (X.shape[0], W.shape[1])) + self.inan = np.random.binomial(1, .1, Y.shape).astype(bool) + self.X, self.W, self.Y = X,W,Y + self.Q = 3 + self.m_full = GPy.models.SparseGPLVM(Y, self.Q, kernel=GPy.kern.RBF(self.Q, ARD=True)) + + def test_lik_comparisons_m1_s0(self): + # Test if the different implementations give the exact same likelihood as the full model. + # All of the following settings should give the same likelihood and gradients as the full model: + m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=False) + m[:] = self.m_full[:] + np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7) + np.testing.assert_allclose(m.gradient, self.m_full.gradient) + assert(m.checkgrad()) + + def test_sparsegp_init(self): + # Test if the different implementations give the exact same likelihood as the full model. 
+ # All of the following settings should give the same likelihood and gradients as the full model: + np.random.seed(1234) + Z = self.X[np.random.choice(self.X.shape[0], replace=False, size=10)].copy() + Q = Z.shape[1] + m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(self.X, self.Y, Z, GPy.kern.RBF(Q)+GPy.kern.Matern32(Q)+GPy.kern.Bias(Q), GPy.likelihoods.Gaussian(), missing_data=True, stochastic=False) + assert(m.checkgrad()) + m.optimize('adadelta', max_iters=10) + assert(m.checkgrad()) + + m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(self.X, self.Y, Z, GPy.kern.RBF(Q)+GPy.kern.Matern32(Q)+GPy.kern.Bias(Q), GPy.likelihoods.Gaussian(), missing_data=True, stochastic=True) + assert(m.checkgrad()) + m.optimize('rprop', max_iters=10) + assert(m.checkgrad()) + + m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(self.X, self.Y, Z, GPy.kern.RBF(Q)+GPy.kern.Matern32(Q)+GPy.kern.Bias(Q), GPy.likelihoods.Gaussian(), missing_data=False, stochastic=False) + assert(m.checkgrad()) + m.optimize('rprop', max_iters=10) + assert(m.checkgrad()) + + m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(self.X, self.Y, Z, GPy.kern.RBF(Q)+GPy.kern.Matern32(Q)+GPy.kern.Bias(Q), GPy.likelihoods.Gaussian(), missing_data=False, stochastic=True) + assert(m.checkgrad()) + m.optimize('adadelta', max_iters=10) + assert(m.checkgrad()) + + def test_predict_missing_data(self): + m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=self.Y.shape[1]) + m[:] = self.m_full[:] + np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7) + np.testing.assert_allclose(m.gradient, self.m_full.gradient) + + mu1, var1 = m.predict(m.X, full_cov=False) + mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=False) + np.testing.assert_allclose(mu1, mu2) + for i in range(var1.shape[1]): + np.testing.assert_allclose(var1[:,[i]], var2) + + mu1, var1 = m.predict(m.X, full_cov=True) + mu2, var2 = self.m_full.predict(self.m_full.X, full_cov=True) + np.testing.assert_allclose(mu1, mu2) + for i in range(var1.shape[2]): + np.testing.assert_allclose(var1[:,:,i], var2) + + def test_lik_comparisons_m0_s0(self): + # Test if the different implementations give the exact same likelihood as the full model. + # All of the following settings should give the same likelihood and gradients as the full model: + m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=False, stochastic=False) + m[:] = self.m_full[:] + np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7) + np.testing.assert_allclose(m.gradient, self.m_full.gradient) + assert(m.checkgrad()) + + def test_lik_comparisons_m1_s1(self): + # Test if the different implementations give the exact same likelihood as the full model. + # All of the following settings should give the same likelihood and gradients as the full model: + m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=self.Y.shape[1]) + m[:] = self.m_full[:] + np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7) + np.testing.assert_allclose(m.gradient, self.m_full.gradient) + assert(m.checkgrad()) + + def test_lik_comparisons_m0_s1(self): + # Test if the different implementations give the exact same likelihood as the full model. 
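# test_sparsegp_init above optimizes with 'adadelta' and 'rprop' by name;
# these are assumed here to be resolved through the optional climin backend,
# so this sketch only runs where that dependency is installed (toy data,
# illustrative only):
import numpy as np
import GPy
X = np.random.randn(50, 2)
Y = np.sin(X[:, :1]) + 0.05 * np.random.randn(50, 1)
Z = X[np.random.choice(50, 10, replace=False)].copy()
m = GPy.models.sparse_gp_minibatch.SparseGPMiniBatch(X, Y, Z, GPy.kern.RBF(2), GPy.likelihoods.Gaussian(), stochastic=True)
m.optimize('adadelta', max_iters=10)
assert m.checkgrad()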
+
+    def test_lik_comparisons_m0_s0(self):
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=False, stochastic=False)
+        m[:] = self.m_full[:]
+        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
+        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
+        assert(m.checkgrad())
+
+    def test_lik_comparisons_m1_s1(self):
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
+        m[:] = self.m_full[:]
+        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
+        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
+        assert(m.checkgrad())
+
+    def test_lik_comparisons_m0_s1(self):
+        # Test if the different implementations give the exact same likelihood as the full model.
+        # All of the following settings should give the same likelihood and gradients as the full model:
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=False, stochastic=True, batchsize=self.Y.shape[1])
+        m[:] = self.m_full[:]
+        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
+        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
+        assert(m.checkgrad())
+
+    def test_gradients_missingdata(self):
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=False, batchsize=self.Y.shape[1])
+        assert(m.checkgrad())
+
+    def test_gradients_missingdata_stochastics(self):
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=1)
+        assert(m.checkgrad())
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=4)
+        assert(m.checkgrad())
+
+    def test_gradients_stochastics(self):
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=False, stochastic=True, batchsize=1)
+        assert(m.checkgrad())
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=False, stochastic=True, batchsize=4)
+        assert(m.checkgrad())
+
+    def test_predict(self):
+        # Despite its name, this checks that the missing-data stochastic setting
+        # reproduces the full model's likelihood and gradients exactly:
+        m = GPy.models.bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch(self.Y, self.Q, X_variance=False, missing_data=True, stochastic=True, batchsize=self.Y.shape[1])
+        m[:] = self.m_full[:]
+        np.testing.assert_almost_equal(m.log_likelihood(), self.m_full.log_likelihood(), 7)
+        np.testing.assert_allclose(m.gradient, self.m_full.gradient)
+        assert(m.checkgrad())
+
+
+if __name__ == "__main__":
+    #import sys;sys.argv = ['', 'Test.testName']
+    unittest.main()
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 8ff06e65..55e5309c 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -553,16 +553,27 @@ class GradientTests(np.testing.TestCase):
         rbflin = GPy.kern.RBF(1) + GPy.kern.White(1)
         self.check_model(rbflin, model_type='SparseGPRegression', dimension=1, uncertain_inputs=1)
 
+
     def test_GPLVM_rbf_bias_white_kern_2D(self):
-        """ Testing GPLVM with rbf + bias kernel """
+        """ Testing GPLVM with an rbf + bias + white + Matern32 + Matern52 kernel """
         N, input_dim, D = 50, 1, 2
         X = np.random.rand(N, input_dim)
-        k = GPy.kern.RBF(input_dim, 0.5, 0.9 * np.ones((1,))) + GPy.kern.Bias(input_dim, 0.1) + GPy.kern.White(input_dim, 0.05)
+        k = GPy.kern.RBF(input_dim, 0.5, 0.9 * np.ones((1,))) + GPy.kern.Bias(input_dim, 0.1) + GPy.kern.White(input_dim, 0.05) + GPy.kern.Matern32(input_dim) + GPy.kern.Matern52(input_dim)
         K = k.K(X)
         Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
         m = GPy.models.GPLVM(Y, input_dim, kernel=k)
         self.assertTrue(m.checkgrad())
 
+    def test_SparseGPLVM_rbf_bias_white_kern_2D(self):
+        """ Testing SparseGPLVM with an rbf + bias + white + Matern32 + Matern52 kernel """
+        N, input_dim, D = 50, 1, 2
+        X = np.random.rand(N, input_dim)
+        k = GPy.kern.RBF(input_dim, 0.5, 0.9 * np.ones((1,))) + GPy.kern.Bias(input_dim, 0.1) + GPy.kern.White(input_dim, 0.05) + GPy.kern.Matern32(input_dim) + GPy.kern.Matern52(input_dim)
+        K = k.K(X)
+        Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
+        m = GPy.models.SparseGPLVM(Y, input_dim, kernel=k)
+        self.assertTrue(m.checkgrad())
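+
+    # Compound kernels compose additively: the covariance of a '+' sum of
+    # kernels is the elementwise sum of the members' covariances, which is
+    # what makes the compound checkgrad above meaningful. A quick illustrative
+    # check (sketch, not part of the test suite):
+    #
+    #     k1, k2 = GPy.kern.RBF(1), GPy.kern.Bias(1)
+    #     Xs = np.random.rand(10, 1)
+    #     np.testing.assert_allclose((k1 + k2).K(Xs), k1.K(Xs) + k2.K(Xs))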
+
     def test_BCGPLVM_rbf_bias_white_kern_2D(self):
         """ Testing GPLVM with rbf + bias kernel """
         N, input_dim, D = 50, 1, 2
diff --git a/GPy/testing/plotting_tests.py b/GPy/testing/plotting_tests.py
index 441854d4..bcf7b6b5 100644
--- a/GPy/testing/plotting_tests.py
+++ b/GPy/testing/plotting_tests.py
@@ -100,7 +100,7 @@ def _image_comparison(baseline_images, extensions=['pdf','svg','png'], tol=11):
         fig.axes[0].set_axis_off()
         fig.set_frameon(False)
         fig.canvas.draw()
-        fig.savefig(os.path.join(result_dir, "{}.{}".format(base, ext)), transparent=True, edgecolor='none', facecolor='none')
+        fig.savefig(os.path.join(result_dir, "{}.{}".format(base, ext)), transparent=True, edgecolor='none', facecolor='none', bbox_inches='tight')
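+        # bbox_inches='tight' crops the saved image to the drawn artists, so the
+        # comparison does not depend on surrounding whitespace; pad_inches could
+        # additionally control the remaining margin (matplotlib's default
+        # padding is assumed here).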
     for num, base in zip(plt.get_fignums(), baseline_images):
         for ext in extensions:
             #plt.close(num)
@@ -116,7 +116,7 @@ def test_figure():
     np.random.seed(1239847)
     from GPy.plotting import plotting_library as pl
-    import matplotlib
+    #import matplotlib
     matplotlib.rcParams.update(matplotlib.rcParamsDefault)
     matplotlib.rcParams[u'figure.figsize'] = (4,3)
     matplotlib.rcParams[u'text.usetex'] = False
@@ -160,7 +160,7 @@ def test_kernel():
     np.random.seed(1239847)
-    import matplotlib
+    #import matplotlib
     matplotlib.rcParams.update(matplotlib.rcParamsDefault)
     matplotlib.rcParams[u'figure.figsize'] = (4,3)
     matplotlib.rcParams[u'text.usetex'] = False
diff --git a/GPy/testing/util_tests.py b/GPy/testing/util_tests.py
new file mode 100644
index 00000000..6fcf838e
--- /dev/null
+++ b/GPy/testing/util_tests.py
@@ -0,0 +1,49 @@
+#===============================================================================
+# Copyright (c) 2016, Max Zwiessele
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of GPy.testing.util_tests nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#===============================================================================
+
+import unittest, numpy as np
+
+class TestDebug(unittest.TestCase):
+    def test_checkFinite(self):
+        from GPy.util.debug import checkFinite
+        array = np.random.normal(0, 1, 100).reshape(25,4)
+        self.assertTrue(checkFinite(array, name='test'))
+
+        array[np.random.binomial(1, .3, array.shape).astype(bool)] = np.nan
+        self.assertFalse(checkFinite(array))
+
+    def test_checkFullRank(self):
+        from GPy.util.debug import checkFullRank
+        from GPy.util.linalg import tdot
+        array = np.random.normal(0, 1, 100).reshape(25,4)
+        self.assertFalse(checkFullRank(tdot(array), name='test'))
+
+        array = np.random.normal(0, 1, (25,25))
+        self.assertTrue(checkFullRank(tdot(array)))
\ No newline at end of file
diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py
index 1c504f89..fb1eb0d6 100644
--- a/GPy/util/__init__.py
+++ b/GPy/util/__init__.py
@@ -15,4 +15,4 @@
 from . import diag
 from . import initialization
 from . import multioutput
 from . import parallel
-
+from . import functions
diff --git a/GPy/util/debug.py b/GPy/util/debug.py
index d691ad82..e09f162c 100644
--- a/GPy/util/debug.py
+++ b/GPy/util/debug.py
@@ -22,7 +22,7 @@ def checkFullRank(m, tol=1e-10, name=None, force_check=False):
         name = 'Matrix with ID['+str(id(m))+']'
     assert len(m.shape)==2 and m.shape[0]==m.shape[1], 'The input of checkFullRank has to be a square matrix!'
 
-    if not force_check and m.shape[0]>=10000:
+    if not force_check and m.shape[0]>=10000: # pragma: no cover
         print('The size of '+name+' is too big to check (>=10000)!')
         return True
diff --git a/GPy/util/functions.py b/GPy/util/functions.py
index be024aeb..a7e8584c 100644
--- a/GPy/util/functions.py
+++ b/GPy/util/functions.py
@@ -1,27 +1,31 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
-from scipy.special import erf, erfc, erfcx
+from scipy import special
+from scipy.special import erfcx
 import sys
 epsilon = sys.float_info.epsilon
 lim_val = -np.log(epsilon)
 
-def logisticln(x):
+def logisticln(x): # pragma: no cover
     return np.where(x<lim_val, np.where(x>-lim_val, -np.log(1+np.exp(-x)), x), -np.log(1+epsilon))
 
-def logistic(x):
-    return np.where(x<lim_val, np.where(x>-lim_val, 1/(1+np.exp(-x)), epsilon/(epsilon+1)), 1/(1+epsilon))
+def logistic(x): # pragma: no cover
+    return special.expit(x)
+    #return np.where(x<lim_val, np.where(x>-lim_val, 1/(1+np.exp(-x)), epsilon/(epsilon+1)), 1/(1+epsilon))
 
-def normcdf(x):
-    g=0.5*erfc(-x/np.sqrt(2))
-    return np.where(g==0, epsilon, np.where(g==1, 1-epsilon, g))
+def normcdf(x): # pragma: no cover
+    return special.ndtr(x)
+    #g=0.5*erfc(-x/np.sqrt(2))
+    #return np.where(g==0, epsilon, np.where(g==1, 1-epsilon, g))
 
-def normcdfln(x):
-    return np.where(x < 0, -.5*x*x + np.log(.5) + np.log(erfcx(-x/np.sqrt(2))), np.log(normcdf(x)))
+def normcdfln(x): # pragma: no cover
+    return special.log_ndtr(x)
+    #return np.where(x < 0, -.5*x*x + np.log(.5) + np.log(erfcx(-x/np.sqrt(2))), np.log(normcdf(x)))
 
-def clip_exp(x):
+def clip_exp(x): # pragma: no cover
     return np.where(x<lim_val, np.where(x>-lim_val, np.exp(x), epsilon), 1/epsilon)
 
-def differfln(x0, x1):
+def differfln(x0, x1): # pragma: no cover
+    # this is, hopefully, a numerically more stable variant of log(erf(x0)-erf(x1)) = log(erfc(x1)-erfc(x0)).
     return np.where(x0>x1, -x1*x1 + np.log(erfcx(x1)-np.exp(-x0**2+x1**2)*erfcx(x0)), -x0*x0 + np.log(np.exp(-x1**2+x0**2)*erfcx(x1) - erfcx(x0)))
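+
+# A quick illustration (sketch, not part of this module) of why the hand-rolled
+# expressions above can be delegated to scipy: the compiled implementations
+# agree with the naive formulas wherever those are numerically safe, e.g.
+#
+#     x = np.linspace(-30, 30, 7)
+#     np.testing.assert_allclose(special.expit(x), 1/(1+np.exp(-x)))
+#     np.testing.assert_allclose(special.ndtr(x), 0.5*special.erfc(-x/np.sqrt(2)))
+#
+# while special.log_ndtr(x) stays finite for large negative x, where taking
+# np.log of an underflowing normcdf(x) would return -inf.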
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index b4ffd1b0..83a6452b 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -78,7 +78,7 @@ def jitchol(A, maxtries=5):
         try: raise
         except:
             logging.warning('\n'.join(['Added jitter of {:.10e}'.format(jitter),
-                '  in '+traceback.format_list(traceback.extract_stack(limit=2)[-2:-1])[0][2:]]))
+                '  in '+traceback.format_list(traceback.extract_stack(limit=3)[-2:-1])[0][2:]]))
         return L
 
 # def dtrtri(L, lower=1):
diff --git a/benchmarks/regression/evaluation.py b/benchmarks/regression/evaluation.py
index fbbfe6d7..c57bce7e 100644
--- a/benchmarks/regression/evaluation.py
+++ b/benchmarks/regression/evaluation.py
@@ -18,4 +18,4 @@ class RMSE(Evaluation):
 
     def evaluate(self, gt, pred):
         return np.sqrt(np.square(gt-pred).astype(np.float).mean())
-    
\ No newline at end of file
+
diff --git a/setup.cfg b/setup.cfg
index 3a2581bd..188a6bab 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.9.6
+current_version = 0.9.7
 tag = True
 commit = True
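+# Illustrative usage note (assuming bumpversion's documented behaviour): with
+# tag = True and commit = True, running `bumpversion patch` would move
+# current_version from 0.9.7 to 0.9.8 and create the matching git commit and
+# tag in one step.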