merge with upstream

@@ -2,7 +2,7 @@
 [run]
 branch = True
 source = GPy
-omit = ./GPy/testing/*.py, travis_tests.py, setup.py, ./GPy/__version__.py
+omit = ./GPy/testing/*.py, travis_tests.py, setup.py, ./GPy/__version__.py, ./GPy/plotting/*

 [report]
 # Regexes for lines to exclude from consideration
.travis.yml
@@ -20,33 +20,19 @@ env:
   - PYTHON_VERSION=3.5

 before_install:
-  - export CONDA_CACHED=1
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]];
-    then export OS=Linux;
-    elif [[ "$TRAVIS_OS_NAME" == "osx" ]];
-    then export OS=MacOSX;
-    brew install pandoc;
-    else
-    echo "OS not supported yet";
-    exit 1; fi;
-  - if [[ $PYTHON_VERSION == "2.7" ]];
-    then export MINICONDA=Miniconda;
-    elif [[ $PYTHON_VERSION == 3* ]];
-    then export MINICONDA=Miniconda3;
-    else echo "Could not find python version";exit 1; fi;
-  - if [ ! -d $HOME/download/ ]; then mkdir $HOME/download/; fi;
-  - if [ ! -d $HOME/install/ ]; then mkdir $HOME/install/; fi;
-  - export MINICONDA_FILE=$MINICONDA-latest-$OS-x86_64-$PYTHON_VERSION
-  - export MINCONDA_CACHE_FILE=$HOME/download/$MINICONDA_FILE.sh
-  - export MINICONDA_INSTALL=$HOME/install/$MINICONDA_FILE
-  - if [ ! -f $MINCONDA_CACHE_FILE ]; then export CONDA_CACHED=0; wget http://repo.continuum.io/miniconda/$MINICONDA-latest-$OS-x86_64.sh -O $MINCONDA_CACHE_FILE; bash $MINCONDA_CACHE_FILE -b -p $MINICONDA_INSTALL; fi;
-  - export PATH="$MINICONDA_INSTALL/bin:$PATH";
+  - wget https://github.com/mzwiessele/travis_scripts/raw/master/download_miniconda.sh
+  - wget https://github.com/mzwiessele/travis_scripts/raw/master/install_retry.sh
+  - source download_miniconda.sh
   - echo $PATH

 install:
-  - conda install --yes python=$PYTHON_VERSION numpy=1.9 scipy=0.16 nose pip six matplotlib sphinx;
-  - pip install codecov
-  - pip install pypandoc
-  - python setup.py develop
-  - echo $PATH
+  - source install_retry.sh
+  - pip install codecov
+  - pip install pypandoc
+  - pip install git+git://github.com/BRML/climin.git
+  - pip install autograd
+  - python setup.py develop

 script:
   - coverage run travis_tests.py

@@ -60,16 +46,16 @@ before_deploy:
   - sphinx-apidoc -o source/ ../GPy
   - make html
   - cd ../
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]];
     then export DIST='sdist';
     elif [[ "$TRAVIS_OS_NAME" == "osx" ]];
     then export DIST='bdist_wheel';
     fi;

 deploy:
   provider: pypi
   user: maxz
   password:
     secure: "vMEOlP7DQhFJ7hQAKtKC5hrJXFl5BkUt4nXdosWWiw//Kg8E+PPLg88XPI2gqIosir9wwgtbSBBbbwCxkM6uxRNMpoNR8Ixyv9fmSXp4rLl7bbBY768W7IRXKIBjpuEy2brQjoT+CwDDSzUkckHvuUjJDNRvUv8ab4P/qYO1LG4="
   on:
     tags: false
GPy/__version__.py
@@ -1 +1 @@
-__version__ = "0.9.6"
+__version__ = "0.9.7"
@@ -43,4 +43,4 @@ def randomize(self, rand_gen=None, *args, **kwargs):

 Model.randomize = randomize
+Param.randomize = randomize
 Parameterized.randomize = randomize
@@ -401,9 +401,9 @@ class GP(Model):
         var_jac = compute_cov_inner(self.posterior.woodbury_inv)
         return mean_jac, var_jac

-    def predict_wishard_embedding(self, Xnew, kern=None, mean=True, covariance=True):
+    def predict_wishart_embedding(self, Xnew, kern=None, mean=True, covariance=True):
         """
-        Predict the wishard embedding G of the GP. This is the density of the
+        Predict the wishart embedding G of the GP. This is the density of the
         input of the GP defined by the probabilistic function mapping f.
         G = J_mean.T*J_mean + output_dim*J_cov.

@@ -431,6 +431,10 @@ class GP(Model):
         G += Sigma
         return G

+    def predict_wishard_embedding(self, Xnew, kern=None, mean=True, covariance=True):
+        warnings.warn("Wrong naming, use predict_wishart_embedding instead. Will be removed in future versions!", DeprecationWarning)
+        return self.predict_wishart_embedding(Xnew, kern, mean, covariance)
+
     def predict_magnification(self, Xnew, kern=None, mean=True, covariance=True):
         """
         Predict the magnification factor as
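A note on the rename above: the old misspelled method is kept as a thin alias that raises a DeprecationWarning before delegating, so existing user code keeps working for one release cycle. A minimal sketch of the same pattern in isolation (hypothetical Thing/new_name identifiers, not part of this diff):

import warnings

class Thing(object):
    def new_name(self, x):
        return x * 2

    def old_name(self, x):
        # deprecated alias: warn once, then delegate to the renamed method
        warnings.warn("old_name is deprecated, use new_name instead",
                      DeprecationWarning)
        return self.new_name(x)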
@@ -45,4 +45,4 @@ class Model(ParamzModel, Priorizable):
         (including the MAP prior), so we return it here. If your model is not
         probabilistic, just return your *negative* gradient here!
         """
         return -(self._log_likelihood_gradients() + self._log_prior_gradients())
@@ -3,7 +3,7 @@

 from .param import Param
 from .parameterized import Parameterized
-from paramz import transformations
+from . import transformations

 from paramz.core import lists_and_dicts, index_operations, observable_array, observable
 from paramz import ties_and_remappings, ObsAr
@@ -7,4 +7,4 @@ from paramz.transformations import __fixed__
 import logging, numpy as np

 class Param(Param, Priorizable):
     pass
@@ -49,4 +49,4 @@ class Parameterized(Parameterized, Priorizable):
     If you want to operate on all parameters use m[''] to wildcard select all paramters
     and concatenate them. Printing m[''] will result in printing of all parameters in detail.
     """
     pass
@@ -2,4 +2,4 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 from paramz.transformations import *
 from paramz.transformations import __fixed__
@@ -44,7 +44,7 @@ class SparseGP(GP):
         #pick a sensible inference method
         if inference_method is None:
             if isinstance(likelihood, likelihoods.Gaussian):
-                inference_method = var_dtc.VarDTC(limit=1)
+                inference_method = var_dtc.VarDTC(limit=3)
             else:
                 #inference_method = ??
                 raise NotImplementedError("what to do what to do?")
@@ -89,7 +89,7 @@ class SVGP(SparseGP):
         """
         Return a new batch of X and Y by taking a chunk of data from the complete X and Y
         """
-        i = self.slicer.next()
+        i = next(self.slicer)
         return self.X_all[i], self.Y_all[i]

     def stochastic_grad(self, parameters):
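The change above swaps Python 2's iterator method for the builtin: gen.next() exists only in Python 2, while next(gen) works on both 2 and 3. A quick illustrative snippet (hypothetical generator, not from this diff):

def chunks(n):
    i = 0
    while True:
        yield range(i, i + n)  # indices of the next minibatch
        i += n

slicer = chunks(10)
print(next(slicer))  # portable; slicer.next() would fail on Python 3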
@@ -459,7 +459,7 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
     D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
     _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim)

-    k = kern.Linear(Q) + kern.White(Q, variance=1e-4)
+    k = kern.Linear(Q, ARD=True) + kern.White(Q, variance=1e-4)
     m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing, kernel=k, initx="PCA_concat", initz='permute', **kw)

     m['.*noise'] = [Y.var() / 40. for Y in Ylist]

@@ -479,7 +479,7 @@ def mrd_simulation_missing_data(optimize=True, verbose=True, plot=True, plot_sim
     D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
     _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)

-    k = kern.Linear(Q) + kern.White(Q, variance=1e-4)
+    k = kern.Linear(Q, ARD=True) + kern.White(Q, variance=1e-4)
     inanlist = []

     for Y in Ylist:
@@ -22,7 +22,7 @@ class VarDTC(LatentFunctionInference):

     """
     const_jitter = 1e-8
-    def __init__(self, limit=1):
+    def __init__(self, limit=3):
         from paramz.caching import Cacher
         self.limit = limit
         self.get_trYYT = Cacher(self._get_trYYT, limit)
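Aside: the limit argument threaded through these constructors ends up in paramz's Cacher, which keeps the results of the last `limit` distinct calls and drops the oldest beyond that, so limit=3 lets three argument combinations stay warm instead of one. The behaviour is analogous to a bounded memoizer (standard-library analogy, not the paramz implementation):

import functools

@functools.lru_cache(maxsize=3)  # plays the role of Cacher(f, limit=3)
def expensive(x):
    print("computing", x)
    return x ** 2

expensive(2.0)  # computes and caches
expensive(2.0)  # cache hit: nothing printed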
@@ -21,7 +21,7 @@ class VarDTC_minibatch(LatentFunctionInference):

     """
     const_jitter = 1e-8
-    def __init__(self, batchsize=None, limit=1, mpi_comm=None):
+    def __init__(self, batchsize=None, limit=3, mpi_comm=None):

         self.batchsize = batchsize
         self.mpi_comm = mpi_comm
@@ -1,5 +1,8 @@
-from paramz.optimization import stochastics, Optimizer
+from paramz.optimization import Optimizer
+from . import stochastics

 from paramz.optimization import *
 import sys

+sys.modules['GPy.inference.optimization.stochastics'] = stochastics
 sys.modules['GPy.inference.optimization.Optimizer'] = Optimizer
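Registering the submodule in sys.modules, as the hunk above does, keeps old import paths working after code moves. The idiom in isolation (legacy_linalg is a made-up alias for demonstration):

import sys
import numpy.linalg as linalg

# expose numpy.linalg under a legacy name, exactly like the hunk above
sys.modules['legacy_linalg'] = linalg

import legacy_linalg           # resolves via the sys.modules entry
assert legacy_linalg is linalg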
GPy/inference/optimization/stochastics.py (new file, 119 lines)
@@ -0,0 +1,119 @@
+#===============================================================================
+# Copyright (c) 2015, Max Zwiessele
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of paramax nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#===============================================================================
+
+class StochasticStorage(object):
+    '''
+    This is a container for holding the stochastic parameters,
+    such as subset indices or step length and so on.
+
+    self.d has to be a list of lists:
+    [dimension indices, nan indices for those dimensions]
+    so that the minibatches can be used as efficiently as possible.
+    '''
+    def __init__(self, model):
+        """
+        Initialize this stochastic container using the given model
+        """
+
+    def do_stochastics(self):
+        """
+        Update the internal state to the next batch of the stochastic
+        descent algorithm.
+        """
+        pass
+
+    def reset(self):
+        """
+        Reset the state of this stochastics generator.
+        """
+
+class SparseGPMissing(StochasticStorage):
+    def __init__(self, model, batchsize=1):
+        """
+        Here we want to loop over all dimensions everytime.
+        Thus, we can just make sure the loop goes over self.d every
+        time. We will try to get batches which look the same together
+        which speeds up calculations significantly.
+        """
+        import numpy as np
+        self.Y = model.Y_normalized
+        bdict = {}
+        #For N > 1000 array2string default crops
+        opt = np.get_printoptions()
+        np.set_printoptions(threshold=np.inf)
+        for d in range(self.Y.shape[1]):
+            inan = np.isnan(self.Y)[:, d]
+            arr_str = np.array2string(inan, np.inf, 0, True, '', formatter={'bool':lambda x: '1' if x else '0'})
+            try:
+                bdict[arr_str][0].append(d)
+            except:
+                bdict[arr_str] = [[d], ~inan]
+        np.set_printoptions(**opt)
+        self.d = bdict.values()
+
+class SparseGPStochastics(StochasticStorage):
+    """
+    For the sparse gp we need to store the dimension we are in,
+    and the indices corresponding to those
+    """
+    def __init__(self, model, batchsize=1, missing_data=True):
+        self.batchsize = batchsize
+        self.output_dim = model.Y.shape[1]
+        self.Y = model.Y_normalized
+        self.missing_data = missing_data
+        self.reset()
+        self.do_stochastics()
+
+    def do_stochastics(self):
+        import numpy as np
+        if self.batchsize == 1:
+            self.current_dim = (self.current_dim+1)%self.output_dim
+            self.d = [[[self.current_dim], np.isnan(self.Y[:, self.current_dim]) if self.missing_data else None]]
+        else:
+            self.d = np.random.choice(self.output_dim, size=self.batchsize, replace=False)
+            bdict = {}
+            if self.missing_data:
+                opt = np.get_printoptions()
+                np.set_printoptions(threshold=np.inf)
+                for d in self.d:
+                    inan = np.isnan(self.Y[:, d])
+                    arr_str = np.array2string(inan, np.inf, 0, True, '', formatter={'bool':lambda x: '1' if x else '0'})
+                    try:
+                        bdict[arr_str][0].append(d)
+                    except:
+                        bdict[arr_str] = [[d], ~inan]
+                np.set_printoptions(**opt)
+                self.d = bdict.values()
+            else:
+                self.d = [[self.d, None]]
+
+    def reset(self):
+        self.current_dim = -1
+        self.d = None
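The new containers above group output dimensions by their pattern of missing values, so columns sharing a NaN mask can be processed in one batch; the string built with np.array2string is just a hashable fingerprint of the boolean mask. The same grouping can be sketched more directly with tuple keys (toy data, not the GPy code path):

import numpy as np

Y = np.array([[1.0, np.nan, np.nan],
              [3.0, 4.0,    5.0],
              [6.0, np.nan, np.nan]])

groups = {}
for d in range(Y.shape[1]):
    inan = np.isnan(Y[:, d])
    key = tuple(inan)  # hashable fingerprint of the column's NaN pattern
    groups.setdefault(key, [[], ~inan])[0].append(d)

# groups maps pattern -> [column indices, boolean mask of observed rows];
# here columns 1 and 2 share a pattern and land in one minibatch: [[1, 2], ...]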
@@ -15,4 +15,4 @@


 # [plotting]
-# library = matplotlib # plotly
+# library = matplotlib # plotly, none
@@ -10,7 +10,7 @@ from .src.add import Add
 from .src.prod import Prod
 from .src.rbf import RBF
 from .src.linear import Linear, LinearFull
-from .src.static import Bias, White, Fixed
+from .src.static import Bias, White, Fixed, WhiteHeteroscedastic
 from .src.brownian import Brownian
 from .src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine
 from .src.mlp import MLP

@@ -28,4 +28,4 @@ from .src.trunclinear import TruncLinear,TruncLinear_inf
 from .src.splitKern import SplitKern,DEtime
 from .src.splitKern import DEtime as DiffGenomeKern
 from .src.spline import Spline
 from .src.basis_funcs import LogisticBasisFuncKernel, LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel
@@ -162,4 +162,4 @@ class ODE_t(Kern):

         self.lengthscale_Yt.gradient = np.sum(dkYdlent*(-0.5*self.lengthscale_Yt**(-2)) * dL_dK)

         self.ubias.gradient = np.sum(dkdubias * dL_dK)
@@ -1 +1 @@
 from . import psi_comp
@@ -19,8 +19,8 @@ class Add(CombinationKernel):
             if isinstance(kern, Add):
                 del subkerns[i]
                 for part in kern.parts[::-1]:
-                    kern.unlink_parameter(part)
-                    subkerns.insert(i, part)
+                    #kern.unlink_parameter(part)
+                    subkerns.insert(i, part.copy())
         super(Add, self).__init__(subkerns, name)
         self._exact_psicomp = self._check_exact_psicomp()

@@ -37,7 +37,7 @@ class Add(CombinationKernel):
         else:
             return False

-    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def K(self, X, X2=None, which_parts=None):
         """
         Add all kernels together.

@@ -51,7 +51,7 @@ class Add(CombinationKernel):
             which_parts = [which_parts]
         return reduce(np.add, (p.K(X, X2) for p in which_parts))

-    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def Kdiag(self, X, which_parts=None):
         if which_parts is None:
             which_parts = self.parts

@@ -98,17 +98,17 @@ class Add(CombinationKernel):
         [target.__iadd__(p.gradients_XX_diag(dL_dKdiag, X)) for p in self.parts]
         return target

-    @Cache_this(limit=1, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def psi0(self, Z, variational_posterior):
         if not self._exact_psicomp: return Kern.psi0(self,Z,variational_posterior)
         return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts))

-    @Cache_this(limit=1, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def psi1(self, Z, variational_posterior):
         if not self._exact_psicomp: return Kern.psi1(self,Z,variational_posterior)
         return reduce(np.add, (p.psi1(Z, variational_posterior) for p in self.parts))

-    @Cache_this(limit=1, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def psi2(self, Z, variational_posterior):
         if not self._exact_psicomp: return Kern.psi2(self,Z,variational_posterior)
         psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts))

@@ -144,7 +144,7 @@ class Add(CombinationKernel):
             raise NotImplementedError("psi2 cannot be computed for this kernel")
         return psi2

-    @Cache_this(limit=1, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def psi2n(self, Z, variational_posterior):
         if not self._exact_psicomp: return Kern.psi2n(self, Z, variational_posterior)
         psi2 = reduce(np.add, (p.psi2n(Z, variational_posterior) for p in self.parts))

@@ -241,16 +241,20 @@ class Add(CombinationKernel):
         [np.add(target_grads[i],grads[i],target_grads[i]) for i in range(len(grads))]
         return target_grads

-    def add(self, other):
-        if isinstance(other, Add):
-            other_params = other.parameters[:]
-            for p in other_params:
-                other.unlink_parameter(p)
-            self.link_parameters(*other_params)
-        else:
-            self.link_parameter(other)
-        self.input_dim, self._all_dims_active = self.get_input_dim_active_dims(self.parts)
-        return self
+    #def add(self, other):
+    #    parts = self.parts
+    #    if 0:#isinstance(other, Add):
+    #        #other_params = other.parameters[:]
+    #        for p in other.parts[:]:
+    #            other.unlink_parameter(p)
+    #        parts.extend(other.parts)
+    #        #self.link_parameters(*other_params)
+    #
+    #    else:
+    #        #self.link_parameter(other)
+    #        parts.append(other)
+    #    #self.input_dim, self._all_dims_active = self.get_input_dim_active_dims(parts)
+    #    return Add([p for p in parts], self.name)

     def input_sensitivity(self, summarize=True):
         if summarize:
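Most hunks in this file (and the kernel files below) only retune the @Cache_this decorator from paramz, which memoizes a method per instance. A rough reading of the knobs being changed, sketched on a toy class (semantics paraphrased from paramz, not verified against every version):

from paramz.caching import Cache_this

class ToyKern(object):
    @Cache_this(limit=3, force_kwargs=['which_parts'])
    def K(self, X, X2=None, which_parts=None):
        # limit=3: keep results for the 3 most recent input combinations.
        # force_kwargs: passing which_parts explicitly bypasses the cache,
        # since a subset of parts must not be served a cached full sum.
        ...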
@@ -64,7 +64,7 @@ class EQ_ODE2(Kern):
         self.W = Param('W', W)
         self.link_parameters(self.lengthscale, self.C, self.B, self.W)

-    @Cache_this(limit=2)
+    @Cache_this(limit=3)
     def K(self, X, X2=None):
         #This way is not working, indexes are lost after using k._slice_X
         #index = np.asarray(X, dtype=np.int)
@@ -48,11 +48,12 @@ class Kern(Parameterized):

         if active_dims is None:
             active_dims = np.arange(input_dim)

-        self.active_dims = active_dims
-        self._all_dims_active = np.atleast_1d(active_dims).astype(int)
-
-        assert self._all_dims_active.size == self.input_dim, "input_dim={} does not match len(active_dim)={}, _all_dims_active={}".format(self.input_dim, self._all_dims_active.size, self._all_dims_active)
+        self.active_dims = np.asarray(active_dims, np.int_)
+
+        self._all_dims_active = np.atleast_1d(self.active_dims).astype(int)
+
+        assert self.active_dims.size == self.input_dim, "input_dim={} does not match len(active_dim)={}".format(self.input_dim, self._all_dims_active.size)

         self._sliced_X = 0
         self.useGPU = self._support_GPU and useGPU

@@ -68,9 +69,12 @@ class Kern(Parameterized):
     def _effective_input_dim(self):
         return np.size(self._all_dims_active)

-    @Cache_this(limit=20)
+    @Cache_this(limit=3)
     def _slice_X(self, X):
-        return X[:, self._all_dims_active]
+        try:
+            return X[:, self._all_dims_active].astype('float')
+        except:
+            return X[:, self._all_dims_active]

     def K(self, X, X2):
         """

@@ -319,10 +323,20 @@ class CombinationKernel(Kern):
         :param array-like extra_dims: if needed extra dimensions for the combination kernel to work on
         """
         assert all([isinstance(k, Kern) for k in kernels])
-        extra_dims = np.array(extra_dims, dtype=int)
-        input_dim, active_dims = self.get_input_dim_active_dims(kernels, extra_dims)
+        extra_dims = np.asarray(extra_dims, dtype=int)
+
+        active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int))
+
+        input_dim = active_dims.size
+        if extra_dims is not None:
+            input_dim += extra_dims.size
+
         # initialize the kernel with the full input_dim
         super(CombinationKernel, self).__init__(input_dim, active_dims, name)
+
+        effective_input_dim = reduce(max, (k._all_dims_active.max() for k in kernels)) + 1
+        self._all_dims_active = np.array(np.concatenate((np.arange(effective_input_dim), extra_dims if extra_dims is not None else [])), dtype=int)
+
         self.extra_dims = extra_dims
         self.link_parameters(*kernels)

@@ -330,16 +344,8 @@ class CombinationKernel(Kern):
     def parts(self):
         return self.parameters

-    def get_input_dim_active_dims(self, kernels, extra_dims = None):
-        self.active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int))
-        #_all_dims_active = np.array(np.concatenate((_all_dims_active, extra_dims if extra_dims is not None else [])), dtype=int)
-        input_dim = reduce(max, (k._all_dims_active.max() for k in kernels)) + 1
-
-        if extra_dims is not None:
-            input_dim += extra_dims.size
-
-        _all_dims_active = np.arange(input_dim)
-        return input_dim, _all_dims_active
+    def _set_all_dims_ative(self):
+        self._all_dims_active = np.atleast_1d(self.active_dims).astype(int)

     def input_sensitivity(self, summarize=True):
         """
@@ -51,7 +51,7 @@ class Linear(Kern):
         self.link_parameter(self.variances)
         self.psicomp = PSICOMP_Linear()

-    @Cache_this(limit=2)
+    @Cache_this(limit=3)
     def K(self, X, X2=None):
         if self.ARD:
             if X2 is None:

@@ -62,7 +62,7 @@ class Linear(Kern):
         else:
             return self._dot_product(X, X2) * self.variances

-    @Cache_this(limit=1, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def _dot_product(self, X, X2=None):
         if X2 is None:
             return tdot(X)
@@ -45,7 +45,7 @@ class MLP(Kern):
         self.link_parameters(self.variance, self.weight_variance, self.bias_variance)


-    @Cache_this(limit=20, ignore_args=())
+    @Cache_this(limit=3, ignore_args=())
     def K(self, X, X2=None):
         if X2 is None:
             X_denom = np.sqrt(self._comp_prod(X)+1.)

@@ -57,7 +57,7 @@ class MLP(Kern):
             XTX = self._comp_prod(X,X2)/X_denom[:,None]/X2_denom[None,:]
         return self.variance*four_over_tau*np.arcsin(XTX)

-    @Cache_this(limit=20, ignore_args=())
+    @Cache_this(limit=3, ignore_args=())
     def Kdiag(self, X):
         """Compute the diagonal of the covariance matrix for X."""
         X_prod = self._comp_prod(X)

@@ -88,14 +88,14 @@ class MLP(Kern):
         """Gradient of diagonal of covariance with respect to X"""
         return self._comp_grads_diag(dL_dKdiag, X)[3]

-    @Cache_this(limit=50, ignore_args=())
+    @Cache_this(limit=3, ignore_args=())
     def _comp_prod(self, X, X2=None):
         if X2 is None:
             return (np.square(X)*self.weight_variance).sum(axis=1)+self.bias_variance
         else:
             return (X*self.weight_variance).dot(X2.T)+self.bias_variance

-    @Cache_this(limit=20, ignore_args=(1,))
+    @Cache_this(limit=3, ignore_args=(1,))
     def _comp_grads(self, dL_dK, X, X2=None):
         var,w,b = self.variance, self.weight_variance, self.bias_variance
         K = self.K(X, X2)

@@ -130,7 +130,7 @@ class MLP(Kern):
         dX2 = common.T.dot(X)*w-((common*XTX).sum(axis=0)/(X2_prod+1.))[:,None]*X2*w
         return dvar, dw, db, dX, dX2

-    @Cache_this(limit=20, ignore_args=(1,))
+    @Cache_this(limit=3, ignore_args=(1,))
     def _comp_grads_diag(self, dL_dKdiag, X):
         var,w,b = self.variance, self.weight_variance, self.bias_variance
         K = self.Kdiag(X)
@@ -5,32 +5,49 @@ import numpy as np
 from .kern import Kern
 from ...core.parameterization import Param
 from paramz.transformations import Logexp
 from paramz.caching import Cache_this

 class Poly(Kern):
     """
     Polynomial kernel
     """

-    def __init__(self, input_dim, variance=1., order=3., active_dims=None, name='poly'):
+    def __init__(self, input_dim, variance=1., scale=1., bias=1., order=3., active_dims=None, name='poly'):
         super(Poly, self).__init__(input_dim, active_dims, name)
         self.variance = Param('variance', variance, Logexp())
-        self.link_parameter(self.variance)
+        self.scale = Param('scale', scale, Logexp())
+        self.bias = Param('bias', bias, Logexp())
+
+        self.link_parameters(self.variance, self.scale, self.bias)
         assert order >= 1, 'The order of the polynomial has to be at least 1.'
         self.order=order

-    def K(self, X, X2=None):
-        return (self._dot_product(X, X2) + 1.)**self.order * self.variance
-
-    def _dot_product(self, X, X2=None):
+    def K(self, X, X2=None):
+        _, _, B = self._AB(X, X2)
+        return B * self.variance
+
+    @Cache_this(limit=3)
+    def _AB(self, X, X2=None):
         if X2 is None:
-            return np.dot(X, X.T)
+            dot_prod = np.dot(X, X.T)
         else:
-            return np.dot(X, X2.T)
+            dot_prod = np.dot(X, X2.T)
+        A = (self.scale * dot_prod) + self.bias
+        B = A ** self.order
+        return dot_prod, A, B

     def Kdiag(self, X):
-        return self.variance*(np.square(X).sum(1) + 1.)**self.order
+        return self.K(X).diagonal()#self.variance*(np.square(X).sum(1) + 1.)**self.order

     def update_gradients_full(self, dL_dK, X, X2=None):
-        self.variance.gradient = np.sum(dL_dK * (self._dot_product(X, X2) + 1.)**self.order)
+        dot_prod, A, B = self._AB(X, X2)
+        dK_dA = self.variance * self.order * A ** (self.order-1.)
+        dL_dA = dL_dK * (dK_dA)
+        self.scale.gradient = (dL_dA * dot_prod).sum()
+        self.bias.gradient = dL_dA.sum()
+        self.variance.gradient = np.sum(dL_dK * B)
+        #import ipdb;ipdb.set_trace()

     def update_gradients_diag(self, dL_dKdiag, X):
         raise NotImplementedError
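The reworked Poly kernel computes k(x, x') = sigma^2 * (s * x.T x' + b)^d, and the chain rule gives dK/ds = sigma^2 * d * A^(d-1) * (x.T x') with A = s * x.T x' + b, which is exactly what update_gradients_full implements. A quick finite-difference sanity check of that derivative (toy values, not a GPy test):

import numpy as np

np.random.seed(0)
sigma2, s, b, d = 2.0, 1.5, 1.0, 3.0
X = np.random.rand(4, 2)
dot = X.dot(X.T)

def K(s_):
    return sigma2 * (s_ * dot + b) ** d

A = s * dot + b
dK_ds = sigma2 * d * A ** (d - 1.0) * dot   # matches the analytic gradient above

eps = 1e-6
numeric = (K(s + eps) - K(s - eps)) / (2 * eps)
assert np.allclose(dK_ds, numeric)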
@@ -39,7 +39,7 @@ class Prod(CombinationKernel):
                 kernels.insert(i, part)
         super(Prod, self).__init__(kernels, name)

-    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def K(self, X, X2=None, which_parts=None):
         if which_parts is None:
             which_parts = self.parts

@@ -48,7 +48,7 @@ class Prod(CombinationKernel):
             which_parts = [which_parts]
         return reduce(np.multiply, (p.K(X, X2) for p in which_parts))

-    @Cache_this(limit=2, force_kwargs=['which_parts'])
+    @Cache_this(limit=3, force_kwargs=['which_parts'])
     def Kdiag(self, X, which_parts=None):
         if which_parts is None:
             which_parts = self.parts
@@ -21,7 +21,7 @@ from .gaussherm import PSICOMP_GH
 from . import rbf_psi_comp, linear_psi_comp, ssrbf_psi_comp, sslinear_psi_comp

 class PSICOMP_RBF(PSICOMP):
-    @Cache_this(limit=10, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
         variance, lengthscale = kern.variance, kern.lengthscale
         if isinstance(variational_posterior, variational.NormalPosterior):

@@ -31,7 +31,7 @@ class PSICOMP_RBF(PSICOMP):
         else:
             raise ValueError("unknown distriubtion received for psi-statistics")

-    @Cache_this(limit=10, ignore_args=(0,2,3,4))
+    @Cache_this(limit=3, ignore_args=(0,2,3,4))
     def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         variance, lengthscale = kern.variance, kern.lengthscale
         if isinstance(variational_posterior, variational.NormalPosterior):

@@ -43,7 +43,7 @@ class PSICOMP_RBF(PSICOMP):

 class PSICOMP_Linear(PSICOMP):

-    @Cache_this(limit=10, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
         variances = kern.variances
         if isinstance(variational_posterior, variational.NormalPosterior):

@@ -53,7 +53,7 @@ class PSICOMP_Linear(PSICOMP):
         else:
             raise ValueError("unknown distriubtion received for psi-statistics")

-    @Cache_this(limit=10, ignore_args=(0,2,3,4))
+    @Cache_this(limit=3, ignore_args=(0,2,3,4))
     def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         variances = kern.variances
         if isinstance(variational_posterior, variational.NormalPosterior):
@@ -27,7 +27,7 @@ class PSICOMP_GH(PSICOMP):
     def _setup_observers(self):
         pass

-    @Cache_this(limit=10, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def comp_K(self, Z, qX):
         if self.Xs is None or self.Xs.shape != qX.mean.shape:
             from paramz import ObsAr

@@ -38,7 +38,7 @@ class PSICOMP_GH(PSICOMP):
             self.Xs[i] = self.locs[i]*S_sq+mu
         return self.Xs

-    @Cache_this(limit=10, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def psicomputations(self, kern, Z, qX, return_psi2_n=False):
         mu, S = qX.mean.values, qX.variance.values
         N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]

@@ -62,7 +62,7 @@ class PSICOMP_GH(PSICOMP):
             psi2 += self.weights[i]* tdot(Kfu.T)
         return psi0, psi1, psi2

-    @Cache_this(limit=10, ignore_args=(0, 2,3,4))
+    @Cache_this(limit=3, ignore_args=(0, 2,3,4))
     def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, qX):
         mu, S = qX.mean.values, qX.variance.values
         if self.cache_K: Xs = self.comp_K(Z, qX)
@@ -132,5 +132,5 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):

     return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS

-_psi1computations = Cacher(__psi1computations, limit=5)
-_psi2computations = Cacher(__psi2computations, limit=5)
+_psi1computations = Cacher(__psi1computations, limit=3)
+_psi2computations = Cacher(__psi2computations, limit=3)
@@ -324,7 +324,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
         except:
             return self.fall_back.psicomputations(kern, Z, variational_posterior, return_psi2_n)

-    @Cache_this(limit=10, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def _psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
         """
         Z - MxQ

@@ -369,7 +369,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
         except:
             return self.fall_back.psiDerivativecomputations(kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)

-    @Cache_this(limit=10, ignore_args=(0,2,3,4))
+    @Cache_this(limit=3, ignore_args=(0,2,3,4))
     def _psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         # resolve the requirement of dL_dpsi2 to be symmetric
         if len(dL_dpsi2.shape)==2: dL_dpsi2 = (dL_dpsi2+dL_dpsi2.T)/2
@@ -88,7 +88,7 @@ try:
         return psi0,psi1,psi2,psi2n

     from GPy.util.caching import Cacher
-    psicomputations = Cacher(_psicomputations, limit=1)
+    psicomputations = Cacher(_psicomputations, limit=3)

     def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
         ARD = (len(lengthscale)!=1)
@@ -373,7 +373,7 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
     def get_dimensions(self, Z, variational_posterior):
         return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1]

-    @Cache_this(limit=1, ignore_args=(0,))
+    @Cache_this(limit=3, ignore_args=(0,))
     def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
         """
         Z - MxQ

@@ -407,7 +407,7 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
         else:
             return psi0, psi1_gpu.get(), psi2_gpu.get()

-    @Cache_this(limit=1, ignore_args=(0,2,3,4))
+    @Cache_this(limit=3, ignore_args=(0,2,3,4))
     def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         variance, lengthscale = kern.variance, kern.lengthscale
         from ....util.linalg_gpu import sum_axis
@@ -1,6 +1,5 @@
 # -*- coding: utf-8 -*-
-# Copyright (c) 2014, GPy authors (see AUTHORS.txt).
 # Copyright (c) 2015, Alex Grigorevskiy
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 """
 The standard periodic kernel which mentioned in:

@@ -9,7 +8,7 @@ The standard periodic kernel which mentioned in:
 The MIT Press, 2005.


 [2] Introduction to Gaussian processes. D. J. C. MacKay. In C. M. Bishop, editor,
 Neural Networks and Machine Learning, pages 133-165. Springer, 1998.
 """

@@ -25,56 +24,56 @@ class StdPeriodic(Kern):

     .. math::

-       k(x,y) = \theta_1 \exp \left[ - \frac{1}{2} {}\sum_{i=1}^{input\_dim}
-       \left( \frac{\sin(\frac{\pi}{\lambda_i} (x_i - y_i) )}{l_i} \right)^2 \right] }
+       k(x,y) = \theta_1 \exp \left[ - \frac{1}{2} \sum_{i=1}^{input\_dim}
+       \left( \frac{\sin(\frac{\pi}{T_i} (x_i - y_i) )}{l_i} \right)^2 \right] }

     :param input_dim: the number of input dimensions
     :type input_dim: int
     :param variance: the variance :math:`\theta_1` in the formula above
     :type variance: float
-    :param wavelength: the vector of wavelengths :math:`\lambda_i`. If None then 1.0 is assumed.
-    :type wavelength: array or list of the appropriate size (or float if there is only one wavelength parameter)
+    :param period: the vector of periods :math:`\T_i`. If None then 1.0 is assumed.
+    :type period: array or list of the appropriate size (or float if there is only one period parameter)
     :param lengthscale: the vector of lengthscale :math:`\l_i`. If None then 1.0 is assumed.
     :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
-    :param ARD1: Auto Relevance Determination with respect to wavelength.
-        If equal to "False" one single wavelength parameter :math:`\lambda_i` for
-        each dimension is assumed, otherwise there is one lengthscale
+    :param ARD1: Auto Relevance Determination with respect to period.
+        If equal to "False" one single period parameter :math:`\T_i` for
+        each dimension is assumed, otherwise there is one lengthscale
         parameter per dimension.
     :type ARD1: Boolean
-    :param ARD2: Auto Relevance Determination with respect to lengthscale.
-        If equal to "False" one single wavelength parameter :math:`l_i` for
-        each dimension is assumed, otherwise there is one lengthscale
+    :param ARD2: Auto Relevance Determination with respect to lengthscale.
+        If equal to "False" one single lengthscale parameter :math:`l_i` for
+        each dimension is assumed, otherwise there is one lengthscale
         parameter per dimension.
     :type ARD2: Boolean
     :param active_dims: indices of dimensions which are used in the computation of the kernel
-    :type wavelength: array or list of the appropriate size
+    :type active_dims: array or list of the appropriate size
     :param name: Name of the kernel for output
     :type String
     :param useGPU: whether of not use GPU
     :type Boolean
     """

-    def __init__(self, input_dim, variance=1., wavelength=None, lengthscale=None, ARD1=False, ARD2=False, active_dims=None, name='std_periodic',useGPU=False):
+    def __init__(self, input_dim, variance=1., period=None, lengthscale=None, ARD1=False, ARD2=False, active_dims=None, name='std_periodic',useGPU=False):
         super(StdPeriodic, self).__init__(input_dim, active_dims, name, useGPU=useGPU)
         self.input_dim = input_dim
-        self.ARD1 = ARD1 # correspond to wavelengths
+        self.ARD1 = ARD1 # correspond to periods
         self.ARD2 = ARD2 # correspond to lengthscales

         self.name = name

         if self.ARD1 == False:
-            if wavelength is not None:
-                wavelength = np.asarray(wavelength)
-                assert wavelength.size == 1, "Only one wavelength needed for non-ARD kernel"
+            if period is not None:
+                period = np.asarray(period)
+                assert period.size == 1, "Only one period needed for non-ARD kernel"
             else:
-                wavelength = np.ones(1)
+                period = np.ones(1.0)
         else:
-            if wavelength is not None:
-                wavelength = np.asarray(wavelength)
-                assert wavelength.size == input_dim, "bad number of wavelengths"
+            if period is not None:
+                period = np.asarray(period)
+                assert period.size == input_dim, "bad number of periods"
             else:
-                wavelength = np.ones(input_dim)
+                period = np.ones(input_dim)

         if self.ARD2 == False:
             if lengthscale is not None:
                 lengthscale = np.asarray(lengthscale)

@@ -87,33 +86,33 @@ class StdPeriodic(Kern):
                 assert lengthscale.size == input_dim, "bad number of lengthscales"
             else:
                 lengthscale = np.ones(input_dim)

         self.variance = Param('variance', variance, Logexp())
         assert self.variance.size==1, "Variance size must be one"
-        self.wavelengths = Param('wavelengths', wavelength, Logexp())
-        self.lengthscales = Param('lengthscales', lengthscale, Logexp())
-
-        self.link_parameters(self.variance, self.wavelengths, self.lengthscales)
+        self.period = Param('period', period, Logexp())
+        self.lengthscale = Param('lengthscale', lengthscale, Logexp())
+
+        self.link_parameters(self.variance, self.period, self.lengthscale)

     def parameters_changed(self):
         """
         This functions deals as a callback for each optimization iteration.
         If one optimization step was successfull and the parameters
         this callback function will be called to be able to update any
         precomputations for the kernel.
         """

         pass


     def K(self, X, X2=None):
         """Compute the covariance matrix between X and X2."""
         if X2 is None:
             X2 = X

-        base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths
-        exp_dist = np.exp( -0.5* np.sum( np.square( np.sin( base ) / self.lengthscales ), axis = -1 ) )
+        base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.period
+        exp_dist = np.exp( -0.5* np.sum( np.square( np.sin( base ) / self.lengthscale ), axis = -1 ) )

         return self.variance * exp_dist

@@ -125,42 +124,42 @@ class StdPeriodic(Kern):

     def update_gradients_full(self, dL_dK, X, X2=None):
         """derivative of the covariance matrix with respect to the parameters."""
         if X2 is None:
             X2 = X

-        base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths
-
-        sin_base = np.sin( base )
-        exp_dist = np.exp( -0.5* np.sum( np.square( sin_base / self.lengthscales ), axis = -1 ) )
-
-        dwl = self.variance * (1.0/np.square(self.lengthscales)) * sin_base*np.cos(base) * (base / self.wavelengths)
-
-        dl = self.variance * np.square( sin_base) / np.power( self.lengthscales, 3)
-
-        self.variance.gradient = np.sum(exp_dist * dL_dK)
-        #target[0] += np.sum( exp_dist * dL_dK)
-
-        if self.ARD1: # different wavelengths
-            self.wavelengths.gradient = (dwl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
-        else: # same wavelengths
-            self.wavelengths.gradient = np.sum(dwl.sum(-1) * exp_dist * dL_dK)
+        base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.period
+
+        sin_base = np.sin( base )
+        exp_dist = np.exp( -0.5* np.sum( np.square( sin_base / self.lengthscale ), axis = -1 ) )
+
+        dwl = self.variance * (1.0/np.square(self.lengthscale)) * sin_base*np.cos(base) * (base / self.period)
+
+        dl = self.variance * np.square( sin_base) / np.power( self.lengthscale, 3)
+
+        self.variance.gradient = np.sum(exp_dist * dL_dK)
+        #target[0] += np.sum( exp_dist * dL_dK)
+
+        if self.ARD1: # different periods
+            self.period.gradient = (dwl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
+        else: # same period
+            self.period.gradient = np.sum(dwl.sum(-1) * exp_dist * dL_dK)

         if self.ARD2: # different lengthscales
-            self.lengthscales.gradient = (dl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
+            self.lengthscale.gradient = (dl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
         else: # same lengthscales
-            self.lengthscales.gradient = np.sum(dl.sum(-1) * exp_dist * dL_dK)
+            self.lengthscale.gradient = np.sum(dl.sum(-1) * exp_dist * dL_dK)

     def update_gradients_diag(self, dL_dKdiag, X):
         """derivative of the diagonal of the covariance matrix with respect to the parameters."""
         self.variance.gradient = np.sum(dL_dKdiag)
-        self.wavelengths.gradient = 0
-        self.lengthscales.gradient = 0
+        self.period.gradient = 0
+        self.lengthscale.gradient = 0

     # def gradients_X(self, dL_dK, X, X2=None):
     #     """derivative of the covariance matrix with respect to X."""
     #
     #     raise NotImplemented("Periodic kernel: dK_dX not implemented")
     #
     # def gradients_X_diag(self, dL_dKdiag, X):
     #
     #     raise NotImplemented("Periodic kernel: dKdiag_dX not implemented")
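As a side note, both sides of the docstring hunk above carry an unbalanced closing brace at the end of the formula; the kernel the code actually implements is, in clean LaTeX,

k(x, y) = \theta_1 \exp\left[ -\frac{1}{2} \sum_{i=1}^{D} \left( \frac{\sin\left(\frac{\pi}{T_i}(x_i - y_i)\right)}{l_i} \right)^{2} \right]

with variance \theta_1, periods T_i and lengthscales l_i, matching the K implementation below the docstring.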
@@ -81,6 +81,52 @@ class White(Static):
     def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
         self.variance.gradient = dL_dpsi0.sum()

+class WhiteHeteroscedastic(Static):
+    def __init__(self, input_dim, num_data, variance=1., active_dims=None, name='white_hetero'):
+        """
+        A heteroscedastic White kernel (nugget/noise).
+        It defines one variance (nugget) per input sample.
+
+        Prediction excludes any noise learnt by this Kernel, so be careful using this kernel.
+
+        You can plot the errors learnt by this kernel by something similar as:
+        plt.errorbar(m.X, m.Y, yerr=2*np.sqrt(m.kern.white.variance))
+        """
+        super(Static, self).__init__(input_dim, active_dims, name)
+        self.variance = Param('variance', np.ones(num_data) * variance, Logexp())
+        self.link_parameters(self.variance)
+
+    def Kdiag(self, X):
+        if X.shape[0] == self.variance.shape[0]:
+            # If the input has the same number of samples as
+            # the number of variances, we return the variances
+            return self.variance
+        return 0.
+
+    def K(self, X, X2=None):
+        if X2 is None and X.shape[0] == self.variance.shape[0]:
+            return np.eye(X.shape[0]) * self.variance
+        else:
+            return 0.
+
+    def psi2(self, Z, variational_posterior):
+        return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)
+
+    def psi2n(self, Z, variational_posterior):
+        return np.zeros((1, Z.shape[0], Z.shape[0]), dtype=np.float64)
+
+    def update_gradients_full(self, dL_dK, X, X2=None):
+        if X2 is None:
+            self.variance.gradient = np.diagonal(dL_dK)
+        else:
+            self.variance.gradient = 0.
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        self.variance.gradient = dL_dKdiag
+
+    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
+        self.variance.gradient = dL_dpsi0
+
 class Bias(Static):
     def __init__(self, input_dim, variance=1., active_dims=None, name='bias'):
         super(Bias, self).__init__(input_dim, variance, active_dims, name)
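A quick sketch of how the new kernel slots into a model (the API is the one introduced by this diff; the data here is made up):

import numpy as np
import GPy

X = np.linspace(0, 10, 50)[:, None]
Y = np.sin(X) + np.random.randn(50, 1) * 0.1

# one learnable noise variance per data point, added on top of an RBF
k = GPy.kern.RBF(1) + GPy.kern.WhiteHeteroscedastic(1, num_data=X.shape[0])
m = GPy.models.GPRegression(X, Y, k)
m.optimize()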
@@ -81,11 +81,11 @@ class Stationary(Kern):
     def dK_dr(self, r):
         raise NotImplementedError("implement derivative of the covariance function wrt r to use this class")

-    @Cache_this(limit=20, ignore_args=())
+    @Cache_this(limit=3, ignore_args=())
     def dK2_drdr(self, r):
         raise NotImplementedError("implement second derivative of covariance wrt r to use this method")

-    @Cache_this(limit=5, ignore_args=())
+    @Cache_this(limit=3, ignore_args=())
     def K(self, X, X2=None):
         """
         Kernel function applied on inputs X and X2.

@@ -99,6 +99,9 @@ class Stationary(Kern):

+    @Cache_this(limit=3, ignore_args=())
     def dK_dr_via_X(self, X, X2):
         """
         compute the derivative of K wrt X going through X
         """
         #a convenience function, so we can cache dK_dr
         return self.dK_dr(self._scaled_dist(X, X2))
@@ -54,12 +54,12 @@ class TruncLinear(Kern):
         self.add_parameter(self.variances)
         self.add_parameter(self.delta)

-    @Cache_this(limit=2)
+    @Cache_this(limit=3)
     def K(self, X, X2=None):
         XX = self.variances*self._product(X, X2)
         return XX.sum(axis=-1)

-    @Cache_this(limit=2)
+    @Cache_this(limit=3)
     def _product(self, X, X2=None):
         if X2 is None:
             X2 = X

@@ -149,12 +149,12 @@ class TruncLinear_inf(Kern):
         self.add_parameter(self.variances)


-    # @Cache_this(limit=2)
+    # @Cache_this(limit=3)
     def K(self, X, X2=None):
         tmp = self._product(X, X2)
         return (self.variances*tmp).sum(axis=-1)

-    # @Cache_this(limit=2)
+    # @Cache_this(limit=3)
     def _product(self, X, X2=None):
         if X2 is None:
             X2 = X
@@ -61,7 +61,7 @@ class BayesianGPLVM(SparseGP_MPI):
         else:
             from ..inference.latent_function_inference.var_dtc import VarDTC
             self.logger.debug("creating inference_method var_dtc")
-            inference_method = VarDTC(limit=1 if not missing_data else Y.shape[1])
+            inference_method = VarDTC(limit=3 if not missing_data else Y.shape[1])
         if isinstance(inference_method,VarDTC_minibatch):
             inference_method.mpi_comm = mpi_comm
@@ -40,12 +40,13 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
         Z = np.random.permutation(X.copy())[:num_inducing]
         assert Z.shape[1] == X.shape[1]

-        if X_variance == False:
+        if X_variance is False:
             self.logger.info('no variance on X, activating sparse GPLVM')
             X = Param("latent space", X)
-        elif X_variance is None:
-            self.logger.info("initializing latent space variance ~ uniform(0,.1)")
-            X_variance = np.random.uniform(0,.1,X.shape)
+        else:
+            if X_variance is None:
+                self.logger.info("initializing latent space variance ~ uniform(0,.1)")
+                X_variance = np.random.uniform(0,.1,X.shape)
             self.variational_prior = NormalPrior()
             X = NormalPosterior(X, X_variance)

@@ -61,7 +62,7 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
         if inference_method is None:
             from ..inference.latent_function_inference.var_dtc import VarDTC
             self.logger.debug("creating inference_method var_dtc")
-            inference_method = VarDTC(limit=1 if not missing_data else Y.shape[1])
+            inference_method = VarDTC(limit=3 if not missing_data else Y.shape[1])

         super(BayesianGPLVMMiniBatch,self).__init__(X, Y, Z, kernel, likelihood=likelihood,
                                                     name=name, inference_method=inference_method,

@@ -71,13 +72,13 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
         self.X = X
         self.link_parameter(self.X, 0)

-    def set_X_gradients(self, X, X_grad):
-        """Set the gradients of the posterior distribution of X in its specific form."""
-        X.mean.gradient, X.variance.gradient = X_grad
+    #def set_X_gradients(self, X, X_grad):
+    #    """Set the gradients of the posterior distribution of X in its specific form."""
+    #    X.mean.gradient, X.variance.gradient = X_grad

-    def get_X_gradients(self, X):
-        """Get the gradients of the posterior distribution of X in its specific form."""
-        return X.mean.gradient, X.variance.gradient
+    #def get_X_gradients(self, X):
+    #    """Get the gradients of the posterior distribution of X in its specific form."""
+    #    return X.mean.gradient, X.variance.gradient

     def _outer_values_update(self, full_values):
         """

@@ -106,7 +107,7 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
         super(BayesianGPLVMMiniBatch,self).parameters_changed()

         kl_fctr = self.kl_factr
-        if kl_fctr > 0:
+        if kl_fctr > 0 and self.has_uncertain_inputs():
             Xgrad = self.X.gradient.copy()
             self.X.gradient[:] = 0
             self.variational_prior.update_gradients_KL(self.X)

@@ -122,8 +123,8 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):

         if self.missing_data or not self.stochastics:
             self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)
-        elif self.stochastics:
+        else: #self.stochastics is given:
             d = self.output_dim
             self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)*self.stochastics.batchsize/d

         self._Xgrad = self.X.gradient.copy()
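The == False to is False change above is not cosmetic: X_variance may be a NumPy array, and array == False returns an elementwise boolean array whose truth value is ambiguous, while x is False only matches the actual False singleton. A two-line illustration:

import numpy as np

x = np.zeros(3)
# bool(x == False) raises "truth value of an array ... is ambiguous"
print(x is False)  # False: an array is never the False singleton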
@@ -41,4 +41,4 @@ class GPLVM(GP):

     def parameters_changed(self):
         super(GPLVM, self).parameters_changed()
         self.X.gradient = self.kern.gradients_X(self.grad_dict['dL_dK'], self.X, None)
@@ -5,14 +5,14 @@ import numpy as np
 import itertools, logging

 from ..kern import Kern
-from GPy.core.parameterization.variational import NormalPrior
+from ..core.parameterization.variational import NormalPrior
 from ..core.parameterization import Param
 from paramz import ObsAr
 from ..inference.latent_function_inference.var_dtc import VarDTC
 from ..inference.latent_function_inference import InferenceMethodList
 from ..likelihoods import Gaussian
 from ..util.initialization import initialize_latent
-from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch
+from ..models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch

 class MRD(BayesianGPLVMMiniBatch):
     """
@@ -215,40 +215,6 @@ class MRD(BayesianGPLVMMiniBatch):
         Z = np.random.randn(self.num_inducing, self.input_dim) * X.var()
         return Z

-    def _handle_plotting(self, fignum, axes, plotf, sharex=False, sharey=False):
-        import matplotlib.pyplot as plt
-        if axes is None:
-            fig = plt.figure(num=fignum)
-        sharex_ax = None
-        sharey_ax = None
-        plots = []
-        for i, g in enumerate(self.bgplvms):
-            try:
-                if sharex:
-                    sharex_ax = ax # @UndefinedVariable
-                    sharex = False # dont set twice
-                if sharey:
-                    sharey_ax = ax # @UndefinedVariable
-                    sharey = False # dont set twice
-            except:
-                pass
-            if axes is None:
-                ax = fig.add_subplot(1, len(self.bgplvms), i + 1, sharex=sharex_ax, sharey=sharey_ax)
-            elif isinstance(axes, (tuple, list, np.ndarray)):
-                ax = axes[i]
-            else:
-                raise ValueError("Need one axes per latent dimension input_dim")
-            plots.append(plotf(i, g, ax))
-            if sharey_ax is not None:
-                plt.setp(ax.get_yticklabels(), visible=False)
-        plt.draw()
-        if axes is None:
-            try:
-                fig.tight_layout()
-            except:
-                pass
-        return plots
-
     def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None, Yindex=0):
         """
         Prediction for data set Yindex[default=0].

@@ -270,59 +236,50 @@ class MRD(BayesianGPLVMMiniBatch):
         #                 sharex=sharex, sharey=sharey)
         # return fig

-    def plot_scales(self, fignum=None, ax=None, titles=None, sharex=False, sharey=True, *args, **kwargs):
+    def plot_scales(self, titles=None, fig_kwargs={}, **kwargs):
         """
-        TODO: Explain other parameters
+        Plot input sensitivity for all datasets, to see which input dimensions are
+        significant for which dataset.

         :param titles: titles for axes of datasets

+        kwargs go into plot_ARD for each kernel.
         """
+        from ..plotting import plotting_library as pl
+
         if titles is None:
             titles = [r'${}$'.format(name) for name in self.names]
-        ymax = reduce(max, [np.ceil(max(g.kern.input_sensitivity())) for g in self.bgplvms])
-        def plotf(i, g, ax):
-            #ax.set_ylim([0,ymax])
-            return g.kern.plot_ARD(ax=ax, title=titles[i], *args, **kwargs)
-        fig = self._handle_plotting(fignum, ax, plotf, sharex=sharex, sharey=sharey)
-        return fig
+        M = len(self.bgplvms)
+        fig = pl().figure(rows=1, cols=M, **fig_kwargs)
+        for c in range(M):
+            canvas = self.bgplvms[c].kern.plot_ARD(title=titles[c], figure=fig, col=c+1, **kwargs)
+        return canvas

     def plot_latent(self, labels=None, which_indices=None,
-                    resolution=50, ax=None, marker='o', s=40,
-                    fignum=None, plot_inducing=True, legend=True,
+                    resolution=60, legend=True,
                     plot_limits=None,
-                    aspect='auto', updates=False, predict_kwargs={}, imshow_kwargs={}):
+                    updates=False,
+                    kern=None, marker='<>^vsd',
+                    num_samples=1000, projection='2d',
+                    predict_kwargs={},
+                    scatter_kwargs=None, **imshow_kwargs):
         """
-        see plotting.matplot_dep.dim_reduction_plots.plot_latent
+        if predict_kwargs is None, will plot latent spaces for 0th dataset (and kernel), otherwise give
+        predict_kwargs=dict(Yindex='index') for plotting only the latent space of dataset with 'index'.
         """
-        import sys
-        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
-        from matplotlib import pyplot as plt
-        from ..plotting.matplot_dep import dim_reduction_plots
+        from ..plotting.gpy_plot.latent_plots import plot_latent

         if "Yindex" not in predict_kwargs:
             predict_kwargs['Yindex'] = 0

         Yindex = predict_kwargs['Yindex']
-        if ax is None:
-            fig = plt.figure(num=fignum)
-            ax = fig.add_subplot(111)
-        else:
-            fig = ax.figure

         self.kern = self.bgplvms[Yindex].kern
         self.likelihood = self.bgplvms[Yindex].likelihood
-        plot = dim_reduction_plots.plot_latent(self, labels, which_indices,
-                                               resolution, ax, marker, s,
-                                               fignum, plot_inducing, legend,
-                                               plot_limits, aspect, updates, predict_kwargs, imshow_kwargs)
-        ax.set_title(self.bgplvms[Yindex].name)
-        try:
-            fig.tight_layout()
-        except:
-            pass
-
-        return plot
+        return plot_latent(self, labels, which_indices, resolution, legend, plot_limits, updates, kern, marker, num_samples, projection, scatter_kwargs)

     def __getstate__(self):
         state = super(MRD, self).__getstate__()
|
|||
|
|
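For orientation, here is a minimal usage sketch of the reworked MRD plotting API above. The toy data, seed, and iteration counts are illustrative assumptions, not part of the change:

    import numpy as np
    import GPy

    np.random.seed(0)
    t = np.linspace(0, 2*np.pi, 100)[:, None]
    # two views of a shared one-dimensional signal, each with private noise
    Y1 = np.hstack([np.sin(t), np.cos(t)]) + 0.05*np.random.randn(100, 2)
    Y2 = np.hstack([np.sin(t), -np.sin(t), np.cos(t)]) + 0.05*np.random.randn(100, 3)

    m = GPy.models.MRD([Y1, Y2], input_dim=3, num_inducing=20)
    m.optimize(max_iters=100)

    m.plot_scales(titles=['Y_1', 'Y_2'])          # one ARD bar chart per dataset, on one figure
    m.plot_latent(predict_kwargs=dict(Yindex=1))  # latent space w.r.t. the second dataset's kernel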
@ -41,11 +41,12 @@ class SparseGPMiniBatch(SparseGP):
    def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
                 name='sparse gp', Y_metadata=None, normalizer=False,
                 missing_data=False, stochastic=False, batchsize=1):
        self._update_stochastics = False

        # pick a sensible inference method
        if inference_method is None:
            if isinstance(likelihood, likelihoods.Gaussian):
                inference_method = var_dtc.VarDTC(limit=1 if not missing_data else Y.shape[1])
                inference_method = var_dtc.VarDTC(limit=3 if not missing_data else Y.shape[1])
            else:
                #inference_method = ??
                raise NotImplementedError("what to do what to do?")
@ -73,7 +74,14 @@ class SparseGPMiniBatch(SparseGP):
        logger.info("Adding Z as parameter")
        self.link_parameter(self.Z, index=0)
        self.posterior = None

    def optimize(self, optimizer=None, start=None, **kwargs):
        try:
            self._update_stochastics = True
            SparseGP.optimize(self, optimizer=optimizer, start=start, **kwargs)
        finally:
            self._update_stochastics = False

    def has_uncertain_inputs(self):
        return isinstance(self.X, VariationalPosterior)
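The try/finally in optimize() guarantees that the stochastics flag is cleared even when the optimizer raises or is interrupted. A self-contained sketch of the same pattern, with hypothetical names:

    class FlagGuardExample(object):
        def __init__(self):
            self._updating = False

        def _inner_step(self):
            # only valid while the flag is set
            assert self._updating

        def optimize(self):
            try:
                self._updating = True
                self._inner_step()
            finally:
                # always reset, even on exceptions or KeyboardInterrupt
                self._updating = False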
@ -226,16 +234,16 @@ class SparseGPMiniBatch(SparseGP):
        woodbury_inv = self.posterior._woodbury_inv
        woodbury_vector = self.posterior._woodbury_vector

        if not self.stochastics:
            m_f = lambda i: "Inference with missing_data: {: >7.2%}".format(float(i+1)/self.output_dim)
            message = m_f(-1)
            print(message, end=' ')
        #if not self.stochastics:
        #    m_f = lambda i: "Inference with missing_data: {: >7.2%}".format(float(i+1)/self.output_dim)
        #    message = m_f(-1)
        #    print(message, end=' ')

        for d, ninan in self.stochastics.d:
            if not self.stochastics:
                print(' '*(len(message)) + '\r', end=' ')
                message = m_f(d)
                print(message, end=' ')
            #if not self.stochastics:
            #    print(' '*(len(message)) + '\r', end=' ')
            #    message = m_f(d)
            #    print(message, end=' ')

            psi0ni = self.psi0[ninan]
            psi1ni = self.psi1[ninan]
@ -262,8 +270,8 @@ class SparseGPMiniBatch(SparseGP):
                woodbury_vector[:, d] = posterior.woodbury_vector
            self._log_marginal_likelihood += log_marginal_likelihood

        if not self.stochastics:
            print('')
        #if not self.stochastics:
        #    print('')

        if self.posterior is None:
            self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
@ -314,6 +322,8 @@ class SparseGPMiniBatch(SparseGP):
        if self.missing_data:
            self._outer_loop_for_missing_data()
        elif self.stochastics:
            if self._update_stochastics:
                self.stochastics.do_stochastics()
            self._outer_loop_without_missing_data()
        else:
            self.posterior, self._log_marginal_likelihood, self.grad_dict = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata)
@ -62,4 +62,4 @@ class SparseGPRegression(SparseGP_MPI):
        if isinstance(self.inference_method, VarDTC_minibatch):
            update_gradients_sparsegp(self, mpi_comm=self.mpi_comm)
        else:
            super(SparseGPRegression, self).parameters_changed()
@ -4,6 +4,7 @@
import sys
from .sparse_gp_regression import SparseGPRegression
from ..core import Param

class SparseGPLVM(SparseGPRegression):
    """
@ -21,7 +22,9 @@ class SparseGPLVM(SparseGPRegression):
        if X is None:
            from ..util.initialization import initialize_latent
            X, fracs = initialize_latent(init, input_dim, Y)
        X = Param('latent space', X)
        SparseGPRegression.__init__(self, X, Y, kernel=kernel, num_inducing=num_inducing)
        self.link_parameter(self.X, 0)

    def parameters_changed(self):
        super(SparseGPLVM, self).parameters_changed()
@ -104,4 +104,4 @@ cdict_Alu = {'red' :((0./5,colorsRGB['Aluminium1'][0]/256.,colorsRGB['Aluminium1
                     (2./5,colorsRGB['Aluminium3'][2]/256.,colorsRGB['Aluminium3'][2]/256.),
                     (3./5,colorsRGB['Aluminium4'][2]/256.,colorsRGB['Aluminium4'][2]/256.),
                     (4./5,colorsRGB['Aluminium5'][2]/256.,colorsRGB['Aluminium5'][2]/256.),
                     (5./5,colorsRGB['Aluminium6'][2]/256.,colorsRGB['Aluminium6'][2]/256.))}
@ -25,18 +25,66 @@ def change_plotting_library(lib):
            current_lib[0] = PlotlyPlots()
        if lib == 'none':
            current_lib[0] = None
        inject_plotting()
        #===========================================================================
    except (ImportError, NameError):
        config.set('plotting', 'library', 'none')
        raise
        import warnings
        warnings.warn(ImportWarning("You specified {} in your configuration, but it is not available. Install the newest version of {} for plotting".format(lib, lib)))

from ..util.config import config, NoOptionError
try:
    lib = config.get('plotting', 'library')
    change_plotting_library(lib)
except NoOptionError:
    print("No plotting library was specified in the config file. \n{}".format(error_suggestion))
def inject_plotting():
    if current_lib[0] is not None:
        # Inject the plots into classes here:

        # Already converted to new style:
        from . import gpy_plot

        from ..core import GP
        GP.plot_data = gpy_plot.data_plots.plot_data
        GP.plot_data_error = gpy_plot.data_plots.plot_data_error
        GP.plot_errorbars_trainset = gpy_plot.data_plots.plot_errorbars_trainset
        GP.plot_mean = gpy_plot.gp_plots.plot_mean
        GP.plot_confidence = gpy_plot.gp_plots.plot_confidence
        GP.plot_density = gpy_plot.gp_plots.plot_density
        GP.plot_samples = gpy_plot.gp_plots.plot_samples
        GP.plot = gpy_plot.gp_plots.plot
        GP.plot_f = gpy_plot.gp_plots.plot_f
        GP.plot_magnification = gpy_plot.latent_plots.plot_magnification

        from ..core import SparseGP
        SparseGP.plot_inducing = gpy_plot.data_plots.plot_inducing

        from ..models import GPLVM, BayesianGPLVM, bayesian_gplvm_minibatch, SSGPLVM, SSMRD
        GPLVM.plot_latent = gpy_plot.latent_plots.plot_latent
        GPLVM.plot_scatter = gpy_plot.latent_plots.plot_latent_scatter
        GPLVM.plot_inducing = gpy_plot.latent_plots.plot_latent_inducing
        GPLVM.plot_steepest_gradient_map = gpy_plot.latent_plots.plot_steepest_gradient_map
        BayesianGPLVM.plot_latent = gpy_plot.latent_plots.plot_latent
        BayesianGPLVM.plot_scatter = gpy_plot.latent_plots.plot_latent_scatter
        BayesianGPLVM.plot_inducing = gpy_plot.latent_plots.plot_latent_inducing
        BayesianGPLVM.plot_steepest_gradient_map = gpy_plot.latent_plots.plot_steepest_gradient_map
        bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch.plot_latent = gpy_plot.latent_plots.plot_latent
        bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch.plot_scatter = gpy_plot.latent_plots.plot_latent_scatter
        bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch.plot_inducing = gpy_plot.latent_plots.plot_latent_inducing
        bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch.plot_steepest_gradient_map = gpy_plot.latent_plots.plot_steepest_gradient_map
        SSGPLVM.plot_latent = gpy_plot.latent_plots.plot_latent
        SSGPLVM.plot_scatter = gpy_plot.latent_plots.plot_latent_scatter
        SSGPLVM.plot_inducing = gpy_plot.latent_plots.plot_latent_inducing
        SSGPLVM.plot_steepest_gradient_map = gpy_plot.latent_plots.plot_steepest_gradient_map

        from ..kern import Kern
        Kern.plot_covariance = gpy_plot.kernel_plots.plot_covariance
        def deprecate_plot(self, *args, **kwargs):
            import warnings
            warnings.warn(DeprecationWarning('Kern.plot is being deprecated and will not be available in the 1.0 release. Use Kern.plot_covariance instead'))
            return self.plot_covariance(*args, **kwargs)
        Kern.plot = deprecate_plot
        Kern.plot_ARD = gpy_plot.kernel_plots.plot_ARD

        from ..inference.optimization import Optimizer
        Optimizer.plot = gpy_plot.inference_plots.plot_optimizer
        # Variational plot!

def plotting_library():
    if current_lib[0] is None:
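The inject_plotting() refactor above defers all monkey-patching until a backend actually loads, so importing the package no longer fails when no plotting library is installed. A stripped-down sketch of the pattern, using hypothetical names rather than GPy's real classes:

    current_lib = [None]   # one-element list acts as a mutable module-level slot

    class Model(object):
        pass

    def _plot(self):
        return "rendered with {}".format(current_lib[0])

    def inject():
        # attach plotting methods only once a backend is known to work
        if current_lib[0] is not None:
            Model.plot = _plot

    current_lib[0] = "matplotlib"
    inject()
    print(Model().plot())   # -> rendered with matplotlib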
@ -53,54 +101,10 @@ def show(figure, **kwargs):
    """
    return plotting_library().show_canvas(figure, **kwargs)

if config.get('plotting', 'library') is not 'none':
    # Inject the plots into classes here:

    # Already converted to new style:
    from . import gpy_plot

    from ..core import GP
    GP.plot_data = gpy_plot.data_plots.plot_data
    GP.plot_data_error = gpy_plot.data_plots.plot_data_error
    GP.plot_errorbars_trainset = gpy_plot.data_plots.plot_errorbars_trainset
    GP.plot_mean = gpy_plot.gp_plots.plot_mean
    GP.plot_confidence = gpy_plot.gp_plots.plot_confidence
    GP.plot_density = gpy_plot.gp_plots.plot_density
    GP.plot_samples = gpy_plot.gp_plots.plot_samples
    GP.plot = gpy_plot.gp_plots.plot
    GP.plot_f = gpy_plot.gp_plots.plot_f
    GP.plot_magnification = gpy_plot.latent_plots.plot_magnification

    from ..core import SparseGP
    SparseGP.plot_inducing = gpy_plot.data_plots.plot_inducing

    from ..models import GPLVM, BayesianGPLVM, bayesian_gplvm_minibatch, SSGPLVM, SSMRD
    GPLVM.plot_latent = gpy_plot.latent_plots.plot_latent
    GPLVM.plot_scatter = gpy_plot.latent_plots.plot_latent_scatter
    GPLVM.plot_inducing = gpy_plot.latent_plots.plot_latent_inducing
    GPLVM.plot_steepest_gradient_map = gpy_plot.latent_plots.plot_steepest_gradient_map
    BayesianGPLVM.plot_latent = gpy_plot.latent_plots.plot_latent
    BayesianGPLVM.plot_scatter = gpy_plot.latent_plots.plot_latent_scatter
    BayesianGPLVM.plot_inducing = gpy_plot.latent_plots.plot_latent_inducing
    BayesianGPLVM.plot_steepest_gradient_map = gpy_plot.latent_plots.plot_steepest_gradient_map
    bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch.plot_latent = gpy_plot.latent_plots.plot_latent
    bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch.plot_scatter = gpy_plot.latent_plots.plot_latent_scatter
    bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch.plot_inducing = gpy_plot.latent_plots.plot_latent_inducing
    bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch.plot_steepest_gradient_map = gpy_plot.latent_plots.plot_steepest_gradient_map
    SSGPLVM.plot_latent = gpy_plot.latent_plots.plot_latent
    SSGPLVM.plot_scatter = gpy_plot.latent_plots.plot_latent_scatter
    SSGPLVM.plot_inducing = gpy_plot.latent_plots.plot_latent_inducing
    SSGPLVM.plot_steepest_gradient_map = gpy_plot.latent_plots.plot_steepest_gradient_map

    from ..kern import Kern
    Kern.plot_covariance = gpy_plot.kernel_plots.plot_covariance
    def deprecate_plot(self, *args, **kwargs):
        import warnings
        warnings.warn(DeprecationWarning('Kern.plot is being deprecated and will not be available in the 1.0 release. Use Kern.plot_covariance instead'))
        return self.plot_covariance(*args, **kwargs)
    Kern.plot = deprecate_plot
    Kern.plot_ARD = gpy_plot.kernel_plots.plot_ARD

    from ..inference.optimization import Optimizer
    Optimizer.plot = gpy_plot.inference_plots.plot_optimizer
    # Variational plot!
from ..util.config import config, NoOptionError
try:
    lib = config.get('plotting', 'library')
    change_plotting_library(lib)
except NoOptionError:
    print("No plotting library was specified in the config file. \n{}".format(error_suggestion))
@ -91,7 +91,7 @@ def _plot_mean(self, canvas, helper_data, helper_prediction,
    if projection == '2d':
        update_not_existing_kwargs(kwargs, pl().defaults.meanplot_2d)  # @UndefinedVariable
        plots = dict(gpmean=[pl().contour(canvas, x[:,0], y[0,:],
                                          mu.reshape(resolution, resolution),
                                          mu.reshape(resolution, resolution).T,
                                          levels=levels, label=label, **kwargs)])
    elif projection == '3d':
        update_not_existing_kwargs(kwargs, pl().defaults.meanplot_3d)  # @UndefinedVariable
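The added .T matters because np.mgrid-style grids lay values out with the first axis indexing x, while contour expects rows to index y. A self-contained illustration of the reshape/transpose step:

    import numpy as np

    resolution = 4
    xx, yy = np.mgrid[0:1:1j*resolution, 0:2:1j*resolution]
    mu = (xx + 10*yy).flatten()                  # order produced by the prediction grid
    Z = mu.reshape(resolution, resolution).T     # rows now index y, columns index x
    print(Z.shape)                               # (4, 4); Z[i, j] belongs to (x_j, y_i)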
@ -420,4 +420,4 @@ def _plot(self, canvas, plots, helper_data, helper_prediction, levels, plot_indu
    if helper_prediction[2] is not None:
        plots.update(_plot_samples(self, canvas, helper_data, helper_prediction, projection, "Samples"))
    return plots
@ -33,7 +33,7 @@ from .. import Tango
from .plot_util import update_not_existing_kwargs, helper_for_plot_data
from ...kern.src.kern import Kern, CombinationKernel

def plot_ARD(kernel, filtering=None, legend=False, **kwargs):
def plot_ARD(kernel, filtering=None, legend=False, canvas=None, **kwargs):
    """
    If an ARD kernel is present, plot a bar representation using matplotlib
@ -62,7 +62,11 @@ def plot_ARD(kernel, filtering=None, legend=False, **kwargs):
    bars = []
    kwargs = update_not_existing_kwargs(kwargs, pl().defaults.ard)
    canvas, kwargs = pl().new_canvas(xlim=(-.5, kernel._effective_input_dim-.5), xlabel='input dimension', ylabel='sensitivity', **kwargs)

    if canvas is None:
        canvas, kwargs = pl().new_canvas(xlim=(-.5, kernel._effective_input_dim-.5), xlabel='input dimension', ylabel='sensitivity', **kwargs)

    for i in range(ard_params.shape[0]):
        if parts[i].name in filtering:
            c = Tango.nextMedium()
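With the new canvas parameter, a caller can hand plot_ARD a pre-made canvas (which is what MRD.plot_scales now does via the figure/col keywords forwarded to new_canvas) instead of letting it open its own figure. A usage sketch assuming the matplotlib backend; the model and data are illustrative:

    import numpy as np
    import GPy

    X = np.random.randn(50, 3)
    Y = np.sin(X[:, :1]) + 0.1*np.random.randn(50, 1)   # only input dimension 0 matters
    m = GPy.models.GPRegression(X, Y, GPy.kern.RBF(3, ARD=True))
    m.optimize()

    from GPy.plotting import plotting_library as pl
    fig = pl().figure(rows=1, cols=2)
    m.kern.plot_ARD(figure=fig, col=1)                       # kwargs flow into new_canvas
    m.kern.plot_ARD(figure=fig, col=2, title='same kernel')  # second panel on the same figure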
@ -96,7 +100,7 @@ def plot_covariance(kernel, x=None, label=None,
    """
    X = np.ones((2, kernel._effective_input_dim)) * [[-3], [3]]
    _, free_dims, Xgrid, xx, yy, _, _, resolution = helper_for_plot_data(kernel, X, plot_limits, visible_dims, None, resolution)

    from numbers import Number
    if x is None:
        from ...kern.src.stationary import Stationary
@ -104,7 +108,7 @@ def plot_covariance(kernel, x=None, label=None,
    elif isinstance(x, Number):
        x = np.ones((1, kernel._effective_input_dim))*x
    K = kernel.K(Xgrid, x)

    if projection == '3d':
        xlabel = 'X[:,0]'
        ylabel = 'X[:,1]'
@ -136,4 +140,4 @@ def plot_covariance(kernel, x=None, label=None,
        return pl().add_to_canvas(canvas, plots)

    else:
        raise NotImplementedError("Cannot plot a kernel with more than two input dimensions")
@ -147,6 +147,7 @@ def _plot_magnification(self, canvas, which_indices, Xgrid,
    def plot_function(x):
        Xtest_full = np.zeros((x.shape[0], Xgrid.shape[1]))
        Xtest_full[:, which_indices] = x

        mf = self.predict_magnification(Xtest_full, kern=kern, mean=mean, covariance=covariance)
        return mf.reshape(resolution, resolution).T
    imshow_kwargs = update_not_existing_kwargs(imshow_kwargs, pl().defaults.magnification)
@ -163,7 +164,8 @@ def plot_magnification(self, labels=None, which_indices=None,
                       updates=False,
                       mean=True, covariance=True,
                       kern=None, num_samples=1000,
                       scatter_kwargs=None, **imshow_kwargs):
                       scatter_kwargs=None, plot_scatter=True,
                       **imshow_kwargs):
    """
    Plot the magnification factor of the GP on the inputs. This is the
    density of the GP as a gray scale.
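The new plot_scatter switch makes the training-point scatter (and with it the legend) optional. A sketch assuming a fitted latent-variable model; the toy data and sizes are illustrative:

    import numpy as np
    import GPy

    Y = np.random.randn(60, 5).dot(np.random.randn(5, 8))   # low-rank toy data
    m = GPy.models.BayesianGPLVM(Y, input_dim=2, num_inducing=15)
    m.optimize(max_iters=100)

    m.plot_magnification()                    # surface plus training scatter (default)
    m.plot_magnification(plot_scatter=False)  # surface only, no scatter, no legend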
@ -188,17 +190,20 @@ def plot_magnification(self, labels=None, which_indices=None,
    _, _, Xgrid, _, _, xmin, xmax, resolution = helper_for_plot_data(self, X, plot_limits, which_indices, None, resolution)
    canvas, imshow_kwargs = pl().new_canvas(xlim=(xmin[0], xmax[0]), ylim=(xmin[1], xmax[1]),
                                            xlabel='latent dimension %i' % input_1, ylabel='latent dimension %i' % input_2, **imshow_kwargs)
    if (labels is not None):
        legend = find_best_layout_for_subplots(len(np.unique(labels)))[1]
    else:
        labels = np.ones(self.num_data)
        legend = False
    scatters = _plot_latent_scatter(canvas, X, which_indices, labels, marker, num_samples, projection='2d', **scatter_kwargs or {})
    view = _plot_magnification(self, canvas, which_indices, Xgrid, xmin, xmax, resolution, updates, mean, covariance, kern, **imshow_kwargs)
    retval = pl().add_to_canvas(canvas, dict(scatter=scatters, imshow=view),
    plots = {}
    if legend and plot_scatter:
        if (labels is not None):
            legend = find_best_layout_for_subplots(len(np.unique(labels)))[1]
        else:
            labels = np.ones(self.num_data)
            legend = False
    if plot_scatter:
        plots['scatters'] = _plot_latent_scatter(canvas, X, which_indices, labels, marker, num_samples, projection='2d', **scatter_kwargs or {})
    plots['view'] = _plot_magnification(self, canvas, which_indices, Xgrid, xmin, xmax, resolution, updates, mean, covariance, kern, **imshow_kwargs)
    retval = pl().add_to_canvas(canvas, plots,
                                legend=legend,
                                )
    _wait_for_updates(view, updates)
    _wait_for_updates(plots['view'], updates)
    return retval
@ -211,7 +216,12 @@ def _plot_latent(self, canvas, which_indices, Xgrid,
    def plot_function(x):
        Xtest_full = np.zeros((x.shape[0], Xgrid.shape[1]))
        Xtest_full[:, which_indices] = x
        mf = np.log(self.predict(Xtest_full, kern=kern)[1])
        mf = self.predict(Xtest_full, kern=kern)[1]
        if mf.shape[1] == self.output_dim:
            mf = mf.sum(-1)
        else:
            mf *= self.output_dim
        mf = np.log(mf)
        return mf.reshape(resolution, resolution).T

    imshow_kwargs = update_not_existing_kwargs(imshow_kwargs, pl().defaults.latent)
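The rewritten branch above makes the plotted quantity the log of the total predictive variance over outputs, whether predict() returns one variance column per output or a single shared column. The arithmetic in isolation (illustrative arrays only):

    import numpy as np

    output_dim = 3
    var_per_output = np.abs(np.random.randn(5, output_dim))   # shape (N, output_dim)

    # per-output case: sum over outputs, then log
    total1 = np.log(var_per_output.sum(-1))

    # shared-column case: one column standing in for all outputs is scaled up
    var_shared = var_per_output.mean(-1, keepdims=True)       # shape (N, 1)
    total2 = np.log(var_shared * output_dim).flatten()

    print(np.allclose(total1, total2))   # True for this construction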
@ -254,11 +264,12 @@ def plot_latent(self, labels=None, which_indices=None,
    _, _, Xgrid, _, _, xmin, xmax, resolution = helper_for_plot_data(self, X, plot_limits, which_indices, None, resolution)
    canvas, imshow_kwargs = pl().new_canvas(xlim=(xmin[0], xmax[0]), ylim=(xmin[1], xmax[1]),
                                            xlabel='latent dimension %i' % input_1, ylabel='latent dimension %i' % input_2, **imshow_kwargs)
    if (labels is not None):
        legend = find_best_layout_for_subplots(len(np.unique(labels)))[1]
    else:
        labels = np.ones(self.num_data)
        legend = False
    if legend:
        if (labels is not None):
            legend = find_best_layout_for_subplots(len(np.unique(labels)))[1]
        else:
            labels = np.ones(self.num_data)
            legend = False
    scatters = _plot_latent_scatter(canvas, X, which_indices, labels, marker, num_samples, projection='2d', **scatter_kwargs or {})
    view = _plot_latent(self, canvas, which_indices, Xgrid, xmin, xmax, resolution, updates, kern, **imshow_kwargs)
    retval = pl().add_to_canvas(canvas, dict(scatter=scatters, imshow=view), legend=legend)
@ -380,4 +380,4 @@ def x_frame2D(X,plot_limits=None,resolution=None):
    resolution = resolution or 50
    xx, yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution, xmin[1]:xmax[1]:1j*resolution]
    Xnew = np.vstack((xx.flatten(), yy.flatten())).T
    return Xnew, xx, yy, xmin, xmax
@ -18,4 +18,4 @@
from .util import align_subplot_array, align_subplots, fewerXticks, removeRightTicks, removeUpperTicks
from . import controllers
from . import controllers, base_plots

265
GPy/plotting/matplot_dep/base_plots.py
Normal file
@ -0,0 +1,265 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from matplotlib import pyplot as plt
import numpy as np
from functools import reduce  # a builtin on Python 2; needed explicitly on Python 3

def ax_default(fignum, ax):
    if ax is None:
        fig = plt.figure(fignum)
        ax = fig.add_subplot(111)
    else:
        fig = ax.figure
    return fig, ax

def meanplot(x, mu, color='#3300FF', ax=None, fignum=None, linewidth=2, **kw):
    _, axes = ax_default(fignum, ax)
    return axes.plot(x, mu, color=color, linewidth=linewidth, **kw)

def gpplot(x, mu, lower, upper, edgecol='#3300FF', fillcol='#33CCFF', ax=None, fignum=None, **kwargs):
    _, axes = ax_default(fignum, ax)

    mu = mu.flatten()
    x = x.flatten()
    lower = lower.flatten()
    upper = upper.flatten()

    plots = []

    # here's the mean
    plots.append(meanplot(x, mu, edgecol, axes))

    # here's the box
    kwargs['linewidth'] = 0.5
    if not 'alpha' in kwargs.keys():
        kwargs['alpha'] = 0.3
    plots.append(axes.fill(np.hstack((x, x[::-1])), np.hstack((upper, lower[::-1])), color=fillcol, **kwargs))

    # this is the edge:
    plots.append(meanplot(x, upper, color=edgecol, linewidth=0.2, ax=axes))
    plots.append(meanplot(x, lower, color=edgecol, linewidth=0.2, ax=axes))

    return plots

def gradient_fill(x, percentiles, ax=None, fignum=None, **kwargs):
    _, ax = ax_default(fignum, ax)

    plots = []

    # here's the box
    if 'linewidth' not in kwargs:
        kwargs['linewidth'] = 0.5
    if not 'alpha' in kwargs.keys():
        kwargs['alpha'] = 1./(len(percentiles))

    # pop where from kwargs
    where = kwargs.pop('where') if 'where' in kwargs else None
    # pop interpolate, which we actually do not do here!
    if 'interpolate' in kwargs: kwargs.pop('interpolate')

    def pairwise(inlist):
        l = len(inlist)
        for i in range(int(np.ceil(l/2.))):
            yield inlist[:][i], inlist[:][(l-1)-i]

    polycol = []
    for y1, y2 in pairwise(percentiles):
        import matplotlib.mlab as mlab
        # Handle united data, such as dates
        ax._process_unit_info(xdata=x, ydata=y1)
        ax._process_unit_info(ydata=y2)

        # Convert the arrays so we can work with them
        from numpy import ma
        x = ma.masked_invalid(ax.convert_xunits(x))
        y1 = ma.masked_invalid(ax.convert_yunits(y1))
        y2 = ma.masked_invalid(ax.convert_yunits(y2))

        if y1.ndim == 0:
            y1 = np.ones_like(x) * y1
        if y2.ndim == 0:
            y2 = np.ones_like(x) * y2

        if where is None:
            where = np.ones(len(x), np.bool)
        else:
            where = np.asarray(where, np.bool)

        if not (x.shape == y1.shape == y2.shape == where.shape):
            raise ValueError("Argument dimensions are incompatible")

        mask = reduce(ma.mask_or, [ma.getmask(a) for a in (x, y1, y2)])
        if mask is not ma.nomask:
            where &= ~mask

        polys = []
        for ind0, ind1 in mlab.contiguous_regions(where):
            xslice = x[ind0:ind1]
            y1slice = y1[ind0:ind1]
            y2slice = y2[ind0:ind1]

            if not len(xslice):
                continue

            N = len(xslice)
            X = np.zeros((2 * N + 2, 2), np.float)

            # the purpose of the next two lines is for when y2 is a
            # scalar like 0 and we want the fill to go all the way
            # down to 0 even if none of the y1 sample points do
            start = xslice[0], y2slice[0]
            end = xslice[-1], y2slice[-1]

            X[0] = start
            X[N + 1] = end

            X[1:N + 1, 0] = xslice
            X[1:N + 1, 1] = y1slice
            X[N + 2:, 0] = xslice[::-1]
            X[N + 2:, 1] = y2slice[::-1]

            polys.append(X)
        polycol.extend(polys)
    from matplotlib.collections import PolyCollection
    plots.append(PolyCollection(polycol, **kwargs))
    ax.add_collection(plots[-1], autolim=True)
    ax.autoscale_view()
    return plots

def gperrors(x, mu, lower, upper, edgecol=None, ax=None, fignum=None, **kwargs):
    _, axes = ax_default(fignum, ax)

    mu = mu.flatten()
    x = x.flatten()
    lower = lower.flatten()
    upper = upper.flatten()

    plots = []

    if edgecol is None:
        edgecol = '#3300FF'

    if not 'alpha' in kwargs.keys():
        kwargs['alpha'] = 1.

    if not 'lw' in kwargs.keys():
        kwargs['lw'] = 1.

    plots.append(axes.errorbar(x, mu, yerr=np.vstack([mu-lower, upper-mu]), color=edgecol, **kwargs))
    plots[-1][0].remove()
    return plots


def removeRightTicks(ax=None):
    ax = ax or plt.gca()
    for i, line in enumerate(ax.get_yticklines()):
        if i%2 == 1:   # odd indices
            line.set_visible(False)

def removeUpperTicks(ax=None):
    ax = ax or plt.gca()
    for i, line in enumerate(ax.get_xticklines()):
        if i%2 == 1:   # odd indices
            line.set_visible(False)

def fewerXticks(ax=None, divideby=2):
    ax = ax or plt.gca()
    ax.set_xticks(ax.get_xticks()[::divideby])

def align_subplots(N, M, xlim=None, ylim=None):
    """make all of the subplots have the same limits, turn off unnecessary ticks"""
    # find sensible xlim,ylim
    if xlim is None:
        xlim = [np.inf, -np.inf]
        for i in range(N*M):
            plt.subplot(N, M, i+1)
            xlim[0] = min(xlim[0], plt.xlim()[0])
            xlim[1] = max(xlim[1], plt.xlim()[1])
    if ylim is None:
        ylim = [np.inf, -np.inf]
        for i in range(N*M):
            plt.subplot(N, M, i+1)
            ylim[0] = min(ylim[0], plt.ylim()[0])
            ylim[1] = max(ylim[1], plt.ylim()[1])

    for i in range(N*M):
        plt.subplot(N, M, i+1)
        plt.xlim(xlim)
        plt.ylim(ylim)
        if (i)%M:
            plt.yticks([])
        else:
            removeRightTicks()
        if i < (M*(N-1)):
            plt.xticks([])
        else:
            removeUpperTicks()

def align_subplot_array(axes, xlim=None, ylim=None):
    """
    Make all of the axes in the array have the same limits, turn off unnecessary ticks
    use plt.subplots() to get an array of axes
    """
    # find sensible xlim,ylim
    if xlim is None:
        xlim = [np.inf, -np.inf]
        for ax in axes.flatten():
            xlim[0] = min(xlim[0], ax.get_xlim()[0])
            xlim[1] = max(xlim[1], ax.get_xlim()[1])
    if ylim is None:
        ylim = [np.inf, -np.inf]
        for ax in axes.flatten():
            ylim[0] = min(ylim[0], ax.get_ylim()[0])
            ylim[1] = max(ylim[1], ax.get_ylim()[1])

    N, M = axes.shape
    for i, ax in enumerate(axes.flatten()):
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
        if (i)%M:
            ax.set_yticks([])
        else:
            removeRightTicks(ax)
        if i < (M*(N-1)):
            ax.set_xticks([])
        else:
            removeUpperTicks(ax)

def x_frame1D(X, plot_limits=None, resolution=None):
    """
    Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits
    """
    assert X.shape[1] == 1, "x_frame1D is defined for one-dimensional inputs"
    if plot_limits is None:
        from ...core.parameterization.variational import VariationalPosterior
        if isinstance(X, VariationalPosterior):
            xmin, xmax = X.mean.min(0), X.mean.max(0)
        else:
            xmin, xmax = X.min(0), X.max(0)
        xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
    elif len(plot_limits) == 2:
        xmin, xmax = plot_limits
    else:
        raise ValueError("Bad limits for plotting")

    Xnew = np.linspace(xmin, xmax, resolution or 200)[:, None]
    return Xnew, xmin, xmax

def x_frame2D(X, plot_limits=None, resolution=None):
    """
    Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits
    """
    assert X.shape[1] == 2, "x_frame2D is defined for two-dimensional inputs"
    if plot_limits is None:
        xmin, xmax = X.min(0), X.max(0)
        xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
    elif len(plot_limits) == 2:
        xmin, xmax = plot_limits
    else:
        raise ValueError("Bad limits for plotting")

    resolution = resolution or 50
    xx, yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution, xmin[1]:xmax[1]:1j*resolution]
    Xnew = np.vstack((xx.flatten(), yy.flatten())).T
    return Xnew, xx, yy, xmin, xmax
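A minimal sketch exercising the new helpers directly (illustrative data; gpplot draws a mean line with a shaded credible band over a 1-D grid, and x_frame1D builds a padded plotting grid from the training inputs):

    import numpy as np
    from GPy.plotting.matplot_dep.base_plots import gpplot, x_frame1D

    X = np.random.rand(20, 1)
    Xnew, xmin, xmax = x_frame1D(X, resolution=200)   # padded grid over the data range
    mu = np.sin(2*np.pi*Xnew).flatten()
    sd = 0.1 + 0.05*Xnew.flatten()
    gpplot(Xnew, mu, mu - 2*sd, mu + 2*sd)            # mean line, band fill, and band edges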
@ -1 +1 @@
from .imshow_controller import ImshowController, ImAnnotateController
@ -72,4 +72,4 @@ class ImAnnotateController(ImshowController):
                text.set_x(x+xoffset)
                text.set_y(y+yoffset)
                text.set_text("{}".format(X[1][j, i]))
        return view
@ -72,4 +72,4 @@ latent = dict(aspect='auto', cmap='Greys', interpolation='bicubic')
gradient = dict(aspect='auto', cmap='RdBu', interpolation='nearest', alpha=.7)
magnification = dict(aspect='auto', cmap='Greys', interpolation='bicubic')
latent_scatter = dict(s=40, linewidth=.2, edgecolor='k', alpha=.9)
annotation = dict(fontdict=dict(family='sans-serif', weight='light', fontsize=9), zorder=.3, alpha=.7)
@ -42,10 +42,11 @@ class MatplotlibPlots(AbstractPlottingLibrary):
        super(MatplotlibPlots, self).__init__()
        self._defaults = defaults.__dict__

    def figure(self, rows=1, cols=1, **kwargs):
        fig = plt.figure(**kwargs)
    def figure(self, rows=1, cols=1, gridspec_kwargs={}, tight_layout=True, **kwargs):
        fig = plt.figure(tight_layout=tight_layout, **kwargs)
        fig.rows = rows
        fig.cols = cols
        fig.gridspec = plt.GridSpec(rows, cols, **gridspec_kwargs)
        return fig

    def new_canvas(self, figure=None, row=1, col=1, projection='2d', xlabel=None, ylabel=None, zlabel=None, title=None, xlim=None, ylim=None, zlim=None, **kwargs):
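The figure() rewrite allocates a GridSpec once, so new_canvas can later address subplots by (row, col). The underlying matplotlib mechanics, in isolation:

    import matplotlib.pyplot as plt

    fig = plt.figure(tight_layout=True)
    gs = plt.GridSpec(1, 2)                  # 1 row, 2 columns, decided up front
    ax_left = fig.add_subplot(gs[0, 0])      # GridSpec is indexed 0-based ...
    ax_right = fig.add_subplot(gs[0, 1])     # ... hence the row-1/col-1 in new_canvas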
@ -56,7 +57,9 @@ class MatplotlibPlots(AbstractPlottingLibrary):
        if 'ax' in kwargs:
            ax = kwargs.pop('ax')
        else:
            if 'num' in kwargs and 'figsize' in kwargs:
            if figure is not None:
                fig = figure
            elif 'num' in kwargs and 'figsize' in kwargs:
                fig = self.figure(num=kwargs.pop('num'), figsize=kwargs.pop('figsize'))
            elif 'num' in kwargs:
                fig = self.figure(num=kwargs.pop('num'))
@ -66,7 +69,7 @@ class MatplotlibPlots(AbstractPlottingLibrary):
                fig = self.figure()

            #if hasattr(fig, 'rows') and hasattr(fig, 'cols'):
            ax = fig.add_subplot(fig.rows, fig.cols, (col,row), projection=projection)
            ax = fig.add_subplot(fig.gridspec[row-1, col-1], projection=projection)

        if xlim is not None: ax.set_xlim(xlim)
        if ylim is not None: ax.set_ylim(ylim)
@ -79,7 +82,7 @@ class MatplotlibPlots(AbstractPlottingLibrary):
        return ax, kwargs

    def add_to_canvas(self, ax, plots, legend=False, title=None, **kwargs):
        ax.autoscale_view()
        #ax.autoscale_view()
        fontdict = dict(family='sans-serif', weight='light', size=9)
        if legend is True:
            ax.legend(*ax.get_legend_handles_labels())
@ -89,9 +92,7 @@ class MatplotlibPlots(AbstractPlottingLibrary):
        if title is not None: ax.figure.suptitle(title)
        return ax

    def show_canvas(self, ax, tight_layout=False, **kwargs):
        if tight_layout:
            ax.figure.tight_layout()
    def show_canvas(self, ax):
        ax.figure.canvas.draw()
        return ax.figure
@ -13,16 +13,16 @@ class SSGPLVM_plot(object):
        self.model = model
        self.imgsize = imgsize
        assert model.Y.shape[1] == imgsize[0]*imgsize[1]

    def plot_inducing(self):
        fig1 = pylab.figure()
        mean = self.model.posterior.mean
        arr = mean.reshape(*(mean.shape[0], self.imgsize[1], self.imgsize[0]))
        plot_2D_images(fig1, arr)
        fig1.gca().set_title('The mean of inducing points')

        fig2 = pylab.figure()
        covar = self.model.posterior.covariance
        plot_2D_images(fig2, covar)
        fig2.gca().set_title('The variance of inducing points')
@ -116,4 +116,4 @@ def align_subplot_array(axes,xlim=None, ylim=None):
        if i < (M*(N-1)):
            ax.set_xticks([])
        else:
            removeUpperTicks(ax)
@ -73,4 +73,4 @@ latent = dict(colorscale='Greys', reversescale=True, zsmooth='best')
gradient = dict(colorscale='RdBu', opacity=.7)
magnification = dict(colorscale='Greys', zsmooth='best', reversescale=True)
latent_scatter = dict(marker_kwargs=dict(size='5', opacity=.7))
# annotation = dict(fontdict=dict(family='sans-serif', weight='light', fontsize=9), zorder=.3, alpha=.7)
@ -254,7 +254,7 @@ class PlotlyPlots(AbstractPlottingLibrary):
                                  font=dict(color='white' if np.abs(var) > 0.8 else 'black', size=10),
                                  opacity=.5,
                                  showarrow=False,
                                  hoverinfo='x'))
                                  ))
        return imshow, annotations

    def annotation_heatmap_interact(self, ax, plot_function, extent, label=None, resolution=15, imshow_kwargs=None, **annotation_kwargs):
@ -1,14 +1,9 @@
# Copyright (c) 2014, Max Zwiessele
# Copyright (c) 2014, Max Zwiessele, GPy Authors
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""

MaxZ

"""
import unittest
import sys

def deepTest(reason):
    if reason:
        return lambda x: x
    return unittest.skip("Not deep scanning, enable deepscan by adding 'deep' argument")
    return unittest.skip("Not deep scanning, enable deepscan by adding 'deep' argument to unittest call")
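Usage sketch for deepTest: passing a truthy reason keeps the test, otherwise the whole case is skipped. The import path below is an assumption for illustration; the diff does not show which module the decorator lives in:

    import sys
    import unittest
    from GPy.testing import deepTest   # assumed location of the decorator above

    @deepTest('deep' in sys.argv)
    class DeepChecks(unittest.TestCase):
        def test_exhaustive_gradients(self):
            self.assertTrue(True)      # stands in for an expensive check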
[38 binary image files changed in this merge; the diff viewer recorded only their before/after sizes, with identical dimensions.]