mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-05-11 21:12:38 +02:00
Merge branch 'devel' of github.com:SheffieldML/GPy into devel
This commit is contained in:
commit
304dbfad7f
98 changed files with 4824 additions and 9100 deletions
|
|
@ -5,6 +5,7 @@ warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||||
|
|
||||||
import core
|
import core
|
||||||
import models
|
import models
|
||||||
|
import mappings
|
||||||
import inference
|
import inference
|
||||||
import util
|
import util
|
||||||
import examples
|
import examples
|
||||||
|
|
|
||||||
|
|
@ -2,9 +2,10 @@
|
||||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
from model import *
|
from model import *
|
||||||
from parameterised import *
|
from parameterized import *
|
||||||
import priors
|
import priors
|
||||||
from gp import GP
|
from gp import GP
|
||||||
from sparse_gp import SparseGP
|
from sparse_gp import SparseGP
|
||||||
from fitc import FITC
|
from fitc import FITC
|
||||||
from svigp import SVIGP
|
from svigp import SVIGP
|
||||||
|
from mapping import *
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,22 @@
|
||||||
Created on 4 Jun 2013
|
Created on 4 Jun 2013
|
||||||
|
|
||||||
@author: maxz
|
@author: maxz
|
||||||
|
|
||||||
|
(Hyper-)Parameter domains defined for :py:mod:`~GPy.core.priors` and :py:mod:`~GPy.kern`.
|
||||||
|
These domains specify the legitimate realm of the parameters to live in.
|
||||||
|
|
||||||
|
:const:`~GPy.core.domains.REAL` :
|
||||||
|
real domain, all values in the real numbers are allowed
|
||||||
|
|
||||||
|
:const:`~GPy.core.domains.POSITIVE`:
|
||||||
|
positive domain, only positive real values are allowed
|
||||||
|
|
||||||
|
:const:`~GPy.core.domains.NEGATIVE`:
|
||||||
|
same as :const:`~GPy.core.domains.POSITIVE`, but only negative values are allowed
|
||||||
|
|
||||||
|
:const:`~GPy.core.domains.BOUNDED`:
|
||||||
|
only values within the bounded range are allowed,
|
||||||
|
the bounds are specified withing the object with the bounded range
|
||||||
'''
|
'''
|
||||||
|
|
||||||
REAL = 'real'
|
REAL = 'real'
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,6 @@ import numpy as np
|
||||||
import pylab as pb
|
import pylab as pb
|
||||||
from .. import kern
|
from .. import kern
|
||||||
from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs
|
from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs
|
||||||
#from ..util.plot import gpplot, Tango
|
|
||||||
from ..likelihoods import EP
|
from ..likelihoods import EP
|
||||||
from gp_base import GPBase
|
from gp_base import GPBase
|
||||||
|
|
||||||
|
|
@ -31,6 +30,13 @@ class GP(GPBase):
|
||||||
GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
|
GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
|
||||||
self._set_params(self._get_params())
|
self._set_params(self._get_params())
|
||||||
|
|
||||||
|
def getstate(self):
|
||||||
|
return GPBase.getstate(self)
|
||||||
|
|
||||||
|
def setstate(self, state):
|
||||||
|
GPBase.setstate(self, state)
|
||||||
|
self._set_params(self._get_params())
|
||||||
|
|
||||||
def _set_params(self, p):
|
def _set_params(self, p):
|
||||||
self.kern._set_params_transformed(p[:self.kern.num_params_transformed()])
|
self.kern._set_params_transformed(p[:self.kern.num_params_transformed()])
|
||||||
self.likelihood._set_params(p[self.kern.num_params_transformed():])
|
self.likelihood._set_params(p[self.kern.num_params_transformed():])
|
||||||
|
|
@ -42,12 +48,12 @@ class GP(GPBase):
|
||||||
|
|
||||||
# the gradient of the likelihood wrt the covariance matrix
|
# the gradient of the likelihood wrt the covariance matrix
|
||||||
if self.likelihood.YYT is None:
|
if self.likelihood.YYT is None:
|
||||||
#alpha = np.dot(self.Ki, self.likelihood.Y)
|
# alpha = np.dot(self.Ki, self.likelihood.Y)
|
||||||
alpha,_ = dpotrs(self.L, self.likelihood.Y,lower=1)
|
alpha, _ = dpotrs(self.L, self.likelihood.Y, lower=1)
|
||||||
|
|
||||||
self.dL_dK = 0.5 * (tdot(alpha) - self.output_dim * self.Ki)
|
self.dL_dK = 0.5 * (tdot(alpha) - self.output_dim * self.Ki)
|
||||||
else:
|
else:
|
||||||
#tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki)
|
# tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki)
|
||||||
tmp, _ = dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1)
|
tmp, _ = dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1)
|
||||||
tmp, _ = dpotrs(self.L, np.asfortranarray(tmp.T), lower=1)
|
tmp, _ = dpotrs(self.L, np.asfortranarray(tmp.T), lower=1)
|
||||||
self.dL_dK = 0.5 * (tmp - self.output_dim * self.Ki)
|
self.dL_dK = 0.5 * (tmp - self.output_dim * self.Ki)
|
||||||
|
|
@ -68,7 +74,7 @@ class GP(GPBase):
|
||||||
"""
|
"""
|
||||||
self.likelihood.restart()
|
self.likelihood.restart()
|
||||||
self.likelihood.fit_full(self.kern.K(self.X))
|
self.likelihood.fit_full(self.kern.K(self.X))
|
||||||
self._set_params(self._get_params()) # update the GP
|
self._set_params(self._get_params()) # update the GP
|
||||||
|
|
||||||
def _model_fit_term(self):
|
def _model_fit_term(self):
|
||||||
"""
|
"""
|
||||||
|
|
@ -77,7 +83,7 @@ class GP(GPBase):
|
||||||
if self.likelihood.YYT is None:
|
if self.likelihood.YYT is None:
|
||||||
tmp, _ = dtrtrs(self.L, np.asfortranarray(self.likelihood.Y), lower=1)
|
tmp, _ = dtrtrs(self.L, np.asfortranarray(self.likelihood.Y), lower=1)
|
||||||
return -0.5 * np.sum(np.square(tmp))
|
return -0.5 * np.sum(np.square(tmp))
|
||||||
#return -0.5 * np.sum(np.square(np.dot(self.Li, self.likelihood.Y)))
|
# return -0.5 * np.sum(np.square(np.dot(self.Li, self.likelihood.Y)))
|
||||||
else:
|
else:
|
||||||
return -0.5 * np.sum(np.multiply(self.Ki, self.likelihood.YYT))
|
return -0.5 * np.sum(np.multiply(self.Ki, self.likelihood.YYT))
|
||||||
|
|
||||||
|
|
@ -89,7 +95,8 @@ class GP(GPBase):
|
||||||
model for a new variable Y* = v_tilde/tau_tilde, with a covariance
|
model for a new variable Y* = v_tilde/tau_tilde, with a covariance
|
||||||
matrix K* = K + diag(1./tau_tilde) plus a normalization term.
|
matrix K* = K + diag(1./tau_tilde) plus a normalization term.
|
||||||
"""
|
"""
|
||||||
return -0.5 * self.output_dim * self.K_logdet + self._model_fit_term() + self.likelihood.Z
|
return (-0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) -
|
||||||
|
0.5 * self.output_dim * self.K_logdet + self._model_fit_term() + self.likelihood.Z)
|
||||||
|
|
||||||
|
|
||||||
def _log_likelihood_gradients(self):
|
def _log_likelihood_gradients(self):
|
||||||
|
|
@ -100,13 +107,13 @@ class GP(GPBase):
|
||||||
"""
|
"""
|
||||||
return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
|
return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
|
||||||
|
|
||||||
def _raw_predict(self, _Xnew, which_parts='all', full_cov=False,stop=False):
|
def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
|
||||||
"""
|
"""
|
||||||
Internal helper function for making predictions, does not account
|
Internal helper function for making predictions, does not account
|
||||||
for normalization or likelihood
|
for normalization or likelihood
|
||||||
"""
|
"""
|
||||||
Kx = self.kern.K(_Xnew,self.X,which_parts=which_parts).T
|
Kx = self.kern.K(_Xnew, self.X, which_parts=which_parts).T
|
||||||
#KiKx = np.dot(self.Ki, Kx)
|
# KiKx = np.dot(self.Ki, Kx)
|
||||||
KiKx, _ = dpotrs(self.L, np.asfortranarray(Kx), lower=1)
|
KiKx, _ = dpotrs(self.L, np.asfortranarray(Kx), lower=1)
|
||||||
mu = np.dot(KiKx.T, self.likelihood.Y)
|
mu = np.dot(KiKx.T, self.likelihood.Y)
|
||||||
if full_cov:
|
if full_cov:
|
||||||
|
|
@ -120,7 +127,7 @@ class GP(GPBase):
|
||||||
debug_this # @UndefinedVariable
|
debug_this # @UndefinedVariable
|
||||||
return mu, var
|
return mu, var
|
||||||
|
|
||||||
def predict(self, Xnew, which_parts='all', full_cov=False):
|
def predict(self, Xnew, which_parts='all', full_cov=False, likelihood_args=dict()):
|
||||||
"""
|
"""
|
||||||
Predict the function(s) at the new point(s) Xnew.
|
Predict the function(s) at the new point(s) Xnew.
|
||||||
Arguments
|
Arguments
|
||||||
|
|
@ -145,6 +152,6 @@ class GP(GPBase):
|
||||||
mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts)
|
mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts)
|
||||||
|
|
||||||
# now push through likelihood
|
# now push through likelihood
|
||||||
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
|
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, **likelihood_args)
|
||||||
|
|
||||||
return mean, var, _025pm, _975pm
|
return mean, var, _025pm, _975pm
|
||||||
|
|
|
||||||
|
|
@ -6,8 +6,8 @@ from GPy.core.model import Model
|
||||||
|
|
||||||
class GPBase(Model):
|
class GPBase(Model):
|
||||||
"""
|
"""
|
||||||
Gaussian Process Model for holding shared behaviour between
|
Gaussian process base model for holding shared behaviour between
|
||||||
sprase_GP and GP models
|
sparse_GP and GP models.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, X, likelihood, kernel, normalize_X=False):
|
def __init__(self, X, likelihood, kernel, normalize_X=False):
|
||||||
|
|
@ -29,23 +29,39 @@ class GPBase(Model):
|
||||||
self._Xscale = np.ones((1, self.input_dim))
|
self._Xscale = np.ones((1, self.input_dim))
|
||||||
|
|
||||||
super(GPBase, self).__init__()
|
super(GPBase, self).__init__()
|
||||||
#Model.__init__(self)
|
# Model.__init__(self)
|
||||||
# All leaf nodes should call self._set_params(self._get_params()) at
|
# All leaf nodes should call self._set_params(self._get_params()) at
|
||||||
# the end
|
# the end
|
||||||
|
|
||||||
|
def getstate(self):
|
||||||
|
"""
|
||||||
|
Get the current state of the class, here we return everything that is needed to recompute the model.
|
||||||
|
"""
|
||||||
|
return Model.getstate(self) + [self.X,
|
||||||
|
self.num_data,
|
||||||
|
self.input_dim,
|
||||||
|
self.kern,
|
||||||
|
self.likelihood,
|
||||||
|
self.output_dim,
|
||||||
|
self._Xoffset,
|
||||||
|
self._Xscale]
|
||||||
|
|
||||||
|
def setstate(self, state):
|
||||||
|
self._Xscale = state.pop()
|
||||||
|
self._Xoffset = state.pop()
|
||||||
|
self.output_dim = state.pop()
|
||||||
|
self.likelihood = state.pop()
|
||||||
|
self.kern = state.pop()
|
||||||
|
self.input_dim = state.pop()
|
||||||
|
self.num_data = state.pop()
|
||||||
|
self.X = state.pop()
|
||||||
|
Model.setstate(self, state)
|
||||||
|
|
||||||
def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None):
|
def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None):
|
||||||
"""
|
"""
|
||||||
Plot the GP's view of the world, where the data is normalized and the
|
Plot the GP's view of the world, where the data is normalized and the
|
||||||
likelihood is Gaussian.
|
likelihood is Gaussian.
|
||||||
|
|
||||||
:param samples: the number of a posteriori samples to plot
|
|
||||||
:param which_data: which if the training data to plot (default all)
|
|
||||||
:type which_data: 'all' or a slice object to slice self.X, self.Y
|
|
||||||
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
|
|
||||||
:param which_parts: which of the kernel functions to plot (additively)
|
|
||||||
:type which_parts: 'all', or list of bools
|
|
||||||
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
|
|
||||||
|
|
||||||
Plot the posterior of the GP.
|
Plot the posterior of the GP.
|
||||||
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
|
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
|
||||||
- In two dimsensions, a contour-plot shows the mean predicted function
|
- In two dimsensions, a contour-plot shows the mean predicted function
|
||||||
|
|
@ -53,6 +69,22 @@ class GPBase(Model):
|
||||||
|
|
||||||
Can plot only part of the data and part of the posterior functions
|
Can plot only part of the data and part of the posterior functions
|
||||||
using which_data and which_functions
|
using which_data and which_functions
|
||||||
|
|
||||||
|
:param samples: the number of a posteriori samples to plot
|
||||||
|
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
|
||||||
|
:param which_data: which if the training data to plot (default all)
|
||||||
|
:type which_data: 'all' or a slice object to slice self.X, self.Y
|
||||||
|
:param which_parts: which of the kernel functions to plot (additively)
|
||||||
|
:type which_parts: 'all', or list of bools
|
||||||
|
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
|
||||||
|
:type resolution: int
|
||||||
|
:param full_cov:
|
||||||
|
:type full_cov: bool
|
||||||
|
:param fignum: figure to plot on.
|
||||||
|
:type fignum: figure number
|
||||||
|
:param ax: axes to plot on.
|
||||||
|
:type ax: axes handle
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if which_data == 'all':
|
if which_data == 'all':
|
||||||
which_data = slice(None)
|
which_data = slice(None)
|
||||||
|
|
@ -91,12 +123,43 @@ class GPBase(Model):
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
|
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
|
||||||
|
|
||||||
def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None):
|
def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
|
||||||
"""
|
"""
|
||||||
TODO: Docstrings!
|
Plot the GP with noise where the likelihood is Gaussian.
|
||||||
|
|
||||||
|
Plot the posterior of the GP.
|
||||||
|
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
|
||||||
|
- In two dimsensions, a contour-plot shows the mean predicted function
|
||||||
|
- In higher dimensions, we've no implemented this yet !TODO!
|
||||||
|
|
||||||
|
Can plot only part of the data and part of the posterior functions
|
||||||
|
using which_data and which_functions
|
||||||
|
|
||||||
|
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
|
||||||
|
:type plot_limits: np.array
|
||||||
|
:param which_data: which if the training data to plot (default all)
|
||||||
|
:type which_data: 'all' or a slice object to slice self.X, self.Y
|
||||||
|
:param which_parts: which of the kernel functions to plot (additively)
|
||||||
|
:type which_parts: 'all', or list of bools
|
||||||
|
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
|
||||||
|
:type resolution: int
|
||||||
|
:param levels: number of levels to plot in a contour plot.
|
||||||
|
:type levels: int
|
||||||
|
:param samples: the number of a posteriori samples to plot
|
||||||
|
:type samples: int
|
||||||
|
:param fignum: figure to plot on.
|
||||||
|
:type fignum: figure number
|
||||||
|
:param ax: axes to plot on.
|
||||||
|
:type ax: axes handle
|
||||||
|
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
|
||||||
|
:type fixed_inputs: a list of tuples
|
||||||
|
:param linecol: color of line to plot.
|
||||||
|
:type linecol:
|
||||||
|
:param fillcol: color of fill
|
||||||
|
:type fillcol:
|
||||||
:param levels: for 2D plotting, the number of contour levels to use
|
:param levels: for 2D plotting, the number of contour levels to use
|
||||||
is ax is None, create a new figure
|
is ax is None, create a new figure
|
||||||
|
|
||||||
"""
|
"""
|
||||||
# TODO include samples
|
# TODO include samples
|
||||||
if which_data == 'all':
|
if which_data == 'all':
|
||||||
|
|
@ -106,15 +169,25 @@ class GPBase(Model):
|
||||||
fig = pb.figure(num=fignum)
|
fig = pb.figure(num=fignum)
|
||||||
ax = fig.add_subplot(111)
|
ax = fig.add_subplot(111)
|
||||||
|
|
||||||
if self.X.shape[1] == 1:
|
plotdims = self.input_dim - len(fixed_inputs)
|
||||||
|
|
||||||
|
if plotdims == 1:
|
||||||
|
|
||||||
Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
|
Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
|
||||||
|
|
||||||
Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
|
fixed_dims = np.array([i for i,v in fixed_inputs])
|
||||||
m, _, lower, upper = self.predict(Xnew, which_parts=which_parts)
|
freedim = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
|
||||||
|
|
||||||
|
Xnew, xmin, xmax = x_frame1D(Xu[:,freedim], plot_limits=plot_limits)
|
||||||
|
Xgrid = np.empty((Xnew.shape[0],self.input_dim))
|
||||||
|
Xgrid[:,freedim] = Xnew
|
||||||
|
for i,v in fixed_inputs:
|
||||||
|
Xgrid[:,i] = v
|
||||||
|
|
||||||
|
m, _, lower, upper = self.predict(Xgrid, which_parts=which_parts)
|
||||||
for d in range(m.shape[1]):
|
for d in range(m.shape[1]):
|
||||||
gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax)
|
gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
|
||||||
ax.plot(Xu[which_data], self.likelihood.data[which_data, d], 'kx', mew=1.5)
|
ax.plot(Xu[which_data,freedim], self.likelihood.data[which_data, d], 'kx', mew=1.5)
|
||||||
ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
|
ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
|
||||||
ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
|
ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
|
||||||
ax.set_xlim(xmin, xmax)
|
ax.set_xlim(xmin, xmax)
|
||||||
|
|
@ -127,7 +200,7 @@ class GPBase(Model):
|
||||||
m, _, lower, upper = self.predict(Xnew, which_parts=which_parts)
|
m, _, lower, upper = self.predict(Xnew, which_parts=which_parts)
|
||||||
m = m.reshape(resolution, resolution).T
|
m = m.reshape(resolution, resolution).T
|
||||||
ax.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
|
ax.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
|
||||||
Yf = self.likelihood.Y.flatten()
|
Yf = self.likelihood.data.flatten()
|
||||||
ax.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) # @UndefinedVariable
|
ax.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) # @UndefinedVariable
|
||||||
ax.set_xlim(xmin[0], xmax[0])
|
ax.set_xlim(xmin[0], xmax[0])
|
||||||
ax.set_ylim(xmin[1], xmax[1])
|
ax.set_ylim(xmin[1], xmax[1])
|
||||||
|
|
|
||||||
190
GPy/core/mapping.py
Normal file
190
GPy/core/mapping.py
Normal file
|
|
@ -0,0 +1,190 @@
|
||||||
|
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||||
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
from ..util.plot import Tango, x_frame1D, x_frame2D
|
||||||
|
from parameterized import Parameterized
|
||||||
|
import numpy as np
|
||||||
|
import pylab as pb
|
||||||
|
|
||||||
|
class Mapping(Parameterized):
|
||||||
|
"""
|
||||||
|
Base model for shared behavior between models that can act like a mapping.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, input_dim, output_dim):
|
||||||
|
self.input_dim = input_dim
|
||||||
|
self.output_dim = output_dim
|
||||||
|
|
||||||
|
super(Mapping, self).__init__()
|
||||||
|
# Model.__init__(self)
|
||||||
|
# All leaf nodes should call self._set_params(self._get_params()) at
|
||||||
|
# the end
|
||||||
|
|
||||||
|
def f(self, X):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def df_dX(self, dL_df, X):
|
||||||
|
"""Evaluate derivatives of mapping outputs with respect to inputs.
|
||||||
|
|
||||||
|
:param dL_df: gradient of the objective with respect to the function.
|
||||||
|
:type dL_df: ndarray (num_data x output_dim)
|
||||||
|
:param X: the input locations where derivatives are to be evaluated.
|
||||||
|
:type X: ndarray (num_data x input_dim)
|
||||||
|
:returns: matrix containing gradients of the function with respect to the inputs.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def df_dtheta(self, dL_df, X):
|
||||||
|
"""The gradient of the outputs of the multi-layer perceptron with respect to each of the parameters.
|
||||||
|
|
||||||
|
:param dL_df: gradient of the objective with respect to the function.
|
||||||
|
:type dL_df: ndarray (num_data x output_dim)
|
||||||
|
:param X: input locations where the function is evaluated.
|
||||||
|
:type X: ndarray (num_data x input_dim)
|
||||||
|
:returns: Matrix containing gradients with respect to parameters of each output for each input data.
|
||||||
|
:rtype: ndarray (num_params length)
|
||||||
|
"""
|
||||||
|
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue']):
|
||||||
|
"""
|
||||||
|
Plot the mapping.
|
||||||
|
|
||||||
|
Plots the mapping associated with the model.
|
||||||
|
- In one dimension, the function is plotted.
|
||||||
|
- In two dimsensions, a contour-plot shows the function
|
||||||
|
- In higher dimensions, we've not implemented this yet !TODO!
|
||||||
|
|
||||||
|
Can plot only part of the data and part of the posterior functions
|
||||||
|
using which_data and which_functions
|
||||||
|
|
||||||
|
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
|
||||||
|
:type plot_limits: np.array
|
||||||
|
:param which_data: which if the training data to plot (default all)
|
||||||
|
:type which_data: 'all' or a slice object to slice self.X, self.Y
|
||||||
|
:param which_parts: which of the kernel functions to plot (additively)
|
||||||
|
:type which_parts: 'all', or list of bools
|
||||||
|
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
|
||||||
|
:type resolution: int
|
||||||
|
:param levels: number of levels to plot in a contour plot.
|
||||||
|
:type levels: int
|
||||||
|
:param samples: the number of a posteriori samples to plot
|
||||||
|
:type samples: int
|
||||||
|
:param fignum: figure to plot on.
|
||||||
|
:type fignum: figure number
|
||||||
|
:param ax: axes to plot on.
|
||||||
|
:type ax: axes handle
|
||||||
|
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
|
||||||
|
:type fixed_inputs: a list of tuples
|
||||||
|
:param linecol: color of line to plot.
|
||||||
|
:type linecol:
|
||||||
|
:param levels: for 2D plotting, the number of contour levels to use
|
||||||
|
is ax is None, create a new figure
|
||||||
|
|
||||||
|
"""
|
||||||
|
# TODO include samples
|
||||||
|
if which_data == 'all':
|
||||||
|
which_data = slice(None)
|
||||||
|
|
||||||
|
if ax is None:
|
||||||
|
fig = pb.figure(num=fignum)
|
||||||
|
ax = fig.add_subplot(111)
|
||||||
|
|
||||||
|
plotdims = self.input_dim - len(fixed_inputs)
|
||||||
|
|
||||||
|
if plotdims == 1:
|
||||||
|
|
||||||
|
Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
|
||||||
|
|
||||||
|
fixed_dims = np.array([i for i,v in fixed_inputs])
|
||||||
|
freedim = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
|
||||||
|
|
||||||
|
Xnew, xmin, xmax = x_frame1D(Xu[:,freedim], plot_limits=plot_limits)
|
||||||
|
Xgrid = np.empty((Xnew.shape[0],self.input_dim))
|
||||||
|
Xgrid[:,freedim] = Xnew
|
||||||
|
for i,v in fixed_inputs:
|
||||||
|
Xgrid[:,i] = v
|
||||||
|
|
||||||
|
f = self.predict(Xgrid, which_parts=which_parts)
|
||||||
|
for d in range(y.shape[1]):
|
||||||
|
ax.plot(Xnew, f[:, d], edgecol=linecol)
|
||||||
|
|
||||||
|
elif self.X.shape[1] == 2:
|
||||||
|
resolution = resolution or 50
|
||||||
|
Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
|
||||||
|
x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
|
||||||
|
f = self.predict(Xnew, which_parts=which_parts)
|
||||||
|
m = m.reshape(resolution, resolution).T
|
||||||
|
ax.contour(x, y, f, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
|
||||||
|
ax.set_xlim(xmin[0], xmax[0])
|
||||||
|
ax.set_ylim(xmin[1], xmax[1])
|
||||||
|
|
||||||
|
else:
|
||||||
|
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
|
||||||
|
|
||||||
|
from GPy.core.model import Model
|
||||||
|
|
||||||
|
class Mapping_check_model(Model):
|
||||||
|
"""This is a dummy model class used as a base class for checking that the gradients of a given mapping are implemented correctly. It enables checkgradient() to be called independently on each mapping."""
|
||||||
|
def __init__(self, mapping=None, dL_df=None, X=None):
|
||||||
|
num_samples = 20
|
||||||
|
if mapping==None:
|
||||||
|
mapping = GPy.mapping.linear(1, 1)
|
||||||
|
if X==None:
|
||||||
|
X = np.random.randn(num_samples, mapping.input_dim)
|
||||||
|
if dL_df==None:
|
||||||
|
dL_df = np.ones((num_samples, mapping.output_dim))
|
||||||
|
|
||||||
|
self.mapping=mapping
|
||||||
|
self.X = X
|
||||||
|
self.dL_df = dL_df
|
||||||
|
self.num_params = self.mapping.num_params
|
||||||
|
Model.__init__(self)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_params(self):
|
||||||
|
return self.mapping._get_params()
|
||||||
|
|
||||||
|
def _get_param_names(self):
|
||||||
|
return self.mapping._get_param_names()
|
||||||
|
|
||||||
|
def _set_params(self, x):
|
||||||
|
self.mapping._set_params(x)
|
||||||
|
|
||||||
|
def log_likelihood(self):
|
||||||
|
return (self.dL_df*self.mapping.f(self.X)).sum()
|
||||||
|
|
||||||
|
def _log_likelihood_gradients(self):
|
||||||
|
raise NotImplementedError, "This needs to be implemented to use the Mapping_check_model class."
|
||||||
|
|
||||||
|
class Mapping_check_df_dtheta(Mapping_check_model):
|
||||||
|
"""This class allows gradient checks for the gradient of a mapping with respect to parameters. """
|
||||||
|
def __init__(self, mapping=None, dL_df=None, X=None):
|
||||||
|
Mapping_check_model.__init__(self,mapping=mapping,dL_df=dL_df, X=X)
|
||||||
|
|
||||||
|
def _log_likelihood_gradients(self):
|
||||||
|
return self.mapping.df_dtheta(self.dL_df, self.X)
|
||||||
|
|
||||||
|
|
||||||
|
class Mapping_check_df_dX(Mapping_check_model):
|
||||||
|
"""This class allows gradient checks for the gradient of a mapping with respect to X. """
|
||||||
|
def __init__(self, mapping=None, dL_df=None, X=None):
|
||||||
|
Mapping_check_model.__init__(self,mapping=mapping,dL_df=dL_df, X=X)
|
||||||
|
|
||||||
|
if dL_df==None:
|
||||||
|
dL_df = np.ones((self.X.shape[0],self.mapping.output_dim))
|
||||||
|
self.num_params = self.X.shape[0]*self.mapping.input_dim
|
||||||
|
|
||||||
|
def _log_likelihood_gradients(self):
|
||||||
|
return self.mapping.df_dX(self.dL_df, self.X).flatten()
|
||||||
|
|
||||||
|
def _get_param_names(self):
|
||||||
|
return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
|
||||||
|
|
||||||
|
def _get_params(self):
|
||||||
|
return self.X.flatten()
|
||||||
|
|
||||||
|
def _set_params(self, x):
|
||||||
|
self.X=x.reshape(self.X.shape)
|
||||||
|
|
||||||
|
|
@ -6,49 +6,72 @@ from .. import likelihoods
|
||||||
from ..inference import optimization
|
from ..inference import optimization
|
||||||
from ..util.linalg import jitchol
|
from ..util.linalg import jitchol
|
||||||
from GPy.util.misc import opt_wrapper
|
from GPy.util.misc import opt_wrapper
|
||||||
from parameterised import Parameterised
|
from parameterized import Parameterized
|
||||||
import multiprocessing as mp
|
import multiprocessing as mp
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from GPy.core.domains import POSITIVE, REAL
|
from GPy.core.domains import POSITIVE, REAL
|
||||||
from numpy.linalg.linalg import LinAlgError
|
from numpy.linalg.linalg import LinAlgError
|
||||||
# import numdifftools as ndt
|
# import numdifftools as ndt
|
||||||
|
|
||||||
class Model(Parameterised):
|
class Model(Parameterized):
|
||||||
_fail_count = 0 # Count of failed optimization steps (see objective)
|
_fail_count = 0 # Count of failed optimization steps (see objective)
|
||||||
_allowed_failures = 10 # number of allowed failures
|
_allowed_failures = 10 # number of allowed failures
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
Parameterised.__init__(self)
|
Parameterized.__init__(self)
|
||||||
self.priors = None
|
self.priors = None
|
||||||
self.optimization_runs = []
|
self.optimization_runs = []
|
||||||
self.sampling_runs = []
|
self.sampling_runs = []
|
||||||
self.preferred_optimizer = 'scg'
|
self.preferred_optimizer = 'scg'
|
||||||
# self._set_params(self._get_params()) has been taken out as it should only be called on leaf nodes
|
# self._set_params(self._get_params()) has been taken out as it should only be called on leaf nodes
|
||||||
def _get_params(self):
|
|
||||||
raise NotImplementedError, "this needs to be implemented to use the Model class"
|
|
||||||
def _set_params(self, x):
|
|
||||||
raise NotImplementedError, "this needs to be implemented to use the Model class"
|
|
||||||
def log_likelihood(self):
|
def log_likelihood(self):
|
||||||
raise NotImplementedError, "this needs to be implemented to use the Model class"
|
raise NotImplementedError, "this needs to be implemented to use the model class"
|
||||||
def _log_likelihood_gradients(self):
|
def _log_likelihood_gradients(self):
|
||||||
raise NotImplementedError, "this needs to be implemented to use the Model class"
|
raise NotImplementedError, "this needs to be implemented to use the model class"
|
||||||
|
|
||||||
|
def getstate(self):
|
||||||
|
"""
|
||||||
|
Get the current state of the class.
|
||||||
|
|
||||||
|
Inherited from Parameterized, so add those parameters to the state
|
||||||
|
:return: list of states from the model.
|
||||||
|
|
||||||
|
"""
|
||||||
|
return Parameterized.getstate(self) + \
|
||||||
|
[self.priors, self.optimization_runs,
|
||||||
|
self.sampling_runs, self.preferred_optimizer]
|
||||||
|
|
||||||
|
def setstate(self, state):
|
||||||
|
"""
|
||||||
|
set state from previous call to getstate
|
||||||
|
call Parameterized with the rest of the state
|
||||||
|
|
||||||
|
:param state: the state of the model.
|
||||||
|
:type state: list as returned from getstate.
|
||||||
|
"""
|
||||||
|
self.preferred_optimizer = state.pop()
|
||||||
|
self.sampling_runs = state.pop()
|
||||||
|
self.optimization_runs = state.pop()
|
||||||
|
self.priors = state.pop()
|
||||||
|
Parameterized.setstate(self, state)
|
||||||
|
|
||||||
def set_prior(self, regexp, what):
|
def set_prior(self, regexp, what):
|
||||||
"""
|
"""
|
||||||
Sets priors on the Model parameters.
|
Sets priors on the model parameters.
|
||||||
|
|
||||||
Arguments
|
|
||||||
---------
|
|
||||||
regexp -- string, regexp, or integer array
|
|
||||||
what -- instance of a Prior class
|
|
||||||
|
|
||||||
Notes
|
Notes
|
||||||
-----
|
-----
|
||||||
Asserts that the Prior is suitable for the constraint. If the
|
Asserts that the prior is suitable for the constraint. If the
|
||||||
wrong constraint is in place, an error is raised. If no
|
wrong constraint is in place, an error is raised. If no
|
||||||
constraint is in place, one is added (warning printed).
|
constraint is in place, one is added (warning printed).
|
||||||
|
|
||||||
For tied parameters, the Prior will only be "counted" once, thus
|
For tied parameters, the prior will only be "counted" once, thus
|
||||||
a Prior object is only inserted on the first tied index
|
a prior object is only inserted on the first tied index
|
||||||
|
|
||||||
|
:param regexp: regular expression of parameters on which priors need to be set.
|
||||||
|
:type param: string, regexp, or integer array
|
||||||
|
:param what: prior to set on parameter.
|
||||||
|
:type what: GPy.core.Prior type
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if self.priors is None:
|
if self.priors is None:
|
||||||
self.priors = [None for i in range(self._get_params().size)]
|
self.priors = [None for i in range(self._get_params().size)]
|
||||||
|
|
@ -58,12 +81,12 @@ class Model(Parameterised):
|
||||||
# check tied situation
|
# check tied situation
|
||||||
tie_partial_matches = [tie for tie in self.tied_indices if (not set(tie).isdisjoint(set(which))) & (not set(tie) == set(which))]
|
tie_partial_matches = [tie for tie in self.tied_indices if (not set(tie).isdisjoint(set(which))) & (not set(tie) == set(which))]
|
||||||
if len(tie_partial_matches):
|
if len(tie_partial_matches):
|
||||||
raise ValueError, "cannot place Prior across partial ties"
|
raise ValueError, "cannot place prior across partial ties"
|
||||||
tie_matches = [tie for tie in self.tied_indices if set(which) == set(tie) ]
|
tie_matches = [tie for tie in self.tied_indices if set(which) == set(tie) ]
|
||||||
if len(tie_matches) > 1:
|
if len(tie_matches) > 1:
|
||||||
raise ValueError, "cannot place Prior across multiple ties"
|
raise ValueError, "cannot place prior across multiple ties"
|
||||||
elif len(tie_matches) == 1:
|
elif len(tie_matches) == 1:
|
||||||
which = which[:1] # just place a Prior object on the first parameter
|
which = which[:1] # just place a prior object on the first parameter
|
||||||
|
|
||||||
|
|
||||||
# check constraints are okay
|
# check constraints are okay
|
||||||
|
|
@ -75,7 +98,7 @@ class Model(Parameterised):
|
||||||
else:
|
else:
|
||||||
constrained_positive_indices = np.zeros(shape=(0,))
|
constrained_positive_indices = np.zeros(shape=(0,))
|
||||||
bad_constraints = np.setdiff1d(self.all_constrained_indices(), constrained_positive_indices)
|
bad_constraints = np.setdiff1d(self.all_constrained_indices(), constrained_positive_indices)
|
||||||
assert not np.any(which[:, None] == bad_constraints), "constraint and Prior incompatible"
|
assert not np.any(which[:, None] == bad_constraints), "constraint and prior incompatible"
|
||||||
unconst = np.setdiff1d(which, constrained_positive_indices)
|
unconst = np.setdiff1d(which, constrained_positive_indices)
|
||||||
if len(unconst):
|
if len(unconst):
|
||||||
print "Warning: constraining parameters to be positive:"
|
print "Warning: constraining parameters to be positive:"
|
||||||
|
|
@ -83,17 +106,22 @@ class Model(Parameterised):
|
||||||
print '\n'
|
print '\n'
|
||||||
self.constrain_positive(unconst)
|
self.constrain_positive(unconst)
|
||||||
elif what.domain is REAL:
|
elif what.domain is REAL:
|
||||||
assert not np.any(which[:, None] == self.all_constrained_indices()), "constraint and Prior incompatible"
|
assert not np.any(which[:, None] == self.all_constrained_indices()), "constraint and prior incompatible"
|
||||||
else:
|
else:
|
||||||
raise ValueError, "Prior not recognised"
|
raise ValueError, "prior not recognised"
|
||||||
|
|
||||||
# store the Prior in a local list
|
# store the prior in a local list
|
||||||
for w in which:
|
for w in which:
|
||||||
self.priors[w] = what
|
self.priors[w] = what
|
||||||
|
|
||||||
def get_gradient(self, name, return_names=False):
|
def get_gradient(self, name, return_names=False):
|
||||||
"""
|
"""
|
||||||
Get Model gradient(s) by name. The name is applied as a regular expression and all parameters that match that regular expression are returned.
|
Get model gradient(s) by name. The name is applied as a regular expression and all parameters that match that regular expression are returned.
|
||||||
|
|
||||||
|
:param name: the name of parameters required (as a regular expression).
|
||||||
|
:type name: regular expression
|
||||||
|
:param return_names: whether or not to return the names matched (default False)
|
||||||
|
:type return_names: bool
|
||||||
"""
|
"""
|
||||||
matches = self.grep_param_names(name)
|
matches = self.grep_param_names(name)
|
||||||
if len(matches):
|
if len(matches):
|
||||||
|
|
@ -133,14 +161,14 @@ class Model(Parameterised):
|
||||||
|
|
||||||
def randomize(self):
|
def randomize(self):
|
||||||
"""
|
"""
|
||||||
Randomize the Model.
|
Randomize the model.
|
||||||
Make this draw from the Prior if one exists, else draw from N(0,1)
|
Make this draw from the prior if one exists, else draw from N(0,1)
|
||||||
"""
|
"""
|
||||||
# first take care of all parameters (from N(0,1))
|
# first take care of all parameters (from N(0,1))
|
||||||
x = self._get_params_transformed()
|
x = self._get_params_transformed()
|
||||||
x = np.random.randn(x.size)
|
x = np.random.randn(x.size)
|
||||||
self._set_params_transformed(x)
|
self._set_params_transformed(x)
|
||||||
# now draw from Prior where possible
|
# now draw from prior where possible
|
||||||
x = self._get_params()
|
x = self._get_params()
|
||||||
if self.priors is not None:
|
if self.priors is not None:
|
||||||
[np.put(x, i, p.rvs(1)) for i, p in enumerate(self.priors) if not p is None]
|
[np.put(x, i, p.rvs(1)) for i, p in enumerate(self.priors) if not p is None]
|
||||||
|
|
@ -150,21 +178,30 @@ class Model(Parameterised):
|
||||||
|
|
||||||
def optimize_restarts(self, num_restarts=10, robust=False, verbose=True, parallel=False, num_processes=None, **kwargs):
|
def optimize_restarts(self, num_restarts=10, robust=False, verbose=True, parallel=False, num_processes=None, **kwargs):
|
||||||
"""
|
"""
|
||||||
Perform random restarts of the Model, and set the Model to the best
|
Perform random restarts of the model, and set the model to the best
|
||||||
seen solution.
|
seen solution.
|
||||||
|
|
||||||
If the robust flag is set, exceptions raised during optimizations will
|
If the robust flag is set, exceptions raised during optimizations will
|
||||||
be handled silently. If _all_ runs fail, the Model is reset to the
|
be handled silently. If _all_ runs fail, the model is reset to the
|
||||||
existing parameter values.
|
existing parameter values.
|
||||||
|
|
||||||
Notes
|
Notes
|
||||||
-----
|
-----
|
||||||
|
:param num_restarts: number of restarts to use (default 10)
|
||||||
|
:type num_restarts: int
|
||||||
|
:param robust: whether to handle exceptions silently or not (default False)
|
||||||
|
:type robust: bool
|
||||||
|
:param parallel: whether to run each restart as a separate process. It relies on the multiprocessing module.
|
||||||
|
:type parallel: bool
|
||||||
|
:param num_processes: number of workers in the multiprocessing pool
|
||||||
|
:type numprocesses: int
|
||||||
**kwargs are passed to the optimizer. They can be:
|
**kwargs are passed to the optimizer. They can be:
|
||||||
:max_f_eval: maximum number of function evaluations
|
:param max_f_eval: maximum number of function evaluations
|
||||||
:messages: whether to display during optimisation
|
:type max_f_eval: int
|
||||||
:verbose: whether to show informations about the current restart
|
:param max_iters: maximum number of iterations
|
||||||
:parallel: whether to run each restart as a separate process. It relies on the multiprocessing module.
|
:type max_iters: int
|
||||||
:num_processes: number of workers in the multiprocessing pool
|
:param messages: whether to display during optimisation
|
||||||
|
:type messages: bool
|
||||||
|
|
||||||
..Note: If num_processes is None, the number of workes in the multiprocessing pool is automatically
|
..Note: If num_processes is None, the number of workes in the multiprocessing pool is automatically
|
||||||
set to the number of processors on the current machine.
|
set to the number of processors on the current machine.
|
||||||
|
|
@ -212,8 +249,13 @@ class Model(Parameterised):
|
||||||
self._set_params_transformed(initial_parameters)
|
self._set_params_transformed(initial_parameters)
|
||||||
|
|
||||||
def ensure_default_constraints(self):
|
def ensure_default_constraints(self):
|
||||||
"""
|
"""
|
||||||
Ensure that any variables which should clearly be positive have been constrained somehow.
|
Ensure that any variables which should clearly be positive
|
||||||
|
have been constrained somehow. The method performs a regular
|
||||||
|
expression search on parameter names looking for the terms
|
||||||
|
'variance', 'lengthscale', 'precision' and 'kappa'. If any of
|
||||||
|
these terms are present in the name the parameter is
|
||||||
|
constrained positive.
|
||||||
"""
|
"""
|
||||||
positive_strings = ['variance', 'lengthscale', 'precision', 'kappa']
|
positive_strings = ['variance', 'lengthscale', 'precision', 'kappa']
|
||||||
# param_names = self._get_param_names()
|
# param_names = self._get_param_names()
|
||||||
|
|
@ -228,11 +270,15 @@ class Model(Parameterised):
|
||||||
|
|
||||||
def objective_function(self, x):
|
def objective_function(self, x):
|
||||||
"""
|
"""
|
||||||
The objective function passed to the optimizer. It combines the likelihood and the priors.
|
The objective function passed to the optimizer. It combines
|
||||||
|
the likelihood and the priors.
|
||||||
|
|
||||||
Failures are handled robustly. The algorithm will try several times to
|
Failures are handled robustly. The algorithm will try several times to
|
||||||
return the objective, and will raise the original exception if it
|
return the objective, and will raise the original exception if it
|
||||||
the objective cannot be computed.
|
the objective cannot be computed.
|
||||||
|
|
||||||
|
:param x: the parameters of the model.
|
||||||
|
:parameter type: np.array
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
self._set_params_transformed(x)
|
self._set_params_transformed(x)
|
||||||
|
|
@ -249,39 +295,53 @@ class Model(Parameterised):
|
||||||
Gets the gradients from the likelihood and the priors.
|
Gets the gradients from the likelihood and the priors.
|
||||||
|
|
||||||
Failures are handled robustly. The algorithm will try several times to
|
Failures are handled robustly. The algorithm will try several times to
|
||||||
return the objective, and will raise the original exception if it
|
return the gradients, and will raise the original exception if it
|
||||||
the objective cannot be computed.
|
the objective cannot be computed.
|
||||||
|
|
||||||
|
:param x: the parameters of the model.
|
||||||
|
:parameter type: np.array
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
self._set_params_transformed(x)
|
self._set_params_transformed(x)
|
||||||
|
obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
|
||||||
self._fail_count = 0
|
self._fail_count = 0
|
||||||
except (LinAlgError, ZeroDivisionError, ValueError) as e:
|
except (LinAlgError, ZeroDivisionError, ValueError) as e:
|
||||||
if self._fail_count >= self._allowed_failures:
|
if self._fail_count >= self._allowed_failures:
|
||||||
raise e
|
raise e
|
||||||
self._fail_count += 1
|
self._fail_count += 1
|
||||||
obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
|
obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100)
|
||||||
return obj_grads
|
return obj_grads
|
||||||
|
|
||||||
def objective_and_gradients(self, x):
|
def objective_and_gradients(self, x):
|
||||||
|
"""
|
||||||
|
Compute the objective function of the model and the gradient of the model at the point given by x.
|
||||||
|
|
||||||
|
:param x: the point at which gradients are to be computed.
|
||||||
|
:type np.array:
|
||||||
|
"""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self._set_params_transformed(x)
|
self._set_params_transformed(x)
|
||||||
obj_f = -self.log_likelihood() - self.log_prior()
|
obj_f = -self.log_likelihood() - self.log_prior()
|
||||||
self._fail_count = 0
|
self._fail_count = 0
|
||||||
|
obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
|
||||||
except (LinAlgError, ZeroDivisionError, ValueError) as e:
|
except (LinAlgError, ZeroDivisionError, ValueError) as e:
|
||||||
if self._fail_count >= self._allowed_failures:
|
if self._fail_count >= self._allowed_failures:
|
||||||
raise e
|
raise e
|
||||||
self._fail_count += 1
|
self._fail_count += 1
|
||||||
obj_f = np.inf
|
obj_f = np.inf
|
||||||
obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
|
obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100)
|
||||||
return obj_f, obj_grads
|
return obj_f, obj_grads
|
||||||
|
|
||||||
def optimize(self, optimizer=None, start=None, **kwargs):
|
def optimize(self, optimizer=None, start=None, **kwargs):
|
||||||
"""
|
"""
|
||||||
Optimize the Model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
|
Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
|
||||||
kwargs are passed to the optimizer. They can be:
|
kwargs are passed to the optimizer. They can be:
|
||||||
|
|
||||||
:max_f_eval: maximum number of function evaluations
|
:param max_f_eval: maximum number of function evaluations
|
||||||
|
:type max_f_eval: int
|
||||||
:messages: whether to display during optimisation
|
:messages: whether to display during optimisation
|
||||||
|
:type messages: bool
|
||||||
:param optimzer: which optimizer to use (defaults to self.preferred optimizer)
|
:param optimzer: which optimizer to use (defaults to self.preferred optimizer)
|
||||||
:type optimzer: string TODO: valid strings?
|
:type optimzer: string TODO: valid strings?
|
||||||
"""
|
"""
|
||||||
|
|
@ -293,7 +353,9 @@ class Model(Parameterised):
|
||||||
|
|
||||||
optimizer = optimization.get_optimizer(optimizer)
|
optimizer = optimization.get_optimizer(optimizer)
|
||||||
opt = optimizer(start, model=self, **kwargs)
|
opt = optimizer(start, model=self, **kwargs)
|
||||||
|
|
||||||
opt.run(f_fp=self.objective_and_gradients, f=self.objective_function, fp=self.objective_function_gradients)
|
opt.run(f_fp=self.objective_and_gradients, f=self.objective_function, fp=self.objective_function_gradients)
|
||||||
|
|
||||||
self.optimization_runs.append(opt)
|
self.optimization_runs.append(opt)
|
||||||
|
|
||||||
self._set_params_transformed(opt.x_opt)
|
self._set_params_transformed(opt.x_opt)
|
||||||
|
|
@ -305,14 +367,14 @@ class Model(Parameterised):
|
||||||
self.optimization_runs.append(sgd)
|
self.optimization_runs.append(sgd)
|
||||||
|
|
||||||
def Laplace_covariance(self):
|
def Laplace_covariance(self):
|
||||||
"""return the covariance matric of a Laplace approximatino at the current (stationary) point"""
|
"""return the covariance matrix of a Laplace approximation at the current (stationary) point."""
|
||||||
# TODO add in the Prior contributions for MAP estimation
|
# TODO add in the prior contributions for MAP estimation
|
||||||
# TODO fix the hessian for tied, constrained and fixed components
|
# TODO fix the hessian for tied, constrained and fixed components
|
||||||
if hasattr(self, 'log_likelihood_hessian'):
|
if hasattr(self, 'log_likelihood_hessian'):
|
||||||
A = -self.log_likelihood_hessian()
|
A = -self.log_likelihood_hessian()
|
||||||
|
|
||||||
else:
|
else:
|
||||||
print "numerically calculating hessian. please be patient!"
|
print "numerically calculating Hessian. please be patient!"
|
||||||
x = self._get_params()
|
x = self._get_params()
|
||||||
def f(x):
|
def f(x):
|
||||||
self._set_params(x)
|
self._set_params(x)
|
||||||
|
|
@ -326,8 +388,8 @@ class Model(Parameterised):
|
||||||
return A
|
return A
|
||||||
|
|
||||||
def Laplace_evidence(self):
|
def Laplace_evidence(self):
|
||||||
"""Returns an estiamte of the Model evidence based on the Laplace approximation.
|
"""Returns an estiamte of the model evidence based on the Laplace approximation.
|
||||||
Uses a numerical estimate of the hessian if none is available analytically"""
|
Uses a numerical estimate of the Hessian if none is available analytically."""
|
||||||
A = self.Laplace_covariance()
|
A = self.Laplace_covariance()
|
||||||
try:
|
try:
|
||||||
hld = np.sum(np.log(np.diag(jitchol(A)[0])))
|
hld = np.sum(np.log(np.diag(jitchol(A)[0])))
|
||||||
|
|
@ -335,40 +397,45 @@ class Model(Parameterised):
|
||||||
return np.nan
|
return np.nan
|
||||||
return 0.5 * self._get_params().size * np.log(2 * np.pi) + self.log_likelihood() - hld
|
return 0.5 * self._get_params().size * np.log(2 * np.pi) + self.log_likelihood() - hld
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self, names=None):
|
||||||
s = Parameterised.__str__(self).split('\n')
|
if names is None:
|
||||||
|
names = self._get_print_names()
|
||||||
|
s = Parameterized.__str__(self, names=names).split('\n')
|
||||||
# add priors to the string
|
# add priors to the string
|
||||||
if self.priors is not None:
|
if self.priors is not None:
|
||||||
strs = [str(p) if p is not None else '' for p in self.priors]
|
strs = [str(p) if p is not None else '' for p in self.priors]
|
||||||
else:
|
else:
|
||||||
strs = [''] * len(self._get_params())
|
strs = [''] * len(self._get_param_names())
|
||||||
|
name_indices = self.grep_param_names("|".join(names))
|
||||||
|
strs = np.array(strs)[name_indices]
|
||||||
width = np.array(max([len(p) for p in strs] + [5])) + 4
|
width = np.array(max([len(p) for p in strs] + [5])) + 4
|
||||||
|
|
||||||
log_like = self.log_likelihood()
|
log_like = self.log_likelihood()
|
||||||
log_prior = self.log_prior()
|
log_prior = self.log_prior()
|
||||||
obj_funct = '\nLog-likelihood: {0:.3e}'.format(log_like)
|
obj_funct = '\nLog-likelihood: {0:.3e}'.format(log_like)
|
||||||
if len(''.join(strs)) != 0:
|
if len(''.join(strs)) != 0:
|
||||||
obj_funct += ', Log Prior: {0:.3e}, LL+Prior = {0:.3e}'.format(log_prior, log_like + log_prior)
|
obj_funct += ', Log prior: {0:.3e}, LL+prior = {0:.3e}'.format(log_prior, log_like + log_prior)
|
||||||
obj_funct += '\n\n'
|
obj_funct += '\n\n'
|
||||||
s[0] = obj_funct + s[0]
|
s[0] = obj_funct + s[0]
|
||||||
s[0] += "|{h:^{col}}".format(h='Prior', col=width)
|
s[0] += "|{h:^{col}}".format(h='prior', col=width)
|
||||||
s[1] += '-' * (width + 1)
|
s[1] += '-' * (width + 1)
|
||||||
|
|
||||||
for p in range(2, len(strs) + 2):
|
for p in range(2, len(strs) + 2):
|
||||||
s[p] += '|{Prior:^{width}}'.format(Prior=strs[p - 2], width=width)
|
s[p] += '|{prior:^{width}}'.format(prior=strs[p - 2], width=width)
|
||||||
|
|
||||||
return '\n'.join(s)
|
return '\n'.join(s)
|
||||||
|
|
||||||
|
|
||||||
def checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3):
|
def checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3):
|
||||||
"""
|
"""
|
||||||
Check the gradient of the Model by comparing to a numerical estimate.
|
Check the gradient of the ,odel by comparing to a numerical
|
||||||
If the verbose flag is passed, invividual components are tested (and printed)
|
estimate. If the verbose flag is passed, invividual
|
||||||
|
components are tested (and printed)
|
||||||
|
|
||||||
:param verbose: If True, print a "full" checking of each parameter
|
:param verbose: If True, print a "full" checking of each parameter
|
||||||
:type verbose: bool
|
:type verbose: bool
|
||||||
:param step: The size of the step around which to linearise the objective
|
:param step: The size of the step around which to linearise the objective
|
||||||
:type step: float (defaul 1e-6)
|
:type step: float (default 1e-6)
|
||||||
:param tolerance: the tolerance allowed (see note)
|
:param tolerance: the tolerance allowed (see note)
|
||||||
:type tolerance: float (default 1e-3)
|
:type tolerance: float (default 1e-3)
|
||||||
|
|
||||||
|
|
@ -391,7 +458,7 @@ class Model(Parameterised):
|
||||||
numerical_gradient = (f1 - f2) / (2 * dx)
|
numerical_gradient = (f1 - f2) / (2 * dx)
|
||||||
global_ratio = (f1 - f2) / (2 * np.dot(dx, gradient))
|
global_ratio = (f1 - f2) / (2 * np.dot(dx, gradient))
|
||||||
|
|
||||||
return (np.abs(1. - global_ratio) < tolerance) or (np.abs(gradient - numerical_gradient).mean() - 1) < tolerance
|
return (np.abs(1. - global_ratio) < tolerance) or (np.abs(gradient - numerical_gradient).mean() < tolerance)
|
||||||
else:
|
else:
|
||||||
# check the gradient of each parameter individually, and do some pretty printing
|
# check the gradient of each parameter individually, and do some pretty printing
|
||||||
try:
|
try:
|
||||||
|
|
@ -445,25 +512,27 @@ class Model(Parameterised):
|
||||||
|
|
||||||
def input_sensitivity(self):
|
def input_sensitivity(self):
|
||||||
"""
|
"""
|
||||||
return an array describing the sesitivity of the Model to each input
|
return an array describing the sesitivity of the model to each input
|
||||||
|
|
||||||
NB. Right now, we're basing this on the lengthscales (or
|
NB. Right now, we're basing this on the lengthscales (or
|
||||||
variances) of the kernel. TODO: proper sensitivity analysis
|
variances) of the kernel. TODO: proper sensitivity analysis
|
||||||
where we integrate across the Model inputs and evaluate the
|
where we integrate across the model inputs and evaluate the
|
||||||
effect on the variance of the Model output. """
|
effect on the variance of the model output. """
|
||||||
|
|
||||||
if not hasattr(self, 'kern'):
|
if not hasattr(self, 'kern'):
|
||||||
raise ValueError, "this Model has no kernel"
|
raise ValueError, "this model has no kernel"
|
||||||
|
|
||||||
k = [p for p in self.kern.parts if p.name in ['rbf', 'linear']]
|
k = [p for p in self.kern.parts if p.name in ['rbf', 'linear', 'rbf_inv']]
|
||||||
if (not len(k) == 1) or (not k[0].ARD):
|
if (not len(k) == 1) or (not k[0].ARD):
|
||||||
raise ValueError, "cannot determine sensitivity for this kernel"
|
raise ValueError, "cannot determine sensitivity for this kernel"
|
||||||
k = k[0]
|
k = k[0]
|
||||||
|
|
||||||
if k.name == 'rbf':
|
if k.name == 'rbf':
|
||||||
return k.lengthscale
|
return 1. / k.lengthscale
|
||||||
|
elif k.name == 'rbf_inv':
|
||||||
|
return k.inv_lengthscale
|
||||||
elif k.name == 'linear':
|
elif k.name == 'linear':
|
||||||
return 1. / k.variances
|
return k.variances
|
||||||
|
|
||||||
|
|
||||||
def pseudo_EM(self, epsilon=.1, **kwargs):
|
def pseudo_EM(self, epsilon=.1, **kwargs):
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ import cPickle
|
||||||
import warnings
|
import warnings
|
||||||
import transformations
|
import transformations
|
||||||
|
|
||||||
class Parameterised(object):
|
class Parameterized(object):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
"""
|
"""
|
||||||
This is the base class for model and kernel. Mostly just handles tieing and constraining of parameters
|
This is the base class for model and kernel. Mostly just handles tieing and constraining of parameters
|
||||||
|
|
@ -20,55 +20,66 @@ class Parameterised(object):
|
||||||
self.constrained_indices = []
|
self.constrained_indices = []
|
||||||
self.constraints = []
|
self.constraints = []
|
||||||
|
|
||||||
def pickle(self, filename, protocol= -1):
|
def _get_params(self):
|
||||||
f = file(filename, 'w')
|
raise NotImplementedError, "this needs to be implemented to use the Parameterized class"
|
||||||
cPickle.dump(self, f, protocol)
|
def _set_params(self, x):
|
||||||
f.close()
|
raise NotImplementedError, "this needs to be implemented to use the Parameterized class"
|
||||||
|
|
||||||
|
def _get_param_names(self):
|
||||||
|
raise NotImplementedError, "this needs to be implemented to use the Parameterized class"
|
||||||
|
def _get_print_names(self):
|
||||||
|
""" Override for which names to print out, when using print m """
|
||||||
|
return self._get_param_names()
|
||||||
|
|
||||||
|
def pickle(self, filename, protocol=None):
|
||||||
|
if protocol is None:
|
||||||
|
if self._has_get_set_state():
|
||||||
|
protocol = 0
|
||||||
|
else:
|
||||||
|
protocol = -1
|
||||||
|
with open(filename, 'w') as f:
|
||||||
|
cPickle.dump(self, f, protocol)
|
||||||
|
|
||||||
def copy(self):
|
def copy(self):
|
||||||
"""Returns a (deep) copy of the current model """
|
"""Returns a (deep) copy of the current model """
|
||||||
return copy.deepcopy(self)
|
return copy.deepcopy(self)
|
||||||
|
|
||||||
@property
|
def __getstate__(self):
|
||||||
def params(self):
|
if self._has_get_set_state():
|
||||||
|
return self.getstate()
|
||||||
|
return self.__dict__
|
||||||
|
|
||||||
|
def __setstate__(self, state):
|
||||||
|
if self._has_get_set_state():
|
||||||
|
self.setstate(state) # set state
|
||||||
|
self._set_params(self._get_params()) # restore all values
|
||||||
|
return
|
||||||
|
self.__dict__ = state
|
||||||
|
|
||||||
|
def _has_get_set_state(self):
|
||||||
|
return 'getstate' in vars(self.__class__) and 'setstate' in vars(self.__class__)
|
||||||
|
|
||||||
|
def getstate(self):
|
||||||
"""
|
"""
|
||||||
Returns a **copy** of parameters in non transformed space
|
Get the current state of the class,
|
||||||
|
here just all the indices, rest can get recomputed
|
||||||
:see_also: :py:func:`GPy.core.Parameterised.params_transformed`
|
|
||||||
|
For inheriting from Parameterized:
|
||||||
|
Allways append the state of the inherited object
|
||||||
|
and call down to the inherited object in setstate!!
|
||||||
"""
|
"""
|
||||||
return self._get_params()
|
return [self.tied_indices,
|
||||||
|
self.fixed_indices,
|
||||||
|
self.fixed_values,
|
||||||
|
self.constrained_indices,
|
||||||
|
self.constraints]
|
||||||
|
|
||||||
@params.setter
|
def setstate(self, state):
|
||||||
def params(self, params):
|
self.constraints = state.pop()
|
||||||
self._set_params(params)
|
self.constrained_indices = state.pop()
|
||||||
|
self.fixed_values = state.pop()
|
||||||
@property
|
self.fixed_indices = state.pop()
|
||||||
def params_transformed(self):
|
self.tied_indices = state.pop()
|
||||||
"""
|
|
||||||
Returns a **copy** of parameters in transformed space
|
|
||||||
|
|
||||||
:see_also: :py:func:`GPy.core.Parameterised.params`
|
|
||||||
"""
|
|
||||||
return self._get_params_transformed()
|
|
||||||
|
|
||||||
@params_transformed.setter
|
|
||||||
def params_transformed(self, params):
|
|
||||||
self._set_params_transformed(params)
|
|
||||||
|
|
||||||
_get_set_deprecation = """get and set methods wont be available at next minor release
|
|
||||||
in the next releases you will get and set with following syntax:
|
|
||||||
Assume m is a model class:
|
|
||||||
print m['var'] # > prints all parameters matching 'var'
|
|
||||||
m['var'] = 2. # > sets all parameters matching 'var' to 2.
|
|
||||||
m['var'] = <array-like> # > sets parameters matching 'var' to <array-like>
|
|
||||||
"""
|
|
||||||
def get(self, regexp):
|
|
||||||
warnings.warn(self._get_set_deprecation, FutureWarning, stacklevel=2)
|
|
||||||
return self[regexp]
|
|
||||||
|
|
||||||
def set(self, regexp, val):
|
|
||||||
warnings.warn(self._get_set_deprecation, FutureWarning, stacklevel=2)
|
|
||||||
self[regexp] = val
|
|
||||||
|
|
||||||
def __getitem__(self, regexp, return_names=False):
|
def __getitem__(self, regexp, return_names=False):
|
||||||
"""
|
"""
|
||||||
|
|
@ -95,13 +106,16 @@ class Parameterised(object):
|
||||||
if len(matches):
|
if len(matches):
|
||||||
val = np.array(val)
|
val = np.array(val)
|
||||||
assert (val.size == 1) or val.size == len(matches), "Shape mismatch: {}:({},)".format(val.size, len(matches))
|
assert (val.size == 1) or val.size == len(matches), "Shape mismatch: {}:({},)".format(val.size, len(matches))
|
||||||
x = self.params
|
x = self._get_params()
|
||||||
x[matches] = val
|
x[matches] = val
|
||||||
self.params = x
|
self._set_params(x)
|
||||||
else:
|
else:
|
||||||
raise AttributeError, "no parameter matches %s" % name
|
raise AttributeError, "no parameter matches %s" % name
|
||||||
|
|
||||||
def tie_params(self, regexp):
|
def tie_params(self, regexp):
|
||||||
|
"""
|
||||||
|
Tie (all!) parameters matching the regular expression `regexp`.
|
||||||
|
"""
|
||||||
matches = self.grep_param_names(regexp)
|
matches = self.grep_param_names(regexp)
|
||||||
assert matches.size > 0, "need at least something to tie together"
|
assert matches.size > 0, "need at least something to tie together"
|
||||||
if len(self.tied_indices):
|
if len(self.tied_indices):
|
||||||
|
|
@ -154,7 +168,7 @@ class Parameterised(object):
|
||||||
return len(self._get_params()) - removed
|
return len(self._get_params()) - removed
|
||||||
|
|
||||||
def unconstrain(self, regexp):
|
def unconstrain(self, regexp):
|
||||||
"""Unconstrain matching parameters. does not untie parameters"""
|
"""Unconstrain matching parameters. Does not untie parameters"""
|
||||||
matches = self.grep_param_names(regexp)
|
matches = self.grep_param_names(regexp)
|
||||||
|
|
||||||
# tranformed contraints:
|
# tranformed contraints:
|
||||||
|
|
@ -181,7 +195,7 @@ class Parameterised(object):
|
||||||
|
|
||||||
def constrain_negative(self, regexp):
|
def constrain_negative(self, regexp):
|
||||||
""" Set negative constraints. """
|
""" Set negative constraints. """
|
||||||
self.constrain(regexp, transformations.negative_exponent())
|
self.constrain(regexp, transformations.negative_logexp())
|
||||||
|
|
||||||
def constrain_positive(self, regexp):
|
def constrain_positive(self, regexp):
|
||||||
""" Set positive constraints. """
|
""" Set positive constraints. """
|
||||||
|
|
@ -219,10 +233,11 @@ class Parameterised(object):
|
||||||
"""
|
"""
|
||||||
Arguments
|
Arguments
|
||||||
---------
|
---------
|
||||||
:param regexp: np.array(dtype=int), or regular expression object or string
|
:param regexp: which parameters need to be fixed.
|
||||||
:param value: a float to fix the matched values to. If the value is not specified,
|
:type regexp: ndarray(dtype=int) or regular expression object or string
|
||||||
|
:param value: the vlaue to fix the parameters to. If the value is not specified,
|
||||||
the parameter is fixed to the current value
|
the parameter is fixed to the current value
|
||||||
|
:type value: float
|
||||||
Notes
|
Notes
|
||||||
-----
|
-----
|
||||||
Fixing a parameter which is tied to another, or constrained in some way will result in an error.
|
Fixing a parameter which is tied to another, or constrained in some way will result in an error.
|
||||||
|
|
@ -321,19 +336,26 @@ class Parameterised(object):
|
||||||
n = [nn for i, nn in enumerate(n) if not i in remove]
|
n = [nn for i, nn in enumerate(n) if not i in remove]
|
||||||
return n
|
return n
|
||||||
|
|
||||||
def __str__(self, nw=30):
|
@property
|
||||||
|
def all(self):
|
||||||
|
return self.__str__(self._get_param_names())
|
||||||
|
|
||||||
|
|
||||||
|
def __str__(self, names=None, nw=30):
|
||||||
"""
|
"""
|
||||||
Return a string describing the parameter names and their ties and constraints
|
Return a string describing the parameter names and their ties and constraints
|
||||||
"""
|
"""
|
||||||
names = self._get_param_names()
|
if names is None:
|
||||||
|
names = self._get_print_names()
|
||||||
|
name_indices = self.grep_param_names("|".join(names))
|
||||||
N = len(names)
|
N = len(names)
|
||||||
|
|
||||||
if not N:
|
if not N:
|
||||||
return "This object has no free parameters."
|
return "This object has no free parameters."
|
||||||
header = ['Name', 'Value', 'Constraints', 'Ties']
|
header = ['Name', 'Value', 'Constraints', 'Ties']
|
||||||
values = self._get_params() # map(str,self._get_params())
|
values = self._get_params()[name_indices] # map(str,self._get_params())
|
||||||
# sort out the constraints
|
# sort out the constraints
|
||||||
constraints = [''] * len(names)
|
constraints = [''] * len(self._get_param_names())
|
||||||
for i, t in zip(self.constrained_indices, self.constraints):
|
for i, t in zip(self.constrained_indices, self.constraints):
|
||||||
for ii in i:
|
for ii in i:
|
||||||
constraints[ii] = t.__str__()
|
constraints[ii] = t.__str__()
|
||||||
|
|
@ -33,10 +33,11 @@ class SparseGP(GPBase):
|
||||||
|
|
||||||
self.Z = Z
|
self.Z = Z
|
||||||
self.num_inducing = Z.shape[0]
|
self.num_inducing = Z.shape[0]
|
||||||
self.likelihood = likelihood
|
# self.likelihood = likelihood
|
||||||
|
|
||||||
if X_variance is None:
|
if X_variance is None:
|
||||||
self.has_uncertain_inputs = False
|
self.has_uncertain_inputs = False
|
||||||
|
self.X_variance = None
|
||||||
else:
|
else:
|
||||||
assert X_variance.shape == X.shape
|
assert X_variance.shape == X.shape
|
||||||
self.has_uncertain_inputs = True
|
self.has_uncertain_inputs = True
|
||||||
|
|
@ -49,6 +50,25 @@ class SparseGP(GPBase):
|
||||||
if self.has_uncertain_inputs:
|
if self.has_uncertain_inputs:
|
||||||
self.X_variance /= np.square(self._Xscale)
|
self.X_variance /= np.square(self._Xscale)
|
||||||
|
|
||||||
|
self._const_jitter = None
|
||||||
|
|
||||||
|
def getstate(self):
|
||||||
|
"""
|
||||||
|
Get the current state of the class,
|
||||||
|
here just all the indices, rest can get recomputed
|
||||||
|
"""
|
||||||
|
return GPBase.getstate(self) + [self.Z,
|
||||||
|
self.num_inducing,
|
||||||
|
self.has_uncertain_inputs,
|
||||||
|
self.X_variance]
|
||||||
|
|
||||||
|
def setstate(self, state):
|
||||||
|
self.X_variance = state.pop()
|
||||||
|
self.has_uncertain_inputs = state.pop()
|
||||||
|
self.num_inducing = state.pop()
|
||||||
|
self.Z = state.pop()
|
||||||
|
GPBase.setstate(self, state)
|
||||||
|
|
||||||
def _compute_kernel_matrices(self):
|
def _compute_kernel_matrices(self):
|
||||||
# kernel computations, using BGPLVM notation
|
# kernel computations, using BGPLVM notation
|
||||||
self.Kmm = self.kern.K(self.Z)
|
self.Kmm = self.kern.K(self.Z)
|
||||||
|
|
@ -62,12 +82,14 @@ class SparseGP(GPBase):
|
||||||
self.psi2 = None
|
self.psi2 = None
|
||||||
|
|
||||||
def _computations(self):
|
def _computations(self):
|
||||||
|
if self._const_jitter is None or not(self._const_jitter.shape[0] == self.num_inducing):
|
||||||
|
self._const_jitter = np.eye(self.num_inducing) * 1e-7
|
||||||
|
|
||||||
# factor Kmm
|
# factor Kmm
|
||||||
self.Lm = jitchol(self.Kmm)
|
self._Lm = jitchol(self.Kmm + self._const_jitter)
|
||||||
|
# TODO: no white kernel needed anymore, all noise in likelihood --------
|
||||||
|
|
||||||
# The rather complex computations of self.A
|
# The rather complex computations of self._A
|
||||||
if self.has_uncertain_inputs:
|
if self.has_uncertain_inputs:
|
||||||
if self.likelihood.is_heteroscedastic:
|
if self.likelihood.is_heteroscedastic:
|
||||||
psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.num_data, 1, 1))).sum(0)
|
psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.num_data, 1, 1))).sum(0)
|
||||||
|
|
@ -75,29 +97,32 @@ class SparseGP(GPBase):
|
||||||
psi2_beta = self.psi2.sum(0) * self.likelihood.precision
|
psi2_beta = self.psi2.sum(0) * self.likelihood.precision
|
||||||
evals, evecs = linalg.eigh(psi2_beta)
|
evals, evecs = linalg.eigh(psi2_beta)
|
||||||
clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
|
clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
|
||||||
|
if not np.array_equal(evals, clipped_evals):
|
||||||
|
pass # print evals
|
||||||
tmp = evecs * np.sqrt(clipped_evals)
|
tmp = evecs * np.sqrt(clipped_evals)
|
||||||
tmp = tmp.T
|
tmp = tmp.T
|
||||||
else:
|
else:
|
||||||
if self.likelihood.is_heteroscedastic:
|
if self.likelihood.is_heteroscedastic:
|
||||||
tmp = self.psi1 * (np.sqrt(self.likelihood.precision.flatten().reshape(self.num_data,1)))
|
tmp = self.psi1 * (np.sqrt(self.likelihood.precision.flatten().reshape(self.num_data, 1)))
|
||||||
else:
|
else:
|
||||||
tmp = self.psi1 * (np.sqrt(self.likelihood.precision))
|
tmp = self.psi1 * (np.sqrt(self.likelihood.precision))
|
||||||
tmp, _ = dtrtrs(self.Lm, np.asfortranarray(tmp.T), lower=1)
|
tmp, _ = dtrtrs(self._Lm, np.asfortranarray(tmp.T), lower=1)
|
||||||
self.A = tdot(tmp)
|
self._A = tdot(tmp)
|
||||||
|
|
||||||
|
|
||||||
# factor B
|
# factor B
|
||||||
self.B = np.eye(self.num_inducing) + self.A
|
self.B = np.eye(self.num_inducing) + self._A
|
||||||
self.LB = jitchol(self.B)
|
self.LB = jitchol(self.B)
|
||||||
|
|
||||||
#VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
|
# VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
|
||||||
self.psi1Vf = np.dot(self.psi1.T, self.likelihood.VVT_factor)
|
self.psi1Vf = np.dot(self.psi1.T, self.likelihood.VVT_factor)
|
||||||
|
|
||||||
# back substutue C into psi1Vf
|
# back substutue C into psi1Vf
|
||||||
tmp, info1 = dtrtrs(self.Lm, np.asfortranarray(self.psi1Vf), lower=1, trans=0)
|
tmp, info1 = dtrtrs(self._Lm, np.asfortranarray(self.psi1Vf), lower=1, trans=0)
|
||||||
self._LBi_Lmi_psi1Vf, _ = dtrtrs(self.LB, np.asfortranarray(tmp), lower=1, trans=0)
|
self._LBi_Lmi_psi1Vf, _ = dtrtrs(self.LB, np.asfortranarray(tmp), lower=1, trans=0)
|
||||||
tmp, info2 = dpotrs(self.LB, tmp, lower=1)
|
# tmp, info2 = dpotrs(self.LB, tmp, lower=1)
|
||||||
self.Cpsi1Vf, info3 = dtrtrs(self.Lm, tmp, lower=1, trans=1)
|
tmp, info2 = dtrtrs(self.LB, self._LBi_Lmi_psi1Vf, lower=1, trans=1)
|
||||||
|
self.Cpsi1Vf, info3 = dtrtrs(self._Lm, tmp, lower=1, trans=1)
|
||||||
|
|
||||||
# Compute dL_dKmm
|
# Compute dL_dKmm
|
||||||
tmp = tdot(self._LBi_Lmi_psi1Vf)
|
tmp = tdot(self._LBi_Lmi_psi1Vf)
|
||||||
|
|
@ -106,12 +131,12 @@ class SparseGP(GPBase):
|
||||||
tmp = -0.5 * self.DBi_plus_BiPBi
|
tmp = -0.5 * self.DBi_plus_BiPBi
|
||||||
tmp += -0.5 * self.B * self.output_dim
|
tmp += -0.5 * self.B * self.output_dim
|
||||||
tmp += self.output_dim * np.eye(self.num_inducing)
|
tmp += self.output_dim * np.eye(self.num_inducing)
|
||||||
self.dL_dKmm = backsub_both_sides(self.Lm, tmp)
|
self.dL_dKmm = backsub_both_sides(self._Lm, tmp)
|
||||||
|
|
||||||
# Compute dL_dpsi # FIXME: this is untested for the heterscedastic + uncertain inputs case
|
# Compute dL_dpsi # FIXME: this is untested for the heterscedastic + uncertain inputs case
|
||||||
self.dL_dpsi0 = -0.5 * self.output_dim * (self.likelihood.precision * np.ones([self.num_data, 1])).flatten()
|
self.dL_dpsi0 = -0.5 * self.output_dim * (self.likelihood.precision * np.ones([self.num_data, 1])).flatten()
|
||||||
self.dL_dpsi1 = np.dot(self.likelihood.VVT_factor, self.Cpsi1Vf.T)
|
self.dL_dpsi1 = np.dot(self.likelihood.VVT_factor, self.Cpsi1Vf.T)
|
||||||
dL_dpsi2_beta = 0.5 * backsub_both_sides(self.Lm, self.output_dim * np.eye(self.num_inducing) - self.DBi_plus_BiPBi)
|
dL_dpsi2_beta = 0.5 * backsub_both_sides(self._Lm, self.output_dim * np.eye(self.num_inducing) - self.DBi_plus_BiPBi)
|
||||||
|
|
||||||
if self.likelihood.is_heteroscedastic:
|
if self.likelihood.is_heteroscedastic:
|
||||||
if self.has_uncertain_inputs:
|
if self.has_uncertain_inputs:
|
||||||
|
|
@ -139,17 +164,17 @@ class SparseGP(GPBase):
|
||||||
else:
|
else:
|
||||||
# likelihood is not heterscedatic
|
# likelihood is not heterscedatic
|
||||||
self.partial_for_likelihood = -0.5 * self.num_data * self.output_dim * self.likelihood.precision + 0.5 * self.likelihood.trYYT * self.likelihood.precision ** 2
|
self.partial_for_likelihood = -0.5 * self.num_data * self.output_dim * self.likelihood.precision + 0.5 * self.likelihood.trYYT * self.likelihood.precision ** 2
|
||||||
self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0.sum() * self.likelihood.precision ** 2 - np.trace(self.A) * self.likelihood.precision)
|
self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0.sum() * self.likelihood.precision ** 2 - np.trace(self._A) * self.likelihood.precision)
|
||||||
self.partial_for_likelihood += self.likelihood.precision * (0.5 * np.sum(self.A * self.DBi_plus_BiPBi) - self.data_fit)
|
self.partial_for_likelihood += self.likelihood.precision * (0.5 * np.sum(self._A * self.DBi_plus_BiPBi) - self.data_fit)
|
||||||
|
|
||||||
def log_likelihood(self):
|
def log_likelihood(self):
|
||||||
""" Compute the (lower bound on the) log marginal likelihood """
|
""" Compute the (lower bound on the) log marginal likelihood """
|
||||||
if self.likelihood.is_heteroscedastic:
|
if self.likelihood.is_heteroscedastic:
|
||||||
A = -0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.likelihood.V*self.likelihood.Y)
|
A = -0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.likelihood.V * self.likelihood.Y)
|
||||||
B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self.A))
|
B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self._A))
|
||||||
else:
|
else:
|
||||||
A = -0.5 * self.num_data * self.output_dim * (np.log(2.*np.pi) - np.log(self.likelihood.precision)) - 0.5 * self.likelihood.precision * self.likelihood.trYYT
|
A = -0.5 * self.num_data * self.output_dim * (np.log(2.*np.pi) - np.log(self.likelihood.precision)) - 0.5 * self.likelihood.precision * self.likelihood.trYYT
|
||||||
B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self.A))
|
B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self._A))
|
||||||
C = -self.output_dim * (np.sum(np.log(np.diag(self.LB)))) # + 0.5 * self.num_inducing * np.log(sf2))
|
C = -self.output_dim * (np.sum(np.log(np.diag(self.LB)))) # + 0.5 * self.num_inducing * np.log(sf2))
|
||||||
D = 0.5 * self.data_fit
|
D = 0.5 * self.data_fit
|
||||||
return A + B + C + D + self.likelihood.Z
|
return A + B + C + D + self.likelihood.Z
|
||||||
|
|
@ -166,9 +191,12 @@ class SparseGP(GPBase):
|
||||||
return np.hstack([self.Z.flatten(), self.kern._get_params_transformed(), self.likelihood._get_params()])
|
return np.hstack([self.Z.flatten(), self.kern._get_params_transformed(), self.likelihood._get_params()])
|
||||||
|
|
||||||
def _get_param_names(self):
|
def _get_param_names(self):
|
||||||
return sum([['iip_%i_%i' % (i, j) for j in range(self.Z.shape[1])] for i in range(self.Z.shape[0])],[])\
|
return sum([['iip_%i_%i' % (i, j) for j in range(self.Z.shape[1])] for i in range(self.Z.shape[0])], [])\
|
||||||
+ self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
|
+ self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
|
||||||
|
|
||||||
|
def _get_print_names(self):
|
||||||
|
return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
|
||||||
|
|
||||||
def update_likelihood_approximation(self):
|
def update_likelihood_approximation(self):
|
||||||
"""
|
"""
|
||||||
Approximates a non-gaussian likelihood using Expectation Propagation
|
Approximates a non-gaussian likelihood using Expectation Propagation
|
||||||
|
|
@ -179,7 +207,7 @@ class SparseGP(GPBase):
|
||||||
if not isinstance(self.likelihood, Gaussian): # Updates not needed for Gaussian likelihood
|
if not isinstance(self.likelihood, Gaussian): # Updates not needed for Gaussian likelihood
|
||||||
self.likelihood.restart()
|
self.likelihood.restart()
|
||||||
if self.has_uncertain_inputs:
|
if self.has_uncertain_inputs:
|
||||||
Lmi = chol_inv(self.Lm)
|
Lmi = chol_inv(self._Lm)
|
||||||
Kmmi = tdot(Lmi.T)
|
Kmmi = tdot(Lmi.T)
|
||||||
diag_tr_psi2Kmmi = np.array([np.trace(psi2_Kmmi) for psi2_Kmmi in np.dot(self.psi2, Kmmi)])
|
diag_tr_psi2Kmmi = np.array([np.trace(psi2_Kmmi) for psi2_Kmmi in np.dot(self.psi2, Kmmi)])
|
||||||
|
|
||||||
|
|
@ -221,19 +249,20 @@ class SparseGP(GPBase):
|
||||||
return dL_dZ
|
return dL_dZ
|
||||||
|
|
||||||
def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
|
def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
|
||||||
"""Internal helper function for making predictions, does not account for normalization"""
|
"""
|
||||||
|
Internal helper function for making predictions, does not account for
|
||||||
|
normalization or likelihood function
|
||||||
|
"""
|
||||||
|
|
||||||
Bi, _ = dpotri(self.LB, lower=0) # WTH? this lower switch should be 1, but that doesn't work!
|
Bi, _ = dpotri(self.LB, lower=0) # WTH? this lower switch should be 1, but that doesn't work!
|
||||||
symmetrify(Bi)
|
symmetrify(Bi)
|
||||||
Kmmi_LmiBLmi = backsub_both_sides(self.Lm, np.eye(self.num_inducing) - Bi)
|
Kmmi_LmiBLmi = backsub_both_sides(self._Lm, np.eye(self.num_inducing) - Bi)
|
||||||
|
|
||||||
if self.Cpsi1V is None:
|
if self.Cpsi1V is None:
|
||||||
psi1V = np.dot(self.psi1.T,self.likelihood.V)
|
psi1V = np.dot(self.psi1.T, self.likelihood.V)
|
||||||
tmp, _ = dtrtrs(self.Lm, np.asfortranarray(psi1V), lower=1, trans=0)
|
tmp, _ = dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0)
|
||||||
tmp, _ = dpotrs(self.LB, tmp, lower=1)
|
tmp, _ = dpotrs(self.LB, tmp, lower=1)
|
||||||
self.Cpsi1V, _ = dtrtrs(self.Lm, tmp, lower=1, trans=1)
|
self.Cpsi1V, _ = dtrtrs(self._Lm, tmp, lower=1, trans=1)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if X_variance_new is None:
|
if X_variance_new is None:
|
||||||
Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
|
Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
|
||||||
|
|
@ -294,17 +323,19 @@ class SparseGP(GPBase):
|
||||||
return mean, var, _025pm, _975pm
|
return mean, var, _025pm, _975pm
|
||||||
|
|
||||||
def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, fignum=None, ax=None):
|
def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, fignum=None, ax=None):
|
||||||
|
|
||||||
if ax is None:
|
if ax is None:
|
||||||
fig = pb.figure(num=fignum)
|
fig = pb.figure(num=fignum)
|
||||||
ax = fig.add_subplot(111)
|
ax = fig.add_subplot(111)
|
||||||
|
|
||||||
if which_data is 'all':
|
if which_data is 'all':
|
||||||
which_data = slice(None)
|
which_data = slice(None)
|
||||||
|
|
||||||
GPBase.plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, ax=ax)
|
GPBase.plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, ax=ax)
|
||||||
|
|
||||||
|
# add the inducing inputs and some errorbars
|
||||||
if self.X.shape[1] == 1:
|
if self.X.shape[1] == 1:
|
||||||
if self.has_uncertain_inputs:
|
if self.has_uncertain_inputs:
|
||||||
Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
|
Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
|
||||||
ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
|
ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
|
||||||
xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
|
xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
|
||||||
ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
|
ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
|
||||||
|
|
|
||||||
|
|
@ -91,6 +91,58 @@ class SVIGP(GPBase):
|
||||||
self._param_steplength_trace = []
|
self._param_steplength_trace = []
|
||||||
self._vb_steplength_trace = []
|
self._vb_steplength_trace = []
|
||||||
|
|
||||||
|
def getstate(self):
|
||||||
|
steplength_params = [self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength]
|
||||||
|
return GPBase.getstate(self) + \
|
||||||
|
[self.get_vb_param(),
|
||||||
|
self.Z,
|
||||||
|
self.num_inducing,
|
||||||
|
self.has_uncertain_inputs,
|
||||||
|
self.X_variance,
|
||||||
|
self.X_batch,
|
||||||
|
self.X_variance_batch,
|
||||||
|
steplength_params,
|
||||||
|
self.batchcounter,
|
||||||
|
self.batchsize,
|
||||||
|
self.epochs,
|
||||||
|
self.momentum,
|
||||||
|
self.data_prop,
|
||||||
|
self._param_trace,
|
||||||
|
self._param_steplength_trace,
|
||||||
|
self._vb_steplength_trace,
|
||||||
|
self._ll_trace,
|
||||||
|
self._grad_trace,
|
||||||
|
self.Y,
|
||||||
|
self._permutation,
|
||||||
|
self.iterations
|
||||||
|
]
|
||||||
|
|
||||||
|
def setstate(self, state):
|
||||||
|
self.iterations = state.pop()
|
||||||
|
self._permutation = state.pop()
|
||||||
|
self.Y = state.pop()
|
||||||
|
self._grad_trace = state.pop()
|
||||||
|
self._ll_trace = state.pop()
|
||||||
|
self._vb_steplength_trace = state.pop()
|
||||||
|
self._param_steplength_trace = state.pop()
|
||||||
|
self._param_trace = state.pop()
|
||||||
|
self.data_prop = state.pop()
|
||||||
|
self.momentum = state.pop()
|
||||||
|
self.epochs = state.pop()
|
||||||
|
self.batchsize = state.pop()
|
||||||
|
self.batchcounter = state.pop()
|
||||||
|
steplength_params = state.pop()
|
||||||
|
(self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength) = steplength_params
|
||||||
|
self.X_variance_batch = state.pop()
|
||||||
|
self.X_batch = state.pop()
|
||||||
|
self.X_variance = state.pop()
|
||||||
|
self.has_uncertain_inputs = state.pop()
|
||||||
|
self.num_inducing = state.pop()
|
||||||
|
self.Z = state.pop()
|
||||||
|
vb_param = state.pop()
|
||||||
|
GPBase.setstate(self, state)
|
||||||
|
self.set_vb_param(vb_param)
|
||||||
|
|
||||||
def _compute_kernel_matrices(self):
|
def _compute_kernel_matrices(self):
|
||||||
# kernel computations, using BGPLVM notation
|
# kernel computations, using BGPLVM notation
|
||||||
self.Kmm = self.kern.K(self.Z)
|
self.Kmm = self.kern.K(self.Z)
|
||||||
|
|
@ -166,7 +218,7 @@ class SVIGP(GPBase):
|
||||||
psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.batchsize, 1, 1))).sum(0)
|
psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.batchsize, 1, 1))).sum(0)
|
||||||
else:
|
else:
|
||||||
psi2_beta = self.psi2.sum(0) * self.likelihood.precision
|
psi2_beta = self.psi2.sum(0) * self.likelihood.precision
|
||||||
evals, evecs = linalg.eigh(psi2_beta)
|
evals, evecs = np.linalg.eigh(psi2_beta)
|
||||||
clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
|
clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
|
||||||
tmp = evecs * np.sqrt(clipped_evals)
|
tmp = evecs * np.sqrt(clipped_evals)
|
||||||
else:
|
else:
|
||||||
|
|
@ -449,7 +501,7 @@ class SVIGP(GPBase):
|
||||||
ax.plot(Zu, np.zeros_like(Zu) + Z_height, 'r|', mew=1.5, markersize=12)
|
ax.plot(Zu, np.zeros_like(Zu) + Z_height, 'r|', mew=1.5, markersize=12)
|
||||||
|
|
||||||
if self.input_dim==2:
|
if self.input_dim==2:
|
||||||
ax.scatter(self.X_all[:,0], self.X_all[:,1], 20., self.Y[:,0], linewidth=0, cmap=pb.cm.jet)
|
ax.scatter(self.X[:,0], self.X[:,1], 20., self.Y[:,0], linewidth=0, cmap=pb.cm.jet)
|
||||||
ax.plot(Zu[:,0], Zu[:,1], 'w^')
|
ax.plot(Zu[:,0], Zu[:,1], 'w^')
|
||||||
|
|
||||||
def plot_traces(self):
|
def plot_traces(self):
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,8 @@
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from GPy.core.domains import POSITIVE, NEGATIVE, BOUNDED
|
from GPy.core.domains import POSITIVE, NEGATIVE, BOUNDED
|
||||||
|
import sys
|
||||||
|
lim_val = -np.log(sys.float_info.epsilon)
|
||||||
|
|
||||||
class transformation(object):
|
class transformation(object):
|
||||||
domain = None
|
domain = None
|
||||||
|
|
@ -17,7 +19,7 @@ class transformation(object):
|
||||||
""" df_dx evaluated at self.f(x)=f"""
|
""" df_dx evaluated at self.f(x)=f"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
def initialize(self, f):
|
def initialize(self, f):
|
||||||
""" produce a sensible initial values for f(x)"""
|
""" produce a sensible initial value for f(x)"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
@ -25,18 +27,34 @@ class transformation(object):
|
||||||
class logexp(transformation):
|
class logexp(transformation):
|
||||||
domain = POSITIVE
|
domain = POSITIVE
|
||||||
def f(self, x):
|
def f(self, x):
|
||||||
return np.log(1. + np.exp(x))
|
return np.where(x>lim_val, x, np.log(1. + np.exp(x)))
|
||||||
def finv(self, f):
|
def finv(self, f):
|
||||||
return np.log(np.exp(f) - 1.)
|
return np.where(f>lim_val, f, np.log(np.exp(f) - 1.))
|
||||||
def gradfactor(self, f):
|
def gradfactor(self, f):
|
||||||
ef = np.exp(f)
|
return np.where(f>lim_val, 1., 1 - np.exp(-f))
|
||||||
return (ef - 1.) / ef
|
|
||||||
def initialize(self, f):
|
def initialize(self, f):
|
||||||
|
if np.any(f < 0.):
|
||||||
|
print "Warning: changing parameters to satisfy constraints"
|
||||||
return np.abs(f)
|
return np.abs(f)
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return '(+ve)'
|
return '(+ve)'
|
||||||
|
|
||||||
class logexp_clipped(transformation):
|
class negative_logexp(transformation):
|
||||||
|
domain = NEGATIVE
|
||||||
|
def f(self, x):
|
||||||
|
return -logexp.f(x) #np.log(1. + np.exp(x))
|
||||||
|
def finv(self, f):
|
||||||
|
return logexp.finv(-f) #np.log(np.exp(-f) - 1.)
|
||||||
|
def gradfactor(self, f):
|
||||||
|
return -logexp.gradfactor(-f)
|
||||||
|
#ef = np.exp(-f)
|
||||||
|
#return -(ef - 1.) / ef
|
||||||
|
def initialize(self, f):
|
||||||
|
return -logexp.initialize(f) #np.abs(f)
|
||||||
|
def __str__(self):
|
||||||
|
return '(-ve)'
|
||||||
|
|
||||||
|
class logexp_clipped(logexp):
|
||||||
max_bound = 1e100
|
max_bound = 1e100
|
||||||
min_bound = 1e-10
|
min_bound = 1e-10
|
||||||
log_max_bound = np.log(max_bound)
|
log_max_bound = np.log(max_bound)
|
||||||
|
|
@ -64,9 +82,10 @@ class logexp_clipped(transformation):
|
||||||
return '(+ve_c)'
|
return '(+ve_c)'
|
||||||
|
|
||||||
class exponent(transformation):
|
class exponent(transformation):
|
||||||
|
# TODO: can't allow this to go to zero, need to set a lower bound. Similar with negative exponent below. See old MATLAB code.
|
||||||
domain = POSITIVE
|
domain = POSITIVE
|
||||||
def f(self, x):
|
def f(self, x):
|
||||||
return np.exp(x)
|
return np.where(x<lim_val, np.where(x>-lim_val, np.exp(x), np.exp(-lim_val)), np.exp(lim_val))
|
||||||
def finv(self, x):
|
def finv(self, x):
|
||||||
return np.log(x)
|
return np.log(x)
|
||||||
def gradfactor(self, f):
|
def gradfactor(self, f):
|
||||||
|
|
@ -78,18 +97,16 @@ class exponent(transformation):
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return '(+ve)'
|
return '(+ve)'
|
||||||
|
|
||||||
class negative_exponent(transformation):
|
class negative_exponent(exponent):
|
||||||
domain = NEGATIVE
|
domain = NEGATIVE
|
||||||
def f(self, x):
|
def f(self, x):
|
||||||
return -np.exp(x)
|
return -exponent.f(x)
|
||||||
def finv(self, x):
|
def finv(self, f):
|
||||||
return np.log(-x)
|
return exponent.finv(-f)
|
||||||
def gradfactor(self, f):
|
def gradfactor(self, f):
|
||||||
return f
|
return f
|
||||||
def initialize(self, f):
|
def initialize(self, f):
|
||||||
if np.any(f > 0.):
|
return -exponent.initialize(f) #np.abs(f)
|
||||||
print "Warning: changing parameters to satisfy constraints"
|
|
||||||
return -np.abs(f)
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return '(-ve)'
|
return '(-ve)'
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ import numpy as np
|
||||||
import GPy
|
import GPy
|
||||||
|
|
||||||
default_seed = 10000
|
default_seed = 10000
|
||||||
def crescent_data(seed=default_seed): # FIXME
|
def crescent_data(seed=default_seed, kernel=None): # FIXME
|
||||||
"""Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
|
"""Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
|
||||||
|
|
||||||
:param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
|
:param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
|
||||||
|
|
@ -32,33 +32,33 @@ def crescent_data(seed=default_seed): # FIXME
|
||||||
m.plot()
|
m.plot()
|
||||||
return m
|
return m
|
||||||
|
|
||||||
def oil(num_inducing=50):
|
def oil(num_inducing=50, max_iters=100, kernel=None):
|
||||||
"""
|
"""
|
||||||
Run a Gaussian process classification on the oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
|
Run a Gaussian process classification on the three phase oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
|
||||||
"""
|
"""
|
||||||
data = GPy.util.datasets.oil()
|
data = GPy.util.datasets.oil()
|
||||||
X = data['X'][:600,:]
|
X = data['X']
|
||||||
X_test = data['X'][600:,:]
|
Xtest = data['Xtest']
|
||||||
Y = data['Y'][:600, 0:1]
|
Y = data['Y'][:, 0:1]
|
||||||
|
Ytest = data['Ytest'][:, 0:1]
|
||||||
Y[Y.flatten()==-1] = 0
|
Y[Y.flatten()==-1] = 0
|
||||||
Y_test = data['Y'][600:, 0:1]
|
Ytest[Ytest.flatten()==-1] = 0
|
||||||
|
|
||||||
# Create GP model
|
# Create GP model
|
||||||
m = GPy.models.SparseGPClassification(X, Y,num_inducing=num_inducing)
|
m = GPy.models.SparseGPClassification(X, Y,kernel=kernel,num_inducing=num_inducing)
|
||||||
|
|
||||||
# Contrain all parameters to be positive
|
# Contrain all parameters to be positive
|
||||||
m.constrain_positive('')
|
|
||||||
m.tie_params('.*len')
|
m.tie_params('.*len')
|
||||||
m['.*len'] = 10.
|
m['.*len'] = 10.
|
||||||
m.update_likelihood_approximation()
|
m.update_likelihood_approximation()
|
||||||
|
|
||||||
# Optimize
|
# Optimize
|
||||||
m.optimize()
|
m.optimize(max_iters=max_iters)
|
||||||
print(m)
|
print(m)
|
||||||
|
|
||||||
#Test
|
#Test
|
||||||
probs = m.predict(X_test)[0]
|
probs = m.predict(Xtest)[0]
|
||||||
GPy.util.classification.conf_matrix(probs,Y_test)
|
GPy.util.classification.conf_matrix(probs,Ytest)
|
||||||
return m
|
return m
|
||||||
|
|
||||||
def toy_linear_1d_classification(seed=default_seed):
|
def toy_linear_1d_classification(seed=default_seed):
|
||||||
|
|
@ -118,7 +118,7 @@ def sparse_toy_linear_1d_classification(num_inducing=10,seed=default_seed):
|
||||||
|
|
||||||
return m
|
return m
|
||||||
|
|
||||||
def sparse_crescent_data(num_inducing=10, seed=default_seed):
|
def sparse_crescent_data(num_inducing=10, seed=default_seed, kernel=None):
|
||||||
"""
|
"""
|
||||||
Run a Gaussian process classification with DTC approxiamtion on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
|
Run a Gaussian process classification with DTC approxiamtion on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
|
||||||
|
|
||||||
|
|
@ -133,7 +133,7 @@ def sparse_crescent_data(num_inducing=10, seed=default_seed):
|
||||||
Y = data['Y']
|
Y = data['Y']
|
||||||
Y[Y.flatten()==-1]=0
|
Y[Y.flatten()==-1]=0
|
||||||
|
|
||||||
m = GPy.models.SparseGPClassification(data['X'], Y,num_inducing=num_inducing)
|
m = GPy.models.SparseGPClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing)
|
||||||
m['.*len'] = 10.
|
m['.*len'] = 10.
|
||||||
#m.update_likelihood_approximation()
|
#m.update_likelihood_approximation()
|
||||||
#m.optimize()
|
#m.optimize()
|
||||||
|
|
|
||||||
|
|
@ -7,26 +7,30 @@ from matplotlib import pyplot as plt, cm
|
||||||
import GPy
|
import GPy
|
||||||
from GPy.core.transformations import logexp
|
from GPy.core.transformations import logexp
|
||||||
from GPy.models.bayesian_gplvm import BayesianGPLVM
|
from GPy.models.bayesian_gplvm import BayesianGPLVM
|
||||||
|
from GPy.likelihoods.gaussian import Gaussian
|
||||||
|
|
||||||
default_seed = np.random.seed(123344)
|
default_seed = np.random.seed(123344)
|
||||||
|
|
||||||
def BGPLVM(seed=default_seed):
|
def BGPLVM(seed=default_seed):
|
||||||
N = 10
|
N = 5
|
||||||
num_inducing = 3
|
num_inducing = 4
|
||||||
Q = 2
|
Q = 3
|
||||||
D = 4
|
D = 2
|
||||||
# generate GPLVM-like data
|
# generate GPLVM-like data
|
||||||
X = np.random.rand(N, Q)
|
X = np.random.rand(N, Q)
|
||||||
k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
|
lengthscales = np.random.rand(Q)
|
||||||
|
k = (GPy.kern.rbf(Q, .5, lengthscales, ARD=True)
|
||||||
|
+ GPy.kern.white(Q, 0.01))
|
||||||
K = k.K(X)
|
K = k.K(X)
|
||||||
Y = np.random.multivariate_normal(np.zeros(N), K, Q).T
|
Y = np.random.multivariate_normal(np.zeros(N), K, D).T
|
||||||
|
lik = Gaussian(Y, normalize=True)
|
||||||
|
|
||||||
k = GPy.kern.rbf(Q, ARD=True) + GPy.kern.linear(Q, ARD=True) + GPy.kern.rbf(Q, ARD=True) + GPy.kern.white(Q)
|
k = GPy.kern.rbf_inv(Q, .5, np.ones(Q) * 2., ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q)
|
||||||
# k = GPy.kern.rbf(Q) + GPy.kern.rbf(Q) + GPy.kern.white(Q)
|
|
||||||
# k = GPy.kern.rbf(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
|
# k = GPy.kern.rbf(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
|
||||||
# k = GPy.kern.rbf(Q, ARD = False) + GPy.kern.white(Q, 0.00001)
|
# k = GPy.kern.rbf(Q, ARD = False) + GPy.kern.white(Q, 0.00001)
|
||||||
|
|
||||||
m = GPy.models.BayesianGPLVM(Y, Q, kernel=k, num_inducing=num_inducing)
|
m = GPy.models.BayesianGPLVM(lik, Q, kernel=k, num_inducing=num_inducing)
|
||||||
|
m.lengthscales = lengthscales
|
||||||
# m.constrain_positive('(rbf|bias|noise|white|S)')
|
# m.constrain_positive('(rbf|bias|noise|white|S)')
|
||||||
# m.constrain_fixed('S', 1)
|
# m.constrain_fixed('S', 1)
|
||||||
|
|
||||||
|
|
@ -37,8 +41,8 @@ def BGPLVM(seed=default_seed):
|
||||||
# m.optimize(messages = 1)
|
# m.optimize(messages = 1)
|
||||||
# m.plot()
|
# m.plot()
|
||||||
# pb.title('After optimisation')
|
# pb.title('After optimisation')
|
||||||
m.randomize()
|
# m.randomize()
|
||||||
m.checkgrad(verbose=1)
|
# m.checkgrad(verbose=1)
|
||||||
|
|
||||||
return m
|
return m
|
||||||
|
|
||||||
|
|
@ -60,6 +64,28 @@ def GPLVM_oil_100(optimize=True):
|
||||||
m.plot_latent(labels=m.data_labels)
|
m.plot_latent(labels=m.data_labels)
|
||||||
return m
|
return m
|
||||||
|
|
||||||
|
def sparseGPLVM_oil(optimize=True, N=100, Q=6, num_inducing=15, max_iters=50):
|
||||||
|
np.random.seed(0)
|
||||||
|
data = GPy.util.datasets.oil()
|
||||||
|
|
||||||
|
Y = data['X'][:N]
|
||||||
|
Y = Y - Y.mean(0)
|
||||||
|
Y /= Y.std(0)
|
||||||
|
|
||||||
|
# create simple GP model
|
||||||
|
kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q)
|
||||||
|
m = GPy.models.SparseGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing)
|
||||||
|
m.data_labels = data['Y'].argmax(axis=1)
|
||||||
|
|
||||||
|
# optimize
|
||||||
|
if optimize:
|
||||||
|
m.optimize('scg', messages=1, max_iters=max_iters)
|
||||||
|
|
||||||
|
# plot
|
||||||
|
print(m)
|
||||||
|
# m.plot_latent(labels=m.data_labels)
|
||||||
|
return m
|
||||||
|
|
||||||
def swiss_roll(optimize=True, N=1000, num_inducing=15, Q=4, sigma=.2, plot=False):
|
def swiss_roll(optimize=True, N=1000, num_inducing=15, Q=4, sigma=.2, plot=False):
|
||||||
from GPy.util.datasets import swiss_roll_generated
|
from GPy.util.datasets import swiss_roll_generated
|
||||||
from GPy.core.transformations import logexp_clipped
|
from GPy.core.transformations import logexp_clipped
|
||||||
|
|
@ -114,30 +140,33 @@ def swiss_roll(optimize=True, N=1000, num_inducing=15, Q=4, sigma=.2, plot=False
|
||||||
m.optimize('scg', messages=1)
|
m.optimize('scg', messages=1)
|
||||||
return m
|
return m
|
||||||
|
|
||||||
def BGPLVM_oil(optimize=True, N=200, Q=10, num_inducing=15, max_f_eval=50, plot=False, **k):
|
def BGPLVM_oil(optimize=True, N=200, Q=7, num_inducing=40, max_iters=1000, plot=False, **k):
|
||||||
np.random.seed(0)
|
np.random.seed(0)
|
||||||
data = GPy.util.datasets.oil()
|
data = GPy.util.datasets.oil()
|
||||||
|
|
||||||
# create simple GP model
|
# create simple GP model
|
||||||
kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2)) + GPy.kern.white(Q, np.exp(-2))
|
kernel = GPy.kern.rbf_inv(Q, 1., [.1] * Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2))
|
||||||
|
|
||||||
Y = data['X'][:N]
|
Y = data['X'][:N]
|
||||||
Yn = Y - Y.mean(0)
|
Yn = Gaussian(Y, normalize=True)
|
||||||
Yn /= Yn.std(0)
|
# Yn = Y - Y.mean(0)
|
||||||
|
# Yn /= Yn.std(0)
|
||||||
|
|
||||||
m = GPy.models.BayesianGPLVM(Yn, Q, kernel=kernel, num_inducing=num_inducing, **k)
|
m = GPy.models.BayesianGPLVM(Yn, Q, kernel=kernel, num_inducing=num_inducing, **k)
|
||||||
m.data_labels = data['Y'][:N].argmax(axis=1)
|
m.data_labels = data['Y'][:N].argmax(axis=1)
|
||||||
|
|
||||||
# m.constrain('variance|leng', logexp_clipped())
|
# m.constrain('variance|leng', logexp_clipped())
|
||||||
m['.*lengt'] = 1. # m.X.var(0).max() / m.X.var(0)
|
# m['.*lengt'] = m.X.var(0).max() / m.X.var(0)
|
||||||
m['noise'] = Yn.var() / 100.
|
m['noise'] = Yn.Y.var() / 100.
|
||||||
|
|
||||||
|
|
||||||
# optimize
|
# optimize
|
||||||
if optimize:
|
if optimize:
|
||||||
m.constrain_fixed('noise')
|
m.constrain_fixed('noise')
|
||||||
m.optimize('scg', messages=1, max_f_eval=100, gtol=.05)
|
m.optimize('scg', messages=1, max_iters=200, gtol=.05)
|
||||||
m.constrain_positive('noise')
|
m.constrain_positive('noise')
|
||||||
m.optimize('scg', messages=1, max_f_eval=max_f_eval, gtol=.05)
|
m.constrain_bounded('white', 1e-7, 1)
|
||||||
|
m.optimize('scg', messages=1, max_iters=max_iters, gtol=.05)
|
||||||
|
|
||||||
if plot:
|
if plot:
|
||||||
y = m.likelihood.Y[0, :]
|
y = m.likelihood.Y[0, :]
|
||||||
|
|
@ -186,7 +215,7 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
|
||||||
|
|
||||||
Y1 += .3 * np.random.randn(*Y1.shape)
|
Y1 += .3 * np.random.randn(*Y1.shape)
|
||||||
Y2 += .2 * np.random.randn(*Y2.shape)
|
Y2 += .2 * np.random.randn(*Y2.shape)
|
||||||
Y3 += .1 * np.random.randn(*Y3.shape)
|
Y3 += .25 * np.random.randn(*Y3.shape)
|
||||||
|
|
||||||
Y1 -= Y1.mean(0)
|
Y1 -= Y1.mean(0)
|
||||||
Y2 -= Y2.mean(0)
|
Y2 -= Y2.mean(0)
|
||||||
|
|
@ -241,29 +270,27 @@ def bgplvm_simulation_matlab_compare():
|
||||||
|
|
||||||
def bgplvm_simulation(optimize='scg',
|
def bgplvm_simulation(optimize='scg',
|
||||||
plot=True,
|
plot=True,
|
||||||
max_f_eval=2e4):
|
max_iters=2e4,
|
||||||
|
plot_sim=False):
|
||||||
# from GPy.core.transformations import logexp_clipped
|
# from GPy.core.transformations import logexp_clipped
|
||||||
D1, D2, D3, N, num_inducing, Q = 15, 8, 8, 100, 3, 5
|
D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 3, 10
|
||||||
slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot)
|
slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
|
||||||
|
|
||||||
from GPy.models import mrd
|
from GPy.models import mrd
|
||||||
from GPy import kern
|
from GPy import kern
|
||||||
reload(mrd); reload(kern)
|
reload(mrd); reload(kern)
|
||||||
|
|
||||||
|
|
||||||
Y = Ylist[0]
|
Y = Ylist[0]
|
||||||
|
|
||||||
k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2)) # + kern.bias(Q)
|
k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2)) # + kern.bias(Q)
|
||||||
m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k, _debug=True)
|
m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k)
|
||||||
|
|
||||||
# m.constrain('variance|noise', logexp_clipped())
|
# m.constrain('variance|noise', logexp_clipped())
|
||||||
m['noise'] = Y.var() / 100.
|
m['noise'] = Y.var() / 100.
|
||||||
m['linear_variance'] = .01
|
|
||||||
|
|
||||||
if optimize:
|
if optimize:
|
||||||
print "Optimizing model:"
|
print "Optimizing model:"
|
||||||
m.optimize(optimize, max_iters=max_f_eval,
|
m.optimize(optimize, max_iters=max_iters,
|
||||||
max_f_eval=max_f_eval,
|
|
||||||
messages=True, gtol=.05)
|
messages=True, gtol=.05)
|
||||||
if plot:
|
if plot:
|
||||||
m.plot_X_1d("BGPLVM Latent Space 1D")
|
m.plot_X_1d("BGPLVM Latent Space 1D")
|
||||||
|
|
@ -271,19 +298,22 @@ def bgplvm_simulation(optimize='scg',
|
||||||
return m
|
return m
|
||||||
|
|
||||||
def mrd_simulation(optimize=True, plot=True, plot_sim=True, **kw):
|
def mrd_simulation(optimize=True, plot=True, plot_sim=True, **kw):
|
||||||
D1, D2, D3, N, num_inducing, Q = 150, 200, 400, 500, 3, 7
|
D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
|
||||||
slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
|
slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
|
||||||
|
|
||||||
|
likelihood_list = [Gaussian(x, normalize=True) for x in Ylist]
|
||||||
|
|
||||||
from GPy.models import mrd
|
from GPy.models import mrd
|
||||||
from GPy import kern
|
from GPy import kern
|
||||||
|
|
||||||
reload(mrd); reload(kern)
|
reload(mrd); reload(kern)
|
||||||
|
|
||||||
k = kern.linear(Q, [.05] * Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2))
|
k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2))
|
||||||
m = mrd.MRD(Ylist, input_dim=Q, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw)
|
m = mrd.MRD(likelihood_list, input_dim=Q, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw)
|
||||||
|
m.ensure_default_constraints()
|
||||||
|
|
||||||
for i, Y in enumerate(Ylist):
|
for i, bgplvm in enumerate(m.bgplvms):
|
||||||
m['{}_noise'.format(i + 1)] = Y.var() / 100.
|
m['{}_noise'.format(i)] = bgplvm.likelihood.Y.var() / 500.
|
||||||
|
|
||||||
|
|
||||||
# DEBUG
|
# DEBUG
|
||||||
|
|
@ -291,7 +321,7 @@ def mrd_simulation(optimize=True, plot=True, plot_sim=True, **kw):
|
||||||
|
|
||||||
if optimize:
|
if optimize:
|
||||||
print "Optimizing Model:"
|
print "Optimizing Model:"
|
||||||
m.optimize(messages=1, max_iters=8e3, max_f_eval=8e3, gtol=.1)
|
m.optimize(messages=1, max_iters=8e3, gtol=.1)
|
||||||
if plot:
|
if plot:
|
||||||
m.plot_X_1d("MRD Latent Space 1D")
|
m.plot_X_1d("MRD Latent Space 1D")
|
||||||
m.plot_scales("MRD Scales")
|
m.plot_scales("MRD Scales")
|
||||||
|
|
@ -322,9 +352,9 @@ def brendan_faces():
|
||||||
|
|
||||||
return m
|
return m
|
||||||
def stick_play(range=None, frame_rate=15):
|
def stick_play(range=None, frame_rate=15):
|
||||||
data = GPy.util.datasets.stick()
|
data = GPy.util.datasets.osu_run1()
|
||||||
# optimize
|
# optimize
|
||||||
if range==None:
|
if range == None:
|
||||||
Y = data['Y'].copy()
|
Y = data['Y'].copy()
|
||||||
else:
|
else:
|
||||||
Y = data['Y'][range[0]:range[1], :].copy()
|
Y = data['Y'][range[0]:range[1], :].copy()
|
||||||
|
|
@ -333,29 +363,73 @@ def stick_play(range=None, frame_rate=15):
|
||||||
GPy.util.visualize.data_play(Y, data_show, frame_rate)
|
GPy.util.visualize.data_play(Y, data_show, frame_rate)
|
||||||
return Y
|
return Y
|
||||||
|
|
||||||
def stick():
|
def stick(kernel=None):
|
||||||
data = GPy.util.datasets.stick()
|
data = GPy.util.datasets.osu_run1()
|
||||||
|
# optimize
|
||||||
|
m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel)
|
||||||
|
m.optimize(messages=1, max_f_eval=10000)
|
||||||
|
if GPy.util.visualize.visual_available:
|
||||||
|
plt.clf
|
||||||
|
ax = m.plot_latent()
|
||||||
|
y = m.likelihood.Y[0, :]
|
||||||
|
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
|
||||||
|
lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||||
|
raw_input('Press enter to finish')
|
||||||
|
|
||||||
|
return m
|
||||||
|
|
||||||
|
def bcgplvm_linear_stick(kernel=None):
|
||||||
|
data = GPy.util.datasets.osu_run1()
|
||||||
|
# optimize
|
||||||
|
mapping = GPy.mappings.Linear(data['Y'].shape[1], 2)
|
||||||
|
m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
|
||||||
|
m.optimize(messages=1, max_f_eval=10000)
|
||||||
|
if GPy.util.visualize.visual_available:
|
||||||
|
plt.clf
|
||||||
|
ax = m.plot_latent()
|
||||||
|
y = m.likelihood.Y[0, :]
|
||||||
|
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
|
||||||
|
lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||||
|
raw_input('Press enter to finish')
|
||||||
|
|
||||||
|
return m
|
||||||
|
|
||||||
|
def bcgplvm_stick(kernel=None):
|
||||||
|
data = GPy.util.datasets.osu_run1()
|
||||||
|
# optimize
|
||||||
|
back_kernel=GPy.kern.rbf(data['Y'].shape[1], lengthscale=5.)
|
||||||
|
mapping = GPy.mappings.Kernel(X=data['Y'], output_dim=2, kernel=back_kernel)
|
||||||
|
m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
|
||||||
|
m.optimize(messages=1, max_f_eval=10000)
|
||||||
|
if GPy.util.visualize.visual_available:
|
||||||
|
plt.clf
|
||||||
|
ax = m.plot_latent()
|
||||||
|
y = m.likelihood.Y[0, :]
|
||||||
|
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
|
||||||
|
lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||||
|
raw_input('Press enter to finish')
|
||||||
|
|
||||||
|
return m
|
||||||
|
|
||||||
|
def robot_wireless():
|
||||||
|
data = GPy.util.datasets.robot_wireless()
|
||||||
# optimize
|
# optimize
|
||||||
m = GPy.models.GPLVM(data['Y'], 2)
|
m = GPy.models.GPLVM(data['Y'], 2)
|
||||||
m.optimize(messages=1, max_f_eval=10000)
|
m.optimize(messages=1, max_f_eval=10000)
|
||||||
m._set_params(m._get_params())
|
m._set_params(m._get_params())
|
||||||
plt.clf
|
plt.clf
|
||||||
ax = m.plot_latent()
|
ax = m.plot_latent()
|
||||||
y = m.likelihood.Y[0, :]
|
|
||||||
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
|
|
||||||
lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
|
||||||
raw_input('Press enter to finish')
|
|
||||||
|
|
||||||
return m
|
return m
|
||||||
|
|
||||||
def stick_bgplvm(model=None):
|
def stick_bgplvm(model=None):
|
||||||
data = GPy.util.datasets.stick()
|
data = GPy.util.datasets.osu_run1()
|
||||||
Q = 6
|
Q = 6
|
||||||
kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2)) + GPy.kern.white(Q, np.exp(-2))
|
kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2)) + GPy.kern.white(Q, np.exp(-2))
|
||||||
m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20,kernel=kernel)
|
m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel)
|
||||||
# optimize
|
# optimize
|
||||||
m.ensure_default_constraints()
|
m.ensure_default_constraints()
|
||||||
m.optimize(messages=1, max_f_eval=3000,xtol=1e-300,ftol=1e-300)
|
m.optimize('scg', messages=1, max_iters=200, xtol=1e-300, ftol=1e-300)
|
||||||
m._set_params(m._get_params())
|
m._set_params(m._get_params())
|
||||||
plt.clf, (latent_axes, sense_axes) = plt.subplots(1, 2)
|
plt.clf, (latent_axes, sense_axes) = plt.subplots(1, 2)
|
||||||
plt.sca(latent_axes)
|
plt.sca(latent_axes)
|
||||||
|
|
|
||||||
|
|
@ -9,181 +9,156 @@ import pylab as pb
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import GPy
|
import GPy
|
||||||
|
|
||||||
|
def coregionalisation_toy2(max_iters=100):
|
||||||
def toy_rbf_1d(optimizer='tnc', max_nb_eval_optim=100):
|
|
||||||
"""Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
|
|
||||||
data = GPy.util.datasets.toy_rbf_1d()
|
|
||||||
|
|
||||||
# create simple GP Model
|
|
||||||
m = GPy.models.GPRegression(data['X'],data['Y'])
|
|
||||||
|
|
||||||
# optimize
|
|
||||||
m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
|
|
||||||
# plot
|
|
||||||
m.plot()
|
|
||||||
print(m)
|
|
||||||
return m
|
|
||||||
|
|
||||||
def rogers_girolami_olympics(optim_iters=100):
|
|
||||||
"""Run a standard Gaussian process regression on the Rogers and Girolami olympics data."""
|
|
||||||
data = GPy.util.datasets.rogers_girolami_olympics()
|
|
||||||
|
|
||||||
# create simple GP Model
|
|
||||||
m = GPy.models.GPRegression(data['X'],data['Y'])
|
|
||||||
|
|
||||||
#set the lengthscale to be something sensible (defaults to 1)
|
|
||||||
m['rbf_lengthscale'] = 10
|
|
||||||
|
|
||||||
# optimize
|
|
||||||
m.optimize(max_f_eval=optim_iters)
|
|
||||||
|
|
||||||
# plot
|
|
||||||
m.plot(plot_limits = (1850, 2050))
|
|
||||||
print(m)
|
|
||||||
return m
|
|
||||||
|
|
||||||
def toy_rbf_1d_50(optim_iters=100):
|
|
||||||
"""Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
|
|
||||||
data = GPy.util.datasets.toy_rbf_1d_50()
|
|
||||||
|
|
||||||
# create simple GP Model
|
|
||||||
m = GPy.models.GPRegression(data['X'],data['Y'])
|
|
||||||
|
|
||||||
# optimize
|
|
||||||
m.optimize(max_f_eval=optim_iters)
|
|
||||||
|
|
||||||
# plot
|
|
||||||
m.plot()
|
|
||||||
print(m)
|
|
||||||
return m
|
|
||||||
|
|
||||||
def silhouette(optim_iters=100):
|
|
||||||
"""Predict the pose of a figure given a silhouette. This is a task from Agarwal and Triggs 2004 ICML paper."""
|
|
||||||
data = GPy.util.datasets.silhouette()
|
|
||||||
|
|
||||||
# create simple GP Model
|
|
||||||
m = GPy.models.GPRegression(data['X'],data['Y'])
|
|
||||||
|
|
||||||
# optimize
|
|
||||||
m.optimize(messages=True,max_f_eval=optim_iters)
|
|
||||||
|
|
||||||
print(m)
|
|
||||||
return m
|
|
||||||
|
|
||||||
def coregionalisation_toy2(optim_iters=100):
|
|
||||||
"""
|
"""
|
||||||
A simple demonstration of coregionalisation on two sinusoidal functions.
|
A simple demonstration of coregionalisation on two sinusoidal functions.
|
||||||
"""
|
"""
|
||||||
X1 = np.random.rand(50,1)*8
|
X1 = np.random.rand(50, 1) * 8
|
||||||
X2 = np.random.rand(30,1)*5
|
X2 = np.random.rand(30, 1) * 5
|
||||||
index = np.vstack((np.zeros_like(X1),np.ones_like(X2)))
|
index = np.vstack((np.zeros_like(X1), np.ones_like(X2)))
|
||||||
X = np.hstack((np.vstack((X1,X2)),index))
|
X = np.hstack((np.vstack((X1, X2)), index))
|
||||||
Y1 = np.sin(X1) + np.random.randn(*X1.shape)*0.05
|
Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
|
||||||
Y2 = np.sin(X2) + np.random.randn(*X2.shape)*0.05 + 2.
|
Y2 = np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + 2.
|
||||||
Y = np.vstack((Y1,Y2))
|
Y = np.vstack((Y1, Y2))
|
||||||
|
|
||||||
k1 = GPy.kern.rbf(1) + GPy.kern.bias(1)
|
k1 = GPy.kern.rbf(1) + GPy.kern.bias(1)
|
||||||
k2 = GPy.kern.Coregionalise(2,1)
|
k2 = GPy.kern.coregionalise(2, 1)
|
||||||
k = k1.prod(k2,tensor=True)
|
k = k1**k2
|
||||||
m = GPy.models.GPRegression(X,Y,kernel=k)
|
m = GPy.models.GPRegression(X, Y, kernel=k)
|
||||||
m.constrain_fixed('.*rbf_var',1.)
|
m.constrain_fixed('.*rbf_var', 1.)
|
||||||
#m.constrain_positive('.*kappa')
|
# m.constrain_positive('.*kappa')
|
||||||
m.optimize('sim',messages=1,max_f_eval=optim_iters)
|
m.optimize('sim', messages=1, max_iters=max_iters)
|
||||||
|
|
||||||
pb.figure()
|
pb.figure()
|
||||||
Xtest1 = np.hstack((np.linspace(0,9,100)[:,None],np.zeros((100,1))))
|
Xtest1 = np.hstack((np.linspace(0, 9, 100)[:, None], np.zeros((100, 1))))
|
||||||
Xtest2 = np.hstack((np.linspace(0,9,100)[:,None],np.ones((100,1))))
|
Xtest2 = np.hstack((np.linspace(0, 9, 100)[:, None], np.ones((100, 1))))
|
||||||
mean, var,low,up = m.predict(Xtest1)
|
mean, var, low, up = m.predict(Xtest1)
|
||||||
GPy.util.plot.gpplot(Xtest1[:,0],mean,low,up)
|
GPy.util.plot.gpplot(Xtest1[:, 0], mean, low, up)
|
||||||
mean, var,low,up = m.predict(Xtest2)
|
mean, var, low, up = m.predict(Xtest2)
|
||||||
GPy.util.plot.gpplot(Xtest2[:,0],mean,low,up)
|
GPy.util.plot.gpplot(Xtest2[:, 0], mean, low, up)
|
||||||
pb.plot(X1[:,0],Y1[:,0],'rx',mew=2)
|
pb.plot(X1[:, 0], Y1[:, 0], 'rx', mew=2)
|
||||||
pb.plot(X2[:,0],Y2[:,0],'gx',mew=2)
|
pb.plot(X2[:, 0], Y2[:, 0], 'gx', mew=2)
|
||||||
return m
|
return m
|
||||||
|
|
||||||
def coregionalisation_toy(optim_iters=100):
|
def coregionalisation_toy(max_iters=100):
|
||||||
"""
|
"""
|
||||||
A simple demonstration of coregionalisation on two sinusoidal functions.
|
A simple demonstration of coregionalisation on two sinusoidal functions.
|
||||||
"""
|
"""
|
||||||
X1 = np.random.rand(50,1)*8
|
X1 = np.random.rand(50, 1) * 8
|
||||||
X2 = np.random.rand(30,1)*5
|
X2 = np.random.rand(30, 1) * 5
|
||||||
index = np.vstack((np.zeros_like(X1),np.ones_like(X2)))
|
index = np.vstack((np.zeros_like(X1), np.ones_like(X2)))
|
||||||
X = np.hstack((np.vstack((X1,X2)),index))
|
X = np.hstack((np.vstack((X1, X2)), index))
|
||||||
Y1 = np.sin(X1) + np.random.randn(*X1.shape)*0.05
|
Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
|
||||||
Y2 = -np.sin(X2) + np.random.randn(*X2.shape)*0.05
|
Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
|
||||||
Y = np.vstack((Y1,Y2))
|
Y = np.vstack((Y1, Y2))
|
||||||
|
|
||||||
k1 = GPy.kern.rbf(1)
|
k1 = GPy.kern.rbf(1)
|
||||||
k2 = GPy.kern.Coregionalise(2,2)
|
k2 = GPy.kern.coregionalise(2, 2)
|
||||||
k = k1.prod(k2,tensor=True)
|
k = k1**k2 #k1.prod(k2, tensor=True)
|
||||||
m = GPy.models.GPRegression(X,Y,kernel=k)
|
m = GPy.models.GPRegression(X, Y, kernel=k)
|
||||||
m.constrain_fixed('.*rbf_var',1.)
|
m.constrain_fixed('.*rbf_var', 1.)
|
||||||
#m.constrain_positive('kappa')
|
# m.constrain_positive('kappa')
|
||||||
m.optimize(max_f_eval=optim_iters)
|
m.optimize(max_iters=max_iters)
|
||||||
|
|
||||||
pb.figure()
|
pb.figure()
|
||||||
Xtest1 = np.hstack((np.linspace(0,9,100)[:,None],np.zeros((100,1))))
|
Xtest1 = np.hstack((np.linspace(0, 9, 100)[:, None], np.zeros((100, 1))))
|
||||||
Xtest2 = np.hstack((np.linspace(0,9,100)[:,None],np.ones((100,1))))
|
Xtest2 = np.hstack((np.linspace(0, 9, 100)[:, None], np.ones((100, 1))))
|
||||||
mean, var,low,up = m.predict(Xtest1)
|
mean, var, low, up = m.predict(Xtest1)
|
||||||
GPy.util.plot.gpplot(Xtest1[:,0],mean,low,up)
|
GPy.util.plot.gpplot(Xtest1[:, 0], mean, low, up)
|
||||||
mean, var,low,up = m.predict(Xtest2)
|
mean, var, low, up = m.predict(Xtest2)
|
||||||
GPy.util.plot.gpplot(Xtest2[:,0],mean,low,up)
|
GPy.util.plot.gpplot(Xtest2[:, 0], mean, low, up)
|
||||||
pb.plot(X1[:,0],Y1[:,0],'rx',mew=2)
|
pb.plot(X1[:, 0], Y1[:, 0], 'rx', mew=2)
|
||||||
pb.plot(X2[:,0],Y2[:,0],'gx',mew=2)
|
pb.plot(X2[:, 0], Y2[:, 0], 'gx', mew=2)
|
||||||
return m
|
return m
|
||||||
|
|
||||||
|
|
||||||
def coregionalisation_sparse(optim_iters=100):
|
def coregionalisation_sparse(max_iters=100):
|
||||||
"""
|
"""
|
||||||
A simple demonstration of coregionalisation on two sinusoidal functions using sparse approximations.
|
A simple demonstration of coregionalisation on two sinusoidal functions using sparse approximations.
|
||||||
"""
|
"""
|
||||||
X1 = np.random.rand(500,1)*8
|
X1 = np.random.rand(500, 1) * 8
|
||||||
X2 = np.random.rand(300,1)*5
|
X2 = np.random.rand(300, 1) * 5
|
||||||
index = np.vstack((np.zeros_like(X1),np.ones_like(X2)))
|
index = np.vstack((np.zeros_like(X1), np.ones_like(X2)))
|
||||||
X = np.hstack((np.vstack((X1,X2)),index))
|
X = np.hstack((np.vstack((X1, X2)), index))
|
||||||
Y1 = np.sin(X1) + np.random.randn(*X1.shape)*0.05
|
Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
|
||||||
Y2 = -np.sin(X2) + np.random.randn(*X2.shape)*0.05
|
Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
|
||||||
Y = np.vstack((Y1,Y2))
|
Y = np.vstack((Y1, Y2))
|
||||||
|
|
||||||
num_inducing = 40
|
num_inducing = 40
|
||||||
Z = np.hstack((np.random.rand(num_inducing,1)*8,np.random.randint(0,2,num_inducing)[:,None]))
|
Z = np.hstack((np.random.rand(num_inducing, 1) * 8, np.random.randint(0, 2, num_inducing)[:, None]))
|
||||||
|
|
||||||
k1 = GPy.kern.rbf(1)
|
k1 = GPy.kern.rbf(1)
|
||||||
k2 = GPy.kern.Coregionalise(2,2)
|
k2 = GPy.kern.coregionalise(2, 2)
|
||||||
k = k1.prod(k2,tensor=True) + GPy.kern.white(2,0.001)
|
k = k1**k2 #.prod(k2, tensor=True) # + GPy.kern.white(2,0.001)
|
||||||
|
|
||||||
m = GPy.models.SparseGPRegression(X,Y,kernel=k,Z=Z)
|
m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z)
|
||||||
m.constrain_fixed('.*rbf_var',1.)
|
m.constrain_fixed('.*rbf_var', 1.)
|
||||||
m.constrain_fixed('iip')
|
m.constrain_fixed('iip')
|
||||||
m.constrain_bounded('noise_variance',1e-3,1e-1)
|
m.constrain_bounded('noise_variance', 1e-3, 1e-1)
|
||||||
m.optimize_restarts(5, robust=True, messages=1, max_f_eval=optim_iters)
|
# m.optimize_restarts(5, robust=True, messages=1, max_iters=max_iters, optimizer='bfgs')
|
||||||
|
m.optimize(max_iters=max_iters)
|
||||||
|
|
||||||
#plotting:
|
# plotting:
|
||||||
pb.figure()
|
pb.figure()
|
||||||
Xtest1 = np.hstack((np.linspace(0,9,100)[:,None],np.zeros((100,1))))
|
Xtest1 = np.hstack((np.linspace(0, 9, 100)[:, None], np.zeros((100, 1))))
|
||||||
Xtest2 = np.hstack((np.linspace(0,9,100)[:,None],np.ones((100,1))))
|
Xtest2 = np.hstack((np.linspace(0, 9, 100)[:, None], np.ones((100, 1))))
|
||||||
mean, var,low,up = m.predict(Xtest1)
|
mean, var, low, up = m.predict(Xtest1)
|
||||||
GPy.util.plot.gpplot(Xtest1[:,0],mean,low,up)
|
GPy.util.plot.gpplot(Xtest1[:, 0], mean, low, up)
|
||||||
mean, var,low,up = m.predict(Xtest2)
|
mean, var, low, up = m.predict(Xtest2)
|
||||||
GPy.util.plot.gpplot(Xtest2[:,0],mean,low,up)
|
GPy.util.plot.gpplot(Xtest2[:, 0], mean, low, up)
|
||||||
pb.plot(X1[:,0],Y1[:,0],'rx',mew=2)
|
pb.plot(X1[:, 0], Y1[:, 0], 'rx', mew=2)
|
||||||
pb.plot(X2[:,0],Y2[:,0],'gx',mew=2)
|
pb.plot(X2[:, 0], Y2[:, 0], 'gx', mew=2)
|
||||||
y = pb.ylim()[0]
|
y = pb.ylim()[0]
|
||||||
pb.plot(Z[:,0][Z[:,1]==0],np.zeros(np.sum(Z[:,1]==0))+y,'r|',mew=2)
|
pb.plot(Z[:, 0][Z[:, 1] == 0], np.zeros(np.sum(Z[:, 1] == 0)) + y, 'r|', mew=2)
|
||||||
pb.plot(Z[:,0][Z[:,1]==1],np.zeros(np.sum(Z[:,1]==1))+y,'g|',mew=2)
|
pb.plot(Z[:, 0][Z[:, 1] == 1], np.zeros(np.sum(Z[:, 1] == 1)) + y, 'g|', mew=2)
|
||||||
|
return m
|
||||||
|
|
||||||
|
def epomeo_gpx(max_iters=100):
|
||||||
|
"""Perform Gaussian process regression on the latitude and longitude data from the Mount Epomeo runs. Requires gpxpy to be installed on your system to load in the data."""
|
||||||
|
data = GPy.util.datasets.epomeo_gpx()
|
||||||
|
num_data_list = []
|
||||||
|
for Xpart in data['X']:
|
||||||
|
num_data_list.append(Xpart.shape[0])
|
||||||
|
|
||||||
|
num_data_array = np.array(num_data_list)
|
||||||
|
num_data = num_data_array.sum()
|
||||||
|
Y = np.zeros((num_data, 2))
|
||||||
|
t = np.zeros((num_data, 2))
|
||||||
|
start = 0
|
||||||
|
for Xpart, index in zip(data['X'], range(len(data['X']))):
|
||||||
|
end = start+Xpart.shape[0]
|
||||||
|
t[start:end, :] = np.hstack((Xpart[:, 0:1],
|
||||||
|
index*np.ones((Xpart.shape[0], 1))))
|
||||||
|
Y[start:end, :] = Xpart[:, 1:3]
|
||||||
|
|
||||||
|
num_inducing = 200
|
||||||
|
Z = np.hstack((np.linspace(t[:,0].min(), t[:, 0].max(), num_inducing)[:, None],
|
||||||
|
np.random.randint(0, 4, num_inducing)[:, None]))
|
||||||
|
|
||||||
|
k1 = GPy.kern.rbf(1)
|
||||||
|
k2 = GPy.kern.coregionalise(output_dim=5, rank=5)
|
||||||
|
k = k1**k2
|
||||||
|
|
||||||
|
m = GPy.models.SparseGPRegression(t, Y, kernel=k, Z=Z, normalize_Y=True)
|
||||||
|
m.constrain_fixed('.*rbf_var', 1.)
|
||||||
|
m.constrain_fixed('iip')
|
||||||
|
m.constrain_bounded('noise_variance', 1e-3, 1e-1)
|
||||||
|
# m.optimize_restarts(5, robust=True, messages=1, max_iters=max_iters, optimizer='bfgs')
|
||||||
|
m.optimize(max_iters=max_iters,messages=True)
|
||||||
|
|
||||||
return m
|
return m
|
||||||
|
|
||||||
|
|
||||||
def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000, optim_iters=300):
|
def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=10000, max_iters=300):
|
||||||
"""Show an example of a multimodal error surface for Gaussian process regression. Gene 939 has bimodal behaviour where the noisey mode is higher."""
|
"""Show an example of a multimodal error surface for Gaussian process regression. Gene 939 has bimodal behaviour where the noisy mode is higher."""
|
||||||
|
|
||||||
# Contour over a range of length scales and signal/noise ratios.
|
# Contour over a range of length scales and signal/noise ratios.
|
||||||
length_scales = np.linspace(0.1, 60., resolution)
|
length_scales = np.linspace(0.1, 60., resolution)
|
||||||
log_SNRs = np.linspace(-3., 4., resolution)
|
log_SNRs = np.linspace(-3., 4., resolution)
|
||||||
|
|
||||||
data = GPy.util.datasets.della_gatta_TRP63_gene_expression(gene_number)
|
data = GPy.util.datasets.della_gatta_TRP63_gene_expression(gene_number)
|
||||||
#data['Y'] = data['Y'][0::2, :]
|
# data['Y'] = data['Y'][0::2, :]
|
||||||
#data['X'] = data['X'][0::2, :]
|
# data['X'] = data['X'][0::2, :]
|
||||||
|
|
||||||
data['Y'] = data['Y'] - np.mean(data['Y'])
|
data['Y'] = data['Y'] - np.mean(data['Y'])
|
||||||
|
|
||||||
|
|
@ -202,26 +177,26 @@ def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000
|
||||||
optim_point_y = np.empty(2)
|
optim_point_y = np.empty(2)
|
||||||
np.random.seed(seed=seed)
|
np.random.seed(seed=seed)
|
||||||
for i in range(0, model_restarts):
|
for i in range(0, model_restarts):
|
||||||
#kern = GPy.kern.rbf(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.))
|
# kern = GPy.kern.rbf(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.))
|
||||||
kern = GPy.kern.rbf(1, variance=np.random.uniform(1e-3,1), lengthscale=np.random.uniform(5,50))
|
kern = GPy.kern.rbf(1, variance=np.random.uniform(1e-3, 1), lengthscale=np.random.uniform(5, 50))
|
||||||
|
|
||||||
m = GPy.models.GPRegression(data['X'],data['Y'], kernel=kern)
|
m = GPy.models.GPRegression(data['X'], data['Y'], kernel=kern)
|
||||||
m['noise_variance'] = np.random.uniform(1e-3,1)
|
m['noise_variance'] = np.random.uniform(1e-3, 1)
|
||||||
optim_point_x[0] = m['rbf_lengthscale']
|
optim_point_x[0] = m['rbf_lengthscale']
|
||||||
optim_point_y[0] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);
|
optim_point_y[0] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);
|
||||||
|
|
||||||
# optimize
|
# optimize
|
||||||
m.optimize('scg', xtol=1e-6, ftol=1e-6, max_f_eval=optim_iters)
|
m.optimize('scg', xtol=1e-6, ftol=1e-6, max_iters=max_iters)
|
||||||
|
|
||||||
optim_point_x[1] = m['rbf_lengthscale']
|
optim_point_x[1] = m['rbf_lengthscale']
|
||||||
optim_point_y[1] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);
|
optim_point_y[1] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);
|
||||||
|
|
||||||
pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1]-optim_point_x[0], optim_point_y[1]-optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k')
|
pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1] - optim_point_x[0], optim_point_y[1] - optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k')
|
||||||
models.append(m)
|
models.append(m)
|
||||||
|
|
||||||
ax.set_xlim(xlim)
|
ax.set_xlim(xlim)
|
||||||
ax.set_ylim(ylim)
|
ax.set_ylim(ylim)
|
||||||
return m #(models, lls)
|
return m # (models, lls)
|
||||||
|
|
||||||
def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf):
|
def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf):
|
||||||
"""Evaluate the GP objective function for a given data set for a range of signal to noise ratios and a range of lengthscales.
|
"""Evaluate the GP objective function for a given data set for a range of signal to noise ratios and a range of lengthscales.
|
||||||
|
|
@ -234,88 +209,261 @@ def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf):
|
||||||
lls = []
|
lls = []
|
||||||
total_var = np.var(data['Y'])
|
total_var = np.var(data['Y'])
|
||||||
kernel = kernel_call(1, variance=1., lengthscale=1.)
|
kernel = kernel_call(1, variance=1., lengthscale=1.)
|
||||||
Model = GPy.models.GPRegression(data['X'], data['Y'], kernel=kernel)
|
model = GPy.models.GPRegression(data['X'], data['Y'], kernel=kernel)
|
||||||
for log_SNR in log_SNRs:
|
for log_SNR in log_SNRs:
|
||||||
SNR = 10.**log_SNR
|
SNR = 10.**log_SNR
|
||||||
noise_var = total_var/(1.+SNR)
|
noise_var = total_var / (1. + SNR)
|
||||||
signal_var = total_var - noise_var
|
signal_var = total_var - noise_var
|
||||||
Model.kern['.*variance'] = signal_var
|
model.kern['.*variance'] = signal_var
|
||||||
Model['noise_variance'] = noise_var
|
model['noise_variance'] = noise_var
|
||||||
length_scale_lls = []
|
length_scale_lls = []
|
||||||
|
|
||||||
for length_scale in length_scales:
|
for length_scale in length_scales:
|
||||||
Model['.*lengthscale'] = length_scale
|
model['.*lengthscale'] = length_scale
|
||||||
length_scale_lls.append(Model.log_likelihood())
|
length_scale_lls.append(model.log_likelihood())
|
||||||
|
|
||||||
lls.append(length_scale_lls)
|
lls.append(length_scale_lls)
|
||||||
|
|
||||||
return np.array(lls)
|
return np.array(lls)
|
||||||
|
|
||||||
def sparse_GP_regression_1D(N = 400, num_inducing = 5, optim_iters=100):
|
|
||||||
"""Run a 1D example of a sparse GP regression."""
|
def olympic_100m_men(max_iters=100, kernel=None):
|
||||||
# sample inputs and outputs
|
"""Run a standard Gaussian process regression on the Rogers and Girolami olympics data."""
|
||||||
X = np.random.uniform(-3.,3.,(N,1))
|
data = GPy.util.datasets.olympic_100m_men()
|
||||||
Y = np.sin(X)+np.random.randn(N,1)*0.05
|
|
||||||
# construct kernel
|
|
||||||
rbf = GPy.kern.rbf(1)
|
|
||||||
noise = GPy.kern.white(1)
|
|
||||||
kernel = rbf + noise
|
|
||||||
# create simple GP Model
|
# create simple GP Model
|
||||||
m = GPy.models.SparseGPRegression(X, Y, kernel, num_inducing=num_inducing)
|
m = GPy.models.GPRegression(data['X'], data['Y'], kernel)
|
||||||
|
|
||||||
|
# set the lengthscale to be something sensible (defaults to 1)
|
||||||
|
if kernel==None:
|
||||||
|
m['rbf_lengthscale'] = 10
|
||||||
|
|
||||||
m.checkgrad(verbose=1)
|
# optimize
|
||||||
m.optimize('tnc', messages = 1, max_f_eval=optim_iters)
|
m.optimize(max_iters=max_iters)
|
||||||
m.plot()
|
|
||||||
|
# plot
|
||||||
|
m.plot(plot_limits=(1850, 2050))
|
||||||
|
print(m)
|
||||||
return m
|
return m
|
||||||
|
|
||||||
def sparse_GP_regression_2D(N = 400, num_inducing = 50, optim_iters=100):
|
def olympic_marathon_men(max_iters=100, kernel=None):
|
||||||
"""Run a 2D example of a sparse GP regression."""
|
"""Run a standard Gaussian process regression on the Olympic marathon data."""
|
||||||
X = np.random.uniform(-3.,3.,(N,2))
|
data = GPy.util.datasets.olympic_marathon_men()
|
||||||
Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(N,1)*0.05
|
|
||||||
|
|
||||||
# construct kernel
|
|
||||||
rbf = GPy.kern.rbf(2)
|
|
||||||
noise = GPy.kern.white(2)
|
|
||||||
kernel = rbf + noise
|
|
||||||
|
|
||||||
# create simple GP Model
|
# create simple GP Model
|
||||||
m = GPy.models.SparseGPRegression(X,Y,kernel, num_inducing = num_inducing)
|
m = GPy.models.GPRegression(data['X'], data['Y'], kernel)
|
||||||
|
|
||||||
# contrain all parameters to be positive (but not inducing inputs)
|
# set the lengthscale to be something sensible (defaults to 1)
|
||||||
m.set('.*len',2.)
|
if kernel==None:
|
||||||
|
m['rbf_lengthscale'] = 10
|
||||||
|
|
||||||
m.checkgrad()
|
# optimize
|
||||||
|
m.optimize(max_iters=max_iters)
|
||||||
|
|
||||||
# optimize and plot
|
# plot
|
||||||
m.optimize('tnc', messages = 1, max_f_eval=optim_iters)
|
m.plot(plot_limits=(1850, 2050))
|
||||||
|
print(m)
|
||||||
|
return m
|
||||||
|
|
||||||
|
def toy_rbf_1d(optimizer='tnc', max_nb_eval_optim=100):
|
||||||
|
"""Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
|
||||||
|
data = GPy.util.datasets.toy_rbf_1d()
|
||||||
|
|
||||||
|
# create simple GP Model
|
||||||
|
m = GPy.models.GPRegression(data['X'], data['Y'])
|
||||||
|
|
||||||
|
# optimize
|
||||||
|
m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
|
||||||
|
# plot
|
||||||
m.plot()
|
m.plot()
|
||||||
print(m)
|
print(m)
|
||||||
return m
|
return m
|
||||||
|
|
||||||
def uncertain_inputs_sparse_regression(optim_iters=100):
|
def toy_rbf_1d_50(max_iters=100):
|
||||||
|
"""Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
|
||||||
|
data = GPy.util.datasets.toy_rbf_1d_50()
|
||||||
|
|
||||||
|
# create simple GP Model
|
||||||
|
m = GPy.models.GPRegression(data['X'], data['Y'])
|
||||||
|
|
||||||
|
# optimize
|
||||||
|
m.optimize(max_iters=max_iters)
|
||||||
|
|
||||||
|
# plot
|
||||||
|
m.plot()
|
||||||
|
print(m)
|
||||||
|
return m
|
||||||
|
|
||||||
|
def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4):
|
||||||
|
# Create an artificial dataset where the values in the targets (Y)
|
||||||
|
# only depend in dimensions 1 and 3 of the inputs (X). Run ARD to
|
||||||
|
# see if this dependency can be recovered
|
||||||
|
X1 = np.sin(np.sort(np.random.rand(num_samples, 1) * 10, 0))
|
||||||
|
X2 = np.cos(np.sort(np.random.rand(num_samples, 1) * 10, 0))
|
||||||
|
X3 = np.exp(np.sort(np.random.rand(num_samples, 1), 0))
|
||||||
|
X4 = np.log(np.sort(np.random.rand(num_samples, 1), 0))
|
||||||
|
X = np.hstack((X1, X2, X3, X4))
|
||||||
|
|
||||||
|
Y1 = np.asarray(2 * X[:, 0] + 3).reshape(-1, 1)
|
||||||
|
Y2 = np.asarray(4 * (X[:, 2] - 1.5 * X[:, 0])).reshape(-1, 1)
|
||||||
|
Y = np.hstack((Y1, Y2))
|
||||||
|
|
||||||
|
Y = np.dot(Y, np.random.rand(2, D));
|
||||||
|
Y = Y + 0.2 * np.random.randn(Y.shape[0], Y.shape[1])
|
||||||
|
Y -= Y.mean()
|
||||||
|
Y /= Y.std()
|
||||||
|
|
||||||
|
if kernel_type == 'linear':
|
||||||
|
kernel = GPy.kern.linear(X.shape[1], ARD=1)
|
||||||
|
elif kernel_type == 'rbf_inv':
|
||||||
|
kernel = GPy.kern.rbf_inv(X.shape[1], ARD=1)
|
||||||
|
else:
|
||||||
|
kernel = GPy.kern.rbf(X.shape[1], ARD=1)
|
||||||
|
kernel += GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
|
||||||
|
m = GPy.models.GPRegression(X, Y, kernel)
|
||||||
|
# len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25
|
||||||
|
# m.set_prior('.*lengthscale',len_prior)
|
||||||
|
|
||||||
|
m.optimize(optimizer='scg', max_iters=max_iters, messages=1)
|
||||||
|
|
||||||
|
m.kern.plot_ARD()
|
||||||
|
print(m)
|
||||||
|
return m
|
||||||
|
|
||||||
|
def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4):
|
||||||
|
# Create an artificial dataset where the values in the targets (Y)
|
||||||
|
# only depend in dimensions 1 and 3 of the inputs (X). Run ARD to
|
||||||
|
# see if this dependency can be recovered
|
||||||
|
X1 = np.sin(np.sort(np.random.rand(num_samples, 1) * 10, 0))
|
||||||
|
X2 = np.cos(np.sort(np.random.rand(num_samples, 1) * 10, 0))
|
||||||
|
X3 = np.exp(np.sort(np.random.rand(num_samples, 1), 0))
|
||||||
|
X4 = np.log(np.sort(np.random.rand(num_samples, 1), 0))
|
||||||
|
X = np.hstack((X1, X2, X3, X4))
|
||||||
|
|
||||||
|
Y1 = np.asarray(2 * X[:, 0] + 3)[:, None]
|
||||||
|
Y2 = np.asarray(4 * (X[:, 2] - 1.5 * X[:, 0]))[:, None]
|
||||||
|
Y = np.hstack((Y1, Y2))
|
||||||
|
|
||||||
|
Y = np.dot(Y, np.random.rand(2, D));
|
||||||
|
Y = Y + 0.2 * np.random.randn(Y.shape[0], Y.shape[1])
|
||||||
|
Y -= Y.mean()
|
||||||
|
Y /= Y.std()
|
||||||
|
|
||||||
|
if kernel_type == 'linear':
|
||||||
|
kernel = GPy.kern.linear(X.shape[1], ARD=1)
|
||||||
|
elif kernel_type == 'rbf_inv':
|
||||||
|
kernel = GPy.kern.rbf_inv(X.shape[1], ARD=1)
|
||||||
|
else:
|
||||||
|
kernel = GPy.kern.rbf(X.shape[1], ARD=1)
|
||||||
|
kernel += GPy.kern.bias(X.shape[1])
|
||||||
|
X_variance = np.ones(X.shape) * 0.5
|
||||||
|
m = GPy.models.SparseGPRegression(X, Y, kernel, X_variance=X_variance)
|
||||||
|
# len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25
|
||||||
|
# m.set_prior('.*lengthscale',len_prior)
|
||||||
|
|
||||||
|
m.optimize(optimizer='scg', max_iters=max_iters, messages=1)
|
||||||
|
|
||||||
|
m.kern.plot_ARD()
|
||||||
|
print(m)
|
||||||
|
return m
|
||||||
|
|
||||||
|
def robot_wireless(max_iters=100, kernel=None):
|
||||||
|
"""Predict the location of a robot given wirelss signal strength readings."""
|
||||||
|
data = GPy.util.datasets.robot_wireless()
|
||||||
|
|
||||||
|
# create simple GP Model
|
||||||
|
m = GPy.models.GPRegression(data['Y'], data['X'], kernel=kernel)
|
||||||
|
|
||||||
|
# optimize
|
||||||
|
m.optimize(messages=True, max_iters=max_iters)
|
||||||
|
Xpredict = m.predict(data['Ytest'])[0]
|
||||||
|
pb.plot(data['Xtest'][:, 0], data['Xtest'][:, 1], 'r-')
|
||||||
|
pb.plot(Xpredict[:, 0], Xpredict[:, 1], 'b-')
|
||||||
|
pb.axis('equal')
|
||||||
|
pb.title('WiFi Localization with Gaussian Processes')
|
||||||
|
pb.legend(('True Location', 'Predicted Location'))
|
||||||
|
|
||||||
|
sse = ((data['Xtest'] - Xpredict)**2).sum()
|
||||||
|
print(m)
|
||||||
|
print('Sum of squares error on test data: ' + str(sse))
|
||||||
|
return m
|
||||||
|
|
||||||
|
def silhouette(max_iters=100):
|
||||||
|
"""Predict the pose of a figure given a silhouette. This is a task from Agarwal and Triggs 2004 ICML paper."""
|
||||||
|
data = GPy.util.datasets.silhouette()
|
||||||
|
|
||||||
|
# create simple GP Model
|
||||||
|
m = GPy.models.GPRegression(data['X'], data['Y'])
|
||||||
|
|
||||||
|
# optimize
|
||||||
|
m.optimize(messages=True, max_iters=max_iters)
|
||||||
|
|
||||||
|
print(m)
|
||||||
|
return m
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100):
|
||||||
|
"""Run a 1D example of a sparse GP regression."""
|
||||||
|
# sample inputs and outputs
|
||||||
|
X = np.random.uniform(-3., 3., (num_samples, 1))
|
||||||
|
Y = np.sin(X) + np.random.randn(num_samples, 1) * 0.05
|
||||||
|
# construct kernel
|
||||||
|
rbf = GPy.kern.rbf(1)
|
||||||
|
# create simple GP Model
|
||||||
|
m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing)
|
||||||
|
|
||||||
|
|
||||||
|
m.checkgrad(verbose=1)
|
||||||
|
m.optimize('tnc', messages=1, max_iters=max_iters)
|
||||||
|
m.plot()
|
||||||
|
return m
|
||||||
|
|
||||||
|
def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100):
|
||||||
|
"""Run a 2D example of a sparse GP regression."""
|
||||||
|
X = np.random.uniform(-3., 3., (num_samples, 2))
|
||||||
|
Y = np.sin(X[:, 0:1]) * np.sin(X[:, 1:2]) + np.random.randn(num_samples, 1) * 0.05
|
||||||
|
|
||||||
|
# construct kernel
|
||||||
|
rbf = GPy.kern.rbf(2)
|
||||||
|
|
||||||
|
# create simple GP Model
|
||||||
|
m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing)
|
||||||
|
|
||||||
|
# contrain all parameters to be positive (but not inducing inputs)
|
||||||
|
m['.*len'] = 2.
|
||||||
|
|
||||||
|
m.checkgrad()
|
||||||
|
|
||||||
|
# optimize and plot
|
||||||
|
m.optimize('tnc', messages=1, max_iters=max_iters)
|
||||||
|
m.plot()
|
||||||
|
print(m)
|
||||||
|
return m
|
||||||
|
|
||||||
|
def uncertain_inputs_sparse_regression(max_iters=100):
|
||||||
"""Run a 1D example of a sparse GP regression with uncertain inputs."""
|
"""Run a 1D example of a sparse GP regression with uncertain inputs."""
|
||||||
fig, axes = pb.subplots(1,2,figsize=(12,5))
|
fig, axes = pb.subplots(1, 2, figsize=(12, 5))
|
||||||
|
|
||||||
# sample inputs and outputs
|
# sample inputs and outputs
|
||||||
S = np.ones((20,1))
|
S = np.ones((20, 1))
|
||||||
X = np.random.uniform(-3.,3.,(20,1))
|
X = np.random.uniform(-3., 3., (20, 1))
|
||||||
Y = np.sin(X)+np.random.randn(20,1)*0.05
|
Y = np.sin(X) + np.random.randn(20, 1) * 0.05
|
||||||
#likelihood = GPy.likelihoods.Gaussian(Y)
|
# likelihood = GPy.likelihoods.Gaussian(Y)
|
||||||
Z = np.random.uniform(-3.,3.,(7,1))
|
Z = np.random.uniform(-3., 3., (7, 1))
|
||||||
|
|
||||||
k = GPy.kern.rbf(1) + GPy.kern.white(1)
|
k = GPy.kern.rbf(1)
|
||||||
|
|
||||||
# create simple GP Model - no input uncertainty on this one
|
# create simple GP Model - no input uncertainty on this one
|
||||||
m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z)
|
m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z)
|
||||||
m.optimize('scg', messages=1, max_f_eval=optim_iters)
|
m.optimize('scg', messages=1, max_iters=max_iters)
|
||||||
m.plot(ax=axes[0])
|
m.plot(ax=axes[0])
|
||||||
axes[0].set_title('no input uncertainty')
|
axes[0].set_title('no input uncertainty')
|
||||||
|
|
||||||
|
|
||||||
#the same Model with uncertainty
|
# the same Model with uncertainty
|
||||||
m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z, X_variance=S)
|
m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z, X_variance=S)
|
||||||
m.optimize('scg', messages=1, max_f_eval=optim_iters)
|
m.optimize('scg', messages=1, max_iters=max_iters)
|
||||||
m.plot(ax=axes[1])
|
m.plot(ax=axes[1])
|
||||||
axes[1].set_title('with input uncertainty')
|
axes[1].set_title('with input uncertainty')
|
||||||
print(m)
|
print(m)
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,7 @@ def toy_1d():
|
||||||
|
|
||||||
m = GPy.models.SVIGPRegression(X,Y, batchsize=10, Z=Z)
|
m = GPy.models.SVIGPRegression(X,Y, batchsize=10, Z=Z)
|
||||||
m.constrain_bounded('noise_variance',1e-3,1e-1)
|
m.constrain_bounded('noise_variance',1e-3,1e-1)
|
||||||
|
m.constrain_bounded('white_variance',1e-3,1e-1)
|
||||||
|
|
||||||
m.param_steplength = 1e-4
|
m.param_steplength = 1e-4
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@
|
||||||
import pylab as pb
|
import pylab as pb
|
||||||
import datetime as dt
|
import datetime as dt
|
||||||
from scipy import optimize
|
from scipy import optimize
|
||||||
|
from warnings import warn
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import rasmussens_minimize as rasm
|
import rasmussens_minimize as rasm
|
||||||
|
|
@ -129,7 +130,7 @@ class opt_lbfgsb(Optimizer):
|
||||||
opt_dict['pgtol'] = self.gtol
|
opt_dict['pgtol'] = self.gtol
|
||||||
|
|
||||||
opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint=iprint,
|
opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint=iprint,
|
||||||
maxfun=self.max_f_eval, **opt_dict)
|
maxfun=self.max_iters, **opt_dict)
|
||||||
self.x_opt = opt_result[0]
|
self.x_opt = opt_result[0]
|
||||||
self.f_opt = f_fp(self.x_opt)[0]
|
self.f_opt = f_fp(self.x_opt)[0]
|
||||||
self.funct_eval = opt_result[2]['funcalls']
|
self.funct_eval = opt_result[2]['funcalls']
|
||||||
|
|
@ -198,17 +199,22 @@ class opt_rasm(Optimizer):
|
||||||
|
|
||||||
class opt_SCG(Optimizer):
|
class opt_SCG(Optimizer):
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
|
if 'max_f_eval' in kwargs:
|
||||||
|
warn("max_f_eval deprecated for SCG optimizer: use max_iters instead!\nIgnoring max_f_eval!", FutureWarning)
|
||||||
Optimizer.__init__(self, *args, **kwargs)
|
Optimizer.__init__(self, *args, **kwargs)
|
||||||
|
|
||||||
self.opt_name = "Scaled Conjugate Gradients"
|
self.opt_name = "Scaled Conjugate Gradients"
|
||||||
|
|
||||||
def opt(self, f_fp=None, f=None, fp=None):
|
def opt(self, f_fp=None, f=None, fp=None):
|
||||||
assert not f is None
|
assert not f is None
|
||||||
assert not fp is None
|
assert not fp is None
|
||||||
|
|
||||||
opt_result = SCG(f, fp, self.x_init, display=self.messages,
|
opt_result = SCG(f, fp, self.x_init, display=self.messages,
|
||||||
maxiters=self.max_iters,
|
maxiters=self.max_iters,
|
||||||
max_f_eval=self.max_f_eval,
|
max_f_eval=self.max_f_eval,
|
||||||
xtol=self.xtol, ftol=self.ftol,
|
xtol=self.xtol, ftol=self.ftol,
|
||||||
gtol=self.gtol)
|
gtol=self.gtol)
|
||||||
|
|
||||||
self.x_opt = opt_result[0]
|
self.x_opt = opt_result[0]
|
||||||
self.trace = opt_result[1]
|
self.trace = opt_result[1]
|
||||||
self.f_opt = self.trace[-1]
|
self.f_opt = self.trace[-1]
|
||||||
|
|
|
||||||
|
|
@ -26,13 +26,16 @@ import numpy as np
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|
||||||
def print_out(len_maxiters, display, fnow, current_grad, beta, iteration):
|
def print_out(len_maxiters, fnow, current_grad, beta, iteration):
|
||||||
if display:
|
print '\r',
|
||||||
print '\r',
|
print '{0:>0{mi}g} {1:> 12e} {2:> 12e} {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r',
|
||||||
print '{0:>0{mi}g} {1:> 12e} {2:> 12e} {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r',
|
sys.stdout.flush()
|
||||||
sys.stdout.flush()
|
|
||||||
|
|
||||||
def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xtol=None, ftol=None, gtol=None):
|
def exponents(fnow, current_grad):
|
||||||
|
exps = [np.abs(fnow), current_grad]
|
||||||
|
return np.sign(exps) * np.log10(exps).astype(int)
|
||||||
|
|
||||||
|
def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True, xtol=None, ftol=None, gtol=None):
|
||||||
"""
|
"""
|
||||||
Optimisation through Scaled Conjugate Gradients (SCG)
|
Optimisation through Scaled Conjugate Gradients (SCG)
|
||||||
|
|
||||||
|
|
@ -52,11 +55,14 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
|
||||||
ftol = 1e-6
|
ftol = 1e-6
|
||||||
if gtol is None:
|
if gtol is None:
|
||||||
gtol = 1e-5
|
gtol = 1e-5
|
||||||
|
|
||||||
sigma0 = 1.0e-8
|
sigma0 = 1.0e-8
|
||||||
fold = f(x, *optargs) # Initial function value.
|
fold = f(x, *optargs) # Initial function value.
|
||||||
function_eval = 1
|
function_eval = 1
|
||||||
fnow = fold
|
fnow = fold
|
||||||
gradnew = gradf(x, *optargs) # Initial gradient.
|
gradnew = gradf(x, *optargs) # Initial gradient.
|
||||||
|
if any(np.isnan(gradnew)):
|
||||||
|
raise UnexpectedInfOrNan
|
||||||
current_grad = np.dot(gradnew, gradnew)
|
current_grad = np.dot(gradnew, gradnew)
|
||||||
gradold = gradnew.copy()
|
gradold = gradnew.copy()
|
||||||
d = -gradnew # Initial search direction.
|
d = -gradnew # Initial search direction.
|
||||||
|
|
@ -64,7 +70,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
|
||||||
nsuccess = 0 # nsuccess counts number of successes.
|
nsuccess = 0 # nsuccess counts number of successes.
|
||||||
beta = 1.0 # Initial scale parameter.
|
beta = 1.0 # Initial scale parameter.
|
||||||
betamin = 1.0e-60 # Lower bound on scale.
|
betamin = 1.0e-60 # Lower bound on scale.
|
||||||
betamax = 1.0e100 # Upper bound on scale.
|
betamax = 1.0e50 # Upper bound on scale.
|
||||||
status = "Not converged"
|
status = "Not converged"
|
||||||
|
|
||||||
flog = [fold]
|
flog = [fold]
|
||||||
|
|
@ -74,6 +80,8 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
|
||||||
len_maxiters = len(str(maxiters))
|
len_maxiters = len(str(maxiters))
|
||||||
if display:
|
if display:
|
||||||
print ' {0:{mi}s} {1:11s} {2:11s} {3:11s}'.format("I", "F", "Scale", "|g|", mi=len_maxiters)
|
print ' {0:{mi}s} {1:11s} {2:11s} {3:11s}'.format("I", "F", "Scale", "|g|", mi=len_maxiters)
|
||||||
|
exps = exponents(fnow, current_grad)
|
||||||
|
p_iter = iteration
|
||||||
|
|
||||||
# Main optimization loop.
|
# Main optimization loop.
|
||||||
while iteration < maxiters:
|
while iteration < maxiters:
|
||||||
|
|
@ -103,9 +111,9 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
|
||||||
fnew = f(xnew, *optargs)
|
fnew = f(xnew, *optargs)
|
||||||
function_eval += 1
|
function_eval += 1
|
||||||
|
|
||||||
if function_eval >= max_f_eval:
|
# if function_eval >= max_f_eval:
|
||||||
status = "Maximum number of function evaluations exceeded"
|
# status = "maximum number of function evaluations exceeded"
|
||||||
break
|
# break
|
||||||
# return x, flog, function_eval, status
|
# return x, flog, function_eval, status
|
||||||
|
|
||||||
Delta = 2.*(fnew - fold) / (alpha * mu)
|
Delta = 2.*(fnew - fold) / (alpha * mu)
|
||||||
|
|
@ -122,15 +130,28 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
|
||||||
flog.append(fnow) # Current function value
|
flog.append(fnow) # Current function value
|
||||||
|
|
||||||
iteration += 1
|
iteration += 1
|
||||||
print_out(len_maxiters, display, fnow, current_grad, beta, iteration)
|
if display:
|
||||||
|
print_out(len_maxiters, fnow, current_grad, beta, iteration)
|
||||||
|
n_exps = exponents(fnow, current_grad)
|
||||||
|
if iteration - p_iter >= 20 * np.random.rand():
|
||||||
|
a = iteration >= p_iter * 2.78
|
||||||
|
b = np.any(n_exps < exps)
|
||||||
|
if a or b:
|
||||||
|
p_iter = iteration
|
||||||
|
print ''
|
||||||
|
if b:
|
||||||
|
exps = n_exps
|
||||||
|
|
||||||
if success:
|
if success:
|
||||||
# Test for termination
|
# Test for termination
|
||||||
if (np.max(np.abs(alpha * d)) < xtol) or (np.abs(fnew - fold) < ftol):
|
|
||||||
status = 'converged'
|
if (np.abs(fnew - fold) < ftol):
|
||||||
|
status = 'converged - relative reduction in objective'
|
||||||
break
|
break
|
||||||
# return x, flog, function_eval, status
|
# return x, flog, function_eval, status
|
||||||
|
elif (np.max(np.abs(alpha * d)) < xtol):
|
||||||
|
status = 'converged - relative stepsize'
|
||||||
|
break
|
||||||
else:
|
else:
|
||||||
# Update variables for new position
|
# Update variables for new position
|
||||||
gradnew = gradf(x, *optargs)
|
gradnew = gradf(x, *optargs)
|
||||||
|
|
@ -139,7 +160,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
|
||||||
fold = fnew
|
fold = fnew
|
||||||
# If the gradient is zero then we are done.
|
# If the gradient is zero then we are done.
|
||||||
if current_grad <= gtol:
|
if current_grad <= gtol:
|
||||||
status = 'converged'
|
status = 'converged - relative reduction in gradient'
|
||||||
break
|
break
|
||||||
# return x, flog, function_eval, status
|
# return x, flog, function_eval, status
|
||||||
|
|
||||||
|
|
@ -164,6 +185,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
|
||||||
status = "maxiter exceeded"
|
status = "maxiter exceeded"
|
||||||
|
|
||||||
if display:
|
if display:
|
||||||
print_out(len_maxiters, display, fnow, current_grad, beta, iteration)
|
print_out(len_maxiters, fnow, current_grad, beta, iteration)
|
||||||
print ""
|
print ""
|
||||||
|
print status
|
||||||
return x, flog, function_eval, status
|
return x, flog, function_eval, status
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt).
|
||||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
from constructors import *
|
from constructors import *
|
||||||
|
|
@ -6,4 +6,4 @@ try:
|
||||||
from constructors import rbf_sympy, sympykern # these depend on sympy
|
from constructors import rbf_sympy, sympykern # these depend on sympy
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
from kern import kern
|
from kern import *
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,23 @@ import numpy as np
|
||||||
from kern import kern
|
from kern import kern
|
||||||
import parts
|
import parts
|
||||||
|
|
||||||
|
|
||||||
|
def rbf_inv(input_dim,variance=1., inv_lengthscale=None,ARD=False):
|
||||||
|
"""
|
||||||
|
Construct an RBF kernel
|
||||||
|
|
||||||
|
:param input_dim: dimensionality of the kernel, obligatory
|
||||||
|
:type input_dim: int
|
||||||
|
:param variance: the variance of the kernel
|
||||||
|
:type variance: float
|
||||||
|
:param lengthscale: the lengthscale of the kernel
|
||||||
|
:type lengthscale: float
|
||||||
|
:param ARD: Auto Relevance Determination (one lengthscale per dimension)
|
||||||
|
:type ARD: Boolean
|
||||||
|
"""
|
||||||
|
part = parts.rbf_inv.RBFInv(input_dim,variance,inv_lengthscale,ARD)
|
||||||
|
return kern(input_dim, [part])
|
||||||
|
|
||||||
def rbf(input_dim,variance=1., lengthscale=None,ARD=False):
|
def rbf(input_dim,variance=1., lengthscale=None,ARD=False):
|
||||||
"""
|
"""
|
||||||
Construct an RBF kernel
|
Construct an RBF kernel
|
||||||
|
|
@ -34,6 +51,78 @@ def linear(input_dim,variances=None,ARD=False):
|
||||||
part = parts.linear.Linear(input_dim,variances,ARD)
|
part = parts.linear.Linear(input_dim,variances,ARD)
|
||||||
return kern(input_dim, [part])
|
return kern(input_dim, [part])
|
||||||
|
|
||||||
|
def mlp(input_dim,variance=1., weight_variance=None,bias_variance=100.,ARD=False):
|
||||||
|
"""
|
||||||
|
Construct an MLP kernel
|
||||||
|
|
||||||
|
:param input_dim: dimensionality of the kernel, obligatory
|
||||||
|
:type input_dim: int
|
||||||
|
:param variance: the variance of the kernel
|
||||||
|
:type variance: float
|
||||||
|
:param weight_scale: the lengthscale of the kernel
|
||||||
|
:type weight_scale: vector of weight variances for input weights in neural network (length 1 if kernel is isotropic)
|
||||||
|
:param bias_variance: the variance of the biases in the neural network.
|
||||||
|
:type bias_variance: float
|
||||||
|
:param ARD: Auto Relevance Determination (allows for ARD version of covariance)
|
||||||
|
:type ARD: Boolean
|
||||||
|
"""
|
||||||
|
part = parts.mlp.MLP(input_dim,variance,weight_variance,bias_variance,ARD)
|
||||||
|
return kern(input_dim, [part])
|
||||||
|
|
||||||
|
def gibbs(input_dim,variance=1., mapping=None):
|
||||||
|
"""
|
||||||
|
Gibbs and MacKay non-stationary covariance function.
|
||||||
|
|
||||||
|
.. math::
|
||||||
|
|
||||||
|
r = sqrt((x_i - x_j)'*(x_i - x_j))
|
||||||
|
|
||||||
|
k(x_i, x_j) = \sigma^2*Z*exp(-r^2/(l(x)*l(x) + l(x')*l(x')))
|
||||||
|
|
||||||
|
Z = \sqrt{2*l(x)*l(x')/(l(x)*l(x) + l(x')*l(x')}
|
||||||
|
|
||||||
|
where :math:`l(x)` is a function giving the length scale as a function of space.
|
||||||
|
This is the non stationary kernel proposed by Mark Gibbs in his 1997
|
||||||
|
thesis. It is similar to an RBF but has a length scale that varies
|
||||||
|
with input location. This leads to an additional term in front of
|
||||||
|
the kernel.
|
||||||
|
|
||||||
|
The parameters are :math:`\sigma^2`, the process variance, and the parameters of l(x) which is a function that can be specified by the user, by default an multi-layer peceptron is used is used.
|
||||||
|
|
||||||
|
:param input_dim: the number of input dimensions
|
||||||
|
:type input_dim: int
|
||||||
|
:param variance: the variance :math:`\sigma^2`
|
||||||
|
:type variance: float
|
||||||
|
:param mapping: the mapping that gives the lengthscale across the input space.
|
||||||
|
:type mapping: GPy.core.Mapping
|
||||||
|
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter \sigma^2_w), otherwise there is one weight variance parameter per dimension.
|
||||||
|
:type ARD: Boolean
|
||||||
|
:rtype: Kernpart object
|
||||||
|
|
||||||
|
"""
|
||||||
|
part = parts.gibbs.Gibbs(input_dim,variance,mapping)
|
||||||
|
return kern(input_dim, [part])
|
||||||
|
|
||||||
|
def poly(input_dim,variance=1., weight_variance=None,bias_variance=1.,degree=2, ARD=False):
|
||||||
|
"""
|
||||||
|
Construct a polynomial kernel
|
||||||
|
|
||||||
|
:param input_dim: dimensionality of the kernel, obligatory
|
||||||
|
:type input_dim: int
|
||||||
|
:param variance: the variance of the kernel
|
||||||
|
:type variance: float
|
||||||
|
:param weight_scale: the lengthscale of the kernel
|
||||||
|
:type weight_scale: vector of weight variances for input weights.
|
||||||
|
:param bias_variance: the variance of the biases.
|
||||||
|
:type bias_variance: float
|
||||||
|
:param degree: the degree of the polynomial
|
||||||
|
:type degree: int
|
||||||
|
:param ARD: Auto Relevance Determination (allows for ARD version of covariance)
|
||||||
|
:type ARD: Boolean
|
||||||
|
"""
|
||||||
|
part = parts.poly.POLY(input_dim,variance,weight_variance,bias_variance,degree,ARD)
|
||||||
|
return kern(input_dim, [part])
|
||||||
|
|
||||||
def white(input_dim,variance=1.):
|
def white(input_dim,variance=1.):
|
||||||
"""
|
"""
|
||||||
Construct a white kernel.
|
Construct a white kernel.
|
||||||
|
|
@ -227,7 +316,7 @@ def periodic_Matern52(input_dim, variance=1., lengthscale=None, period=2 * np.pi
|
||||||
:param n_freq: the number of frequencies considered for the periodic subspace
|
:param n_freq: the number of frequencies considered for the periodic subspace
|
||||||
:type n_freq: int
|
:type n_freq: int
|
||||||
"""
|
"""
|
||||||
part = parts.periodic_Matern52part(input_dim, variance, lengthscale, period, n_freq, lower, upper)
|
part = parts.periodic_Matern52.PeriodicMatern52(input_dim, variance, lengthscale, period, n_freq, lower, upper)
|
||||||
return kern(input_dim, [part])
|
return kern(input_dim, [part])
|
||||||
|
|
||||||
def prod(k1,k2,tensor=False):
|
def prod(k1,k2,tensor=False):
|
||||||
|
|
@ -236,21 +325,44 @@ def prod(k1,k2,tensor=False):
|
||||||
|
|
||||||
:param k1, k2: the kernels to multiply
|
:param k1, k2: the kernels to multiply
|
||||||
:type k1, k2: kernpart
|
:type k1, k2: kernpart
|
||||||
|
:param tensor: The kernels are either multiply as functions defined on the same input space (default) or on the product of the input spaces
|
||||||
|
:type tensor: Boolean
|
||||||
:rtype: kernel object
|
:rtype: kernel object
|
||||||
"""
|
"""
|
||||||
part = parts.prodpart(k1,k2,tensor)
|
part = parts.prod.Prod(k1, k2, tensor)
|
||||||
return kern(part.input_dim, [part])
|
return kern(part.input_dim, [part])
|
||||||
|
|
||||||
def symmetric(k):
|
def symmetric(k):
|
||||||
"""
|
"""
|
||||||
Construct a symmetrical kernel from an existing kernel
|
Construct a symmetric kernel from an existing kernel
|
||||||
"""
|
"""
|
||||||
k_ = k.copy()
|
k_ = k.copy()
|
||||||
k_.parts = [symmetric.Symmetric(p) for p in k.parts]
|
k_.parts = [symmetric.Symmetric(p) for p in k.parts]
|
||||||
return k_
|
return k_
|
||||||
|
|
||||||
def coregionalise(Nout,R=1, W=None, kappa=None):
|
def coregionalise(output_dim, rank=1, W=None, kappa=None):
|
||||||
p = parts.coregionalise.Coregionalise(Nout,R,W,kappa)
|
"""
|
||||||
|
Coregionalisation kernel.
|
||||||
|
|
||||||
|
Used for computing covariance functions of the form
|
||||||
|
.. math::
|
||||||
|
k_2(x, y)=\mathbf{B} k(x, y)
|
||||||
|
where
|
||||||
|
.. math::
|
||||||
|
\mathbf{B} = \mathbf{W}\mathbf{W}^\top + kappa \mathbf{I}
|
||||||
|
|
||||||
|
:param output_dim: the number of output dimensions
|
||||||
|
:type output_dim: int
|
||||||
|
:param rank: the rank of the coregionalisation matrix.
|
||||||
|
:type rank: int
|
||||||
|
:param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B.
|
||||||
|
:type W: ndarray
|
||||||
|
:param kappa: a diagonal term which allows the outputs to behave independently.
|
||||||
|
:rtype: kernel object
|
||||||
|
|
||||||
|
.. Note: see coregionalisation examples in GPy.examples.regression for some usage.
|
||||||
|
"""
|
||||||
|
p = parts.coregionalise.Coregionalise(output_dim,rank,W,kappa)
|
||||||
return kern(1,[p])
|
return kern(1,[p])
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -274,11 +386,13 @@ def fixed(input_dim, K, variance=1.):
|
||||||
"""
|
"""
|
||||||
Construct a Fixed effect kernel.
|
Construct a Fixed effect kernel.
|
||||||
|
|
||||||
Arguments
|
:param input_dim: the number of input dimensions
|
||||||
---------
|
:type input_dim: int (input_dim=1 is the only value currently supported)
|
||||||
input_dim (int), obligatory
|
:param K: the variance :math:`\sigma^2`
|
||||||
K (np.array), obligatory
|
:type K: np.array
|
||||||
variance (float)
|
:param variance: kernel variance
|
||||||
|
:type variance: float
|
||||||
|
:rtype: kern object
|
||||||
"""
|
"""
|
||||||
part = parts.fixed.Fixed(input_dim, K, variance)
|
part = parts.fixed.Fixed(input_dim, K, variance)
|
||||||
return kern(input_dim, [part])
|
return kern(input_dim, [part])
|
||||||
|
|
@ -296,5 +410,14 @@ def independent_outputs(k):
|
||||||
"""
|
"""
|
||||||
for sl in k.input_slices:
|
for sl in k.input_slices:
|
||||||
assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)"
|
assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)"
|
||||||
parts = [independent_outputs.IndependentOutputs(p) for p in k.parts]
|
_parts = [parts.independent_outputs.IndependentOutputs(p) for p in k.parts]
|
||||||
return kern(k.input_dim+1,parts)
|
return kern(k.input_dim+1,_parts)
|
||||||
|
|
||||||
|
def hierarchical(k):
|
||||||
|
"""
|
||||||
|
TODO THis can't be right! Construct a kernel with independent outputs from an existing kernel
|
||||||
|
"""
|
||||||
|
# for sl in k.input_slices:
|
||||||
|
# assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)"
|
||||||
|
_parts = [parts.hierarchical.Hierarchical(k.parts)]
|
||||||
|
return kern(k.input_dim+len(k.parts),_parts)
|
||||||
|
|
|
||||||
299
GPy/kern/kern.py
299
GPy/kern/kern.py
|
|
@ -3,17 +3,21 @@
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pylab as pb
|
import pylab as pb
|
||||||
from ..core.parameterised import Parameterised
|
from ..core.parameterized import Parameterized
|
||||||
from parts.kernpart import Kernpart
|
from parts.kernpart import Kernpart
|
||||||
import itertools
|
import itertools
|
||||||
from parts.prod import Prod as prod
|
from parts.prod import Prod as prod
|
||||||
|
from matplotlib.transforms import offset_copy
|
||||||
|
|
||||||
class kern(Parameterised):
|
class kern(Parameterized):
|
||||||
def __init__(self, input_dim, parts=[], input_slices=None):
|
def __init__(self, input_dim, parts=[], input_slices=None):
|
||||||
"""
|
"""
|
||||||
This is the main kernel class for GPy. It handles multiple (additive) kernel functions, and keeps track of variaous things like which parameters live where.
|
This is the main kernel class for GPy. It handles multiple (additive) kernel functions, and keeps track of variaous things like which parameters live where.
|
||||||
|
|
||||||
The technical code for kernels is divided into _parts_ (see e.g. rbf.py). This obnject contains a list of parts, which are computed additively. For multiplication, special _prod_ parts are used.
|
The technical code for kernels is divided into _parts_ (see
|
||||||
|
e.g. rbf.py). This object contains a list of parts, which are
|
||||||
|
computed additively. For multiplication, special _prod_ parts
|
||||||
|
are used.
|
||||||
|
|
||||||
:param input_dim: The dimensionality of the kernel's input space
|
:param input_dim: The dimensionality of the kernel's input space
|
||||||
:type input_dim: int
|
:type input_dim: int
|
||||||
|
|
@ -41,26 +45,100 @@ class kern(Parameterised):
|
||||||
|
|
||||||
self.compute_param_slices()
|
self.compute_param_slices()
|
||||||
|
|
||||||
Parameterised.__init__(self)
|
Parameterized.__init__(self)
|
||||||
|
|
||||||
|
def getstate(self):
|
||||||
|
"""
|
||||||
|
Get the current state of the class,
|
||||||
|
here just all the indices, rest can get recomputed
|
||||||
|
"""
|
||||||
|
return Parameterized.getstate(self) + [self.parts,
|
||||||
|
self.Nparts,
|
||||||
|
self.num_params,
|
||||||
|
self.input_dim,
|
||||||
|
self.input_slices,
|
||||||
|
self.param_slices
|
||||||
|
]
|
||||||
|
|
||||||
|
def setstate(self, state):
|
||||||
|
self.param_slices = state.pop()
|
||||||
|
self.input_slices = state.pop()
|
||||||
|
self.input_dim = state.pop()
|
||||||
|
self.num_params = state.pop()
|
||||||
|
self.Nparts = state.pop()
|
||||||
|
self.parts = state.pop()
|
||||||
|
Parameterized.setstate(self, state)
|
||||||
|
|
||||||
|
|
||||||
def plot_ARD(self, fignum=None, ax=None):
|
def plot_ARD(self, fignum=None, ax=None, title='', legend=False):
|
||||||
"""If an ARD kernel is present, it bar-plots the ARD parameters"""
|
"""If an ARD kernel is present, it bar-plots the ARD parameters,
|
||||||
|
:param fignum: figure number of the plot
|
||||||
|
:param ax: matplotlib axis to plot on
|
||||||
|
:param title:
|
||||||
|
title of the plot,
|
||||||
|
pass '' to not print a title
|
||||||
|
pass None for a generic title
|
||||||
|
"""
|
||||||
if ax is None:
|
if ax is None:
|
||||||
fig = pb.figure(fignum)
|
fig = pb.figure(fignum)
|
||||||
ax = fig.add_subplot(111)
|
ax = fig.add_subplot(111)
|
||||||
|
else:
|
||||||
|
fig = ax.figure
|
||||||
|
from GPy.util import Tango
|
||||||
|
from matplotlib.textpath import TextPath
|
||||||
|
Tango.reset()
|
||||||
|
xticklabels = []
|
||||||
|
bars = []
|
||||||
|
x0 = 0
|
||||||
for p in self.parts:
|
for p in self.parts:
|
||||||
|
c = Tango.nextMedium()
|
||||||
if hasattr(p, 'ARD') and p.ARD:
|
if hasattr(p, 'ARD') and p.ARD:
|
||||||
ax.set_title('ARD parameters, %s kernel' % p.name)
|
if title is None:
|
||||||
|
ax.set_title('ARD parameters, %s kernel' % p.name)
|
||||||
|
else:
|
||||||
|
ax.set_title(title)
|
||||||
if p.name == 'linear':
|
if p.name == 'linear':
|
||||||
ard_params = p.variances
|
ard_params = p.variances
|
||||||
else:
|
else:
|
||||||
ard_params = 1. / p.lengthscale
|
ard_params = 1. / p.lengthscale
|
||||||
|
|
||||||
ax.bar(np.arange(len(ard_params)) - 0.4, ard_params)
|
x = np.arange(x0, x0 + len(ard_params))
|
||||||
ax.set_xticks(np.arange(len(ard_params)))
|
bars.append(ax.bar(x, ard_params, align='center', color=c, edgecolor='k', linewidth=1.2, label=p.name))
|
||||||
ax.set_xticklabels([r"${}$".format(i) for i in range(len(ard_params))])
|
xticklabels.extend([r"$\mathrm{{{name}}}\ {x}$".format(name=p.name, x=i) for i in np.arange(len(ard_params))])
|
||||||
|
x0 += len(ard_params)
|
||||||
|
x = np.arange(x0)
|
||||||
|
transOffset = offset_copy(ax.transData, fig=fig,
|
||||||
|
x=0., y= -2., units='points')
|
||||||
|
transOffsetUp = offset_copy(ax.transData, fig=fig,
|
||||||
|
x=0., y=1., units='points')
|
||||||
|
for bar in bars:
|
||||||
|
for patch, num in zip(bar.patches, np.arange(len(bar.patches))):
|
||||||
|
height = patch.get_height()
|
||||||
|
xi = patch.get_x() + patch.get_width() / 2.
|
||||||
|
va = 'top'
|
||||||
|
c = 'w'
|
||||||
|
t = TextPath((0, 0), "${xi}$".format(xi=xi), rotation=0, usetex=True, ha='center')
|
||||||
|
transform = transOffset
|
||||||
|
if patch.get_extents().height <= t.get_extents().height + 3:
|
||||||
|
va = 'bottom'
|
||||||
|
c = 'k'
|
||||||
|
transform = transOffsetUp
|
||||||
|
ax.text(xi, height, "${xi}$".format(xi=int(num)), color=c, rotation=0, ha='center', va=va, transform=transform)
|
||||||
|
# for xi, t in zip(x, xticklabels):
|
||||||
|
# ax.text(xi, maxi / 2, t, rotation=90, ha='center', va='center')
|
||||||
|
# ax.set_xticklabels(xticklabels, rotation=17)
|
||||||
|
ax.set_xticks([])
|
||||||
|
ax.set_xlim(-.5, x0 - .5)
|
||||||
|
if legend:
|
||||||
|
if title is '':
|
||||||
|
mode = 'expand'
|
||||||
|
if len(bars) > 1:
|
||||||
|
mode = 'expand'
|
||||||
|
ax.legend(bbox_to_anchor=(0., 1.02, 1., 1.02), loc=3,
|
||||||
|
ncol=len(bars), mode=mode, borderaxespad=0.)
|
||||||
|
fig.tight_layout(rect=(0, 0, 1, .9))
|
||||||
|
else:
|
||||||
|
ax.legend()
|
||||||
return ax
|
return ax
|
||||||
|
|
||||||
def _transform_gradients(self, g):
|
def _transform_gradients(self, g):
|
||||||
|
|
@ -74,7 +152,7 @@ class kern(Parameterised):
|
||||||
return g
|
return g
|
||||||
|
|
||||||
def compute_param_slices(self):
|
def compute_param_slices(self):
|
||||||
"""create a set of slices that can index the parameters of each part"""
|
"""create a set of slices that can index the parameters of each part."""
|
||||||
self.param_slices = []
|
self.param_slices = []
|
||||||
count = 0
|
count = 0
|
||||||
for p in self.parts:
|
for p in self.parts:
|
||||||
|
|
@ -125,11 +203,19 @@ class kern(Parameterised):
|
||||||
"""
|
"""
|
||||||
return self.prod(other)
|
return self.prod(other)
|
||||||
|
|
||||||
|
def __pow__(self, other, tensor=False):
|
||||||
|
"""
|
||||||
|
Shortcut for tensor `prod`.
|
||||||
|
"""
|
||||||
|
return self.prod(other, tensor=True)
|
||||||
|
|
||||||
def prod(self, other, tensor=False):
|
def prod(self, other, tensor=False):
|
||||||
"""
|
"""
|
||||||
multiply two kernels (either on the same space, or on the tensor product of the input space)
|
multiply two kernels (either on the same space, or on the tensor product of the input space).
|
||||||
:param other: the other kernel to be added
|
:param other: the other kernel to be added
|
||||||
:type other: GPy.kern
|
:type other: GPy.kern
|
||||||
|
:param tensor: whether or not to use the tensor space (default is false).
|
||||||
|
:type tensor: bool
|
||||||
"""
|
"""
|
||||||
K1 = self.copy()
|
K1 = self.copy()
|
||||||
K2 = other.copy()
|
K2 = other.copy()
|
||||||
|
|
@ -198,7 +284,7 @@ class kern(Parameterised):
|
||||||
[p._set_params(x[s]) for p, s in zip(self.parts, self.param_slices)]
|
[p._set_params(x[s]) for p, s in zip(self.parts, self.param_slices)]
|
||||||
|
|
||||||
def _get_param_names(self):
|
def _get_param_names(self):
|
||||||
# this is a bit nasty: we wat to distinguish between parts with the same name by appending a count
|
# this is a bit nasty: we want to distinguish between parts with the same name by appending a count
|
||||||
part_names = np.array([k.name for k in self.parts], dtype=np.str)
|
part_names = np.array([k.name for k in self.parts], dtype=np.str)
|
||||||
counts = [np.sum(part_names == ni) for i, ni in enumerate(part_names)]
|
counts = [np.sum(part_names == ni) for i, ni in enumerate(part_names)]
|
||||||
cum_counts = [np.sum(part_names[i:] == ni) for i, ni in enumerate(part_names)]
|
cum_counts = [np.sum(part_names[i:] == ni) for i, ni in enumerate(part_names)]
|
||||||
|
|
@ -220,11 +306,13 @@ class kern(Parameterised):
|
||||||
|
|
||||||
def dK_dtheta(self, dL_dK, X, X2=None):
|
def dK_dtheta(self, dL_dK, X, X2=None):
|
||||||
"""
|
"""
|
||||||
:param dL_dK: An array of dL_dK derivaties, dL_dK
|
Compute the gradient of the covariance function with respect to the parameters.
|
||||||
:type dL_dK: Np.ndarray (N x num_inducing)
|
|
||||||
|
:param dL_dK: An array of gradients of the objective function with respect to the covariance function.
|
||||||
|
:type dL_dK: Np.ndarray (num_samples x num_inducing)
|
||||||
:param X: Observed data inputs
|
:param X: Observed data inputs
|
||||||
:type X: np.ndarray (N x input_dim)
|
:type X: np.ndarray (num_samples x input_dim)
|
||||||
:param X2: Observed dara inputs (optional, defaults to X)
|
:param X2: Observed data inputs (optional, defaults to X)
|
||||||
:type X2: np.ndarray (num_inducing x input_dim)
|
:type X2: np.ndarray (num_inducing x input_dim)
|
||||||
"""
|
"""
|
||||||
assert X.shape[1] == self.input_dim
|
assert X.shape[1] == self.input_dim
|
||||||
|
|
@ -237,6 +325,14 @@ class kern(Parameterised):
|
||||||
return self._transform_gradients(target)
|
return self._transform_gradients(target)
|
||||||
|
|
||||||
def dK_dX(self, dL_dK, X, X2=None):
|
def dK_dX(self, dL_dK, X, X2=None):
|
||||||
|
"""Compute the gradient of the covariance function with respect to X.
|
||||||
|
|
||||||
|
:param dL_dK: An array of gradients of the objective function with respect to the covariance function.
|
||||||
|
:type dL_dK: np.ndarray (num_samples x num_inducing)
|
||||||
|
:param X: Observed data inputs
|
||||||
|
:type X: np.ndarray (num_samples x input_dim)
|
||||||
|
:param X2: Observed data inputs (optional, defaults to X)
|
||||||
|
:type X2: np.ndarray (num_inducing x input_dim)"""
|
||||||
if X2 is None:
|
if X2 is None:
|
||||||
X2 = X
|
X2 = X
|
||||||
target = np.zeros_like(X)
|
target = np.zeros_like(X)
|
||||||
|
|
@ -247,6 +343,7 @@ class kern(Parameterised):
|
||||||
return target
|
return target
|
||||||
|
|
||||||
def Kdiag(self, X, which_parts='all'):
|
def Kdiag(self, X, which_parts='all'):
|
||||||
|
"""Compute the diagonal of the covariance function for inputs X."""
|
||||||
if which_parts == 'all':
|
if which_parts == 'all':
|
||||||
which_parts = [True] * self.Nparts
|
which_parts = [True] * self.Nparts
|
||||||
assert X.shape[1] == self.input_dim
|
assert X.shape[1] == self.input_dim
|
||||||
|
|
@ -255,6 +352,7 @@ class kern(Parameterised):
|
||||||
return target
|
return target
|
||||||
|
|
||||||
def dKdiag_dtheta(self, dL_dKdiag, X):
|
def dKdiag_dtheta(self, dL_dKdiag, X):
|
||||||
|
"""Compute the gradient of the diagonal of the covariance function with respect to the parameters."""
|
||||||
assert X.shape[1] == self.input_dim
|
assert X.shape[1] == self.input_dim
|
||||||
assert dL_dKdiag.size == X.shape[0]
|
assert dL_dKdiag.size == X.shape[0]
|
||||||
target = np.zeros(self.num_params)
|
target = np.zeros(self.num_params)
|
||||||
|
|
@ -298,16 +396,18 @@ class kern(Parameterised):
|
||||||
return target
|
return target
|
||||||
|
|
||||||
def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S):
|
def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S):
|
||||||
"""return shapes are N,num_inducing,input_dim"""
|
"""return shapes are num_samples,num_inducing,input_dim"""
|
||||||
target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1]))
|
target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1]))
|
||||||
[p.dpsi1_dmuS(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
|
[p.dpsi1_dmuS(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
|
||||||
return target_mu, target_S
|
return target_mu, target_S
|
||||||
|
|
||||||
def psi2(self, Z, mu, S):
|
def psi2(self, Z, mu, S):
|
||||||
"""
|
"""
|
||||||
|
Computer the psi2 statistics for the covariance function.
|
||||||
|
|
||||||
:param Z: np.ndarray of inducing inputs (num_inducing x input_dim)
|
:param Z: np.ndarray of inducing inputs (num_inducing x input_dim)
|
||||||
:param mu, S: np.ndarrays of means and variances (each N x input_dim)
|
:param mu, S: np.ndarrays of means and variances (each num_samples x input_dim)
|
||||||
:returns psi2: np.ndarray (N,num_inducing,num_inducing)
|
:returns psi2: np.ndarray (num_samples,num_inducing,num_inducing)
|
||||||
"""
|
"""
|
||||||
target = np.zeros((mu.shape[0], Z.shape[0], Z.shape[0]))
|
target = np.zeros((mu.shape[0], Z.shape[0], Z.shape[0]))
|
||||||
[p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)]
|
[p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)]
|
||||||
|
|
@ -316,21 +416,22 @@ class kern(Parameterised):
|
||||||
# TODO: input_slices needed
|
# TODO: input_slices needed
|
||||||
crossterms = 0
|
crossterms = 0
|
||||||
|
|
||||||
for p1, p2 in itertools.combinations(self.parts, 2):
|
for [p1, i_s1], [p2, i_s2] in itertools.combinations(zip(self.parts, self.input_slices), 2):
|
||||||
|
if i_s1 == i_s2:
|
||||||
|
# TODO psi1 this must be faster/better/precached/more nice
|
||||||
|
tmp1 = np.zeros((mu.shape[0], Z.shape[0]))
|
||||||
|
p1.psi1(Z[:, i_s1], mu[:, i_s1], S[:, i_s1], tmp1)
|
||||||
|
tmp2 = np.zeros((mu.shape[0], Z.shape[0]))
|
||||||
|
p2.psi1(Z[:, i_s2], mu[:, i_s2], S[:, i_s2], tmp2)
|
||||||
|
|
||||||
|
prod = np.multiply(tmp1, tmp2)
|
||||||
|
crossterms += prod[:, :, None] + prod[:, None, :]
|
||||||
|
|
||||||
# TODO psi1 this must be faster/better/precached/more nice
|
# target += crossterms
|
||||||
tmp1 = np.zeros((mu.shape[0], Z.shape[0]))
|
return target + crossterms
|
||||||
p1.psi1(Z, mu, S, tmp1)
|
|
||||||
tmp2 = np.zeros((mu.shape[0], Z.shape[0]))
|
|
||||||
p2.psi1(Z, mu, S, tmp2)
|
|
||||||
|
|
||||||
prod = np.multiply(tmp1, tmp2)
|
|
||||||
crossterms += prod[:, :, None] + prod[:, None, :]
|
|
||||||
|
|
||||||
target += crossterms
|
|
||||||
return target
|
|
||||||
|
|
||||||
def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S):
|
def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S):
|
||||||
|
"""Gradient of the psi2 statistics with respect to the parameters."""
|
||||||
target = np.zeros(self.num_params)
|
target = np.zeros(self.num_params)
|
||||||
[p.dpsi2_dtheta(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, i_s, ps in zip(self.parts, self.input_slices, self.param_slices)]
|
[p.dpsi2_dtheta(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, i_s, ps in zip(self.parts, self.input_slices, self.param_slices)]
|
||||||
|
|
||||||
|
|
@ -434,3 +535,135 @@ class kern(Parameterised):
|
||||||
pb.title("k(x1,x2 ; %0.1f,%0.1f)" % (x[0, 0], x[0, 1]))
|
pb.title("k(x1,x2 ; %0.1f,%0.1f)" % (x[0, 0], x[0, 1]))
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError, "Cannot plot a kernel with more than two input dimensions"
|
raise NotImplementedError, "Cannot plot a kernel with more than two input dimensions"
|
||||||
|
|
||||||
|
from GPy.core.model import Model
|
||||||
|
|
||||||
|
class Kern_check_model(Model):
|
||||||
|
"""This is a dummy model class used as a base class for checking that the gradients of a given kernel are implemented correctly. It enables checkgradient() to be called independently on a kernel."""
|
||||||
|
def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
|
||||||
|
num_samples = 20
|
||||||
|
num_samples2 = 10
|
||||||
|
if kernel==None:
|
||||||
|
kernel = GPy.kern.rbf(1)
|
||||||
|
if X==None:
|
||||||
|
X = np.random.randn(num_samples, kernel.input_dim)
|
||||||
|
if dL_dK==None:
|
||||||
|
if X2==None:
|
||||||
|
dL_dK = np.ones((X.shape[0], X.shape[0]))
|
||||||
|
else:
|
||||||
|
dL_dK = np.ones((X.shape[0], X2.shape[0]))
|
||||||
|
|
||||||
|
self.kernel=kernel
|
||||||
|
self.X = X
|
||||||
|
self.X2 = X2
|
||||||
|
self.dL_dK = dL_dK
|
||||||
|
#self.constrained_indices=[]
|
||||||
|
#self.constraints=[]
|
||||||
|
Model.__init__(self)
|
||||||
|
|
||||||
|
def is_positive_definite(self):
|
||||||
|
v = np.linalg.eig(self.kernel.K(self.X))[0]
|
||||||
|
if any(v<0):
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
return True
|
||||||
|
|
||||||
|
def _get_params(self):
|
||||||
|
return self.kernel._get_params()
|
||||||
|
|
||||||
|
def _get_param_names(self):
|
||||||
|
return self.kernel._get_param_names()
|
||||||
|
|
||||||
|
def _set_params(self, x):
|
||||||
|
self.kernel._set_params(x)
|
||||||
|
|
||||||
|
def log_likelihood(self):
|
||||||
|
return (self.dL_dK*self.kernel.K(self.X, self.X2)).sum()
|
||||||
|
|
||||||
|
def _log_likelihood_gradients(self):
|
||||||
|
raise NotImplementedError, "This needs to be implemented to use the kern_check_model class."
|
||||||
|
|
||||||
|
class Kern_check_dK_dtheta(Kern_check_model):
|
||||||
|
"""This class allows gradient checks for the gradient of a kernel with respect to parameters. """
|
||||||
|
def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
|
||||||
|
Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2)
|
||||||
|
|
||||||
|
def _log_likelihood_gradients(self):
|
||||||
|
return self.kernel.dK_dtheta(self.dL_dK, self.X, self.X2)
|
||||||
|
|
||||||
|
class Kern_check_dKdiag_dtheta(Kern_check_model):
|
||||||
|
"""This class allows gradient checks of the gradient of the diagonal of a kernel with respect to the parameters."""
|
||||||
|
def __init__(self, kernel=None, dL_dK=None, X=None):
|
||||||
|
Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None)
|
||||||
|
if dL_dK==None:
|
||||||
|
self.dL_dK = np.ones((self.X.shape[0]))
|
||||||
|
|
||||||
|
def log_likelihood(self):
|
||||||
|
return (self.dL_dK*self.kernel.Kdiag(self.X)).sum()
|
||||||
|
|
||||||
|
def _log_likelihood_gradients(self):
|
||||||
|
return self.kernel.dKdiag_dtheta(self.dL_dK, self.X)
|
||||||
|
|
||||||
|
class Kern_check_dK_dX(Kern_check_model):
|
||||||
|
"""This class allows gradient checks for the gradient of a kernel with respect to X. """
|
||||||
|
def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
|
||||||
|
Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2)
|
||||||
|
|
||||||
|
def _log_likelihood_gradients(self):
|
||||||
|
return self.kernel.dK_dX(self.dL_dK, self.X, self.X2).flatten()
|
||||||
|
|
||||||
|
def _get_param_names(self):
|
||||||
|
return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
|
||||||
|
|
||||||
|
def _get_params(self):
|
||||||
|
return self.X.flatten()
|
||||||
|
|
||||||
|
def _set_params(self, x):
|
||||||
|
self.X=x.reshape(self.X.shape)
|
||||||
|
|
||||||
|
class Kern_check_dKdiag_dX(Kern_check_model):
|
||||||
|
"""This class allows gradient checks for the gradient of a kernel diagonal with respect to X. """
|
||||||
|
def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
|
||||||
|
Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None)
|
||||||
|
if dL_dK==None:
|
||||||
|
self.dL_dK = np.ones((self.X.shape[0]))
|
||||||
|
|
||||||
|
def log_likelihood(self):
|
||||||
|
return (self.dL_dK*self.kernel.Kdiag(self.X)).sum()
|
||||||
|
|
||||||
|
def _log_likelihood_gradients(self):
|
||||||
|
return self.kernel.dKdiag_dX(self.dL_dK, self.X).flatten()
|
||||||
|
|
||||||
|
def _get_param_names(self):
|
||||||
|
return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
|
||||||
|
|
||||||
|
def _get_params(self):
|
||||||
|
return self.X.flatten()
|
||||||
|
|
||||||
|
def _set_params(self, x):
|
||||||
|
self.X=x.reshape(self.X.shape)
|
||||||
|
|
||||||
|
def kern_test(kern, X=None, X2=None, verbose=False):
|
||||||
|
"""This function runs on kernels to check the correctness of their implementation. It checks that the covariance function is positive definite for a randomly generated data set.
|
||||||
|
|
||||||
|
:param kern: the kernel to be tested.
|
||||||
|
:type kern: GPy.kern.Kernpart
|
||||||
|
:param X: X input values to test the covariance function.
|
||||||
|
:type X: ndarray
|
||||||
|
:param X2: X2 input values to test the covariance function.
|
||||||
|
:type X2: ndarray
|
||||||
|
"""
|
||||||
|
if X==None:
|
||||||
|
X = np.random.randn(10, kern.input_dim)
|
||||||
|
if X2==None:
|
||||||
|
X2 = np.random.randn(20, kern.input_dim)
|
||||||
|
result = [Kern_check_model(kern, X=X).is_positive_definite(),
|
||||||
|
Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=verbose),
|
||||||
|
Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=verbose),
|
||||||
|
Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=verbose),
|
||||||
|
Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=verbose),
|
||||||
|
Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=verbose)]
|
||||||
|
# Need to check
|
||||||
|
#Kern_check_dK_dX(kern, X, X2=None).checkgrad(verbose=verbose)]
|
||||||
|
# but currently I think these aren't implemented.
|
||||||
|
return np.all(result)
|
||||||
|
|
|
||||||
|
|
@ -4,13 +4,16 @@ import coregionalise
|
||||||
import exponential
|
import exponential
|
||||||
import finite_dimensional
|
import finite_dimensional
|
||||||
import fixed
|
import fixed
|
||||||
|
import gibbs
|
||||||
import independent_outputs
|
import independent_outputs
|
||||||
import linear
|
import linear
|
||||||
import Matern32
|
import Matern32
|
||||||
import Matern52
|
import Matern52
|
||||||
|
import mlp
|
||||||
import periodic_exponential
|
import periodic_exponential
|
||||||
import periodic_Matern32
|
import periodic_Matern32
|
||||||
import periodic_Matern52
|
import periodic_Matern52
|
||||||
|
import poly
|
||||||
import prod_orthogonal
|
import prod_orthogonal
|
||||||
import prod
|
import prod
|
||||||
import rational_quadratic
|
import rational_quadratic
|
||||||
|
|
@ -19,3 +22,5 @@ import rbf
|
||||||
import spline
|
import spline
|
||||||
import symmetric
|
import symmetric
|
||||||
import white
|
import white
|
||||||
|
import hierarchical
|
||||||
|
import rbf_inv
|
||||||
|
|
|
||||||
|
|
@ -9,24 +9,42 @@ from scipy import weave
|
||||||
|
|
||||||
class Coregionalise(Kernpart):
|
class Coregionalise(Kernpart):
|
||||||
"""
|
"""
|
||||||
Kernel for Intrinsic Corregionalization Models
|
Coregionalisation kernel.
|
||||||
|
|
||||||
|
Used for computing covariance functions of the form
|
||||||
|
.. math::
|
||||||
|
k_2(x, y)=B k(x, y)
|
||||||
|
where
|
||||||
|
.. math::
|
||||||
|
B = WW^\top + diag(kappa)
|
||||||
|
|
||||||
|
:param output_dim: the number of output dimensions
|
||||||
|
:type output_dim: int
|
||||||
|
:param rank: the rank of the coregionalisation matrix.
|
||||||
|
:type rank: int
|
||||||
|
:param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B.
|
||||||
|
:type W: ndarray
|
||||||
|
:param kappa: a diagonal term which allows the outputs to behave independently.
|
||||||
|
:rtype: kernel object
|
||||||
|
|
||||||
|
.. Note: see coregionalisation examples in GPy.examples.regression for some usage.
|
||||||
"""
|
"""
|
||||||
def __init__(self,Nout,R=1, W=None, kappa=None):
|
def __init__(self,output_dim,rank=1, W=None, kappa=None):
|
||||||
self.input_dim = 1
|
self.input_dim = 1
|
||||||
self.name = 'coregion'
|
self.name = 'coregion'
|
||||||
self.Nout = Nout
|
self.output_dim = output_dim
|
||||||
self.R = R
|
self.rank = rank
|
||||||
if W is None:
|
if W is None:
|
||||||
self.W = np.ones((self.Nout,self.R))
|
self.W = 0.5*np.random.randn(self.output_dim,self.rank)/np.sqrt(self.rank)
|
||||||
else:
|
else:
|
||||||
assert W.shape==(self.Nout,self.R)
|
assert W.shape==(self.output_dim,self.rank)
|
||||||
self.W = W
|
self.W = W
|
||||||
if kappa is None:
|
if kappa is None:
|
||||||
kappa = np.ones(self.Nout)
|
kappa = 0.5*np.ones(self.output_dim)
|
||||||
else:
|
else:
|
||||||
assert kappa.shape==(self.Nout,)
|
assert kappa.shape==(self.output_dim,)
|
||||||
self.kappa = kappa
|
self.kappa = kappa
|
||||||
self.num_params = self.Nout*(self.R + 1)
|
self.num_params = self.output_dim*(self.rank + 1)
|
||||||
self._set_params(np.hstack([self.W.flatten(),self.kappa]))
|
self._set_params(np.hstack([self.W.flatten(),self.kappa]))
|
||||||
|
|
||||||
def _get_params(self):
|
def _get_params(self):
|
||||||
|
|
@ -34,12 +52,12 @@ class Coregionalise(Kernpart):
|
||||||
|
|
||||||
def _set_params(self,x):
|
def _set_params(self,x):
|
||||||
assert x.size == self.num_params
|
assert x.size == self.num_params
|
||||||
self.kappa = x[-self.Nout:]
|
self.kappa = x[-self.output_dim:]
|
||||||
self.W = x[:-self.Nout].reshape(self.Nout,self.R)
|
self.W = x[:-self.output_dim].reshape(self.output_dim,self.rank)
|
||||||
self.B = np.dot(self.W,self.W.T) + np.diag(self.kappa)
|
self.B = np.dot(self.W,self.W.T) + np.diag(self.kappa)
|
||||||
|
|
||||||
def _get_param_names(self):
|
def _get_param_names(self):
|
||||||
return sum([['W%i_%i'%(i,j) for j in range(self.R)] for i in range(self.Nout)],[]) + ['kappa_%i'%i for i in range(self.Nout)]
|
return sum([['W%i_%i'%(i,j) for j in range(self.rank)] for i in range(self.output_dim)],[]) + ['kappa_%i'%i for i in range(self.output_dim)]
|
||||||
|
|
||||||
def K(self,index,index2,target):
|
def K(self,index,index2,target):
|
||||||
index = np.asarray(index,dtype=np.int)
|
index = np.asarray(index,dtype=np.int)
|
||||||
|
|
@ -57,26 +75,26 @@ class Coregionalise(Kernpart):
|
||||||
if index2 is None:
|
if index2 is None:
|
||||||
code="""
|
code="""
|
||||||
for(int i=0;i<N; i++){
|
for(int i=0;i<N; i++){
|
||||||
target[i+i*N] += B[index[i]+Nout*index[i]];
|
target[i+i*N] += B[index[i]+output_dim*index[i]];
|
||||||
for(int j=0; j<i; j++){
|
for(int j=0; j<i; j++){
|
||||||
target[j+i*N] += B[index[i]+Nout*index[j]];
|
target[j+i*N] += B[index[i]+output_dim*index[j]];
|
||||||
target[i+j*N] += target[j+i*N];
|
target[i+j*N] += target[j+i*N];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
N,B,Nout = index.size, self.B, self.Nout
|
N,B,output_dim = index.size, self.B, self.output_dim
|
||||||
weave.inline(code,['target','index','N','B','Nout'])
|
weave.inline(code,['target','index','N','B','output_dim'])
|
||||||
else:
|
else:
|
||||||
index2 = np.asarray(index2,dtype=np.int)
|
index2 = np.asarray(index2,dtype=np.int)
|
||||||
code="""
|
code="""
|
||||||
for(int i=0;i<num_inducing; i++){
|
for(int i=0;i<num_inducing; i++){
|
||||||
for(int j=0; j<N; j++){
|
for(int j=0; j<N; j++){
|
||||||
target[i+j*num_inducing] += B[Nout*index[j]+index2[i]];
|
target[i+j*num_inducing] += B[output_dim*index[j]+index2[i]];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
N,num_inducing,B,Nout = index.size,index2.size, self.B, self.Nout
|
N,num_inducing,B,output_dim = index.size,index2.size, self.B, self.output_dim
|
||||||
weave.inline(code,['target','index','index2','N','num_inducing','B','Nout'])
|
weave.inline(code,['target','index','index2','N','num_inducing','B','output_dim'])
|
||||||
|
|
||||||
|
|
||||||
def Kdiag(self,index,target):
|
def Kdiag(self,index,target):
|
||||||
|
|
@ -93,12 +111,12 @@ class Coregionalise(Kernpart):
|
||||||
code="""
|
code="""
|
||||||
for(int i=0; i<num_inducing; i++){
|
for(int i=0; i<num_inducing; i++){
|
||||||
for(int j=0; j<N; j++){
|
for(int j=0; j<N; j++){
|
||||||
dL_dK_small[index[j] + Nout*index2[i]] += dL_dK[i+j*num_inducing];
|
dL_dK_small[index[j] + output_dim*index2[i]] += dL_dK[i+j*num_inducing];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
N, num_inducing, Nout = index.size, index2.size, self.Nout
|
N, num_inducing, output_dim = index.size, index2.size, self.output_dim
|
||||||
weave.inline(code, ['N','num_inducing','Nout','dL_dK','dL_dK_small','index','index2'])
|
weave.inline(code, ['N','num_inducing','output_dim','dL_dK','dL_dK_small','index','index2'])
|
||||||
|
|
||||||
dkappa = np.diag(dL_dK_small)
|
dkappa = np.diag(dL_dK_small)
|
||||||
dL_dK_small += dL_dK_small.T
|
dL_dK_small += dL_dK_small.T
|
||||||
|
|
@ -115,8 +133,8 @@ class Coregionalise(Kernpart):
|
||||||
ii,jj = ii.T, jj.T
|
ii,jj = ii.T, jj.T
|
||||||
|
|
||||||
dL_dK_small = np.zeros_like(self.B)
|
dL_dK_small = np.zeros_like(self.B)
|
||||||
for i in range(self.Nout):
|
for i in range(self.output_dim):
|
||||||
for j in range(self.Nout):
|
for j in range(self.output_dim):
|
||||||
tmp = np.sum(dL_dK[(ii==i)*(jj==j)])
|
tmp = np.sum(dL_dK[(ii==i)*(jj==j)])
|
||||||
dL_dK_small[i,j] = tmp
|
dL_dK_small[i,j] = tmp
|
||||||
|
|
||||||
|
|
@ -128,8 +146,8 @@ class Coregionalise(Kernpart):
|
||||||
|
|
||||||
def dKdiag_dtheta(self,dL_dKdiag,index,target):
|
def dKdiag_dtheta(self,dL_dKdiag,index,target):
|
||||||
index = np.asarray(index,dtype=np.int).flatten()
|
index = np.asarray(index,dtype=np.int).flatten()
|
||||||
dL_dKdiag_small = np.zeros(self.Nout)
|
dL_dKdiag_small = np.zeros(self.output_dim)
|
||||||
for i in range(self.Nout):
|
for i in range(self.output_dim):
|
||||||
dL_dKdiag_small[i] += np.sum(dL_dKdiag[index==i])
|
dL_dKdiag_small[i] += np.sum(dL_dKdiag[index==i])
|
||||||
dW = 2.*self.W*dL_dKdiag_small[:,None]
|
dW = 2.*self.W*dL_dKdiag_small[:,None]
|
||||||
dkappa = dL_dKdiag_small
|
dkappa = dL_dKdiag_small
|
||||||
|
|
|
||||||
135
GPy/kern/parts/gibbs.py
Normal file
135
GPy/kern/parts/gibbs.py
Normal file
|
|
@ -0,0 +1,135 @@
|
||||||
|
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||||
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
from kernpart import Kernpart
|
||||||
|
import numpy as np
|
||||||
|
from ...util.linalg import tdot
|
||||||
|
from ...core.mapping import Mapping
|
||||||
|
import GPy
|
||||||
|
|
||||||
|
class Gibbs(Kernpart):
|
||||||
|
"""
|
||||||
|
Gibbs and MacKay non-stationary covariance function.
|
||||||
|
|
||||||
|
.. math::
|
||||||
|
|
||||||
|
r = sqrt((x_i - x_j)'*(x_i - x_j))
|
||||||
|
|
||||||
|
k(x_i, x_j) = \sigma^2*Z*exp(-r^2/(l(x)*l(x) + l(x')*l(x')))
|
||||||
|
|
||||||
|
Z = (2*l(x)*l(x')/(l(x)*l(x) + l(x')*l(x')^{q/2}
|
||||||
|
|
||||||
|
where :math:`l(x)` is a function giving the length scale as a function of space and :math:`q` is the dimensionality of the input space.
|
||||||
|
This is the non stationary kernel proposed by Mark Gibbs in his 1997
|
||||||
|
thesis. It is similar to an RBF but has a length scale that varies
|
||||||
|
with input location. This leads to an additional term in front of
|
||||||
|
the kernel.
|
||||||
|
|
||||||
|
The parameters are :math:`\sigma^2`, the process variance, and the parameters of l(x) which is a function that can be specified by the user, by default an multi-layer peceptron is used is used.
|
||||||
|
|
||||||
|
:param input_dim: the number of input dimensions
|
||||||
|
:type input_dim: int
|
||||||
|
:param variance: the variance :math:`\sigma^2`
|
||||||
|
:type variance: float
|
||||||
|
:param mapping: the mapping that gives the lengthscale across the input space (by default GPy.mappings.MLP is used with 20 hidden nodes).
|
||||||
|
:type mapping: GPy.core.Mapping
|
||||||
|
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter \sigma^2_w), otherwise there is one weight variance parameter per dimension.
|
||||||
|
:type ARD: Boolean
|
||||||
|
:rtype: Kernpart object
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, input_dim, variance=1., mapping=None, ARD=False):
|
||||||
|
self.input_dim = input_dim
|
||||||
|
self.ARD = ARD
|
||||||
|
if not mapping:
|
||||||
|
mapping = GPy.mappings.MLP(output_dim=1, hidden_dim=20, input_dim=input_dim)
|
||||||
|
if not ARD:
|
||||||
|
self.num_params=1+mapping.num_params
|
||||||
|
else:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
self.mapping = mapping
|
||||||
|
self.name='gibbs'
|
||||||
|
self._set_params(np.hstack((variance, self.mapping._get_params())))
|
||||||
|
|
||||||
|
def _get_params(self):
|
||||||
|
return np.hstack((self.variance, self.mapping._get_params()))
|
||||||
|
|
||||||
|
def _set_params(self, x):
|
||||||
|
assert x.size == (self.num_params)
|
||||||
|
self.variance = x[0]
|
||||||
|
self.mapping._set_params(x[1:])
|
||||||
|
|
||||||
|
def _get_param_names(self):
|
||||||
|
return ['variance'] + self.mapping._get_param_names()
|
||||||
|
|
||||||
|
def K(self, X, X2, target):
    """Accumulate the covariance between X and X2 (scaled by the variance) into target."""
    # refresh the cached unit-variance covariance for these inputs
    self._K_computations(X, X2)
    np.add(target, self.variance * self._K_dvar, target)
|
||||||
|
|
||||||
|
def Kdiag(self, X, target):
    """Accumulate the diagonal of the covariance at X into target.

    On the diagonal the lengthscale prefactor and the exponential are both
    one (x = x'), so every diagonal entry is simply the variance.
    """
    target += self.variance
|
||||||
|
|
||||||
|
def dK_dtheta(self, dL_dK, X, X2, target):
    """Derivative of the covariance with respect to the parameters.

    Adds [d/dvariance, d/d(mapping params)] into target.

    :param dL_dK: gradient of the objective with respect to the covariance
    :type dL_dK: ndarray
    """
    self._K_computations(X, X2)
    self._dK_computations(dL_dK)
    # BUGFIX: `X2 == None` compares elementwise when X2 is an ndarray and
    # raises on truth-testing; use identity against None instead.
    if X2 is None:
        # symmetric case: each lengthscale appears on both sides, hence the 2
        gmapping = self.mapping.df_dtheta(2*self._dL_dl[:, None], X)
    else:
        gmapping = self.mapping.df_dtheta(self._dL_dl[:, None], X)
        gmapping += self.mapping.df_dtheta(self._dL_dl_two[:, None], X2)

    # first entry: variance gradient; remainder: mapping-parameter gradients
    target += np.hstack([(dL_dK*self._K_dvar).sum(), gmapping])
|
||||||
|
|
||||||
|
def dK_dX(self, dL_dK, X, X2, target):
    """Derivative of the covariance matrix with respect to X.

    Accumulates the gradient into target (shape matching X).

    :param dL_dK: gradient of the objective with respect to the covariance
    :type dL_dK: ndarray

    NOTE(review): X2 is indexed directly below, so unlike K/dK_dtheta this
    method appears to require X2 is not None -- confirm with callers.
    """
    # First account for gradients arising from presence of X in exponent.
    self._K_computations(X, X2)
    # pairwise differences X[i] - X2[j], shape (N, N2, input_dim)
    _K_dist = X[:, None, :] - X2[None, :, :]
    dK_dX = (-2.*self.variance)*np.transpose((self._K_dvar/self._w2)[:, :, None]*_K_dist, (1, 0, 2))
    target += np.sum(dK_dX*dL_dK.T[:, :, None], 0)
    # Now account for gradients arising from presence of X in lengthscale.
    self._dK_computations(dL_dK)
    target += self.mapping.df_dX(self._dL_dl[:, None], X)
|
||||||
|
|
||||||
|
def dKdiag_dX(self, dL_dKdiag, X, target):
    """Gradient of the diagonal of the covariance with respect to X.

    The diagonal of this kernel is constant (equal to the variance, see
    Kdiag), so there is no X dependence and target is deliberately left
    unchanged.
    """
    return
|
||||||
|
|
||||||
|
def dKdiag_dtheta(self, dL_dKdiag, X, target):
    """Gradient of the diagonal of the covariance with respect to the parameters.

    Kdiag is self.variance for every input (see Kdiag), so the diagonal has
    gradient 1 with respect to the variance and zero with respect to the
    mapping (lengthscale) parameters.

    :param dL_dKdiag: gradient of the objective wrt the covariance diagonal
    :type dL_dKdiag: ndarray
    """
    # BUGFIX: this was `pass`, silently dropping the variance gradient.
    # target[0] is the variance entry; mapping entries remain zero.
    target[0] += dL_dKdiag.sum()
|
||||||
|
|
||||||
|
|
||||||
|
def _K_computations(self, X, X2=None):
|
||||||
|
"""Pre-computations for the covariance function (used both when computing the covariance and its gradients). Here self._dK_dvar and self._K_dist2 are updated."""
|
||||||
|
self._lengthscales=self.mapping.f(X)
|
||||||
|
self._lengthscales2=np.square(self._lengthscales)
|
||||||
|
if X2==None:
|
||||||
|
self._lengthscales_two = self._lengthscales
|
||||||
|
self._lengthscales_two2 = self._lengthscales2
|
||||||
|
Xsquare = np.square(X).sum(1)
|
||||||
|
self._K_dist2 = -2.*tdot(X) + Xsquare[:, None] + Xsquare[None, :]
|
||||||
|
else:
|
||||||
|
self._lengthscales_two = self.mapping.f(X2)
|
||||||
|
self._lengthscales_two2 = np.square(self._lengthscales_two)
|
||||||
|
self._K_dist2 = -2.*np.dot(X, X2.T) + np.square(X).sum(1)[:, None] + np.square(X2).sum(1)[None, :]
|
||||||
|
self._w2 = self._lengthscales2 + self._lengthscales_two2.T
|
||||||
|
prod_length = self._lengthscales*self._lengthscales_two.T
|
||||||
|
self._K_exponential = np.exp(-self._K_dist2/self._w2)
|
||||||
|
self._K_dvar = np.sign(prod_length)*(2*np.abs(prod_length)/self._w2)**(self.input_dim/2.)*np.exp(-self._K_dist2/self._w2)
|
||||||
|
|
||||||
|
def _dK_computations(self, dL_dK):
    """Pre-computations for the gradients of the covariance function. Here the
    gradient of the covariance with respect to all the individual
    lengthscales is computed.

    Sets self._dL_dl (one entry per row of X) and self._dL_dl_two (per row
    of X2, or None in the symmetric case).

    :param dL_dK: the gradient of the objective with respect to the covariance function.
    :type dL_dK: ndarray"""

    # Presumably the chain rule through K = sigma^2 (2ll'/w2)^{q/2} exp(-r^2/w2)
    # with w2 = l^2 + l'^2 (see _K_computations) -- verify the algebra if edited.
    self._dL_dl = (dL_dK*self.variance*self._K_dvar*(self.input_dim/2.*(self._lengthscales_two.T**4 - self._lengthscales**4) + 2*self._lengthscales2*self._K_dist2)/(self._w2*self._w2*self._lengthscales)).sum(1)
    # identity check: _K_computations aliases the two lengthscale arrays
    # when X2 was None
    if self._lengthscales_two is self._lengthscales:
        # symmetric case: no separate gradient; dK_dtheta doubles _dL_dl instead
        self._dL_dl_two = None
    else:
        self._dL_dl_two = (dL_dK*self.variance*self._K_dvar*(self.input_dim/2.*(self._lengthscales**4 - self._lengthscales_two.T**4 ) + 2*self._lengthscales_two2.T*self._K_dist2)/(self._w2*self._w2*self._lengthscales_two.T)).sum(0)
|
||||||
76
GPy/kern/parts/hierarchical.py
Normal file
76
GPy/kern/parts/hierarchical.py
Normal file
|
|
@ -0,0 +1,76 @@
|
||||||
|
# Copyright (c) 2012, James Hensman
|
||||||
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
from kernpart import Kernpart
|
||||||
|
import numpy as np
|
||||||
|
from independent_outputs import index_to_slices
|
||||||
|
|
||||||
|
class Hierarchical(Kernpart):
    """
    A kernel part which can represent a hierarchy of independence: a
    generalisation of independent_outputs.

    The last ``levels`` columns of the inputs are treated as index columns;
    each wrapped part operates only on the row blocks selected by
    ``index_to_slices`` for its level, over the remaining columns.

    :param parts: one Kernpart per level of the hierarchy
    :type parts: list of Kernpart
    """
    def __init__(self,parts):
        self.levels = len(parts)
        # NOTE(review): only one extra column is accounted for here, while
        # _sort_slices strips `levels` columns -- confirm for levels > 1.
        self.input_dim = parts[0].input_dim + 1
        self.num_params = np.sum([k.num_params for k in parts])
        self.name = 'hierarchy'
        self.parts = parts

        # start/stop offsets of each part's parameters in the stacked vector
        self.param_starts = np.hstack((0,np.cumsum([k.num_params for k in self.parts[:-1]])))
        self.param_stops = np.cumsum([k.num_params for k in self.parts])

    def _get_params(self):
        """Return the stacked parameter vector of all parts, in order."""
        return np.hstack([k._get_params() for k in self.parts])

    def _set_params(self,x):
        """Slice the stacked vector back out to the individual parts."""
        [k._set_params(x[start:stop]) for k, start, stop in zip(self.parts, self.param_starts, self.param_stops)]

    def _get_param_names(self):
        """Prefix each part's parameter names with its level index and part name."""
        return sum([[str(i)+'_'+k.name+'_'+n for n in k._get_param_names()] for i,k in enumerate(self.parts)],[])

    def _sort_slices(self,X,X2):
        """Split off the index columns and build the per-level row slices."""
        slices = [index_to_slices(x) for x in X[:,-self.levels:].T]
        X = X[:,:-self.levels]
        if X2 is None:
            slices2 = slices
            X2 = X
        else:
            slices2 = [index_to_slices(x) for x in X2[:,-self.levels:].T]
            X2 = X2[:,:-self.levels]
        return X, X2, slices, slices2

    def K(self,X,X2,target):
        """Accumulate each part's covariance on its matching row/column blocks of target."""
        X, X2, slices, slices2 = self._sort_slices(X,X2)

        [[[[k.K(X[s],X2[s2],target[s,s2]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices_,slices2_)] for k, slices_, slices2_ in zip(self.parts,slices,slices2)]

    def Kdiag(self,X,target):
        # not implemented yet; a sketch is kept below for reference
        raise NotImplementedError
        #X,slices = X[:,:-1],index_to_slices(X[:,-1])
        #[[self.k.Kdiag(X[s],target[s]) for s in slices_i] for slices_i in slices]

    def dK_dtheta(self,dL_dK,X,X2,target):
        """Accumulate each part's parameter gradient into its slice of target."""
        X, X2, slices, slices2 = self._sort_slices(X,X2)
        [[[[k.dK_dtheta(dL_dK[s,s2],X[s],X2[s2],target[p_start:p_stop]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices_, slices2_)] for k, p_start, p_stop, slices_, slices2_ in zip(self.parts, self.param_starts, self.param_stops, slices, slices2)]

    def dK_dX(self,dL_dK,X,X2,target):
        # not implemented yet; a sketch is kept below for reference
        raise NotImplementedError
        #X,slices = X[:,:-1],index_to_slices(X[:,-1])
        #if X2 is None:
            #X2,slices2 = X,slices
        #else:
            #X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
        #[[[self.k.dK_dX(dL_dK[s,s2],X[s],X2[s2],target[s,:-1]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)]
        #
    def dKdiag_dX(self,dL_dKdiag,X,target):
        # not implemented yet; a sketch is kept below for reference
        raise NotImplementedError
        #X,slices = X[:,:-1],index_to_slices(X[:,-1])
        #[[self.k.dKdiag_dX(dL_dKdiag[s],X[s],target[s,:-1]) for s in slices_i] for slices_i in slices]

    def dKdiag_dtheta(self,dL_dKdiag,X,target):
        # not implemented yet; a sketch is kept below for reference
        raise NotImplementedError
        #X,slices = X[:,:-1],index_to_slices(X[:,-1])
        #[[self.k.dKdiag_dX(dL_dKdiag[s],X[s],target) for s in slices_i] for slices_i in slices]
||||||
|
|
@ -29,7 +29,11 @@ class Kernpart(object):
|
||||||
def dK_dtheta(self,dL_dK,X,X2,target):
|
def dK_dtheta(self,dL_dK,X,X2,target):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
def dKdiag_dtheta(self,dL_dKdiag,X,target):
|
def dKdiag_dtheta(self,dL_dKdiag,X,target):
|
||||||
raise NotImplementedError
|
# In the base case compute this by calling dK_dtheta. Need to
|
||||||
|
# override for stationary covariances (for example) to save
|
||||||
|
# time.
|
||||||
|
for i in range(X.shape[0]):
|
||||||
|
self.dK_dtheta(dL_dKdiag[i], X[i, :][None, :], X2=None, target=target)
|
||||||
def psi0(self,Z,mu,S,target):
|
def psi0(self,Z,mu,S,target):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target):
|
def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target):
|
||||||
|
|
@ -52,5 +56,21 @@ class Kernpart(object):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
def dpsi2_dmuS(self,dL_dpsi2,Z,mu,S,target_mu,target_S):
|
def dpsi2_dmuS(self,dL_dpsi2,Z,mu,S,target_mu,target_S):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
def dK_dX(self,X,X2,target):
|
def dK_dX(self, dL_dK, X, X2, target):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
class Kernpart_inner(Kernpart):
    def __init__(self,input_dim):
        """
        The base class for a kernpart_inner: a positive definite function which forms part of a kernel that is based on the inner product between inputs.

        :param input_dim: the number of input dimensions to the function
        :type input_dim: int

        Do not instantiate.
        """
        Kernpart.__init__(self, input_dim)

        # initialize cache: unpacking a (3, 1) empty array yields three
        # length-1 arrays -- presumably placeholders so the first real call
        # misses the cached-input comparison (TODO confirm against the
        # _K_computations/_psi_computations cache checks in subclasses)
        self._Z, self._mu, self._S = np.empty(shape=(3, 1))
        self._X, self._X2, self._params = np.empty(shape=(3, 1))
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@
|
||||||
from kernpart import Kernpart
|
from kernpart import Kernpart
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from ...util.linalg import tdot
|
from ...util.linalg import tdot
|
||||||
|
from ...util.misc import fast_array_equal
|
||||||
from scipy import weave
|
from scipy import weave
|
||||||
|
|
||||||
class Linear(Kernpart):
|
class Linear(Kernpart):
|
||||||
|
|
@ -140,28 +141,24 @@ class Linear(Kernpart):
|
||||||
self.dK_dX(dL_dpsi1.T, Z, mu, target)
|
self.dK_dX(dL_dpsi1.T, Z, mu, target)
|
||||||
|
|
||||||
def psi2(self, Z, mu, S, target):
|
def psi2(self, Z, mu, S, target):
|
||||||
"""
|
|
||||||
returns N,num_inducing,num_inducing matrix
|
|
||||||
"""
|
|
||||||
self._psi_computations(Z, mu, S)
|
self._psi_computations(Z, mu, S)
|
||||||
# psi2_old = self.ZZ * np.square(self.variances) * self.mu2_S[:, None, None, :]
|
|
||||||
# target += psi2.sum(-1)
|
|
||||||
# slow way of doing it, but right
|
|
||||||
# psi2_real = rm np.zeros((mu.shape[0], Z.shape[0], Z.shape[0]))
|
|
||||||
# for n in range(mu.shape[0]):
|
|
||||||
# for m_prime in range(Z.shape[0]):
|
|
||||||
# for m in range(Z.shape[0]):
|
|
||||||
# tmp = self._Z[m:m + 1] * self.variances
|
|
||||||
# tmp = np.dot(tmp, (tdot(self._mu[n:n + 1].T) + np.diag(S[n])))
|
|
||||||
# psi2_real[n, m, m_prime] = np.dot(tmp, (
|
|
||||||
# self._Z[m_prime:m_prime + 1] * self.variances).T)
|
|
||||||
# mu2_S = (self._mu[:, None, :] * self._mu[:, :, None])
|
|
||||||
# mu2_S[:, np.arange(self.input_dim), np.arange(self.input_dim)] += self._S
|
|
||||||
# psi2 = (self.ZA[None, :, None, :] * mu2_S[:, None]).sum(-1)
|
|
||||||
# psi2 = (psi2[:, :, None] * self.ZA[None, None]).sum(-1)
|
|
||||||
# psi2_tensor = np.tensordot(self.ZZ[None, :, :, :] * np.square(self.variances), self.mu2_S[:, None, None, :], ((3), (3))).squeeze().T
|
|
||||||
target += self._psi2
|
target += self._psi2
|
||||||
|
|
||||||
|
def psi2_new(self,Z,mu,S,target):
|
||||||
|
tmp = np.zeros((mu.shape[0], Z.shape[0]))
|
||||||
|
self.K(mu,Z,tmp)
|
||||||
|
target += tmp[:,:,None]*tmp[:,None,:] + np.sum(S[:,None,None,:]*self.variances**2*Z[None,:,None,:]*Z[None,None,:,:],-1)
|
||||||
|
|
||||||
|
def dpsi2_dtheta_new(self, dL_dpsi2, Z, mu, S, target):
|
||||||
|
tmp = np.zeros((mu.shape[0], Z.shape[0]))
|
||||||
|
self.K(mu,Z,tmp)
|
||||||
|
self.dK_dtheta(2.*np.sum(dL_dpsi2*tmp[:,None,:],2),mu,Z,target)
|
||||||
|
result= 2.*(dL_dpsi2[:,:,:,None]*S[:,None,None,:]*self.variances*Z[None,:,None,:]*Z[None,None,:,:]).sum(0).sum(0).sum(0)
|
||||||
|
if self.ARD:
|
||||||
|
target += result.sum(0).sum(0).sum(0)
|
||||||
|
else:
|
||||||
|
target += result.sum()
|
||||||
|
|
||||||
def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target):
|
def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target):
|
||||||
self._psi_computations(Z, mu, S)
|
self._psi_computations(Z, mu, S)
|
||||||
tmp = dL_dpsi2[:, :, :, None] * (self.ZAinner[:, :, None, :] * (2 * Z)[None, None, :, :])
|
tmp = dL_dpsi2[:, :, :, None] * (self.ZAinner[:, :, None, :] * (2 * Z)[None, None, :, :])
|
||||||
|
|
@ -170,6 +167,15 @@ class Linear(Kernpart):
|
||||||
else:
|
else:
|
||||||
target += tmp.sum()
|
target += tmp.sum()
|
||||||
|
|
||||||
|
def dpsi2_dmuS_new(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
|
||||||
|
tmp = np.zeros((mu.shape[0], Z.shape[0]))
|
||||||
|
self.K(mu,Z,tmp)
|
||||||
|
self.dK_dX(2.*np.sum(dL_dpsi2*tmp[:,None,:],2),mu,Z,target_mu)
|
||||||
|
|
||||||
|
Zs = Z*self.variances
|
||||||
|
Zs_sq = Zs[:,None,:]*Zs[None,:,:]
|
||||||
|
target_S += (dL_dpsi2[:,:,:,None]*Zs_sq[None,:,:,:]).sum(1).sum(1)
|
||||||
|
|
||||||
def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
|
def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
|
||||||
"""Think N,num_inducing,num_inducing,input_dim """
|
"""Think N,num_inducing,num_inducing,input_dim """
|
||||||
self._psi_computations(Z, mu, S)
|
self._psi_computations(Z, mu, S)
|
||||||
|
|
@ -266,7 +272,7 @@ class Linear(Kernpart):
|
||||||
#---------------------------------------#
|
#---------------------------------------#
|
||||||
|
|
||||||
def _K_computations(self, X, X2):
|
def _K_computations(self, X, X2):
|
||||||
if not (np.array_equal(X, self._Xcache) and np.array_equal(X2, self._X2cache)):
|
if not (fast_array_equal(X, self._Xcache) and fast_array_equal(X2, self._X2cache)):
|
||||||
self._Xcache = X.copy()
|
self._Xcache = X.copy()
|
||||||
if X2 is None:
|
if X2 is None:
|
||||||
self._dot_product = tdot(X)
|
self._dot_product = tdot(X)
|
||||||
|
|
@ -277,8 +283,8 @@ class Linear(Kernpart):
|
||||||
|
|
||||||
def _psi_computations(self, Z, mu, S):
|
def _psi_computations(self, Z, mu, S):
|
||||||
# here are the "statistics" for psi1 and psi2
|
# here are the "statistics" for psi1 and psi2
|
||||||
Zv_changed = not (np.array_equal(Z, self._Z) and np.array_equal(self.variances, self._variances))
|
Zv_changed = not (fast_array_equal(Z, self._Z) and fast_array_equal(self.variances, self._variances))
|
||||||
muS_changed = not (np.array_equal(mu, self._mu) and np.array_equal(S, self._S))
|
muS_changed = not (fast_array_equal(mu, self._mu) and fast_array_equal(S, self._S))
|
||||||
if Zv_changed:
|
if Zv_changed:
|
||||||
# Z has changed, compute Z specific stuff
|
# Z has changed, compute Z specific stuff
|
||||||
# self.ZZ = Z[:,None,:]*Z[None,:,:] # num_inducing,num_inducing,input_dim
|
# self.ZZ = Z[:,None,:]*Z[None,:,:] # num_inducing,num_inducing,input_dim
|
||||||
|
|
|
||||||
155
GPy/kern/parts/mlp.py
Normal file
155
GPy/kern/parts/mlp.py
Normal file
|
|
@ -0,0 +1,155 @@
|
||||||
|
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||||
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
from kernpart import Kernpart
|
||||||
|
import numpy as np
|
||||||
|
four_over_tau = 2./np.pi
|
||||||
|
|
||||||
|
class MLP(Kernpart):
    r"""
    multi layer perceptron kernel (also known as arc sine kernel or neural network kernel)

    .. math::

          k(x,y) = \sigma^2 \frac{2}{\pi} \text{asin} \left(\frac{\sigma_w^2 x^\top y+\sigma_b^2}{\sqrt{\sigma_w^2x^\top x + \sigma_b^2 + 1}\sqrt{\sigma_w^2 y^\top y + \sigma_b^2 +1}} \right)

    :param input_dim: the number of input dimensions
    :type input_dim: int
    :param variance: the variance :math:`\sigma^2`
    :type variance: float
    :param weight_variance: the vector of the variances of the prior over input weights in the neural network :math:`\sigma^2_w`
    :type weight_variance: array or list of the appropriate size (or float if there is only one weight variance parameter)
    :param bias_variance: the variance of the prior over bias parameters :math:`\sigma^2_b`
    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter :math:`\sigma^2_w`), otherwise there is one weight variance parameter per dimension. ARD is not implemented yet.
    :type ARD: Boolean
    :rtype: Kernpart object
    """

    def __init__(self, input_dim, variance=1., weight_variance=None, bias_variance=100., ARD=False):
        self.input_dim = input_dim
        self.ARD = ARD
        if not ARD:
            # parameters: variance, one shared weight variance, bias variance
            self.num_params = 3
            if weight_variance is not None:
                weight_variance = np.asarray(weight_variance)
                assert weight_variance.size == 1, "Only one weight variance needed for non-ARD kernel"
            else:
                weight_variance = 100.*np.ones(1)
        else:
            # one weight variance per input dimension (not implemented yet)
            self.num_params = self.input_dim + 2
            if weight_variance is not None:
                weight_variance = np.asarray(weight_variance)
                assert weight_variance.size == self.input_dim, "bad number of weight variances"
            else:
                weight_variance = np.ones(self.input_dim)
            raise NotImplementedError

        self.name = 'mlp'
        self._set_params(np.hstack((variance, weight_variance.flatten(), bias_variance)))

    def _get_params(self):
        """Return the parameters: variance, weight variance(s), bias variance."""
        return np.hstack((self.variance, self.weight_variance.flatten(), self.bias_variance))

    def _set_params(self, x):
        """Unpack the parameter vector: [variance, weight variance(s), bias variance]."""
        assert x.size == (self.num_params)
        self.variance = x[0]
        self.weight_variance = x[1:-1]
        # cached for convenience in gradient expressions
        self.weight_std = np.sqrt(self.weight_variance)
        self.bias_variance = x[-1]

    def _get_param_names(self):
        if self.num_params == 3:
            return ['variance', 'weight_variance', 'bias_variance']
        else:
            # BUGFIX: this previously read self.lengthscale.size, but this
            # class never defines a `lengthscale` attribute.
            return ['variance'] + ['weight_variance_%i' % i for i in range(self.weight_variance.size)] + ['bias_variance']

    def K(self, X, X2, target):
        """Return covariance between X and X2."""
        self._K_computations(X, X2)
        target += self.variance*self._K_dvar

    def Kdiag(self, X, target):
        """Compute the diagonal of the covariance matrix for X."""
        self._K_diag_computations(X)
        target += self.variance*self._K_diag_dvar

    def dK_dtheta(self, dL_dK, X, X2, target):
        """Derivative of the covariance with respect to the parameters.

        target receives [d/dvariance, d/dweight_variance, d/dbias_variance].
        """
        self._K_computations(X, X2)
        denom3 = self._K_denom*self._K_denom*self._K_denom
        # d(asin u)/du = 1/sqrt(1 - u^2), shared by both parameter gradients
        base = four_over_tau*self.variance/np.sqrt(1-self._K_asin_arg*self._K_asin_arg)
        base_cov_grad = base*dL_dK

        if X2 is None:
            vec = np.diag(self._K_inner_prod)
            target[1] += ((self._K_inner_prod/self._K_denom
                           -.5*self._K_numer/denom3
                           *(np.outer((self.weight_variance*vec+self.bias_variance+1.), vec)
                             +np.outer(vec,(self.weight_variance*vec+self.bias_variance+1.))))*base_cov_grad).sum()
            target[2] += ((1./self._K_denom
                           -.5*self._K_numer/denom3
                           *((vec[None, :]+vec[:, None])*self.weight_variance
                             +2.*self.bias_variance + 2.))*base_cov_grad).sum()
        else:
            vec1 = (X*X).sum(1)
            vec2 = (X2*X2).sum(1)
            target[1] += ((self._K_inner_prod/self._K_denom
                           -.5*self._K_numer/denom3
                           *(np.outer((self.weight_variance*vec1+self.bias_variance+1.), vec2) + np.outer(vec1, self.weight_variance*vec2 + self.bias_variance+1.)))*base_cov_grad).sum()
            target[2] += ((1./self._K_denom
                           -.5*self._K_numer/denom3
                           *((vec1[:, None]+vec2[None, :])*self.weight_variance
                             + 2*self.bias_variance + 2.))*base_cov_grad).sum()

        target[0] += np.sum(self._K_dvar*dL_dK)

    def dK_dX(self, dL_dK, X, X2, target):
        """Derivative of the covariance matrix with respect to X"""
        self._K_computations(X, X2)
        arg = self._K_asin_arg
        numer = self._K_numer
        denom = self._K_denom
        vec2 = (X2*X2).sum(1)*self.weight_variance + self.bias_variance + 1.
        denom3 = denom*denom*denom
        target += four_over_tau*self.weight_variance*self.variance*((X2[None, :, :]/denom[:, :, None] - vec2[None, :, None]*X[:, None, :]*(numer/denom3)[:, :, None])*(dL_dK/np.sqrt(1-arg*arg))[:, :, None]).sum(1)

    def dKdiag_dX(self, dL_dKdiag, X, target):
        """Gradient of diagonal of covariance with respect to X"""
        self._K_diag_computations(X)
        arg = self._K_diag_asin_arg
        denom = self._K_diag_denom
        numer = self._K_diag_numer
        target += four_over_tau*2.*self.weight_variance*self.variance*X*(1/denom*(1 - arg)*dL_dKdiag/(np.sqrt(1-arg*arg)))[:, None]

    def _K_computations(self, X, X2):
        """Pre-computations for the covariance matrix (used for computing the covariance and its gradients)."""
        if self.ARD:
            # unreachable: __init__ raises for ARD=True
            pass
        else:
            if X2 is None:
                self._K_inner_prod = np.dot(X,X.T)
                self._K_numer = self._K_inner_prod*self.weight_variance+self.bias_variance
                vec = np.diag(self._K_numer) + 1.
                self._K_denom = np.sqrt(np.outer(vec,vec))
                self._K_asin_arg = self._K_numer/self._K_denom
                self._K_dvar = four_over_tau*np.arcsin(self._K_asin_arg)
            else:
                self._K_inner_prod = np.dot(X,X2.T)
                self._K_numer = self._K_inner_prod*self.weight_variance + self.bias_variance
                vec1 = (X*X).sum(1)*self.weight_variance + self.bias_variance + 1.
                vec2 = (X2*X2).sum(1)*self.weight_variance + self.bias_variance + 1.
                self._K_denom = np.sqrt(np.outer(vec1,vec2))
                self._K_asin_arg = self._K_numer/self._K_denom
                self._K_dvar = four_over_tau*np.arcsin(self._K_asin_arg)

    def _K_diag_computations(self, X):
        """Pre-computations concerning the diagonal terms (used for computation of diagonal and its gradients)."""
        if self.ARD:
            # unreachable: __init__ raises for ARD=True
            pass
        else:
            self._K_diag_numer = (X*X).sum(1)*self.weight_variance + self.bias_variance
            self._K_diag_denom = self._K_diag_numer+1.
            self._K_diag_asin_arg = self._K_diag_numer/self._K_diag_denom
            self._K_diag_dvar = four_over_tau*np.arcsin(self._K_diag_asin_arg)
|
||||||
135
GPy/kern/parts/poly.py
Normal file
135
GPy/kern/parts/poly.py
Normal file
|
|
@ -0,0 +1,135 @@
|
||||||
|
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||||
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
from kernpart import Kernpart
|
||||||
|
import numpy as np
|
||||||
|
four_over_tau = 2./np.pi
|
||||||
|
|
||||||
|
class POLY(Kernpart):
    r"""
    polynomial kernel parameter initialisation. Included for completeness, but generally not recommended, is the polynomial kernel,

    .. math::

          k(x, y) = \sigma^2(\sigma_w^2 x^\top y+\sigma_b^2)^d

    The kernel parameters are :math:`\sigma^2` (variance), :math:`\sigma^2_w`
    (weight_variance), :math:`\sigma^2_b` (bias_variance) and d
    (degree). Only gradients of the first three are provided for
    kernel optimisation, it is assumed that polynomial degree would
    be set by hand.

    The kernel is not recommended as it is badly behaved when the
    :math:`\sigma^2_w x^\top y + \sigma^2_b` term has a magnitude greater than one. For completeness
    there is an automatic relevance determination version of this
    kernel provided (but not implemented yet).

    :param input_dim: the number of input dimensions
    :type input_dim: int
    :param variance: the variance :math:`\sigma^2`
    :type variance: float
    :param weight_variance: the vector of the variances of the prior over input weights in the neural network :math:`\sigma^2_w`
    :type weight_variance: array or list of the appropriate size (or float if there is only one weight variance parameter)
    :param bias_variance: the variance of the prior over bias parameters :math:`\sigma^2_b`
    :param degree: the degree of the polynomial.
    :type degree: int
    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter :math:`\sigma^2_w`), otherwise there is one weight variance parameter per dimension.
    :type ARD: Boolean
    :rtype: Kernpart object
    """

    def __init__(self, input_dim, variance=1., weight_variance=None, bias_variance=1., degree=2, ARD=False):
        self.input_dim = input_dim
        self.ARD = ARD
        if not ARD:
            # parameters: variance, one shared weight variance, bias variance
            self.num_params = 3
            if weight_variance is not None:
                weight_variance = np.asarray(weight_variance)
                assert weight_variance.size == 1, "Only one weight variance needed for non-ARD kernel"
            else:
                weight_variance = 1.*np.ones(1)
        else:
            # one weight variance per input dimension (not implemented yet)
            self.num_params = self.input_dim + 2
            if weight_variance is not None:
                weight_variance = np.asarray(weight_variance)
                assert weight_variance.size == self.input_dim, "bad number of weight variances"
            else:
                weight_variance = np.ones(self.input_dim)
            raise NotImplementedError
        # the degree is fixed at construction; no gradient is provided for it
        self.degree = degree
        self.name = 'poly_deg' + str(self.degree)
        self._set_params(np.hstack((variance, weight_variance.flatten(), bias_variance)))

    def _get_params(self):
        """Return the parameters: variance, weight variance(s), bias variance."""
        return np.hstack((self.variance, self.weight_variance.flatten(), self.bias_variance))

    def _set_params(self, x):
        """Unpack the parameter vector: [variance, weight variance(s), bias variance]."""
        assert x.size == (self.num_params)
        self.variance = x[0]
        self.weight_variance = x[1:-1]
        # cached for convenience in gradient expressions
        self.weight_std = np.sqrt(self.weight_variance)
        self.bias_variance = x[-1]

    def _get_param_names(self):
        if self.num_params == 3:
            return ['variance', 'weight_variance', 'bias_variance']
        else:
            # BUGFIX: this previously read self.lengthscale.size, but this
            # class never defines a `lengthscale` attribute.
            return ['variance'] + ['weight_variance_%i' % i for i in range(self.weight_variance.size)] + ['bias_variance']

    def K(self, X, X2, target):
        """Return covariance between X and X2."""
        self._K_computations(X, X2)
        target += self.variance*self._K_dvar

    def Kdiag(self, X, target):
        """Compute the diagonal of the covariance matrix for X."""
        self._K_diag_computations(X)
        target += self.variance*self._K_diag_dvar

    def dK_dtheta(self, dL_dK, X, X2, target):
        """Derivative of the covariance with respect to the parameters.

        target receives [d/dvariance, d/dweight_variance, d/dbias_variance];
        no gradient is provided for the degree.
        """
        self._K_computations(X, X2)
        # d/darg of arg^degree, shared by the weight and bias gradients
        base = self.variance*self.degree*self._K_poly_arg**(self.degree-1)
        base_cov_grad = base*dL_dK

        target[0] += np.sum(self._K_dvar*dL_dK)
        target[1] += (self._K_inner_prod*base_cov_grad).sum()
        target[2] += base_cov_grad.sum()

    def dK_dX(self, dL_dK, X, X2, target):
        """Derivative of the covariance matrix with respect to X"""
        self._K_computations(X, X2)
        arg = self._K_poly_arg
        target += self.weight_variance*self.degree*self.variance*(((X2[None,:, :])) *(arg**(self.degree-1))[:, :, None]*dL_dK[:, :, None]).sum(1)

    def dKdiag_dX(self, dL_dKdiag, X, target):
        """Gradient of diagonal of covariance with respect to X"""
        self._K_diag_computations(X)
        arg = self._K_diag_poly_arg
        target += 2.*self.weight_variance*self.degree*self.variance*X*dL_dKdiag[:, None]*(arg**(self.degree-1))[:, None]

    def _K_computations(self, X, X2):
        """Pre-computations for the covariance (the polynomial argument and its power)."""
        if self.ARD:
            # unreachable: __init__ raises for ARD=True
            pass
        else:
            if X2 is None:
                self._K_inner_prod = np.dot(X,X.T)
            else:
                self._K_inner_prod = np.dot(X,X2.T)
            self._K_poly_arg = self._K_inner_prod*self.weight_variance + self.bias_variance
            self._K_dvar = self._K_poly_arg**self.degree

    def _K_diag_computations(self, X):
        """Pre-computations for the diagonal of the covariance."""
        if self.ARD:
            # unreachable: __init__ raises for ARD=True
            pass
        else:
            self._K_diag_poly_arg = (X*X).sum(1)*self.weight_variance + self.bias_variance
            self._K_diag_dvar = self._K_diag_poly_arg**self.degree
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -51,8 +51,18 @@ class Prod(Kernpart):
|
||||||
self._K_computations(X,X2)
|
self._K_computations(X,X2)
|
||||||
target += self._K1 * self._K2
|
target += self._K1 * self._K2
|
||||||
|
|
||||||
|
def K1(self, X, X2):
    """Evaluate and return only the first factor (k1) of the product kernel."""
    # refresh the cached per-part covariances for these inputs
    self._K_computations(X, X2)
    return self._K1
|
||||||
|
|
||||||
|
def K2(self, X, X2):
    """Evaluate and return only the second factor (k2) of the product kernel."""
    # refresh the cached per-part covariances for these inputs
    self._K_computations(X, X2)
    return self._K2
|
||||||
|
|
||||||
def dK_dtheta(self,dL_dK,X,X2,target):
|
def dK_dtheta(self,dL_dK,X,X2,target):
|
||||||
"""derivative of the covariance matrix with respect to the parameters."""
|
"""Derivative of the covariance matrix with respect to the parameters."""
|
||||||
self._K_computations(X,X2)
|
self._K_computations(X,X2)
|
||||||
if X2 is None:
|
if X2 is None:
|
||||||
self.k1.dK_dtheta(dL_dK*self._K2, X[:,self.slice1], None, target[:self.k1.num_params])
|
self.k1.dK_dtheta(dL_dK*self._K2, X[:,self.slice1], None, target[:self.k1.num_params])
|
||||||
|
|
@ -80,8 +90,8 @@ class Prod(Kernpart):
|
||||||
def dK_dX(self,dL_dK,X,X2,target):
|
def dK_dX(self,dL_dK,X,X2,target):
|
||||||
"""derivative of the covariance matrix with respect to X."""
|
"""derivative of the covariance matrix with respect to X."""
|
||||||
self._K_computations(X,X2)
|
self._K_computations(X,X2)
|
||||||
self.k1.dK_dX(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target)
|
self.k1.dK_dX(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:,self.slice1])
|
||||||
self.k2.dK_dX(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target)
|
self.k2.dK_dX(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[:,self.slice2])
|
||||||
|
|
||||||
def dKdiag_dX(self, dL_dKdiag, X, target):
|
def dKdiag_dX(self, dL_dKdiag, X, target):
|
||||||
K1 = np.zeros(X.shape[0])
|
K1 = np.zeros(X.shape[0])
|
||||||
|
|
@ -89,8 +99,8 @@ class Prod(Kernpart):
|
||||||
self.k1.Kdiag(X[:,self.slice1],K1)
|
self.k1.Kdiag(X[:,self.slice1],K1)
|
||||||
self.k2.Kdiag(X[:,self.slice2],K2)
|
self.k2.Kdiag(X[:,self.slice2],K2)
|
||||||
|
|
||||||
self.k1.dK_dX(dL_dKdiag*K2, X[:,self.slice1], target)
|
self.k1.dK_dX(dL_dKdiag*K2, X[:,self.slice1], target[:,self.slice1])
|
||||||
self.k2.dK_dX(dL_dKdiag*K1, X[:,self.slice2], target)
|
self.k2.dK_dX(dL_dKdiag*K1, X[:,self.slice2], target[:,self.slice2])
|
||||||
|
|
||||||
def _K_computations(self,X,X2):
|
def _K_computations(self,X,X2):
|
||||||
if not (np.array_equal(X,self._X) and np.array_equal(X2,self._X2) and np.array_equal(self._params , self._get_params())):
|
if not (np.array_equal(X,self._X) and np.array_equal(X2,self._X2) and np.array_equal(self._params , self._get_params())):
|
||||||
|
|
|
||||||
|
|
@ -4,9 +4,9 @@
|
||||||
|
|
||||||
from kernpart import Kernpart
|
from kernpart import Kernpart
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import hashlib
|
|
||||||
from scipy import weave
|
from scipy import weave
|
||||||
from ...util.linalg import tdot
|
from ...util.linalg import tdot
|
||||||
|
from ...util.misc import fast_array_equal
|
||||||
|
|
||||||
class RBF(Kernpart):
|
class RBF(Kernpart):
|
||||||
"""
|
"""
|
||||||
|
|
@ -111,7 +111,7 @@ class RBF(Kernpart):
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
num_data, num_inducing, input_dim = X.shape[0], X.shape[0], self.input_dim
|
num_data, num_inducing, input_dim = X.shape[0], X.shape[0], self.input_dim
|
||||||
weave.inline(code, arg_names=['num_data','num_inducing','input_dim','X','X2','target','dvardLdK','var_len3'], type_converters=weave.converters.blitz, **self.weave_options)
|
weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'X2', 'target', 'dvardLdK', 'var_len3'], type_converters=weave.converters.blitz, **self.weave_options)
|
||||||
else:
|
else:
|
||||||
code = """
|
code = """
|
||||||
int q,i,j;
|
int q,i,j;
|
||||||
|
|
@ -127,8 +127,8 @@ class RBF(Kernpart):
|
||||||
}
|
}
|
||||||
"""
|
"""
|
||||||
num_data, num_inducing, input_dim = X.shape[0], X2.shape[0], self.input_dim
|
num_data, num_inducing, input_dim = X.shape[0], X2.shape[0], self.input_dim
|
||||||
#[np.add(target[1+q:2+q],var_len3[q]*np.sum(dvardLdK*np.square(X[:,q][:,None]-X2[:,q][None,:])),target[1+q:2+q]) for q in range(self.input_dim)]
|
# [np.add(target[1+q:2+q],var_len3[q]*np.sum(dvardLdK*np.square(X[:,q][:,None]-X2[:,q][None,:])),target[1+q:2+q]) for q in range(self.input_dim)]
|
||||||
weave.inline(code, arg_names=['num_data','num_inducing','input_dim','X','X2','target','dvardLdK','var_len3'], type_converters=weave.converters.blitz, **self.weave_options)
|
weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'X2', 'target', 'dvardLdK', 'var_len3'], type_converters=weave.converters.blitz, **self.weave_options)
|
||||||
else:
|
else:
|
||||||
target[1] += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dK)
|
target[1] += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dK)
|
||||||
|
|
||||||
|
|
@ -165,9 +165,8 @@ class RBF(Kernpart):
|
||||||
|
|
||||||
def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target):
|
def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target):
|
||||||
self._psi_computations(Z, mu, S)
|
self._psi_computations(Z, mu, S)
|
||||||
denom_deriv = S[:, None, :] / (self.lengthscale ** 3 + self.lengthscale * S[:, None, :])
|
|
||||||
d_length = self._psi1[:, :, None] * (self.lengthscale * np.square(self._psi1_dist / (self.lengthscale2 + S[:, None, :])) + denom_deriv)
|
|
||||||
target[0] += np.sum(dL_dpsi1 * self._psi1 / self.variance)
|
target[0] += np.sum(dL_dpsi1 * self._psi1 / self.variance)
|
||||||
|
d_length = self._psi1[:,:,None] * ((self._psi1_dist_sq - 1.)/(self.lengthscale*self._psi1_denom) +1./self.lengthscale)
|
||||||
dpsi1_dlength = d_length * dL_dpsi1[:, :, None]
|
dpsi1_dlength = d_length * dL_dpsi1[:, :, None]
|
||||||
if not self.ARD:
|
if not self.ARD:
|
||||||
target[1] += dpsi1_dlength.sum()
|
target[1] += dpsi1_dlength.sum()
|
||||||
|
|
@ -222,9 +221,10 @@ class RBF(Kernpart):
|
||||||
#---------------------------------------#
|
#---------------------------------------#
|
||||||
|
|
||||||
def _K_computations(self, X, X2):
|
def _K_computations(self, X, X2):
|
||||||
if not (np.array_equal(X, self._X) and np.array_equal(X2, self._X2) and np.array_equal(self._params , self._get_params())):
|
params = self._get_params()
|
||||||
|
if not (fast_array_equal(X, self._X) and fast_array_equal(X2, self._X2) and fast_array_equal(self._params , params)):
|
||||||
self._X = X.copy()
|
self._X = X.copy()
|
||||||
self._params == self._get_params().copy()
|
self._params = params.copy()
|
||||||
if X2 is None:
|
if X2 is None:
|
||||||
self._X2 = None
|
self._X2 = None
|
||||||
X = X / self.lengthscale
|
X = X / self.lengthscale
|
||||||
|
|
@ -239,42 +239,42 @@ class RBF(Kernpart):
|
||||||
|
|
||||||
def _psi_computations(self, Z, mu, S):
|
def _psi_computations(self, Z, mu, S):
|
||||||
# here are the "statistics" for psi1 and psi2
|
# here are the "statistics" for psi1 and psi2
|
||||||
if not np.array_equal(Z, self._Z):
|
Z_changed = not fast_array_equal(Z, self._Z)
|
||||||
#Z has changed, compute Z specific stuff
|
if Z_changed:
|
||||||
self._psi2_Zhat = 0.5*(Z[:,None,:] +Z[None,:,:]) # M,M,Q
|
# Z has changed, compute Z specific stuff
|
||||||
self._psi2_Zdist = 0.5*(Z[:,None,:]-Z[None,:,:]) # M,M,Q
|
self._psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
|
||||||
self._psi2_Zdist_sq = np.square(self._psi2_Zdist/self.lengthscale) # M,M,Q
|
self._psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
|
||||||
self._Z = Z
|
self._psi2_Zdist_sq = np.square(self._psi2_Zdist / self.lengthscale) # M,M,Q
|
||||||
|
|
||||||
if not (np.array_equal(Z, self._Z) and np.array_equal(mu, self._mu) and np.array_equal(S, self._S)):
|
if Z_changed or not fast_array_equal(mu, self._mu) or not fast_array_equal(S, self._S):
|
||||||
#something's changed. recompute EVERYTHING
|
# something's changed. recompute EVERYTHING
|
||||||
|
|
||||||
#psi1
|
# psi1
|
||||||
self._psi1_denom = S[:,None,:]/self.lengthscale2 + 1.
|
self._psi1_denom = S[:, None, :] / self.lengthscale2 + 1.
|
||||||
self._psi1_dist = Z[None,:,:]-mu[:,None,:]
|
self._psi1_dist = Z[None, :, :] - mu[:, None, :]
|
||||||
self._psi1_dist_sq = np.square(self._psi1_dist)/self.lengthscale2/self._psi1_denom
|
self._psi1_dist_sq = np.square(self._psi1_dist) / self.lengthscale2 / self._psi1_denom
|
||||||
self._psi1_exponent = -0.5*np.sum(self._psi1_dist_sq+np.log(self._psi1_denom),-1)
|
self._psi1_exponent = -0.5 * np.sum(self._psi1_dist_sq + np.log(self._psi1_denom), -1)
|
||||||
self._psi1 = self.variance*np.exp(self._psi1_exponent)
|
self._psi1 = self.variance * np.exp(self._psi1_exponent)
|
||||||
|
|
||||||
#psi2
|
# psi2
|
||||||
self._psi2_denom = 2.*S[:,None,None,:]/self.lengthscale2+1. # N,M,M,Q
|
self._psi2_denom = 2.*S[:, None, None, :] / self.lengthscale2 + 1. # N,M,M,Q
|
||||||
self._psi2_mudist, self._psi2_mudist_sq, self._psi2_exponent, _ = self.weave_psi2(mu,self._psi2_Zhat)
|
self._psi2_mudist, self._psi2_mudist_sq, self._psi2_exponent, _ = self.weave_psi2(mu, self._psi2_Zhat)
|
||||||
#self._psi2_mudist = mu[:,None,None,:]-self._psi2_Zhat #N,M,M,Q
|
# self._psi2_mudist = mu[:,None,None,:]-self._psi2_Zhat #N,M,M,Q
|
||||||
#self._psi2_mudist_sq = np.square(self._psi2_mudist)/(self.lengthscale2*self._psi2_denom)
|
# self._psi2_mudist_sq = np.square(self._psi2_mudist)/(self.lengthscale2*self._psi2_denom)
|
||||||
#self._psi2_exponent = np.sum(-self._psi2_Zdist_sq -self._psi2_mudist_sq -0.5*np.log(self._psi2_denom),-1) #N,M,M,Q
|
# self._psi2_exponent = np.sum(-self._psi2_Zdist_sq -self._psi2_mudist_sq -0.5*np.log(self._psi2_denom),-1) #N,M,M,Q
|
||||||
self._psi2 = np.square(self.variance)*np.exp(self._psi2_exponent) # N,M,M,Q
|
self._psi2 = np.square(self.variance) * np.exp(self._psi2_exponent) # N,M,M,Q
|
||||||
|
|
||||||
#store matrices for caching
|
# store matrices for caching
|
||||||
self._Z, self._mu, self._S = Z, mu,S
|
self._Z, self._mu, self._S = Z, mu, S
|
||||||
|
|
||||||
def weave_psi2(self,mu,Zhat):
|
def weave_psi2(self, mu, Zhat):
|
||||||
N,input_dim = mu.shape
|
N, input_dim = mu.shape
|
||||||
num_inducing = Zhat.shape[0]
|
num_inducing = Zhat.shape[0]
|
||||||
|
|
||||||
mudist = np.empty((N,num_inducing,num_inducing,input_dim))
|
mudist = np.empty((N, num_inducing, num_inducing, input_dim))
|
||||||
mudist_sq = np.empty((N,num_inducing,num_inducing,input_dim))
|
mudist_sq = np.empty((N, num_inducing, num_inducing, input_dim))
|
||||||
psi2_exponent = np.zeros((N,num_inducing,num_inducing))
|
psi2_exponent = np.zeros((N, num_inducing, num_inducing))
|
||||||
psi2 = np.empty((N,num_inducing,num_inducing))
|
psi2 = np.empty((N, num_inducing, num_inducing))
|
||||||
|
|
||||||
psi2_Zdist_sq = self._psi2_Zdist_sq
|
psi2_Zdist_sq = self._psi2_Zdist_sq
|
||||||
_psi2_denom = self._psi2_denom.squeeze().reshape(N, self.input_dim)
|
_psi2_denom = self._psi2_denom.squeeze().reshape(N, self.input_dim)
|
||||||
|
|
@ -324,7 +324,7 @@ class RBF(Kernpart):
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
"""
|
"""
|
||||||
weave.inline(code, support_code=support_code, libraries=['gomp'],
|
weave.inline(code, support_code=support_code, libraries=['gomp'],
|
||||||
arg_names=['N','num_inducing','input_dim','mu','Zhat','mudist_sq','mudist','lengthscale2','_psi2_denom','psi2_Zdist_sq','psi2_exponent','half_log_psi2_denom','psi2','variance_sq'],
|
arg_names=['N', 'num_inducing', 'input_dim', 'mu', 'Zhat', 'mudist_sq', 'mudist', 'lengthscale2', '_psi2_denom', 'psi2_Zdist_sq', 'psi2_exponent', 'half_log_psi2_denom', 'psi2', 'variance_sq'],
|
||||||
type_converters=weave.converters.blitz, **self.weave_options)
|
type_converters=weave.converters.blitz, **self.weave_options)
|
||||||
|
|
||||||
return mudist, mudist_sq, psi2_exponent, psi2
|
return mudist, mudist_sq, psi2_exponent, psi2
|
||||||
|
|
|
||||||
322
GPy/kern/parts/rbf_inv.py
Normal file
322
GPy/kern/parts/rbf_inv.py
Normal file
|
|
@ -0,0 +1,322 @@
|
||||||
|
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||||
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
|
||||||
|
from rbf import RBF
|
||||||
|
import numpy as np
|
||||||
|
import hashlib
|
||||||
|
from scipy import weave
|
||||||
|
from ...util.linalg import tdot
|
||||||
|
|
||||||
|
class RBFInv(RBF):
|
||||||
|
"""
|
||||||
|
Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel. It only
|
||||||
|
differs from RBF in that here the parametrization is wrt the inverse lengthscale:
|
||||||
|
|
||||||
|
.. math::
|
||||||
|
|
||||||
|
k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \ \ \ \ \ \\text{ where } r^2 = \sum_{i=1}^d \\frac{ (x_i-x^\prime_i)^2}{\ell_i^2}
|
||||||
|
|
||||||
|
where \ell_i is the lengthscale, \sigma^2 the variance and d the dimensionality of the input.
|
||||||
|
|
||||||
|
:param input_dim: the number of input dimensions
|
||||||
|
:type input_dim: int
|
||||||
|
:param variance: the variance of the kernel
|
||||||
|
:type variance: float
|
||||||
|
:param lengthscale: the vector of lengthscale of the kernel
|
||||||
|
:type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
|
||||||
|
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
|
||||||
|
:type ARD: Boolean
|
||||||
|
:rtype: kernel object
|
||||||
|
|
||||||
|
.. Note: this object implements both the ARD and 'spherical' version of the function
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, input_dim, variance=1., inv_lengthscale=None, ARD=False):
|
||||||
|
self.input_dim = input_dim
|
||||||
|
self.name = 'rbf_inv'
|
||||||
|
self.ARD = ARD
|
||||||
|
if not ARD:
|
||||||
|
self.num_params = 2
|
||||||
|
if inv_lengthscale is not None:
|
||||||
|
inv_lengthscale = np.asarray(inv_lengthscale)
|
||||||
|
assert inv_lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
|
||||||
|
else:
|
||||||
|
inv_lengthscale = np.ones(1)
|
||||||
|
else:
|
||||||
|
self.num_params = self.input_dim + 1
|
||||||
|
if inv_lengthscale is not None:
|
||||||
|
inv_lengthscale = np.asarray(inv_lengthscale)
|
||||||
|
assert inv_lengthscale.size == self.input_dim, "bad number of lengthscales"
|
||||||
|
else:
|
||||||
|
inv_lengthscale = np.ones(self.input_dim)
|
||||||
|
|
||||||
|
self._set_params(np.hstack((variance, inv_lengthscale.flatten())))
|
||||||
|
|
||||||
|
# initialize cache
|
||||||
|
self._Z, self._mu, self._S = np.empty(shape=(3, 1))
|
||||||
|
self._X, self._X2, self._params = np.empty(shape=(3, 1))
|
||||||
|
|
||||||
|
# a set of optional args to pass to weave
|
||||||
|
self.weave_options = {'headers' : ['<omp.h>'],
|
||||||
|
'extra_compile_args': ['-fopenmp -O3'], # -march=native'],
|
||||||
|
'extra_link_args' : ['-lgomp']}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _get_params(self):
|
||||||
|
return np.hstack((self.variance, self.inv_lengthscale))
|
||||||
|
|
||||||
|
def _set_params(self, x):
|
||||||
|
assert x.size == (self.num_params)
|
||||||
|
self.variance = x[0]
|
||||||
|
self.inv_lengthscale = x[1:]
|
||||||
|
self.inv_lengthscale2 = np.square(self.inv_lengthscale)
|
||||||
|
# TODO: We can rewrite everything with inv_lengthscale and never need to do the below
|
||||||
|
self.lengthscale = 1. / self.inv_lengthscale
|
||||||
|
self.lengthscale2 = np.square(self.lengthscale)
|
||||||
|
# reset cached results
|
||||||
|
self._X, self._X2, self._params = np.empty(shape=(3, 1))
|
||||||
|
self._Z, self._mu, self._S = np.empty(shape=(3, 1)) # cached versions of Z,mu,S
|
||||||
|
|
||||||
|
def _get_param_names(self):
|
||||||
|
if self.num_params == 2:
|
||||||
|
return ['variance', 'inv_lengthscale']
|
||||||
|
else:
|
||||||
|
return ['variance'] + ['inv_lengthscale%i' % i for i in range(self.inv_lengthscale.size)]
|
||||||
|
|
||||||
|
# TODO: Rewrite computations so that lengthscale is not needed (but only inv. lengthscale)
|
||||||
|
def dK_dtheta(self, dL_dK, X, X2, target):
|
||||||
|
self._K_computations(X, X2)
|
||||||
|
target[0] += np.sum(self._K_dvar * dL_dK)
|
||||||
|
if self.ARD:
|
||||||
|
dvardLdK = self._K_dvar * dL_dK
|
||||||
|
var_len3 = self.variance / np.power(self.lengthscale, 3)
|
||||||
|
len2 = self.lengthscale2
|
||||||
|
if X2 is None:
|
||||||
|
# save computation for the symmetrical case
|
||||||
|
dvardLdK = dvardLdK + dvardLdK.T
|
||||||
|
code = """
|
||||||
|
int q,i,j;
|
||||||
|
double tmp;
|
||||||
|
for(q=0; q<input_dim; q++){
|
||||||
|
tmp = 0;
|
||||||
|
for(i=0; i<num_data; i++){
|
||||||
|
for(j=0; j<i; j++){
|
||||||
|
tmp += (X(i,q)-X(j,q))*(X(i,q)-X(j,q))*dvardLdK(i,j);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
target(q+1) += var_len3(q)*tmp*(-len2(q));
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
num_data, num_inducing, input_dim = X.shape[0], X.shape[0], self.input_dim
|
||||||
|
weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'X2', 'target', 'dvardLdK', 'var_len3', 'len2'], type_converters=weave.converters.blitz, **self.weave_options)
|
||||||
|
else:
|
||||||
|
code = """
|
||||||
|
int q,i,j;
|
||||||
|
double tmp;
|
||||||
|
for(q=0; q<input_dim; q++){
|
||||||
|
tmp = 0;
|
||||||
|
for(i=0; i<num_data; i++){
|
||||||
|
for(j=0; j<num_inducing; j++){
|
||||||
|
tmp += (X(i,q)-X2(j,q))*(X(i,q)-X2(j,q))*dvardLdK(i,j);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
target(q+1) += var_len3(q)*tmp*(-len2(q));
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
num_data, num_inducing, input_dim = X.shape[0], X2.shape[0], self.input_dim
|
||||||
|
# [np.add(target[1+q:2+q],var_len3[q]*np.sum(dvardLdK*np.square(X[:,q][:,None]-X2[:,q][None,:])),target[1+q:2+q]) for q in range(self.input_dim)]
|
||||||
|
weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'X2', 'target', 'dvardLdK', 'var_len3', 'len2'], type_converters=weave.converters.blitz, **self.weave_options)
|
||||||
|
else:
|
||||||
|
target[1] += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dK) * (-self.lengthscale2)
|
||||||
|
|
||||||
|
def dK_dX(self, dL_dK, X, X2, target):
|
||||||
|
self._K_computations(X, X2)
|
||||||
|
_K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
|
||||||
|
dK_dX = (-self.variance * self.inv_lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2))
|
||||||
|
target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
|
||||||
|
|
||||||
|
def dKdiag_dX(self, dL_dKdiag, X, target):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
#---------------------------------------#
|
||||||
|
# PSI statistics #
|
||||||
|
#---------------------------------------#
|
||||||
|
|
||||||
|
# def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target):
|
||||||
|
# self._psi_computations(Z, mu, S)
|
||||||
|
# denom_deriv = S[:, None, :] / (self.lengthscale ** 3 + self.lengthscale * S[:, None, :])
|
||||||
|
# d_length = self._psi1[:, :, None] * (self.lengthscale * np.square(self._psi1_dist / (self.lengthscale2 + S[:, None, :])) + denom_deriv)
|
||||||
|
# target[0] += np.sum(dL_dpsi1 * self._psi1 / self.variance)
|
||||||
|
# dpsi1_dlength = d_length * dL_dpsi1[:, :, None]
|
||||||
|
# if not self.ARD:
|
||||||
|
# target[1] += dpsi1_dlength.sum()*(-self.lengthscale2)
|
||||||
|
# else:
|
||||||
|
# target[1:] += dpsi1_dlength.sum(0).sum(0)*(-self.lengthscale2)
|
||||||
|
# #target[1:] = target[1:]*(-self.lengthscale2)
|
||||||
|
|
||||||
|
def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target):
|
||||||
|
self._psi_computations(Z, mu, S)
|
||||||
|
tmp = 1 + S[:, None, :] * self.inv_lengthscale2
|
||||||
|
# d_inv_length_old = -self._psi1[:, :, None] * ((self._psi1_dist_sq - 1.) / (self.lengthscale * self._psi1_denom) + self.inv_lengthscale) / self.inv_lengthscale2
|
||||||
|
d_length = -(self._psi1[:, :, None] * ((np.square(self._psi1_dist) * self.inv_lengthscale) / (tmp ** 2) + (S[:, None, :] * self.inv_lengthscale) / (tmp)))
|
||||||
|
# d_inv_length = -self._psi1[:, :, None] * ((self._psi1_dist_sq - 1.) / self._psi1_denom + self.lengthscale)
|
||||||
|
target[0] += np.sum(dL_dpsi1 * self._psi1 / self.variance)
|
||||||
|
dpsi1_dlength = d_length * dL_dpsi1[:, :, None]
|
||||||
|
if not self.ARD:
|
||||||
|
target[1] += dpsi1_dlength.sum() # *(-self.lengthscale2)
|
||||||
|
else:
|
||||||
|
target[1:] += dpsi1_dlength.sum(0).sum(0) # *(-self.lengthscale2)
|
||||||
|
# target[1:] = target[1:]*(-self.lengthscale2)
|
||||||
|
|
||||||
|
def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target):
|
||||||
|
self._psi_computations(Z, mu, S)
|
||||||
|
dpsi1_dZ = -self._psi1[:, :, None] * ((self.inv_lengthscale2 * self._psi1_dist) / self._psi1_denom)
|
||||||
|
target += np.sum(dL_dpsi1[:, :, None] * dpsi1_dZ, 0)
|
||||||
|
|
||||||
|
def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S):
|
||||||
|
self._psi_computations(Z, mu, S)
|
||||||
|
tmp = (self._psi1[:, :, None] * self.inv_lengthscale2) / self._psi1_denom
|
||||||
|
target_mu += np.sum(dL_dpsi1[:, :, None] * tmp * self._psi1_dist, 1)
|
||||||
|
target_S += np.sum(dL_dpsi1[:, :, None] * 0.5 * tmp * (self._psi1_dist_sq - 1), 1)
|
||||||
|
|
||||||
|
def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target):
|
||||||
|
"""Shape N,num_inducing,num_inducing,Ntheta"""
|
||||||
|
self._psi_computations(Z, mu, S)
|
||||||
|
d_var = 2.*self._psi2 / self.variance
|
||||||
|
# d_length = 2.*self._psi2[:, :, :, None] * (self._psi2_Zdist_sq * self._psi2_denom + self._psi2_mudist_sq + S[:, None, None, :] / self.lengthscale2) / (self.lengthscale * self._psi2_denom)
|
||||||
|
d_length = -2.*self._psi2[:, :, :, None] * (self._psi2_Zdist_sq * self._psi2_denom + self._psi2_mudist_sq + S[:, None, None, :] * self.inv_lengthscale2) / (self.inv_lengthscale * self._psi2_denom)
|
||||||
|
target[0] += np.sum(dL_dpsi2 * d_var)
|
||||||
|
dpsi2_dlength = d_length * dL_dpsi2[:, :, :, None]
|
||||||
|
if not self.ARD:
|
||||||
|
target[1] += dpsi2_dlength.sum() # *(-self.lengthscale2)
|
||||||
|
else:
|
||||||
|
target[1:] += dpsi2_dlength.sum(0).sum(0).sum(0) # *(-self.lengthscale2)
|
||||||
|
# target[1:] = target[1:]*(-self.lengthscale2)
|
||||||
|
|
||||||
|
def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target):
|
||||||
|
self._psi_computations(Z, mu, S)
|
||||||
|
term1 = self._psi2_Zdist * self.inv_lengthscale2 # num_inducing, num_inducing, input_dim
|
||||||
|
term2 = (self._psi2_mudist * self.inv_lengthscale2) / self._psi2_denom # N, num_inducing, num_inducing, input_dim
|
||||||
|
dZ = self._psi2[:, :, :, None] * (term1[None] + term2)
|
||||||
|
target += (dL_dpsi2[:, :, :, None] * dZ).sum(0).sum(0)
|
||||||
|
|
||||||
|
def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
|
||||||
|
"""Think N,num_inducing,num_inducing,input_dim """
|
||||||
|
self._psi_computations(Z, mu, S)
|
||||||
|
tmp = (self.inv_lengthscale2 * self._psi2[:, :, :, None]) / self._psi2_denom
|
||||||
|
target_mu += -2.*(dL_dpsi2[:, :, :, None] * tmp * self._psi2_mudist).sum(1).sum(1)
|
||||||
|
target_S += (dL_dpsi2[:, :, :, None] * tmp * (2.*self._psi2_mudist_sq - 1)).sum(1).sum(1)
|
||||||
|
|
||||||
|
#---------------------------------------#
|
||||||
|
# Precomputations #
|
||||||
|
#---------------------------------------#
|
||||||
|
|
||||||
|
def _K_computations(self, X, X2):
|
||||||
|
if not (np.array_equal(X, self._X) and np.array_equal(X2, self._X2) and np.array_equal(self._params , self._get_params())):
|
||||||
|
self._X = X.copy()
|
||||||
|
self._params = self._get_params().copy()
|
||||||
|
if X2 is None:
|
||||||
|
self._X2 = None
|
||||||
|
X = X * self.inv_lengthscale
|
||||||
|
Xsquare = np.sum(np.square(X), 1)
|
||||||
|
self._K_dist2 = -2.*tdot(X) + (Xsquare[:, None] + Xsquare[None, :])
|
||||||
|
else:
|
||||||
|
self._X2 = X2.copy()
|
||||||
|
X = X * self.inv_lengthscale
|
||||||
|
X2 = X2 * self.inv_lengthscale
|
||||||
|
self._K_dist2 = -2.*np.dot(X, X2.T) + (np.sum(np.square(X), 1)[:, None] + np.sum(np.square(X2), 1)[None, :])
|
||||||
|
self._K_dvar = np.exp(-0.5 * self._K_dist2)
|
||||||
|
|
||||||
|
def _psi_computations(self, Z, mu, S):
|
||||||
|
# here are the "statistics" for psi1 and psi2
|
||||||
|
if not np.array_equal(Z, self._Z):
|
||||||
|
# Z has changed, compute Z specific stuff
|
||||||
|
self._psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
|
||||||
|
self._psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
|
||||||
|
self._psi2_Zdist_sq = np.square(self._psi2_Zdist * self.inv_lengthscale) # M,M,Q
|
||||||
|
|
||||||
|
if not (np.array_equal(Z, self._Z) and np.array_equal(mu, self._mu) and np.array_equal(S, self._S)):
|
||||||
|
# something's changed. recompute EVERYTHING
|
||||||
|
|
||||||
|
# psi1
|
||||||
|
self._psi1_denom = S[:, None, :] * self.inv_lengthscale2 + 1.
|
||||||
|
self._psi1_dist = Z[None, :, :] - mu[:, None, :]
|
||||||
|
self._psi1_dist_sq = (np.square(self._psi1_dist) * self.inv_lengthscale2) / self._psi1_denom
|
||||||
|
self._psi1_exponent = -0.5 * np.sum(self._psi1_dist_sq + np.log(self._psi1_denom), -1)
|
||||||
|
self._psi1 = self.variance * np.exp(self._psi1_exponent)
|
||||||
|
|
||||||
|
# psi2
|
||||||
|
self._psi2_denom = 2.*S[:, None, None, :] * self.inv_lengthscale2 + 1. # N,M,M,Q
|
||||||
|
self._psi2_mudist, self._psi2_mudist_sq, self._psi2_exponent, _ = self.weave_psi2(mu, self._psi2_Zhat)
|
||||||
|
# self._psi2_mudist = mu[:,None,None,:]-self._psi2_Zhat #N,M,M,Q
|
||||||
|
# self._psi2_mudist_sq = np.square(self._psi2_mudist)/(self.lengthscale2*self._psi2_denom)
|
||||||
|
# self._psi2_exponent = np.sum(-self._psi2_Zdist_sq -self._psi2_mudist_sq -0.5*np.log(self._psi2_denom),-1) #N,M,M,Q
|
||||||
|
self._psi2 = np.square(self.variance) * np.exp(self._psi2_exponent) # N,M,M,Q
|
||||||
|
|
||||||
|
# store matrices for caching
|
||||||
|
self._Z, self._mu, self._S = Z, mu, S
|
||||||
|
|
||||||
|
def weave_psi2(self, mu, Zhat):
|
||||||
|
N, input_dim = mu.shape
|
||||||
|
num_inducing = Zhat.shape[0]
|
||||||
|
|
||||||
|
mudist = np.empty((N, num_inducing, num_inducing, input_dim))
|
||||||
|
mudist_sq = np.empty((N, num_inducing, num_inducing, input_dim))
|
||||||
|
psi2_exponent = np.zeros((N, num_inducing, num_inducing))
|
||||||
|
psi2 = np.empty((N, num_inducing, num_inducing))
|
||||||
|
|
||||||
|
psi2_Zdist_sq = self._psi2_Zdist_sq
|
||||||
|
_psi2_denom = self._psi2_denom.squeeze().reshape(N, self.input_dim)
|
||||||
|
half_log_psi2_denom = 0.5 * np.log(self._psi2_denom).squeeze().reshape(N, self.input_dim)
|
||||||
|
variance_sq = float(np.square(self.variance))
|
||||||
|
if self.ARD:
|
||||||
|
inv_lengthscale2 = self.inv_lengthscale2
|
||||||
|
else:
|
||||||
|
inv_lengthscale2 = np.ones(input_dim) * self.inv_lengthscale2
|
||||||
|
code = """
|
||||||
|
double tmp;
|
||||||
|
|
||||||
|
#pragma omp parallel for private(tmp)
|
||||||
|
for (int n=0; n<N; n++){
|
||||||
|
for (int m=0; m<num_inducing; m++){
|
||||||
|
for (int mm=0; mm<(m+1); mm++){
|
||||||
|
for (int q=0; q<input_dim; q++){
|
||||||
|
//compute mudist
|
||||||
|
tmp = mu(n,q) - Zhat(m,mm,q);
|
||||||
|
mudist(n,m,mm,q) = tmp;
|
||||||
|
mudist(n,mm,m,q) = tmp;
|
||||||
|
|
||||||
|
//now mudist_sq
|
||||||
|
tmp = tmp*tmp*inv_lengthscale2(q)/_psi2_denom(n,q);
|
||||||
|
mudist_sq(n,m,mm,q) = tmp;
|
||||||
|
mudist_sq(n,mm,m,q) = tmp;
|
||||||
|
|
||||||
|
//now psi2_exponent
|
||||||
|
tmp = -psi2_Zdist_sq(m,mm,q) - tmp - half_log_psi2_denom(n,q);
|
||||||
|
psi2_exponent(n,mm,m) += tmp;
|
||||||
|
if (m !=mm){
|
||||||
|
psi2_exponent(n,m,mm) += tmp;
|
||||||
|
}
|
||||||
|
//psi2 would be computed like this, but np is faster
|
||||||
|
//tmp = variance_sq*exp(psi2_exponent(n,m,mm));
|
||||||
|
//psi2(n,m,mm) = tmp;
|
||||||
|
//psi2(n,mm,m) = tmp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
support_code = """
|
||||||
|
#include <omp.h>
|
||||||
|
#include <math.h>
|
||||||
|
"""
|
||||||
|
weave.inline(code, support_code=support_code, libraries=['gomp'],
|
||||||
|
arg_names=['N', 'num_inducing', 'input_dim', 'mu', 'Zhat', 'mudist_sq', 'mudist', 'inv_lengthscale2', '_psi2_denom', 'psi2_Zdist_sq', 'psi2_exponent', 'half_log_psi2_denom', 'psi2', 'variance_sq'],
|
||||||
|
type_converters=weave.converters.blitz, **self.weave_options)
|
||||||
|
|
||||||
|
return mudist, mudist_sq, psi2_exponent, psi2
|
||||||
|
|
@ -44,17 +44,17 @@ class White(Kernpart):
|
||||||
def dKdiag_dtheta(self,dL_dKdiag,X,target):
|
def dKdiag_dtheta(self,dL_dKdiag,X,target):
|
||||||
target += np.sum(dL_dKdiag)
|
target += np.sum(dL_dKdiag)
|
||||||
|
|
||||||
def dK_dX(self,dL_dK,X,X2,target):
|
def dK_dX(self,dL_dK,X,X2,target):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def dKdiag_dX(self,dL_dKdiag,X,target):
|
def dKdiag_dX(self,dL_dKdiag,X,target):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def psi0(self,Z,mu,S,target):
|
def psi0(self,Z,mu,S,target):
|
||||||
target += self.variance
|
pass # target += self.variance
|
||||||
|
|
||||||
def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target):
|
def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target):
|
||||||
target += dL_dpsi0.sum()
|
pass # target += dL_dpsi0.sum()
|
||||||
|
|
||||||
def dpsi0_dmuS(self,dL_dpsi0,Z,mu,S,target_mu,target_S):
|
def dpsi0_dmuS(self,dL_dpsi0,Z,mu,S,target_mu,target_S):
|
||||||
pass
|
pass
|
||||||
|
|
|
||||||
7
GPy/mappings/__init__.py
Normal file
7
GPy/mappings/__init__.py
Normal file
|
|
@ -0,0 +1,7 @@
|
||||||
|
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||||
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
from kernel import Kernel
|
||||||
|
from linear import Linear
|
||||||
|
from mlp import MLP
|
||||||
|
#from rbf import RBF
|
||||||
60
GPy/mappings/kernel.py
Normal file
60
GPy/mappings/kernel.py
Normal file
|
|
@ -0,0 +1,60 @@
|
||||||
|
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||||
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from ..core.mapping import Mapping
|
||||||
|
import GPy
|
||||||
|
|
||||||
|
class Kernel(Mapping):
|
||||||
|
"""
|
||||||
|
Mapping based on a kernel/covariance function.
|
||||||
|
|
||||||
|
.. math::
|
||||||
|
|
||||||
|
f(\mathbf{x}*) = \mathbf{A}\mathbf{k}(\mathbf{X}, \mathbf{x}^*) + \mathbf{b}
|
||||||
|
|
||||||
|
:param X: input observations containing :math:`\mathbf{X}`
|
||||||
|
:type X: ndarray
|
||||||
|
:param output_dim: dimension of output.
|
||||||
|
:type output_dim: int
|
||||||
|
:param kernel: a GPy kernel, defaults to GPy.kern.rbf
|
||||||
|
:type kernel: GPy.kern.kern
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, X, output_dim=1, kernel=None):
|
||||||
|
Mapping.__init__(self, input_dim=X.shape[1], output_dim=output_dim)
|
||||||
|
if kernel is None:
|
||||||
|
kernel = GPy.kern.rbf(self.input_dim)
|
||||||
|
self.kern = kernel
|
||||||
|
self.X = X
|
||||||
|
self.num_data = X.shape[0]
|
||||||
|
self.num_params = self.output_dim*(self.num_data + 1)
|
||||||
|
self.A = np.array((self.num_data, self.output_dim))
|
||||||
|
self.bias = np.array(self.output_dim)
|
||||||
|
self.randomize()
|
||||||
|
self.name = 'kernel'
|
||||||
|
def _get_param_names(self):
|
||||||
|
return sum([['A_%i_%i' % (n, d) for d in range(self.output_dim)] for n in range(self.num_data)], []) + ['bias_%i' % d for d in range(self.output_dim)]
|
||||||
|
|
||||||
|
def _get_params(self):
|
||||||
|
return np.hstack((self.A.flatten(), self.bias))
|
||||||
|
|
||||||
|
def _set_params(self, x):
|
||||||
|
self.A = x[:self.num_data * self.output_dim].reshape(self.num_data, self.output_dim).copy()
|
||||||
|
self.bias = x[self.num_data*self.output_dim:].copy()
|
||||||
|
|
||||||
|
def randomize(self):
|
||||||
|
self.A = np.random.randn(self.num_data, self.output_dim)/np.sqrt(self.num_data+1)
|
||||||
|
self.bias = np.random.randn(self.output_dim)/np.sqrt(self.num_data+1)
|
||||||
|
|
||||||
|
def f(self, X):
|
||||||
|
return np.dot(self.kern.K(X, self.X),self.A) + self.bias
|
||||||
|
|
||||||
|
def df_dtheta(self, dL_df, X):
|
||||||
|
self._df_dA = (dL_df[:, :, None]*self.kern.K(X, self.X)[:, None, :]).sum(0).T
|
||||||
|
self._df_dbias = (dL_df.sum(0))
|
||||||
|
return np.hstack((self._df_dA.flatten(), self._df_dbias))
|
||||||
|
|
||||||
|
def df_dX(self, dL_df, X):
|
||||||
|
return self.kern.dK_dX((dL_df[:, None, :]*self.A[None, :, :]).sum(2), X, self.X)
|
||||||
53
GPy/mappings/linear.py
Normal file
53
GPy/mappings/linear.py
Normal file
|
|
@ -0,0 +1,53 @@
|
||||||
|
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||||
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from ..core.mapping import Mapping
|
||||||
|
|
||||||
|
class Linear(Mapping):
|
||||||
|
"""
|
||||||
|
Mapping based on a linear model.
|
||||||
|
|
||||||
|
.. math::
|
||||||
|
|
||||||
|
f(\mathbf{x}*) = \mathbf{W}\mathbf{x}^* + \mathbf{b}
|
||||||
|
|
||||||
|
:param X: input observations
|
||||||
|
:type X: ndarray
|
||||||
|
:param output_dim: dimension of output.
|
||||||
|
:type output_dim: int
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, input_dim=1, output_dim=1):
|
||||||
|
self.name = 'linear'
|
||||||
|
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
|
||||||
|
self.num_params = self.output_dim*(self.input_dim + 1)
|
||||||
|
self.W = np.array((self.input_dim, self.output_dim))
|
||||||
|
self.bias = np.array(self.output_dim)
|
||||||
|
self.randomize()
|
||||||
|
|
||||||
|
def _get_param_names(self):
|
||||||
|
return sum([['W_%i_%i' % (n, d) for d in range(self.output_dim)] for n in range(self.input_dim)], []) + ['bias_%i' % d for d in range(self.output_dim)]
|
||||||
|
|
||||||
|
def _get_params(self):
|
||||||
|
return np.hstack((self.W.flatten(), self.bias))
|
||||||
|
|
||||||
|
def _set_params(self, x):
|
||||||
|
self.W = x[:self.input_dim * self.output_dim].reshape(self.input_dim, self.output_dim).copy()
|
||||||
|
self.bias = x[self.input_dim*self.output_dim:].copy()
|
||||||
|
def randomize(self):
|
||||||
|
self.W = np.random.randn(self.input_dim, self.output_dim)/np.sqrt(self.input_dim + 1)
|
||||||
|
self.bias = np.random.randn(self.output_dim)/np.sqrt(self.input_dim + 1)
|
||||||
|
|
||||||
|
def f(self, X):
|
||||||
|
return np.dot(X,self.W) + self.bias
|
||||||
|
|
||||||
|
def df_dtheta(self, dL_df, X):
|
||||||
|
self._df_dW = (dL_df[:, :, None]*X[:, None, :]).sum(0).T
|
||||||
|
self._df_dbias = (dL_df.sum(0))
|
||||||
|
return np.hstack((self._df_dW.flatten(), self._df_dbias))
|
||||||
|
|
||||||
|
def df_dX(self, dL_df, X):
|
||||||
|
return (dL_df[:, None, :]*self.W[None, :, :]).sum(2)
|
||||||
|
|
||||||
127
GPy/mappings/mlp.py
Normal file
127
GPy/mappings/mlp.py
Normal file
|
|
@ -0,0 +1,127 @@
|
||||||
|
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||||
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from ..core.mapping import Mapping
|
||||||
|
|
||||||
|
class MLP(Mapping):
|
||||||
|
"""
|
||||||
|
Mapping based on a multi-layer perceptron neural network model.
|
||||||
|
|
||||||
|
.. math::
|
||||||
|
|
||||||
|
f(\mathbf{x}*) = \mathbf{W}^0\boldsymbol{\phi}(\mathbf{W}^1\mathbf{x}+\mathb{b}^1)^* + \mathbf{b}^0
|
||||||
|
|
||||||
|
where
|
||||||
|
..math::
|
||||||
|
\phi(\cdot) = \text{tanh}(\cdot)
|
||||||
|
|
||||||
|
:param X: input observations
|
||||||
|
:type X: ndarray
|
||||||
|
:param output_dim: dimension of output.
|
||||||
|
:type output_dim: int
|
||||||
|
:param hidden_dim: dimension of hidden layer. If it is an int, there is one hidden layer of the given dimension. If it is a list of ints there are as manny hidden layers as the length of the list, each with the given number of hidden nodes in it.
|
||||||
|
:type hidden_dim: int or list of ints.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, input_dim=1, output_dim=1, hidden_dim=3):
|
||||||
|
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
|
||||||
|
self.name = 'mlp'
|
||||||
|
if isinstance(hidden_dim, int):
|
||||||
|
hidden_dim = [hidden_dim]
|
||||||
|
self.hidden_dim = hidden_dim
|
||||||
|
self.activation = [None]*len(self.hidden_dim)
|
||||||
|
self.W = []
|
||||||
|
self._dL_dW = []
|
||||||
|
self.bias = []
|
||||||
|
self._dL_dbias = []
|
||||||
|
self.W.append(np.zeros((self.input_dim, self.hidden_dim[0])))
|
||||||
|
self._dL_dW.append(np.zeros((self.input_dim, self.hidden_dim[0])))
|
||||||
|
self.bias.append(np.zeros(self.hidden_dim[0]))
|
||||||
|
self._dL_dbias.append(np.zeros(self.hidden_dim[0]))
|
||||||
|
self.num_params = self.hidden_dim[0]*(self.input_dim+1)
|
||||||
|
for h1, h0 in zip(hidden_dim[1:], hidden_dim[0:-1]):
|
||||||
|
self.W.append(np.zeros((h0, h1)))
|
||||||
|
self._dL_dW.append(np.zeros((h0, h1)))
|
||||||
|
self.bias.append(np.zeros(h1))
|
||||||
|
self._dL_dbias.append(np.zeros(h1))
|
||||||
|
self.num_params += h1*(h0+1)
|
||||||
|
self.W.append(np.zeros((self.hidden_dim[-1], self.output_dim)))
|
||||||
|
self._dL_dW.append(np.zeros((self.hidden_dim[-1], self.output_dim)))
|
||||||
|
self.bias.append(np.zeros(self.output_dim))
|
||||||
|
self._dL_dbias.append(np.zeros(self.output_dim))
|
||||||
|
self.num_params += self.output_dim*(self.hidden_dim[-1]+1)
|
||||||
|
self.randomize()
|
||||||
|
|
||||||
|
def _get_param_names(self):
|
||||||
|
return sum([['W%i_%i_%i' % (i, n, d) for n in range(self.W[i].shape[0]) for d in range(self.W[i].shape[1])] + ['bias%i_%i' % (i, d) for d in range(self.W[i].shape[1])] for i in range(len(self.W))], [])
|
||||||
|
|
||||||
|
def _get_params(self):
|
||||||
|
param = np.array([])
|
||||||
|
for W, bias in zip(self.W, self.bias):
|
||||||
|
param = np.hstack((param, W.flatten(), bias))
|
||||||
|
return param
|
||||||
|
|
||||||
|
def _set_params(self, x):
|
||||||
|
start = 0
|
||||||
|
for W, bias in zip(self.W, self.bias):
|
||||||
|
end = W.shape[0]*W.shape[1]+start
|
||||||
|
W[:] = x[start:end].reshape(W.shape[0], W.shape[1]).copy()
|
||||||
|
start = end
|
||||||
|
end = W.shape[1]+end
|
||||||
|
bias[:] = x[start:end].copy()
|
||||||
|
start = end
|
||||||
|
|
||||||
|
def randomize(self):
|
||||||
|
for W, bias in zip(self.W, self.bias):
|
||||||
|
W[:] = np.random.randn(W.shape[0], W.shape[1])/np.sqrt(W.shape[0]+1)
|
||||||
|
bias[:] = np.random.randn(W.shape[1])/np.sqrt(W.shape[0]+1)
|
||||||
|
|
||||||
|
def f(self, X):
|
||||||
|
self._f_computations(X)
|
||||||
|
return np.dot(np.tanh(self.activation[-1]), self.W[-1]) + self.bias[-1]
|
||||||
|
|
||||||
|
def _f_computations(self, X):
|
||||||
|
W = self.W[0]
|
||||||
|
bias = self.bias[0]
|
||||||
|
self.activation[0] = np.dot(X,W) + bias
|
||||||
|
for W, bias, index in zip(self.W[1:-1], self.bias[1:-1], range(1, len(self.activation))):
|
||||||
|
self.activation[index] = np.dot(np.tanh(self.activation[index-1]), W)+bias
|
||||||
|
|
||||||
|
def df_dtheta(self, dL_df, X):
|
||||||
|
self._df_computations(dL_df, X)
|
||||||
|
g = np.array([])
|
||||||
|
for gW, gbias in zip(self._dL_dW, self._dL_dbias):
|
||||||
|
g = np.hstack((g, gW.flatten(), gbias))
|
||||||
|
return g
|
||||||
|
|
||||||
|
def _df_computations(self, dL_df, X):
|
||||||
|
self._f_computations(X)
|
||||||
|
a0 = self.activation[-1]
|
||||||
|
W = self.W[-1]
|
||||||
|
self._dL_dW[-1] = (dL_df[:, :, None]*np.tanh(a0[:, None, :])).sum(0).T
|
||||||
|
dL_dta=(dL_df[:, None, :]*W[None, :, :]).sum(2)
|
||||||
|
self._dL_dbias[-1] = (dL_df.sum(0))
|
||||||
|
for dL_dW, dL_dbias, W, bias, a0, a1 in zip(self._dL_dW[-2:0:-1],
|
||||||
|
self._dL_dbias[-2:0:-1],
|
||||||
|
self.W[-2:0:-1],
|
||||||
|
self.bias[-2:0:-1],
|
||||||
|
self.activation[-2::-1],
|
||||||
|
self.activation[-1:0:-1]):
|
||||||
|
ta = np.tanh(a1)
|
||||||
|
dL_da = dL_dta*(1-ta*ta)
|
||||||
|
dL_dW[:] = (dL_da[:, :, None]*np.tanh(a0[:, None, :])).sum(0).T
|
||||||
|
dL_dbias[:] = (dL_da.sum(0))
|
||||||
|
dL_dta = (dL_da[:, None, :]*W[None, :, :]).sum(2)
|
||||||
|
ta = np.tanh(self.activation[0])
|
||||||
|
dL_da = dL_dta*(1-ta*ta)
|
||||||
|
W = self.W[0]
|
||||||
|
self._dL_dW[0] = (dL_da[:, :, None]*X[:, None, :]).sum(0).T
|
||||||
|
self._dL_dbias[0] = (dL_da.sum(0))
|
||||||
|
self._dL_dX = (dL_da[:, None, :]*W[None, :, :]).sum(2)
|
||||||
|
|
||||||
|
|
||||||
|
def df_dX(self, dL_df, X):
|
||||||
|
self._df_computations(dL_df, X)
|
||||||
|
return self._dL_dX
|
||||||
|
|
||||||
|
|
@ -8,6 +8,9 @@ from svigp_regression import SVIGPRegression
|
||||||
from sparse_gp_classification import SparseGPClassification
|
from sparse_gp_classification import SparseGPClassification
|
||||||
from fitc_classification import FITCClassification
|
from fitc_classification import FITCClassification
|
||||||
from gplvm import GPLVM
|
from gplvm import GPLVM
|
||||||
|
from bcgplvm import BCGPLVM
|
||||||
|
from sparse_gplvm import SparseGPLVM
|
||||||
from warped_gp import WarpedGP
|
from warped_gp import WarpedGP
|
||||||
from bayesian_gplvm import BayesianGPLVM
|
from bayesian_gplvm import BayesianGPLVM
|
||||||
from mrd import MRD
|
from mrd import MRD
|
||||||
|
from gradient_checker import GradientChecker
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,8 @@ from matplotlib.colors import colorConverter
|
||||||
from GPy.inference.optimization import SCG
|
from GPy.inference.optimization import SCG
|
||||||
from GPy.util import plot_latent
|
from GPy.util import plot_latent
|
||||||
from GPy.models.gplvm import GPLVM
|
from GPy.models.gplvm import GPLVM
|
||||||
|
from GPy.util.plot_latent import most_significant_input_dimensions
|
||||||
|
from matplotlib import pyplot
|
||||||
|
|
||||||
class BayesianGPLVM(SparseGP, GPLVM):
|
class BayesianGPLVM(SparseGP, GPLVM):
|
||||||
"""
|
"""
|
||||||
|
|
@ -24,8 +26,7 @@ class BayesianGPLVM(SparseGP, GPLVM):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, likelihood_or_Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
|
def __init__(self, likelihood_or_Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
|
||||||
Z=None, kernel=None, oldpsave=10, _debug=False,
|
Z=None, kernel=None, **kwargs):
|
||||||
**kwargs):
|
|
||||||
if type(likelihood_or_Y) is np.ndarray:
|
if type(likelihood_or_Y) is np.ndarray:
|
||||||
likelihood = Gaussian(likelihood_or_Y)
|
likelihood = Gaussian(likelihood_or_Y)
|
||||||
else:
|
else:
|
||||||
|
|
@ -43,40 +44,31 @@ class BayesianGPLVM(SparseGP, GPLVM):
|
||||||
assert Z.shape[1] == X.shape[1]
|
assert Z.shape[1] == X.shape[1]
|
||||||
|
|
||||||
if kernel is None:
|
if kernel is None:
|
||||||
kernel = kern.rbf(input_dim) + kern.white(input_dim)
|
kernel = kern.rbf(input_dim) # + kern.white(input_dim)
|
||||||
|
|
||||||
self.oldpsave = oldpsave
|
|
||||||
self._oldps = []
|
|
||||||
self._debug = _debug
|
|
||||||
|
|
||||||
if self._debug:
|
|
||||||
self.f_call = 0
|
|
||||||
self._count = itertools.count()
|
|
||||||
self._savedklll = []
|
|
||||||
self._savedparams = []
|
|
||||||
self._savedgradients = []
|
|
||||||
self._savederrors = []
|
|
||||||
self._savedpsiKmm = []
|
|
||||||
self._savedABCD = []
|
|
||||||
|
|
||||||
SparseGP.__init__(self, X, likelihood, kernel, Z=Z, X_variance=X_variance, **kwargs)
|
SparseGP.__init__(self, X, likelihood, kernel, Z=Z, X_variance=X_variance, **kwargs)
|
||||||
self.ensure_default_constraints()
|
self.ensure_default_constraints()
|
||||||
|
|
||||||
@property
|
def getstate(self):
|
||||||
def oldps(self):
|
"""
|
||||||
return self._oldps
|
Get the current state of the class,
|
||||||
@oldps.setter
|
here just all the indices, rest can get recomputed
|
||||||
def oldps(self, p):
|
"""
|
||||||
if len(self._oldps) == (self.oldpsave + 1):
|
return SparseGP.getstate(self) + [self.init]
|
||||||
self._oldps.pop()
|
|
||||||
# if len(self._oldps) == 0 or not np.any([np.any(np.abs(p - op) > 1e-5) for op in self._oldps]):
|
def setstate(self, state):
|
||||||
self._oldps.insert(0, p.copy())
|
self._const_jitter = None
|
||||||
|
self.init = state.pop()
|
||||||
|
SparseGP.setstate(self, state)
|
||||||
|
|
||||||
def _get_param_names(self):
|
def _get_param_names(self):
|
||||||
X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
|
X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
|
||||||
S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
|
S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
|
||||||
return (X_names + S_names + SparseGP._get_param_names(self))
|
return (X_names + S_names + SparseGP._get_param_names(self))
|
||||||
|
|
||||||
|
def _get_print_names(self):
|
||||||
|
return SparseGP._get_print_names(self)
|
||||||
|
|
||||||
def _get_params(self):
|
def _get_params(self):
|
||||||
"""
|
"""
|
||||||
Horizontally stacks the parameters in order to present them to the optimizer.
|
Horizontally stacks the parameters in order to present them to the optimizer.
|
||||||
|
|
@ -90,24 +82,11 @@ class BayesianGPLVM(SparseGP, GPLVM):
|
||||||
x = np.hstack((self.X.flatten(), self.X_variance.flatten(), SparseGP._get_params(self)))
|
x = np.hstack((self.X.flatten(), self.X_variance.flatten(), SparseGP._get_params(self)))
|
||||||
return x
|
return x
|
||||||
|
|
||||||
def _clipped(self, x):
|
|
||||||
return x # np.clip(x, -1e300, 1e300)
|
|
||||||
|
|
||||||
def _set_params(self, x, save_old=True, save_count=0):
|
def _set_params(self, x, save_old=True, save_count=0):
|
||||||
# try:
|
N, input_dim = self.num_data, self.input_dim
|
||||||
x = self._clipped(x)
|
self.X = x[:self.X.size].reshape(N, input_dim).copy()
|
||||||
N, input_dim = self.num_data, self.input_dim
|
self.X_variance = x[(N * input_dim):(2 * N * input_dim)].reshape(N, input_dim).copy()
|
||||||
self.X = x[:self.X.size].reshape(N, input_dim).copy()
|
SparseGP._set_params(self, x[(2 * N * input_dim):])
|
||||||
self.X_variance = x[(N * input_dim):(2 * N * input_dim)].reshape(N, input_dim).copy()
|
|
||||||
SparseGP._set_params(self, x[(2 * N * input_dim):])
|
|
||||||
# self.oldps = x
|
|
||||||
# except (LinAlgError, FloatingPointError, ZeroDivisionError):
|
|
||||||
# print "\rWARNING: Caught LinAlgError, continueing without setting "
|
|
||||||
# if self._debug:
|
|
||||||
# self._savederrors.append(self.f_call)
|
|
||||||
# if save_count > 10:
|
|
||||||
# raise
|
|
||||||
# self._set_params(self.oldps[-1], save_old=False, save_count=save_count + 1)
|
|
||||||
|
|
||||||
def dKL_dmuS(self):
|
def dKL_dmuS(self):
|
||||||
dKL_dS = (1. - (1. / (self.X_variance))) * 0.5
|
dKL_dS = (1. - (1. / (self.X_variance))) * 0.5
|
||||||
|
|
@ -131,56 +110,19 @@ class BayesianGPLVM(SparseGP, GPLVM):
|
||||||
def log_likelihood(self):
|
def log_likelihood(self):
|
||||||
ll = SparseGP.log_likelihood(self)
|
ll = SparseGP.log_likelihood(self)
|
||||||
kl = self.KL_divergence()
|
kl = self.KL_divergence()
|
||||||
|
|
||||||
# if ll < -2E4:
|
|
||||||
# ll = -2E4 + np.random.randn()
|
|
||||||
# if kl > 5E4:
|
|
||||||
# kl = 5E4 + np.random.randn()
|
|
||||||
|
|
||||||
if self._debug:
|
|
||||||
self.f_call = self._count.next()
|
|
||||||
if self.f_call % 1 == 0:
|
|
||||||
self._savedklll.append([self.f_call, ll, kl])
|
|
||||||
self._savedparams.append([self.f_call, self._get_params()])
|
|
||||||
self._savedgradients.append([self.f_call, self._log_likelihood_gradients()])
|
|
||||||
self._savedpsiKmm.append([self.f_call, [self.Kmm, self.dL_dKmm]])
|
|
||||||
# sf2 = self.scale_factor ** 2
|
|
||||||
if self.likelihood.is_heteroscedastic:
|
|
||||||
A = -0.5 * self.num_data * self.input_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.V * self.likelihood.Y)
|
|
||||||
# B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self.A) * sf2)
|
|
||||||
B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self.A))
|
|
||||||
else:
|
|
||||||
A = -0.5 * self.num_data * self.input_dim * (np.log(2.*np.pi) + np.log(self.likelihood._variance)) - 0.5 * self.likelihood.precision * self.likelihood.trYYT
|
|
||||||
# B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self.A) * sf2)
|
|
||||||
B = -0.5 * self.input_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self.A))
|
|
||||||
C = -self.input_dim * (np.sum(np.log(np.diag(self.LB)))) # + 0.5 * self.num_inducing * np.log(sf2))
|
|
||||||
D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V))
|
|
||||||
self._savedABCD.append([self.f_call, A, B, C, D])
|
|
||||||
|
|
||||||
# print "\nkl:", kl, "ll:", ll
|
|
||||||
return ll - kl
|
return ll - kl
|
||||||
|
|
||||||
def _log_likelihood_gradients(self):
|
def _log_likelihood_gradients(self):
|
||||||
dKL_dmu, dKL_dS = self.dKL_dmuS()
|
dKL_dmu, dKL_dS = self.dKL_dmuS()
|
||||||
dL_dmu, dL_dS = self.dL_dmuS()
|
dL_dmu, dL_dS = self.dL_dmuS()
|
||||||
# TODO: find way to make faster
|
|
||||||
|
|
||||||
d_dmu = (dL_dmu - dKL_dmu).flatten()
|
d_dmu = (dL_dmu - dKL_dmu).flatten()
|
||||||
d_dS = (dL_dS - dKL_dS).flatten()
|
d_dS = (dL_dS - dKL_dS).flatten()
|
||||||
# TEST KL: ====================
|
|
||||||
# d_dmu = (dKL_dmu).flatten()
|
|
||||||
# d_dS = (dKL_dS).flatten()
|
|
||||||
# ========================
|
|
||||||
# TEST L: ====================
|
|
||||||
# d_dmu = (dL_dmu).flatten()
|
|
||||||
# d_dS = (dL_dS).flatten()
|
|
||||||
# ========================
|
|
||||||
self.dbound_dmuS = np.hstack((d_dmu, d_dS))
|
self.dbound_dmuS = np.hstack((d_dmu, d_dS))
|
||||||
self.dbound_dZtheta = SparseGP._log_likelihood_gradients(self)
|
self.dbound_dZtheta = SparseGP._log_likelihood_gradients(self)
|
||||||
return self._clipped(np.hstack((self.dbound_dmuS.flatten(), self.dbound_dZtheta)))
|
return np.hstack((self.dbound_dmuS.flatten(), self.dbound_dZtheta))
|
||||||
|
|
||||||
def plot_latent(self, *args, **kwargs):
|
def plot_latent(self, plot_inducing=True, *args, **kwargs):
|
||||||
return plot_latent.plot_latent_indices(self, *args, **kwargs)
|
return plot_latent.plot_latent(self, plot_inducing=plot_inducing, *args, **kwargs)
|
||||||
|
|
||||||
def do_test_latents(self, Y):
|
def do_test_latents(self, Y):
|
||||||
"""
|
"""
|
||||||
|
|
@ -212,6 +154,84 @@ class BayesianGPLVM(SparseGP, GPLVM):
|
||||||
|
|
||||||
return means, covars
|
return means, covars
|
||||||
|
|
||||||
|
def dmu_dX(self, Xnew):
|
||||||
|
"""
|
||||||
|
Calculate the gradient of the prediction at Xnew w.r.t Xnew.
|
||||||
|
"""
|
||||||
|
dmu_dX = np.zeros_like(Xnew)
|
||||||
|
for i in range(self.Z.shape[0]):
|
||||||
|
dmu_dX += self.kern.dK_dX(self.Cpsi1Vf[i:i + 1, :], Xnew, self.Z[i:i + 1, :])
|
||||||
|
return dmu_dX
|
||||||
|
|
||||||
|
def dmu_dXnew(self, Xnew):
|
||||||
|
"""
|
||||||
|
Individual gradient of prediction at Xnew w.r.t. each sample in Xnew
|
||||||
|
"""
|
||||||
|
dK_dX = np.zeros((Xnew.shape[0], self.num_inducing))
|
||||||
|
ones = np.ones((1, 1))
|
||||||
|
for i in range(self.Z.shape[0]):
|
||||||
|
dK_dX[:, i] = self.kern.dK_dX(ones, Xnew, self.Z[i:i + 1, :]).sum(-1)
|
||||||
|
return np.dot(dK_dX, self.Cpsi1Vf)
|
||||||
|
|
||||||
|
def plot_steepest_gradient_map(self, fignum=None, ax=None, which_indices=None, labels=None, data_labels=None, data_marker='o', data_s=40, resolution=20, aspect='auto', updates=False, ** kwargs):
|
||||||
|
input_1, input_2 = significant_dims = most_significant_input_dimensions(self, which_indices)
|
||||||
|
|
||||||
|
X = np.zeros((resolution ** 2, self.input_dim))
|
||||||
|
indices = np.r_[:X.shape[0]]
|
||||||
|
if labels is None:
|
||||||
|
labels = range(self.output_dim)
|
||||||
|
|
||||||
|
def plot_function(x):
|
||||||
|
X[:, significant_dims] = x
|
||||||
|
dmu_dX = self.dmu_dXnew(X)
|
||||||
|
argmax = np.argmax(dmu_dX, 1)
|
||||||
|
return dmu_dX[indices, argmax], np.array(labels)[argmax]
|
||||||
|
|
||||||
|
if ax is None:
|
||||||
|
fig = pyplot.figure(num=fignum)
|
||||||
|
ax = fig.add_subplot(111)
|
||||||
|
|
||||||
|
if data_labels is None:
|
||||||
|
data_labels = np.ones(self.num_data)
|
||||||
|
ulabels = []
|
||||||
|
for lab in data_labels:
|
||||||
|
if not lab in ulabels:
|
||||||
|
ulabels.append(lab)
|
||||||
|
marker = itertools.cycle(list(data_marker))
|
||||||
|
from GPy.util import Tango
|
||||||
|
for i, ul in enumerate(ulabels):
|
||||||
|
if type(ul) is np.string_:
|
||||||
|
this_label = ul
|
||||||
|
elif type(ul) is np.int64:
|
||||||
|
this_label = 'class %i' % ul
|
||||||
|
else:
|
||||||
|
this_label = 'class %i' % i
|
||||||
|
m = marker.next()
|
||||||
|
index = np.nonzero(data_labels == ul)[0]
|
||||||
|
x = self.X[index, input_1]
|
||||||
|
y = self.X[index, input_2]
|
||||||
|
ax.scatter(x, y, marker=m, s=data_s, color=Tango.nextMedium(), label=this_label)
|
||||||
|
|
||||||
|
ax.set_xlabel('latent dimension %i' % input_1)
|
||||||
|
ax.set_ylabel('latent dimension %i' % input_2)
|
||||||
|
|
||||||
|
from matplotlib.cm import get_cmap
|
||||||
|
from GPy.util.latent_space_visualizations.controllers.imshow_controller import ImAnnotateController
|
||||||
|
controller = ImAnnotateController(ax,
|
||||||
|
plot_function,
|
||||||
|
tuple(self.X.min(0)[:, significant_dims]) + tuple(self.X.max(0)[:, significant_dims]),
|
||||||
|
resolution=resolution,
|
||||||
|
aspect=aspect,
|
||||||
|
cmap=get_cmap('jet'),
|
||||||
|
**kwargs)
|
||||||
|
ax.legend()
|
||||||
|
ax.figure.tight_layout()
|
||||||
|
if updates:
|
||||||
|
pyplot.show()
|
||||||
|
clear = raw_input('Enter to continue')
|
||||||
|
if clear.lower() in 'yes' or clear == '':
|
||||||
|
controller.deactivate()
|
||||||
|
return controller.view
|
||||||
|
|
||||||
def plot_X_1d(self, fignum=None, ax=None, colors=None):
|
def plot_X_1d(self, fignum=None, ax=None, colors=None):
|
||||||
"""
|
"""
|
||||||
|
|
@ -256,275 +276,6 @@ class BayesianGPLVM(SparseGP, GPLVM):
|
||||||
fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
|
fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
|
||||||
return fig
|
return fig
|
||||||
|
|
||||||
def __getstate__(self):
|
|
||||||
return (self.likelihood, self.input_dim, self.X, self.X_variance,
|
|
||||||
self.init, self.num_inducing, self.Z, self.kern,
|
|
||||||
self.oldpsave, self._debug)
|
|
||||||
|
|
||||||
def __setstate__(self, state):
|
|
||||||
self.__init__(*state)
|
|
||||||
|
|
||||||
def _debug_filter_params(self, x):
|
|
||||||
start, end = 0, self.X.size,
|
|
||||||
X = x[start:end].reshape(self.num_data, self.input_dim)
|
|
||||||
start, end = end, end + self.X_variance.size
|
|
||||||
X_v = x[start:end].reshape(self.num_data, self.input_dim)
|
|
||||||
start, end = end, end + (self.num_inducing * self.input_dim)
|
|
||||||
Z = x[start:end].reshape(self.num_inducing, self.input_dim)
|
|
||||||
start, end = end, end + self.input_dim
|
|
||||||
theta = x[start:]
|
|
||||||
return X, X_v, Z, theta
|
|
||||||
|
|
||||||
|
|
||||||
def _debug_get_axis(self, figs):
|
|
||||||
if figs[-1].axes:
|
|
||||||
ax1 = figs[-1].axes[0]
|
|
||||||
ax1.cla()
|
|
||||||
else:
|
|
||||||
ax1 = figs[-1].add_subplot(111)
|
|
||||||
return ax1
|
|
||||||
|
|
||||||
def _debug_plot(self):
|
|
||||||
assert self._debug, "must enable _debug, to debug-plot"
|
|
||||||
import pylab
|
|
||||||
# from mpl_toolkits.mplot3d import Axes3D
|
|
||||||
figs = [pylab.figure('BGPLVM DEBUG', figsize=(12, 4))]
|
|
||||||
# fig.clf()
|
|
||||||
|
|
||||||
# log like
|
|
||||||
# splotshape = (6, 4)
|
|
||||||
# ax1 = pylab.subplot2grid(splotshape, (0, 0), 1, 4)
|
|
||||||
ax1 = self._debug_get_axis(figs)
|
|
||||||
ax1.text(.5, .5, "Optimization", alpha=.3, transform=ax1.transAxes,
|
|
||||||
ha='center', va='center')
|
|
||||||
kllls = np.array(self._savedklll)
|
|
||||||
LL, = ax1.plot(kllls[:, 0], kllls[:, 1] - kllls[:, 2], '-', label=r'$\log p(\mathbf{Y})$', mew=1.5)
|
|
||||||
KL, = ax1.plot(kllls[:, 0], kllls[:, 2], '-', label=r'$\mathcal{KL}(p||q)$', mew=1.5)
|
|
||||||
L, = ax1.plot(kllls[:, 0], kllls[:, 1], '-', label=r'$L$', mew=1.5) # \mathds{E}_{q(\mathbf{X})}[p(\mathbf{Y|X})\frac{p(\mathbf{X})}{q(\mathbf{X})}]
|
|
||||||
|
|
||||||
param_dict = dict(self._savedparams)
|
|
||||||
gradient_dict = dict(self._savedgradients)
|
|
||||||
# kmm_dict = dict(self._savedpsiKmm)
|
|
||||||
iters = np.array(param_dict.keys())
|
|
||||||
ABCD_dict = np.array(self._savedABCD)
|
|
||||||
self.showing = 0
|
|
||||||
|
|
||||||
# ax2 = pylab.subplot2grid(splotshape, (1, 0), 2, 4)
|
|
||||||
figs.append(pylab.figure("BGPLVM DEBUG X", figsize=(12, 4)))
|
|
||||||
ax2 = self._debug_get_axis(figs)
|
|
||||||
ax2.text(.5, .5, r"$\mathbf{X}$", alpha=.5, transform=ax2.transAxes,
|
|
||||||
ha='center', va='center')
|
|
||||||
figs[-1].canvas.draw()
|
|
||||||
figs[-1].tight_layout(rect=(0, 0, 1, .86))
|
|
||||||
# ax3 = pylab.subplot2grid(splotshape, (3, 0), 2, 4, sharex=ax2)
|
|
||||||
figs.append(pylab.figure("BGPLVM DEBUG S", figsize=(12, 4)))
|
|
||||||
ax3 = self._debug_get_axis(figs)
|
|
||||||
ax3.text(.5, .5, r"$\mathbf{S}$", alpha=.5, transform=ax3.transAxes,
|
|
||||||
ha='center', va='center')
|
|
||||||
figs[-1].canvas.draw()
|
|
||||||
figs[-1].tight_layout(rect=(0, 0, 1, .86))
|
|
||||||
# ax4 = pylab.subplot2grid(splotshape, (5, 0), 2, 2)
|
|
||||||
figs.append(pylab.figure("BGPLVM DEBUG Z", figsize=(6, 4)))
|
|
||||||
ax4 = self._debug_get_axis(figs)
|
|
||||||
ax4.text(.5, .5, r"$\mathbf{Z}$", alpha=.5, transform=ax4.transAxes,
|
|
||||||
ha='center', va='center')
|
|
||||||
figs[-1].canvas.draw()
|
|
||||||
figs[-1].tight_layout(rect=(0, 0, 1, .86))
|
|
||||||
# ax5 = pylab.subplot2grid(splotshape, (5, 2), 2, 2)
|
|
||||||
figs.append(pylab.figure("BGPLVM DEBUG theta", figsize=(6, 4)))
|
|
||||||
ax5 = self._debug_get_axis(figs)
|
|
||||||
ax5.text(.5, .5, r"${\theta}$", alpha=.5, transform=ax5.transAxes,
|
|
||||||
ha='center', va='center')
|
|
||||||
figs[-1].canvas.draw()
|
|
||||||
figs[-1].tight_layout(rect=(.15, 0, 1, .86))
|
|
||||||
# figs.append(pylab.figure("BGPLVM DEBUG Kmm", figsize=(12, 6)))
|
|
||||||
# fig = figs[-1]
|
|
||||||
# ax6 = fig.add_subplot(121)
|
|
||||||
# ax6.text(.5, .5, r"${\mathbf{K}_{mm}}$", color='magenta', alpha=.5, transform=ax6.transAxes,
|
|
||||||
# ha='center', va='center')
|
|
||||||
# ax7 = fig.add_subplot(122)
|
|
||||||
# ax7.text(.5, .5, r"${\frac{dL}{dK_{mm}}}$", color='magenta', alpha=.5, transform=ax7.transAxes,
|
|
||||||
# ha='center', va='center')
|
|
||||||
figs.append(pylab.figure("BGPLVM DEBUG Kmm", figsize=(12, 6)))
|
|
||||||
fig = figs[-1]
|
|
||||||
ax8 = fig.add_subplot(121)
|
|
||||||
ax8.text(.5, .5, r"${\mathbf{A,B,C,input_dim}}$", color='k', alpha=.5, transform=ax8.transAxes,
|
|
||||||
ha='center', va='center')
|
|
||||||
ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 1], label='A')
|
|
||||||
ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 2], label='B')
|
|
||||||
ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 3], label='C')
|
|
||||||
ax8.plot(ABCD_dict[:, 0], ABCD_dict[:, 4], label='input_dim')
|
|
||||||
ax8.legend()
|
|
||||||
figs[-1].canvas.draw()
|
|
||||||
figs[-1].tight_layout(rect=(.15, 0, 1, .86))
|
|
||||||
|
|
||||||
X, S, Z, theta = self._debug_filter_params(param_dict[self.showing])
|
|
||||||
Xg, Sg, Zg, thetag = self._debug_filter_params(gradient_dict[self.showing])
|
|
||||||
# Xg, Sg, Zg, thetag = -Xg, -Sg, -Zg, -thetag
|
|
||||||
|
|
||||||
quiver_units = 'xy'
|
|
||||||
quiver_scale = 1
|
|
||||||
quiver_scale_units = 'xy'
|
|
||||||
Xlatentplts = ax2.plot(X, ls="-", marker="x")
|
|
||||||
colors = colorConverter.to_rgba_array([p.get_color() for p in Xlatentplts], .4)
|
|
||||||
Ulatent = np.zeros_like(X)
|
|
||||||
xlatent = np.tile(np.arange(0, X.shape[0])[:, None], X.shape[1])
|
|
||||||
Xlatentgrads = ax2.quiver(xlatent, X, Ulatent, Xg, color=colors,
|
|
||||||
units=quiver_units, scale_units=quiver_scale_units,
|
|
||||||
scale=quiver_scale)
|
|
||||||
|
|
||||||
Slatentplts = ax3.plot(S, ls="-", marker="x")
|
|
||||||
Slatentgrads = ax3.quiver(xlatent, S, Ulatent, Sg, color=colors,
|
|
||||||
units=quiver_units, scale_units=quiver_scale_units,
|
|
||||||
scale=quiver_scale)
|
|
||||||
ax3.set_ylim(0, 1.)
|
|
||||||
|
|
||||||
xZ = np.tile(np.arange(0, Z.shape[0])[:, None], Z.shape[1])
|
|
||||||
UZ = np.zeros_like(Z)
|
|
||||||
Zplts = ax4.plot(Z, ls="-", marker="x")
|
|
||||||
Zgrads = ax4.quiver(xZ, Z, UZ, Zg, color=colors,
|
|
||||||
units=quiver_units, scale_units=quiver_scale_units,
|
|
||||||
scale=quiver_scale)
|
|
||||||
|
|
||||||
xtheta = np.arange(len(theta))
|
|
||||||
Utheta = np.zeros_like(theta)
|
|
||||||
thetaplts = ax5.bar(xtheta - .4, theta, color=colors)
|
|
||||||
thetagrads = ax5.quiver(xtheta, theta, Utheta, thetag, color=colors,
|
|
||||||
units=quiver_units, scale_units=quiver_scale_units,
|
|
||||||
scale=quiver_scale,
|
|
||||||
edgecolors=('k',), linewidths=[1])
|
|
||||||
pylab.setp(thetaplts, zorder=0)
|
|
||||||
pylab.setp(thetagrads, zorder=10)
|
|
||||||
ax5.set_xticks(np.arange(len(theta)))
|
|
||||||
ax5.set_xticklabels(self._get_param_names()[-len(theta):], rotation=17)
|
|
||||||
|
|
||||||
# imkmm = ax6.imshow(kmm_dict[self.showing][0])
|
|
||||||
# from mpl_toolkits.axes_grid1 import make_axes_locatable
|
|
||||||
# divider = make_axes_locatable(ax6)
|
|
||||||
# caxkmm = divider.append_axes("right", "5%", pad="1%")
|
|
||||||
# cbarkmm = pylab.colorbar(imkmm, cax=caxkmm)
|
|
||||||
#
|
|
||||||
# imkmmdl = ax7.imshow(kmm_dict[self.showing][1])
|
|
||||||
# divider = make_axes_locatable(ax7)
|
|
||||||
# caxkmmdl = divider.append_axes("right", "5%", pad="1%")
|
|
||||||
# cbarkmmdl = pylab.colorbar(imkmmdl, cax=caxkmmdl)
|
|
||||||
|
|
||||||
# input_dimleg = ax1.legend(Xlatentplts, [r"$input_dim_{}$".format(i + 1) for i in range(self.input_dim)],
|
|
||||||
# loc=3, ncol=self.input_dim, bbox_to_anchor=(0, 1.15, 1, 1.15),
|
|
||||||
# borderaxespad=0, mode="expand")
|
|
||||||
ax2.legend(Xlatentplts, [r"$input_dim_{}$".format(i + 1) for i in range(self.input_dim)],
|
|
||||||
loc=3, ncol=self.input_dim, bbox_to_anchor=(0, 1.1, 1, 1.1),
|
|
||||||
borderaxespad=0, mode="expand")
|
|
||||||
ax3.legend(Xlatentplts, [r"$input_dim_{}$".format(i + 1) for i in range(self.input_dim)],
|
|
||||||
loc=3, ncol=self.input_dim, bbox_to_anchor=(0, 1.1, 1, 1.1),
|
|
||||||
borderaxespad=0, mode="expand")
|
|
||||||
ax4.legend(Xlatentplts, [r"$input_dim_{}$".format(i + 1) for i in range(self.input_dim)],
|
|
||||||
loc=3, ncol=self.input_dim, bbox_to_anchor=(0, 1.1, 1, 1.1),
|
|
||||||
borderaxespad=0, mode="expand")
|
|
||||||
ax5.legend(Xlatentplts, [r"$input_dim_{}$".format(i + 1) for i in range(self.input_dim)],
|
|
||||||
loc=3, ncol=self.input_dim, bbox_to_anchor=(0, 1.1, 1, 1.1),
|
|
||||||
borderaxespad=0, mode="expand")
|
|
||||||
Lleg = ax1.legend()
|
|
||||||
Lleg.draggable()
|
|
||||||
# ax1.add_artist(input_dimleg)
|
|
||||||
|
|
||||||
indicatorKL, = ax1.plot(kllls[self.showing, 0], kllls[self.showing, 2], 'o', c=KL.get_color())
|
|
||||||
indicatorLL, = ax1.plot(kllls[self.showing, 0], kllls[self.showing, 1] - kllls[self.showing, 2], 'o', c=LL.get_color())
|
|
||||||
indicatorL, = ax1.plot(kllls[self.showing, 0], kllls[self.showing, 1], 'o', c=L.get_color())
|
|
||||||
# for err in self._savederrors:
|
|
||||||
# if err < kllls.shape[0]:
|
|
||||||
# ax1.scatter(kllls[err, 0], kllls[err, 2], s=50, marker=(5, 2), c=KL.get_color())
|
|
||||||
# ax1.scatter(kllls[err, 0], kllls[err, 1] - kllls[err, 2], s=50, marker=(5, 2), c=LL.get_color())
|
|
||||||
# ax1.scatter(kllls[err, 0], kllls[err, 1], s=50, marker=(5, 2), c=L.get_color())
|
|
||||||
|
|
||||||
# try:
|
|
||||||
# for f in figs:
|
|
||||||
# f.canvas.draw()
|
|
||||||
# f.tight_layout(box=(0, .15, 1, .9))
|
|
||||||
# # pylab.draw()
|
|
||||||
# # pylab.tight_layout(box=(0, .1, 1, .9))
|
|
||||||
# except:
|
|
||||||
# pass
|
|
||||||
|
|
||||||
# parameter changes
|
|
||||||
# ax2 = pylab.subplot2grid((4, 1), (1, 0), 3, 1, projection='3d')
|
|
||||||
button_options = [0, 0] # [0]: clicked -- [1]: dragged
|
|
||||||
|
|
||||||
def update_plots(event):
|
|
||||||
if button_options[0] and not button_options[1]:
|
|
||||||
# event.button, event.x, event.y, event.xdata, event.ydata)
|
|
||||||
tmp = np.abs(iters - event.xdata)
|
|
||||||
closest_hit = iters[tmp == tmp.min()][0]
|
|
||||||
|
|
||||||
if closest_hit != self.showing:
|
|
||||||
self.showing = closest_hit
|
|
||||||
# print closest_hit, iters, event.xdata
|
|
||||||
|
|
||||||
indicatorLL.set_data(self.showing, kllls[self.showing, 1] - kllls[self.showing, 2])
|
|
||||||
indicatorKL.set_data(self.showing, kllls[self.showing, 2])
|
|
||||||
indicatorL.set_data(self.showing, kllls[self.showing, 1])
|
|
||||||
|
|
||||||
X, S, Z, theta = self._debug_filter_params(param_dict[self.showing])
|
|
||||||
Xg, Sg, Zg, thetag = self._debug_filter_params(gradient_dict[self.showing])
|
|
||||||
# Xg, Sg, Zg, thetag = -Xg, -Sg, -Zg, -thetag
|
|
||||||
|
|
||||||
for i, Xlatent in enumerate(Xlatentplts):
|
|
||||||
Xlatent.set_ydata(X[:, i])
|
|
||||||
Xlatentgrads.set_offsets(np.array([xlatent.ravel(), X.ravel()]).T)
|
|
||||||
Xlatentgrads.set_UVC(Ulatent, Xg)
|
|
||||||
|
|
||||||
for i, Slatent in enumerate(Slatentplts):
|
|
||||||
Slatent.set_ydata(S[:, i])
|
|
||||||
Slatentgrads.set_offsets(np.array([xlatent.ravel(), S.ravel()]).T)
|
|
||||||
Slatentgrads.set_UVC(Ulatent, Sg)
|
|
||||||
|
|
||||||
for i, Zlatent in enumerate(Zplts):
|
|
||||||
Zlatent.set_ydata(Z[:, i])
|
|
||||||
Zgrads.set_offsets(np.array([xZ.ravel(), Z.ravel()]).T)
|
|
||||||
Zgrads.set_UVC(UZ, Zg)
|
|
||||||
|
|
||||||
for p, t in zip(thetaplts, theta):
|
|
||||||
p.set_height(t)
|
|
||||||
thetagrads.set_offsets(np.array([xtheta.ravel(), theta.ravel()]).T)
|
|
||||||
thetagrads.set_UVC(Utheta, thetag)
|
|
||||||
|
|
||||||
# imkmm.set_data(kmm_dict[self.showing][0])
|
|
||||||
# imkmm.autoscale()
|
|
||||||
# cbarkmm.update_normal(imkmm)
|
|
||||||
#
|
|
||||||
# imkmmdl.set_data(kmm_dict[self.showing][1])
|
|
||||||
# imkmmdl.autoscale()
|
|
||||||
# cbarkmmdl.update_normal(imkmmdl)
|
|
||||||
|
|
||||||
ax2.relim()
|
|
||||||
# ax3.relim()
|
|
||||||
ax4.relim()
|
|
||||||
ax5.relim()
|
|
||||||
ax2.autoscale()
|
|
||||||
# ax3.autoscale()
|
|
||||||
ax4.autoscale()
|
|
||||||
ax5.autoscale()
|
|
||||||
|
|
||||||
[fig.canvas.draw() for fig in figs]
|
|
||||||
button_options[0] = 0
|
|
||||||
button_options[1] = 0
|
|
||||||
|
|
||||||
def onclick(event):
|
|
||||||
if event.inaxes is ax1 and event.button == 1:
|
|
||||||
button_options[0] = 1
|
|
||||||
def motion(event):
|
|
||||||
if button_options[0]:
|
|
||||||
button_options[1] = 1
|
|
||||||
|
|
||||||
cidr = figs[0].canvas.mpl_connect('button_release_event', update_plots)
|
|
||||||
cidp = figs[0].canvas.mpl_connect('button_press_event', onclick)
|
|
||||||
cidd = figs[0].canvas.mpl_connect('motion_notify_event', motion)
|
|
||||||
|
|
||||||
return ax1, ax2, ax3, ax4, ax5 # , ax6, ax7
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def latent_cost_and_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
|
def latent_cost_and_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
|
||||||
"""
|
"""
|
||||||
objective function for fitting the latent variables for test points
|
objective function for fitting the latent variables for test points
|
||||||
|
|
|
||||||
50
GPy/models/bcgplvm.py
Normal file
50
GPy/models/bcgplvm.py
Normal file
|
|
@ -0,0 +1,50 @@
|
||||||
|
# ## Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||||
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pylab as pb
|
||||||
|
import sys, pdb
|
||||||
|
from ..core import GP
|
||||||
|
from ..models import GPLVM
|
||||||
|
from ..mappings import *
|
||||||
|
|
||||||
|
|
||||||
|
class BCGPLVM(GPLVM):
|
||||||
|
"""
|
||||||
|
Back constrained Gaussian Process Latent Variable Model
|
||||||
|
|
||||||
|
:param Y: observed data
|
||||||
|
:type Y: np.ndarray
|
||||||
|
:param input_dim: latent dimensionality
|
||||||
|
:type input_dim: int
|
||||||
|
:param init: initialisation method for the latent space
|
||||||
|
:type init: 'PCA'|'random'
|
||||||
|
:param mapping: mapping for back constraint
|
||||||
|
:type mapping: GPy.core.Mapping object
|
||||||
|
|
||||||
|
"""
|
||||||
|
def __init__(self, Y, input_dim, init='PCA', X=None, kernel=None, normalize_Y=False, mapping=None):
|
||||||
|
|
||||||
|
if mapping is None:
|
||||||
|
mapping = Kernel(X=Y, output_dim=input_dim)
|
||||||
|
self.mapping = mapping
|
||||||
|
GPLVM.__init__(self, Y, input_dim, init, X, kernel, normalize_Y)
|
||||||
|
self.X = self.mapping.f(self.likelihood.Y)
|
||||||
|
|
||||||
|
def _get_param_names(self):
|
||||||
|
return self.mapping._get_param_names() + GP._get_param_names(self)
|
||||||
|
|
||||||
|
def _get_params(self):
|
||||||
|
return np.hstack((self.mapping._get_params(), GP._get_params(self)))
|
||||||
|
|
||||||
|
def _set_params(self, x):
|
||||||
|
self.mapping._set_params(x[:self.mapping.num_params])
|
||||||
|
self.X = self.mapping.f(self.likelihood.Y)
|
||||||
|
GP._set_params(self, x[self.mapping.num_params:])
|
||||||
|
|
||||||
|
def _log_likelihood_gradients(self):
|
||||||
|
dL_df = 2.*self.kern.dK_dX(self.dL_dK, self.X)
|
||||||
|
dL_dtheta = self.mapping.df_dtheta(dL_df, self.likelihood.Y)
|
||||||
|
return np.hstack((dL_dtheta.flatten(), GP._log_likelihood_gradients(self)))
|
||||||
|
|
||||||
|
|
@ -25,11 +25,19 @@ class GPRegression(GP):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False):
|
def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False):
|
||||||
if kernel is None:
|
if kernel is None:
|
||||||
kernel = kern.rbf(X.shape[1])
|
kernel = kern.rbf(X.shape[1])
|
||||||
|
|
||||||
likelihood = likelihoods.Gaussian(Y,normalize=normalize_Y)
|
likelihood = likelihoods.Gaussian(Y, normalize=normalize_Y)
|
||||||
|
|
||||||
GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
|
GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
|
||||||
self.ensure_default_constraints()
|
self.ensure_default_constraints()
|
||||||
|
|
||||||
|
def getstate(self):
|
||||||
|
return GP.getstate(self)
|
||||||
|
|
||||||
|
def setstate(self, state):
|
||||||
|
return GP.setstate(self, state)
|
||||||
|
|
||||||
|
pass
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
### Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
# ## Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -8,6 +8,7 @@ import sys, pdb
|
||||||
from .. import kern
|
from .. import kern
|
||||||
from ..core import Model
|
from ..core import Model
|
||||||
from ..util.linalg import pdinv, PCA
|
from ..util.linalg import pdinv, PCA
|
||||||
|
from ..core.priors import Gaussian as Gaussian_prior
|
||||||
from ..core import GP
|
from ..core import GP
|
||||||
from ..likelihoods import Gaussian
|
from ..likelihoods import Gaussian
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
@ -26,42 +27,67 @@ class GPLVM(GP):
|
||||||
:type init: 'PCA'|'random'
|
:type init: 'PCA'|'random'
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, Y, input_dim, init='PCA', X = None, kernel=None, normalize_Y=False):
|
def __init__(self, Y, input_dim, init='PCA', X=None, kernel=None, normalize_Y=False):
|
||||||
if X is None:
|
if X is None:
|
||||||
X = self.initialise_latent(init, input_dim, Y)
|
X = self.initialise_latent(init, input_dim, Y)
|
||||||
if kernel is None:
|
if kernel is None:
|
||||||
kernel = kern.rbf(input_dim, ARD=input_dim>1) + kern.bias(input_dim, np.exp(-2)) + kern.white(input_dim, np.exp(-2))
|
kernel = kern.rbf(input_dim, ARD=input_dim > 1) + kern.bias(input_dim, np.exp(-2))
|
||||||
likelihood = Gaussian(Y, normalize=normalize_Y)
|
likelihood = Gaussian(Y, normalize=normalize_Y, variance=np.exp(-2.))
|
||||||
GP.__init__(self, X, likelihood, kernel, normalize_X=False)
|
GP.__init__(self, X, likelihood, kernel, normalize_X=False)
|
||||||
|
self.set_prior('.*X', Gaussian_prior(0, 1))
|
||||||
self.ensure_default_constraints()
|
self.ensure_default_constraints()
|
||||||
|
|
||||||
def initialise_latent(self, init, input_dim, Y):
|
def initialise_latent(self, init, input_dim, Y):
|
||||||
|
Xr = np.random.randn(Y.shape[0], input_dim)
|
||||||
if init == 'PCA':
|
if init == 'PCA':
|
||||||
return PCA(Y, input_dim)[0]
|
PC = PCA(Y, input_dim)[0]
|
||||||
else:
|
Xr[:PC.shape[0], :PC.shape[1]] = PC
|
||||||
return np.random.randn(Y.shape[0], input_dim)
|
return Xr
|
||||||
|
|
||||||
|
def getstate(self):
|
||||||
|
return GP.getstate(self)
|
||||||
|
|
||||||
|
def setstate(self, state):
|
||||||
|
GP.setstate(self, state)
|
||||||
|
|
||||||
def _get_param_names(self):
|
def _get_param_names(self):
|
||||||
return sum([['X_%i_%i'%(n,q) for q in range(self.input_dim)] for n in range(self.num_data)],[]) + GP._get_param_names(self)
|
return sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], []) + GP._get_param_names(self)
|
||||||
|
|
||||||
def _get_params(self):
|
def _get_params(self):
|
||||||
return np.hstack((self.X.flatten(), GP._get_params(self)))
|
return np.hstack((self.X.flatten(), GP._get_params(self)))
|
||||||
|
|
||||||
def _set_params(self,x):
|
def _set_params(self, x):
|
||||||
self.X = x[:self.num_data*self.input_dim].reshape(self.num_data,self.input_dim).copy()
|
self.X = x[:self.num_data * self.input_dim].reshape(self.num_data, self.input_dim).copy()
|
||||||
GP._set_params(self, x[self.X.size:])
|
GP._set_params(self, x[self.X.size:])
|
||||||
|
|
||||||
def _log_likelihood_gradients(self):
|
def _log_likelihood_gradients(self):
|
||||||
dL_dX = 2.*self.kern.dK_dX(self.dL_dK,self.X)
|
dL_dX = 2.*self.kern.dK_dX(self.dL_dK, self.X)
|
||||||
|
|
||||||
return np.hstack((dL_dX.flatten(),GP._log_likelihood_gradients(self)))
|
return np.hstack((dL_dX.flatten(), GP._log_likelihood_gradients(self)))
|
||||||
|
|
||||||
|
def jacobian(self,X):
|
||||||
|
target = np.zeros((X.shape[0],X.shape[1],self.output_dim))
|
||||||
|
for i in range(self.output_dim):
|
||||||
|
target[:,:,i]=self.kern.dK_dX(np.dot(self.Ki,self.likelihood.Y[:,i])[None, :],X,self.X)
|
||||||
|
return target
|
||||||
|
|
||||||
|
def magnification(self,X):
|
||||||
|
target=np.zeros(X.shape[0])
|
||||||
|
J = np.zeros((X.shape[0],X.shape[1],self.output_dim))
|
||||||
|
J=self.jacobian(X)
|
||||||
|
for i in range(X.shape[0]):
|
||||||
|
target[i]=np.sqrt(pb.det(np.dot(J[i,:,:],np.transpose(J[i,:,:]))))
|
||||||
|
return target
|
||||||
|
|
||||||
def plot(self):
|
def plot(self):
|
||||||
assert self.likelihood.Y.shape[1]==2
|
assert self.likelihood.Y.shape[1] == 2
|
||||||
pb.scatter(self.likelihood.Y[:,0],self.likelihood.Y[:,1],40,self.X[:,0].copy(),linewidth=0,cmap=pb.cm.jet)
|
pb.scatter(self.likelihood.Y[:, 0], self.likelihood.Y[:, 1], 40, self.X[:, 0].copy(), linewidth=0, cmap=pb.cm.jet)
|
||||||
Xnew = np.linspace(self.X.min(),self.X.max(),200)[:,None]
|
Xnew = np.linspace(self.X.min(), self.X.max(), 200)[:, None]
|
||||||
mu, var, upper, lower = self.predict(Xnew)
|
mu, var, upper, lower = self.predict(Xnew)
|
||||||
pb.plot(mu[:,0], mu[:,1],'k',linewidth=1.5)
|
pb.plot(mu[:, 0], mu[:, 1], 'k', linewidth=1.5)
|
||||||
|
|
||||||
def plot_latent(self, *args, **kwargs):
|
def plot_latent(self, *args, **kwargs):
|
||||||
return util.plot_latent.plot_latent(self, *args, **kwargs)
|
return util.plot_latent.plot_latent(self, *args, **kwargs)
|
||||||
|
|
||||||
|
def plot_magnification(self, *args, **kwargs):
|
||||||
|
return util.plot_latent.plot_magnification(self, *args, **kwargs)
|
||||||
|
|
|
||||||
114
GPy/models/gradient_checker.py
Normal file
114
GPy/models/gradient_checker.py
Normal file
|
|
@ -0,0 +1,114 @@
|
||||||
|
'''
|
||||||
|
Created on 17 Jul 2013
|
||||||
|
|
||||||
|
@author: maxz
|
||||||
|
'''
|
||||||
|
from GPy.core.model import Model
|
||||||
|
import itertools
|
||||||
|
import numpy
|
||||||
|
|
||||||
|
def get_shape(x):
|
||||||
|
if isinstance(x, numpy.ndarray):
|
||||||
|
return x.shape
|
||||||
|
return ()
|
||||||
|
|
||||||
|
def at_least_one_element(x):
|
||||||
|
if isinstance(x, (list, tuple)):
|
||||||
|
return x
|
||||||
|
return [x]
|
||||||
|
|
||||||
|
def flatten_if_needed(x):
|
||||||
|
return numpy.atleast_1d(x).flatten()
|
||||||
|
|
||||||
|
class GradientChecker(Model):
|
||||||
|
|
||||||
|
def __init__(self, f, df, x0, names=None, *args, **kwargs):
|
||||||
|
"""
|
||||||
|
:param f: Function to check gradient for
|
||||||
|
:param df: Gradient of function to check
|
||||||
|
:param x0:
|
||||||
|
Initial guess for inputs x (if it has a shape (a,b) this will be reflected in the parameter names).
|
||||||
|
Can be a list of arrays, if takes a list of arrays. This list will be passed
|
||||||
|
to f and df in the same order as given here.
|
||||||
|
If only one argument, make sure not to pass a list!!!
|
||||||
|
|
||||||
|
:type x0: [array-like] | array-like | float | int
|
||||||
|
:param names:
|
||||||
|
Names to print, when performing gradcheck. If a list was passed to x0
|
||||||
|
a list of names with the same length is expected.
|
||||||
|
:param args: Arguments passed as f(x, *args, **kwargs) and df(x, *args, **kwargs)
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
---------
|
||||||
|
from GPy.models import GradientChecker
|
||||||
|
N, M, Q = 10, 5, 3
|
||||||
|
|
||||||
|
Sinusoid:
|
||||||
|
|
||||||
|
X = numpy.random.rand(N, Q)
|
||||||
|
grad = GradientChecker(numpy.sin,numpy.cos,X,'x')
|
||||||
|
grad.checkgrad(verbose=1)
|
||||||
|
|
||||||
|
Using GPy:
|
||||||
|
|
||||||
|
X, Z = numpy.random.randn(N,Q), numpy.random.randn(M,Q)
|
||||||
|
kern = GPy.kern.linear(Q, ARD=True) + GPy.kern.rbf(Q, ARD=True)
|
||||||
|
grad = GradientChecker(kern.K,
|
||||||
|
lambda x: 2*kern.dK_dX(numpy.ones((1,1)), x),
|
||||||
|
x0 = X.copy(),
|
||||||
|
names='X')
|
||||||
|
grad.checkgrad(verbose=1)
|
||||||
|
grad.randomize()
|
||||||
|
grad.checkgrad(verbose=1)
|
||||||
|
"""
|
||||||
|
Model.__init__(self)
|
||||||
|
if isinstance(x0, (list, tuple)) and names is None:
|
||||||
|
self.shapes = [get_shape(xi) for xi in x0]
|
||||||
|
self.names = ['X{i}'.format(i=i) for i in range(len(x0))]
|
||||||
|
elif isinstance(x0, (list, tuple)) and names is not None:
|
||||||
|
self.shapes = [get_shape(xi) for xi in x0]
|
||||||
|
self.names = names
|
||||||
|
elif names is None:
|
||||||
|
self.names = ['X']
|
||||||
|
self.shapes = [get_shape(x0)]
|
||||||
|
else:
|
||||||
|
self.names = names
|
||||||
|
self.shapes = [get_shape(x0)]
|
||||||
|
for name, xi in zip(self.names, at_least_one_element(x0)):
|
||||||
|
self.__setattr__(name, xi)
|
||||||
|
# self._param_names = []
|
||||||
|
# for name, shape in zip(self.names, self.shapes):
|
||||||
|
# self._param_names.extend(map(lambda nameshape: ('_'.join(nameshape)).strip('_'), itertools.izip(itertools.repeat(name), itertools.imap(lambda t: '_'.join(map(str, t)), itertools.product(*map(lambda xi: range(xi), shape))))))
|
||||||
|
self.args = args
|
||||||
|
self.kwargs = kwargs
|
||||||
|
self.f = f
|
||||||
|
self.df = df
|
||||||
|
|
||||||
|
def _get_x(self):
|
||||||
|
if len(self.names) > 1:
|
||||||
|
return [self.__getattribute__(name) for name in self.names] + list(self.args)
|
||||||
|
return [self.__getattribute__(self.names[0])] + list(self.args)
|
||||||
|
|
||||||
|
def log_likelihood(self):
|
||||||
|
return float(numpy.sum(self.f(*self._get_x(), **self.kwargs)))
|
||||||
|
|
||||||
|
def _log_likelihood_gradients(self):
|
||||||
|
return numpy.atleast_1d(self.df(*self._get_x(), **self.kwargs)).flatten()
|
||||||
|
|
||||||
|
|
||||||
|
def _get_params(self):
|
||||||
|
return numpy.atleast_1d(numpy.hstack(map(lambda name: flatten_if_needed(self.__getattribute__(name)), self.names)))
|
||||||
|
|
||||||
|
|
||||||
|
def _set_params(self, x):
|
||||||
|
current_index = 0
|
||||||
|
for name, shape in zip(self.names, self.shapes):
|
||||||
|
current_size = numpy.prod(shape)
|
||||||
|
self.__setattr__(name, x[current_index:current_index + current_size].reshape(shape))
|
||||||
|
current_index += current_size
|
||||||
|
|
||||||
|
def _get_param_names(self):
|
||||||
|
_param_names = []
|
||||||
|
for name, shape in zip(self.names, self.shapes):
|
||||||
|
_param_names.extend(map(lambda nameshape: ('_'.join(nameshape)).strip('_'), itertools.izip(itertools.repeat(name), itertools.imap(lambda t: '_'.join(map(str, t)), itertools.product(*map(lambda xi: range(xi), shape))))))
|
||||||
|
return _param_names
|
||||||
|
|
@ -18,29 +18,25 @@ class MRD(Model):
|
||||||
All Ys in likelihood_list are in [N x Dn], where Dn can be different per Yn,
|
All Ys in likelihood_list are in [N x Dn], where Dn can be different per Yn,
|
||||||
N must be shared across datasets though.
|
N must be shared across datasets though.
|
||||||
|
|
||||||
:param likelihood_list...: likelihoods of observed datasets
|
:param likelihood_list: list of observed datasets (:py:class:`~GPy.likelihoods.gaussian.Gaussian` if not supplied directly)
|
||||||
:type likelihood_list: [GPy.likelihood] | [Y1..Yy]
|
:type likelihood_list: [:py:class:`~GPy.likelihoods.likelihood.likelihood` | :py:class:`ndarray`]
|
||||||
:param names: names for different gplvm models
|
:param names: names for different gplvm models
|
||||||
:type names: [str]
|
:type names: [str]
|
||||||
:param input_dim: latent dimensionality (will raise
|
:param input_dim: latent dimensionality
|
||||||
:type input_dim: int
|
:type input_dim: int
|
||||||
:param initx: initialisation method for the latent space
|
:param initx: initialisation method for the latent space :
|
||||||
:type initx: 'PCA'|'random'
|
|
||||||
|
* 'concat' - PCA on concatenation of all datasets
|
||||||
|
* 'single' - Concatenation of PCA on datasets, respectively
|
||||||
|
* 'random' - Random draw from a normal
|
||||||
|
|
||||||
|
:type initx: ['concat'|'single'|'random']
|
||||||
:param initz: initialisation method for inducing inputs
|
:param initz: initialisation method for inducing inputs
|
||||||
:type initz: 'permute'|'random'
|
:type initz: 'permute'|'random'
|
||||||
:param X:
|
:param X: Initial latent space
|
||||||
Initial latent space
|
:param X_variance: Initial latent space variance
|
||||||
:param X_variance:
|
:param Z: initial inducing inputs
|
||||||
Initial latent space variance
|
:param num_inducing: number of inducing inputs to use
|
||||||
:param init: [cooncat|single|random]
|
|
||||||
initialization method to use:
|
|
||||||
*concat: PCA on concatenated outputs
|
|
||||||
*single: PCA on each output
|
|
||||||
*random: random
|
|
||||||
:param num_inducing:
|
|
||||||
number of inducing inputs to use
|
|
||||||
:param Z:
|
|
||||||
initial inducing inputs
|
|
||||||
:param kernels: list of kernels or kernel shared for all BGPLVMS
|
:param kernels: list of kernels or kernel shared for all BGPLVMS
|
||||||
:type kernels: [GPy.kern.kern] | GPy.kern.kern | None (default)
|
:type kernels: [GPy.kern.kern] | GPy.kern.kern | None (default)
|
||||||
"""
|
"""
|
||||||
|
|
@ -48,7 +44,7 @@ class MRD(Model):
|
||||||
kernels=None, initx='PCA',
|
kernels=None, initx='PCA',
|
||||||
initz='permute', _debug=False, **kw):
|
initz='permute', _debug=False, **kw):
|
||||||
if names is None:
|
if names is None:
|
||||||
self.names = ["{}".format(i + 1) for i in range(len(likelihood_or_Y_list))]
|
self.names = ["{}".format(i) for i in range(len(likelihood_or_Y_list))]
|
||||||
|
|
||||||
# sort out the kernels
|
# sort out the kernels
|
||||||
if kernels is None:
|
if kernels is None:
|
||||||
|
|
@ -61,12 +57,14 @@ class MRD(Model):
|
||||||
assert not ('kernel' in kw), "pass kernels through `kernels` argument"
|
assert not ('kernel' in kw), "pass kernels through `kernels` argument"
|
||||||
|
|
||||||
self.input_dim = input_dim
|
self.input_dim = input_dim
|
||||||
self.num_inducing = num_inducing
|
|
||||||
self._debug = _debug
|
self._debug = _debug
|
||||||
|
self.num_inducing = num_inducing
|
||||||
|
|
||||||
self._init = True
|
self._init = True
|
||||||
X = self._init_X(initx, likelihood_or_Y_list)
|
X = self._init_X(initx, likelihood_or_Y_list)
|
||||||
Z = self._init_Z(initz, X)
|
Z = self._init_Z(initz, X)
|
||||||
|
self.num_inducing = Z.shape[0] # ensure M==N if M>N
|
||||||
|
|
||||||
self.bgplvms = [BayesianGPLVM(l, input_dim=input_dim, kernel=k, X=X, Z=Z, num_inducing=self.num_inducing, **kw) for l, k in zip(likelihood_or_Y_list, kernels)]
|
self.bgplvms = [BayesianGPLVM(l, input_dim=input_dim, kernel=k, X=X, Z=Z, num_inducing=self.num_inducing, **kw) for l, k in zip(likelihood_or_Y_list, kernels)]
|
||||||
del self._init
|
del self._init
|
||||||
|
|
||||||
|
|
@ -75,12 +73,36 @@ class MRD(Model):
|
||||||
self.nparams = nparams.cumsum()
|
self.nparams = nparams.cumsum()
|
||||||
|
|
||||||
self.num_data = self.gref.num_data
|
self.num_data = self.gref.num_data
|
||||||
|
|
||||||
self.NQ = self.num_data * self.input_dim
|
self.NQ = self.num_data * self.input_dim
|
||||||
self.MQ = self.num_inducing * self.input_dim
|
self.MQ = self.num_inducing * self.input_dim
|
||||||
|
|
||||||
Model.__init__(self)
|
Model.__init__(self)
|
||||||
self.ensure_default_constraints()
|
self.ensure_default_constraints()
|
||||||
|
|
||||||
|
def getstate(self):
|
||||||
|
return Model.getstate(self) + [self.names,
|
||||||
|
self.bgplvms,
|
||||||
|
self.gref,
|
||||||
|
self.nparams,
|
||||||
|
self.input_dim,
|
||||||
|
self.num_inducing,
|
||||||
|
self.num_data,
|
||||||
|
self.NQ,
|
||||||
|
self.MQ]
|
||||||
|
|
||||||
|
def setstate(self, state):
|
||||||
|
self.MQ = state.pop()
|
||||||
|
self.NQ = state.pop()
|
||||||
|
self.num_data = state.pop()
|
||||||
|
self.num_inducing = state.pop()
|
||||||
|
self.input_dim = state.pop()
|
||||||
|
self.nparams = state.pop()
|
||||||
|
self.gref = state.pop()
|
||||||
|
self.bgplvms = state.pop()
|
||||||
|
self.names = state.pop()
|
||||||
|
Model.setstate(self, state)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def X(self):
|
def X(self):
|
||||||
return self.gref.X
|
return self.gref.X
|
||||||
|
|
@ -141,17 +163,28 @@ class MRD(Model):
|
||||||
self._init_X(initx, self.likelihood_list)
|
self._init_X(initx, self.likelihood_list)
|
||||||
self._init_Z(initz, self.X)
|
self._init_Z(initz, self.X)
|
||||||
|
|
||||||
def _get_param_names(self):
|
def _get_latent_param_names(self):
|
||||||
# X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
|
|
||||||
# S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
|
|
||||||
n1 = self.gref._get_param_names()
|
n1 = self.gref._get_param_names()
|
||||||
n1var = n1[:self.NQ * 2 + self.MQ]
|
n1var = n1[:self.NQ * 2 + self.MQ]
|
||||||
|
return n1var
|
||||||
|
|
||||||
|
|
||||||
|
def _get_kernel_names(self):
|
||||||
map_names = lambda ns, name: map(lambda x: "{1}_{0}".format(*x),
|
map_names = lambda ns, name: map(lambda x: "{1}_{0}".format(*x),
|
||||||
itertools.izip(ns,
|
itertools.izip(ns,
|
||||||
itertools.repeat(name)))
|
itertools.repeat(name)))
|
||||||
return list(itertools.chain(n1var, *(map_names(\
|
kernel_names = (map_names(SparseGP._get_param_names(g)[self.MQ:], n) for g, n in zip(self.bgplvms, self.names))
|
||||||
SparseGP._get_param_names(g)[self.MQ:], n) \
|
return kernel_names
|
||||||
for g, n in zip(self.bgplvms, self.names))))
|
|
||||||
|
def _get_param_names(self):
|
||||||
|
# X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
|
||||||
|
# S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
|
||||||
|
n1var = self._get_latent_param_names()
|
||||||
|
kernel_names = self._get_kernel_names()
|
||||||
|
return list(itertools.chain(n1var, *kernel_names))
|
||||||
|
|
||||||
|
def _get_print_names(self):
|
||||||
|
return list(itertools.chain(*self._get_kernel_names()))
|
||||||
|
|
||||||
def _get_params(self):
|
def _get_params(self):
|
||||||
"""
|
"""
|
||||||
|
|
@ -255,17 +288,30 @@ class MRD(Model):
|
||||||
self.Z = Z
|
self.Z = Z
|
||||||
return Z
|
return Z
|
||||||
|
|
||||||
def _handle_plotting(self, fignum, axes, plotf):
|
def _handle_plotting(self, fignum, axes, plotf, sharex=False, sharey=False):
|
||||||
if axes is None:
|
if axes is None:
|
||||||
fig = pylab.figure(num=fignum, figsize=(4 * len(self.bgplvms), 3))
|
fig = pylab.figure(num=fignum)
|
||||||
|
sharex_ax = None
|
||||||
|
sharey_ax = None
|
||||||
for i, g in enumerate(self.bgplvms):
|
for i, g in enumerate(self.bgplvms):
|
||||||
|
try:
|
||||||
|
if sharex:
|
||||||
|
sharex_ax = ax # @UndefinedVariable
|
||||||
|
sharex = False # dont set twice
|
||||||
|
if sharey:
|
||||||
|
sharey_ax = ax # @UndefinedVariable
|
||||||
|
sharey = False # dont set twice
|
||||||
|
except:
|
||||||
|
pass
|
||||||
if axes is None:
|
if axes is None:
|
||||||
ax = fig.add_subplot(1, len(self.bgplvms), i + 1)
|
ax = fig.add_subplot(1, len(self.bgplvms), i + 1, sharex=sharex_ax, sharey=sharey_ax)
|
||||||
elif isinstance(axes, (tuple, list)):
|
elif isinstance(axes, (tuple, list)):
|
||||||
ax = axes[i]
|
ax = axes[i]
|
||||||
else:
|
else:
|
||||||
raise ValueError("Need one axes per latent dimension input_dim")
|
raise ValueError("Need one axes per latent dimension input_dim")
|
||||||
plotf(i, g, ax)
|
plotf(i, g, ax)
|
||||||
|
if sharey_ax is not None:
|
||||||
|
pylab.setp(ax.get_yticklabels(), visible=False)
|
||||||
pylab.draw()
|
pylab.draw()
|
||||||
if axes is None:
|
if axes is None:
|
||||||
fig.tight_layout()
|
fig.tight_layout()
|
||||||
|
|
@ -280,16 +326,29 @@ class MRD(Model):
|
||||||
fig = self._handle_plotting(fignum, ax, lambda i, g, ax: ax.imshow(g.X))
|
fig = self._handle_plotting(fignum, ax, lambda i, g, ax: ax.imshow(g.X))
|
||||||
return fig
|
return fig
|
||||||
|
|
||||||
def plot_predict(self, fignum=None, ax=None, **kwargs):
|
def plot_predict(self, fignum=None, ax=None, sharex=False, sharey=False, **kwargs):
|
||||||
fig = self._handle_plotting(fignum, ax, lambda i, g, ax: ax.imshow(g. predict(g.X)[0], **kwargs))
|
fig = self._handle_plotting(fignum,
|
||||||
|
ax,
|
||||||
|
lambda i, g, ax: ax.imshow(g. predict(g.X)[0], **kwargs),
|
||||||
|
sharex=sharex, sharey=sharey)
|
||||||
return fig
|
return fig
|
||||||
|
|
||||||
def plot_scales(self, fignum=None, ax=None, *args, **kwargs):
|
def plot_scales(self, fignum=None, ax=None, titles=None, sharex=False, sharey=True, *args, **kwargs):
|
||||||
fig = self._handle_plotting(fignum, ax, lambda i, g, ax: g.kern.plot_ARD(ax=ax, *args, **kwargs))
|
"""
|
||||||
|
:param:`titles` :
|
||||||
|
titles for axes of datasets
|
||||||
|
"""
|
||||||
|
if titles is None:
|
||||||
|
titles = [r'${}$'.format(name) for name in self.names]
|
||||||
|
ymax = reduce(max, [numpy.ceil(max(g.input_sensitivity())) for g in self.bgplvms])
|
||||||
|
def plotf(i, g, ax):
|
||||||
|
ax.set_ylim([0,ymax])
|
||||||
|
g.kern.plot_ARD(ax=ax, title=titles[i], *args, **kwargs)
|
||||||
|
fig = self._handle_plotting(fignum, ax, plotf, sharex=sharex, sharey=sharey)
|
||||||
return fig
|
return fig
|
||||||
|
|
||||||
def plot_latent(self, fignum=None, ax=None, *args, **kwargs):
|
def plot_latent(self, fignum=None, ax=None, *args, **kwargs):
|
||||||
fig = self._handle_plotting(fignum, ax, lambda i, g, ax: g.plot_latent(ax=ax, *args, **kwargs))
|
fig = self.gref.plot_latent(fignum=fignum, ax=ax, *args, **kwargs) # self._handle_plotting(fignum, ax, lambda i, g, ax: g.plot_latent(ax=ax, *args, **kwargs))
|
||||||
return fig
|
return fig
|
||||||
|
|
||||||
def _debug_plot(self):
|
def _debug_plot(self):
|
||||||
|
|
@ -305,13 +364,4 @@ class MRD(Model):
|
||||||
pylab.draw()
|
pylab.draw()
|
||||||
fig.tight_layout()
|
fig.tight_layout()
|
||||||
|
|
||||||
def _debug_optimize(self, opt='scg', maxiters=5000, itersteps=10):
|
|
||||||
iters = 0
|
|
||||||
optstep = lambda: self.optimize(opt, messages=1, max_f_eval=itersteps)
|
|
||||||
self._debug_plot()
|
|
||||||
raw_input("enter to start debug")
|
|
||||||
while iters < maxiters:
|
|
||||||
optstep()
|
|
||||||
self._debug_plot()
|
|
||||||
iters += itersteps
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,7 @@ class SparseGPClassification(SparseGP):
|
||||||
|
|
||||||
def __init__(self, X, Y=None, likelihood=None, kernel=None, normalize_X=False, normalize_Y=False, Z=None, num_inducing=10):
|
def __init__(self, X, Y=None, likelihood=None, kernel=None, normalize_X=False, normalize_Y=False, Z=None, num_inducing=10):
|
||||||
if kernel is None:
|
if kernel is None:
|
||||||
kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1],1e-3)
|
kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1], 1e-3)
|
||||||
|
|
||||||
if likelihood is None:
|
if likelihood is None:
|
||||||
distribution = likelihoods.likelihood_functions.Binomial()
|
distribution = likelihoods.likelihood_functions.Binomial()
|
||||||
|
|
@ -41,7 +41,16 @@ class SparseGPClassification(SparseGP):
|
||||||
i = np.random.permutation(X.shape[0])[:num_inducing]
|
i = np.random.permutation(X.shape[0])[:num_inducing]
|
||||||
Z = X[i].copy()
|
Z = X[i].copy()
|
||||||
else:
|
else:
|
||||||
assert Z.shape[1]==X.shape[1]
|
assert Z.shape[1] == X.shape[1]
|
||||||
|
|
||||||
SparseGP.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X)
|
SparseGP.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X)
|
||||||
self.ensure_default_constraints()
|
self.ensure_default_constraints()
|
||||||
|
|
||||||
|
def getstate(self):
|
||||||
|
return SparseGP.getstate(self)
|
||||||
|
|
||||||
|
|
||||||
|
def setstate(self, state):
|
||||||
|
return SparseGP.setstate(self, state)
|
||||||
|
|
||||||
|
pass
|
||||||
|
|
|
||||||
|
|
@ -29,7 +29,7 @@ class SparseGPRegression(SparseGP):
|
||||||
def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False, Z=None, num_inducing=10, X_variance=None):
|
def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False, Z=None, num_inducing=10, X_variance=None):
|
||||||
# kern defaults to rbf (plus white for stability)
|
# kern defaults to rbf (plus white for stability)
|
||||||
if kernel is None:
|
if kernel is None:
|
||||||
kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1], 1e-3)
|
kernel = kern.rbf(X.shape[1]) # + kern.white(X.shape[1], 1e-3)
|
||||||
|
|
||||||
# Z defaults to a subset of the data
|
# Z defaults to a subset of the data
|
||||||
if Z is None:
|
if Z is None:
|
||||||
|
|
@ -43,3 +43,13 @@ class SparseGPRegression(SparseGP):
|
||||||
|
|
||||||
SparseGP.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X, X_variance=X_variance)
|
SparseGP.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X, X_variance=X_variance)
|
||||||
self.ensure_default_constraints()
|
self.ensure_default_constraints()
|
||||||
|
pass
|
||||||
|
|
||||||
|
def getstate(self):
|
||||||
|
return SparseGP.getstate(self)
|
||||||
|
|
||||||
|
|
||||||
|
def setstate(self, state):
|
||||||
|
return SparseGP.setstate(self, state)
|
||||||
|
|
||||||
|
pass
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,14 @@ class SparseGPLVM(SparseGPRegression, GPLVM):
|
||||||
SparseGPRegression.__init__(self, X, Y, kernel=kernel, num_inducing=num_inducing)
|
SparseGPRegression.__init__(self, X, Y, kernel=kernel, num_inducing=num_inducing)
|
||||||
self.ensure_default_constraints()
|
self.ensure_default_constraints()
|
||||||
|
|
||||||
|
def getstate(self):
|
||||||
|
return SparseGPRegression.getstate(self)
|
||||||
|
|
||||||
|
|
||||||
|
def setstate(self, state):
|
||||||
|
return SparseGPRegression.setstate(self, state)
|
||||||
|
|
||||||
|
|
||||||
def _get_param_names(self):
|
def _get_param_names(self):
|
||||||
return (sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
|
return (sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
|
||||||
+ SparseGPRegression._get_param_names(self))
|
+ SparseGPRegression._get_param_names(self))
|
||||||
|
|
|
||||||
|
|
@ -42,3 +42,11 @@ class SVIGPRegression(SVIGP):
|
||||||
|
|
||||||
SVIGP.__init__(self, X, likelihood, kernel, Z, q_u=q_u, batchsize=batchsize)
|
SVIGP.__init__(self, X, likelihood, kernel, Z, q_u=q_u, batchsize=batchsize)
|
||||||
self.load_batch()
|
self.load_batch()
|
||||||
|
|
||||||
|
def getstate(self):
|
||||||
|
return GPBase.getstate(self)
|
||||||
|
|
||||||
|
|
||||||
|
def setstate(self, state):
|
||||||
|
return GPBase.setstate(self, state)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -19,7 +19,9 @@ class WarpedGP(GP):
|
||||||
self.warping_function = TanhWarpingFunction_d(warping_terms)
|
self.warping_function = TanhWarpingFunction_d(warping_terms)
|
||||||
self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1,) * 1)
|
self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1,) * 1)
|
||||||
|
|
||||||
Y = self._scale_data(Y)
|
self.scale_data = False
|
||||||
|
if self.scale_data:
|
||||||
|
Y = self._scale_data(Y)
|
||||||
self.has_uncertain_inputs = False
|
self.has_uncertain_inputs = False
|
||||||
self.Y_untransformed = Y.copy()
|
self.Y_untransformed = Y.copy()
|
||||||
self.predict_in_warped_space = False
|
self.predict_in_warped_space = False
|
||||||
|
|
@ -28,6 +30,14 @@ class WarpedGP(GP):
|
||||||
GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
|
GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
|
||||||
self._set_params(self._get_params())
|
self._set_params(self._get_params())
|
||||||
|
|
||||||
|
def getstate(self):
|
||||||
|
return GP.getstate(self)
|
||||||
|
|
||||||
|
|
||||||
|
def setstate(self, state):
|
||||||
|
return GP.setstate(self, state)
|
||||||
|
|
||||||
|
|
||||||
def _scale_data(self, Y):
|
def _scale_data(self, Y):
|
||||||
self._Ymax = Y.max()
|
self._Ymax = Y.max()
|
||||||
self._Ymin = Y.min()
|
self._Ymin = Y.min()
|
||||||
|
|
@ -79,11 +89,19 @@ class WarpedGP(GP):
|
||||||
def plot_warping(self):
|
def plot_warping(self):
|
||||||
self.warping_function.plot(self.warping_params, self.Y_untransformed.min(), self.Y_untransformed.max())
|
self.warping_function.plot(self.warping_params, self.Y_untransformed.min(), self.Y_untransformed.max())
|
||||||
|
|
||||||
def _raw_predict(self, *args, **kwargs):
|
def predict(self, Xnew, which_parts='all', full_cov=False, pred_init=None):
|
||||||
mu, var = GP._raw_predict(self, *args, **kwargs)
|
# normalize X values
|
||||||
|
Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
|
||||||
|
mu, var = GP._raw_predict(self, Xnew, full_cov=full_cov, which_parts=which_parts)
|
||||||
|
|
||||||
|
# now push through likelihood
|
||||||
|
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
|
||||||
|
|
||||||
if self.predict_in_warped_space:
|
if self.predict_in_warped_space:
|
||||||
mu = self.warping_function.f_inv(mu, self.warping_params)
|
mean = self.warping_function.f_inv(mean, self.warping_params, y=pred_init)
|
||||||
var = self.warping_function.f_inv(var, self.warping_params)
|
var = self.warping_function.f_inv(var, self.warping_params)
|
||||||
mu = self._unscale_data(mu)
|
|
||||||
return mu, var
|
if self.scale_data:
|
||||||
|
mean = self._unscale_data(mean)
|
||||||
|
|
||||||
|
return mean, var, _025pm, _975pm
|
||||||
|
|
|
||||||
80
GPy/notes.txt
Normal file
80
GPy/notes.txt
Normal file
|
|
@ -0,0 +1,80 @@
|
||||||
|
Prod.py kernel could also take a list of kernels rather than two arguments for kernels.
|
||||||
|
transformations.py should have limits on what is fed into exp() particularly for the negative log logistic (done -neil).
|
||||||
|
|
||||||
|
Load in a model with mlp kernel, plot it, change a parameter, plot it again. It doesn't update the plot.
|
||||||
|
|
||||||
|
Tests for kernels which work directly on the kernel implementation (not through GP).
|
||||||
|
|
||||||
|
Should stationary covariances have their own kernpart type, I think so, also inner product kernels. That way the caching so carefully constructed for RBF or linear could be shared.
|
||||||
|
|
||||||
|
Where do we declare default kernel parameters. In constructors.py or in the definition file for the kernel?
|
||||||
|
|
||||||
|
When printing to stdout, can we check that our approach is also working nicely for the ipython notebook? I like the way our optimization ticks over, but at the moment this doesn't seem to work in the ipython notebook, it would be nice if it did. My problems may be due to using ipython 0.12, I've had a poke around at fixing this and I can't do it for 0.12.
|
||||||
|
|
||||||
|
When we print a model should we also include information such as number of inputs and number of outputs?
|
||||||
|
|
||||||
|
Let's not use N for giving the number of data in the model. When it pops up as a help tip it's not as clear as num_samples or num_data. Prefer the second, but oddly I've been using first.
|
||||||
|
|
||||||
|
Loving the fact that the * has been overloaded on the kernels (oddly never thought to check this before). Although naming can be a bit confusing. Can we think how to deal with the names in a clearer way when we use a kernel like this one:
|
||||||
|
kern = GPy.kern.rbf(30)*(GPy.kern.mlp(30)+GPy.kern.poly(30, degree=5)) + GPy.kern.bias(30). There seems to be some tieing of parameters going on ... should there be? (you can try it as the kernel for the robot wireless model).
|
||||||
|
|
||||||
|
Can we comment up some of the list incomprehensions in hierarchical.py??
|
||||||
|
|
||||||
|
Need to tidy up classification.py,
|
||||||
|
many examples include help that doesn't apply
|
||||||
|
(it is suggested that you can try different approximation types)
|
||||||
|
|
||||||
|
Shall we overload the ** operator to have tensor products? (I've done this now we can see if we like it)
|
||||||
|
|
||||||
|
People aren't filling the doc strings in as they go *everyone* needs to get in the habit of this (and modifying them as they edit, or correcting them when there is a problem).
|
||||||
|
|
||||||
|
Need some nice way of explaining how to compile documentation and run the unit tests, could this be in a readme or FAQ somewhere? Maybe it's there already somewhere and I've missed it.
|
||||||
|
|
||||||
|
Shouldn't EP be in the inference package (not likelihoods)?
|
||||||
|
|
||||||
|
When using bfgs in ipython notebook, text appears in the original console, not in the notebook.
|
||||||
|
|
||||||
|
In sparse GPs wouldn't it be clearer to call Z inducing?
|
||||||
|
|
||||||
|
In coregionalisation matrix, setting the W to all ones will (surely?) ensure that symmetry isn't broken. Also, but allowing it to scale like that, the output variance increases as rank is increased (and if user sets rank to more than output dim they could get very different results).
|
||||||
|
|
||||||
|
We are inconsistent about our use of ise and ize e.g. optimize and normalize_X, but coregionalise, we should choose one and stick to it. Suggest -ize.
|
||||||
|
|
||||||
|
Exceptions: we need to provide a list of exceptions we throw and specify what is thrown where.
|
||||||
|
|
||||||
|
Why is it get_params() but it's getstate()? Should be get_state(). Why is it get_gradient instead of get_gradients? Need to be consistent!! Doesn't matter which way we choose as long as it's consistent.
|
||||||
|
|
||||||
|
In likelihood Nparams should be num_params
|
||||||
|
|
||||||
|
In likelihood N should be num_data
|
||||||
|
|
||||||
|
The Gaussian target in likelihood should be F What is V doing here?
|
||||||
|
|
||||||
|
Need to check for nan values in likelihoods. These should be treated as missing values. If the likelihood can't handle the missing value an error should be throw.
|
||||||
|
|
||||||
|
|
||||||
|
Sometimes you want to print kernpart objects, for diagnosis etc. This isn't possible currently.
|
||||||
|
|
||||||
|
Why do likelihoods still have YYT everywhere, didn't we agree to set observed data to Y and latent function to F?
|
||||||
|
|
||||||
|
For some reason a stub of _get_param_names(self) wasn't available in the Parameterized base class. Have put it in (is this right?)
|
||||||
|
|
||||||
|
Is there a quick FAQ or something on how to build the documentation? I did it once, but can't remember! Have started a FAQ.txt file where we can add this type of information.
|
||||||
|
|
||||||
|
Similar for the nosetests ... even ran them last week but can't remember the command!
|
||||||
|
|
||||||
|
Now added Gaussian priors to GPLVM latent variables by default. When running the GPy.examples.dimensionality_reduction.stick() example the print out from print model has the same value for the prior+likelihood as for the prior.
|
||||||
|
|
||||||
|
For the back constrained GP-LVM need priors to be on the Xs not on the model parameters (because they aren't parameters, they are constraints). Need to work out how to do this, perhaps by creating the full GP-LVM model then constraining around it, rather than overriding inside the GP-LVM model.
|
||||||
|
|
||||||
|
|
||||||
|
This code fails:
|
||||||
|
|
||||||
|
kern = GPy.kern.rbf(2)
|
||||||
|
GPy.kern.Kern_check_dK_dX(kern, X=np.random.randn(10, 2), X2=None).checkgrad(verbose=True)
|
||||||
|
|
||||||
|
because X2 is now equal to X, so there is a factor of 2 missing. Does this every come up? Yes, in the GP-LVM, (gplvm.py, line 64) where it is called with a corrective factor of 2! And on line 241 of sparse_gp where it is also called with a corrective factor of 2! In original matlab GPLVM, didn't allow gradients with respect to X alone, and multiplied by 2 in base code, but then add diagonal across those elements. This is missing in the new code.
|
||||||
|
|
||||||
|
|
||||||
|
In white.py, line 41, Need to check here if X and X2 refer to the same reference too ... becaue up the pipeline somewhere someone may have set X2=X when X2 arrived originally equal to None.
|
||||||
|
|
||||||
|
|
@ -7,7 +7,6 @@ import unittest
|
||||||
import numpy
|
import numpy
|
||||||
from GPy.inference.conjugate_gradient_descent import CGD, RUNNING
|
from GPy.inference.conjugate_gradient_descent import CGD, RUNNING
|
||||||
import pylab
|
import pylab
|
||||||
import time
|
|
||||||
from scipy.optimize.optimize import rosen, rosen_der
|
from scipy.optimize.optimize import rosen, rosen_der
|
||||||
from GPy.inference.gradient_descent_update_rules import PolakRibiere
|
from GPy.inference.gradient_descent_update_rules import PolakRibiere
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -19,14 +19,14 @@ class ExamplesTests(unittest.TestCase):
|
||||||
self.assertTrue(isinstance(Model, GPy.models))
|
self.assertTrue(isinstance(Model, GPy.models))
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def model_instance_generator(Model):
|
def model_instance_generator(model):
|
||||||
def check_model_returned(self):
|
def check_model_returned(self):
|
||||||
self._model_instance(Model)
|
self._model_instance(model)
|
||||||
return check_model_returned
|
return check_model_returned
|
||||||
|
|
||||||
def checkgrads_generator(Model):
|
def checkgrads_generator(model):
|
||||||
def model_checkgrads(self):
|
def model_checkgrads(self):
|
||||||
self._checkgrad(Model)
|
self._checkgrad(model)
|
||||||
return model_checkgrads
|
return model_checkgrads
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -37,7 +37,7 @@ def model_checkgrads(model):
|
||||||
|
|
||||||
def model_instance(model):
|
def model_instance(model):
|
||||||
#assert isinstance(model, GPy.core.model)
|
#assert isinstance(model, GPy.core.model)
|
||||||
return isinstance(model, GPy.core.Model)
|
return isinstance(model, GPy.core.model)
|
||||||
|
|
||||||
@nottest
|
@nottest
|
||||||
def test_models():
|
def test_models():
|
||||||
|
|
|
||||||
|
|
@ -7,33 +7,33 @@ import GPy
|
||||||
|
|
||||||
class GPLVMTests(unittest.TestCase):
|
class GPLVMTests(unittest.TestCase):
|
||||||
def test_bias_kern(self):
|
def test_bias_kern(self):
|
||||||
N, num_inducing, input_dim, D = 10, 3, 2, 4
|
num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
|
||||||
X = np.random.rand(N, input_dim)
|
X = np.random.rand(num_data, input_dim)
|
||||||
k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
|
k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
|
||||||
K = k.K(X)
|
K = k.K(X)
|
||||||
Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
|
Y = np.random.multivariate_normal(np.zeros(num_data),K,output_dim).T
|
||||||
k = GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
|
k = GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
|
||||||
m = GPy.models.GPLVM(Y, input_dim, kernel = k)
|
m = GPy.models.GPLVM(Y, input_dim, kernel = k)
|
||||||
m.randomize()
|
m.randomize()
|
||||||
self.assertTrue(m.checkgrad())
|
self.assertTrue(m.checkgrad())
|
||||||
|
|
||||||
def test_linear_kern(self):
|
def test_linear_kern(self):
|
||||||
N, num_inducing, input_dim, D = 10, 3, 2, 4
|
num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
|
||||||
X = np.random.rand(N, input_dim)
|
X = np.random.rand(num_data, input_dim)
|
||||||
k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
|
k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
|
||||||
K = k.K(X)
|
K = k.K(X)
|
||||||
Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
|
Y = np.random.multivariate_normal(np.zeros(num_data),K,output_dim).T
|
||||||
k = GPy.kern.linear(input_dim) + GPy.kern.white(input_dim, 0.00001)
|
k = GPy.kern.linear(input_dim) + GPy.kern.white(input_dim, 0.00001)
|
||||||
m = GPy.models.GPLVM(Y, input_dim, kernel = k)
|
m = GPy.models.GPLVM(Y, input_dim, kernel = k)
|
||||||
m.randomize()
|
m.randomize()
|
||||||
self.assertTrue(m.checkgrad())
|
self.assertTrue(m.checkgrad())
|
||||||
|
|
||||||
def test_rbf_kern(self):
|
def test_rbf_kern(self):
|
||||||
N, num_inducing, input_dim, D = 10, 3, 2, 4
|
num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
|
||||||
X = np.random.rand(N, input_dim)
|
X = np.random.rand(num_data, input_dim)
|
||||||
k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
|
k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
|
||||||
K = k.K(X)
|
K = k.K(X)
|
||||||
Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
|
Y = np.random.multivariate_normal(np.zeros(num_data),K,output_dim).T
|
||||||
k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
|
k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
|
||||||
m = GPy.models.GPLVM(Y, input_dim, kernel = k)
|
m = GPy.models.GPLVM(Y, input_dim, kernel = k)
|
||||||
m.randomize()
|
m.randomize()
|
||||||
|
|
|
||||||
|
|
@ -1,20 +1,46 @@
|
||||||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt).
|
||||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import GPy
|
import GPy
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class KernelTests(unittest.TestCase):
|
class KernelTests(unittest.TestCase):
|
||||||
def test_kerneltie(self):
|
def test_kerneltie(self):
|
||||||
K = GPy.kern.rbf(5, ARD=True)
|
K = GPy.kern.rbf(5, ARD=True)
|
||||||
K.tie_params('.*[01]')
|
K.tie_params('.*[01]')
|
||||||
K.constrain_fixed('2')
|
K.constrain_fixed('2')
|
||||||
|
|
||||||
X = np.random.rand(5,5)
|
X = np.random.rand(5,5)
|
||||||
Y = np.ones((5,1))
|
Y = np.ones((5,1))
|
||||||
m = GPy.models.GPRegression(X,Y,K)
|
m = GPy.models.GPRegression(X,Y,K)
|
||||||
self.assertTrue(m.checkgrad())
|
self.assertTrue(m.checkgrad())
|
||||||
|
|
||||||
|
def test_rbfkernel(self):
|
||||||
|
verbose = False
|
||||||
|
kern = GPy.kern.rbf(5)
|
||||||
|
self.assertTrue(GPy.kern.Kern_check_model(kern).is_positive_definite())
|
||||||
|
self.assertTrue(GPy.kern.Kern_check_dK_dtheta(kern).checkgrad(verbose=verbose))
|
||||||
|
self.assertTrue(GPy.kern.Kern_check_dKdiag_dtheta(kern).checkgrad(verbose=verbose))
|
||||||
|
self.assertTrue(GPy.kern.Kern_check_dK_dX(kern).checkgrad(verbose=verbose))
|
||||||
|
|
||||||
|
def test_gibbskernel(self):
|
||||||
|
verbose = False
|
||||||
|
kern = GPy.kern.gibbs(5, mapping=GPy.mappings.Linear(5, 1))
|
||||||
|
self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
|
||||||
|
|
||||||
|
def test_mlpkernel(self):
|
||||||
|
verbose = False
|
||||||
|
kern = GPy.kern.mlp(5)
|
||||||
|
self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
|
||||||
|
|
||||||
|
def test_polykernel(self):
|
||||||
|
verbose = False
|
||||||
|
kern = GPy.kern.poly(5, degree=4)
|
||||||
|
self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
|
||||||
|
|
||||||
def test_fixedkernel(self):
|
def test_fixedkernel(self):
|
||||||
"""
|
"""
|
||||||
Fixed effect kernel test
|
Fixed effect kernel test
|
||||||
|
|
@ -42,8 +68,7 @@ class KernelTests(unittest.TestCase):
|
||||||
self.assertTrue(m.checkgrad())
|
self.assertTrue(m.checkgrad())
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
print "Running unit tests, please be (very) patient..."
|
print "Running unit tests, please be (very) patient..."
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|
|
||||||
34
GPy/testing/mapping_tests.py
Normal file
34
GPy/testing/mapping_tests.py
Normal file
|
|
@ -0,0 +1,34 @@
|
||||||
|
# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt).
|
||||||
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
import numpy as np
|
||||||
|
import GPy
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class MappingTests(unittest.TestCase):
|
||||||
|
|
||||||
|
def test_kernelmapping(self):
|
||||||
|
verbose = False
|
||||||
|
mapping = GPy.mappings.Kernel(np.random.rand(10, 3), 2)
|
||||||
|
self.assertTrue(GPy.core.mapping.Mapping_check_df_dtheta(mapping=mapping).checkgrad(verbose=verbose))
|
||||||
|
self.assertTrue(GPy.core.mapping.Mapping_check_df_dX(mapping=mapping).checkgrad(verbose=verbose))
|
||||||
|
|
||||||
|
def test_linearmapping(self):
|
||||||
|
verbose = False
|
||||||
|
mapping = GPy.mappings.Linear(3, 2)
|
||||||
|
self.assertTrue(GPy.core.Mapping_check_df_dtheta(mapping=mapping).checkgrad(verbose=verbose))
|
||||||
|
self.assertTrue(GPy.core.Mapping_check_df_dX(mapping=mapping).checkgrad(verbose=verbose))
|
||||||
|
|
||||||
|
def test_mlpmapping(self):
|
||||||
|
verbose = False
|
||||||
|
mapping = GPy.mappings.MLP(input_dim=2, hidden_dim=[3, 4, 8, 2], output_dim=2)
|
||||||
|
self.assertTrue(GPy.core.Mapping_check_df_dtheta(mapping=mapping).checkgrad(verbose=verbose))
|
||||||
|
self.assertTrue(GPy.core.Mapping_check_df_dX(mapping=mapping).checkgrad(verbose=verbose))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
print "Running unit tests, please be (very) patient..."
|
||||||
|
unittest.main()
|
||||||
|
|
@ -7,9 +7,14 @@ import unittest
|
||||||
import GPy
|
import GPy
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from GPy import testing
|
from GPy import testing
|
||||||
|
import sys
|
||||||
|
import numpy
|
||||||
|
from GPy.kern.parts.rbf import RBF
|
||||||
|
from GPy.kern.parts.linear import Linear
|
||||||
|
from copy import deepcopy
|
||||||
|
|
||||||
__test__ = False
|
__test__ = lambda: 'deep' in sys.argv
|
||||||
np.random.seed(0)
|
# np.random.seed(0)
|
||||||
|
|
||||||
def ard(p):
|
def ard(p):
|
||||||
try:
|
try:
|
||||||
|
|
@ -19,24 +24,37 @@ def ard(p):
|
||||||
pass
|
pass
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
@testing.deepTest(__test__)
|
@testing.deepTest(__test__())
|
||||||
class Test(unittest.TestCase):
|
class Test(unittest.TestCase):
|
||||||
input_dim = 9
|
input_dim = 9
|
||||||
num_inducing = 4
|
num_inducing = 4
|
||||||
N = 3
|
N = 3
|
||||||
Nsamples = 6e6
|
Nsamples = 5e6
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
i_s_dim_list = [2,4,3]
|
||||||
|
indices = numpy.cumsum(i_s_dim_list).tolist()
|
||||||
|
input_slices = [slice(a,b) for a,b in zip([None]+indices, indices)]
|
||||||
|
#input_slices[2] = deepcopy(input_slices[1])
|
||||||
|
input_slice_kern = GPy.kern.kern(9,
|
||||||
|
[
|
||||||
|
RBF(i_s_dim_list[0], np.random.rand(), np.random.rand(i_s_dim_list[0]), ARD=True),
|
||||||
|
RBF(i_s_dim_list[1], np.random.rand(), np.random.rand(i_s_dim_list[1]), ARD=True),
|
||||||
|
Linear(i_s_dim_list[2], np.random.rand(i_s_dim_list[2]), ARD=True)
|
||||||
|
],
|
||||||
|
input_slices = input_slices
|
||||||
|
)
|
||||||
self.kerns = (
|
self.kerns = (
|
||||||
|
input_slice_kern,
|
||||||
# (GPy.kern.rbf(self.input_dim, ARD=True) +
|
# (GPy.kern.rbf(self.input_dim, ARD=True) +
|
||||||
# GPy.kern.linear(self.input_dim, ARD=True) +
|
# GPy.kern.linear(self.input_dim, ARD=True) +
|
||||||
# GPy.kern.bias(self.input_dim) +
|
# GPy.kern.bias(self.input_dim) +
|
||||||
# GPy.kern.white(self.input_dim)),
|
# GPy.kern.white(self.input_dim)),
|
||||||
(GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
|
# (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
|
||||||
GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
|
# GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
|
||||||
GPy.kern.linear(self.input_dim, np.random.rand(self.input_dim), ARD=True) +
|
# GPy.kern.linear(self.input_dim, np.random.rand(self.input_dim), ARD=True) +
|
||||||
GPy.kern.bias(self.input_dim) +
|
# GPy.kern.bias(self.input_dim) +
|
||||||
GPy.kern.white(self.input_dim)),
|
# GPy.kern.white(self.input_dim)),
|
||||||
# GPy.kern.rbf(self.input_dim), GPy.kern.rbf(self.input_dim, ARD=True),
|
# GPy.kern.rbf(self.input_dim), GPy.kern.rbf(self.input_dim, ARD=True),
|
||||||
# GPy.kern.linear(self.input_dim, ARD=False), GPy.kern.linear(self.input_dim, ARD=True),
|
# GPy.kern.linear(self.input_dim, ARD=False), GPy.kern.linear(self.input_dim, ARD=True),
|
||||||
# GPy.kern.linear(self.input_dim) + GPy.kern.bias(self.input_dim),
|
# GPy.kern.linear(self.input_dim) + GPy.kern.bias(self.input_dim),
|
||||||
|
|
@ -61,22 +79,22 @@ class Test(unittest.TestCase):
|
||||||
|
|
||||||
def test_psi1(self):
|
def test_psi1(self):
|
||||||
for kern in self.kerns:
|
for kern in self.kerns:
|
||||||
Nsamples = 100
|
Nsamples = np.floor(self.Nsamples/300.)
|
||||||
psi1 = kern.psi1(self.Z, self.q_x_mean, self.q_x_variance)
|
psi1 = kern.psi1(self.Z, self.q_x_mean, self.q_x_variance)
|
||||||
K_ = np.zeros((Nsamples, self.num_inducing))
|
K_ = np.zeros((Nsamples, self.num_inducing))
|
||||||
diffs = []
|
diffs = []
|
||||||
for i, q_x_sample_stripe in enumerate(np.array_split(self.q_x_samples, self.Nsamples / Nsamples)):
|
for i, q_x_sample_stripe in enumerate(np.array_split(self.q_x_samples, self.Nsamples / Nsamples)):
|
||||||
K = kern.K(q_x_sample_stripe, self.Z)
|
K = kern.K(q_x_sample_stripe[:Nsamples], self.Z)
|
||||||
K_ += K
|
K_ += K
|
||||||
diffs.append(((psi1 - (K_ / (i + 1)))).mean())
|
diffs.append((np.abs(psi1 - (K_ / (i + 1)))**2).mean())
|
||||||
K_ /= self.Nsamples / Nsamples
|
K_ /= self.Nsamples / Nsamples
|
||||||
msg = "psi1: " + "+".join([p.name + ard(p) for p in kern.parts])
|
msg = "psi1: " + "+".join([p.name + ard(p) for p in kern.parts])
|
||||||
try:
|
try:
|
||||||
import pylab
|
import pylab
|
||||||
pylab.figure(msg)
|
pylab.figure(msg)
|
||||||
pylab.plot(diffs)
|
pylab.plot(diffs)
|
||||||
self.assertTrue(np.allclose(psi1.squeeze(), K_,
|
# print msg, ((psi1.squeeze() - K_)**2).mean() < .01
|
||||||
rtol=1e-1, atol=.1),
|
self.assertTrue(((psi1.squeeze() - K_)**2).mean() < .01,
|
||||||
msg=msg + ": not matching")
|
msg=msg + ": not matching")
|
||||||
# sys.stdout.write(".")
|
# sys.stdout.write(".")
|
||||||
except:
|
except:
|
||||||
|
|
@ -87,7 +105,7 @@ class Test(unittest.TestCase):
|
||||||
|
|
||||||
def test_psi2(self):
|
def test_psi2(self):
|
||||||
for kern in self.kerns:
|
for kern in self.kerns:
|
||||||
Nsamples = 100
|
Nsamples = self.Nsamples/300.
|
||||||
psi2 = kern.psi2(self.Z, self.q_x_mean, self.q_x_variance)
|
psi2 = kern.psi2(self.Z, self.q_x_mean, self.q_x_variance)
|
||||||
K_ = np.zeros((self.num_inducing, self.num_inducing))
|
K_ = np.zeros((self.num_inducing, self.num_inducing))
|
||||||
diffs = []
|
diffs = []
|
||||||
|
|
@ -95,13 +113,14 @@ class Test(unittest.TestCase):
|
||||||
K = kern.K(q_x_sample_stripe, self.Z)
|
K = kern.K(q_x_sample_stripe, self.Z)
|
||||||
K = (K[:, :, None] * K[:, None, :]).mean(0)
|
K = (K[:, :, None] * K[:, None, :]).mean(0)
|
||||||
K_ += K
|
K_ += K
|
||||||
diffs.append(((psi2 - (K_ / (i + 1)))).mean())
|
diffs.append(((psi2 - (K_ / (i + 1)))**2).mean())
|
||||||
K_ /= self.Nsamples / Nsamples
|
K_ /= self.Nsamples / Nsamples
|
||||||
msg = "psi2: {}".format("+".join([p.name + ard(p) for p in kern.parts]))
|
msg = "psi2: {}".format("+".join([p.name + ard(p) for p in kern.parts]))
|
||||||
try:
|
try:
|
||||||
import pylab
|
import pylab
|
||||||
pylab.figure(msg)
|
pylab.figure(msg)
|
||||||
pylab.plot(diffs)
|
pylab.plot(diffs)
|
||||||
|
# print msg, np.allclose(psi2.squeeze(), K_, rtol=1e-1, atol=.1)
|
||||||
self.assertTrue(np.allclose(psi2.squeeze(), K_,
|
self.assertTrue(np.allclose(psi2.squeeze(), K_,
|
||||||
rtol=1e-1, atol=.1),
|
rtol=1e-1, atol=.1),
|
||||||
msg=msg + ": not matching")
|
msg=msg + ": not matching")
|
||||||
|
|
@ -114,10 +133,8 @@ class Test(unittest.TestCase):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import sys
|
|
||||||
__test__ = 'deep' in sys.argv
|
|
||||||
sys.argv = ['',
|
sys.argv = ['',
|
||||||
'Test.test_psi0',
|
#'Test.test_psi0',
|
||||||
'Test.test_psi1',
|
'Test.test_psi1',
|
||||||
'Test.test_psi2',
|
'Test.test_psi2',
|
||||||
]
|
]
|
||||||
|
|
@ -23,7 +23,7 @@ class GradientTests(unittest.TestCase):
|
||||||
self.X2D = np.random.uniform(-3., 3., (40, 2))
|
self.X2D = np.random.uniform(-3., 3., (40, 2))
|
||||||
self.Y2D = np.sin(self.X2D[:, 0:1]) * np.sin(self.X2D[:, 1:2]) + np.random.randn(40, 1) * 0.05
|
self.Y2D = np.sin(self.X2D[:, 0:1]) * np.sin(self.X2D[:, 1:2]) + np.random.randn(40, 1) * 0.05
|
||||||
|
|
||||||
def check_model_with_white(self, kern, model_type='GPRegression', dimension=1):
|
def check_model(self, kern, model_type='GPRegression', dimension=1, uncertain_inputs=False):
|
||||||
# Get the correct gradients
|
# Get the correct gradients
|
||||||
if dimension == 1:
|
if dimension == 1:
|
||||||
X = self.X1D
|
X = self.X1D
|
||||||
|
|
@ -34,9 +34,12 @@ class GradientTests(unittest.TestCase):
|
||||||
# Get model type (GPRegression, SparseGPRegression, etc)
|
# Get model type (GPRegression, SparseGPRegression, etc)
|
||||||
model_fit = getattr(GPy.models, model_type)
|
model_fit = getattr(GPy.models, model_type)
|
||||||
|
|
||||||
noise = GPy.kern.white(dimension)
|
# noise = GPy.kern.white(dimension)
|
||||||
kern = kern + noise
|
kern = kern # + noise
|
||||||
m = model_fit(X, Y, kernel=kern)
|
if uncertain_inputs:
|
||||||
|
m = model_fit(X, Y, kernel=kern, X_variance=np.random.rand(X.shape[0], X.shape[1]))
|
||||||
|
else:
|
||||||
|
m = model_fit(X, Y, kernel=kern)
|
||||||
m.randomize()
|
m.randomize()
|
||||||
# contrain all parameters to be positive
|
# contrain all parameters to be positive
|
||||||
self.assertTrue(m.checkgrad())
|
self.assertTrue(m.checkgrad())
|
||||||
|
|
@ -44,105 +47,135 @@ class GradientTests(unittest.TestCase):
|
||||||
def test_GPRegression_rbf_1d(self):
|
def test_GPRegression_rbf_1d(self):
|
||||||
''' Testing the GP regression with rbf kernel with white kernel on 1d data '''
|
''' Testing the GP regression with rbf kernel with white kernel on 1d data '''
|
||||||
rbf = GPy.kern.rbf(1)
|
rbf = GPy.kern.rbf(1)
|
||||||
self.check_model_with_white(rbf, model_type='GPRegression', dimension=1)
|
self.check_model(rbf, model_type='GPRegression', dimension=1)
|
||||||
|
|
||||||
def test_GPRegression_rbf_2D(self):
|
def test_GPRegression_rbf_2D(self):
|
||||||
''' Testing the GP regression with rbf and white kernel on 2d data '''
|
''' Testing the GP regression with rbf kernel on 2d data '''
|
||||||
rbf = GPy.kern.rbf(2)
|
rbf = GPy.kern.rbf(2)
|
||||||
self.check_model_with_white(rbf, model_type='GPRegression', dimension=2)
|
self.check_model(rbf, model_type='GPRegression', dimension=2)
|
||||||
|
|
||||||
def test_GPRegression_rbf_ARD_2D(self):
|
def test_GPRegression_rbf_ARD_2D(self):
|
||||||
''' Testing the GP regression with rbf and white kernel on 2d data '''
|
''' Testing the GP regression with rbf kernel on 2d data '''
|
||||||
k = GPy.kern.rbf(2, ARD=True)
|
k = GPy.kern.rbf(2, ARD=True)
|
||||||
self.check_model_with_white(k, model_type='GPRegression', dimension=2)
|
self.check_model(k, model_type='GPRegression', dimension=2)
|
||||||
|
|
||||||
|
def test_GPRegression_mlp_1d(self):
|
||||||
|
''' Testing the GP regression with mlp kernel with white kernel on 1d data '''
|
||||||
|
mlp = GPy.kern.mlp(1)
|
||||||
|
self.check_model(mlp, model_type='GPRegression', dimension=1)
|
||||||
|
|
||||||
|
def test_GPRegression_poly_1d(self):
|
||||||
|
''' Testing the GP regression with polynomial kernel with white kernel on 1d data '''
|
||||||
|
mlp = GPy.kern.poly(1, degree=5)
|
||||||
|
self.check_model(mlp, model_type='GPRegression', dimension=1)
|
||||||
|
|
||||||
def test_GPRegression_matern52_1D(self):
|
def test_GPRegression_matern52_1D(self):
|
||||||
''' Testing the GP regression with matern52 kernel on 1d data '''
|
''' Testing the GP regression with matern52 kernel on 1d data '''
|
||||||
matern52 = GPy.kern.Matern52(1)
|
matern52 = GPy.kern.Matern52(1)
|
||||||
self.check_model_with_white(matern52, model_type='GPRegression', dimension=1)
|
self.check_model(matern52, model_type='GPRegression', dimension=1)
|
||||||
|
|
||||||
def test_GPRegression_matern52_2D(self):
|
def test_GPRegression_matern52_2D(self):
|
||||||
''' Testing the GP regression with matern52 kernel on 2d data '''
|
''' Testing the GP regression with matern52 kernel on 2d data '''
|
||||||
matern52 = GPy.kern.Matern52(2)
|
matern52 = GPy.kern.Matern52(2)
|
||||||
self.check_model_with_white(matern52, model_type='GPRegression', dimension=2)
|
self.check_model(matern52, model_type='GPRegression', dimension=2)
|
||||||
|
|
||||||
def test_GPRegression_matern52_ARD_2D(self):
|
def test_GPRegression_matern52_ARD_2D(self):
|
||||||
''' Testing the GP regression with matern52 kernel on 2d data '''
|
''' Testing the GP regression with matern52 kernel on 2d data '''
|
||||||
matern52 = GPy.kern.Matern52(2, ARD=True)
|
matern52 = GPy.kern.Matern52(2, ARD=True)
|
||||||
self.check_model_with_white(matern52, model_type='GPRegression', dimension=2)
|
self.check_model(matern52, model_type='GPRegression', dimension=2)
|
||||||
|
|
||||||
def test_GPRegression_matern32_1D(self):
|
def test_GPRegression_matern32_1D(self):
|
||||||
''' Testing the GP regression with matern32 kernel on 1d data '''
|
''' Testing the GP regression with matern32 kernel on 1d data '''
|
||||||
matern32 = GPy.kern.Matern32(1)
|
matern32 = GPy.kern.Matern32(1)
|
||||||
self.check_model_with_white(matern32, model_type='GPRegression', dimension=1)
|
self.check_model(matern32, model_type='GPRegression', dimension=1)
|
||||||
|
|
||||||
def test_GPRegression_matern32_2D(self):
|
def test_GPRegression_matern32_2D(self):
|
||||||
''' Testing the GP regression with matern32 kernel on 2d data '''
|
''' Testing the GP regression with matern32 kernel on 2d data '''
|
||||||
matern32 = GPy.kern.Matern32(2)
|
matern32 = GPy.kern.Matern32(2)
|
||||||
self.check_model_with_white(matern32, model_type='GPRegression', dimension=2)
|
self.check_model(matern32, model_type='GPRegression', dimension=2)
|
||||||
|
|
||||||
def test_GPRegression_matern32_ARD_2D(self):
|
def test_GPRegression_matern32_ARD_2D(self):
|
||||||
''' Testing the GP regression with matern32 kernel on 2d data '''
|
''' Testing the GP regression with matern32 kernel on 2d data '''
|
||||||
matern32 = GPy.kern.Matern32(2, ARD=True)
|
matern32 = GPy.kern.Matern32(2, ARD=True)
|
||||||
self.check_model_with_white(matern32, model_type='GPRegression', dimension=2)
|
self.check_model(matern32, model_type='GPRegression', dimension=2)
|
||||||
|
|
||||||
def test_GPRegression_exponential_1D(self):
|
def test_GPRegression_exponential_1D(self):
|
||||||
''' Testing the GP regression with exponential kernel on 1d data '''
|
''' Testing the GP regression with exponential kernel on 1d data '''
|
||||||
exponential = GPy.kern.exponential(1)
|
exponential = GPy.kern.exponential(1)
|
||||||
self.check_model_with_white(exponential, model_type='GPRegression', dimension=1)
|
self.check_model(exponential, model_type='GPRegression', dimension=1)
|
||||||
|
|
||||||
def test_GPRegression_exponential_2D(self):
|
def test_GPRegression_exponential_2D(self):
|
||||||
''' Testing the GP regression with exponential kernel on 2d data '''
|
''' Testing the GP regression with exponential kernel on 2d data '''
|
||||||
exponential = GPy.kern.exponential(2)
|
exponential = GPy.kern.exponential(2)
|
||||||
self.check_model_with_white(exponential, model_type='GPRegression', dimension=2)
|
self.check_model(exponential, model_type='GPRegression', dimension=2)
|
||||||
|
|
||||||
def test_GPRegression_exponential_ARD_2D(self):
|
def test_GPRegression_exponential_ARD_2D(self):
|
||||||
''' Testing the GP regression with exponential kernel on 2d data '''
|
''' Testing the GP regression with exponential kernel on 2d data '''
|
||||||
exponential = GPy.kern.exponential(2, ARD=True)
|
exponential = GPy.kern.exponential(2, ARD=True)
|
||||||
self.check_model_with_white(exponential, model_type='GPRegression', dimension=2)
|
self.check_model(exponential, model_type='GPRegression', dimension=2)
|
||||||
|
|
||||||
def test_GPRegression_bias_kern_1D(self):
|
def test_GPRegression_bias_kern_1D(self):
|
||||||
''' Testing the GP regression with bias kernel on 1d data '''
|
''' Testing the GP regression with bias kernel on 1d data '''
|
||||||
bias = GPy.kern.bias(1)
|
bias = GPy.kern.bias(1)
|
||||||
self.check_model_with_white(bias, model_type='GPRegression', dimension=1)
|
self.check_model(bias, model_type='GPRegression', dimension=1)
|
||||||
|
|
||||||
def test_GPRegression_bias_kern_2D(self):
|
def test_GPRegression_bias_kern_2D(self):
|
||||||
''' Testing the GP regression with bias kernel on 2d data '''
|
''' Testing the GP regression with bias kernel on 2d data '''
|
||||||
bias = GPy.kern.bias(2)
|
bias = GPy.kern.bias(2)
|
||||||
self.check_model_with_white(bias, model_type='GPRegression', dimension=2)
|
self.check_model(bias, model_type='GPRegression', dimension=2)
|
||||||
|
|
||||||
def test_GPRegression_linear_kern_1D_ARD(self):
|
def test_GPRegression_linear_kern_1D_ARD(self):
|
||||||
''' Testing the GP regression with linear kernel on 1d data '''
|
''' Testing the GP regression with linear kernel on 1d data '''
|
||||||
linear = GPy.kern.linear(1, ARD=True)
|
linear = GPy.kern.linear(1, ARD=True)
|
||||||
self.check_model_with_white(linear, model_type='GPRegression', dimension=1)
|
self.check_model(linear, model_type='GPRegression', dimension=1)
|
||||||
|
|
||||||
def test_GPRegression_linear_kern_2D_ARD(self):
|
def test_GPRegression_linear_kern_2D_ARD(self):
|
||||||
''' Testing the GP regression with linear kernel on 2d data '''
|
''' Testing the GP regression with linear kernel on 2d data '''
|
||||||
linear = GPy.kern.linear(2, ARD=True)
|
linear = GPy.kern.linear(2, ARD=True)
|
||||||
self.check_model_with_white(linear, model_type='GPRegression', dimension=2)
|
self.check_model(linear, model_type='GPRegression', dimension=2)
|
||||||
|
|
||||||
def test_GPRegression_linear_kern_1D(self):
|
def test_GPRegression_linear_kern_1D(self):
|
||||||
''' Testing the GP regression with linear kernel on 1d data '''
|
''' Testing the GP regression with linear kernel on 1d data '''
|
||||||
linear = GPy.kern.linear(1)
|
linear = GPy.kern.linear(1)
|
||||||
self.check_model_with_white(linear, model_type='GPRegression', dimension=1)
|
self.check_model(linear, model_type='GPRegression', dimension=1)
|
||||||
|
|
||||||
def test_GPRegression_linear_kern_2D(self):
|
def test_GPRegression_linear_kern_2D(self):
|
||||||
''' Testing the GP regression with linear kernel on 2d data '''
|
''' Testing the GP regression with linear kernel on 2d data '''
|
||||||
linear = GPy.kern.linear(2)
|
linear = GPy.kern.linear(2)
|
||||||
self.check_model_with_white(linear, model_type='GPRegression', dimension=2)
|
self.check_model(linear, model_type='GPRegression', dimension=2)
|
||||||
|
|
||||||
def test_SparseGPRegression_rbf_white_kern_1d(self):
|
def test_SparseGPRegression_rbf_white_kern_1d(self):
|
||||||
''' Testing the sparse GP regression with rbf kernel with white kernel on 1d data '''
|
''' Testing the sparse GP regression with rbf kernel with white kernel on 1d data '''
|
||||||
rbf = GPy.kern.rbf(1)
|
rbf = GPy.kern.rbf(1)
|
||||||
self.check_model_with_white(rbf, model_type='SparseGPRegression', dimension=1)
|
self.check_model(rbf, model_type='SparseGPRegression', dimension=1)
|
||||||
|
|
||||||
def test_SparseGPRegression_rbf_white_kern_2D(self):
|
def test_SparseGPRegression_rbf_white_kern_2D(self):
|
||||||
''' Testing the sparse GP regression with rbf and white kernel on 2d data '''
|
''' Testing the sparse GP regression with rbf kernel on 2d data '''
|
||||||
rbf = GPy.kern.rbf(2)
|
rbf = GPy.kern.rbf(2)
|
||||||
self.check_model_with_white(rbf, model_type='SparseGPRegression', dimension=2)
|
self.check_model(rbf, model_type='SparseGPRegression', dimension=2)
|
||||||
|
|
||||||
|
def test_SparseGPRegression_rbf_linear_white_kern_1D(self):
|
||||||
|
''' Testing the sparse GP regression with rbf kernel on 2d data '''
|
||||||
|
rbflin = GPy.kern.rbf(1) + GPy.kern.linear(1)
|
||||||
|
self.check_model(rbflin, model_type='SparseGPRegression', dimension=1)
|
||||||
|
|
||||||
|
def test_SparseGPRegression_rbf_linear_white_kern_2D(self):
|
||||||
|
''' Testing the sparse GP regression with rbf kernel on 2d data '''
|
||||||
|
rbflin = GPy.kern.rbf(2) + GPy.kern.linear(2)
|
||||||
|
self.check_model(rbflin, model_type='SparseGPRegression', dimension=2)
|
||||||
|
|
||||||
|
def test_SparseGPRegression_rbf_linear_white_kern_2D_uncertain_inputs(self):
|
||||||
|
''' Testing the sparse GP regression with rbf, linear kernel on 2d data with uncertain inputs'''
|
||||||
|
rbflin = GPy.kern.rbf(2) + GPy.kern.linear(2)
|
||||||
|
self.check_model(rbflin, model_type='SparseGPRegression', dimension=2, uncertain_inputs=1)
|
||||||
|
|
||||||
|
def test_SparseGPRegression_rbf_linear_white_kern_1D_uncertain_inputs(self):
|
||||||
|
''' Testing the sparse GP regression with rbf, linear kernel on 1d data with uncertain inputs'''
|
||||||
|
rbflin = GPy.kern.rbf(1) + GPy.kern.linear(1)
|
||||||
|
self.check_model(rbflin, model_type='SparseGPRegression', dimension=1, uncertain_inputs=1)
|
||||||
|
|
||||||
def test_GPLVM_rbf_bias_white_kern_2D(self):
|
def test_GPLVM_rbf_bias_white_kern_2D(self):
|
||||||
""" Testing GPLVM with rbf + bias and white kernel """
|
""" Testing GPLVM with rbf + bias kernel """
|
||||||
N, input_dim, D = 50, 1, 2
|
N, input_dim, D = 50, 1, 2
|
||||||
X = np.random.rand(N, input_dim)
|
X = np.random.rand(N, input_dim)
|
||||||
k = GPy.kern.rbf(input_dim, 0.5, 0.9 * np.ones((1,))) + GPy.kern.bias(input_dim, 0.1) + GPy.kern.white(input_dim, 0.05)
|
k = GPy.kern.rbf(input_dim, 0.5, 0.9 * np.ones((1,))) + GPy.kern.bias(input_dim, 0.1) + GPy.kern.white(input_dim, 0.05)
|
||||||
|
|
@ -152,7 +185,7 @@ class GradientTests(unittest.TestCase):
|
||||||
self.assertTrue(m.checkgrad())
|
self.assertTrue(m.checkgrad())
|
||||||
|
|
||||||
def test_GPLVM_rbf_linear_white_kern_2D(self):
|
def test_GPLVM_rbf_linear_white_kern_2D(self):
|
||||||
""" Testing GPLVM with rbf + bias and white kernel """
|
""" Testing GPLVM with rbf + bias kernel """
|
||||||
N, input_dim, D = 50, 1, 2
|
N, input_dim, D = 50, 1, 2
|
||||||
X = np.random.rand(N, input_dim)
|
X = np.random.rand(N, input_dim)
|
||||||
k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim, 0.1) + GPy.kern.white(input_dim, 0.05)
|
k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim, 0.1) + GPy.kern.white(input_dim, 0.05)
|
||||||
|
|
|
||||||
|
|
@ -7,10 +7,10 @@ import misc
|
||||||
import plot
|
import plot
|
||||||
import squashers
|
import squashers
|
||||||
import Tango
|
import Tango
|
||||||
import misc
|
|
||||||
import warping_functions
|
import warping_functions
|
||||||
import datasets
|
import datasets
|
||||||
import mocap
|
import mocap
|
||||||
import visualize
|
import visualize
|
||||||
import decorators
|
import decorators
|
||||||
import classification
|
import classification
|
||||||
|
import latent_space_visualizations
|
||||||
|
|
|
||||||
|
|
@ -1,17 +1,138 @@
|
||||||
import os
|
import os
|
||||||
import pylab as pb
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import GPy
|
import GPy
|
||||||
import scipy.sparse
|
|
||||||
import scipy.io
|
import scipy.io
|
||||||
import cPickle as pickle
|
import cPickle as pickle
|
||||||
import urllib2 as url
|
import urllib as url
|
||||||
|
import zipfile
|
||||||
|
import tarfile
|
||||||
|
import datetime
|
||||||
|
|
||||||
|
|
||||||
|
import sys, urllib
|
||||||
|
def reporthook(a,b,c):
|
||||||
|
# ',' at the end of the line is important!
|
||||||
|
#print "% 3.1f%% of %d bytes\r" % (min(100, float(a * b) / c * 100), c),
|
||||||
|
#you can also use sys.stdout.write
|
||||||
|
sys.stdout.write("\r% 3.1f%% of %d bytes" % (min(100, float(a * b) / c * 100), c))
|
||||||
|
sys.stdout.flush()
|
||||||
|
|
||||||
|
# Global variables
|
||||||
data_path = os.path.join(os.path.dirname(__file__), 'datasets')
|
data_path = os.path.join(os.path.dirname(__file__), 'datasets')
|
||||||
default_seed = 10000
|
default_seed = 10000
|
||||||
neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/'
|
overide_manual_authorize=False
|
||||||
|
neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/'
|
||||||
|
cmu_url = 'http://mocap.cs.cmu.edu/subjects/'
|
||||||
|
# Note: there may be a better way of storing data resources. One of the pythonistas will need to take a look.
|
||||||
|
data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'],
|
||||||
|
'files' : [['ankurDataPoseSilhouette.mat']],
|
||||||
|
'license' : None,
|
||||||
|
'citation' : """3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.""",
|
||||||
|
'details' : """Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."""},
|
||||||
|
|
||||||
|
'boston_housing' : {'urls' : ['http://archive.ics.uci.edu/ml/machine-learning-databases/housing/'],
|
||||||
|
'files' : [['Index', 'housing.data', 'housing.names']],
|
||||||
|
'citation' : """Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.""",
|
||||||
|
'details' : """The Boston Housing data relates house values in Boston to a range of input variables.""",
|
||||||
|
'license' : None,
|
||||||
|
'size' : 51276
|
||||||
|
},
|
||||||
|
'brendan_faces' : {'urls' : ['http://www.cs.nyu.edu/~roweis/data/'],
|
||||||
|
'files': [['frey_rawface.mat']],
|
||||||
|
'citation' : 'Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.',
|
||||||
|
'details' : """A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.""",
|
||||||
|
'license': None,
|
||||||
|
'size' : 1100584},
|
||||||
|
'cmu_mocap_full' : {'urls' : ['http://mocap.cs.cmu.edu'],
|
||||||
|
'files' : [['allasfamc.zip']],
|
||||||
|
'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.
|
||||||
|
The database was created with funding from NSF EIA-0196217.""",
|
||||||
|
'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""",
|
||||||
|
'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.""",
|
||||||
|
'size' : None},
|
||||||
|
'creep_rupture' : {'urls' : ['http://www.msm.cam.ac.uk/map/data/tar/'],
|
||||||
|
'files' : [['creeprupt.tar']],
|
||||||
|
'citation' : 'Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.',
|
||||||
|
'details' : """Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.""",
|
||||||
|
'license' : None,
|
||||||
|
'size' : 602797},
|
||||||
|
'della_gatta' : {'urls' : [neil_url + 'della_gatta/'],
|
||||||
|
'files': [['DellaGattadata.mat']],
|
||||||
|
'citation' : 'Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008',
|
||||||
|
'details': "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.",
|
||||||
|
'license':None,
|
||||||
|
'size':3729650},
|
||||||
|
'epomeo_gpx' : {'urls' : [neil_url + 'epomeo_gpx/'],
|
||||||
|
'files': [['endomondo_1.gpx', 'endomondo_2.gpx', 'garmin_watch_via_endomondo.gpx','viewranger_phone.gpx','viewranger_tablet.gpx']],
|
||||||
|
'citation' : '',
|
||||||
|
'details': "Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).",
|
||||||
|
'license':None,
|
||||||
|
'size': 2031872},
|
||||||
|
'three_phase_oil_flow': {'urls' : [neil_url + 'three_phase_oil_flow/'],
|
||||||
|
'files' : [['DataTrnLbls.txt', 'DataTrn.txt', 'DataTst.txt', 'DataTstLbls.txt', 'DataVdn.txt', 'DataVdnLbls.txt']],
|
||||||
|
'citation' : 'Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593',
|
||||||
|
'details' : """The three phase oil data used initially for demonstrating the Generative Topographic mapping.""",
|
||||||
|
'license' : None,
|
||||||
|
'size' : 712796},
|
||||||
|
'rogers_girolami_data' : {'urls' : ['https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/'],
|
||||||
|
'files' : [['firstcoursemldata.tar.gz']],
|
||||||
|
'suffices' : [['?dl=1']],
|
||||||
|
'citation' : 'A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146',
|
||||||
|
'details' : """Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.""",
|
||||||
|
'license' : None,
|
||||||
|
'size' : 21949154},
|
||||||
|
'olympic_marathon_men' : {'urls' : [neil_url + 'olympic_marathon_men/'],
|
||||||
|
'files' : [['olympicMarathonTimes.csv']],
|
||||||
|
'citation' : None,
|
||||||
|
'details' : """Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data""",
|
||||||
|
'license': None,
|
||||||
|
'size' : 584},
|
||||||
|
'osu_run1' : {'urls': ['http://accad.osu.edu/research/mocap/data/', neil_url + 'stick/'],
|
||||||
|
'files': [['run1TXT.ZIP'],['connections.txt']],
|
||||||
|
'details' : "Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.",
|
||||||
|
'citation' : 'The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.',
|
||||||
|
'license' : 'Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).',
|
||||||
|
'size': 338103},
|
||||||
|
'osu_accad' : {'urls': ['http://accad.osu.edu/research/mocap/data/', neil_url + 'stick/'],
|
||||||
|
'files': [['swagger1TXT.ZIP','handspring1TXT.ZIP','quickwalkTXT.ZIP','run1TXT.ZIP','sprintTXT.ZIP','dogwalkTXT.ZIP','camper_04TXT.ZIP','dance_KB3_TXT.ZIP','per20_TXT.ZIP','perTWO07_TXT.ZIP','perTWO13_TXT.ZIP','perTWO14_TXT.ZIP','perTWO15_TXT.ZIP','perTWO16_TXT.ZIP'],['connections.txt']],
|
||||||
|
'details' : "Motion capture data of different motions from the Open Motion Data Project at Ohio State University.",
|
||||||
|
'citation' : 'The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.',
|
||||||
|
'license' : 'Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).',
|
||||||
|
'size': 15922790},
|
||||||
|
'pumadyn-32nm' : {'urls' : ['ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/'],
|
||||||
|
'files' : [['pumadyn-32nm.tar.gz']],
|
||||||
|
'details' : """Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.""",
|
||||||
|
'citation' : """Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.""",
|
||||||
|
'license' : """Data is made available by the Delve system at the University of Toronto""",
|
||||||
|
'size' : 5861646},
|
||||||
|
'robot_wireless' : {'urls' : [neil_url + 'robot_wireless/'],
|
||||||
|
'files' : [['uw-floor.txt']],
|
||||||
|
'citation' : """WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.""",
|
||||||
|
'details' : """Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.""",
|
||||||
|
'license' : None,
|
||||||
|
'size' : 284390},
|
||||||
|
'swiss_roll' : {'urls' : ['http://isomap.stanford.edu/'],
|
||||||
|
'files' : [['swiss_roll_data.mat']],
|
||||||
|
'details' : """Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""",
|
||||||
|
'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000',
|
||||||
|
'license' : None,
|
||||||
|
'size' : 800256},
|
||||||
|
'ripley_prnn_data' : {'urls' : ['http://www.stats.ox.ac.uk/pub/PRNN/'],
|
||||||
|
'files' : [['Cushings.dat', 'README', 'crabs.dat', 'fglass.dat', 'fglass.grp', 'pima.te', 'pima.tr', 'pima.tr2', 'synth.te', 'synth.tr', 'viruses.dat', 'virus3.dat']],
|
||||||
|
'details' : """Data sets from Brian Ripley's Pattern Recognition and Neural Networks""",
|
||||||
|
'citation': """Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7""",
|
||||||
|
'license' : None,
|
||||||
|
'size' : 93565},
|
||||||
|
'isomap_face_data' : {'urls' : [neil_url + 'isomap_face_data/'],
|
||||||
|
'files' : [['face_data.mat']],
|
||||||
|
'details' : """Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""",
|
||||||
|
'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000',
|
||||||
|
'license' : None,
|
||||||
|
'size' : 24229368},
|
||||||
|
}
|
||||||
|
|
||||||
def prompt_user():
|
def prompt_user():
|
||||||
|
"""Ask user for agreeing to data set licenses."""
|
||||||
# raw_input returns the empty string for "enter"
|
# raw_input returns the empty string for "enter"
|
||||||
yes = set(['yes', 'y'])
|
yes = set(['yes', 'y'])
|
||||||
no = set(['no','n'])
|
no = set(['no','n'])
|
||||||
|
|
@ -25,45 +146,163 @@ def prompt_user():
|
||||||
sys.stdout.write("Please respond with 'yes', 'y' or 'no', 'n'")
|
sys.stdout.write("Please respond with 'yes', 'y' or 'no', 'n'")
|
||||||
return prompt_user()
|
return prompt_user()
|
||||||
|
|
||||||
def download_data(dataset_name=None):
|
def data_available(dataset_name=None):
|
||||||
"""Helper function which contains the resource locations for each data set in one place"""
|
"""Check if the data set is available on the local machine already."""
|
||||||
|
for file_list in data_resources[dataset_name]['files']:
|
||||||
# Note: there may be a better way of doing this. One of the pythonistas will need to take a look. Neil
|
for file in file_list:
|
||||||
data_resources = {'oil': {'urls' : [neil_url + 'oil_data/'],
|
if not os.path.exists(os.path.join(data_path, dataset_name, file)):
|
||||||
'files' : [['DataTrnLbls.txt', 'DataTrn.txt']],
|
return False
|
||||||
'citation' : 'Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593',
|
return True
|
||||||
'details' : """The three phase oil data used initially for demonstrating the Generative Topographic mapping.""",
|
|
||||||
'agreement' : None},
|
def download_url(url, store_directory, save_name = None, messages = True, suffix=''):
|
||||||
'brendan_faces' : {'url' : ['http://www.cs.nyu.edu/~roweis/data/'],
|
"""Download a file from a url and save it to disk."""
|
||||||
'files': [['frey_rawface.mat']],
|
i = url.rfind('/')
|
||||||
'citation' : 'Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.',
|
file = url[i+1:]
|
||||||
'details' : """A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.""",
|
print file
|
||||||
'agreement': None}
|
dir_name = os.path.join(data_path, store_directory)
|
||||||
}
|
save_name = os.path.join(dir_name, file)
|
||||||
|
print "Downloading ", url, "->", os.path.join(store_directory, file)
|
||||||
|
if not os.path.exists(dir_name):
|
||||||
|
os.makedirs(dir_name)
|
||||||
|
urllib.urlretrieve(url+suffix, save_name, reporthook)
|
||||||
|
|
||||||
|
def authorize_download(dataset_name=None):
|
||||||
|
"""Check with the user that the are happy with terms and conditions for the data set."""
|
||||||
print('Acquiring resource: ' + dataset_name)
|
print('Acquiring resource: ' + dataset_name)
|
||||||
# TODO, check resource is in dictionary!
|
# TODO, check resource is in dictionary!
|
||||||
|
print('')
|
||||||
dr = data_resources[dataset_name]
|
dr = data_resources[dataset_name]
|
||||||
print('Details of data: ')
|
print('Details of data: ')
|
||||||
print(dr['details'])
|
print(dr['details'])
|
||||||
|
print('')
|
||||||
if dr['citation']:
|
if dr['citation']:
|
||||||
print('Please cite:')
|
print('Please cite:')
|
||||||
print(dr['citation'])
|
print(dr['citation'])
|
||||||
if dr['agreement']:
|
print('')
|
||||||
print('You must also agree to the following:')
|
if dr['size']:
|
||||||
print(dr['agreement'])
|
print('After downloading the data will take up ' + str(dr['size']) + ' bytes of space.')
|
||||||
print('Do you wish to proceed with the download? [yes/no]')
|
print('')
|
||||||
if prompt_user()==False:
|
print('Data will be stored in ' + os.path.join(data_path, dataset_name) + '.')
|
||||||
|
print('')
|
||||||
|
if overide_manual_authorize:
|
||||||
|
if dr['license']:
|
||||||
|
print('You have agreed to the following license:')
|
||||||
|
print(dr['license'])
|
||||||
|
print('')
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
if dr['license']:
|
||||||
|
print('You must also agree to the following license:')
|
||||||
|
print(dr['license'])
|
||||||
|
print('')
|
||||||
|
print('Do you wish to proceed with the download? [yes/no]')
|
||||||
|
return prompt_user()
|
||||||
|
|
||||||
|
def download_data(dataset_name=None):
|
||||||
|
"""Check with the user that the are happy with terms and conditions for the data set, then download it."""
|
||||||
|
|
||||||
|
dr = data_resources[dataset_name]
|
||||||
|
if not authorize_download(dataset_name):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
for url, files in zip(dr['urls'], dr['files']):
|
if dr.has_key('suffices'):
|
||||||
for file in files:
|
for url, files, suffices in zip(dr['urls'], dr['files'], dr['suffices']):
|
||||||
download_resource(url + file)
|
for file, suffix in zip(files, suffices):
|
||||||
|
download_url(os.path.join(url,file), dataset_name, dataset_name, suffix=suffix)
|
||||||
|
else:
|
||||||
|
for url, files in zip(dr['urls'], dr['files']):
|
||||||
|
for file in files:
|
||||||
|
download_url(os.path.join(url,file), dataset_name, dataset_name)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def data_details_return(data, data_set):
|
||||||
|
"""Update the data component of the data dictionary with details drawn from the data_resources."""
|
||||||
|
data.update(data_resources[data_set])
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
def cmu_urls_files(subj_motions, messages = True):
|
||||||
|
'''
|
||||||
|
Find which resources are missing on the local disk for the requested CMU motion capture motions.
|
||||||
|
'''
|
||||||
|
|
||||||
|
subjects_num = subj_motions[0]
|
||||||
|
motions_num = subj_motions[1]
|
||||||
|
|
||||||
|
resource = {'urls' : [], 'files' : []}
|
||||||
|
# Convert numbers to strings
|
||||||
|
subjects = []
|
||||||
|
motions = [list() for _ in range(len(subjects_num))]
|
||||||
|
for i in range(len(subjects_num)):
|
||||||
|
curSubj = str(int(subjects_num[i]))
|
||||||
|
if int(subjects_num[i]) < 10:
|
||||||
|
curSubj = '0' + curSubj
|
||||||
|
subjects.append(curSubj)
|
||||||
|
for j in range(len(motions_num[i])):
|
||||||
|
curMot = str(int(motions_num[i][j]))
|
||||||
|
if int(motions_num[i][j]) < 10:
|
||||||
|
curMot = '0' + curMot
|
||||||
|
motions[i].append(curMot)
|
||||||
|
|
||||||
|
all_skels = []
|
||||||
|
|
||||||
|
assert len(subjects) == len(motions)
|
||||||
|
|
||||||
|
all_motions = []
|
||||||
|
|
||||||
|
for i in range(len(subjects)):
|
||||||
|
skel_dir = os.path.join(data_path, 'cmu_mocap')
|
||||||
|
cur_skel_file = os.path.join(skel_dir, subjects[i] + '.asf')
|
||||||
|
|
||||||
|
url_required = False
|
||||||
|
file_download = []
|
||||||
|
if not os.path.exists(cur_skel_file):
|
||||||
|
# Current skel file doesn't exist.
|
||||||
|
if not os.path.isdir(skel_dir):
|
||||||
|
os.mkdir(skel_dir)
|
||||||
|
# Add skel file to list.
|
||||||
|
url_required = True
|
||||||
|
file_download.append(subjects[i] + '.asf')
|
||||||
|
for j in range(len(motions[i])):
|
||||||
|
file_name = subjects[i] + '_' + motions[i][j] + '.amc'
|
||||||
|
cur_motion_file = os.path.join(skel_dir, file_name)
|
||||||
|
if not os.path.exists(cur_motion_file):
|
||||||
|
url_required = True
|
||||||
|
file_download.append(subjects[i] + '_' + motions[i][j] + '.amc')
|
||||||
|
if url_required:
|
||||||
|
resource['urls'].append(cmu_url + subjects[i] + '/')
|
||||||
|
resource['files'].append(file_download)
|
||||||
|
return resource
|
||||||
|
|
||||||
|
try:
|
||||||
|
import gpxpy
|
||||||
|
import gpxpy.gpx
|
||||||
|
gpxpy_available = True
|
||||||
|
|
||||||
|
except ImportError:
|
||||||
|
gpxpy_available = False
|
||||||
|
|
||||||
|
if gpxpy_available:
|
||||||
|
def epomeo_gpx(data_set='epomeo_gpx', sample_every=4):
|
||||||
|
if not data_available(data_set):
|
||||||
|
download_data(data_set)
|
||||||
|
files = ['endomondo_1', 'endomondo_2', 'garmin_watch_via_endomondo','viewranger_phone', 'viewranger_tablet']
|
||||||
|
|
||||||
|
X = []
|
||||||
|
for file in files:
|
||||||
|
gpx_file = open(os.path.join(data_path, 'epomeo_gpx', file + '.gpx'), 'r')
|
||||||
|
|
||||||
|
gpx = gpxpy.parse(gpx_file)
|
||||||
|
segment = gpx.tracks[0].segments[0]
|
||||||
|
points = [point for track in gpx.tracks for segment in track.segments for point in segment.points]
|
||||||
|
data = [[(point.time-datetime.datetime(2013,8,21)).total_seconds(), point.latitude, point.longitude, point.elevation] for point in points]
|
||||||
|
X.append(np.asarray(data)[::sample_every, :])
|
||||||
|
gpx_file.close()
|
||||||
|
return data_details_return({'X' : X, 'info' : 'Data is an array containing time in seconds, latitude, longitude and elevation in that order.'}, data_set)
|
||||||
|
|
||||||
|
del gpxpy_available
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Some general utilities.
|
# Some general utilities.
|
||||||
def sample_class(f):
|
def sample_class(f):
|
||||||
|
|
@ -72,25 +311,25 @@ def sample_class(f):
|
||||||
c = np.where(c, 1, -1)
|
c = np.where(c, 1, -1)
|
||||||
return c
|
return c
|
||||||
|
|
||||||
def download_resource(resource, save_name = None, save_file = True, messages = True):
|
def boston_housing(data_set='boston_housing'):
|
||||||
if messages:
|
if not data_available(data_set):
|
||||||
print "Downloading resource: " , resource, " ... ",
|
download_data(data_set)
|
||||||
response = url.urlopen(resource)
|
all_data = np.genfromtxt(os.path.join(data_path, data_set, 'housing.data'))
|
||||||
# TODO: Some error checking...
|
X = all_data[:, 0:13]
|
||||||
# ...
|
Y = all_data[:, 13:14]
|
||||||
html = response.read()
|
return data_details_return({'X' : X, 'Y': Y}, data_set)
|
||||||
response.close()
|
|
||||||
if save_file:
|
|
||||||
# TODO: Check if already exists...
|
|
||||||
# ...
|
|
||||||
with open(save_name, "w") as text_file:
|
|
||||||
text_file.write("%s"%html)
|
|
||||||
if messages:
|
|
||||||
print "Done!"
|
|
||||||
return html
|
|
||||||
|
|
||||||
def della_gatta_TRP63_gene_expression(gene_number=None):
|
def brendan_faces(data_set='brendan_faces'):
|
||||||
mat_data = scipy.io.loadmat(os.path.join(data_path, 'DellaGattadata.mat'))
|
if not data_available(data_set):
|
||||||
|
download_data(data_set)
|
||||||
|
mat_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'frey_rawface.mat'))
|
||||||
|
Y = mat_data['ff'].T
|
||||||
|
return data_details_return({'Y': Y}, data_set)
|
||||||
|
|
||||||
|
def della_gatta_TRP63_gene_expression(data_set='della_gatta', gene_number=None):
|
||||||
|
if not data_available(data_set):
|
||||||
|
download_data(data_set)
|
||||||
|
mat_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'DellaGattadata.mat'))
|
||||||
X = np.double(mat_data['timepoints'])
|
X = np.double(mat_data['timepoints'])
|
||||||
if gene_number == None:
|
if gene_number == None:
|
||||||
Y = mat_data['exprs_tp53_RMA']
|
Y = mat_data['exprs_tp53_RMA']
|
||||||
|
|
@ -98,45 +337,62 @@ def della_gatta_TRP63_gene_expression(gene_number=None):
|
||||||
Y = mat_data['exprs_tp53_RMA'][:, gene_number]
|
Y = mat_data['exprs_tp53_RMA'][:, gene_number]
|
||||||
if len(Y.shape) == 1:
|
if len(Y.shape) == 1:
|
||||||
Y = Y[:, None]
|
Y = Y[:, None]
|
||||||
return {'X': X, 'Y': Y, 'info': "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA."}
|
return data_details_return({'X': X, 'Y': Y, 'gene_number' : gene_number}, data_set)
|
||||||
|
|
||||||
def simulation_BGPLVM():
|
|
||||||
mat_data = scipy.io.loadmat(os.path.join(data_path, 'BGPLVMSimulation.mat'))
|
|
||||||
Y = np.array(mat_data['Y'], dtype=float)
|
|
||||||
S = np.array(mat_data['initS'], dtype=float)
|
|
||||||
mu = np.array(mat_data['initMu'], dtype=float)
|
|
||||||
return {'Y': Y, 'S': S,
|
|
||||||
'mu' : mu,
|
|
||||||
'info': "Simulated test dataset generated in MATLAB to compare BGPLVM between python and MATLAB"}
|
|
||||||
|
|
||||||
|
|
||||||
# The data sets
|
# The data sets
|
||||||
def oil():
|
def oil(data_set='three_phase_oil_flow'):
|
||||||
#if download_data('oil'):
|
"""The three phase oil data from Bishop and James (1993)."""
|
||||||
oil_train_file = os.path.join(data_path, 'oil', 'DataTrn.txt')
|
if not data_available(data_set):
|
||||||
oil_trainlbls_file = os.path.join(data_path, 'oil', 'DataTrnLbls.txt')
|
download_data(data_set)
|
||||||
|
oil_train_file = os.path.join(data_path, data_set, 'DataTrn.txt')
|
||||||
|
oil_trainlbls_file = os.path.join(data_path, data_set, 'DataTrnLbls.txt')
|
||||||
|
oil_test_file = os.path.join(data_path, data_set, 'DataTst.txt')
|
||||||
|
oil_testlbls_file = os.path.join(data_path, data_set, 'DataTstLbls.txt')
|
||||||
|
oil_valid_file = os.path.join(data_path, data_set, 'DataVdn.txt')
|
||||||
|
oil_validlbls_file = os.path.join(data_path, data_set, 'DataVdnLbls.txt')
|
||||||
fid = open(oil_train_file)
|
fid = open(oil_train_file)
|
||||||
X = np.fromfile(fid, sep='\t').reshape((-1, 12))
|
X = np.fromfile(fid, sep='\t').reshape((-1, 12))
|
||||||
fid.close()
|
fid.close()
|
||||||
|
fid = open(oil_test_file)
|
||||||
|
Xtest = np.fromfile(fid, sep='\t').reshape((-1, 12))
|
||||||
|
fid.close()
|
||||||
|
fid = open(oil_valid_file)
|
||||||
|
Xvalid = np.fromfile(fid, sep='\t').reshape((-1, 12))
|
||||||
|
fid.close()
|
||||||
fid = open(oil_trainlbls_file)
|
fid = open(oil_trainlbls_file)
|
||||||
Y = np.fromfile(fid, sep='\t').reshape((-1, 3)) * 2. - 1.
|
Y = np.fromfile(fid, sep='\t').reshape((-1, 3)) * 2. - 1.
|
||||||
fid.close()
|
fid.close()
|
||||||
return {'X': X, 'Y': Y, 'info': "The oil data from Bishop and James (1993)."}
|
fid = open(oil_testlbls_file)
|
||||||
|
Ytest = np.fromfile(fid, sep='\t').reshape((-1, 3)) * 2. - 1.
|
||||||
|
fid.close()
|
||||||
|
fid = open(oil_validlbls_file)
|
||||||
|
Yvalid = np.fromfile(fid, sep='\t').reshape((-1, 3)) * 2. - 1.
|
||||||
|
fid.close()
|
||||||
|
return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'Xtest' : Xtest, 'Xvalid': Xvalid, 'Yvalid': Yvalid}, data_set)
|
||||||
#else:
|
#else:
|
||||||
# throw an error
|
# throw an error
|
||||||
|
|
||||||
def oil_100(seed=default_seed):
|
def oil_100(seed=default_seed, data_set = 'three_phase_oil_flow'):
|
||||||
np.random.seed(seed=seed)
|
np.random.seed(seed=seed)
|
||||||
data = oil()
|
data = oil()
|
||||||
indices = np.random.permutation(1000)
|
indices = np.random.permutation(1000)
|
||||||
indices = indices[0:100]
|
indices = indices[0:100]
|
||||||
X = data['X'][indices, :]
|
X = data['X'][indices, :]
|
||||||
Y = data['Y'][indices, :]
|
Y = data['Y'][indices, :]
|
||||||
return {'X': X, 'Y': Y, 'info': "Subsample of the oil data extracting 100 values randomly without replacement."}
|
return data_details_return({'X': X, 'Y': Y, 'info': "Subsample of the full oil data extracting 100 values randomly without replacement, here seed was " + str(seed)}, data_set)
|
||||||
|
|
||||||
def pumadyn(seed=default_seed):
|
def pumadyn(seed=default_seed, data_set='pumadyn-32nm'):
|
||||||
|
if not data_available(data_set):
|
||||||
|
download_data(data_set)
|
||||||
|
path = os.path.join(data_path, data_set)
|
||||||
|
tar = tarfile.open(os.path.join(path, 'pumadyn-32nm.tar.gz'))
|
||||||
|
print('Extracting file.')
|
||||||
|
tar.extractall(path=path)
|
||||||
|
tar.close()
|
||||||
# Data is variance 1, no need to normalize.
|
# Data is variance 1, no need to normalize.
|
||||||
data = np.loadtxt(os.path.join(data_path, 'pumadyn-32nm/Dataset.data.gz'))
|
data = np.loadtxt(os.path.join(data_path, data_set, 'pumadyn-32nm', 'Dataset.data.gz'))
|
||||||
indices = np.random.permutation(data.shape[0])
|
indices = np.random.permutation(data.shape[0])
|
||||||
indicesTrain = indices[0:7168]
|
indicesTrain = indices[0:7168]
|
||||||
indicesTest = indices[7168:-1]
|
indicesTest = indices[7168:-1]
|
||||||
|
|
@ -146,20 +402,54 @@ def pumadyn(seed=default_seed):
|
||||||
Y = data[indicesTrain, -1][:, None]
|
Y = data[indicesTrain, -1][:, None]
|
||||||
Xtest = data[indicesTest, 0:-2]
|
Xtest = data[indicesTest, 0:-2]
|
||||||
Ytest = data[indicesTest, -1][:, None]
|
Ytest = data[indicesTest, -1][:, None]
|
||||||
return {'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'info': "The puma robot arm data with 32 inputs. This data is the non linear case with medium noise (pumadyn-32nm). For training 7,168 examples are sampled without replacement."}
|
return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'seed': seed}, data_set)
|
||||||
|
|
||||||
|
def robot_wireless(data_set='robot_wireless'):
|
||||||
|
# WiFi access point strengths on a tour around UW Paul Allen building.
|
||||||
|
if not data_available(data_set):
|
||||||
|
download_data(data_set)
|
||||||
|
file_name = os.path.join(data_path, data_set, 'uw-floor.txt')
|
||||||
|
all_time = np.genfromtxt(file_name, usecols=(0))
|
||||||
|
macaddress = np.genfromtxt(file_name, usecols=(1), dtype='string')
|
||||||
|
x = np.genfromtxt(file_name, usecols=(2))
|
||||||
|
y = np.genfromtxt(file_name, usecols=(3))
|
||||||
|
strength = np.genfromtxt(file_name, usecols=(4))
|
||||||
|
addresses = np.unique(macaddress)
|
||||||
|
times = np.unique(all_time)
|
||||||
|
addresses.sort()
|
||||||
|
times.sort()
|
||||||
|
allY = np.zeros((len(times), len(addresses)))
|
||||||
|
allX = np.zeros((len(times), 2))
|
||||||
|
allY[:]=-92.
|
||||||
|
strengths={}
|
||||||
|
for address, j in zip(addresses, range(len(addresses))):
|
||||||
|
ind = np.nonzero(address==macaddress)
|
||||||
|
temp_strengths=strength[ind]
|
||||||
|
temp_x=x[ind]
|
||||||
|
temp_y=y[ind]
|
||||||
|
temp_times = all_time[ind]
|
||||||
|
for time in temp_times:
|
||||||
|
vals = time==temp_times
|
||||||
|
if any(vals):
|
||||||
|
ind2 = np.nonzero(vals)
|
||||||
|
i = np.nonzero(time==times)
|
||||||
|
allY[i, j] = temp_strengths[ind2]
|
||||||
|
allX[i, 0] = temp_x[ind2]
|
||||||
|
allX[i, 1] = temp_y[ind2]
|
||||||
|
allY = (allY + 85.)/15.
|
||||||
|
|
||||||
def brendan_faces():
|
X = allX[0:215, :]
|
||||||
mat_data = scipy.io.loadmat(os.path.join(data_path, 'frey_rawface.mat'))
|
Y = allY[0:215, :]
|
||||||
Y = mat_data['ff'].T
|
|
||||||
return {'Y': Y, 'info': "Face data made available by Brendan Frey"}
|
|
||||||
|
|
||||||
|
Xtest = allX[215:, :]
|
||||||
|
Ytest = allY[215:, :]
|
||||||
|
return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'addresses' : addresses, 'times' : times}, data_set)
|
||||||
|
|
||||||
|
def silhouette(data_set='ankur_pose_data'):
|
||||||
|
|
||||||
def silhouette():
|
|
||||||
# Ankur Agarwal and Bill Trigg's silhoutte data.
|
# Ankur Agarwal and Bill Trigg's silhoutte data.
|
||||||
mat_data = scipy.io.loadmat(os.path.join(data_path, 'mocap', 'ankur', 'ankurDataPoseSilhouette.mat'))
|
if not data_available(data_set):
|
||||||
|
download_data(data_set)
|
||||||
|
mat_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'ankurDataPoseSilhouette.mat'))
|
||||||
inMean = np.mean(mat_data['Y'])
|
inMean = np.mean(mat_data['Y'])
|
||||||
inScales = np.sqrt(np.var(mat_data['Y']))
|
inScales = np.sqrt(np.var(mat_data['Y']))
|
||||||
X = mat_data['Y'] - inMean
|
X = mat_data['Y'] - inMean
|
||||||
|
|
@ -168,22 +458,35 @@ def silhouette():
|
||||||
Xtest = Xtest / inScales
|
Xtest = Xtest / inScales
|
||||||
Y = mat_data['Z']
|
Y = mat_data['Z']
|
||||||
Ytest = mat_data['Z_test']
|
Ytest = mat_data['Z_test']
|
||||||
return {'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'info': "Artificial silhouette simulation data developed from Agarwal and Triggs (2004)."}
|
return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest}, data_set)
|
||||||
|
|
||||||
def stick():
|
def ripley_synth(data_set='ripley_prnn_data'):
|
||||||
#if download_data('stick'):
|
if not data_available(data_set):
|
||||||
Y, connect = GPy.util.mocap.load_text_data('run1', data_path)
|
download_data(data_set)
|
||||||
Y = Y[0:-1:4, :]
|
train = np.genfromtxt(os.path.join(data_path, data_set, 'synth.tr'), skip_header=1)
|
||||||
lbls = 'connect'
|
X = train[:, 0:2]
|
||||||
return {'Y': Y, 'connect' : connect, 'info': "Stick man data from Ohio."}
|
y = train[:, 2:3]
|
||||||
# else:
|
test = np.genfromtxt(os.path.join(data_path, data_set, 'synth.te'), skip_header=1)
|
||||||
# throw an error.
|
Xtest = test[:, 0:2]
|
||||||
|
ytest = test[:, 2:3]
|
||||||
|
return data_details_return({'X': X, 'y': y, 'Xtest': Xtest, 'ytest': ytest, 'info': 'Synthetic data generated by Ripley for a two class classification problem.'}, data_set)
|
||||||
|
|
||||||
def swiss_roll_generated(N=1000, sigma=0.0):
|
def osu_run1(data_set='osu_run1', sample_every=4):
|
||||||
|
if not data_available(data_set):
|
||||||
|
download_data(data_set)
|
||||||
|
zip = zipfile.ZipFile(os.path.join(data_path, data_set, 'sprintTXT.ZIP'), 'r')
|
||||||
|
path = os.path.join(data_path, data_set)
|
||||||
|
for name in zip.namelist():
|
||||||
|
zip.extract(name, path)
|
||||||
|
Y, connect = GPy.util.mocap.load_text_data('Aug210107', path)
|
||||||
|
Y = Y[0:-1:sample_every, :]
|
||||||
|
return data_details_return({'Y': Y, 'connect' : connect}, data_set)
|
||||||
|
|
||||||
|
def swiss_roll_generated(num_samples=1000, sigma=0.0):
|
||||||
with open(os.path.join(data_path, 'swiss_roll.pickle')) as f:
|
with open(os.path.join(data_path, 'swiss_roll.pickle')) as f:
|
||||||
data = pickle.load(f)
|
data = pickle.load(f)
|
||||||
Na = data['Y'].shape[0]
|
Na = data['Y'].shape[0]
|
||||||
perm = np.random.permutation(np.r_[:Na])[:N]
|
perm = np.random.permutation(np.r_[:Na])[:num_samples]
|
||||||
Y = data['Y'][perm, :]
|
Y = data['Y'][perm, :]
|
||||||
t = data['t'][perm]
|
t = data['t'][perm]
|
||||||
c = data['colors'][perm, :]
|
c = data['colors'][perm, :]
|
||||||
|
|
@ -194,27 +497,49 @@ def swiss_roll_generated(N=1000, sigma=0.0):
|
||||||
return {'Y':Y, 't':t, 'colors':c}
|
return {'Y':Y, 't':t, 'colors':c}
|
||||||
|
|
||||||
def swiss_roll_1000():
|
def swiss_roll_1000():
|
||||||
mat_data = scipy.io.loadmat(os.path.join(data_path, 'swiss_roll_data'))
|
return swiss_roll(num_samples=1000)
|
||||||
Y = mat_data['X_data'][:, 0:1000].transpose()
|
|
||||||
return {'Y': Y, 'info': "Subsample of the swiss roll data extracting only the first 1000 values."}
|
|
||||||
|
|
||||||
def swiss_roll(N=3000):
|
def swiss_roll(num_samples=3000, data_set='swiss_roll'):
|
||||||
mat_data = scipy.io.loadmat(os.path.join(data_path, 'swiss_roll_data.mat'))
|
if not data_available(data_set):
|
||||||
Y = mat_data['X_data'][:, 0:N].transpose()
|
download_data(data_set)
|
||||||
return {'Y': Y, 'X': mat_data['X_data'], 'info': "The first 3,000 points from the swiss roll data of Tennenbaum, de Silva and Langford (2001)."}
|
mat_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'swiss_roll_data.mat'))
|
||||||
|
Y = mat_data['X_data'][:, 0:num_samples].transpose()
|
||||||
|
return data_details_return({'Y': Y, 'X': mat_data['X_data'], 'info': "The first " + str(num_samples) + " points from the swiss roll data of Tennenbaum, de Silva and Langford (2001)."}, data_set)
|
||||||
|
|
||||||
def toy_rbf_1d(seed=default_seed):
|
def isomap_faces(num_samples=698, data_set='isomap_face_data'):
|
||||||
|
if not data_available(data_set):
|
||||||
|
download_data(data_set)
|
||||||
|
mat_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'face_data.mat'))
|
||||||
|
Y = mat_data['images'][:, 0:num_samples].transpose()
|
||||||
|
return data_details_return({'Y': Y, 'poses' : mat_data['poses'], 'lights': mat_data['lights'], 'info': "The first " + str(num_samples) + " points from the face data of Tennenbaum, de Silva and Langford (2001)."}, data_set)
|
||||||
|
|
||||||
|
def simulation_BGPLVM():
|
||||||
|
mat_data = scipy.io.loadmat(os.path.join(data_path, 'BGPLVMSimulation.mat'))
|
||||||
|
Y = np.array(mat_data['Y'], dtype=float)
|
||||||
|
S = np.array(mat_data['initS'], dtype=float)
|
||||||
|
mu = np.array(mat_data['initMu'], dtype=float)
|
||||||
|
return data_details_return({'S': S, 'Y': Y, 'mu': mu}, data_set)
|
||||||
|
return {'Y': Y, 'S': S,
|
||||||
|
'mu' : mu,
|
||||||
|
'info': "Simulated test dataset generated in MATLAB to compare BGPLVM between python and MATLAB"}
|
||||||
|
|
||||||
|
def toy_rbf_1d(seed=default_seed, num_samples=500):
|
||||||
|
"""Samples values of a function from an RBF covariance with very small noise for inputs uniformly distributed between -1 and 1.
|
||||||
|
:param seed: seed to use for random sampling.
|
||||||
|
:type seed: int
|
||||||
|
:param num_samples: number of samples to sample in the function (default 500).
|
||||||
|
:type num_samples: int
|
||||||
|
"""
|
||||||
np.random.seed(seed=seed)
|
np.random.seed(seed=seed)
|
||||||
numIn = 1
|
num_in = 1
|
||||||
N = 500
|
X = np.random.uniform(low= -1.0, high=1.0, size=(num_samples, num_in))
|
||||||
X = np.random.uniform(low= -1.0, high=1.0, size=(N, numIn))
|
|
||||||
X.sort(axis=0)
|
X.sort(axis=0)
|
||||||
rbf = GPy.kern.rbf(numIn, variance=1., lengthscale=np.array((0.25,)))
|
rbf = GPy.kern.rbf(num_in, variance=1., lengthscale=np.array((0.25,)))
|
||||||
white = GPy.kern.white(numIn, variance=1e-2)
|
white = GPy.kern.white(num_in, variance=1e-2)
|
||||||
kernel = rbf + white
|
kernel = rbf + white
|
||||||
K = kernel.K(X)
|
K = kernel.K(X)
|
||||||
y = np.reshape(np.random.multivariate_normal(np.zeros(N), K), (N, 1))
|
y = np.reshape(np.random.multivariate_normal(np.zeros(num_samples), K), (num_samples, 1))
|
||||||
return {'X':X, 'Y':y, 'info': "Samples 500 values of a function from an RBF covariance with very small noise for inputs uniformly distributed between -1 and 1."}
|
return {'X':X, 'Y':y, 'info': "Sampled " + str(num_samples) + " values of a function from an RBF covariance with very small noise for inputs uniformly distributed between -1 and 1."}
|
||||||
|
|
||||||
def toy_rbf_1d_50(seed=default_seed):
|
def toy_rbf_1d_50(seed=default_seed):
|
||||||
np.random.seed(seed=seed)
|
np.random.seed(seed=seed)
|
||||||
|
|
@ -224,7 +549,7 @@ def toy_rbf_1d_50(seed=default_seed):
|
||||||
indices.sort(axis=0)
|
indices.sort(axis=0)
|
||||||
X = data['X'][indices, :]
|
X = data['X'][indices, :]
|
||||||
Y = data['Y'][indices, :]
|
Y = data['Y'][indices, :]
|
||||||
return {'X': X, 'Y': Y, 'info': "Subsamples the toy_rbf_sample with 50 values randomly taken from the original sample."}
|
return {'X': X, 'Y': Y, 'info': "Subsamples the toy_rbf_sample with 50 values randomly taken from the original sample.", 'seed' : seed}
|
||||||
|
|
||||||
|
|
||||||
def toy_linear_1d_classification(seed=default_seed):
|
def toy_linear_1d_classification(seed=default_seed):
|
||||||
|
|
@ -232,13 +557,46 @@ def toy_linear_1d_classification(seed=default_seed):
|
||||||
x1 = np.random.normal(-3, 5, 20)
|
x1 = np.random.normal(-3, 5, 20)
|
||||||
x2 = np.random.normal(3, 5, 20)
|
x2 = np.random.normal(3, 5, 20)
|
||||||
X = (np.r_[x1, x2])[:, None]
|
X = (np.r_[x1, x2])[:, None]
|
||||||
return {'X': X, 'Y': sample_class(2.*X), 'F': 2.*X}
|
return {'X': X, 'Y': sample_class(2.*X), 'F': 2.*X, 'seed' : seed}
|
||||||
|
|
||||||
|
def olympic_100m_men(data_set='rogers_girolami_data'):
|
||||||
|
if not data_available(data_set):
|
||||||
|
download_data(data_set)
|
||||||
|
path = os.path.join(data_path, data_set)
|
||||||
|
tar_file = os.path.join(path, 'firstcoursemldata.tar.gz')
|
||||||
|
tar = tarfile.open(tar_file)
|
||||||
|
print('Extracting file.')
|
||||||
|
tar.extractall(path=path)
|
||||||
|
tar.close()
|
||||||
|
olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['male100']
|
||||||
|
|
||||||
def rogers_girolami_olympics():
|
|
||||||
olympic_data = scipy.io.loadmat(os.path.join(data_path, 'olympics.mat'))['male100']
|
|
||||||
X = olympic_data[:, 0][:, None]
|
X = olympic_data[:, 0][:, None]
|
||||||
Y = olympic_data[:, 1][:, None]
|
Y = olympic_data[:, 1][:, None]
|
||||||
return {'X': X, 'Y': Y, 'info': "Olympic sprint times for 100 m men from 1896 until 2008. Example is from Rogers and Girolami's First Course in Machine Learning."}
|
return data_details_return({'X': X, 'Y': Y, 'info': "Olympic sprint times for 100 m men from 1896 until 2008. Example is from Rogers and Girolami's First Course in Machine Learning."}, data_set)
|
||||||
|
|
||||||
|
def olympic_100m_women(data_set='rogers_girolami_data'):
|
||||||
|
if not data_available(data_set):
|
||||||
|
download_data(data_set)
|
||||||
|
path = os.path.join(data_path, data_set)
|
||||||
|
tar_file = os.path.join(path, 'firstcoursemldata.tar.gz')
|
||||||
|
tar = tarfile.open(tar_file)
|
||||||
|
print('Extracting file.')
|
||||||
|
tar.extractall(path=path)
|
||||||
|
tar.close()
|
||||||
|
olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['female100']
|
||||||
|
|
||||||
|
X = olympic_data[:, 0][:, None]
|
||||||
|
Y = olympic_data[:, 1][:, None]
|
||||||
|
return data_details_return({'X': X, 'Y': Y, 'info': "Olympic sprint times for 100 m women from 1896 until 2008. Example is from Rogers and Girolami's First Course in Machine Learning."}, data_set)
|
||||||
|
|
||||||
|
def olympic_marathon_men(data_set='olympic_marathon_men'):
|
||||||
|
if not data_available(data_set):
|
||||||
|
download_data(data_set)
|
||||||
|
olympics = np.genfromtxt(os.path.join(data_path, data_set, 'olympicMarathonTimes.csv'), delimiter=',')
|
||||||
|
X = olympics[:, 0:1]
|
||||||
|
Y = olympics[:, 1:2]
|
||||||
|
return data_details_return({'X': X, 'Y': Y}, data_set)
|
||||||
|
|
||||||
# def movielens_small(partNo=1,seed=default_seed):
|
# def movielens_small(partNo=1,seed=default_seed):
|
||||||
# np.random.seed(seed=seed)
|
# np.random.seed(seed=seed)
|
||||||
|
|
||||||
|
|
@ -272,8 +630,6 @@ def rogers_girolami_olympics():
|
||||||
# return {'Y':Y, 'lbls':lbls, 'Ytest':Ytest, 'lblstest':lblstest}
|
# return {'Y':Y, 'lbls':lbls, 'Ytest':Ytest, 'lblstest':lblstest}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def crescent_data(num_data=200, seed=default_seed):
|
def crescent_data(num_data=200, seed=default_seed):
|
||||||
"""Data set formed from a mixture of four Gaussians. In each class two of the Gaussians are elongated at right angles to each other and offset to form an approximation to the crescent data that is popular in semi-supervised learning as a toy problem.
|
"""Data set formed from a mixture of four Gaussians. In each class two of the Gaussians are elongated at right angles to each other and offset to form an approximation to the crescent data that is popular in semi-supervised learning as a toy problem.
|
||||||
:param num_data_part: number of data to be sampled (default is 200).
|
:param num_data_part: number of data to be sampled (default is 200).
|
||||||
|
|
@ -302,24 +658,31 @@ def crescent_data(num_data=200, seed=default_seed):
|
||||||
for i in range(0, 4):
|
for i in range(0, 4):
|
||||||
num_data_part.append(round(((i + 1) * num_data) / 4.))
|
num_data_part.append(round(((i + 1) * num_data) / 4.))
|
||||||
num_data_part[i] -= num_data_total
|
num_data_part[i] -= num_data_total
|
||||||
# print num_data_part[i]
|
|
||||||
part = np.random.normal(size=(num_data_part[i], 2))
|
part = np.random.normal(size=(num_data_part[i], 2))
|
||||||
part = np.dot(np.dot(part, scales[i]), R) + means[i]
|
part = np.dot(np.dot(part, scales[i]), R) + means[i]
|
||||||
Xparts.append(part)
|
Xparts.append(part)
|
||||||
num_data_total += num_data_part[i]
|
num_data_total += num_data_part[i]
|
||||||
X = np.vstack((Xparts[0], Xparts[1], Xparts[2], Xparts[3]))
|
X = np.vstack((Xparts[0], Xparts[1], Xparts[2], Xparts[3]))
|
||||||
|
|
||||||
|
|
||||||
Y = np.vstack((np.ones((num_data_part[0] + num_data_part[1], 1)), -np.ones((num_data_part[2] + num_data_part[3], 1))))
|
Y = np.vstack((np.ones((num_data_part[0] + num_data_part[1], 1)), -np.ones((num_data_part[2] + num_data_part[3], 1))))
|
||||||
return {'X':X, 'Y':Y, 'info': "Two separate classes of data formed approximately in the shape of two crescents."}
|
return {'X':X, 'Y':Y, 'info': "Two separate classes of data formed approximately in the shape of two crescents."}
|
||||||
|
|
||||||
def creep_data():
|
def creep_data(data_set='creep_rupture'):
|
||||||
all_data = np.loadtxt(os.path.join(data_path, 'creep', 'taka'))
|
"""Brun and Yoshida's metal creep rupture data."""
|
||||||
|
if not data_available(data_set):
|
||||||
|
download_data(data_set)
|
||||||
|
path = os.path.join(data_path, data_set)
|
||||||
|
tar_file = os.path.join(path, 'creeprupt.tar')
|
||||||
|
tar = tarfile.open(tar_file)
|
||||||
|
print('Extracting file.')
|
||||||
|
tar.extractall(path=path)
|
||||||
|
tar.close()
|
||||||
|
all_data = np.loadtxt(os.path.join(data_path, data_set, 'taka'))
|
||||||
y = all_data[:, 1:2].copy()
|
y = all_data[:, 1:2].copy()
|
||||||
features = [0]
|
features = [0]
|
||||||
features.extend(range(2, 31))
|
features.extend(range(2, 31))
|
||||||
X = all_data[:, features].copy()
|
X = all_data[:, features].copy()
|
||||||
return {'X': X, 'y' : y}
|
return data_details_return({'X': X, 'y': y}, data_set)
|
||||||
|
|
||||||
def cmu_mocap_49_balance():
|
def cmu_mocap_49_balance():
|
||||||
"""Load CMU subject 49's one legged balancing motion that was used by Alvarez, Luengo and Lawrence at AISTATS 2009."""
|
"""Load CMU subject 49's one legged balancing motion that was used by Alvarez, Luengo and Lawrence at AISTATS 2009."""
|
||||||
|
|
@ -341,14 +704,19 @@ def cmu_mocap_35_walk_jog():
|
||||||
data['info'] = "Walk and jog data from CMU data base subject 35. As used in Tayor, Roweis and Hinton at NIPS 2007, but without their pre-processing (i.e. as used by Lawrence at AISTATS 2007). It consists of " + data['info']
|
data['info'] = "Walk and jog data from CMU data base subject 35. As used in Tayor, Roweis and Hinton at NIPS 2007, but without their pre-processing (i.e. as used by Lawrence at AISTATS 2007). It consists of " + data['info']
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def cmu_mocap(subject, train_motions, test_motions=[], sample_every=4):
|
def cmu_mocap(subject, train_motions, test_motions=[], sample_every=4, data_set='cmu_mocap'):
|
||||||
"""Load a given subject's training and test motions from the CMU motion capture data."""
|
"""Load a given subject's training and test motions from the CMU motion capture data."""
|
||||||
|
|
||||||
# Load in subject skeleton.
|
# Load in subject skeleton.
|
||||||
subject_dir = os.path.join(data_path, 'mocap', 'cmu', subject)
|
subject_dir = os.path.join(data_path, data_set)
|
||||||
|
|
||||||
# Make sure the data is downloaded.
|
# Make sure the data is downloaded.
|
||||||
mocap.fetch_cmu(([subject], [train_motions]), skel_store_dir=subject_dir,motion_store_dir=subject_dir)
|
all_motions = train_motions + test_motions
|
||||||
|
resource = cmu_urls_files(([subject], [all_motions]))
|
||||||
|
data_resources[data_set] = data_resources['cmu_mocap_full']
|
||||||
|
data_resources[data_set]['files'] = resource['files']
|
||||||
|
data_resources[data_set]['urls'] = resource['urls']
|
||||||
|
if resource['urls']:
|
||||||
|
download_data(data_set)
|
||||||
|
|
||||||
skel = GPy.util.mocap.acclaim_skeleton(os.path.join(subject_dir, subject + '.asf'))
|
skel = GPy.util.mocap.acclaim_skeleton(os.path.join(subject_dir, subject + '.asf'))
|
||||||
|
|
||||||
|
|
@ -413,4 +781,4 @@ def cmu_mocap(subject, train_motions, test_motions=[], sample_every=4):
|
||||||
info += '.'
|
info += '.'
|
||||||
if sample_every != 1:
|
if sample_every != 1:
|
||||||
info += ' Data is sub-sampled to every ' + str(sample_every) + ' frames.'
|
info += ' Data is sub-sampled to every ' + str(sample_every) + ' frames.'
|
||||||
return {'Y': Y, 'lbls' : lbls, 'Ytest': Ytest, 'lblstest' : lblstest, 'info': info, 'skel': skel}
|
return data_details_return({'Y': Y, 'lbls' : lbls, 'Ytest': Ytest, 'lblstest' : lblstest, 'info': info, 'skel': skel}, data_set)
|
||||||
|
|
|
||||||
Binary file not shown.
22
GPy/util/datasets/connections.txt
Normal file
22
GPy/util/datasets/connections.txt
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
LFHD, RFHD
|
||||||
|
RFHD, RBHD
|
||||||
|
RBHD, LBHD
|
||||||
|
LBHD, LFHD
|
||||||
|
LELB, LWRB
|
||||||
|
LWRB, LFIN
|
||||||
|
LELB, LSHO
|
||||||
|
LSHO, RSHO
|
||||||
|
RSHO, STRN
|
||||||
|
LSHO, STRN
|
||||||
|
RSHO, RELB
|
||||||
|
RELB, RWRB
|
||||||
|
RWRB, RFIN
|
||||||
|
LSHO, LFWT
|
||||||
|
RSHO, RFWT
|
||||||
|
LFWT, RFWT
|
||||||
|
LFWT, LKNE
|
||||||
|
RFWT, RKNE
|
||||||
|
LKNE, LHEE
|
||||||
|
RKNE, RHEE
|
||||||
|
RMT5, RHEE
|
||||||
|
LMT5, LHEE
|
||||||
|
|
@ -1,201 +0,0 @@
|
||||||
sp sex index FL RW CL CW BD
|
|
||||||
B M 1 8.1 6.7 16.1 19.0 7.0
|
|
||||||
B M 2 8.8 7.7 18.1 20.8 7.4
|
|
||||||
B M 3 9.2 7.8 19.0 22.4 7.7
|
|
||||||
B M 4 9.6 7.9 20.1 23.1 8.2
|
|
||||||
B M 5 9.8 8.0 20.3 23.0 8.2
|
|
||||||
B M 6 10.8 9.0 23.0 26.5 9.8
|
|
||||||
B M 7 11.1 9.9 23.8 27.1 9.8
|
|
||||||
B M 8 11.6 9.1 24.5 28.4 10.4
|
|
||||||
B M 9 11.8 9.6 24.2 27.8 9.7
|
|
||||||
B M 10 11.8 10.5 25.2 29.3 10.3
|
|
||||||
B M 11 12.2 10.8 27.3 31.6 10.9
|
|
||||||
B M 12 12.3 11.0 26.8 31.5 11.4
|
|
||||||
B M 13 12.6 10.0 27.7 31.7 11.4
|
|
||||||
B M 14 12.8 10.2 27.2 31.8 10.9
|
|
||||||
B M 15 12.8 10.9 27.4 31.5 11.0
|
|
||||||
B M 16 12.9 11.0 26.8 30.9 11.4
|
|
||||||
B M 17 13.1 10.6 28.2 32.3 11.0
|
|
||||||
B M 18 13.1 10.9 28.3 32.4 11.2
|
|
||||||
B M 19 13.3 11.1 27.8 32.3 11.3
|
|
||||||
B M 20 13.9 11.1 29.2 33.3 12.1
|
|
||||||
B M 21 14.3 11.6 31.3 35.5 12.7
|
|
||||||
B M 22 14.6 11.3 31.9 36.4 13.7
|
|
||||||
B M 23 15.0 10.9 31.4 36.4 13.2
|
|
||||||
B M 24 15.0 11.5 32.4 37.0 13.4
|
|
||||||
B M 25 15.0 11.9 32.5 37.2 13.6
|
|
||||||
B M 26 15.2 12.1 32.3 36.7 13.6
|
|
||||||
B M 27 15.4 11.8 33.0 37.5 13.6
|
|
||||||
B M 28 15.7 12.6 35.8 40.3 14.5
|
|
||||||
B M 29 15.9 12.7 34.0 38.9 14.2
|
|
||||||
B M 30 16.1 11.6 33.8 39.0 14.4
|
|
||||||
B M 31 16.1 12.8 34.9 40.7 15.7
|
|
||||||
B M 32 16.2 13.3 36.0 41.7 15.4
|
|
||||||
B M 33 16.3 12.7 35.6 40.9 14.9
|
|
||||||
B M 34 16.4 13.0 35.7 41.8 15.2
|
|
||||||
B M 35 16.6 13.5 38.1 43.4 14.9
|
|
||||||
B M 36 16.8 12.8 36.2 41.8 14.9
|
|
||||||
B M 37 16.9 13.2 37.3 42.7 15.6
|
|
||||||
B M 38 17.1 12.6 36.4 42.0 15.1
|
|
||||||
B M 39 17.1 12.7 36.7 41.9 15.6
|
|
||||||
B M 40 17.2 13.5 37.6 43.9 16.1
|
|
||||||
B M 41 17.7 13.6 38.7 44.5 16.0
|
|
||||||
B M 42 17.9 14.1 39.7 44.6 16.8
|
|
||||||
B M 43 18.0 13.7 39.2 44.4 16.2
|
|
||||||
B M 44 18.8 15.8 42.1 49.0 17.8
|
|
||||||
B M 45 19.3 13.5 41.6 47.4 17.8
|
|
||||||
B M 46 19.3 13.8 40.9 46.5 16.8
|
|
||||||
B M 47 19.7 15.3 41.9 48.5 17.8
|
|
||||||
B M 48 19.8 14.2 43.2 49.7 18.6
|
|
||||||
B M 49 19.8 14.3 42.4 48.9 18.3
|
|
||||||
B M 50 21.3 15.7 47.1 54.6 20.0
|
|
||||||
B F 1 7.2 6.5 14.7 17.1 6.1
|
|
||||||
B F 2 9.0 8.5 19.3 22.7 7.7
|
|
||||||
B F 3 9.1 8.1 18.5 21.6 7.7
|
|
||||||
B F 4 9.1 8.2 19.2 22.2 7.7
|
|
||||||
B F 5 9.5 8.2 19.6 22.4 7.8
|
|
||||||
B F 6 9.8 8.9 20.4 23.9 8.8
|
|
||||||
B F 7 10.1 9.3 20.9 24.4 8.4
|
|
||||||
B F 8 10.3 9.5 21.3 24.7 8.9
|
|
||||||
B F 9 10.4 9.7 21.7 25.4 8.3
|
|
||||||
B F 10 10.8 9.5 22.5 26.3 9.1
|
|
||||||
B F 11 11.0 9.8 22.5 25.7 8.2
|
|
||||||
B F 12 11.2 10.0 22.8 26.9 9.4
|
|
||||||
B F 13 11.5 11.0 24.7 29.2 10.1
|
|
||||||
B F 14 11.6 11.0 24.6 28.5 10.4
|
|
||||||
B F 15 11.6 11.4 23.7 27.7 10.0
|
|
||||||
B F 16 11.7 10.6 24.9 28.5 10.4
|
|
||||||
B F 17 11.9 11.4 26.0 30.1 10.9
|
|
||||||
B F 18 12.0 10.7 24.6 28.9 10.5
|
|
||||||
B F 19 12.0 11.1 25.4 29.2 11.0
|
|
||||||
B F 20 12.6 12.2 26.1 31.6 11.2
|
|
||||||
B F 21 12.8 11.7 27.1 31.2 11.9
|
|
||||||
B F 22 12.8 12.2 26.7 31.1 11.1
|
|
||||||
B F 23 12.8 12.2 27.9 31.9 11.5
|
|
||||||
B F 24 13.0 11.4 27.3 31.8 11.3
|
|
||||||
B F 25 13.1 11.5 27.6 32.6 11.1
|
|
||||||
B F 26 13.2 12.2 27.9 32.1 11.5
|
|
||||||
B F 27 13.4 11.8 28.4 32.7 11.7
|
|
||||||
B F 28 13.7 12.5 28.6 33.8 11.9
|
|
||||||
B F 29 13.9 13.0 30.0 34.9 13.1
|
|
||||||
B F 30 14.7 12.5 30.1 34.7 12.5
|
|
||||||
B F 31 14.9 13.2 30.1 35.6 12.0
|
|
||||||
B F 32 15.0 13.8 31.7 36.9 14.0
|
|
||||||
B F 33 15.0 14.2 32.8 37.4 14.0
|
|
||||||
B F 34 15.1 13.3 31.8 36.3 13.5
|
|
||||||
B F 35 15.1 13.5 31.9 37.0 13.8
|
|
||||||
B F 36 15.1 13.8 31.7 36.6 13.0
|
|
||||||
B F 37 15.2 14.3 33.9 38.5 14.7
|
|
||||||
B F 38 15.3 14.2 32.6 38.3 13.8
|
|
||||||
B F 39 15.4 13.3 32.4 37.6 13.8
|
|
||||||
B F 40 15.5 13.8 33.4 38.7 14.7
|
|
||||||
B F 41 15.6 13.9 32.8 37.9 13.4
|
|
||||||
B F 42 15.6 14.7 33.9 39.5 14.3
|
|
||||||
B F 43 15.7 13.9 33.6 38.5 14.1
|
|
||||||
B F 44 15.8 15.0 34.5 40.3 15.3
|
|
||||||
B F 45 16.2 15.2 34.5 40.1 13.9
|
|
||||||
B F 46 16.4 14.0 34.2 39.8 15.2
|
|
||||||
B F 47 16.7 16.1 36.6 41.9 15.4
|
|
||||||
B F 48 17.4 16.9 38.2 44.1 16.6
|
|
||||||
B F 49 17.5 16.7 38.6 44.5 17.0
|
|
||||||
B F 50 19.2 16.5 40.9 47.9 18.1
|
|
||||||
O M 1 9.1 6.9 16.7 18.6 7.4
|
|
||||||
O M 2 10.2 8.2 20.2 22.2 9.0
|
|
||||||
O M 3 10.7 8.6 20.7 22.7 9.2
|
|
||||||
O M 4 11.4 9.0 22.7 24.8 10.1
|
|
||||||
O M 5 12.5 9.4 23.2 26.0 10.8
|
|
||||||
O M 6 12.5 9.4 24.2 27.0 11.2
|
|
||||||
O M 7 12.7 10.4 26.0 28.8 12.1
|
|
||||||
O M 8 13.2 11.0 27.1 30.4 12.2
|
|
||||||
O M 9 13.4 10.1 26.6 29.6 12.0
|
|
||||||
O M 10 13.7 11.0 27.5 30.5 12.2
|
|
||||||
O M 11 14.0 11.5 29.2 32.2 13.1
|
|
||||||
O M 12 14.1 10.4 28.9 31.8 13.5
|
|
||||||
O M 13 14.1 10.5 29.1 31.6 13.1
|
|
||||||
O M 14 14.1 10.7 28.7 31.9 13.3
|
|
||||||
O M 15 14.2 10.6 28.7 31.7 12.9
|
|
||||||
O M 16 14.2 10.7 27.8 30.9 12.7
|
|
||||||
O M 17 14.2 11.3 29.2 32.2 13.5
|
|
||||||
O M 18 14.6 11.3 29.9 33.5 12.8
|
|
||||||
O M 19 14.7 11.1 29.0 32.1 13.1
|
|
||||||
O M 20 15.1 11.4 30.2 33.3 14.0
|
|
||||||
O M 21 15.1 11.5 30.9 34.0 13.9
|
|
||||||
O M 22 15.4 11.1 30.2 33.6 13.5
|
|
||||||
O M 23 15.7 12.2 31.7 34.2 14.2
|
|
||||||
O M 24 16.2 11.8 32.3 35.3 14.7
|
|
||||||
O M 25 16.3 11.6 31.6 34.2 14.5
|
|
||||||
O M 26 17.1 12.6 35.0 38.9 15.7
|
|
||||||
O M 27 17.4 12.8 36.1 39.5 16.2
|
|
||||||
O M 28 17.5 12.0 34.4 37.3 15.3
|
|
||||||
O M 29 17.5 12.7 34.6 38.4 16.1
|
|
||||||
O M 30 17.8 12.5 36.0 39.8 16.7
|
|
||||||
O M 31 17.9 12.9 36.9 40.9 16.5
|
|
||||||
O M 32 18.0 13.4 36.7 41.3 17.1
|
|
||||||
O M 33 18.2 13.7 38.8 42.7 17.2
|
|
||||||
O M 34 18.4 13.4 37.9 42.2 17.7
|
|
||||||
O M 35 18.6 13.4 37.8 41.9 17.3
|
|
||||||
O M 36 18.6 13.5 36.9 40.2 17.0
|
|
||||||
O M 37 18.8 13.4 37.2 41.1 17.5
|
|
||||||
O M 38 18.8 13.8 39.2 43.3 17.9
|
|
||||||
O M 39 19.4 14.1 39.1 43.2 17.8
|
|
||||||
O M 40 19.4 14.4 39.8 44.3 17.9
|
|
||||||
O M 41 20.1 13.7 40.6 44.5 18.0
|
|
||||||
O M 42 20.6 14.4 42.8 46.5 19.6
|
|
||||||
O M 43 21.0 15.0 42.9 47.2 19.4
|
|
||||||
O M 44 21.5 15.5 45.5 49.7 20.9
|
|
||||||
O M 45 21.6 15.4 45.7 49.7 20.6
|
|
||||||
O M 46 21.6 14.8 43.4 48.2 20.1
|
|
||||||
O M 47 21.9 15.7 45.4 51.0 21.1
|
|
||||||
O M 48 22.1 15.8 44.6 49.6 20.5
|
|
||||||
O M 49 23.0 16.8 47.2 52.1 21.5
|
|
||||||
O M 50 23.1 15.7 47.6 52.8 21.6
|
|
||||||
O F 1 10.7 9.7 21.4 24.0 9.8
|
|
||||||
O F 2 11.4 9.2 21.7 24.1 9.7
|
|
||||||
O F 3 12.5 10.0 24.1 27.0 10.9
|
|
||||||
O F 4 12.6 11.5 25.0 28.1 11.5
|
|
||||||
O F 5 12.9 11.2 25.8 29.1 11.9
|
|
||||||
O F 6 14.0 11.9 27.0 31.4 12.6
|
|
||||||
O F 7 14.0 12.8 28.8 32.4 12.7
|
|
||||||
O F 8 14.3 12.2 28.1 31.8 12.5
|
|
||||||
O F 9 14.7 13.2 29.6 33.4 12.9
|
|
||||||
O F 10 14.9 13.0 30.0 33.7 13.3
|
|
||||||
O F 11 15.0 12.3 30.1 33.3 14.0
|
|
||||||
O F 12 15.6 13.5 31.2 35.1 14.1
|
|
||||||
O F 13 15.6 14.0 31.6 35.3 13.8
|
|
||||||
O F 14 15.6 14.1 31.0 34.5 13.8
|
|
||||||
O F 15 15.7 13.6 31.0 34.8 13.8
|
|
||||||
O F 16 16.1 13.6 31.6 36.0 14.0
|
|
||||||
O F 17 16.1 13.7 31.4 36.1 13.9
|
|
||||||
O F 18 16.2 14.0 31.6 35.6 13.7
|
|
||||||
O F 19 16.7 14.3 32.3 37.0 14.7
|
|
||||||
O F 20 17.1 14.5 33.1 37.2 14.6
|
|
||||||
O F 21 17.5 14.3 34.5 39.6 15.6
|
|
||||||
O F 22 17.5 14.4 34.5 39.0 16.0
|
|
||||||
O F 23 17.5 14.7 33.3 37.6 14.6
|
|
||||||
O F 24 17.6 14.0 34.0 38.6 15.5
|
|
||||||
O F 25 18.0 14.9 34.7 39.5 15.7
|
|
||||||
O F 26 18.0 16.3 37.9 43.0 17.2
|
|
||||||
O F 27 18.3 15.7 35.1 40.5 16.1
|
|
||||||
O F 28 18.4 15.5 35.6 40.0 15.9
|
|
||||||
O F 29 18.4 15.7 36.5 41.6 16.4
|
|
||||||
O F 30 18.5 14.6 37.0 42.0 16.6
|
|
||||||
O F 31 18.6 14.5 34.7 39.4 15.0
|
|
||||||
O F 32 18.8 15.2 35.8 40.5 16.6
|
|
||||||
O F 33 18.9 16.7 36.3 41.7 15.3
|
|
||||||
O F 34 19.1 16.0 37.8 42.3 16.8
|
|
||||||
O F 35 19.1 16.3 37.9 42.6 17.2
|
|
||||||
O F 36 19.7 16.7 39.9 43.6 18.2
|
|
||||||
O F 37 19.9 16.6 39.4 43.9 17.9
|
|
||||||
O F 38 19.9 17.9 40.1 46.4 17.9
|
|
||||||
O F 39 20.0 16.7 40.4 45.1 17.7
|
|
||||||
O F 40 20.1 17.2 39.8 44.1 18.6
|
|
||||||
O F 41 20.3 16.0 39.4 44.1 18.0
|
|
||||||
O F 42 20.5 17.5 40.0 45.5 19.2
|
|
||||||
O F 43 20.6 17.5 41.5 46.2 19.2
|
|
||||||
O F 44 20.9 16.5 39.9 44.7 17.5
|
|
||||||
O F 45 21.3 18.4 43.8 48.4 20.0
|
|
||||||
O F 46 21.4 18.0 41.2 46.2 18.7
|
|
||||||
O F 47 21.7 17.1 41.7 47.2 19.6
|
|
||||||
O F 48 21.9 17.2 42.6 47.4 19.5
|
|
||||||
O F 49 22.5 17.2 43.0 48.7 19.8
|
|
||||||
O F 50 23.1 20.2 46.2 52.5 21.1
|
|
||||||
Binary file not shown.
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
Binary file not shown.
Binary file not shown.
|
|
@ -1,44 +0,0 @@
|
||||||
#
|
|
||||||
# Puma forward dynamics -- 32nm = 32 inputs, high nonlinearity, med noise
|
|
||||||
#
|
|
||||||
#
|
|
||||||
Origin: simulated
|
|
||||||
|
|
||||||
Usage: assessment
|
|
||||||
|
|
||||||
Order: uninformative
|
|
||||||
|
|
||||||
Attributes:
|
|
||||||
1 theta1 u [-3.1416,3.1416] # ang position of joint 1 in radians
|
|
||||||
2 theta2 u [-3.1416,3.1416] # ang position of joint 2 in radians
|
|
||||||
3 theta3 u [-3.1416,3.1416] # ang position of joint 3 in radians
|
|
||||||
4 theta4 u [-3.1416,3.1416] # ang position of joint 4 in radians
|
|
||||||
5 theta5 u [-3.1416,3.1416] # ang position of joint 5 in radians
|
|
||||||
6 theta6 u [-3.1416,3.1416] # ang position of joint 6 in radians
|
|
||||||
7 thetad1 u (-Inf,Inf) # ang vel of joint 1 in rad/sec
|
|
||||||
8 thetad2 u (-Inf,Inf) # ang vel of joint 2 in rad/sec
|
|
||||||
9 thetad3 u (-Inf,Inf) # ang vel of joint 3 in rad/sec
|
|
||||||
10 thetad4 u (-Inf,Inf) # ang vel of joint 4 in rad/sec
|
|
||||||
11 thetad5 u (-Inf,Inf) # ang vel of joint 5 in rad/sec
|
|
||||||
12 thetad6 u (-Inf,Inf) # ang vel of joint 6 in rad/sec
|
|
||||||
13 tau1 u (-Inf,Inf) # torque on jt 1
|
|
||||||
14 tau2 u (-Inf,Inf) # torque on jt 2
|
|
||||||
15 tau3 u (-Inf,Inf) # torque on jt 3
|
|
||||||
16 tau4 u (-Inf,Inf) # torque on jt 4
|
|
||||||
17 tau5 u (-Inf,Inf) # torque on jt 5
|
|
||||||
18 dm1 u [0,Inf) # proportion change in mass of link 1
|
|
||||||
19 dm2 u [0,Inf) # prop change in mass of link 2
|
|
||||||
20 dm3 u [0,Inf) # prop change in mass of link 3
|
|
||||||
21 dm4 u [0,Inf) # prop change in mass of link 4
|
|
||||||
22 dm5 u [0,Inf) # prop change in mass of link 5
|
|
||||||
23 da1 u [0,Inf) # prop change in length of link 1
|
|
||||||
24 da2 u [0,Inf) # prop change in length of link 2
|
|
||||||
25 da3 u [0,Inf) # prop change in length of link 3
|
|
||||||
26 da4 u [0,Inf) # prop change in length of link 4
|
|
||||||
27 da5 u [0,Inf) # prop change in length of link 5
|
|
||||||
28 db1 u [0,Inf) # prop change in visc friction of link 1
|
|
||||||
29 db2 u [0,Inf) # prop change in visc friction of link 2
|
|
||||||
30 db3 u [0,Inf) # prop change in visc friction of link 3
|
|
||||||
31 db4 u [0,Inf) # prop change in visc friction of link 4
|
|
||||||
32 db5 u [0,Inf) # prop change in visc friction of link 5
|
|
||||||
33 thetadd6 u (-Inf,Inf) # ang acceleration of joint 6
|
|
||||||
Binary file not shown.
|
|
@ -1,12 +0,0 @@
|
||||||
#
|
|
||||||
# Prototask.spec
|
|
||||||
#
|
|
||||||
Cases: all
|
|
||||||
Origin: simulated
|
|
||||||
Inputs: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
|
|
||||||
Order: retain
|
|
||||||
Targets: 33
|
|
||||||
Test-Set-Size: 4096
|
|
||||||
Training-Set-Sizes: 64 128 256 512 1024
|
|
||||||
Test-Set-Selection: hierarchical
|
|
||||||
Maximum-Number-Of-Instances: 8
|
|
||||||
|
|
@ -1,33 +0,0 @@
|
||||||
1 NLMH real
|
|
||||||
2 NLMH real
|
|
||||||
3 NLMH real
|
|
||||||
4 NLMH real
|
|
||||||
5 NLMH real
|
|
||||||
6 NLMH real
|
|
||||||
7 NLMH real
|
|
||||||
8 NLMH real
|
|
||||||
9 NLMH real
|
|
||||||
10 NLMH real
|
|
||||||
11 NLMH real
|
|
||||||
12 NLMH real
|
|
||||||
13 NLMH real
|
|
||||||
14 NLMH real
|
|
||||||
15 NLMH real
|
|
||||||
16 NLMH real
|
|
||||||
17 NLMH real
|
|
||||||
18 NLMH real
|
|
||||||
19 NLMH real
|
|
||||||
20 NLMH real
|
|
||||||
21 NLMH real
|
|
||||||
22 NLMH real
|
|
||||||
23 NLMH real
|
|
||||||
24 NLMH real
|
|
||||||
25 NLMH real
|
|
||||||
26 NLMH real
|
|
||||||
27 NLMH real
|
|
||||||
28 NLMH real
|
|
||||||
29 NLMH real
|
|
||||||
30 NLMH real
|
|
||||||
31 NLMH real
|
|
||||||
32 NLMH real
|
|
||||||
33 NLMH real
|
|
||||||
Binary file not shown.
File diff suppressed because it is too large
Load diff
|
|
@ -1,251 +0,0 @@
|
||||||
xs ys yc
|
|
||||||
0.05100797 0.16086164 0
|
|
||||||
-0.74807425 0.08904024 0
|
|
||||||
-0.77293371 0.26317168 0
|
|
||||||
0.21837360 0.12706142 0
|
|
||||||
0.37268336 0.49656200 0
|
|
||||||
-0.62931544 0.63202159 0
|
|
||||||
-0.43307167 0.14479166 0
|
|
||||||
-0.84151970 -0.19131316 0
|
|
||||||
0.47525648 0.22483671 0
|
|
||||||
0.32082976 0.32721288 0
|
|
||||||
0.32061253 0.33407547 0
|
|
||||||
-0.89077472 0.41168783 0
|
|
||||||
0.17850119 0.44691359 0
|
|
||||||
0.31558002 0.38853383 0
|
|
||||||
0.55777224 0.47272748 0
|
|
||||||
0.03191877 0.01222964 0
|
|
||||||
0.25090585 0.30716705 0
|
|
||||||
0.23571547 0.22493837 0
|
|
||||||
-0.07236203 0.33376524 0
|
|
||||||
0.50440241 0.08054579 0
|
|
||||||
-0.63223351 0.44552458 0
|
|
||||||
-0.76784656 0.23614689 0
|
|
||||||
-0.70017557 0.21038848 0
|
|
||||||
-0.64713491 0.15921366 0
|
|
||||||
-0.76739248 0.09259038 0
|
|
||||||
-0.51788734 0.03288107 0
|
|
||||||
0.17516644 0.34534871 0
|
|
||||||
-0.68031190 0.47612156 0
|
|
||||||
0.01595199 0.32167526 0
|
|
||||||
-0.71481078 0.51421443 0
|
|
||||||
0.07837946 0.32284981 0
|
|
||||||
-0.80872251 0.47036593 0
|
|
||||||
-0.84211234 0.09294232 0
|
|
||||||
-0.98591577 0.48309267 0
|
|
||||||
0.29104081 0.34275967 0
|
|
||||||
0.24321541 0.51488295 0
|
|
||||||
-0.60104419 0.05060116 0
|
|
||||||
-1.24652451 0.45923165 0
|
|
||||||
-0.82769016 0.36187460 0
|
|
||||||
-0.62117301 -0.10912158 0
|
|
||||||
-0.70584105 0.65907662 0
|
|
||||||
0.06718867 0.60574850 0
|
|
||||||
0.30505147 0.47417973 0
|
|
||||||
0.60788138 0.39361588 0
|
|
||||||
-0.78937483 0.17591675 0
|
|
||||||
-0.53123209 0.42652809 0
|
|
||||||
0.25202071 0.17029707 0
|
|
||||||
-0.57880357 0.26553665 0
|
|
||||||
-0.83176749 0.54447377 0
|
|
||||||
-0.69859164 0.38566851 0
|
|
||||||
-0.73642607 0.11857527 0
|
|
||||||
-0.93496195 0.11370707 0
|
|
||||||
0.43959309 0.41430638 0
|
|
||||||
-0.54690854 0.24956276 0
|
|
||||||
-0.08405550 0.36521058 0
|
|
||||||
0.32211458 0.69087105 0
|
|
||||||
0.10764739 0.57946932 0
|
|
||||||
-0.71864030 0.25645757 0
|
|
||||||
-0.87877752 0.45064757 0
|
|
||||||
-0.69846046 0.95053870 0
|
|
||||||
0.39757434 0.11810207 0
|
|
||||||
-0.50451354 0.57196376 0
|
|
||||||
0.25023622 0.39783889 0
|
|
||||||
0.61709156 0.10185808 0
|
|
||||||
0.31832860 0.08790562 0
|
|
||||||
-0.57453363 0.18624195 0
|
|
||||||
0.09761865 0.55176786 0
|
|
||||||
0.48449339 0.35372973 0
|
|
||||||
0.52400684 0.46616851 0
|
|
||||||
-0.78138463 -0.07534713 0
|
|
||||||
-0.49704591 0.59948077 0
|
|
||||||
-0.96984525 0.46624927 0
|
|
||||||
0.43541407 0.12192386 0
|
|
||||||
-0.67942462 0.30753942 0
|
|
||||||
-0.62529036 0.07099046 0
|
|
||||||
-0.02318116 0.40442601 0
|
|
||||||
0.23200141 0.71066846 0
|
|
||||||
0.09384354 0.46674396 0
|
|
||||||
0.14234301 0.17898711 0
|
|
||||||
-0.61686357 0.25507763 0
|
|
||||||
0.23636288 0.51543839 0
|
|
||||||
0.38914177 0.40429568 0
|
|
||||||
-0.95178678 -0.03772239 0
|
|
||||||
0.24087822 0.71948890 0
|
|
||||||
0.12446266 0.45178849 0
|
|
||||||
-0.60566430 0.26906478 0
|
|
||||||
-0.71397188 0.30871780 0
|
|
||||||
0.31008428 0.34675335 0
|
|
||||||
0.18018786 0.46204643 0
|
|
||||||
-0.42663885 0.64723225 0
|
|
||||||
0.06143230 0.32491150 0
|
|
||||||
0.07736952 0.32183287 0
|
|
||||||
0.42814970 0.13445957 0
|
|
||||||
-0.80250753 0.66878999 0
|
|
||||||
0.40142623 0.42516398 0
|
|
||||||
0.37084776 0.26407123 0
|
|
||||||
-0.80774748 0.41485899 0
|
|
||||||
0.50163585 0.23934856 0
|
|
||||||
0.58238323 0.22842741 0
|
|
||||||
-0.59136100 0.30230321 0
|
|
||||||
-0.87037236 0.26941446 0
|
|
||||||
-0.72086765 0.19676678 0
|
|
||||||
0.27778443 0.21792253 0
|
|
||||||
0.33240813 0.27349865 0
|
|
||||||
-0.14092068 0.39247351 0
|
|
||||||
-0.59759518 0.14790267 0
|
|
||||||
-0.85581534 0.14513961 0
|
|
||||||
-0.88912232 0.26896001 0
|
|
||||||
0.21345680 0.43611756 0
|
|
||||||
-0.53467949 0.57901229 0
|
|
||||||
0.31686848 0.39705856 0
|
|
||||||
-0.68121733 0.04209840 0
|
|
||||||
-0.97586127 0.45964811 0
|
|
||||||
0.41457183 0.27141230 0
|
|
||||||
0.32751292 0.36780137 0
|
|
||||||
-0.93209192 0.09362034 0
|
|
||||||
0.58395341 0.47147282 0
|
|
||||||
-0.44437309 0.23010142 0
|
|
||||||
0.29109441 0.19365556 0
|
|
||||||
-0.51080722 0.41496003 0
|
|
||||||
-0.96597511 0.17931052 0
|
|
||||||
0.18741315 0.29747132 0
|
|
||||||
0.17965417 0.45175449 0
|
|
||||||
-0.72689602 0.35728387 0
|
|
||||||
-0.54339877 0.41012013 0
|
|
||||||
-0.59823393 0.98701425 1
|
|
||||||
-0.20194736 0.62101680 1
|
|
||||||
0.47146103 0.48221146 1
|
|
||||||
-0.09821987 0.58755577 1
|
|
||||||
-0.35657658 0.63709705 1
|
|
||||||
0.63881392 0.42112135 1
|
|
||||||
0.62980614 0.28146085 1
|
|
||||||
-0.46223286 0.61661031 1
|
|
||||||
-0.07331555 0.55821736 1
|
|
||||||
-0.55405533 0.51253129 1
|
|
||||||
-0.43761773 0.87811781 1
|
|
||||||
-0.22237814 0.88850773 1
|
|
||||||
0.09346162 0.67310494 1
|
|
||||||
0.53174745 0.54372650 1
|
|
||||||
0.40207539 0.51638462 1
|
|
||||||
0.47555171 0.65056336 1
|
|
||||||
-0.23383266 0.63642580 1
|
|
||||||
-0.31579316 0.75031340 1
|
|
||||||
-0.47351720 0.63854125 1
|
|
||||||
0.59239464 0.89256953 1
|
|
||||||
-0.22605324 0.79789454 1
|
|
||||||
-0.43995011 0.52099256 1
|
|
||||||
-0.54645044 0.74577198 1
|
|
||||||
0.46404306 0.51065152 1
|
|
||||||
-0.15194296 0.81218439 1
|
|
||||||
0.48536395 0.82018093 1
|
|
||||||
0.34725649 0.70813773 1
|
|
||||||
0.43897015 0.62817158 1
|
|
||||||
-0.21415914 0.64363951 1
|
|
||||||
0.57380231 0.63713466 1
|
|
||||||
0.38717361 0.58578395 1
|
|
||||||
0.32038322 0.53529127 1
|
|
||||||
-0.20781491 0.65132467 1
|
|
||||||
-0.18651283 0.81754816 1
|
|
||||||
0.24752692 0.39081936 1
|
|
||||||
0.66049881 0.89919213 1
|
|
||||||
-0.28658801 0.73375946 1
|
|
||||||
-0.32588080 0.39865509 1
|
|
||||||
-0.25204565 0.67358326 1
|
|
||||||
0.37259022 0.49785904 1
|
|
||||||
-0.29096564 1.04372060 1
|
|
||||||
-0.30469807 0.86858292 1
|
|
||||||
-0.21389978 1.09317811 1
|
|
||||||
-0.36830015 0.75639546 1
|
|
||||||
-0.46928218 0.88775091 1
|
|
||||||
0.39350146 0.77975197 1
|
|
||||||
-0.45639966 0.80523454 1
|
|
||||||
0.51128242 0.76606136 1
|
|
||||||
0.22550468 0.46451215 1
|
|
||||||
0.01462984 0.40190926 1
|
|
||||||
-0.19172785 0.80943313 1
|
|
||||||
0.38323479 0.75601744 1
|
|
||||||
0.49791612 0.61334375 1
|
|
||||||
0.35335230 0.77324337 1
|
|
||||||
-0.34722575 0.70177856 1
|
|
||||||
0.58380468 0.76357539 1
|
|
||||||
-0.13727764 0.71246351 1
|
|
||||||
0.38827268 0.44977123 1
|
|
||||||
-0.53172709 0.61934293 1
|
|
||||||
-0.11684624 0.87851210 1
|
|
||||||
0.54335864 0.41174865 1
|
|
||||||
-0.45399302 0.66512988 1
|
|
||||||
-0.21913200 0.83484947 1
|
|
||||||
0.30485742 0.98028760 1
|
|
||||||
0.65676798 0.75766017 1
|
|
||||||
0.61420447 0.75039019 1
|
|
||||||
-0.45809964 0.77968606 1
|
|
||||||
-0.21617465 0.88626305 1
|
|
||||||
-0.26016108 0.81008591 1
|
|
||||||
0.31884531 0.84517725 1
|
|
||||||
-0.23727415 0.80178784 1
|
|
||||||
0.58310323 0.77709806 1
|
|
||||||
0.02841337 0.75792620 1
|
|
||||||
-0.41840136 0.68041440 1
|
|
||||||
0.67412880 0.60245461 1
|
|
||||||
-0.25278281 0.70526103 1
|
|
||||||
0.51609843 0.62092390 1
|
|
||||||
0.20392294 0.91641482 1
|
|
||||||
-0.17207124 1.00884096 1
|
|
||||||
0.27274507 0.29346977 1
|
|
||||||
0.07634798 0.56222204 1
|
|
||||||
-0.36653499 0.64831007 1
|
|
||||||
0.44290673 0.80087721 1
|
|
||||||
-0.19976385 0.54295162 1
|
|
||||||
-0.54075738 0.65293033 1
|
|
||||||
-0.07060266 1.00296912 1
|
|
||||||
0.50715054 0.35045758 1
|
|
||||||
-0.06048611 0.62982713 1
|
|
||||||
0.21532928 0.60260249 1
|
|
||||||
0.46809108 0.87182416 1
|
|
||||||
-0.29888511 0.73669866 1
|
|
||||||
0.86129620 0.47289330 1
|
|
||||||
0.70120877 0.74572893 1
|
|
||||||
-0.11342797 0.60067099 1
|
|
||||||
0.31234354 0.90756345 1
|
|
||||||
-0.12172541 0.84112851 1
|
|
||||||
0.36867857 0.37052586 1
|
|
||||||
0.57311489 0.40949740 1
|
|
||||||
-0.25841225 0.67192335 1
|
|
||||||
0.30937186 0.50823318 1
|
|
||||||
0.43319338 0.77016967 1
|
|
||||||
-0.30448035 0.57820106 1
|
|
||||||
0.44276338 0.58023403 1
|
|
||||||
-0.19442057 0.89876808 1
|
|
||||||
-0.06105237 0.74184946 1
|
|
||||||
0.07619347 0.35386246 1
|
|
||||||
0.85826993 0.95819523 1
|
|
||||||
0.37039200 0.72342401 1
|
|
||||||
0.51481515 0.76203996 1
|
|
||||||
0.43127521 0.54259166 1
|
|
||||||
0.42286091 0.65242185 1
|
|
||||||
0.29815001 0.93453682 1
|
|
||||||
0.37128253 0.70089181 1
|
|
||||||
-0.51528729 0.76473490 1
|
|
||||||
0.38525783 0.65528189 1
|
|
||||||
-0.34825368 0.50529981 1
|
|
||||||
0.68510504 0.78067440 1
|
|
||||||
-0.36528923 0.45703265 1
|
|
||||||
-0.40903577 0.74230433 1
|
|
||||||
0.43574387 0.44689789 1
|
|
||||||
0.26887846 0.44559230 1
|
|
||||||
-0.49254862 1.01443372 1
|
|
||||||
0.07615960 0.63795180 1
|
|
||||||
0.49226224 0.46876241 1
|
|
||||||
-0.40249641 0.71301084 1
|
|
||||||
1
GPy/util/latent_space_visualizations/__init__.py
Normal file
1
GPy/util/latent_space_visualizations/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
import controllers
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
import axis_event_controller, imshow_controller
|
||||||
|
|
@ -0,0 +1,142 @@
|
||||||
|
'''
|
||||||
|
Created on 24 Jul 2013
|
||||||
|
|
||||||
|
@author: maxz
|
||||||
|
'''
|
||||||
|
import numpy
|
||||||
|
|
||||||
|
class AxisEventController(object):
|
||||||
|
def __init__(self, ax):
|
||||||
|
self.ax = ax
|
||||||
|
self.activate()
|
||||||
|
def deactivate(self):
|
||||||
|
for cb_class in self.ax.callbacks.callbacks.values():
|
||||||
|
for cb_num in cb_class.keys():
|
||||||
|
self.ax.callbacks.disconnect(cb_num)
|
||||||
|
def activate(self):
|
||||||
|
self.ax.callbacks.connect('xlim_changed', self.xlim_changed)
|
||||||
|
self.ax.callbacks.connect('ylim_changed', self.ylim_changed)
|
||||||
|
def xlim_changed(self, ax):
|
||||||
|
pass
|
||||||
|
def ylim_changed(self, ax):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class AxisChangedController(AxisEventController):
|
||||||
|
'''
|
||||||
|
Buffered control of axis limit changes
|
||||||
|
'''
|
||||||
|
_changing = False
|
||||||
|
|
||||||
|
def __init__(self, ax, update_lim=None):
|
||||||
|
'''
|
||||||
|
Constructor
|
||||||
|
'''
|
||||||
|
super(AxisChangedController, self).__init__(ax)
|
||||||
|
self._lim_ratio_threshold = update_lim or .8
|
||||||
|
self._x_lim = self.ax.get_xlim()
|
||||||
|
self._y_lim = self.ax.get_ylim()
|
||||||
|
|
||||||
|
def update(self, ax):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def xlim_changed(self, ax):
|
||||||
|
super(AxisChangedController, self).xlim_changed(ax)
|
||||||
|
if not self._changing and self.lim_changed(ax.get_xlim(), self._x_lim):
|
||||||
|
self._changing = True
|
||||||
|
self._x_lim = ax.get_xlim()
|
||||||
|
self.update(ax)
|
||||||
|
self._changing = False
|
||||||
|
|
||||||
|
def ylim_changed(self, ax):
|
||||||
|
super(AxisChangedController, self).ylim_changed(ax)
|
||||||
|
if not self._changing and self.lim_changed(ax.get_ylim(), self._y_lim):
|
||||||
|
self._changing = True
|
||||||
|
self._y_lim = ax.get_ylim()
|
||||||
|
self.update(ax)
|
||||||
|
self._changing = False
|
||||||
|
|
||||||
|
def extent(self, lim):
|
||||||
|
return numpy.subtract(*lim)
|
||||||
|
|
||||||
|
def lim_changed(self, axlim, savedlim):
|
||||||
|
axextent = self.extent(axlim)
|
||||||
|
extent = self.extent(savedlim)
|
||||||
|
lim_changed = ((axextent / extent) < self._lim_ratio_threshold ** 2
|
||||||
|
or (extent / axextent) < self._lim_ratio_threshold ** 2
|
||||||
|
or ((1 - (self.extent((axlim[0], savedlim[0])) / self.extent((savedlim[0], axlim[1]))))
|
||||||
|
< self._lim_ratio_threshold)
|
||||||
|
or ((1 - (self.extent((savedlim[0], axlim[0])) / self.extent((axlim[0], savedlim[1]))))
|
||||||
|
< self._lim_ratio_threshold)
|
||||||
|
)
|
||||||
|
return lim_changed
|
||||||
|
|
||||||
|
def _buffer_lim(self, lim):
|
||||||
|
# buffer_size = 1 - self._lim_ratio_threshold
|
||||||
|
# extent = self.extent(lim)
|
||||||
|
return lim
|
||||||
|
|
||||||
|
|
||||||
|
class BufferedAxisChangedController(AxisChangedController):
|
||||||
|
def __init__(self, ax, plot_function, plot_limits, resolution=50, update_lim=None, **kwargs):
|
||||||
|
"""
|
||||||
|
:param plot_function:
|
||||||
|
function to use for creating image for plotting (return ndarray-like)
|
||||||
|
plot_function gets called with (2D!) Xtest grid if replotting required
|
||||||
|
:type plot_function: function
|
||||||
|
:param plot_limits:
|
||||||
|
beginning plot limits [xmin, ymin, xmax, ymax]
|
||||||
|
|
||||||
|
:param kwargs: additional kwargs are for pyplot.imshow(**kwargs)
|
||||||
|
"""
|
||||||
|
super(BufferedAxisChangedController, self).__init__(ax, update_lim=update_lim)
|
||||||
|
self.plot_function = plot_function
|
||||||
|
xmin, xmax = self._x_lim # self._compute_buffered(*self._x_lim)
|
||||||
|
ymin, ymax = self._y_lim # self._compute_buffered(*self._y_lim)
|
||||||
|
self.resolution = resolution
|
||||||
|
self._not_init = False
|
||||||
|
self.view = self._init_view(self.ax, self.recompute_X(), xmin, xmax, ymin, ymax, **kwargs)
|
||||||
|
self._not_init = True
|
||||||
|
|
||||||
|
def update(self, ax):
|
||||||
|
super(BufferedAxisChangedController, self).update(ax)
|
||||||
|
if self._not_init:
|
||||||
|
xmin, xmax = self._compute_buffered(*self._x_lim)
|
||||||
|
ymin, ymax = self._compute_buffered(*self._y_lim)
|
||||||
|
self.update_view(self.view, self.recompute_X(), xmin, xmax, ymin, ymax)
|
||||||
|
|
||||||
|
def _init_view(self, ax, X, xmin, xmax, ymin, ymax):
|
||||||
|
raise NotImplementedError('return view for this controller')
|
||||||
|
|
||||||
|
def update_view(self, view, X, xmin, xmax, ymin, ymax):
|
||||||
|
raise NotImplementedError('update view given in here')
|
||||||
|
|
||||||
|
def get_grid(self):
|
||||||
|
xmin, xmax = self._compute_buffered(*self._x_lim)
|
||||||
|
ymin, ymax = self._compute_buffered(*self._y_lim)
|
||||||
|
x, y = numpy.mgrid[xmin:xmax:1j * self.resolution, ymin:ymax:1j * self.resolution]
|
||||||
|
return numpy.hstack((x.flatten()[:, None], y.flatten()[:, None]))
|
||||||
|
|
||||||
|
def recompute_X(self):
|
||||||
|
X = self.plot_function(self.get_grid())
|
||||||
|
if isinstance(X, (tuple, list)):
|
||||||
|
for x in X:
|
||||||
|
x.shape = [self.resolution, self.resolution]
|
||||||
|
x[:, :] = x.T[::-1, :]
|
||||||
|
return X
|
||||||
|
return X.reshape(self.resolution, self.resolution).T[::-1, :]
|
||||||
|
|
||||||
|
def _compute_buffered(self, mi, ma):
|
||||||
|
buffersize = self._buffersize()
|
||||||
|
size = ma - mi
|
||||||
|
return mi - (buffersize * size), ma + (buffersize * size)
|
||||||
|
|
||||||
|
def _buffersize(self):
|
||||||
|
try:
|
||||||
|
buffersize = 1. - self._lim_ratio_threshold
|
||||||
|
except:
|
||||||
|
buffersize = .4
|
||||||
|
return buffersize
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -0,0 +1,71 @@
|
||||||
|
'''
|
||||||
|
Created on 24 Jul 2013
|
||||||
|
|
||||||
|
@author: maxz
|
||||||
|
'''
|
||||||
|
from GPy.util.latent_space_visualizations.controllers.axis_event_controller import BufferedAxisChangedController
|
||||||
|
import itertools
|
||||||
|
import numpy
|
||||||
|
|
||||||
|
|
||||||
|
class ImshowController(BufferedAxisChangedController):
|
||||||
|
def __init__(self, ax, plot_function, plot_limits, resolution=50, update_lim=.5, **kwargs):
|
||||||
|
"""
|
||||||
|
:param plot_function:
|
||||||
|
function to use for creating image for plotting (return ndarray-like)
|
||||||
|
plot_function gets called with (2D!) Xtest grid if replotting required
|
||||||
|
:type plot_function: function
|
||||||
|
:param plot_limits:
|
||||||
|
beginning plot limits [xmin, ymin, xmax, ymax]
|
||||||
|
|
||||||
|
:param kwargs: additional kwargs are for pyplot.imshow(**kwargs)
|
||||||
|
"""
|
||||||
|
super(ImshowController, self).__init__(ax, plot_function, plot_limits, resolution, update_lim, **kwargs)
|
||||||
|
|
||||||
|
def _init_view(self, ax, X, xmin, xmax, ymin, ymax, **kwargs):
|
||||||
|
return ax.imshow(X, extent=(xmin, xmax,
|
||||||
|
ymin, ymax),
|
||||||
|
vmin=X.min(),
|
||||||
|
vmax=X.max(),
|
||||||
|
**kwargs)
|
||||||
|
|
||||||
|
def update_view(self, view, X, xmin, xmax, ymin, ymax):
|
||||||
|
view.set_data(X)
|
||||||
|
view.set_extent((xmin, xmax, ymin, ymax))
|
||||||
|
|
||||||
|
class ImAnnotateController(ImshowController):
|
||||||
|
def __init__(self, ax, plot_function, plot_limits, resolution=20, update_lim=.99, **kwargs):
|
||||||
|
"""
|
||||||
|
:param plot_function:
|
||||||
|
function to use for creating image for plotting (return ndarray-like)
|
||||||
|
plot_function gets called with (2D!) Xtest grid if replotting required
|
||||||
|
:type plot_function: function
|
||||||
|
:param plot_limits:
|
||||||
|
beginning plot limits [xmin, ymin, xmax, ymax]
|
||||||
|
:param text_props: kwargs for pyplot.text(**text_props)
|
||||||
|
:param kwargs: additional kwargs are for pyplot.imshow(**kwargs)
|
||||||
|
"""
|
||||||
|
super(ImAnnotateController, self).__init__(ax, plot_function, plot_limits, resolution, update_lim, **kwargs)
|
||||||
|
|
||||||
|
def _init_view(self, ax, X, xmin, xmax, ymin, ymax, text_props={}, **kwargs):
|
||||||
|
view = [super(ImAnnotateController, self)._init_view(ax, X[0], xmin, xmax, ymin, ymax, **kwargs)]
|
||||||
|
xoffset, yoffset = self._offsets(xmin, xmax, ymin, ymax)
|
||||||
|
xlin = numpy.linspace(xmin, xmax, self.resolution, endpoint=False)
|
||||||
|
ylin = numpy.linspace(ymin, ymax, self.resolution, endpoint=False)
|
||||||
|
for [i, x], [j, y] in itertools.product(enumerate(xlin), enumerate(ylin[::-1])):
|
||||||
|
view.append(ax.text(x + xoffset, y + yoffset, "{}".format(X[1][j, i]), ha='center', va='center', **text_props))
|
||||||
|
return view
|
||||||
|
|
||||||
|
def update_view(self, view, X, xmin, xmax, ymin, ymax):
|
||||||
|
super(ImAnnotateController, self).update_view(view[0], X[0], xmin, xmax, ymin, ymax)
|
||||||
|
xoffset, yoffset = self._offsets(xmin, xmax, ymin, ymax)
|
||||||
|
xlin = numpy.linspace(xmin, xmax, self.resolution, endpoint=False)
|
||||||
|
ylin = numpy.linspace(ymin, ymax, self.resolution, endpoint=False)
|
||||||
|
for [[i, x], [j, y]], text in itertools.izip(itertools.product(enumerate(xlin), enumerate(ylin[::-1])), view[1:]):
|
||||||
|
text.set_x(x + xoffset)
|
||||||
|
text.set_y(y + yoffset)
|
||||||
|
text.set_text("{}".format(X[1][j, i]))
|
||||||
|
return view
|
||||||
|
|
||||||
|
def _offsets(self, xmin, xmax, ymin, ymax):
|
||||||
|
return (xmax - xmin) / (2 * self.resolution), (ymax - ymin) / (2 * self.resolution)
|
||||||
|
|
@ -1,8 +1,8 @@
|
||||||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from scipy import weave
|
||||||
|
|
||||||
def opt_wrapper(m, **kwargs):
|
def opt_wrapper(m, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
|
@ -58,6 +58,72 @@ def kmm_init(X, m = 10):
|
||||||
inducing = np.array(inducing)
|
inducing = np.array(inducing)
|
||||||
return X[inducing]
|
return X[inducing]
|
||||||
|
|
||||||
|
def fast_array_equal(A, B):
|
||||||
|
code2="""
|
||||||
|
int i, j;
|
||||||
|
return_val = 1;
|
||||||
|
|
||||||
|
#pragma omp parallel for private(i, j)
|
||||||
|
for(i=0;i<N;i++){
|
||||||
|
for(j=0;j<D;j++){
|
||||||
|
if(A(i, j) != B(i, j)){
|
||||||
|
return_val = 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
|
code3="""
|
||||||
|
int i, j, z;
|
||||||
|
return_val = 1;
|
||||||
|
|
||||||
|
#pragma omp parallel for private(i, j, z)
|
||||||
|
for(i=0;i<N;i++){
|
||||||
|
for(j=0;j<D;j++){
|
||||||
|
for(z=0;z<Q;z++){
|
||||||
|
if(A(i, j, z) != B(i, j, z)){
|
||||||
|
return_val = 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
|
support_code = """
|
||||||
|
#include <omp.h>
|
||||||
|
#include <math.h>
|
||||||
|
"""
|
||||||
|
|
||||||
|
weave_options = {'headers' : ['<omp.h>'],
|
||||||
|
'extra_compile_args': ['-fopenmp -O3'],
|
||||||
|
'extra_link_args' : ['-lgomp']}
|
||||||
|
|
||||||
|
|
||||||
|
value = False
|
||||||
|
|
||||||
|
if (A == None) and (B == None):
|
||||||
|
return True
|
||||||
|
elif ((A == None) and (B != None)) or ((A != None) and (B == None)):
|
||||||
|
return False
|
||||||
|
elif A.shape == B.shape:
|
||||||
|
if A.ndim == 2:
|
||||||
|
N, D = A.shape
|
||||||
|
value = weave.inline(code2, support_code=support_code, libraries=['gomp'],
|
||||||
|
arg_names=['A', 'B', 'N', 'D'],
|
||||||
|
type_converters=weave.converters.blitz,**weave_options)
|
||||||
|
elif A.ndim == 3:
|
||||||
|
N, D, Q = A.shape
|
||||||
|
value = weave.inline(code3, support_code=support_code, libraries=['gomp'],
|
||||||
|
arg_names=['A', 'B', 'N', 'D', 'Q'],
|
||||||
|
type_converters=weave.converters.blitz,**weave_options)
|
||||||
|
else:
|
||||||
|
value = np.array_equal(A,B)
|
||||||
|
|
||||||
|
return value
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import pylab as plt
|
import pylab as plt
|
||||||
X = np.linspace(1,10, 100)[:, None]
|
X = np.linspace(1,10, 100)[:, None]
|
||||||
|
|
|
||||||
|
|
@ -692,84 +692,3 @@ skel = acclaim_skeleton()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def fetch_cmu(subj_motions, base_url = 'http://mocap.cs.cmu.edu:8080/subjects', skel_store_dir = '.', motion_store_dir = '.', store_motions = True, return_motions = True, messages = True):
|
|
||||||
'''
|
|
||||||
Download and store the skel. and motions indicated in a tuple (A,B) where A is a list of skeletons and B
|
|
||||||
the corresponding 2-D list of motions, ie B_ij is the j-th motion to download for skeleton A_i
|
|
||||||
The method can optionally store the fetched data and / or return them as arrays.
|
|
||||||
If the data are already stored, they are not fetched but just retrieved.
|
|
||||||
|
|
||||||
e.g.
|
|
||||||
# Download the data, do not return anything
|
|
||||||
GPy.util.mocap.fetch_cmu(subj_motions = ([35],[[1,2,3]]), return_motions = False)
|
|
||||||
# Fetch and return the data in a list. Do not store them anywhere
|
|
||||||
GPy.util.mocap.fetch_cmu(subj_motions = ([35],[[1,2,3]]), return_motions = True, store_motions = False)
|
|
||||||
|
|
||||||
In both cases above, if the data do exist in the given skel_store_dir and motion_store_dir, they are just loaded from there.
|
|
||||||
'''
|
|
||||||
|
|
||||||
subjectsNum = subj_motions[0]
|
|
||||||
motionsNum = subj_motions[1]
|
|
||||||
|
|
||||||
# Convert numbers to strings
|
|
||||||
subjects = []
|
|
||||||
motions = [list() for _ in range(len(subjectsNum))]
|
|
||||||
for i in range(len(subjectsNum)):
|
|
||||||
curSubj = str(int(subjectsNum[i]))
|
|
||||||
if subjectsNum[i] < 10:
|
|
||||||
curSubj = '0' + curSubj
|
|
||||||
subjects.append(curSubj)
|
|
||||||
for j in range(len(motionsNum[i])):
|
|
||||||
curMot = str(int(motionsNum[i][j]))
|
|
||||||
if motionsNum[i][j] < 10:
|
|
||||||
curMot = '0' + curMot
|
|
||||||
motions[i].append(curMot)
|
|
||||||
|
|
||||||
|
|
||||||
all_skels = []
|
|
||||||
|
|
||||||
assert len(subjects) == len(motions)
|
|
||||||
|
|
||||||
if return_motions:
|
|
||||||
all_motions = [list() for _ in range(len(subjects))]
|
|
||||||
else:
|
|
||||||
all_motions = []
|
|
||||||
|
|
||||||
for i in range(len(subjects)):
|
|
||||||
cur_skel_suffix = '/' + subjects[i] + '/'
|
|
||||||
cur_skel_dir = skel_store_dir + cur_skel_suffix
|
|
||||||
cur_skel_file = cur_skel_dir + subjects[i] + '.asf'
|
|
||||||
cur_skel_url = base_url + cur_skel_suffix + subjects[i] + '.asf'
|
|
||||||
|
|
||||||
if os.path.isfile(cur_skel_file):
|
|
||||||
if return_motions:
|
|
||||||
with open(cur_skel_file, 'r') as f:
|
|
||||||
cur_skel_data = f.read()
|
|
||||||
else:
|
|
||||||
if store_motions:
|
|
||||||
if not os.path.isdir(cur_skel_dir):
|
|
||||||
os.mkdir(cur_skel_dir)
|
|
||||||
if not os.path.isdir(motion_store_dir + cur_skel_suffix):
|
|
||||||
os.mkdir(motion_store_dir + cur_skel_suffix)
|
|
||||||
cur_skel_data = dat.download_resource(cur_skel_url, cur_skel_file, store_motions, messages)
|
|
||||||
|
|
||||||
if return_motions:
|
|
||||||
all_skels.append(cur_skel_data)
|
|
||||||
|
|
||||||
for j in range(len(motions[i])):
|
|
||||||
cur_motion_url = base_url + cur_skel_suffix + subjects[i] + '_' + motions[i][j] + '.amc'
|
|
||||||
cur_motion_file = motion_store_dir + cur_skel_suffix + subjects[i] + '_' + motions[i][j] + '.amc'
|
|
||||||
if os.path.isfile(cur_motion_file):
|
|
||||||
with open(cur_motion_file, 'r') as f:
|
|
||||||
if return_motions:
|
|
||||||
cur_motion_data = f.read()
|
|
||||||
else:
|
|
||||||
cur_motion_data = dat.download_resource(cur_motion_url, cur_motion_file, store_motions, messages)
|
|
||||||
|
|
||||||
if return_motions:
|
|
||||||
all_motions[i].append(cur_motion_data)
|
|
||||||
|
|
||||||
return all_skels, all_motions
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,40 +1,59 @@
|
||||||
import pylab as pb
|
import pylab as pb
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from .. import util
|
from .. import util
|
||||||
|
from GPy.util.latent_space_visualizations.controllers.imshow_controller import ImshowController
|
||||||
|
import itertools
|
||||||
|
|
||||||
def plot_latent(model, labels=None, which_indices=None, resolution=50, ax=None, marker='o', s=40):
|
def most_significant_input_dimensions(model, which_indices):
|
||||||
|
if which_indices is None:
|
||||||
|
if model.input_dim == 1:
|
||||||
|
input_1 = 0
|
||||||
|
input_2 = None
|
||||||
|
if model.input_dim == 2:
|
||||||
|
input_1, input_2 = 0, 1
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
input_1, input_2 = np.argsort(model.input_sensitivity())[::-1][:2]
|
||||||
|
except:
|
||||||
|
raise ValueError, "cannot Atomatically determine which dimensions to plot, please pass 'which_indices'"
|
||||||
|
else:
|
||||||
|
input_1, input_2 = which_indices
|
||||||
|
return input_1, input_2
|
||||||
|
|
||||||
|
def plot_latent(model, labels=None, which_indices=None,
|
||||||
|
resolution=50, ax=None, marker='o', s=40,
|
||||||
|
fignum=None, plot_inducing=False, legend=True,
|
||||||
|
aspect='auto', updates=False):
|
||||||
"""
|
"""
|
||||||
:param labels: a np.array of size model.num_data containing labels for the points (can be number, strings, etc)
|
:param labels: a np.array of size model.num_data containing labels for the points (can be number, strings, etc)
|
||||||
:param resolution: the resolution of the grid on which to evaluate the predictive variance
|
:param resolution: the resolution of the grid on which to evaluate the predictive variance
|
||||||
"""
|
"""
|
||||||
if ax is None:
|
if ax is None:
|
||||||
ax = pb.gca()
|
fig = pb.figure(num=fignum)
|
||||||
|
ax = fig.add_subplot(111)
|
||||||
util.plot.Tango.reset()
|
util.plot.Tango.reset()
|
||||||
|
|
||||||
if labels is None:
|
if labels is None:
|
||||||
labels = np.ones(model.num_data)
|
labels = np.ones(model.num_data)
|
||||||
if which_indices is None:
|
|
||||||
if model.input_dim==1:
|
|
||||||
input_1 = 0
|
|
||||||
input_2 = None
|
|
||||||
if model.input_dim==2:
|
|
||||||
input_1, input_2 = 0,1
|
|
||||||
else:
|
|
||||||
try:
|
|
||||||
input_1, input_2 = np.argsort(model.input_sensitivity())[:2]
|
|
||||||
except:
|
|
||||||
raise ValueError, "cannot Atomatically determine which dimensions to plot, please pass 'which_indices'"
|
|
||||||
else:
|
|
||||||
input_1, input_2 = which_indices
|
|
||||||
|
|
||||||
#first, plot the output variance as a function of the latent space
|
input_1, input_2 = most_significant_input_dimensions(model, which_indices)
|
||||||
Xtest, xx,yy,xmin,xmax = util.plot.x_frame2D(model.X[:,[input_1, input_2]],resolution=resolution)
|
|
||||||
|
# first, plot the output variance as a function of the latent space
|
||||||
|
Xtest, xx, yy, xmin, xmax = util.plot.x_frame2D(model.X[:, [input_1, input_2]], resolution=resolution)
|
||||||
Xtest_full = np.zeros((Xtest.shape[0], model.X.shape[1]))
|
Xtest_full = np.zeros((Xtest.shape[0], model.X.shape[1]))
|
||||||
Xtest_full[:, :2] = Xtest
|
|
||||||
mu, var, low, up = model.predict(Xtest_full)
|
def plot_function(x):
|
||||||
var = var[:, :1]
|
Xtest_full[:, [input_1, input_2]] = x
|
||||||
ax.imshow(var.reshape(resolution, resolution).T,
|
mu, var, low, up = model.predict(Xtest_full)
|
||||||
extent=[xmin[0], xmax[0], xmin[1], xmax[1]], cmap=pb.cm.binary,interpolation='bilinear',origin='lower')
|
var = var[:, :1]
|
||||||
|
return np.log(var)
|
||||||
|
view = ImshowController(ax, plot_function,
|
||||||
|
tuple(model.X.min(0)[:, [input_1, input_2]]) + tuple(model.X.max(0)[:, [input_1, input_2]]),
|
||||||
|
resolution, aspect=aspect, interpolation='bilinear',
|
||||||
|
cmap=pb.cm.binary)
|
||||||
|
|
||||||
|
# ax.imshow(var.reshape(resolution, resolution).T,
|
||||||
|
# extent=[xmin[0], xmax[0], xmin[1], xmax[1]], cmap=pb.cm.binary, interpolation='bilinear', origin='lower')
|
||||||
|
|
||||||
# make sure labels are in order of input:
|
# make sure labels are in order of input:
|
||||||
ulabels = []
|
ulabels = []
|
||||||
|
|
@ -42,50 +61,118 @@ def plot_latent(model, labels=None, which_indices=None, resolution=50, ax=None,
|
||||||
if not lab in ulabels:
|
if not lab in ulabels:
|
||||||
ulabels.append(lab)
|
ulabels.append(lab)
|
||||||
|
|
||||||
|
marker = itertools.cycle(list(marker))
|
||||||
|
|
||||||
for i, ul in enumerate(ulabels):
|
for i, ul in enumerate(ulabels):
|
||||||
if type(ul) is np.string_:
|
if type(ul) is np.string_:
|
||||||
this_label = ul
|
this_label = ul
|
||||||
elif type(ul) is np.int64:
|
elif type(ul) is np.int64:
|
||||||
this_label = 'class %i'%ul
|
this_label = 'class %i' % ul
|
||||||
else:
|
else:
|
||||||
this_label = 'class %i'%i
|
this_label = 'class %i' % i
|
||||||
if len(marker) == len(ulabels):
|
m = marker.next()
|
||||||
m = marker[i]
|
|
||||||
else:
|
|
||||||
m = marker
|
|
||||||
|
|
||||||
index = np.nonzero(labels==ul)[0]
|
index = np.nonzero(labels == ul)[0]
|
||||||
if model.input_dim==1:
|
if model.input_dim == 1:
|
||||||
x = model.X[index,input_1]
|
x = model.X[index, input_1]
|
||||||
y = np.zeros(index.size)
|
y = np.zeros(index.size)
|
||||||
else:
|
else:
|
||||||
x = model.X[index,input_1]
|
x = model.X[index, input_1]
|
||||||
y = model.X[index,input_2]
|
y = model.X[index, input_2]
|
||||||
ax.scatter(x, y, marker=m, s=s, color=util.plot.Tango.nextMedium(), label=this_label)
|
ax.scatter(x, y, marker=m, s=s, color=util.plot.Tango.nextMedium(), label=this_label)
|
||||||
|
|
||||||
ax.set_xlabel('latent dimension %i'%input_1)
|
ax.set_xlabel('latent dimension %i' % input_1)
|
||||||
ax.set_ylabel('latent dimension %i'%input_2)
|
ax.set_ylabel('latent dimension %i' % input_2)
|
||||||
|
|
||||||
if not np.all(labels==1.):
|
if not np.all(labels == 1.) and legend:
|
||||||
ax.legend(loc=0,numpoints=1)
|
ax.legend(loc=0, numpoints=1)
|
||||||
|
|
||||||
ax.set_xlim(xmin[0],xmax[0])
|
ax.set_xlim(xmin[0], xmax[0])
|
||||||
ax.set_ylim(xmin[1],xmax[1])
|
ax.set_ylim(xmin[1], xmax[1])
|
||||||
ax.grid(b=False) # remove the grid if present, it doesn't look good
|
ax.grid(b=False) # remove the grid if present, it doesn't look good
|
||||||
ax.set_aspect('auto') # set a nice aspect ratio
|
ax.set_aspect('auto') # set a nice aspect ratio
|
||||||
|
|
||||||
|
if plot_inducing:
|
||||||
|
ax.plot(model.Z[:, input_1], model.Z[:, input_2], '^w')
|
||||||
|
|
||||||
|
if updates:
|
||||||
|
ax.figure.canvas.show()
|
||||||
|
raw_input('Enter to continue')
|
||||||
return ax
|
return ax
|
||||||
|
|
||||||
|
def plot_magnification(model, labels=None, which_indices=None,
|
||||||
|
resolution=60, ax=None, marker='o', s=40,
|
||||||
|
fignum=None, plot_inducing=False, legend=True,
|
||||||
|
aspect='auto', updates=False):
|
||||||
|
"""
|
||||||
|
:param labels: a np.array of size model.num_data containing labels for the points (can be number, strings, etc)
|
||||||
|
:param resolution: the resolution of the grid on which to evaluate the predictive variance
|
||||||
|
"""
|
||||||
|
if ax is None:
|
||||||
|
fig = pb.figure(num=fignum)
|
||||||
|
ax = fig.add_subplot(111)
|
||||||
|
util.plot.Tango.reset()
|
||||||
|
|
||||||
def plot_latent_indices(Model, which_indices=None, *args, **kwargs):
|
if labels is None:
|
||||||
|
labels = np.ones(model.num_data)
|
||||||
|
|
||||||
if which_indices is None:
|
input_1, input_2 = most_significant_input_dimensions(model, which_indices)
|
||||||
try:
|
|
||||||
input_1, input_2 = np.argsort(Model.input_sensitivity())[:2]
|
# first, plot the output variance as a function of the latent space
|
||||||
except:
|
Xtest, xx, yy, xmin, xmax = util.plot.x_frame2D(model.X[:, [input_1, input_2]], resolution=resolution)
|
||||||
raise ValueError, "cannot Automatically determine which dimensions to plot, please pass 'which_indices'"
|
Xtest_full = np.zeros((Xtest.shape[0], model.X.shape[1]))
|
||||||
else:
|
def plot_function(x):
|
||||||
input_1, input_2 = which_indices
|
Xtest_full[:, [input_1, input_2]] = x
|
||||||
ax = plot_latent(Model, which_indices=[input_1, input_2], *args, **kwargs)
|
mf=model.magnification(Xtest_full)
|
||||||
# TODO: Here test if there are inducing points...
|
return mf
|
||||||
ax.plot(Model.Z[:, input_1], Model.Z[:, input_2], '^w')
|
view = ImshowController(ax, plot_function,
|
||||||
|
tuple(model.X.min(0)[:, [input_1, input_2]]) + tuple(model.X.max(0)[:, [input_1, input_2]]),
|
||||||
|
resolution, aspect=aspect, interpolation='bilinear',
|
||||||
|
cmap=pb.cm.gray)
|
||||||
|
|
||||||
|
# make sure labels are in order of input:
|
||||||
|
ulabels = []
|
||||||
|
for lab in labels:
|
||||||
|
if not lab in ulabels:
|
||||||
|
ulabels.append(lab)
|
||||||
|
|
||||||
|
marker = itertools.cycle(list(marker))
|
||||||
|
|
||||||
|
for i, ul in enumerate(ulabels):
|
||||||
|
if type(ul) is np.string_:
|
||||||
|
this_label = ul
|
||||||
|
elif type(ul) is np.int64:
|
||||||
|
this_label = 'class %i' % ul
|
||||||
|
else:
|
||||||
|
this_label = 'class %i' % i
|
||||||
|
m = marker.next()
|
||||||
|
|
||||||
|
index = np.nonzero(labels == ul)[0]
|
||||||
|
if model.input_dim == 1:
|
||||||
|
x = model.X[index, input_1]
|
||||||
|
y = np.zeros(index.size)
|
||||||
|
else:
|
||||||
|
x = model.X[index, input_1]
|
||||||
|
y = model.X[index, input_2]
|
||||||
|
ax.scatter(x, y, marker=m, s=s, color=util.plot.Tango.nextMedium(), label=this_label)
|
||||||
|
|
||||||
|
ax.set_xlabel('latent dimension %i' % input_1)
|
||||||
|
ax.set_ylabel('latent dimension %i' % input_2)
|
||||||
|
|
||||||
|
if not np.all(labels == 1.) and legend:
|
||||||
|
ax.legend(loc=0, numpoints=1)
|
||||||
|
|
||||||
|
ax.set_xlim(xmin[0], xmax[0])
|
||||||
|
ax.set_ylim(xmin[1], xmax[1])
|
||||||
|
ax.grid(b=False) # remove the grid if present, it doesn't look good
|
||||||
|
ax.set_aspect('auto') # set a nice aspect ratio
|
||||||
|
|
||||||
|
if plot_inducing:
|
||||||
|
ax.plot(model.Z[:, input_1], model.Z[:, input_2], '^w')
|
||||||
|
|
||||||
|
if updates:
|
||||||
|
ax.figure.canvas.show()
|
||||||
|
raw_input('Enter to continue')
|
||||||
|
|
||||||
|
pb.title('Magnification Factor')
|
||||||
return ax
|
return ax
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,13 @@ import numpy as np
|
||||||
import matplotlib as mpl
|
import matplotlib as mpl
|
||||||
import time
|
import time
|
||||||
import Image
|
import Image
|
||||||
#import visual
|
try:
|
||||||
|
import visual
|
||||||
|
visual_available = True
|
||||||
|
|
||||||
|
except ImportError:
|
||||||
|
visual_available = False
|
||||||
|
|
||||||
|
|
||||||
class data_show:
|
class data_show:
|
||||||
"""
|
"""
|
||||||
|
|
@ -24,7 +30,6 @@ class data_show:
|
||||||
def close(self):
|
def close(self):
|
||||||
raise NotImplementedError, "this needs to be implemented to use the data_show class"
|
raise NotImplementedError, "this needs to be implemented to use the data_show class"
|
||||||
|
|
||||||
|
|
||||||
class vpython_show(data_show):
|
class vpython_show(data_show):
|
||||||
"""
|
"""
|
||||||
the vpython_show class is a base class for all visualization methods that use vpython to display. It is initialized with a scene. If the scene is set to None it creates a scene window.
|
the vpython_show class is a base class for all visualization methods that use vpython to display. It is initialized with a scene. If the scene is set to None it creates a scene window.
|
||||||
|
|
@ -103,7 +108,7 @@ class lvm(matplotlib_show):
|
||||||
self.cid = latent_axes[0].figure.canvas.mpl_connect('axes_enter_event', self.on_enter)
|
self.cid = latent_axes[0].figure.canvas.mpl_connect('axes_enter_event', self.on_enter)
|
||||||
|
|
||||||
self.data_visualize = data_visualize
|
self.data_visualize = data_visualize
|
||||||
self.Model = model
|
self.model = model
|
||||||
self.latent_axes = latent_axes
|
self.latent_axes = latent_axes
|
||||||
self.sense_axes = sense_axes
|
self.sense_axes = sense_axes
|
||||||
self.called = False
|
self.called = False
|
||||||
|
|
@ -120,7 +125,7 @@ class lvm(matplotlib_show):
|
||||||
def modify(self, vals):
|
def modify(self, vals):
|
||||||
"""When latent values are modified update the latent representation and ulso update the output visualization."""
|
"""When latent values are modified update the latent representation and ulso update the output visualization."""
|
||||||
self.vals = vals.copy()
|
self.vals = vals.copy()
|
||||||
y = self.Model.predict(self.vals)[0]
|
y = self.model.predict(self.vals)[0]
|
||||||
self.data_visualize.modify(y)
|
self.data_visualize.modify(y)
|
||||||
self.latent_handle.set_data(self.vals[self.latent_index[0]], self.vals[self.latent_index[1]])
|
self.latent_handle.set_data(self.vals[self.latent_index[0]], self.vals[self.latent_index[1]])
|
||||||
self.axes.figure.canvas.draw()
|
self.axes.figure.canvas.draw()
|
||||||
|
|
@ -148,15 +153,15 @@ class lvm(matplotlib_show):
|
||||||
# A click in the bar chart axis for selection a dimension.
|
# A click in the bar chart axis for selection a dimension.
|
||||||
if self.sense_axes != None:
|
if self.sense_axes != None:
|
||||||
self.sense_axes.cla()
|
self.sense_axes.cla()
|
||||||
self.sense_axes.bar(np.arange(self.Model.input_dim),1./self.Model.input_sensitivity(),color='b')
|
self.sense_axes.bar(np.arange(self.model.input_dim), self.model.input_sensitivity(), color='b')
|
||||||
|
|
||||||
if self.latent_index[1] == self.latent_index[0]:
|
if self.latent_index[1] == self.latent_index[0]:
|
||||||
self.sense_axes.bar(np.array(self.latent_index[0]),1./self.Model.input_sensitivity()[self.latent_index[0]],color='y')
|
self.sense_axes.bar(np.array(self.latent_index[0]), self.model.input_sensitivity()[self.latent_index[0]], color='y')
|
||||||
self.sense_axes.bar(np.array(self.latent_index[1]),1./self.Model.input_sensitivity()[self.latent_index[1]],color='y')
|
self.sense_axes.bar(np.array(self.latent_index[1]), self.model.input_sensitivity()[self.latent_index[1]], color='y')
|
||||||
|
|
||||||
else:
|
else:
|
||||||
self.sense_axes.bar(np.array(self.latent_index[0]),1./self.Model.input_sensitivity()[self.latent_index[0]],color='g')
|
self.sense_axes.bar(np.array(self.latent_index[0]), self.model.input_sensitivity()[self.latent_index[0]], color='g')
|
||||||
self.sense_axes.bar(np.array(self.latent_index[1]),1./self.Model.input_sensitivity()[self.latent_index[1]],color='r')
|
self.sense_axes.bar(np.array(self.latent_index[1]), self.model.input_sensitivity()[self.latent_index[1]], color='r')
|
||||||
|
|
||||||
self.sense_axes.figure.canvas.draw()
|
self.sense_axes.figure.canvas.draw()
|
||||||
|
|
||||||
|
|
@ -193,7 +198,7 @@ class lvm_dimselect(lvm):
|
||||||
GPy.examples.dimensionality_reduction.BGPVLM_oil()
|
GPy.examples.dimensionality_reduction.BGPVLM_oil()
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, vals, Model, data_visualize, latent_axes=None, sense_axes=None, latent_index=[0, 1], labels=None):
|
def __init__(self, vals, model, data_visualize, latent_axes=None, sense_axes=None, latent_index=[0, 1], labels=None):
|
||||||
if latent_axes==None and sense_axes==None:
|
if latent_axes==None and sense_axes==None:
|
||||||
self.fig,(latent_axes,self.sense_axes) = plt.subplots(1,2)
|
self.fig,(latent_axes,self.sense_axes) = plt.subplots(1,2)
|
||||||
elif sense_axes==None:
|
elif sense_axes==None:
|
||||||
|
|
@ -202,7 +207,7 @@ class lvm_dimselect(lvm):
|
||||||
else:
|
else:
|
||||||
self.sense_axes = sense_axes
|
self.sense_axes = sense_axes
|
||||||
self.labels = labels
|
self.labels = labels
|
||||||
lvm.__init__(self,vals,Model,data_visualize,latent_axes,sense_axes,latent_index)
|
lvm.__init__(self,vals,model,data_visualize,latent_axes,sense_axes,latent_index)
|
||||||
self.show_sensitivities()
|
self.show_sensitivities()
|
||||||
print "use left and right mouse butons to select dimensions"
|
print "use left and right mouse butons to select dimensions"
|
||||||
|
|
||||||
|
|
@ -210,7 +215,7 @@ class lvm_dimselect(lvm):
|
||||||
def on_click(self, event):
|
def on_click(self, event):
|
||||||
|
|
||||||
if event.inaxes==self.sense_axes:
|
if event.inaxes==self.sense_axes:
|
||||||
new_index = max(0,min(int(np.round(event.xdata-0.5)),self.Model.input_dim-1))
|
new_index = max(0,min(int(np.round(event.xdata-0.5)),self.model.input_dim-1))
|
||||||
if event.button == 1:
|
if event.button == 1:
|
||||||
# Make it red if and y-axis (red=port=left) if it is a left button click
|
# Make it red if and y-axis (red=port=left) if it is a left button click
|
||||||
self.latent_index[1] = new_index
|
self.latent_index[1] = new_index
|
||||||
|
|
@ -221,7 +226,7 @@ class lvm_dimselect(lvm):
|
||||||
self.show_sensitivities()
|
self.show_sensitivities()
|
||||||
|
|
||||||
self.latent_axes.cla()
|
self.latent_axes.cla()
|
||||||
self.Model.plot_latent(which_indices=self.latent_index,
|
self.model.plot_latent(which_indices=self.latent_index,
|
||||||
ax=self.latent_axes, labels=self.labels)
|
ax=self.latent_axes, labels=self.labels)
|
||||||
self.latent_handle = self.latent_axes.plot([0],[0],'rx',mew=2)[0]
|
self.latent_handle = self.latent_axes.plot([0],[0],'rx',mew=2)[0]
|
||||||
self.modify(self.latent_values)
|
self.modify(self.latent_values)
|
||||||
|
|
@ -235,7 +240,7 @@ class lvm_dimselect(lvm):
|
||||||
|
|
||||||
def on_leave(self,event):
|
def on_leave(self,event):
|
||||||
latent_values = self.latent_values.copy()
|
latent_values = self.latent_values.copy()
|
||||||
y = self.Model.predict(latent_values[None,:])[0]
|
y = self.model.predict(latent_values[None,:])[0]
|
||||||
self.data_visualize.modify(y)
|
self.data_visualize.modify(y)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -318,7 +323,7 @@ class mocap_data_show_vpython(vpython_show):
|
||||||
for i in range(self.vals.shape[0]):
|
for i in range(self.vals.shape[0]):
|
||||||
self.spheres.append(visual.sphere(pos=(self.vals[i, 0], self.vals[i, 2], self.vals[i, 1]), radius=self.radius))
|
self.spheres.append(visual.sphere(pos=(self.vals[i, 0], self.vals[i, 2], self.vals[i, 1]), radius=self.radius))
|
||||||
self.scene.visible=True
|
self.scene.visible=True
|
||||||
|
|
||||||
def draw_edges(self):
|
def draw_edges(self):
|
||||||
self.rods = []
|
self.rods = []
|
||||||
self.line_handle = []
|
self.line_handle = []
|
||||||
|
|
@ -435,7 +440,6 @@ class mocap_data_show(matplotlib_show):
|
||||||
self.axes.set_ylim(self.y_lim)
|
self.axes.set_ylim(self.y_lim)
|
||||||
self.axes.set_zlim(self.z_lim)
|
self.axes.set_zlim(self.z_lim)
|
||||||
|
|
||||||
|
|
||||||
class stick_show(mocap_data_show_vpython):
|
class stick_show(mocap_data_show_vpython):
|
||||||
"""Show a three dimensional point cloud as a figure. Connect elements of the figure together using the matrix connect."""
|
"""Show a three dimensional point cloud as a figure. Connect elements of the figure together using the matrix connect."""
|
||||||
def __init__(self, vals, connect=None, scene=None):
|
def __init__(self, vals, connect=None, scene=None):
|
||||||
|
|
|
||||||
|
|
@ -185,7 +185,7 @@ class TanhWarpingFunction_d(WarpingFunction):
|
||||||
return z
|
return z
|
||||||
|
|
||||||
|
|
||||||
def f_inv(self, z, psi, max_iterations = 1000):
|
def f_inv(self, z, psi, max_iterations=1000, y=None):
|
||||||
"""
|
"""
|
||||||
calculate the numerical inverse of f
|
calculate the numerical inverse of f
|
||||||
|
|
||||||
|
|
@ -195,7 +195,9 @@ class TanhWarpingFunction_d(WarpingFunction):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
z = z.copy()
|
z = z.copy()
|
||||||
y = np.ones_like(z)
|
if y is None:
|
||||||
|
y = np.ones_like(z)
|
||||||
|
|
||||||
it = 0
|
it = 0
|
||||||
update = np.inf
|
update = np.inf
|
||||||
|
|
||||||
|
|
@ -205,7 +207,7 @@ class TanhWarpingFunction_d(WarpingFunction):
|
||||||
it += 1
|
it += 1
|
||||||
if it == max_iterations:
|
if it == max_iterations:
|
||||||
print "WARNING!!! Maximum number of iterations reached in f_inv "
|
print "WARNING!!! Maximum number of iterations reached in f_inv "
|
||||||
|
|
||||||
return y
|
return y
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,38 @@ core Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`domains` Module
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.core.domains
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`fitc` Module
|
||||||
|
------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.core.fitc
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`gp` Module
|
||||||
|
----------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.core.gp
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`gp_base` Module
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.core.gp_base
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`model` Module
|
:mod:`model` Module
|
||||||
-------------------
|
-------------------
|
||||||
|
|
||||||
|
|
@ -17,10 +49,10 @@ core Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`parameterised` Module
|
:mod:`parameterized` Module
|
||||||
---------------------------
|
---------------------------
|
||||||
|
|
||||||
.. automodule:: GPy.core.parameterised
|
.. automodule:: GPy.core.parameterized
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
@ -33,3 +65,27 @@ core Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`sparse_gp` Module
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.core.sparse_gp
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`svigp` Module
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.core.svigp
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`transformations` Module
|
||||||
|
-----------------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.core.transformations
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -25,14 +25,6 @@ examples Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`non_gaussian` Module
|
|
||||||
--------------------------
|
|
||||||
|
|
||||||
.. automodule:: GPy.examples.non_gaussian
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
:mod:`regression` Module
|
:mod:`regression` Module
|
||||||
------------------------
|
------------------------
|
||||||
|
|
||||||
|
|
@ -41,6 +33,14 @@ examples Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`stochastic` Module
|
||||||
|
------------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.examples.stochastic
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`tutorials` Module
|
:mod:`tutorials` Module
|
||||||
-----------------------
|
-----------------------
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,10 +1,18 @@
|
||||||
inference Package
|
inference Package
|
||||||
=================
|
=================
|
||||||
|
|
||||||
:mod:`SGD` Module
|
:mod:`conjugate_gradient_descent` Module
|
||||||
-----------------
|
----------------------------------------
|
||||||
|
|
||||||
.. automodule:: GPy.inference.SGD
|
.. automodule:: GPy.inference.conjugate_gradient_descent
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`gradient_descent_update_rules` Module
|
||||||
|
-------------------------------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.inference.gradient_descent_update_rules
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
@ -25,3 +33,19 @@ inference Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`scg` Module
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.inference.scg
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`sgd` Module
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.inference.sgd
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
|
|
|
||||||
161
doc/GPy.kern.rst
161
doc/GPy.kern.rst
|
|
@ -9,38 +9,6 @@ kern Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`Brownian` Module
|
|
||||||
----------------------
|
|
||||||
|
|
||||||
.. automodule:: GPy.kern.Brownian
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
:mod:`Matern32` Module
|
|
||||||
----------------------
|
|
||||||
|
|
||||||
.. automodule:: GPy.kern.Matern32
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
:mod:`Matern52` Module
|
|
||||||
----------------------
|
|
||||||
|
|
||||||
.. automodule:: GPy.kern.Matern52
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
:mod:`bias` Module
|
|
||||||
------------------
|
|
||||||
|
|
||||||
.. automodule:: GPy.kern.bias
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
:mod:`constructors` Module
|
:mod:`constructors` Module
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
||||||
|
|
@ -49,30 +17,6 @@ kern Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`coregionalise` Module
|
|
||||||
---------------------------
|
|
||||||
|
|
||||||
.. automodule:: GPy.kern.coregionalise
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
:mod:`exponential` Module
|
|
||||||
-------------------------
|
|
||||||
|
|
||||||
.. automodule:: GPy.kern.exponential
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
:mod:`finite_dimensional` Module
|
|
||||||
--------------------------------
|
|
||||||
|
|
||||||
.. automodule:: GPy.kern.finite_dimensional
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
:mod:`kern` Module
|
:mod:`kern` Module
|
||||||
------------------
|
------------------
|
||||||
|
|
||||||
|
|
@ -81,107 +25,10 @@ kern Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`kernpart` Module
|
Subpackages
|
||||||
----------------------
|
-----------
|
||||||
|
|
||||||
.. automodule:: GPy.kern.kernpart
|
.. toctree::
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
:mod:`linear` Module
|
GPy.kern.parts
|
||||||
--------------------
|
|
||||||
|
|
||||||
.. automodule:: GPy.kern.linear
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
:mod:`periodic_Matern32` Module
|
|
||||||
-------------------------------
|
|
||||||
|
|
||||||
.. automodule:: GPy.kern.periodic_Matern32
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
:mod:`periodic_Matern52` Module
|
|
||||||
-------------------------------
|
|
||||||
|
|
||||||
.. automodule:: GPy.kern.periodic_Matern52
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
:mod:`periodic_exponential` Module
|
|
||||||
----------------------------------
|
|
||||||
|
|
||||||
.. automodule:: GPy.kern.periodic_exponential
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
:mod:`prod` Module
|
|
||||||
------------------
|
|
||||||
|
|
||||||
.. automodule:: GPy.kern.prod
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
:mod:`prod_orthogonal` Module
|
|
||||||
-----------------------------
|
|
||||||
|
|
||||||
.. automodule:: GPy.kern.prod_orthogonal
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
:mod:`rational_quadratic` Module
|
|
||||||
--------------------------------
|
|
||||||
|
|
||||||
.. automodule:: GPy.kern.rational_quadratic
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
:mod:`rbf` Module
|
|
||||||
-----------------
|
|
||||||
|
|
||||||
.. automodule:: GPy.kern.rbf
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
:mod:`spline` Module
|
|
||||||
--------------------
|
|
||||||
|
|
||||||
.. automodule:: GPy.kern.spline
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
:mod:`symmetric` Module
|
|
||||||
-----------------------
|
|
||||||
|
|
||||||
.. automodule:: GPy.kern.symmetric
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
:mod:`sympykern` Module
|
|
||||||
-----------------------
|
|
||||||
|
|
||||||
.. automodule:: GPy.kern.sympykern
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
:mod:`white` Module
|
|
||||||
-------------------
|
|
||||||
|
|
||||||
.. automodule:: GPy.kern.white
|
|
||||||
:members:
|
|
||||||
:undoc-members:
|
|
||||||
:show-inheritance:
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ likelihoods Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`EP` Module
|
:mod:`ep` Module
|
||||||
----------------
|
----------------
|
||||||
|
|
||||||
.. automodule:: GPy.likelihoods.ep
|
.. automodule:: GPy.likelihoods.ep
|
||||||
|
|
@ -17,7 +17,7 @@ likelihoods Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`Gaussian` Module
|
:mod:`gaussian` Module
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
.. automodule:: GPy.likelihoods.gaussian
|
.. automodule:: GPy.likelihoods.gaussian
|
||||||
|
|
@ -41,3 +41,11 @@ likelihoods Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`link_functions` Module
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.likelihoods.link_functions
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ models Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`Bayesian_GPLVM` Module
|
:mod:`bayesian_gplvm` Module
|
||||||
----------------------------
|
----------------------------
|
||||||
|
|
||||||
.. automodule:: GPy.models.bayesian_gplvm
|
.. automodule:: GPy.models.bayesian_gplvm
|
||||||
|
|
@ -17,18 +17,18 @@ models Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`gp` Module
|
:mod:`fitc_classification` Module
|
||||||
----------------
|
---------------------------------
|
||||||
|
|
||||||
.. automodule:: GPy.models.gp
|
.. automodule:: GPy.models.fitc_classification
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`gplvm` Module
|
:mod:`gp_classification` Module
|
||||||
-------------------
|
-------------------------------
|
||||||
|
|
||||||
.. automodule:: GPy.models.gplvm
|
.. automodule:: GPy.models.gp_classification
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
@ -41,18 +41,26 @@ models Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`sparse_gp` Module
|
:mod:`gplvm` Module
|
||||||
-----------------------
|
-------------------
|
||||||
|
|
||||||
.. automodule:: GPy.models.sparse_gp
|
.. automodule:: GPy.models.gplvm
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`SparseGPLVM` Module
|
:mod:`mrd` Module
|
||||||
--------------------------
|
-----------------
|
||||||
|
|
||||||
.. automodule:: GPy.models.sparse_gplvm
|
.. automodule:: GPy.models.mrd
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`sparse_gp_classification` Module
|
||||||
|
--------------------------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.models.sparse_gp_classification
|
||||||
:members:
|
:members:
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
@ -65,13 +73,21 @@ models Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
.. :mod:`uncollapsed_sparse_GP` Module
|
:mod:`sparse_gplvm` Module
|
||||||
.. -----------------------------------
|
--------------------------
|
||||||
|
|
||||||
.. .. automodule:: GPy.models.uncollapsed_sparse_GP
|
.. automodule:: GPy.models.sparse_gplvm
|
||||||
.. :members:
|
:members:
|
||||||
.. :undoc-members:
|
:undoc-members:
|
||||||
.. :show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`svigp_regression` Module
|
||||||
|
------------------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.models.svigp_regression
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`warped_gp` Module
|
:mod:`warped_gp` Module
|
||||||
-----------------------
|
-----------------------
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,14 @@
|
||||||
testing Package
|
testing Package
|
||||||
===============
|
===============
|
||||||
|
|
||||||
|
:mod:`testing` Package
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.testing
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`bgplvm_tests` Module
|
:mod:`bgplvm_tests` Module
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|
||||||
|
|
@ -9,6 +17,22 @@ testing Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`cgd_tests` Module
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.testing.cgd_tests
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`checkgrad` Module
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.testing.checkgrad
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`examples_tests` Module
|
:mod:`examples_tests` Module
|
||||||
----------------------------
|
----------------------------
|
||||||
|
|
||||||
|
|
@ -33,6 +57,14 @@ testing Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`mrd_tests` Module
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.testing.mrd_tests
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`prior_tests` Module
|
:mod:`prior_tests` Module
|
||||||
-------------------------
|
-------------------------
|
||||||
|
|
||||||
|
|
@ -41,6 +73,22 @@ testing Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`psi_stat_expactation_tests` Module
|
||||||
|
----------------------------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.testing.psi_stat_expactation_tests
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`psi_stat_gradient_tests` Module
|
||||||
|
-------------------------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.testing.psi_stat_gradient_tests
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`sparse_gplvm_tests` Module
|
:mod:`sparse_gplvm_tests` Module
|
||||||
--------------------------------
|
--------------------------------
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,14 @@ util Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`classification` Module
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.util.classification
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`datasets` Module
|
:mod:`datasets` Module
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
@ -25,6 +33,14 @@ util Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`decorators` Module
|
||||||
|
------------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.util.decorators
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`linalg` Module
|
:mod:`linalg` Module
|
||||||
--------------------
|
--------------------
|
||||||
|
|
||||||
|
|
@ -41,6 +57,22 @@ util Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`mocap` Module
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.util.mocap
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`pca` Module
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.util.pca
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`plot` Module
|
:mod:`plot` Module
|
||||||
------------------
|
------------------
|
||||||
|
|
||||||
|
|
@ -49,6 +81,14 @@ util Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`plot_latent` Module
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.util.plot_latent
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`squashers` Module
|
:mod:`squashers` Module
|
||||||
-----------------------
|
-----------------------
|
||||||
|
|
||||||
|
|
@ -57,6 +97,22 @@ util Package
|
||||||
:undoc-members:
|
:undoc-members:
|
||||||
:show-inheritance:
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`univariate_Gaussian` Module
|
||||||
|
---------------------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.util.univariate_Gaussian
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
|
:mod:`visualize` Module
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
.. automodule:: GPy.util.visualize
|
||||||
|
:members:
|
||||||
|
:undoc-members:
|
||||||
|
:show-inheritance:
|
||||||
|
|
||||||
:mod:`warping_functions` Module
|
:mod:`warping_functions` Module
|
||||||
-------------------------------
|
-------------------------------
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -103,8 +103,11 @@ class Mock(object):
|
||||||
#import mock
|
#import mock
|
||||||
|
|
||||||
print "Mocking"
|
print "Mocking"
|
||||||
MOCK_MODULES = ['pylab', 'sympy', 'sympy.utilities', 'sympy.utilities.codegen', 'sympy.core.cache', 'sympy.core', 'sympy.parsing', 'sympy.parsing.sympy_parser', 'matplotlib']
|
MOCK_MODULES = ['sympy',
|
||||||
#'matplotlib', 'matplotlib.color', 'matplotlib.pyplot', 'pylab' ]
|
'sympy.utilities', 'sympy.utilities.codegen', 'sympy.core.cache',
|
||||||
|
'sympy.core', 'sympy.parsing', 'sympy.parsing.sympy_parser',
|
||||||
|
'matplotlib.pyplot'
|
||||||
|
]
|
||||||
for mod_name in MOCK_MODULES:
|
for mod_name in MOCK_MODULES:
|
||||||
sys.modules[mod_name] = Mock()
|
sys.modules[mod_name] = Mock()
|
||||||
|
|
||||||
|
|
@ -288,7 +291,7 @@ latex_elements = {
|
||||||
#'pointsize': '10pt',
|
#'pointsize': '10pt',
|
||||||
|
|
||||||
# Additional stuff for the LaTeX preamble.
|
# Additional stuff for the LaTeX preamble.
|
||||||
#'preamble': '',
|
'preamble': '\\usepackage{MnSymbol}',
|
||||||
}
|
}
|
||||||
|
|
||||||
# Grouping the document tree into LaTeX files. List of tuples
|
# Grouping the document tree into LaTeX files. List of tuples
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ For a quick start, you can have a look at one of the tutorials:
|
||||||
* `Interacting with models <tuto_interacting_with_models.html>`_
|
* `Interacting with models <tuto_interacting_with_models.html>`_
|
||||||
* `A kernel overview <tuto_kernel_overview.html>`_
|
* `A kernel overview <tuto_kernel_overview.html>`_
|
||||||
* `Writing new kernels <tuto_creating_new_kernels.html>`_
|
* `Writing new kernels <tuto_creating_new_kernels.html>`_
|
||||||
|
* `Writing new models <tuto_creating_new_models.html>`_
|
||||||
|
|
||||||
You may also be interested by some examples in the GPy/examples folder.
|
You may also be interested by some examples in the GPy/examples folder.
|
||||||
|
|
||||||
|
|
|
||||||
64
doc/tuto_creating_new_models.rst
Normal file
64
doc/tuto_creating_new_models.rst
Normal file
|
|
@ -0,0 +1,64 @@
|
||||||
|
.. _creating_new_models:
|
||||||
|
|
||||||
|
*******************
|
||||||
|
Creating new Models
|
||||||
|
*******************
|
||||||
|
|
||||||
|
In GPy all models inherit from the base class :py:class:`~GPy.core.parameterized.Parameterized`. :py:class:`~GPy.core.parameterized.Parameterized` is a class which allows for parameterization of objects. All it holds is functionality for tying, bounding and fixing of parameters. It also provides the functionality of searching and manipulating parameters by regular expression syntax. See :py:class:`~GPy.core.parameterized.Parameterized` for more information.
|
||||||
|
|
||||||
|
The :py:class:`~GPy.core.model.Model` class provides parameter introspection, objective function and optimization.
|
||||||
|
|
||||||
|
In order to fully use all functionality of :py:class:`~GPy.core.model.Model` some methods need to be implemented / overridden. In order to explain the functionality of those methods we will use a wrapper to the numpy ``rosen`` function, which holds input parameters :math:`\mathbf{X}`. Where :math:`\mathbf{X}\in\mathbb{R}^{N\times 1}`.
|
||||||
|
|
||||||
|
Obligatory methods
|
||||||
|
==================
|
||||||
|
|
||||||
|
:py:meth:`~GPy.core.model.Model.__init__` :
|
||||||
|
Initialize the model with the given parameters. In our example we have to store shape information of :math:`\mathbf X` and the parameters themselves::
|
||||||
|
|
||||||
|
self.X = X
|
||||||
|
self.num_inputs = self.X.shape[0]
|
||||||
|
assert self.X.ndim == 1, only vector inputs allowed
|
||||||
|
|
||||||
|
:py:meth:`~GPy.core.model.Model._get_params` :
|
||||||
|
Return parameters of the model as a flattened numpy array-like. So, in our example we have to return the input parameters::
|
||||||
|
|
||||||
|
return self.X.flatten()
|
||||||
|
|
||||||
|
:py:meth:`~GPy.core.model.Model._set_params` :
|
||||||
|
Set parameters, which have been fetched through :py:meth:`~GPy.core.model.Model._get_params`. In other words, "invert" the functionality of :py:meth:`~GPy.core.model.Model._get_params`::
|
||||||
|
|
||||||
|
self.X = params[:self.num_inputs*self.input_dim].reshape(self.num_inputs)
|
||||||
|
|
||||||
|
:py:meth:`~GPy.core.model.Model.log_likelihood` :
|
||||||
|
Returns the log-likelihood of the new model. For our example this is just the call to ``rosen``::
|
||||||
|
|
||||||
|
return scipy.optimize.rosen(self.X)
|
||||||
|
|
||||||
|
:py:meth:`~GPy.core.model.Model._log_likelihood_gradients` :
|
||||||
|
Returns the gradients with respect to all parameters::
|
||||||
|
|
||||||
|
return scipy.optimize.rosen_der(self.X)
|
||||||
|
|
||||||
|
|
||||||
|
Optional methods
|
||||||
|
================
|
||||||
|
|
||||||
|
If you want some special functionality please provide the following methods:
|
||||||
|
|
||||||
|
Using the pickle functionality
|
||||||
|
------------------------------
|
||||||
|
|
||||||
|
To be able to use the pickle functionality ``m.pickle(<path>)`` the methods ``getstate(self)`` and ``setstate(self, state)`` have to be provided. The convention for a ``state`` in ``GPy`` is a list of all parameters, which are needed to restore the model. All classes provided in ``GPy`` follow this convention, thus you can just append to the state of the inherited class and call the inherited class' ``setstate`` with the appropriate state.
|
||||||
|
|
||||||
|
:py:meth:`~GPy.core.model.Model.getstate` :
|
||||||
|
This method returns a state of the model, following the memento pattern. As we are inheriting from :py:class:`~GPy.core.model.Model`, we have to return the state of Model as well. In out example we have `X` and `num_inputs` as state::
|
||||||
|
|
||||||
|
return Model.getstate(self) + [self.X, self.num_inputs]
|
||||||
|
|
||||||
|
:py:meth:`~GPy.core.model.Model.setstate` :
|
||||||
|
This method restores this model with the given ``state``::
|
||||||
|
|
||||||
|
self.num_inputs = state.pop()
|
||||||
|
self.X = state.pop()
|
||||||
|
return Model.setstate(self, state)
|
||||||
|
|
@ -1,3 +1,5 @@
|
||||||
|
.. _interacting_with_models:
|
||||||
|
|
||||||
*************************************
|
*************************************
|
||||||
Interacting with models
|
Interacting with models
|
||||||
*************************************
|
*************************************
|
||||||
|
|
@ -210,6 +212,6 @@ white_variance and noise_variance are tied together.::
|
||||||
|
|
||||||
Further Reading
|
Further Reading
|
||||||
===============
|
===============
|
||||||
All of the mechansiams for dealing with parameters are baked right into GPy.core.model, from which all of the classes in GPy.models inherrit. To learn how to construct your own model, you might want to read ??link?? creating_new_models.
|
All of the mechansiams for dealing with parameters are baked right into GPy.core.model, from which all of the classes in GPy.models inherrit. To learn how to construct your own model, you might want to read :ref:`creating_new_models`.
|
||||||
|
|
||||||
By deafult, GPy uses the tnc optimizer (from scipy.optimize.tnc). To use other optimisers, and to control the setting of those optimisers, as well as other funky features like automated restarts and diagnostics, you can read the optimization tutorial ??link??.
|
By deafult, GPy uses the scg optimizer. To use other optimisers, and to control the setting of those optimisers, as well as other funky features like automated restarts and diagnostics, you can read the optimization tutorial ??link??.
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue