Merge branch 'params' of github.com:SheffieldML/GPy into params

2026-05-18 13:55:14 +02:00 · 2014-01-24 12:00:18 +00:00 · 2014-01-24 12:00:18 +00:00 · d4975c2bbd
commit d4975c2bbd
parent 77c02e577d 2c4d7cca76
50 changed files with 3632 additions and 2271 deletions
--- a/GPy/core/__init__.py
+++ b/GPy/core/__init__.py
@@ -6,6 +6,5 @@ from parameterization import priors
 from parameterization.parameterized import *
 from gp import GP
 from sparse_gp import SparseGP
-from ..inference.latent_function_inference.fitc import FITC
 from svigp import SVIGP
 from mapping import *
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -2,26 +2,48 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
-from gp_base import GPBase
-from ..util.linalg import dtrtrs, tdot
-from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
+import pylab as pb
+import warnings
+from .. import kern
+from ..util.plot import gpplot, Tango, x_frame1D, x_frame2D
+from ..util.linalg import dtrtrs
+from model import Model
+from parameterization import ObservableArray
 from .. import likelihoods
+from ..likelihoods.gaussian import Gaussian
+from ..inference.latent_function_inference import exact_gaussian_inference

-class GP(GPBase):
+class GP(Model):
    """
-    Gaussian Process model for regression and EP
+    General purpose Gaussian process model

    :param X: input observations
+    :param Y: output observations
    :param kernel: a GPy kernel, defaults to rbf+white
    :param likelihood: a GPy likelihood
-    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
-    :type normalize_X: False|True
    :rtype: model object

    .. Note:: Multiple independent outputs are allowed using columns of Y

+
    """
    def __init__(self, X, Y, kernel, likelihood, inference_method=None, name='gp'):
+        super(GP, self).__init__(name)
+
+        assert X.ndim == 2
+        self.X = ObservableArray(X)
+        self.num_data, self.input_dim = self.X.shape
+
+        assert Y.ndim == 2
+        self.Y = ObservableArray(Y)
+        assert Y.shape[0] == self.num_data
+        _, self.output_dim = self.Y.shape
+
+        assert isinstance(kernel, kern.kern)
+        self.kern = kernel
+
+        assert isinstance(likelihood, likelihoods.Likelihood)
+        self.likelihood = likelihood

        #find a sensible inference method
        if inference_method is None:
@@ -30,21 +52,22 @@ class GP(GPBase):
        else:
            inference_method = expectation_propagation
            print "defaulting to ", inference_method, "for latent function inference"
+        self.inference_method = inference_method
+
+        self.add_parameter(self.kern, gradient=self.dL_dtheta_K)
+        self.add_parameter(self.likelihood, gradient=lambda:self.posterior.dL_dtheta_lik)

-        super(GP, self).__init__(X, Y, kernel, likelihood, inference_method, name)
        self.parameters_changed()

    def parameters_changed(self):
-        super(GP, self).parameters_changed()
-        self.K = self.kern.K(self.X)
-        self.posterior = self.inference_method.inference(self.K, self.likelihood, self.Y)
-
-    def dL_dtheta_K(self):
-        return self.kern.dK_dtheta(self.posterior.dL_dK, self.X)
+        self.posterior = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y)

    def log_likelihood(self):
        return self.posterior.log_marginal

+    def dL_dtheta_K(self):
+        return self.kern.dK_dtheta(self.posterior.dL_dK, self.X)
+
    def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
        """
        Internal helper function for making predictions, does not account
@@ -87,11 +110,228 @@ class GP(GPBase):

        """
        # normalize X values
-        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
        mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts)

        # now push through likelihood
        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, **likelihood_args)
        return mean, var, _025pm, _975pm

+    def posterior_samples_f(self,X,size=10,which_parts='all',full_cov=True):
+        """
+        Samples the posterior GP at the points X.
+
+        :param X: The points at which to take the samples.
+        :type X: np.ndarray, Nnew x self.input_dim.
+        :param size: the number of a posteriori samples to plot.
+        :type size: int.
+        :param which_parts: which of the kernel functions to plot (additively).
+        :type which_parts: 'all', or list of bools.
+        :param full_cov: whether to return the full covariance matrix, or just the diagonal.
+        :type full_cov: bool.
+        :returns: Ysim: set of simulations, a Numpy array (N x samples).
+        """
+        m, v = self._raw_predict(X, which_parts=which_parts, full_cov=full_cov)
+        v = v.reshape(m.size,-1) if len(v.shape)==3 else v
+        if not full_cov:
+            Ysim = np.random.multivariate_normal(m.flatten(), np.diag(v.flatten()), size).T
+        else:
+            Ysim = np.random.multivariate_normal(m.flatten(), v, size).T
+
+        return Ysim
+
+    def posterior_samples(self,X,size=10,which_parts='all',full_cov=True,noise_model=None):
+        """
+        Samples the posterior GP at the points X.
+
+        :param X: the points at which to take the samples.
+        :type X: np.ndarray, Nnew x self.input_dim.
+        :param size: the number of a posteriori samples to plot.
+        :type size: int.
+        :param which_parts: which of the kernel functions to plot (additively).
+        :type which_parts: 'all', or list of bools.
+        :param full_cov: whether to return the full covariance matrix, or just the diagonal.
+        :type full_cov: bool.
+        :param noise_model: for mixed noise likelihood, the noise model to use in the samples.
+        :type noise_model: integer.
+        :returns: Ysim: set of simulations, a Numpy array (N x samples).
+        """
+        Ysim = self.posterior_samples_f(X, size, which_parts=which_parts, full_cov=full_cov)
+        if isinstance(self.likelihood, Gaussian):
+            noise_std = np.sqrt(self.likelihood._get_params())
+            Ysim += np.random.normal(0,noise_std,Ysim.shape)
+        elif isinstance(self.likelihood, Gaussian_Mixed_Noise):
+            assert noise_model is not None, "A noise model must be specified."
+            noise_std = np.sqrt(self.likelihood._get_params()[noise_model])
+            Ysim += np.random.normal(0,noise_std,Ysim.shape)
+        else:
+            Ysim = self.likelihood.noise_model.samples(Ysim)
+
+        return Ysim
+
+    def plot_f(self, *args, **kwargs):
+        """
+        Plot the GP's view of the world, where the data is normalized and before applying a likelihood.
+
+        This is a convenience function: we simply call self.plot with the
+        argument plot_raw set True. All args and kwargs are passed on to
+        plot.
+
+        see also: gp.plot
+        """
+        kwargs['plot_raw'] = True
+        self.plot(*args, **kwargs)
+
+    def plot(self, plot_limits=None, which_data_rows='all',
+            which_data_ycols='all', which_parts='all', fixed_inputs=[],
+            levels=20, samples=0, fignum=None, ax=None, resolution=None,
+            plot_raw=False,
+            linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
+        """
+        Plot the posterior of the GP.
+          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
+          - In two dimensions, a contour-plot shows the mean predicted function
+          - In higher dimensions, use fixed_inputs to plot the GP  with some of the inputs fixed.
+
+        Can plot only part of the data and part of the posterior functions
+        using which_data_rows, which_data_ycols and which_parts
+
+        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
+        :type plot_limits: np.array
+        :param which_data_rows: which of the training data to plot (default all)
+        :type which_data_rows: 'all' or a slice object to slice self.X, self.Y
+        :param which_data_ycols: when the data has several columns (independent outputs), only plot these
+        :type which_data_ycols: 'all' or a list of integers
+        :param which_parts: which of the kernel functions to plot (additively)
+        :type which_parts: 'all', or list of bools
+        :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
+        :type fixed_inputs: a list of tuples
+        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
+        :type resolution: int
+        :param levels: number of levels to plot in a contour plot.
+        :type levels: int
+        :param samples: the number of a posteriori samples to plot
+        :type samples: int
+        :param fignum: figure to plot on.
+        :type fignum: figure number
+        :param ax: axes to plot on.
+        :type ax: axes handle
+        :type output: integer (first output is 0)
+        :param linecol: color of line to plot.
+        :type linecol:
+        :param fillcol: color of fill
+        :param levels: for 2D plotting, the number of contour levels to use. If ax is None, a new figure is created
+        """
+        #deal with optional arguments
+        if which_data_rows == 'all':
+            which_data_rows = slice(None)
+        if which_data_ycols == 'all':
+            which_data_ycols = np.arange(self.output_dim)
+        if len(which_data_ycols)==0:
+            raise ValueError('No data selected for plotting')
+        if ax is None:
+            fig = pb.figure(num=fignum)
+            ax = fig.add_subplot(111)
+
+        #work out what the inputs are for plotting (1D or 2D)
+        fixed_dims = np.array([i for i,v in fixed_inputs])
+        free_dims = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
+
+        #one dimensional plotting
+        if len(free_dims) == 1:
+
+            #define the frame on which to plot
+            resolution = resolution or 200
+            Xnew, xmin, xmax = x_frame1D(self.X[:,free_dims], plot_limits=plot_limits)
+            Xgrid = np.empty((Xnew.shape[0],self.input_dim))
+            Xgrid[:,free_dims] = Xnew
+            for i,v in fixed_inputs:
+                Xgrid[:,i] = v
+
+            #make a prediction on the frame and plot it
+            if plot_raw:
+                m, v = self._raw_predict(Xgrid, which_parts=which_parts)
+                lower = m - 2*np.sqrt(v)
+                upper = m + 2*np.sqrt(v)
+                Y = self.Y
+            else:
+                m, v, lower, upper = self.predict(Xgrid, which_parts=which_parts)
+                Y = self.Y
+            for d in which_data_ycols:
+                gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
+                ax.plot(self.X[which_data_rows,free_dims], Y[which_data_rows, d], 'kx', mew=1.5)
+
+            #optionally plot some samples
+            if samples: #NOTE not tested with fixed_inputs
+                Ysim = self.posterior_samples(Xgrid, samples, which_parts=which_parts)
+                for yi in Ysim.T:
+                    ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
+                    #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
+
+            #set the limits of the plot to some sensible values
+            ymin, ymax = min(np.append(Y[which_data_rows, which_data_ycols].flatten(), lower)), max(np.append(Y[which_data_rows, which_data_ycols].flatten(), upper))
+            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
+            ax.set_xlim(xmin, xmax)
+            ax.set_ylim(ymin, ymax)
+
+        #2D plotting
+        elif len(free_dims) == 2:
+
+            #define the frame for plotting on
+            resolution = resolution or 50
+            Xnew, _, _, xmin, xmax = x_frame2D(self.X[:,free_dims], plot_limits, resolution)
+            Xgrid = np.empty((Xnew.shape[0],self.input_dim))
+            Xgrid[:,free_dims] = Xnew
+            for i,v in fixed_inputs:
+                Xgrid[:,i] = v
+            x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
+
+            #predict on the frame and plot
+            if plot_raw:
+                m, _ = self._raw_predict(Xgrid, which_parts=which_parts)
+                Y = self.likelihood.Y
+            else:
+                m, _, _, _ = self.predict(Xgrid, which_parts=which_parts,sampling=False)
+                Y = self.likelihood.data
+            for d in which_data_ycols:
+                m_d = m[:,d].reshape(resolution, resolution).T
+                ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
+                ax.scatter(self.X[which_data_rows, free_dims[0]], self.X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
+
+            #set the limits of the plot to some sensible values
+            ax.set_xlim(xmin[0], xmax[0])
+            ax.set_ylim(xmin[1], xmax[1])
+
+            if samples:
+                warnings.warn("Samples are rather difficult to plot for 2D inputs...")
+
+        else:
+            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
+
+
+
+    def getstate(self):
+        """
+        Get the current state of the class, here we return everything that is needed to recompute the model.
+        """
+        return Model.getstate(self) + [self.X,
+                self.num_data,
+                self.input_dim,
+                self.kern,
+                self.likelihood,
+                self.output_dim,
+                self._Xoffset,
+                self._Xscale,
+                ]
+
+    def setstate(self, state):
+        self._Xscale = state.pop()
+        self._Xoffset = state.pop()
+        self.output_dim = state.pop()
+        self.likelihood = state.pop()
+        self.kern = state.pop()
+        self.input_dim = state.pop()
+        self.num_data = state.pop()
+        self.X = state.pop()
+        Model.setstate(self, state)
+

--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@@ -1,274 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import numpy as np
-import pylab as pb
-import warnings
-from .. import kern
-from ..util.plot import gpplot, Tango, x_frame1D, x_frame2D
-from model import Model
-from parameterization import ObservableArray
-from .. import likelihoods
-from GPy.likelihoods.gaussian import Gaussian
-
-class GPBase(Model):
-    """
-    Gaussian process base model for holding shared behaviour between
-    sparse_GP and GP models.
-    """
-    def __init__(self, X, Y, kernel, likelihood, inference_method, name=''):
-        super(GPBase, self).__init__(name)
-        
-        assert X.ndim == 2
-        self.X = ObservableArray(X)
-        self.num_data, self.input_dim = self.X.shape
-
-        assert Y.ndim == 2
-        self.Y = ObservableArray(Y)
-        assert Y.shape[0] == self.num_data
-        _, self.output_dim = self.Y.shape
-
-        assert isinstance(kernel, kern.kern)
-        self.kern = kernel
-
-        assert isinstance(likelihood, likelihoods.Likelihood)
-        self.likelihood = likelihood
-
-        self.inference_method = inference_method
-
-        # reinstate this later TODO
-        normalize_X = False
-
-        if normalize_X:
-            self._Xoffset = X.mean(0)[None, :]
-            self._Xscale = X.std(0)[None, :]
-            self.X = ObservableArray((X.copy() - self._Xoffset) / self._Xscale)
-        else:
-            self._Xoffset = np.zeros((1, self.input_dim))
-            self._Xscale = np.ones((1, self.input_dim))
-        
-        self.add_parameter(self.kern, gradient=self.dL_dtheta_K)
-        self.add_parameter(self.likelihood, gradient=lambda:self.posterior.dL_dtheta_lik)
-        
-
-    def posterior_samples_f(self,X,size=10,which_parts='all',full_cov=True):
-        """
-        Samples the posterior GP at the points X.
-
-        :param X: The points at which to take the samples.
-        :type X: np.ndarray, Nnew x self.input_dim.
-        :param size: the number of a posteriori samples to plot.
-        :type size: int.
-        :param which_parts: which of the kernel functions to plot (additively).
-        :type which_parts: 'all', or list of bools.
-        :param full_cov: whether to return the full covariance matrix, or just the diagonal.
-        :type full_cov: bool.
-        :returns: Ysim: set of simulations, a Numpy array (N x samples).
-        """
-        m, v = self._raw_predict(X, which_parts=which_parts, full_cov=full_cov)
-        v = v.reshape(m.size,-1) if len(v.shape)==3 else v
-        if not full_cov:
-            Ysim = np.random.multivariate_normal(m.flatten(), np.diag(v.flatten()), size).T
-        else:
-            Ysim = np.random.multivariate_normal(m.flatten(), v, size).T
-
-        return Ysim
-
-    def posterior_samples(self,X,size=10,which_parts='all',full_cov=True,noise_model=None):
-        """
-        Samples the posterior GP at the points X.
-
-        :param X: the points at which to take the samples.
-        :type X: np.ndarray, Nnew x self.input_dim.
-        :param size: the number of a posteriori samples to plot.
-        :type size: int.
-        :param which_parts: which of the kernel functions to plot (additively).
-        :type which_parts: 'all', or list of bools.
-        :param full_cov: whether to return the full covariance matrix, or just the diagonal.
-        :type full_cov: bool.
-        :param noise_model: for mixed noise likelihood, the noise model to use in the samples.
-        :type noise_model: integer.
-        :returns: Ysim: set of simulations, a Numpy array (N x samples).
-        """
-        Ysim = self.posterior_samples_f(X, size, which_parts=which_parts, full_cov=full_cov)
-        if isinstance(self.likelihood, Gaussian):
-            noise_std = np.sqrt(self.likelihood._get_params())
-            Ysim += np.random.normal(0,noise_std,Ysim.shape)
-        elif isinstance(self.likelihood, Gaussian_Mixed_Noise):
-            assert noise_model is not None, "A noise model must be specified."
-            noise_std = np.sqrt(self.likelihood._get_params()[noise_model])
-            Ysim += np.random.normal(0,noise_std,Ysim.shape)
-        else:
-            Ysim = self.likelihood.noise_model.samples(Ysim)
-
-        return Ysim
-
-    def plot_f(self, *args, **kwargs):
-        """
-        Plot the GP's view of the world, where the data is normalized and before applying a likelihood.
-
-        This is a convenience function: we simply call self.plot with the
-        argument use_raw_predict set True. All args and kwargs are passed on to
-        plot.
-
-        see also: gp_base.plot
-        """
-        kwargs['plot_raw'] = True
-        self.plot(*args, **kwargs)
-
-    def plot(self, plot_limits=None, which_data_rows='all',
-            which_data_ycols='all', which_parts='all', fixed_inputs=[],
-            levels=20, samples=0, fignum=None, ax=None, resolution=None,
-            plot_raw=False,
-            linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
-        """
-        Plot the posterior of the GP.
-          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
-          - In two dimsensions, a contour-plot shows the mean predicted function
-          - In higher dimensions, use fixed_inputs to plot the GP  with some of the inputs fixed.
-
-        Can plot only part of the data and part of the posterior functions
-        using which_data_rowsm which_data_ycols and which_parts
-
-        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
-        :type plot_limits: np.array
-        :param which_data_rows: which of the training data to plot (default all)
-        :type which_data_rows: 'all' or a slice object to slice self.X, self.Y
-        :param which_data_ycols: when the data has several columns (independant outputs), only plot these
-        :type which_data_rows: 'all' or a list of integers
-        :param which_parts: which of the kernel functions to plot (additively)
-        :type which_parts: 'all', or list of bools
-        :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
-        :type fixed_inputs: a list of tuples
-        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
-        :type resolution: int
-        :param levels: number of levels to plot in a contour plot.
-        :type levels: int
-        :param samples: the number of a posteriori samples to plot
-        :type samples: int
-        :param fignum: figure to plot on.
-        :type fignum: figure number
-        :param ax: axes to plot on.
-        :type ax: axes handle
-        :type output: integer (first output is 0)
-        :param linecol: color of line to plot.
-        :type linecol:
-        :param fillcol: color of fill
-        :param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
-        """
-        #deal with optional arguments
-        if which_data_rows == 'all':
-            which_data_rows = slice(None)
-        if which_data_ycols == 'all':
-            which_data_ycols = np.arange(self.output_dim)
-        if len(which_data_ycols)==0:
-            raise ValueError('No data selected for plotting')
-        if ax is None:
-            fig = pb.figure(num=fignum)
-            ax = fig.add_subplot(111)
-
-        #work out what the inputs are for plotting (1D or 2D)
-        fixed_dims = np.array([i for i,v in fixed_inputs])
-        free_dims = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
-
-        #one dimensional plotting
-        if len(free_dims) == 1:
-
-            #define the frame on which to plot
-            resolution = resolution or 200
-            Xu = self.X * self._Xscale + self._Xoffset #NOTE self.X are the normalized values now
-            Xnew, xmin, xmax = x_frame1D(Xu[:,free_dims], plot_limits=plot_limits)
-            Xgrid = np.empty((Xnew.shape[0],self.input_dim))
-            Xgrid[:,free_dims] = Xnew
-            for i,v in fixed_inputs:
-                Xgrid[:,i] = v
-
-            #make a prediction on the frame and plot it
-            if plot_raw:
-                m, v = self._raw_predict(Xgrid, which_parts=which_parts)
-                lower = m - 2*np.sqrt(v)
-                upper = m + 2*np.sqrt(v)
-                Y = self.Y
-            else:
-                m, v, lower, upper = self.predict(Xgrid, which_parts=which_parts) #Compute the exact mean
-                Y = self.Y
-            for d in which_data_ycols:
-                gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
-                ax.plot(Xu[which_data_rows,free_dims], Y[which_data_rows, d], 'kx', mew=1.5)
-
-            #optionally plot some samples
-            if samples: #NOTE not tested with fixed_inputs
-                Ysim = self.posterior_samples(Xgrid, samples, which_parts=which_parts)
-                for yi in Ysim.T:
-                    ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
-                    #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
-
-            #set the limits of the plot to some sensible values
-            ymin, ymax = min(np.append(Y[which_data_rows, which_data_ycols].flatten(), lower)), max(np.append(Y[which_data_rows, which_data_ycols].flatten(), upper))
-            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
-            ax.set_xlim(xmin, xmax)
-            ax.set_ylim(ymin, ymax)
-
-        #2D plotting
-        elif len(free_dims) == 2:
-
-            #define the frame for plotting on
-            resolution = resolution or 50
-            Xu = self.X * self._Xscale + self._Xoffset #NOTE self.X are the normalized values now
-            Xnew, _, _, xmin, xmax = x_frame2D(Xu[:,free_dims], plot_limits, resolution)
-            Xgrid = np.empty((Xnew.shape[0],self.input_dim))
-            Xgrid[:,free_dims] = Xnew
-            for i,v in fixed_inputs:
-                Xgrid[:,i] = v
-            x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
-
-            #predict on the frame and plot
-            if plot_raw:
-                m, _ = self._raw_predict(Xgrid, which_parts=which_parts)
-                Y = self.likelihood.Y
-            else:
-                m, _, _, _ = self.predict(Xgrid, which_parts=which_parts,sampling=False)
-                Y = self.likelihood.data
-            for d in which_data_ycols:
-                m_d = m[:,d].reshape(resolution, resolution).T
-                ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
-                ax.scatter(self.X[which_data_rows, free_dims[0]], self.X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
-
-            #set the limits of the plot to some sensible values
-            ax.set_xlim(xmin[0], xmax[0])
-            ax.set_ylim(xmin[1], xmax[1])
-
-            if samples:
-                warnings.warn("Samples are rather difficult to plot for 2D inputs...")
-
-        else:
-            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
-
-
-
-    def getstate(self):
-        """
-        Get the current state of the class, here we return everything that is needed to recompute the model.
-        """
-        return Model.getstate(self) + [self.X,
-                self.num_data,
-                self.input_dim,
-                self.kern,
-                self.likelihood,
-                self.output_dim,
-                self._Xoffset,
-                self._Xscale,
-                ]
-
-    def setstate(self, state):
-        self._Xscale = state.pop()
-        self._Xoffset = state.pop()
-        self.output_dim = state.pop()
-        self.likelihood = state.pop()
-        self.kern = state.pop()
-        self.input_dim = state.pop()
-        self.num_data = state.pop()
-        self.X = state.pop()
-        Model.setstate(self, state)
-
-
--- a/GPy/core/mapping.py
+++ b/GPy/core/mapping.py
@@ -8,7 +8,7 @@ import pylab as pb

 class Mapping(Parameterized):
    """
-    Base model for shared behavior between models that can act like a mapping. 
+    Base model for shared behavior between models that can act like a mapping.
    """

    def __init__(self, input_dim, output_dim):
@@ -36,7 +36,7 @@ class Mapping(Parameterized):

    def df_dtheta(self, dL_df, X):
        """The gradient of the outputs of the multi-layer perceptron with respect to each of the parameters.
-        
+
        :param dL_df: gradient of the objective with respect to the function.
        :type dL_df: ndarray (num_data x output_dim)
        :param X: input locations where the function is evaluated.
@@ -44,14 +44,14 @@ class Mapping(Parameterized):
        :returns: Matrix containing gradients with respect to parameters of each output for each input data.
        :rtype: ndarray (num_params length)
        """
-        
+
        raise NotImplementedError

    def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue']):
        """

        Plot the mapping.
-        
+
        Plots the mapping associated with the model.
          - In one dimension, the function is plotted.
          - In two dimsensions, a contour-plot shows the function
@@ -110,7 +110,7 @@ class Mapping(Parameterized):
            for d in range(y.shape[1]):
                ax.plot(Xnew, f[:, d], edgecol=linecol)

-        elif self.X.shape[1] == 2: 
+        elif self.X.shape[1] == 2:
            resolution = resolution or 50
            Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
            x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
@@ -123,7 +123,7 @@ class Mapping(Parameterized):
        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"

-from GPy.core.model import Model
+from model import Model

 class Mapping_check_model(Model):
    """This is a dummy model class used as a base class for checking that the gradients of a given mapping are implemented correctly. It enables checkgradient() to be called independently on each mapping."""
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -38,7 +38,7 @@ class Model(Parameterized):
            raise KeyError, 'Gradient for {} not defined, please specify gradients for parameters to optimize'.format(p.name)
        return g
        raise NotImplementedError, "this needs to be implemented to use the model class"
-    
+
    def getstate(self):
        """
        Get the current state of the class.
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -26,43 +26,43 @@ UNFIXED = True
 class Parameterized(Constrainable, Pickleable, Observable):
    """
    Parameterized class
-    
+
    Say m is a handle to a parameterized class.

    Printing parameters:
-    
+
        - print m:           prints a nice summary over all parameters
        - print m.name:      prints details for param with name 'name'
-        - print m[regexp]: prints details for all the parameters 
+        - print m[regexp]: prints details for all the parameters
                             which match (!) regexp
        - print m['']:       prints details for all parameters
-    
+
        Fields:

            Name:       The name of the param, can be renamed!
            Value:      Shape or value, if one-valued
-            Constrain:  constraint of the param, curly "{c}" brackets indicate 
+            Constrain:  constraint of the param, curly "{c}" brackets indicate
                        some parameters are constrained by c. See detailed print
                        to get exact constraints.
            Tied_to:    which paramter it is tied to.
-    
+
    Getting and setting parameters:

        Set all values in param to one:

            m.name.to.param = 1
-    
+
    Handling of constraining, fixing and tieing parameters:
-        
+
        You can constrain parameters by calling the constrain on the param itself, e.g:
-        
+
            - m.name[:,1].constrain_positive()
            - m.name[0].tie_to(m.name[1])

        Fixing parameters will fix them to the value they are right now. If you change
        the parameters value, the param will be fixed to the new value!

-        If you want to operate on all parameters use m[''] to wildcard select all paramters 
+        If you want to operate on all parameters use m[''] to wildcard select all parameters
        and concatenate them. Printing m[''] will result in printing of all parameters in detail.
    """
    def __init__(self, name=None):
@ -96,18 +96,18 @@ class Parameterized(Constrainable, Pickleable, Observable):
 #         :param gradient: gradient method of the param
 #         :type gradient:  callable
 #         :param index:    (optional) index of the param when printing
-#         
+#
 #         (:param gradient_parent:  connect these parameters to this class, but tell
 #                         updates to highest_parent, this is needed when parameterized classes
 #                         contain parameterized classes, but want to access the parameters
-#                         of their children) 
-#                          
-# 
+#                         of their children)
+#
+#
 #         Set array (e.g. self.X) as param with name and gradient.
 #         I.e: self.set_as_parameter('curvature', self.lengthscale, self.dK_dlengthscale)
-#         
-#         Note: the order in which parameters are added can be adjusted by 
-#               giving an index, of where to put this param in printing  
+#
+#         Note: the order in which parameters are added can be adjusted by
+#               giving an index, of where to put this param in printing
 #         """
 #         if index is None:
 #             self._parameters_.append(Param(name, array, gradient))
@ -122,13 +122,13 @@ class Parameterized(Constrainable, Pickleable, Observable):
        """
        :param parameters:  the parameters to add
        :type parameters:   list of or one :py:class:`GPy.core.param.Param`
-        :param [gradients]: gradients for each param, 
+        :param [gradients]: gradients for each param,
                            one gradient per param
        :param [index]:     index of where to put parameters
-        
-        
-        Add all parameters to this param class, you can insert parameters 
-        at any given index using the :func:`list.insert` syntax 
+
+
+        Add all parameters to this param class. You can insert parameters
+        at any given index using the :func:`list.insert` syntax.
        """
        if param in self._parameters_ and index is not None:
            # make sure fixes and constraints are indexed right
@ -139,7 +139,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
                fixes_param = self._fixes_[param_slice].copy()
                self._fixes_[param_slice] = self._fixes_[dest_slice]
                self._fixes_[dest_slice] = fixes_param
-            
+
            del self._parameters_[param._parent_index_]
            self._parameters_.insert(index, param)
        elif param not in self._parameters_:
@ -148,70 +148,70 @@ class Parameterized(Constrainable, Pickleable, Observable):
                self.size = sum(p.size for p in self._parameters_)
            if index is None:
                self._parameters_.append(param)
-                
+
                # make sure fixes and constraints are indexed right
                if param._has_fixes(): fixes_param = param._fixes_.copy()
                else: fixes_param = numpy.ones(param.size, dtype=bool)
                if self._has_fixes(): self._fixes_ = np.r_[self._fixes_, fixes_param]
                elif param._has_fixes(): self._fixes_ = np.r_[np.ones(self.size, dtype=bool), fixes_param]
-            
+
            else:
                self._parameters_.insert(index, param)
-                
+
                # make sure fixes and constraints are indexed right
                if param._has_fixes(): fixes_param = param._fixes_.copy()
                else: fixes_param = numpy.ones(param.size, dtype=bool)
                ins = sum((p.size for p in self._parameters_[:index]))
                if self._has_fixes(): self._fixes_ = np.r_[self._fixes_[:ins], fixes_param, self._fixes[ins:]]
-                elif not np.all(fixes_param): 
+                elif not np.all(fixes_param):
                    self._fixes_ = np.ones(self.size+param.size, dtype=bool)
                    self._fixes_[ins:ins+param.size] = fixes_param
            self.size += param.size
        if gradient:
-            self.gradient_mapping[param] = gradient    
+            self.gradient_mapping[param] = gradient
        self._connect_parameters()
        # make sure the constraints are pulled over:
        if hasattr(param, "_constraints_") and param._constraints_ is not None:
            for t, ind in param._constraints_.iteritems():
                self.constraints.add(t, ind+self._offset_for(param))
-            param._constraints_.clear()    
+            param._constraints_.clear()
        if self._has_fixes() and np.all(self._fixes_): # ==UNFIXED
            self._fixes_= None

    def add_parameters(self, *parameters):
        """
-        convenience method for adding several 
+        convenience method for adding several
        parameters without gradient specification
        """
        [self.add_parameter(p) for p in parameters]
-        
+
    def remove_parameter(self, *names_params_indices):
        """
-        :param names_params_indices: mix of parameter_names, param objects, or indices 
-            to remove from being a param of this parameterized object. 
-             
+        :param names_params_indices: mix of parameter_names, param objects, or indices
+            to remove from being a param of this parameterized object.
+
            note: if it is a string object it will not (!) be regexp-matched
                  automatically.
        """
-        self._parameters_ = [p for p in self._parameters_ 
-                        if not (p._parent_index_ in names_params_indices 
+        self._parameters_ = [p for p in self._parameters_
+                        if not (p._parent_index_ in names_params_indices
                                or p.name in names_params_indices
                                or p in names_params_indices)]
        self._connect_parameters()
-        
+
    def parameters_changed(self):
        """
-        This method gets called when parameters have changed. 
-        Another way of listening to param changes is to 
-        add self as a listener to the param, such that 
+        This method gets called when parameters have changed.
+        Another way of listening to param changes is to
+        add self as a listener to the param, such that
        updates get passed through. See :py:function:``GPy.core.param.Observable.add_observer``
        """
        # will be called as soon as paramters have changed
        pass
-    
+
    def _connect_parameters(self):
        # connect parameterlist to this parameterized object
-        # This just sets up the right connection for the params objects 
+        # This just sets up the right connection for the params objects
        # to be used as parameters
        if not hasattr(self, "_parameters_") or len(self._parameters_) < 1:
            # no parameters for this class
@ -228,7 +228,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
            not_unique = []
            sizes.append(p.size+sizes[-1])
            self._param_slices_.append(slice(sizes[-2], sizes[-1]))
-            pname = adjust_name_for_printing(p.name)  
+            pname = adjust_name_for_printing(p.name)
            # and makes sure to not delete programmatically added parameters
            if pname in self.__dict__:
                if isinstance(self.__dict__[pname], (Parameterized, Param)):
@ -273,7 +273,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
        Get the current state of the class,
        here just all the indices, rest can get recomputed
        For inheriting from Parameterized:
- 
+
        Allways append the state of the inherited object
        and call down to the inherited object in setstate!!
        """
@ -285,7 +285,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
                #self.gradient_mapping,
                self._added_names_,
                ]
-        
+
    def setstate(self, state):
        self._added_names_ = state.pop()
        #self.gradient_mapping = state.pop()
@ -361,7 +361,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
                return self._param_slices_[param._direct_parent_._get_original(param)._parent_index_].start
            return self._offset_for(param._direct_parent_) + param._direct_parent_._offset_for(param)
        return 0
-    
+
    def _raveled_index_for(self, param):
        """
        get the raveled index for a param
@ -369,7 +369,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
        param inside this parameterized logic.
        """
        return param._raveled_index() + self._offset_for(param)
-    
+
    def _raveled_index(self):
        """
        get the raveled index for this object,
@ -441,7 +441,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
    def _add_constrain(self, param, transform, warning=True):
        rav_i = self._raveled_index_for(param)
        reconstrained = self._remove_constrain(param, index=rav_i) # remove constraints before
-        # if removing constraints before adding new is not wanted, just delete the above line!        
+        # if removing constraints before adding new is not wanted, just delete the above line!
        self.constraints.add(transform, rav_i)
        param = self._get_original(param)
        param._set_params(transform.initialize(param._get_params()))
@ -454,7 +454,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
        if not transforms:
            transforms = self.constraints.properties()
        removed_indices = numpy.array([]).astype(int)
-        if "index" in kwargs: index = kwargs['index'] 
+        if "index" in kwargs: index = kwargs['index']
        else: index = self._raveled_index_for(param)
        for constr in transforms:
            removed = self.constraints.remove(constr, index)
@ -505,7 +505,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
        if paramlist is None:
            paramlist = self.grep_param_names(name)
        if len(paramlist) < 1: raise AttributeError, name
-        if len(paramlist) == 1: 
+        if len(paramlist) == 1:
            if isinstance(paramlist[-1], Parameterized):
                paramlist = paramlist[-1].flattened_parameters
                if len(paramlist) != 1:
@ -519,7 +519,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
 #     def __getattr__(self, name):
 #         return self.__getitem__(name)
 #     def __getattribute__(self, name):
-#         #try: 
+#         #try:
 #             return object.__getattribute__(self, name)
        #except AttributeError:
        #    _, a, tb = sys.exc_info()
@ -571,7 +571,7 @@ class Parameterized(Constrainable, Pickleable, Observable):
    def _ties_str(self):
        return [','.join(x._ties_str) for x in self.flattened_parameters]
    def __str__(self, header=True):
-        
+
        name  = adjust_name_for_printing(self.name) + "."
        constrs = self._constraints_str; ts = self._ties_str
        desc = self._description_str; names = self.parameter_names
@ -592,463 +592,4 @@ class Parameterized(Constrainable, Pickleable, Observable):
        return '\n'.format(sep).join(to_print)
    pass

-# 
-# class Parameterized_old(object):
-#     def __init__(self):
-#         """
-#         This is the base class for model and kernel. Mostly just handles tieing and constraining of parameters
-#         """
-#         self.tied_indices = []
-#         self.fixed_indices = []
-#         self.fixed_values = []
-#         self.constrained_indices = []
-#         self.constraints = []
-#  
-#     def _get_params(self):
-#         raise NotImplementedError, "this needs to be implemented to use the Parameterized class"
-#     def _set_params(self, x):
-#         raise NotImplementedError, "this needs to be implemented to use the Parameterized class"
-#  
-#     def _get_param_names(self):
-#         raise NotImplementedError, "this needs to be implemented to use the Parameterized class"
-#     #def _get_print_names(self):
-#     #    """ Override for which parameter_names to print out, when using print m """
-#     #    return self._get_param_names()
-#  
-#     def pickle(self, filename, protocol=None):
-#         if protocol is None:
-#             if self._has_get_set_state():
-#                 protocol = 0
-#             else:
-#                 protocol = -1
-#         with open(filename, 'w') as f:
-#             cPickle.dump(self, f, protocol)
-#  
-#     def copy(self):
-#         """Returns a (deep) copy of the current model """
-#         return copy.deepcopy(self)
-#  
-#     def __getstate__(self):
-#         if self._has_get_set_state():
-#             return self.getstate()
-#         return self.__dict__
-#  
-#     def __setstate__(self, state):
-#         if self._has_get_set_state():
-#             self.setstate(state) # set state
-#             self._set_params(self._get_params()) # restore all values
-#             return
-#         self.__dict__ = state
-#  
-#     def _has_get_set_state(self):
-#         return 'getstate' in vars(self.__class__) and 'setstate' in vars(self.__class__)
-#  
-#     def getstate(self):
-#         """
-#         Get the current state of the class,
-#         here just all the indices, rest can get recomputed
-#         For inheriting from Parameterized:
-#  
-#         Allways append the state of the inherited object
-#         and call down to the inherited object in setstate!!
-#         """
-#         return [self.tied_indices,
-#                 self.fixed_indices,
-#                 self.fixed_values,
-#                 self.constrained_indices,
-#                 self.constraints]
-#  
-#     def setstate(self, state):
-#         self.constraints = state.pop()
-#         self.constrained_indices = state.pop()
-#         self.fixed_values = state.pop()
-#         self.fixed_indices = state.pop()
-#         self.tied_indices = state.pop()
-#  
-#     def __getitem__(self, regexp, return_names=False):
-#         """
-#         Get a model param by name.  The name is applied as a regular
-#         expression and all parameters that match that regular expression are
-#         returned.
-#         """
-#         matches = self.grep_param_names(regexp)
-#         if len(matches):
-#             if return_names:
-#                 return self._get_params()[matches], np.asarray(self._get_param_names())[matches].tolist()
-#             else:
-#                 return self._get_params()[matches]
-#         else:
-#             raise AttributeError, "no param matches %s" % regexp
-#  
-#     def __setitem__(self, name, val):
-#         """
-#         Set model param(s) by name. The name is provided as a regular
-#         expression. All parameters matching that regular expression are set to
-#         the given value.
-#         """
-#         matches = self.grep_param_names(name)
-#         if len(matches):
-#             val = np.array(val)
-#             assert (val.size == 1) or val.size == len(matches), "Shape mismatch: {}:({},)".format(val.size, len(matches))
-#             x = self._get_params()
-#             x[matches] = val
-#             self._set_params(x)
-#         else:
-#             raise AttributeError, "no param matches %s" % name
-#  
-#     def tie_params(self, regexp):
-#         """
-#         Tie (all!) parameters matching the regular expression `regexp`. 
-#         """
-#         matches = self.grep_param_names(regexp)
-#         assert matches.size > 0, "need at least something to tie together"
-#         if len(self.tied_indices):
-#             assert not np.any(matches[:, None] == np.hstack(self.tied_indices)), "Some indices are already tied!"
-#         self.tied_indices.append(matches)
-#         # TODO only one of the priors will be evaluated. Give a warning message if the priors are not identical
-#         if hasattr(self, 'prior'):
-#             pass
-#  
-#         self._set_params_transformed(self._get_params_transformed()) # sets tied parameters to single value
-#  
-#     def untie_everything(self):
-#         """Unties all parameters by setting tied_indices to an empty list."""
-#         self.tied_indices = []
-#  
-#     def grep_param_names(self, regexp, transformed=False, search=False):
-#         """
-#         :param regexp: regular expression to select param parameter_names
-#         :type regexp: re | str | int
-#         :rtype: the indices of self._get_param_names which match the regular expression.
-#  
-#         Note:-
-#           Other objects are passed through - i.e. integers which weren't meant for grepping
-#         """
-#  
-#         if transformed:
-#             parameter_names = self._get_param_names_transformed()
-#         else:
-#             parameter_names = self._get_param_names()
-#  
-#         if type(regexp) in [str, np.string_, np.str]:
-#             regexp = re.compile(regexp)
-#         elif type(regexp) is re._pattern_type:
-#             pass
-#         else:
-#             return regexp
-#         if search:
-#             return np.nonzero([regexp.search(name) for name in parameter_names])[0]
-#         else:
-#             return np.nonzero([regexp.match(name) for name in parameter_names])[0]
-#  
-#     def num_params_transformed(self):
-#         removed = 0
-#         for tie in self.tied_indices:
-#             removed += tie.size - 1
-#  
-#         for fix in self.fixed_indices:
-#             removed += fix.size
-#  
-#         return len(self._get_params()) - removed
-#  
-#     def unconstrain(self, regexp):
-#         """Unconstrain matching parameters.  Does not untie parameters"""
-#         matches = self.grep_param_names(regexp)
-#  
-#         # tranformed contraints:
-#         for match in matches:
-#             self.constrained_indices = [i[i <> match] for i in self.constrained_indices]
-#  
-#         # remove empty constraints
-#         tmp = zip(*[(i, t) for i, t in zip(self.constrained_indices, self.constraints) if len(i)])
-#         if tmp:
-#             self.constrained_indices, self.constraints = zip(*[(i, t) for i, t in zip(self.constrained_indices, self.constraints) if len(i)])
-#             self.constrained_indices, self.constraints = list(self.constrained_indices), list(self.constraints)
-#  
-#         # fixed:
-#         self.fixed_values = [np.delete(values, np.nonzero(np.sum(indices[:, None] == matches[None, :], 1))[0]) for indices, values in zip(self.fixed_indices, self.fixed_values)]
-#         self.fixed_indices = [np.delete(indices, np.nonzero(np.sum(indices[:, None] == matches[None, :], 1))[0]) for indices in self.fixed_indices]
-#  
-#         # remove empty elements
-#         tmp = [(i, v) for i, v in zip(self.fixed_indices, self.fixed_values) if len(i)]
-#         if tmp:
-#             self.fixed_indices, self.fixed_values = zip(*tmp)
-#             self.fixed_indices, self.fixed_values = list(self.fixed_indices), list(self.fixed_values)
-#         else:
-#             self.fixed_indices, self.fixed_values = [], []
-#  
-#     def constrain_negative(self, regexp, warning=True):
-#         """ Set negative constraints. """
-#         self.constrain(regexp, transformations.NegativeLogexp(), warning)
-#  
-#     def constrain_positive(self, regexp, warning=True):
-#         """ Set positive constraints. """
-#         self.constrain(regexp, transformations.Logexp(), warning)
-#  
-#     def constrain_bounded(self, regexp, lower, upper, warning=True):
-#         """ Set bounded constraints. """
-#         self.constrain(regexp, transformations.Logistic(lower, upper), warning)
-#  
-#     def all_constrained_indices(self):
-#         if len(self.constrained_indices) or len(self.fixed_indices):
-#             return np.hstack(self.constrained_indices + self.fixed_indices)
-#         else:
-#             return np.empty(shape=(0,))
-#  
-#     def constrain(self, regexp, transform, warning=True):
-#         assert isinstance(transform, transformations.Transformation)
-#  
-#         matches = self.grep_param_names(regexp)
-#         overlap = set(matches).intersection(set(self.all_constrained_indices()))
-#         if overlap:
-#             self.unconstrain(np.asarray(list(overlap)))
-#             if warning:
-#                 print 'Warning: re-constraining these parameters'
-#                 pn = self._get_param_names()
-#                 for i in overlap:
-#                     print pn[i]
-#  
-#         self.constrained_indices.append(matches)
-#         self.constraints.append(transform)
-#         x = self._get_params()
-#         x[matches] = transform.initialize(x[matches])
-#         self._set_params(x)
-#  
-#     def constrain_fixed(self, regexp, value=None, warning=True):
-#         """
-#  
-#         :param regexp: which parameters need to be fixed.
-#         :type regexp: ndarray(dtype=int) or regular expression object or string
-#         :param value: the vlaue to fix the parameters to. If the value is not specified,
-#                  the param is fixed to the current value
-#         :type value: float
-#  
-#         **Notes**
-#  
-#         Fixing a param which is tied to another, or constrained in some way will result in an error.
-#  
-#         To fix multiple parameters to the same value, simply pass a regular expression which matches both param parameter_names, or pass both of the indexes.
-#  
-#         """
-#         matches = self.grep_param_names(regexp)
-#         overlap = set(matches).intersection(set(self.all_constrained_indices()))
-#         if overlap:
-#             self.unconstrain(np.asarray(list(overlap)))
-#             if warning:
-#                 print 'Warning: re-constraining these parameters'
-#                 pn = self._get_param_names()
-#                 for i in overlap:
-#                     print pn[i]
-#  
-#         self.fixed_indices.append(matches)
-#         if value != None:
-#             self.fixed_values.append(value)
-#         else:
-#             self.fixed_values.append(self._get_params()[self.fixed_indices[-1]])
-#  
-#         # self.fixed_values.append(value)
-#         self._set_params_transformed(self._get_params_transformed())
-#  
-#     def _get_params_transformed(self):
-#         """use self._get_params to get the 'true' parameters of the model, which are then tied, constrained and fixed"""
-#         x = self._get_params()
-#         [np.put(x, i, t.finv(x[i])) for i, t in zip(self.constrained_indices, self.constraints)]
-#  
-#         to_remove = self.fixed_indices + [t[1:] for t in self.tied_indices]
-#         if len(to_remove):
-#             return np.delete(x, np.hstack(to_remove))
-#         else:
-#             return x
-#  
-#     def _set_params_transformed(self, x):
-#         """ takes the vector x, which is then modified (by untying, reparameterising or inserting fixed values), and then call self._set_params"""
-#         self._set_params(self._untransform_params(x))
-#  
-#     def _untransform_params(self, x):
-#         """
-#         The Transformation required for _set_params_transformed.
-#  
-#         This moves the vector x seen by the optimiser (unconstrained) to the
-#         valid param vector seen by the model
-#  
-#         Note:
-#           - This function is separate from _set_params_transformed for downstream flexibility
-#         """
-#         # work out how many places are fixed, and where they are. tricky logic!
-#         fix_places = self.fixed_indices + [t[1:] for t in self.tied_indices]
-#         if len(fix_places):
-#             fix_places = np.hstack(fix_places)
-#             Nfix_places = fix_places.size
-#         else:
-#             Nfix_places = 0
-#  
-#         free_places = np.setdiff1d(np.arange(Nfix_places + x.size, dtype=np.int), fix_places)
-#  
-#         # put the models values in the vector xx
-#         xx = np.zeros(Nfix_places + free_places.size, dtype=np.float64)
-#  
-#         xx[free_places] = x
-#         [np.put(xx, i, v) for i, v in zip(self.fixed_indices, self.fixed_values)]
-#         [np.put(xx, i, v) for i, v in [(t[1:], xx[t[0]]) for t in self.tied_indices] ]
-#  
-#         [np.put(xx, i, t.f(xx[i])) for i, t in zip(self.constrained_indices, self.constraints)]
-#         if hasattr(self, 'debug'):
-#             stop # @UndefinedVariable
-#  
-#         return xx
-#  
-#     def _get_param_names_transformed(self):
-#         """
-#         Returns the param parameter_names as propagated after constraining,
-#         tying or fixing, i.e. a list of the same length as _get_params_transformed()
-#         """
-#         n = self._get_param_names()
-#  
-#         # remove/concatenate the tied param parameter_names
-#         if len(self.tied_indices):
-#             for t in self.tied_indices:
-#                 n[t[0]] = "<tie>".join([n[tt] for tt in t])
-#             remove = np.hstack([t[1:] for t in self.tied_indices])
-#         else:
-#             remove = np.empty(shape=(0,), dtype=np.int)
-#  
-#         # also remove the fixed params
-#         if len(self.fixed_indices):
-#             remove = np.hstack((remove, np.hstack(self.fixed_indices)))
-#  
-#         # add markers to show that some variables are constrained
-#         for i, t in zip(self.constrained_indices, self.constraints):
-#             for ii in i:
-#                 n[ii] = n[ii] + t.__str__()
-#  
-#         n = [nn for i, nn in enumerate(n) if not i in remove]
-#         return n
-#  
-#     #@property
-#     #def all(self):
-#     #    return self.__str__(self._get_param_names())
-#  
-#  
-#     #def __str__(self, parameter_names=None, nw=30):
-#     def __str__(self, nw=30):
-#         """
-#         Return a string describing the param parameter_names and their ties and constraints
-#         """
-#         parameter_names = self._get_param_names()
-#         #if parameter_names is None:
-#         #    parameter_names = self._get_print_names()
-#         #name_indices = self.grep_param_names("|".join(parameter_names))
-#         N = len(parameter_names)
-#  
-#         if not N:
-#             return "This object has no free parameters."
-#         header = ['Name', 'Value', 'Constraints', 'Ties']
-#         values = self._get_params() # map(str,self._get_params())
-#         #values = self._get_params()[name_indices] # map(str,self._get_params())
-#         # sort out the constraints
-#         constraints = [''] * len(parameter_names)
-#         #constraints = [''] * len(self._get_param_names())
-#         for i, t in zip(self.constrained_indices, self.constraints):
-#             for ii in i:
-#                 constraints[ii] = t.__str__()
-#         for i in self.fixed_indices:
-#             for ii in i:
-#                 constraints[ii] = 'Fixed'
-#         # sort out the ties
-#         ties = [''] * len(parameter_names)
-#         for i, tie in enumerate(self.tied_indices):
-#             for j in tie:
-#                 ties[j] = '(' + str(i) + ')'
-#  
-#         if values.size == 1:
-#             values = ['%.4f' %float(values)]
-#         else:
-#             values = ['%.4f' % float(v) for v in values]
-#         max_names = max([len(parameter_names[i]) for i in range(len(parameter_names))] + [len(header[0])])
-#         max_values = max([len(values[i]) for i in range(len(values))] + [len(header[1])])
-#         max_constraint = max([len(constraints[i]) for i in range(len(constraints))] + [len(header[2])])
-#         max_ties = max([len(ties[i]) for i in range(len(ties))] + [len(header[3])])
-#         cols = np.array([max_names, max_values, max_constraint, max_ties]) + 4
-#         # columns = cols.sum()
-#  
-#         header_string = ["{h:^{col}}".format(h=header[i], col=cols[i]) for i in range(len(cols))]
-#         header_string = map(lambda x: '|'.join(x), [header_string])
-#         separator = '-' * len(header_string[0])
-#         param_string = ["{n:^{c0}}|{v:^{c1}}|{c:^{c2}}|{t:^{c3}}".format(n=parameter_names[i], v=values[i], c=constraints[i], t=ties[i], c0=cols[0], c1=cols[1], c2=cols[2], c3=cols[3]) for i in range(len(values))]
-#  
-#  
-#         return ('\n'.join([header_string[0], separator] + param_string)) + '\n'
-#  
-#     def grep_model(self,regexp):
-#         regexp_indices = self.grep_param_names(regexp)
-#         all_names = self._get_param_names()
-#  
-#         parameter_names = [all_names[pj] for pj in regexp_indices]
-#         N = len(parameter_names)
-#  
-#         if not N:
-#             return "Match not found."
-#  
-#         header = ['Name', 'Value', 'Constraints', 'Ties']
-#         all_values = self._get_params()
-#         values = np.array([all_values[pj] for pj in regexp_indices])
-#         constraints = [''] * len(parameter_names)
-#  
-#         _constrained_indices,aux = self._pick_elements(regexp_indices,self.constrained_indices)
-#         _constraints_ = [self.constraints[pj] for pj in aux]
-#  
-#         for i, t in zip(_constrained_indices, _constraints_):
-#             for ii in i:
-#                 iii = regexp_indices.tolist().index(ii)
-#                 constraints[iii] = t.__str__()
-#  
-#         _fixed_indices,aux = self._pick_elements(regexp_indices,self.fixed_indices)
-#         for i in _fixed_indices:
-#             for ii in i:
-#                 iii = regexp_indices.tolist().index(ii)
-#                 constraints[ii] = 'Fixed'
-#  
-#         _tied_indices,aux = self._pick_elements(regexp_indices,self.tied_indices)
-#         ties = [''] * len(parameter_names)
-#         for i,ti in zip(_tied_indices,aux):
-#             for ii in i:
-#                 iii = regexp_indices.tolist().index(ii)
-#                 ties[iii] = '(' + str(ti) + ')'
-#  
-#         if values.size == 1:
-#             values = ['%.4f' %float(values)]
-#         else:
-#             values = ['%.4f' % float(v) for v in values]
-#  
-#         max_names = max([len(parameter_names[i]) for i in range(len(parameter_names))] + [len(header[0])])
-#         max_values = max([len(values[i]) for i in range(len(values))] + [len(header[1])])
-#         max_constraint = max([len(constraints[i]) for i in range(len(constraints))] + [len(header[2])])
-#         max_ties = max([len(ties[i]) for i in range(len(ties))] + [len(header[3])])
-#         cols = np.array([max_names, max_values, max_constraint, max_ties]) + 4
-#  
-#         header_string = ["{h:^{col}}".format(h=header[i], col=cols[i]) for i in range(len(cols))]
-#         header_string = map(lambda x: '|'.join(x), [header_string])
-#         separator = '-' * len(header_string[0])
-#         param_string = ["{n:^{c0}}|{v:^{c1}}|{c:^{c2}}|{t:^{c3}}".format(n=parameter_names[i], v=values[i], c=constraints[i], t=ties[i], c0=cols[0], c1=cols[1], c2=cols[2], c3=cols[3]) for i in range(len(values))]
-#  
-#         print header_string[0]
-#         print separator
-#         for string in param_string:
-#             print string
-#  
-#     def _pick_elements(self,regexp_ind,array_list):
-#         """Removes from array_list the elements different from regexp_ind"""
-#         new_array_list = [] #New list with elements matching regexp_ind
-#         array_indices = [] #Indices that matches the arrays in new_array_list and array_list
-#  
-#         array_index = 0
-#         for array in array_list:
-#             _new = []
-#             for ai in array:
-#                 if ai in regexp_ind:
-#                     _new.append(ai)
-#             if len(_new):
-#                 new_array_list.append(np.array(_new))
-#                 array_indices.append(array_index)
-#             array_index += 1
-#         return new_array_list, array_indices
+
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@ -4,12 +4,12 @@
 import numpy as np
 import pylab as pb
 from ..util.linalg import mdot, tdot, symmetrify, backsub_both_sides, chol_inv, dtrtrs, dpotrs, dpotri
-from gp_base import GPBase
-from GPy.core import Param
+from gp import GP
+from parameterization.param import Param

-class SparseGP(GPBase):
+class SparseGP(GP):
    """
-    Variational sparse GP model
+    A general purpose Sparse GP model

    :param X: inputs
    :type X: np.ndarray (num_data x input_dim)
@ -19,17 +19,25 @@ class SparseGP(GPBase):
    :type kernel: a GPy.kern.kern instance
    :param X_variance: The uncertainty in the measurements of X (Gaussian variance)
    :type X_variance: np.ndarray (num_data x input_dim) | None
-    :param Z: inducing inputs (optional, see note)
-    :type Z: np.ndarray (num_inducing x input_dim) | None
+    :param Z: inducing inputs
+    :type Z: np.ndarray (num_inducing x input_dim)
    :param num_inducing: Number of inducing points (optional, default 10. Ignored if Z is not None)
    :type num_inducing: int
-    :param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales)
-    :type normalize_(X|Y): bool

    """

-    def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False, name='sparse gp'):
-        GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X, name=name)
+    def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None, X_variance=None, name='sparse gp'):
+
+        #pick a sensible inference method
+        if inference_method is None:
+            if isinstance(likelihood, likelihoods.Gaussian):
+                inference_method = varDTC.Gaussian_inference()
+        else:
+            #inference_method = ??
+            raise NotImplementedError, "what to do what to do?"
+            print "defaulting to ", inference_method, "for latent function inference"
+
+        GP.__init__(self, X, Y, likelihood, inference_method, kernel, name)

        self.Z = Z
        self.num_inducing = Z.shape[0]
@ -42,39 +50,13 @@ class SparseGP(GPBase):
            self.has_uncertain_inputs = True
            self.X_variance = X_variance

-        if normalize_X:
-            self.Z = (self.Z.copy() - self._Xoffset) / self._Xscale
-
-        # normalize X uncertainty also
-        if self.has_uncertain_inputs:
-            self.X_variance /= np.square(self._Xscale)
-
-        self._const_jitter = None
-
        self.Z = Param('inducing inputs', self.Z)
        self.add_parameter(self.Z, gradient=self.dL_dZ, index=0)
        self.add_parameter(self.kern, gradient=self.dL_dtheta)
        self.add_parameter(self.likelihood, gradient=lambda:self.likelihood._gradients(partial=self.partial_for_likelihood))
-        #self.Z.add_observer(self, lambda Z: self._compute_kernel_matrices() or self._computations())

-    def getstate(self):
-        """
-        Get the current state of the class,
-        here just all the indices, rest can get recomputed
-        """
-        return GPBase.getstate(self) + [self.Z,
-                self.num_inducing,
-                self.has_uncertain_inputs,
-                self.X_variance]

-    def setstate(self, state):
-        self.X_variance = state.pop()
-        self.has_uncertain_inputs = state.pop()
-        self.num_inducing = state.pop()
-        self.Z = state.pop()
-        GPBase.setstate(self, state)
-
-    def _compute_kernel_matrices(self):
+    def parameters_changed(self):
        # kernel computations, using BGPLVM notation
        self.Kmm = self.kern.K(self.Z)
        if self.has_uncertain_inputs:
@ -85,35 +67,11 @@ class SparseGP(GPBase):
            self.psi0 = self.kern.Kdiag(self.X)
            self.psi1 = self.kern.K(self.X, self.Z)
            self.psi2 = None
-    def parameters_changed(self):
-        self._compute_kernel_matrices()
-        self._computations()
-        self.Cpsi1V = None
-        self.dL_dK = self.dL_dKmm
+
+        #self.posterior = self.inference_method.inference(??)
        super(SparseGP, self).parameters_changed()


-    def update_likelihood_approximation(self, **kwargs):
-        """
-        Approximates a non-gaussian likelihood using Expectation Propagation
-
-        For a Gaussian likelihood, no iteration is required:
-        this function does nothing
-        """
-        if not isinstance(self.likelihood, Gaussian): # Updates not needed for Gaussian likelihood
-            self.likelihood.restart()
-            if self.has_uncertain_inputs:
-                Lmi = chol_inv(self._Lm)
-                Kmmi = tdot(Lmi.T)
-                diag_tr_psi2Kmmi = np.array([np.trace(psi2_Kmmi) for psi2_Kmmi in np.dot(self.psi2, Kmmi)])
-
-                self.likelihood.fit_FITC(self.Kmm, self.psi1.T, diag_tr_psi2Kmmi, **kwargs) # This uses the fit_FITC code, but does not perfomr a FITC-EP.#TODO solve potential confusion
-                # raise NotImplementedError, "EP approximation not implemented for uncertain inputs"
-            else:
-                self.likelihood.fit_DTC(self.Kmm, self.psi1.T, **kwargs)
-                # self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0)
-                self._set_params(self._get_params()) # update the GP
-
    def dL_dtheta(self):
        """
        Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel
@ -143,82 +101,14 @@ class SparseGP(GPBase):

    def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
        """
-        Internal helper function for making predictions, does not account for
-        normalization or likelihood function
+        Make a prediction for the latent function values
        """
-
-        Bi, _ = dpotri(self.LB, lower=0) # WTH? this lower switch should be 1, but that doesn't work!
-        symmetrify(Bi)
-        Kmmi_LmiBLmi = backsub_both_sides(self._Lm, np.eye(self.num_inducing) - Bi)
-
-        if self.Cpsi1V is None:
-            psi1V = np.dot(self.psi1.T, self.likelihood.V)
-            tmp, _ = dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0)
-            tmp, _ = dpotrs(self.LB, tmp, lower=1)
-            self.Cpsi1V, _ = dtrtrs(self._Lm, tmp, lower=1, trans=1)
-
-        if X_variance_new is None:
-            Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
-            mu = np.dot(Kx.T, self.Cpsi1V)
-            if full_cov:
-                Kxx = self.kern.K(Xnew, which_parts=which_parts)
-                var = Kxx - mdot(Kx.T, Kmmi_LmiBLmi, Kx) # NOTE this won't work for plotting
-            else:
-                Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts)
-                var = Kxx - np.sum(Kx * np.dot(Kmmi_LmiBLmi, Kx), 0)
-        else:
-            # assert which_parts=='all', "swithching out parts of variational kernels is not implemented"
-            Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts
-            mu = np.dot(Kx, self.Cpsi1V)
-            if full_cov:
-                raise NotImplementedError, "TODO"
-            else:
-                Kxx = self.kern.psi0(self.Z, Xnew, X_variance_new)
-                psi2 = self.kern.psi2(self.Z, Xnew, X_variance_new)
-                var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
-
-        return mu, var[:, None]
-
-    def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False, **likelihood_args):
-        """
-        Predict the function(s) at the new point(s) Xnew.
-
-        **Arguments**
-
-        :param Xnew: The points at which to make a prediction
-        :type Xnew: np.ndarray, Nnew x self.input_dim
-        :param X_variance_new: The uncertainty in the prediction points
-        :type X_variance_new: np.ndarray, Nnew x self.input_dim
-        :param which_parts:  specifies which outputs kernel(s) to use in prediction
-        :type which_parts: ('all', list of bools)
-        :param full_cov: whether to return the full covariance matrix, or just the diagonal
-        :type full_cov: bool
-        :rtype: posterior mean,  a Numpy array, Nnew x self.input_dim
-        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
-        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays,  Nnew x self.input_dim
-
-
-           If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew.
-           This is to allow for different normalizations of the output dimensions.
-
-        """
-        # normalize X values
-        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
-        if X_variance_new is not None:
-            X_variance_new = X_variance_new / self._Xscale ** 2
-
-        # here's the actual prediction by the GP model
-        mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts)
-
-        # now push through likelihood
-        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, **likelihood_args)
-
-        return mean, var, _025pm, _975pm
+        #TODO!!!


    def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None):
        """
-        Plot the GP's view of the world, where the data is normalized and the
+        Plot the belief in the latent function, the "GP's view of the world"
          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
          - In two dimsensions, a contour-plot shows the mean predicted function
          - Not implemented in higher dimensions
@ -249,12 +139,11 @@ class SparseGP(GPBase):
        if which_data is 'all':
            which_data = slice(None)

-        GPBase.plot_f(self, samples=samples, plot_limits=plot_limits, which_data='all', which_parts='all', resolution=resolution, full_cov=full_cov, fignum=fignum, ax=ax)
+        GP.plot_f(self, samples=samples, plot_limits=plot_limits, which_data='all', which_parts='all', resolution=resolution, full_cov=full_cov, fignum=fignum, ax=ax)

        if self.X.shape[1] == 1:
            if self.has_uncertain_inputs:
-                Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
-                ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
+                ax.errorbar(self.X[which_data, 0], self.likelihood.data[which_data, 0],
                            xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
                            ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
            Zu = self.Z * self._Xscale + self._Xoffset
@ -264,7 +153,6 @@ class SparseGP(GPBase):
            Zu = self.Z * self._Xscale + self._Xoffset
            ax.plot(Zu[:, 0], Zu[:, 1], 'wo')

-
        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"

@ -277,12 +165,11 @@ class SparseGP(GPBase):
        if which_data is 'all':
            which_data = slice(None)

-        GPBase.plot(self, samples=samples, plot_limits=plot_limits, which_data='all', which_parts='all', resolution=resolution, levels=20, fignum=fignum, ax=ax)
+        GP.plot(self, samples=samples, plot_limits=plot_limits, which_data='all', which_parts='all', resolution=resolution, levels=20, fignum=fignum, ax=ax)

        if self.X.shape[1] == 1:
            if self.has_uncertain_inputs:
-                Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
-                ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
+                ax.errorbar(self.X[which_data, 0], self.likelihood.data[which_data, 0],
                            xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
                            ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
            Zu = self.Z * self._Xscale + self._Xoffset
@ -296,145 +183,20 @@ class SparseGP(GPBase):
        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"

-    def predict_single_output(self, Xnew, output=0, which_parts='all', full_cov=False):
+    def getstate(self):
        """
-        For a specific output, predict the function at the new point(s) Xnew.
-
-        :param Xnew: The points at which to make a prediction
-        :type Xnew: np.ndarray, Nnew x self.input_dim
-        :param output: output to predict
-        :type output: integer in {0,..., num_outputs-1}
-        :param which_parts:  specifies which outputs kernel(s) to use in prediction
-        :type which_parts: ('all', list of bools)
-        :param full_cov: whether to return the full covariance matrix, or just the diagonal
-        :type full_cov: bool
-        :rtype: posterior mean,  a Numpy array, Nnew x self.input_dim
-        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
-        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays,  Nnew x self.input_dim
-
-        .. Note:: For multiple output models only
+        Get the current state of the class,
+        here just all the indices, rest can get recomputed
        """
+        return GP.getstate(self) + [self.Z,
+                self.num_inducing,
+                self.has_uncertain_inputs,
+                self.X_variance]

-        assert hasattr(self,'multioutput')
-        index = np.ones_like(Xnew)*output
-        Xnew = np.hstack((Xnew,index))
+    def setstate(self, state):
+        self.X_variance = state.pop()
+        self.has_uncertain_inputs = state.pop()
+        self.num_inducing = state.pop()
+        self.Z = state.pop()
+        GP.setstate(self, state)

-        # normalize X values
-        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
-        mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts)
-
-        # now push through likelihood
-        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, noise_model = output)
-        return mean, var, _025pm, _975pm
-
-    def _raw_predict_single_output(self, _Xnew, output=0, X_variance_new=None, which_parts='all', full_cov=False,stop=False):
-        """
-        Internal helper function for making predictions for a specific output,
-        does not account for normalization or likelihood
-        ---------
-
-        :param Xnew: The points at which to make a prediction
-        :type Xnew: np.ndarray, Nnew x self.input_dim
-        :param output: output to predict
-        :type output: integer in {0,..., num_outputs-1}
-        :param which_parts:  specifies which outputs kernel(s) to use in prediction
-        :type which_parts: ('all', list of bools)
-        :param full_cov: whether to return the full covariance matrix, or just the diagonal
-
-        .. Note:: For multiple output models only
-        """
-        Bi, _ = dpotri(self.LB, lower=0)  # WTH? this lower switch should be 1, but that doesn't work!
-        symmetrify(Bi)
-        Kmmi_LmiBLmi = backsub_both_sides(self._Lm, np.eye(self.num_inducing) - Bi)
-
-        if self.Cpsi1V is None:
-            psi1V = np.dot(self.psi1.T,self.likelihood.V)
-            tmp, _ = dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0)
-            tmp, _ = dpotrs(self.LB, tmp, lower=1)
-            self.Cpsi1V, _ = dtrtrs(self._Lm, tmp, lower=1, trans=1)
-
-        assert hasattr(self,'multioutput')
-        index = np.ones_like(_Xnew)*output
-        _Xnew = np.hstack((_Xnew,index))
-
-        if X_variance_new is None:
-            Kx = self.kern.K(self.Z, _Xnew, which_parts=which_parts)
-            mu = np.dot(Kx.T, self.Cpsi1V)
-            if full_cov:
-                Kxx = self.kern.K(_Xnew, which_parts=which_parts)
-                var = Kxx - mdot(Kx.T, Kmmi_LmiBLmi, Kx) # NOTE this won't work for plotting
-            else:
-                Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
-                var = Kxx - np.sum(Kx * np.dot(Kmmi_LmiBLmi, Kx), 0)
-        else:
-            Kx = self.kern.psi1(self.Z, _Xnew, X_variance_new)
-            mu = np.dot(Kx, self.Cpsi1V)
-            if full_cov:
-                raise NotImplementedError, "TODO"
-            else:
-                Kxx = self.kern.psi0(self.Z, _Xnew, X_variance_new)
-                psi2 = self.kern.psi2(self.Z, _Xnew, X_variance_new)
-                var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
-
-        return mu, var[:, None]
-
-
-    def plot_single_output_f(self, output=None, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None):
-
-        if ax is None:
-            fig = pb.figure(num=fignum)
-            ax = fig.add_subplot(111)
-        if fignum is None and ax is None:
-                fignum = fig.num
-        if which_data is 'all':
-            which_data = slice(None)
-
-        GPBase.plot_single_output_f(self, output=output, samples=samples, plot_limits=plot_limits, which_data='all', which_parts='all', resolution=resolution, full_cov=full_cov, fignum=fignum, ax=ax)
-
-        if self.X.shape[1] == 2:
-            if self.has_uncertain_inputs:
-                Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
-                ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
-                            xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
-                            ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
-            Zu = self.Z * self._Xscale + self._Xoffset
-            Zu = Zu[Zu[:,1]==output,0:1]
-            ax.plot(Zu[:,0], np.zeros_like(Zu[:,0]) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
-
-        elif self.X.shape[1] == 2:
-            Zu = self.Z * self._Xscale + self._Xoffset
-            Zu = Zu[Zu[:,1]==output,0:2]
-            ax.plot(Zu[:, 0], Zu[:, 1], 'wo')
-
-
-        else:
-            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
-
-    def plot_single_output(self, output=None, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, fignum=None, ax=None):
-        if ax is None:
-            fig = pb.figure(num=fignum)
-            ax = fig.add_subplot(111)
-        if fignum is None and ax is None:
-                fignum = fig.num
-        if which_data is 'all':
-            which_data = slice(None)
-
-        GPBase.plot_single_output(self, samples=samples, plot_limits=plot_limits, which_data='all', which_parts='all', resolution=resolution, levels=20, fignum=fignum, ax=ax, output=output)
-
-        if self.X.shape[1] == 2:
-            if self.has_uncertain_inputs:
-                Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
-                ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
-                            xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
-                            ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
-            Zu = self.Z * self._Xscale + self._Xoffset
-            Zu = Zu[Zu[:,1]==output,0:1]
-            ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
-
-        elif self.X.shape[1] == 3:
-            Zu = self.Z * self._Xscale + self._Xoffset
-            Zu = Zu[Zu[:,1]==output,0:1]
-            ax.plot(Zu[:, 0], Zu[:, 1], 'wo')
-
-        else:
-            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
--- a/GPy/core/svigp.py
+++ b/GPy/core/svigp.py
@ -4,12 +4,12 @@
 import numpy as np
 import pylab as pb
 from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs, jitchol, backsub_both_sides
-from gp_base import GPBase
+from gp import GP
 import time
 import sys


-class SVIGP(GPBase):
+class SVIGP(GP):
    """

    Stochastic Variational inference in a Gaussian Process
@ -22,7 +22,7 @@ class SVIGP(GPBase):

    Additional kwargs are used as for a sparse GP. They include:

-    :param q_u: canonical parameters of the distribution squasehd into a 1D array
+    :param q_u: canonical parameters of the distribution squashed into a 1D array
    :type q_u: np.ndarray
    :param M: Number of inducing points (optional, default 10. Ignored if Z is not None)
    :type M: int
@ -44,7 +44,7 @@ class SVIGP(GPBase):


    def __init__(self, X, likelihood, kernel, Z, q_u=None, batchsize=10, X_variance=None):
-        GPBase.__init__(self, X, likelihood, kernel, normalize_X=False)
+        GP.__init__(self, X, likelihood, kernel, normalize_X=False)
        self.batchsize=batchsize
        self.Y = self.likelihood.Y.copy()
        self.Z = Z
@ -92,7 +92,7 @@ class SVIGP(GPBase):

    def getstate(self):
        steplength_params = [self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength]
-        return GPBase.getstate(self) + \
+        return GP.getstate(self) + \
            [self.get_vb_param(),
             self.Z,
             self.num_inducing,
@ -139,7 +139,7 @@ class SVIGP(GPBase):
        self.num_inducing = state.pop()
        self.Z = state.pop()
        vb_param = state.pop()
-        GPBase.setstate(self, state)
+        GP.setstate(self, state)
        self.set_vb_param(vb_param)

    def _compute_kernel_matrices(self):
@ -489,7 +489,7 @@ class SVIGP(GPBase):
        #horrible hack here:
        data = self.likelihood.data.copy()
        self.likelihood.data = self.Y
-        GPBase.plot(self, ax=ax, **kwargs)
+        GP.plot(self, ax=ax, **kwargs)
        self.likelihood.data = data

        Zu = self.Z * self._Xscale + self._Xoffset
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@ -6,12 +6,11 @@
 Gaussian Processes classification
 """
 import pylab as pb
-import numpy as np
 import GPy

 default_seed = 10000

-def oil(num_inducing=50, max_iters=100, kernel=None):
+def oil(num_inducing=50, max_iters=100, kernel=None, optimize=True, plot=True):
    """
    Run a Gaussian process classification on the three phase oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.

@ -25,7 +24,7 @@ def oil(num_inducing=50, max_iters=100, kernel=None):
    Ytest[Ytest.flatten()==-1] = 0

    # Create GP model
-    m = GPy.models.SparseGPClassification(X, Y,kernel=kernel,num_inducing=num_inducing)
+    m = GPy.models.SparseGPClassification(X, Y, kernel=kernel, num_inducing=num_inducing)

    # Contrain all parameters to be positive
    m.tie_params('.*len')
@ -33,17 +32,18 @@ def oil(num_inducing=50, max_iters=100, kernel=None):
    m.update_likelihood_approximation()

    # Optimize
-    m.optimize(max_iters=max_iters)
+    if optimize:
+        m.optimize(max_iters=max_iters)
    print(m)

    #Test
    probs = m.predict(Xtest)[0]
-    GPy.util.classification.conf_matrix(probs,Ytest)
+    GPy.util.classification.conf_matrix(probs, Ytest)
    return m

-def toy_linear_1d_classification(seed=default_seed):
+def toy_linear_1d_classification(seed=default_seed, optimize=True, plot=True):
    """
-    Simple 1D classification example
+    Simple 1D classification example using EP approximation

    :param seed: seed value for data generation (default is 4).
    :type seed: int
@ -58,20 +58,59 @@ def toy_linear_1d_classification(seed=default_seed):
    m = GPy.models.GPClassification(data['X'], Y)

    # Optimize
-    #m.update_likelihood_approximation()
-    # Parameters optimization:
-    #m.optimize()
-    m.pseudo_EM()
+    if optimize:
+        #m.update_likelihood_approximation()
+        # Parameters optimization:
+        #m.optimize()
+        #m.update_likelihood_approximation()
+        m.pseudo_EM()

    # Plot
-    fig, axes = pb.subplots(2,1)
-    m.plot_f(ax=axes[0])
-    m.plot(ax=axes[1])
-    print(m)
+    if plot:
+        fig, axes = pb.subplots(2, 1)
+        m.plot_f(ax=axes[0])
+        m.plot(ax=axes[1])

+    print m
    return m

-def sparse_toy_linear_1d_classification(num_inducing=10,seed=default_seed):
+def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=True):
+    """
+    Simple 1D classification example using Laplace approximation
+
+    :param seed: seed value for data generation (default is 10000).
+    :type seed: int
+
+    """
+
+    data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
+    Y = data['Y'][:, 0:1]
+    Y[Y.flatten() == -1] = 0
+
+    bern_noise_model = GPy.likelihoods.bernoulli()
+    laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), bern_noise_model)
+
+    # Model definition
+    m = GPy.models.GPClassification(data['X'], Y, likelihood=laplace_likelihood)
+    print m
+
+    # Optimize
+    if optimize:
+        #m.update_likelihood_approximation()
+        # Parameters optimization:
+        m.optimize('bfgs', messages=1)
+        #m.pseudo_EM()
+
+    # Plot
+    if plot:
+        fig, axes = pb.subplots(2, 1)
+        m.plot_f(ax=axes[0])
+        m.plot(ax=axes[1])
+
+    print m
+    return m
+
+def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, optimize=True, plot=True):
    """
    Sparse 1D classification example

@ -85,24 +124,26 @@ def sparse_toy_linear_1d_classification(num_inducing=10,seed=default_seed):
    Y[Y.flatten() == -1] = 0

    # Model definition
-    m = GPy.models.SparseGPClassification(data['X'], Y,num_inducing=num_inducing)
-    m['.*len']= 4.
+    m = GPy.models.SparseGPClassification(data['X'], Y, num_inducing=num_inducing)
+    m['.*len'] = 4.

    # Optimize
-    #m.update_likelihood_approximation()
-    # Parameters optimization:
-    #m.optimize()
-    m.pseudo_EM()
+    if optimize:
+        #m.update_likelihood_approximation()
+        # Parameters optimization:
+        #m.optimize()
+        m.pseudo_EM()

    # Plot
-    fig, axes = pb.subplots(2,1)
-    m.plot_f(ax=axes[0])
-    m.plot(ax=axes[1])
-    print(m)
+    if plot:
+        fig, axes = pb.subplots(2, 1)
+        m.plot_f(ax=axes[0])
+        m.plot(ax=axes[1])

+    print m
    return m

-def toy_heaviside(seed=default_seed):
+def toy_heaviside(seed=default_seed, optimize=True, plot=True):
    """
    Simple 1D classification example using a heavy side gp transformation

@ -116,25 +157,27 @@ def toy_heaviside(seed=default_seed):
    Y[Y.flatten() == -1] = 0

    # Model definition
-    noise_model = GPy.likelihoods.binomial(GPy.likelihoods.noise_models.gp_transformations.Heaviside())
-    likelihood = GPy.likelihoods.EP(Y,noise_model)
+    noise_model = GPy.likelihoods.bernoulli(GPy.likelihoods.noise_models.gp_transformations.Heaviside())
+    likelihood = GPy.likelihoods.EP(Y, noise_model)
    m = GPy.models.GPClassification(data['X'], likelihood=likelihood)

    # Optimize
-    m.update_likelihood_approximation()
-    # Parameters optimization:
-    m.optimize()
-    #m.pseudo_EM()
+    if optimize:
+        m.update_likelihood_approximation()
+        # Parameters optimization:
+        m.optimize()
+        #m.pseudo_EM()

    # Plot
-    fig, axes = pb.subplots(2,1)
-    m.plot_f(ax=axes[0])
-    m.plot(ax=axes[1])
-    print(m)
+    if plot:
+        fig, axes = pb.subplots(2, 1)
+        m.plot_f(ax=axes[0])
+        m.plot(ax=axes[1])

+    print m
    return m

-def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=None):
+def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=None, optimize=True, plot=True):
    """
    Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.

@ -151,7 +194,7 @@ def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=
    Y[Y.flatten()==-1] = 0

    if model_type == 'Full':
-        m = GPy.models.GPClassification(data['X'], Y,kernel=kernel)
+        m = GPy.models.GPClassification(data['X'], Y, kernel=kernel)

    elif model_type == 'DTC':
        m = GPy.models.SparseGPClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing)
@ -161,8 +204,11 @@ def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=
        m = GPy.models.FITCClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing)
        m['.*len'] = 3.

-    m.pseudo_EM()
-    print(m)
-    m.plot()
+    if optimize:
+        m.pseudo_EM()

+    if plot:
+        m.plot()
+
+    print m
    return m
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@ -1,96 +1,105 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
+import numpy as _np
+default_seed = _np.random.seed(123344)  # NOTE(review): seed() returns None, so default_seed is None; this line only seeds numpy's global RNG at import time

-import numpy as np
-from matplotlib import pyplot as plt, cm
+def bgplvm_test_model(seed=default_seed, optimize=False, verbose=1, plot=False):
+    """
+    Model for testing purposes. Samples from a GP with an rbf kernel and learns
+    the samples with a new kernel. Normally not for optimization, just model checking.
+    """
+    from GPy.likelihoods.gaussian import Gaussian
+    import GPy

-from ..models.bayesian_gplvm import BayesianGPLVM
-from ..likelihoods.gaussian import Gaussian
-import GPy
+    num_inputs = 13
+    num_inducing = 5
+    if plot:
+        output_dim = 1
+        input_dim = 2
+    else:
+        input_dim = 2
+        output_dim = 25

-default_seed = np.random.seed(123344)
-
-def BGPLVM(seed=default_seed):
-    N = 5
-    num_inducing = 4
-    input_dim = 3
-    D = 2
    # generate GPLVM-like data
-    X = np.random.rand(N, input_dim)
-    lengthscales = np.random.rand(input_dim)
+    X = _np.random.rand(num_inputs, input_dim)
+    lengthscales = _np.random.rand(input_dim)
    k = (GPy.kern.rbf(input_dim, .5, lengthscales, ARD=True)
         + GPy.kern.white(input_dim, 0.01))
    K = k.K(X)
-    Y = np.random.multivariate_normal(np.zeros(N), K, D).T
+    Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, output_dim).T
    lik = Gaussian(Y, normalize=True)

-#     k = GPy.kern.rbf_inv(input_dim, .5, np.ones(input_dim) * 2., ARD=True) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim)
-    k = GPy.kern.rbf(input_dim, ARD=1, name="rbf1") + GPy.kern.rbf(input_dim, ARD=1, name='rbf2') + GPy.kern.linear(input_dim, ARD=1, name='linear_part')
-#     k = GPy.kern.rbf(input_dim, ARD = False)
+    k = GPy.kern.rbf_inv(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim)
+    # k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
+    # k = GPy.kern.rbf(input_dim, ARD = False)  + GPy.kern.white(input_dim, 0.00001)
+    # k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.rbf(input_dim, .3, _np.ones(input_dim) * .2, ARD=True)
+    # k = GPy.kern.rbf(input_dim, .5, 2., ARD=0) + GPy.kern.rbf(input_dim, .3, .2, ARD=0)
+    # k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.linear(input_dim, _np.ones(input_dim) * .2, ARD=True)

-    m = BayesianGPLVM(lik, input_dim, kernel=k, num_inducing=num_inducing)
+    m = GPy.models.BayesianGPLVM(lik, input_dim, kernel=k, num_inducing=num_inducing)
+    #===========================================================================
+    # randomly obstruct data with percentage p
+    p = .8
+    Y_obstruct = Y.copy()
+    Y_obstruct[_np.random.uniform(size=(Y.shape)) < p] = _np.nan
+    #===========================================================================
+    m2 = GPy.models.BayesianGPLVMWithMissingData(Y_obstruct, input_dim, kernel=k, num_inducing=num_inducing)
    m.lengthscales = lengthscales
-    # m.constrain_positive('(rbf|bias|noise|white|S)')
-    # m.constrain_fixed('S', 1)

-    # pb.figure()
-    # m.plot()
-    # pb.title('PCA initialisation')
-    # pb.figure()
-    # m.optimize(messages = 1)
-    # m.plot()
-    # pb.title('After optimisation')
-    # m.randomize()
-    # m.checkgrad(verbose=1)
+    if plot:
+        import matplotlib.pyplot as pb
+        m.plot()
+        pb.title('PCA initialisation')
+        m2.plot()
+        pb.title('PCA initialisation')

-    return m
+    if optimize:
+        m.optimize('scg', messages=verbose)
+        m2.optimize('scg', messages=verbose)
+        if plot:
+            m.plot()
+            pb.title('After optimisation')
+            m2.plot()
+            pb.title('After optimisation')

-def GPLVM_oil_100(optimize=True, plot=True):
+    return m, m2
+
+def gplvm_oil_100(optimize=True, verbose=1, plot=True):
+    import GPy
    data = GPy.util.datasets.oil_100()
    Y = data['X']
-
    # create simple GP model
    kernel = GPy.kern.rbf(6, ARD=True) + GPy.kern.bias(6)
    m = GPy.models.GPLVM(Y, 6, kernel=kernel)
    m.data_labels = data['Y'].argmax(axis=1)
-
-    # optimize
-    if optimize:
-        m.optimize('scg', messages=1)
-
-    # plot
-    print(m)
-    if plot:
-        m.plot_latent(labels=m.data_labels)
+    if optimize: m.optimize('scg', messages=verbose)
+    if plot: m.plot_latent(labels=m.data_labels)
    return m

-def sparseGPLVM_oil(optimize=True, N=100, input_dim=6, num_inducing=15, max_iters=50):
-    np.random.seed(0)
+def sparse_gplvm_oil(optimize=True, verbose=0, plot=True, N=100, Q=6, num_inducing=15, max_iters=50):
+    import GPy
+    _np.random.seed(0)
    data = GPy.util.datasets.oil()
-
    Y = data['X'][:N]
    Y = Y - Y.mean(0)
    Y /= Y.std(0)
+    # Create the model
+    kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q)
+    m = GPy.models.SparseGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing)
+    m.data_labels = data['Y'][:N].argmax(axis=1)

-    # create simple GP model
-    kernel = GPy.kern.rbf(input_dim, ARD=True) + GPy.kern.bias(input_dim)
-    m = GPy.models.SparseGPLVM(Y, input_dim, kernel=kernel, num_inducing=num_inducing)
-    m.data_labels = data['Y'].argmax(axis=1)
-
-    # optimize
-    if optimize:
-        m.optimize('scg', messages=1, max_iters=max_iters)
-
-    # plot
-    print(m)
-    # m.plot_latent(labels=m.data_labels)
+    if optimize: m.optimize('scg', messages=verbose, max_iters=max_iters)
+    if plot:
+        m.plot_latent(labels=m.data_labels)
+        m.kern.plot_ARD()
    return m

-def swiss_roll(optimize=True, N=1000, num_inducing=15, input_dim=4, sigma=.2, plot=False):
+def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=15, Q=4, sigma=.2):
+    import GPy
    from GPy.util.datasets import swiss_roll_generated
-    from GPy.core.transformations import LogexpClipped
+    from GPy.models import BayesianGPLVM

-    data = swiss_roll_generated(N=N, sigma=sigma)
+    data = swiss_roll_generated(num_samples=N, sigma=sigma)
    Y = data['Y']
    Y -= Y.mean()
    Y /= Y.std()
@ -102,120 +111,99 @@ def swiss_roll(optimize=True, N=1000, num_inducing=15, input_dim=4, sigma=.2, pl
        from sklearn.manifold.isomap import Isomap
        iso = Isomap().fit(Y)
        X = iso.embedding_
-        if input_dim > 2:
-            X = np.hstack((X, np.random.randn(N, input_dim - 2)))
+        if Q > 2:
+            X = _np.hstack((X, _np.random.randn(N, Q - 2)))
    except ImportError:
-        X = np.random.randn(N, input_dim)
+        X = _np.random.randn(N, Q)

    if plot:
-        from mpl_toolkits import mplot3d
-        import pylab
-        fig = pylab.figure("Swiss Roll Data")
+        import matplotlib.pyplot as plt
+        from mpl_toolkits.mplot3d import Axes3D  # @UnusedImport
+        fig = plt.figure("Swiss Roll Data")
        ax = fig.add_subplot(121, projection='3d')
        ax.scatter(*Y.T, c=c)
        ax.set_title("Swiss Roll")

        ax = fig.add_subplot(122)
        ax.scatter(*X.T[:2], c=c)
-        ax.set_title("Initialization")
-
+        ax.set_title("BGPLVM init")

    var = .5
-    S = (var * np.ones_like(X) + np.clip(np.random.randn(N, input_dim) * var ** 2,
+    S = (var * _np.ones_like(X) + _np.clip(_np.random.randn(N, Q) * var ** 2,
                                         - (1 - var),
                                         (1 - var))) + .001
-    Z = np.random.permutation(X)[:num_inducing]
+    Z = _np.random.permutation(X)[:num_inducing]

-    kernel = GPy.kern.rbf(input_dim, ARD=True) + GPy.kern.bias(input_dim, np.exp(-2)) + GPy.kern.white(input_dim, np.exp(-2))
+    kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2))

-    m = BayesianGPLVM(Y, input_dim, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel)
+    m = BayesianGPLVM(Y, Q, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel)
    m.data_colors = c
    m.data_t = t
-
-    m['rbf_lengthscale'] = 1. # X.var(0).max() / X.var(0)
    m['noise_variance'] = Y.var() / 100.
-    m['bias_variance'] = 0.05

    if optimize:
-        m.optimize('scg', messages=1)
+        m.optimize('scg', messages=verbose, max_iters=2e3)
+
+    if plot:
+        fig = plt.figure('fitted')
+        ax = fig.add_subplot(111)
+        s = m.input_sensitivity().argsort()[::-1][:2]
+        ax.scatter(*m.X.T[s], c=c)
+
    return m

-def BGPLVM_oil(optimize=True, N=200, input_dim=7, num_inducing=40, max_iters=1000, plot=False, **k):
-    np.random.seed(0)
+def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, max_iters=1000, **k):
+    import GPy
+    from GPy.likelihoods import Gaussian
+    from matplotlib import pyplot as plt
+
+    _np.random.seed(0)
    data = GPy.util.datasets.oil()

-    # create simple GP model
-    kernel = GPy.kern.rbf_inv(input_dim, 1., [.1] * input_dim, ARD=True) + GPy.kern.bias(input_dim, np.exp(-2))
-
+    kernel = GPy.kern.rbf_inv(Q, 1., [.1] * Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2))
    Y = data['X'][:N]
    Yn = Gaussian(Y, normalize=True)
-#     Yn = Y - Y.mean(0)
-#     Yn /= Yn.std(0)
-
-    m = GPy.models.BayesianGPLVM(Yn, input_dim, kernel=kernel, num_inducing=num_inducing, **k)
+    m = GPy.models.BayesianGPLVM(Yn, Q, kernel=kernel, num_inducing=num_inducing, **k)
    m.data_labels = data['Y'][:N].argmax(axis=1)
+    m['noise'] = Yn.Y.var() / 100.

-    # m.constrain('variance|leng', LogexpClipped())
-    # m['.*lengt'] = m.X.var(0).max() / m.X.var(0)
-    m['gaussian'] = Yn.Y.var() / 100.
-
-
-    # optimize
    if optimize:
-        m.gaussian.variance.fix() # m.constrain_fixed('noise')
-        m.optimize('scg', messages=1, max_iters=200, gtol=.05)
-        m.gaussian.variance.constrain_positive() # m.constrain_positive('noise')
-        #m.constrain_bounded('white', 1e-7, 1)
-        m.optimize('scg', messages=1, max_iters=max_iters, gtol=.05)
+        m.optimize('scg', messages=verbose, max_iters=max_iters, gtol=.05)

    if plot:
        y = m.likelihood.Y[0, :]
        fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
-        plt.sca(latent_axes)
-        m.plot_latent()
+        m.plot_latent(ax=latent_axes)
        data_show = GPy.util.visualize.vector_show(y)
-        lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :], m, data_show, latent_axes=latent_axes) # , sense_axes=sense_axes)
+        lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :], # @UnusedVariable
+            m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
        raw_input('Press enter to finish')
        plt.close(fig)
    return m

-def oil_100():
-    data = GPy.util.datasets.oil_100()
-    m = GPy.models.GPLVM(data['X'], 2)
-
-    # optimize
-    m.optimize(messages=1, max_iters=2)
-
-    # plot
-    print(m)
-    # m.plot_latent(labels=data['Y'].argmax(axis=1))
-    return m
-
-
-
-def _simulate_sincos(D1, D2, D3, N, num_inducing, input_dim, plot_sim=False):
-    x = np.linspace(0, 4 * np.pi, N)[:, None]
-    s1 = np.vectorize(lambda x: np.sin(x))
-    s2 = np.vectorize(lambda x: np.cos(x))
-    s3 = np.vectorize(lambda x:-np.exp(-np.cos(2 * x)))
-    sS = np.vectorize(lambda x: np.sin(2 * x))
+def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
+    x = _np.linspace(0, 4 * _np.pi, N)[:, None]
+    s1 = _np.vectorize(lambda x: _np.sin(x))
+    s2 = _np.vectorize(lambda x: _np.cos(x))
+    s3 = _np.vectorize(lambda x:-_np.exp(-_np.cos(2 * x)))
+    sS = _np.vectorize(lambda x: _np.sin(2 * x))

    s1 = s1(x)
    s2 = s2(x)
    s3 = s3(x)
    sS = sS(x)

-    S1 = np.hstack([s1, sS])
-    S2 = np.hstack([s2, s3, sS])
-    S3 = np.hstack([s3, sS])
+    S1 = _np.hstack([s1, sS])
+    S2 = _np.hstack([s2, s3, sS])
+    S3 = _np.hstack([s3, sS])

-    Y1 = S1.dot(np.random.randn(S1.shape[1], D1))
-    Y2 = S2.dot(np.random.randn(S2.shape[1], D2))
-    Y3 = S3.dot(np.random.randn(S3.shape[1], D3))
+    Y1 = S1.dot(_np.random.randn(S1.shape[1], D1))
+    Y2 = S2.dot(_np.random.randn(S2.shape[1], D2))
+    Y3 = S3.dot(_np.random.randn(S3.shape[1], D3))

-    Y1 += .3 * np.random.randn(*Y1.shape)
-    Y2 += .2 * np.random.randn(*Y2.shape)
-    Y3 += .25 * np.random.randn(*Y3.shape)
+    Y1 += .3 * _np.random.randn(*Y1.shape)
+    Y2 += .2 * _np.random.randn(*Y2.shape)
+    Y3 += .25 * _np.random.randn(*Y3.shape)

    Y1 -= Y1.mean(0)
    Y2 -= Y2.mean(0)
@ -230,6 +218,7 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, input_dim, plot_sim=False):

    if plot_sim:
        import pylab
+        import matplotlib.cm as cm
        import itertools
        fig = pylab.figure("MRD Simulation Data", figsize=(8, 6))
        fig.clf()
@ -247,114 +236,99 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, input_dim, plot_sim=False):

    return slist, [S1, S2, S3], Ylist

-def bgplvm_simulation_matlab_compare():
-    from GPy.util.datasets import simulation_BGPLVM
-    sim_data = simulation_BGPLVM()
-    Y = sim_data['Y']
-    S = sim_data['S']
-    mu = sim_data['mu']
-    num_inducing, [_, input_dim] = 3, mu.shape
+# def bgplvm_simulation_matlab_compare():
+#     from GPy.util.datasets import simulation_BGPLVM
+#     from GPy import kern
+#     from GPy.models import BayesianGPLVM
+#
+#     sim_data = simulation_BGPLVM()
+#     Y = sim_data['Y']
+#     mu = sim_data['mu']
+#     num_inducing, [_, Q] = 3, mu.shape
+#
+#     k = kern.linear(Q, ARD=True) + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2))
+#     m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k,
+#                        _debug=False)
+#     m.auto_scale_factor = True
+#     m['noise'] = Y.var() / 100.
+#     m['linear_variance'] = .01
+#     return m

-    from GPy.models import mrd
-    from GPy import kern
-    reload(mrd); reload(kern)
-    k = kern.linear(input_dim, ARD=True) + kern.bias(input_dim, np.exp(-2)) + kern.white(input_dim, np.exp(-2))
-    m = BayesianGPLVM(Y, input_dim, init="PCA", num_inducing=num_inducing, kernel=k,
-#                        X=mu,
-#                        X_variance=S,
-                       _debug=False)
-    m.auto_scale_factor = True
-    m['gaussian'] = Y.var() / 100.
-    m['linear_variance'] = .01
-    return m
-
-def bgplvm_simulation(optimize='scg',
-                      plot=True,
+def bgplvm_simulation(optimize=True, verbose=1,
+                      plot=True, plot_sim=False,
                      max_iters=2e4,
-                      plot_sim=False):
-#     from GPy.core.transformations import LogexpClipped
-    D1, D2, D3, N, num_inducing, input_dim = 15, 5, 8, 30, 3, 10
-    slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, input_dim, plot_sim)
-
-    from GPy.models import mrd
+                      ):
    from GPy import kern
-    reload(mrd); reload(kern)
+    from GPy.models import BayesianGPLVM

+    D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 3, 10
+    _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
    Y = Ylist[0]
-
-    k = kern.linear(input_dim, ARD=True) + kern.bias(input_dim, np.exp(-2)) + kern.white(input_dim, np.exp(-2)) # + kern.bias(input_dim)
-    m = BayesianGPLVM(Y, input_dim, init="PCA", num_inducing=num_inducing, kernel=k)
-
-    import ipdb; ipdb.set_trace()
-    # m.constrain('variance|noise', LogexpClipped())
-    m['gaussian'] = Y.var() / 100.
+    k = kern.linear(Q, ARD=True) + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
+    m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k)
+    m['noise'] = Y.var() / 100.

    if optimize:
        print "Optimizing model:"
-        m.optimize(optimize, max_iters=max_iters,
-                   messages=True, gtol=.05)
+        m.optimize('scg', messages=verbose, max_iters=max_iters,
+                   gtol=.05)
    if plot:
        m.plot_X_1d("BGPLVM Latent Space 1D")
        m.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
    return m

-def mrd_simulation(optimize=True, plot=True, plot_sim=True, **kw):
-    D1, D2, D3, N, num_inducing, input_dim = 60, 20, 36, 60, 6, 5
-    slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, input_dim, plot_sim)
+def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
+    from GPy import kern
+    from GPy.models import MRD
+    from GPy.likelihoods import Gaussian

+    D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
+    _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
    likelihood_list = [Gaussian(x, normalize=True) for x in Ylist]

-    from GPy.models import mrd
-    from GPy import kern
-
-    reload(mrd); reload(kern)
-
-    k = kern.linear(input_dim, ARD=True) + kern.bias(input_dim, np.exp(-2)) + kern.white(input_dim, np.exp(-2))
-    m = mrd.MRD(likelihood_list, input_dim=input_dim, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw)
+    k = kern.linear(Q, ARD=True) + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2))
+    m = MRD(likelihood_list, input_dim=Q, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw)
    m.ensure_default_constraints()

    for i, bgplvm in enumerate(m.bgplvms):
        m['{}_noise'.format(i)] = bgplvm.likelihood.Y.var() / 500.

-
-    # DEBUG
-    # np.seterr("raise")
-
    if optimize:
        print "Optimizing Model:"
-        m.optimize(messages=1, max_iters=8e3, gtol=.1)
+        m.optimize(messages=verbose, max_iters=8e3, gtol=.1)
    if plot:
        m.plot_X_1d("MRD Latent Space 1D")
        m.plot_scales("MRD Scales")
    return m

-def brendan_faces():
-    from GPy import kern
+def brendan_faces(optimize=True, verbose=True, plot=True):
+    import GPy
+
    data = GPy.util.datasets.brendan_faces()
-    input_dim = 2
-    Y = data['Y'][0:-1:10, :]
-    # Y = data['Y']
+    Q = 2
+    Y = data['Y']
    Yn = Y - Y.mean()
    Yn /= Yn.std()

-    m = GPy.models.GPLVM(Yn, input_dim)
-    # m = GPy.models.BayesianGPLVM(Yn, input_dim, num_inducing=100)
+    m = GPy.models.GPLVM(Yn, Q)

    # optimize
-    m.constrain('rbf|noise|white', GPy.core.transformations.LogexpClipped())
+    m.constrain('rbf|noise|white', GPy.core.transformations.logexp_clipped())

-    m.optimize('scg', messages=1, max_f_eval=10000)
+    if optimize: m.optimize('scg', messages=verbose, max_iters=1000)

-    ax = m.plot_latent(which_indices=(0, 1))
-    y = m.likelihood.Y[0, :]
-    data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, invert=False, scale=False)
-    lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
-    raw_input('Press enter to finish')
+    if plot:
+        ax = m.plot_latent(which_indices=(0, 1))
+        y = m.likelihood.Y[0, :]
+        data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, order='F', invert=False, scale=False)
+        GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+        raw_input('Press enter to finish')

    return m

-def olivetti_faces():
-    from GPy import kern
+def olivetti_faces(optimize=True, verbose=True, plot=True):
+    import GPy
+
    data = GPy.util.datasets.olivetti_faces()
    Q = 2
    Y = data['Y']
@ -362,152 +336,142 @@ def olivetti_faces():
    Yn /= Yn.std()

    m = GPy.models.GPLVM(Yn, Q)
-    m.optimize('scg', messages=1, max_iters=1000)
-
-    ax = m.plot_latent(which_indices=(0, 1))
-    y = m.likelihood.Y[0, :]
-    data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False)
-    lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
-    raw_input('Press enter to finish')
+    if optimize: m.optimize('scg', messages=verbose, max_iters=1000)
+    if plot:
+        ax = m.plot_latent(which_indices=(0, 1))
+        y = m.likelihood.Y[0, :]
+        data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False)
+        GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+        raw_input('Press enter to finish')

    return m

-def stick_play(range=None, frame_rate=15):
+def stick_play(range=None, frame_rate=15, optimize=False, verbose=True, plot=True):
+    import GPy
    data = GPy.util.datasets.osu_run1()
    # optimize
    if range == None:
        Y = data['Y'].copy()
    else:
        Y = data['Y'][range[0]:range[1], :].copy()
-    y = Y[0, :]
-    data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
-    GPy.util.visualize.data_play(Y, data_show, frame_rate)
+    if plot:
+        y = Y[0, :]
+        data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
+        GPy.util.visualize.data_play(Y, data_show, frame_rate)
    return Y

-def stick(kernel=None):
+def stick(kernel=None, optimize=True, verbose=True, plot=True):
+    from matplotlib import pyplot as plt
+    import GPy
+
    data = GPy.util.datasets.osu_run1()
    # optimize
    m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel)
-    m.optimize(messages=1, max_f_eval=10000)
-    if GPy.util.visualize.visual_available:
+    if optimize: m.optimize(messages=verbose, max_f_eval=10000)
+    if plot and GPy.util.visualize.visual_available:
        plt.clf
        ax = m.plot_latent()
        y = m.likelihood.Y[0, :]
        data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
-        lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+        GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
        raw_input('Press enter to finish')

    return m

-def bcgplvm_linear_stick(kernel=None):
+def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True):
+    from matplotlib import pyplot as plt
+    import GPy
+
    data = GPy.util.datasets.osu_run1()
    # optimize
    mapping = GPy.mappings.Linear(data['Y'].shape[1], 2)
    m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
-    m.optimize(messages=1, max_f_eval=10000)
-    if GPy.util.visualize.visual_available:
+    if optimize: m.optimize(messages=verbose, max_f_eval=10000)
+    if plot and GPy.util.visualize.visual_available:
        plt.clf
        ax = m.plot_latent()
        y = m.likelihood.Y[0, :]
        data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
-        lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+        GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
        raw_input('Press enter to finish')

    return m

-def bcgplvm_stick(kernel=None):
+def bcgplvm_stick(kernel=None, optimize=True, verbose=True, plot=True):
+    from matplotlib import pyplot as plt
+    import GPy
+
    data = GPy.util.datasets.osu_run1()
    # optimize
    back_kernel=GPy.kern.rbf(data['Y'].shape[1], lengthscale=5.)
    mapping = GPy.mappings.Kernel(X=data['Y'], output_dim=2, kernel=back_kernel)
    m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
-    m.optimize(messages=1, max_f_eval=10000)
-    if GPy.util.visualize.visual_available:
+    if optimize: m.optimize(messages=verbose, max_f_eval=10000)
+    if plot and GPy.util.visualize.visual_available:
        plt.clf
        ax = m.plot_latent()
        y = m.likelihood.Y[0, :]
        data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
-        lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+        GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
        raw_input('Press enter to finish')

    return m

-def robot_wireless():
+def robot_wireless(optimize=True, verbose=True, plot=True):
+    from matplotlib import pyplot as plt
+    import GPy
+
    data = GPy.util.datasets.robot_wireless()
    # optimize
    m = GPy.models.GPLVM(data['Y'], 2)
-    m.optimize(messages=1, max_f_eval=10000)
+    if optimize: m.optimize(messages=verbose, max_f_eval=10000)
    m._set_params(m._get_params())
-    plt.clf
-    ax = m.plot_latent()
+    if plot:
+        m.plot_latent()

    return m

-def stick_bgplvm(model=None):
+def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
+    from GPy.models import BayesianGPLVM
+    from matplotlib import pyplot as plt
+    import GPy
+
    data = GPy.util.datasets.osu_run1()
-    input_dim = 6
-    kernel = GPy.kern.rbf(input_dim, ARD=True) + GPy.kern.bias(input_dim, np.exp(-2)) + GPy.kern.white(input_dim, np.exp(-2))
-    m = BayesianGPLVM(data['Y'], input_dim, init="PCA", num_inducing=20, kernel=kernel)
+    Q = 6
+    kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2))
+    m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel)
    # optimize
    m.ensure_default_constraints()
-    m.optimize('scg', messages=1, max_iters=200, xtol=1e-300, ftol=1e-300)
+    if optimize: m.optimize('scg', messages=verbose, max_iters=200, xtol=1e-300, ftol=1e-300)
    m._set_params(m._get_params())
-    plt.clf, (latent_axes, sense_axes) = plt.subplots(1, 2)
-    plt.sca(latent_axes)
-    m.plot_latent()
-    y = m.likelihood.Y[0, :].copy()
-    data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
-    lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
-    raw_input('Press enter to finish')
+    if plot:
+        fig, (latent_axes, sense_axes) = plt.subplots(1, 2)  # bind the Figure to a local name; the previous target `plt.clf` overwrote pyplot's clf function
+        plt.sca(latent_axes)  # make the left axes current so plot_latent() draws into it
+        m.plot_latent()
+        y = m.likelihood.Y[0, :].copy()
+        data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
+        GPy.util.visualize.lvm_dimselect(m.X[0, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
+        raw_input('Press enter to finish')

    return m


-def cmu_mocap(subject='35', motion=['01'], in_place=True):
+def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose=True, plot=True):
+    import GPy

    data = GPy.util.datasets.cmu_mocap(subject, motion)
-    Y = data['Y']
    if in_place:
        # Make figure move in place.
        data['Y'][:, 0:3] = 0.0
    m = GPy.models.GPLVM(data['Y'], 2, normalize_Y=True)

-    # optimize
-    m.optimize(messages=1, max_f_eval=10000)
-
-    ax = m.plot_latent()
-    y = m.likelihood.Y[0, :]
-    data_show = GPy.util.visualize.skeleton_show(y[None, :], data['skel'])
-    lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
-    raw_input('Press enter to finish')
-    lvm_visualizer.close()
+    if optimize: m.optimize(messages=verbose, max_f_eval=10000)
+    if plot:
+        ax = m.plot_latent()
+        y = m.likelihood.Y[0, :]
+        data_show = GPy.util.visualize.skeleton_show(y[None, :], data['skel'])
+        lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+        raw_input('Press enter to finish')
+        lvm_visualizer.close()

    return m
-
-# def BGPLVM_oil():
-#     data = GPy.util.datasets.oil()
-#     Y, X = data['Y'], data['X']
-#     X -= X.mean(axis=0)
-#     X /= X.std(axis=0)
-#
-#     input_dim = 10
-#     num_inducing = 30
-#
-#     kernel = GPy.kern.rbf(input_dim, ARD=True) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim)
-#     m = GPy.models.BayesianGPLVM(X, input_dim, kernel=kernel, num_inducing=num_inducing)
-#     # m.scale_factor = 100.0
-#     m.constrain_positive('(white|noise|bias|X_variance|rbf_variance|rbf_length)')
-#     from sklearn import cluster
-#     km = cluster.KMeans(num_inducing, verbose=10)
-#     Z = km.fit(m.X).cluster_centers_
-#     # Z = GPy.util.misc.kmm_init(m.X, num_inducing)
-#     m.set('iip', Z)
-#     m.set('bias', 1e-4)
-#     # optimize
-#
-#     import pdb; pdb.set_trace()
-#     m.optimize('tnc', messages=1)
-#     print m
-#     m.plot_latent(labels=data['Y'].argmax(axis=1))
-#     return m
-
--- a/GPy/examples/non_gaussian.py
+++ b/GPy/examples/non_gaussian.py
@ -0,0 +1,286 @@
+import GPy
+import numpy as np
+import matplotlib.pyplot as plt
+from GPy.util import datasets
+
+def student_t_approx(optimize=True, plot=True):
+    """
+    Example of regressing with a student t likelihood using Laplace
+
+    A noisy sine curve (100 points on [0, 2*pi]) is generated together with
+    a corrupted copy in which 1 is added to points 75..79. Four models are
+    built: a Gaussian GP on the clean data (m1), a Gaussian GP on the
+    corrupt data (m2), and a student t likelihood GP (via
+    GPy.likelihoods.Laplace) on the clean (m3) and corrupt (m4) data.
+
+    :param optimize: if True, optimize all four models with 'scg'
+    :param plot: if True, plot each fit against the noise free sine curve
+    :returns: the tuple (m1, m2, m3, m4)
+    """
+    real_std = 0.1
+    #Start a function, any function
+    X = np.linspace(0.0, np.pi*2, 100)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*real_std
+    Y = Y/Y.max()
+    Yc = Y.copy()
+
+    #Dense noise free curve, only used as a reference line in the plots
+    X_full = np.linspace(0.0, np.pi*2, 500)[:, None]
+    Y_full = np.sin(X_full)
+    Y_full = Y_full/Y_full.max()
+
+    #Slightly noisy data
+    Yc[75:80] += 1
+
+    #Very noisy data
+    #Yc[10] += 100
+    #Yc[25] += 10
+    #Yc[23] += 10
+    #Yc[26] += 1000
+    #Yc[24] += 10
+    #Yc = Yc/Yc.max()
+
+    #Settings for the student t likelihood (no extra noise is added to the data here)
+    deg_free = 5
+    print "Real noise: ", real_std
+    initial_var_guess = 0.5
+    #NOTE(review): despite the 'sd' in its name this value is passed as sigma2 below
+    edited_real_sd = initial_var_guess
+
+    # Kernel object
+    kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+    kernel2 = kernel1.copy()
+    kernel3 = kernel1.copy()
+    kernel4 = kernel1.copy()
+
+    #Gaussian GP model on clean data
+    m1 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel1)
+    # constraints are set up here; optimization happens further down
+    m1.ensure_default_constraints()
+    m1.constrain_fixed('white', 1e-5)
+    m1.randomize()
+
+    #Gaussian GP model on corrupt data
+    m2 = GPy.models.GPRegression(X, Yc.copy(), kernel=kernel2)
+    m2.ensure_default_constraints()
+    m2.constrain_fixed('white', 1e-5)
+    m2.randomize()
+
+    #Student t GP model on clean data
+    t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
+    m3 = GPy.models.GPRegression(X, Y.copy(), kernel3, likelihood=stu_t_likelihood)
+    m3.ensure_default_constraints()
+    m3.constrain_bounded('t_noise', 1e-6, 10.)
+    m3.constrain_fixed('white', 1e-5)
+    m3.randomize()
+
+    #Student t GP model on corrupt data
+    t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
+    corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution)
+    m4 = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood)
+    m4.ensure_default_constraints()
+    m4.constrain_bounded('t_noise', 1e-6, 10.)
+    m4.constrain_fixed('white', 1e-5)
+    m4.randomize()
+
+    if optimize:
+        optimizer='scg'
+        print "Clean Gaussian"
+        m1.optimize(optimizer, messages=1)
+        print "Corrupt Gaussian"
+        m2.optimize(optimizer, messages=1)
+        print "Clean student t"
+        m3.optimize(optimizer, messages=1)
+        print "Corrupt student t"
+        m4.optimize(optimizer, messages=1)
+
+    if plot:
+        #Figure 1: Gaussian likelihood, clean (top) and corrupt (bottom)
+        plt.figure(1)
+        plt.suptitle('Gaussian likelihood')
+        ax = plt.subplot(211)
+        m1.plot(ax=ax)
+        plt.plot(X_full, Y_full)
+        plt.ylim(-1.5, 1.5)
+        plt.title('Gaussian clean')
+
+        ax = plt.subplot(212)
+        m2.plot(ax=ax)
+        plt.plot(X_full, Y_full)
+        plt.ylim(-1.5, 1.5)
+        plt.title('Gaussian corrupt')
+
+        #Figure 2: student t likelihood, clean (top) and corrupt (bottom)
+        plt.figure(2)
+        plt.suptitle('Student-t likelihood')
+        ax = plt.subplot(211)
+        m3.plot(ax=ax)
+        plt.plot(X_full, Y_full)
+        plt.ylim(-1.5, 1.5)
+        plt.title('Student-t rasm clean')
+
+        ax = plt.subplot(212)
+        m4.plot(ax=ax)
+        plt.plot(X_full, Y_full)
+        plt.ylim(-1.5, 1.5)
+        plt.title('Student-t rasm corrupt')
+
+    return m1, m2, m3, m4
+
+def boston_example(optimize=True, plot=True):
+    import sklearn
+    from sklearn.cross_validation import KFold
+    optimizer='bfgs'
+    messages=0
+    data = datasets.boston_housing()
+    degrees_freedoms = [3, 5, 8, 10]
+    X = data['X'].copy()
+    Y = data['Y'].copy()
+    X = X-X.mean(axis=0)
+    X = X/X.std(axis=0)
+    Y = Y-Y.mean()
+    Y = Y/Y.std()
+    num_folds = 10
+    kf = KFold(len(Y), n_folds=num_folds, indices=True)
+    num_models = len(degrees_freedoms) + 3 #3 for baseline, gaussian, gaussian laplace approx
+    score_folds = np.zeros((num_models, num_folds))
+    pred_density = score_folds.copy()
+
+    def rmse(Y, Ystar):
+        return np.sqrt(np.mean((Y-Ystar)**2))
+
+    for n, (train, test) in enumerate(kf):
+        X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
+        print "Fold {}".format(n)
+
+        noise = 1e-1 #np.exp(-2)
+        rbf_len = 0.5
+        data_axis_plot = 4
+        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
+        kernelgp = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
+
+        #Baseline
+        score_folds[0, n] = rmse(Y_test, np.mean(Y_train))
+
+        #Gaussian GP
+        print "Gauss GP"
+        mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp.copy())
+        mgp.ensure_default_constraints()
+        mgp.constrain_fixed('white', 1e-5)
+        mgp['rbf_len'] = rbf_len
+        mgp['noise'] = noise
+        print mgp
+        if optimize:
+            mgp.optimize(optimizer=optimizer, messages=messages)
+        Y_test_pred = mgp.predict(X_test)
+        score_folds[1, n] = rmse(Y_test, Y_test_pred[0])
+        pred_density[1, n] = np.mean(mgp.log_predictive_density(X_test, Y_test))
+        print mgp
+        print pred_density
+
+        print "Gaussian Laplace GP"
+        N, D = Y_train.shape
+        g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=noise, N=N, D=D)
+        g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution)
+        mg = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=g_likelihood)
+        mg.ensure_default_constraints()
+        mg.constrain_positive('noise_variance')
+        mg.constrain_fixed('white', 1e-5)
+        mg['rbf_len'] = rbf_len
+        mg['noise'] = noise
+        print mg
+        if optimize:
+            mg.optimize(optimizer=optimizer, messages=messages)
+        Y_test_pred = mg.predict(X_test)
+        score_folds[2, n] = rmse(Y_test, Y_test_pred[0])
+        pred_density[2, n] = np.mean(mg.log_predictive_density(X_test, Y_test))
+        print pred_density
+        print mg
+
+        for stu_num, df in enumerate(degrees_freedoms):
+            #Student T
+            print "Student-T GP {}df".format(df)
+            t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=df, sigma2=noise)
+            stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
+            mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
+            mstu_t.ensure_default_constraints()
+            mstu_t.constrain_fixed('white', 1e-5)
+            mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
+            mstu_t['rbf_len'] = rbf_len
+            mstu_t['t_noise'] = noise
+            print mstu_t
+            if optimize:
+                mstu_t.optimize(optimizer=optimizer, messages=messages)
+            Y_test_pred = mstu_t.predict(X_test)
+            score_folds[3+stu_num, n] = rmse(Y_test, Y_test_pred[0])
+            pred_density[3+stu_num, n] = np.mean(mstu_t.log_predictive_density(X_test, Y_test))
+            print pred_density
+            print mstu_t
+
+    if plot:
+        plt.figure()
+        plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+        plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+        plt.title('GP gauss')
+
+        plt.figure()
+        plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+        plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+        plt.title('Lap gauss')
+
+        plt.figure()
+        plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+        plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+        plt.title('Stu t {}df'.format(df))
+
+    print "Average scores: {}".format(np.mean(score_folds, 1))
+    print "Average pred density: {}".format(np.mean(pred_density, 1))
+
+    if plot:
+        #Plotting
+        stu_t_legends = ['Student T, df={}'.format(df) for df in degrees_freedoms]
+        legends = ['Baseline', 'Gaussian', 'Laplace Approx Gaussian'] + stu_t_legends
+
+        #Plot boxplots for RMSE density
+        fig = plt.figure()
+        ax=fig.add_subplot(111)
+        plt.title('RMSE')
+        bp = ax.boxplot(score_folds.T, notch=0, sym='+', vert=1, whis=1.5)
+        plt.setp(bp['boxes'], color='black')
+        plt.setp(bp['whiskers'], color='black')
+        plt.setp(bp['fliers'], color='red', marker='+')
+        xtickNames = plt.setp(ax, xticklabels=legends)
+        plt.setp(xtickNames, rotation=45, fontsize=8)
+        ax.set_ylabel('RMSE')
+        ax.set_xlabel('Distribution')
+        #Make grid and put it below boxes
+        ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
+                alpha=0.5)
+        ax.set_axisbelow(True)
+
+        #Plot boxplots for predictive density
+        fig = plt.figure()
+        ax=fig.add_subplot(111)
+        plt.title('Predictive density')
+        bp = ax.boxplot(pred_density[1:,:].T, notch=0, sym='+', vert=1, whis=1.5)
+        plt.setp(bp['boxes'], color='black')
+        plt.setp(bp['whiskers'], color='black')
+        plt.setp(bp['fliers'], color='red', marker='+')
+        xtickNames = plt.setp(ax, xticklabels=legends[1:])
+        plt.setp(xtickNames, rotation=45, fontsize=8)
+        ax.set_ylabel('Mean Log probability P(Y*|Y)')
+        ax.set_xlabel('Distribution')
+        #Make grid and put it below boxes
+        ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
+                alpha=0.5)
+        ax.set_axisbelow(True)
+    return mstu_t
+
+#def precipitation_example():
+    #import sklearn
+    #from sklearn.cross_validation import KFold
+    #data = datasets.boston_housing()
+    #X = data['X'].copy()
+    #Y = data['Y'].copy()
+    #X = X-X.mean(axis=0)
+    #X = X/X.std(axis=0)
+    #Y = Y-Y.mean()
+    #Y = Y/Y.std()
+    #import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+    #num_folds = 10
+    #kf = KFold(len(Y), n_folds=num_folds, indices=True)
+    #score_folds = np.zeros((4, num_folds))
+    #def rmse(Y, Ystar):
+        #return np.sqrt(np.mean((Y-Ystar)**2))
+    ##for train, test in kf:
+    #for n, (train, test) in enumerate(kf):
+        #X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
+        #print "Fold {}".format(n)
+
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@ -1,7 +1,6 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-
 """
 Gaussian Processes regression examples
 """
@ -9,88 +8,105 @@ import pylab as pb
 import numpy as np
 import GPy

-def coregionalization_toy2(max_iters=100):
+def olympic_marathon_men(optimize=True, plot=True):
+    """Run a standard Gaussian process regression on the Olympic marathon data."""
+    data = GPy.util.datasets.olympic_marathon_men()
+
+    # create simple GP Model
+    m = GPy.models.GPRegression(data['X'], data['Y'])
+
+    # set the lengthscale to be something sensible (defaults to 1)
+    m['rbf_lengthscale'] = 10
+
+    if optimize:
+        m.optimize('bfgs', max_iters=200)
+    if plot:
+        m.plot(plot_limits=(1850, 2050))
+
+    return m
+
+def coregionalization_toy2(optimize=True, plot=True):
    """
    A simple demonstration of coregionalization on two sinusoidal functions.
    """
+    #build a design matrix with a column of integers indicating the output
    X1 = np.random.rand(50, 1) * 8
    X2 = np.random.rand(30, 1) * 5
    index = np.vstack((np.zeros_like(X1), np.ones_like(X2)))
    X = np.hstack((np.vstack((X1, X2)), index))
+
+    #build a suitable set of observed variables
+    #(both outputs are noisy sines; the second one is shifted up by 2)
    Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
    Y2 = np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + 2.
    Y = np.vstack((Y1, Y2))

+    #build the kernel
    k1 = GPy.kern.rbf(1) + GPy.kern.bias(1)
    k2 = GPy.kern.coregionalize(2,1)
-    k = k1**k2 #k = k1.prod(k2,tensor=True)
+    k = k1**k2
    m = GPy.models.GPRegression(X, Y, kernel=k)
    m.constrain_fixed('.*rbf_var', 1.)
-    # m.constrain_positive('.*kappa')
-    m.optimize('sim', messages=1, max_iters=max_iters)

-    pb.figure()
-    Xtest1 = np.hstack((np.linspace(0, 9, 100)[:, None], np.zeros((100, 1))))
-    Xtest2 = np.hstack((np.linspace(0, 9, 100)[:, None], np.ones((100, 1))))
-    mean, var, low, up = m.predict(Xtest1)
-    GPy.util.plot.gpplot(Xtest1[:, 0], mean, low, up)
-    mean, var, low, up = m.predict(Xtest2)
-    GPy.util.plot.gpplot(Xtest2[:, 0], mean, low, up)
-    pb.plot(X1[:, 0], Y1[:, 0], 'rx', mew=2)
-    pb.plot(X2[:, 0], Y2[:, 0], 'gx', mew=2)
+    if optimize:
+        m.optimize('bfgs', max_iters=100)
+
+    if plot:
+        #one plot per output: column 1 of X holds the output index and is
+        #presumably fixed to 0 and then to 1 by fixed_inputs; the second
+        #call draws onto the current axes
+        m.plot(fixed_inputs=[(1,0)])
+        m.plot(fixed_inputs=[(1,1)], ax=pb.gca())
+
    return m

-def coregionalization_toy(max_iters=100):
-    """
-    A simple demonstration of coregionalization on two sinusoidal functions.
-    """
-    X1 = np.random.rand(50, 1) * 8
-    X2 = np.random.rand(30, 1) * 5
-    X = np.vstack((X1, X2))
-    Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
-    Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
-    Y = np.vstack((Y1, Y2))
+#FIXME: Needs recovering once likelihoods are consolidated
+#def coregionalization_toy(optimize=True, plot=True):
+#    """
+#    A simple demonstration of coregionalization on two sinusoidal functions.
+#    """
+#    X1 = np.random.rand(50, 1) * 8
+#    X2 = np.random.rand(30, 1) * 5
+#    X = np.vstack((X1, X2))
+#    Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
+#    Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
+#    Y = np.vstack((Y1, Y2))
+#
+#    k1 = GPy.kern.rbf(1)
+#    m = GPy.models.GPMultioutputRegression(X_list=[X1,X2],Y_list=[Y1,Y2],kernel_list=[k1])
+#    m.constrain_fixed('.*rbf_var', 1.)
+#    m.optimize(max_iters=100)
+#
+#    fig, axes = pb.subplots(2,1)
+#    m.plot(fixed_inputs=[(1,0)],ax=axes[0])
+#    m.plot(fixed_inputs=[(1,1)],ax=axes[1])
+#    axes[0].set_title('Output 0')
+#    axes[1].set_title('Output 1')
+#    return m

-    k1 = GPy.kern.rbf(1)
-    m = GPy.models.GPMultioutputRegression(X_list=[X1,X2],Y_list=[Y1,Y2],kernel_list=[k1])
-    m.constrain_fixed('.*rbf_var', 1.)
-    m.optimize(max_iters=max_iters)
-
-    fig, axes = pb.subplots(2,1)
-    m.plot_single_output(output=0,ax=axes[0])
-    m.plot_single_output(output=1,ax=axes[1])
-    axes[0].set_title('Output 0')
-    axes[1].set_title('Output 1')
-    return m
-
-def coregionalization_sparse(max_iters=100):
+def coregionalization_sparse(optimize=True, plot=True):
    """
    A simple demonstration of coregionalization on two sinusoidal functions using sparse approximations.
    """
-    X1 = np.random.rand(500, 1) * 8
-    X2 = np.random.rand(300, 1) * 5
-    index = np.vstack((np.zeros_like(X1), np.ones_like(X2)))
-    X = np.hstack((np.vstack((X1, X2)), index))
-    Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
-    Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
-    Y = np.vstack((Y1, Y2))
+    #fetch the data from the non sparse examples
+    m = coregionalization_toy2(optimize=False, plot=False)
+    X, Y = m.X, m.likelihood.Y

-    k1 = GPy.kern.rbf(1)
+    #construct a model
+    #NOTE(review): no kernel is passed here, so the coregionalized kernel
+    #built by coregionalization_toy2 is not reused - confirm this is intended
+    m = GPy.models.SparseGPRegression(X,Y)
+    #NOTE(review): the pattern contains \d in a non-raw string literal;
+    #presumably it is matched as a regular expression - confirm
+    m.constrain_fixed('iip_\d+_1') # don't optimize the inducing input indexes

-    m = GPy.models.SparseGPMultioutputRegression(X_list=[X1,X2],Y_list=[Y1,Y2],kernel_list=[k1],num_inducing=5)
-    m.constrain_fixed('.*rbf_var',1.)
-    #m.optimize(messages=1)
-    m.optimize_restarts(5, robust=True, messages=1, max_iters=max_iters, optimizer='bfgs')
+    if optimize:
+        m.optimize('bfgs', max_iters=100, messages=1)
+
+    if plot:
+        #one plot per value (0, then 1) of input column 1; the second call
+        #draws onto the current axes
+        m.plot(fixed_inputs=[(1,0)])
+        m.plot(fixed_inputs=[(1,1)], ax=pb.gca())

-    fig, axes = pb.subplots(2,1)
-    m.plot_single_output(output=0,ax=axes[0],plot_limits=(-1,9))
-    m.plot_single_output(output=1,ax=axes[1],plot_limits=(-1,9))
-    axes[0].set_title('Output 0')
-    axes[1].set_title('Output 1')
    return m

-def epomeo_gpx(max_iters=100):
-    """Perform Gaussian process regression on the latitude and longitude data from the Mount Epomeo runs. Requires gpxpy to be installed on your system to load in the data."""
+def epomeo_gpx(max_iters=200, optimize=True, plot=True):
+    """
+    Perform Gaussian process regression on the latitude and longitude data
+    from the Mount Epomeo runs. Requires gpxpy to be installed on your system
+    to load in the data.
+    """
    data = GPy.util.datasets.epomeo_gpx()
    num_data_list = []
    for Xpart in data['X']:
@ -119,14 +135,16 @@ def epomeo_gpx(max_iters=100):
    m.constrain_fixed('.*rbf_var', 1.)
    m.constrain_fixed('iip')
    m.constrain_bounded('noise_variance', 1e-3, 1e-1)
-#     m.optimize_restarts(5, robust=True, messages=1, max_iters=max_iters, optimizer='bfgs')
    m.optimize(max_iters=max_iters,messages=True)

    return m

-
-def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=10000, max_iters=300):
-    """Show an example of a multimodal error surface for Gaussian process regression. Gene 939 has bimodal behaviour where the noisy mode is higher."""
+def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=10000, max_iters=300, optimize=True, plot=True):
+    """
+    Show an example of a multimodal error surface for Gaussian process
+    regression. Gene 939 has bimodal behaviour where the noisy mode is
+    higher.
+    """

    # Contour over a range of length scales and signal/noise ratios.
    length_scales = np.linspace(0.1, 60., resolution)
@ -139,13 +157,14 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
    data['Y'] = data['Y'] - np.mean(data['Y'])

    lls = GPy.examples.regression._contour_data(data, length_scales, log_SNRs, GPy.kern.rbf)
-    pb.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=pb.cm.jet)  # @UndefinedVariable
-    ax = pb.gca()
-    pb.xlabel('length scale')
-    pb.ylabel('log_10 SNR')
+    if plot:
+        pb.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=pb.cm.jet)
+        ax = pb.gca()
+        pb.xlabel('length scale')
+        pb.ylabel('log_10 SNR')

-    xlim = ax.get_xlim()
-    ylim = ax.get_ylim()
+        xlim = ax.get_xlim()
+        ylim = ax.get_ylim()

    # Now run a few optimizations
    models = []
@ -162,25 +181,31 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
        optim_point_y[0] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);

        # optimize
-        m.optimize('scg', xtol=1e-6, ftol=1e-6, max_iters=max_iters)
+        if optimize:
+            m.optimize('scg', xtol=1e-6, ftol=1e-6, max_iters=max_iters)

        optim_point_x[1] = m['rbf_lengthscale']
        optim_point_y[1] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);

-        pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1] - optim_point_x[0], optim_point_y[1] - optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k')
+        if plot:
+            pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1] - optim_point_x[0], optim_point_y[1] - optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k')
        models.append(m)

-    ax.set_xlim(xlim)
-    ax.set_ylim(ylim)
+    if plot:
+        ax.set_xlim(xlim)
+        ax.set_ylim(ylim)
    return m # (models, lls)

 def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf):
-    """Evaluate the GP objective function for a given data set for a range of signal to noise ratios and a range of lengthscales.
+    """
+    Evaluate the GP objective function for a given data set for a range of
+    signal to noise ratios and a range of lengthscales.

    :data_set: A data set from the utils.datasets director.
    :length_scales: a list of length scales to explore for the contour plot.
    :log_SNRs: a list of base 10 logarithm signal to noise ratios to explore for the contour plot.
-    :kernel: a kernel to use for the 'signal' portion of the data."""
+    :kernel: a kernel to use for the 'signal' portion of the data.
+    """

    lls = []
    total_var = np.var(data['Y'])
@ -203,75 +228,75 @@ def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf):
    return np.array(lls)


-def olympic_100m_men(max_iters=100, kernel=None):
+def olympic_100m_men(optimize=True, plot=True):
    """Run a standard Gaussian process regression on the Rogers and Girolami olympics data."""
+    # optimize: if True, optimize the model with 'bfgs' (max_iters=200)
+    # plot: if True, plot the model over the limits (1850, 2050)
+    # returns the GPRegression model
    data = GPy.util.datasets.olympic_100m_men()

    # create simple GP Model
-    m = GPy.models.GPRegression(data['X'], data['Y'], kernel)
+    m = GPy.models.GPRegression(data['X'], data['Y'])

    # set the lengthscale to be something sensible (defaults to 1)
-    if kernel==None:
-        m['rbf_lengthscale'] = 10
+    m['rbf_lengthscale'] = 10

-    # optimize
-    m.optimize(max_iters=max_iters)
+    if optimize:
+        m.optimize('bfgs', max_iters=200)

-    # plot
-    m.plot(plot_limits=(1850, 2050))
-    print(m)
+    if plot:
+        m.plot(plot_limits=(1850, 2050))
    return m

-def olympic_marathon_men(max_iters=100, kernel=None):
-    """Run a standard Gaussian process regression on the Olympic marathon data."""
-    data = GPy.util.datasets.olympic_marathon_men()
-
-    # create simple GP Model
-    m = GPy.models.GPRegression(data['X'], data['Y'], kernel)
-
-    # set the lengthscale to be something sensible (defaults to 1)
-    if kernel==None:
-        m['rbf_lengthscale'] = 10
-
-    # optimize
-    m.optimize(max_iters=max_iters)
-
-    # plot
-    m.plot(plot_limits=(1850, 2050))
-    print(m)
-    return m
-
-def toy_rbf_1d(optimizer='tnc', max_nb_eval_optim=100):
+def toy_rbf_1d(optimize=True, plot=True):
    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
+    # optimize: if True, optimize the model with 'bfgs'
+    # plot: if True, plot the model
+    # returns the GPRegression model
    data = GPy.util.datasets.toy_rbf_1d()

    # create simple GP Model
    m = GPy.models.GPRegression(data['X'], data['Y'])

-    # optimize
-    m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
-    # plot
-    m.plot()
-    print(m)
+    if optimize:
+        m.optimize('bfgs')
+    if plot:
+        m.plot()
+
    return m

-def toy_rbf_1d_50(max_iters=100, optimize=True):
+def toy_rbf_1d_50(optimize=True, plot=True):
    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
+    # Same steps as toy_rbf_1d, using the toy_rbf_1d_50 data set.
+    # optimize: if True, optimize the model with 'bfgs'
+    # plot: if True, plot the model
+    # returns the GPRegression model
    data = GPy.util.datasets.toy_rbf_1d_50()

    # create simple GP Model
    m = GPy.models.GPRegression(data['X'], data['Y'])

-    # optimize
    if optimize:
-        m.optimize(max_iters=max_iters)
+        m.optimize('bfgs')
+    if plot:
+        m.plot()

-    # plot
-    m.plot()
-    print(m)
    return m

-def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize=True):
+def toy_poisson_rbf_1d_laplace(optimize=True, plot=True):
+    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
+    optimizer='scg'
+    x_len = 30
+    X = np.linspace(0, 10, x_len)[:, None]
+    f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.rbf(1).K(X))
+    Y = np.array([np.random.poisson(np.exp(f)) for f in f_true])[:,None]
+
+    noise_model = GPy.likelihoods.poisson()
+    likelihood = GPy.likelihoods.Laplace(Y,noise_model)
+
+    # create simple GP Model
+    m = GPy.models.GPRegression(X, Y, likelihood=likelihood)
+
+    if optimize:
+        m.optimize(optimizer)
+    if plot:
+        m.plot()
+        # plot the real underlying rate function
+        pb.plot(X, np.exp(f_true), '--k', linewidth=2)
+
+    return m
+
+def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize=True, plot=True):
    # Create an artificial dataset where the values in the targets (Y)
    # only depend in dimensions 1 and 3 of the inputs (X). Run ARD to
    # see if this dependency can be recovered
@ -301,13 +326,16 @@ def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize
    # len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25
    # m.set_prior('.*lengthscale',len_prior)

-    if optimize: m.optimize(optimizer='scg', max_iters=max_iters, messages=1)
+    if optimize:
+        m.optimize(optimizer='scg', max_iters=max_iters, messages=1)

-    m.kern.plot_ARD()
-    print(m)
+    if plot:
+        m.kern.plot_ARD()
+
+    print m
    return m

-def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4):
+def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize=True, plot=True):
    # Create an artificial dataset where the values in the targets (Y)
    # only depend in dimensions 1 and 3 of the inputs (X). Run ARD to
    # see if this dependency can be recovered
@ -338,13 +366,16 @@ def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4):
    # len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25
    # m.set_prior('.*lengthscale',len_prior)

-    m.optimize(optimizer='scg', max_iters=max_iters, messages=1)
+    if optimize:
+        m.optimize(optimizer='scg', max_iters=max_iters, messages=1)

-    m.kern.plot_ARD()
-    print(m)
+    if plot:
+        m.kern.plot_ARD()
+
+    print m
    return m

-def robot_wireless(max_iters=100, kernel=None):
+def robot_wireless(max_iters=100, kernel=None, optimize=True, plot=True):
    """Predict the location of a robot given wirelss signal strength readings."""
    data = GPy.util.datasets.robot_wireless()

@ -352,20 +383,24 @@ def robot_wireless(max_iters=100, kernel=None):
    m = GPy.models.GPRegression(data['Y'], data['X'], kernel=kernel)

    # optimize
-    m.optimize(messages=True, max_iters=max_iters)
+    if optimize:
+        m.optimize(messages=True, max_iters=max_iters)
+
    Xpredict = m.predict(data['Ytest'])[0]
-    pb.plot(data['Xtest'][:, 0], data['Xtest'][:, 1], 'r-')
-    pb.plot(Xpredict[:, 0], Xpredict[:, 1], 'b-')
-    pb.axis('equal')
-    pb.title('WiFi Localization with Gaussian Processes')
-    pb.legend(('True Location', 'Predicted Location'))
+    if plot:
+        pb.plot(data['Xtest'][:, 0], data['Xtest'][:, 1], 'r-')
+        pb.plot(Xpredict[:, 0], Xpredict[:, 1], 'b-')
+        pb.axis('equal')
+        pb.title('WiFi Localization with Gaussian Processes')
+        pb.legend(('True Location', 'Predicted Location'))

    sse = ((data['Xtest'] - Xpredict)**2).sum()
-    print(m)
+
+    print m
    print('Sum of squares error on test data: ' + str(sse))
    return m

-def silhouette(max_iters=100):
+def silhouette(max_iters=100, optimize=True, plot=True):
    """Predict the pose of a figure given a silhouette. This is a task from Agarwal and Triggs 2004 ICML paper."""
    data = GPy.util.datasets.silhouette()

@ -373,12 +408,13 @@ def silhouette(max_iters=100):
    m = GPy.models.GPRegression(data['X'], data['Y'])

    # optimize
-    m.optimize(messages=True, max_iters=max_iters)
+    if optimize:
+        m.optimize(messages=True, max_iters=max_iters)

-    print(m)
+    print m
    return m

-def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, optimize=True, checkgrad=True):
+def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, optimize=True, plot=True):
    """Run a 1D example of a sparse GP regression."""
    # sample inputs and outputs
    X = np.random.uniform(-3., 3., (num_samples, 1))
@ -387,15 +423,17 @@ def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, opti
    rbf = GPy.kern.rbf(1)
    # create simple GP Model
    m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing)
+    m.checkgrad(verbose=1)

-    if checkgrad:
-        m.checkgrad(verbose=1)
    if optimize:
        m.optimize('tnc', messages=1, max_iters=max_iters)
-    m.plot()
+
+    if plot:
+        m.plot()
+
    return m

-def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100):
+def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100, optimize=True, plot=True):
    """Run a 2D example of a sparse GP regression."""
    X = np.random.uniform(-3., 3., (num_samples, 2))
    Y = np.sin(X[:, 0:1]) * np.sin(X[:, 1:2]) + np.random.randn(num_samples, 1) * 0.05
@ -411,13 +449,18 @@ def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100):

    m.checkgrad()

-    # optimize and plot
-    m.optimize('tnc', messages=1, max_iters=max_iters)
-    m.plot()
-    print(m)
+    # optimize
+    if optimize:
+        m.optimize('tnc', messages=1, max_iters=max_iters)
+
+    # plot
+    if plot:
+        m.plot()
+
+    print m
    return m

-def uncertain_inputs_sparse_regression(max_iters=100):
+def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
    """Run a 1D example of a sparse GP regression with uncertain inputs."""
    fig, axes = pb.subplots(1, 2, figsize=(12, 5))

@ -432,18 +475,23 @@ def uncertain_inputs_sparse_regression(max_iters=100):

    # create simple GP Model - no input uncertainty on this one
    m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z)
-    m.optimize('scg', messages=1, max_iters=max_iters)
-    m.plot(ax=axes[0])
-    axes[0].set_title('no input uncertainty')

+    if optimize:
+        m.optimize('scg', messages=1, max_iters=max_iters)
+
+    if plot:
+        m.plot(ax=axes[0])
+        axes[0].set_title('no input uncertainty')
+    print m

    # the same Model with uncertainty
    m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z, X_variance=S)
-    m.optimize('scg', messages=1, max_iters=max_iters)
-    m.plot(ax=axes[1])
-    axes[1].set_title('with input uncertainty')
-    print(m)
-
-    fig.canvas.draw()
+    if optimize:
+        m.optimize('scg', messages=1, max_iters=max_iters)
+    if plot:
+        m.plot(ax=axes[1])
+        axes[1].set_title('with input uncertainty')
+        fig.canvas.draw()

+    print m
    return m
--- a/GPy/examples/stochastic.py
+++ b/GPy/examples/stochastic.py
@ -5,7 +5,7 @@ import pylab as pb
 import numpy as np
 import GPy

-def toy_1d():
+def toy_1d(optimize=True, plot=True):
    N = 2000
    M = 20

@ -20,22 +20,18 @@ def toy_1d():

    m.param_steplength = 1e-4

-    fig = pb.figure()
-    ax = fig.add_subplot(111)
-    def cb():
-        ax.cla()
-        m.plot(ax=ax,Z_height=-3)
-        ax.set_ylim(-3,3)
-        fig.canvas.draw()
+    if plot:
+        fig = pb.figure()
+        ax = fig.add_subplot(111)
+        def cb(foo):
+            ax.cla()
+            m.plot(ax=ax,Z_height=-3)
+            ax.set_ylim(-3,3)
+            fig.canvas.draw()

-    m.optimize(500, callback=cb, callback_interval=1)
+    if optimize:
+        m.optimize(500, callback=cb, callback_interval=1)

-    m.plot_traces()
+    if plot:
+        m.plot_traces()
    return m
-
-
-
-
-
-
-
--- a/GPy/examples/tutorials.py
+++ b/GPy/examples/tutorials.py
@ -11,7 +11,7 @@ pb.ion()
 import numpy as np
 import GPy

-def tuto_GP_regression():
+def tuto_GP_regression(optimize=True, plot=True):
    """The detailed explanations of the commands used in this file can be found in the tutorial section"""

    X = np.random.uniform(-3.,3.,(20,1))
@ -22,7 +22,8 @@ def tuto_GP_regression():
    m = GPy.models.GPRegression(X, Y, kernel)

    print m
-    m.plot()
+    if plot:
+        m.plot()

    m.constrain_positive('')

@ -31,9 +32,9 @@ def tuto_GP_regression():
    m.constrain_bounded('.*lengthscale',1.,10. )
    m.constrain_fixed('.*noise',0.0025)

-    m.optimize()
-
-    m.optimize_restarts(num_restarts = 10)
+    if optimize:
+        m.optimize()
+        m.optimize_restarts(num_restarts = 10)

    #######################################################
    #######################################################
@ -51,22 +52,26 @@ def tuto_GP_regression():
    m.constrain_positive('')

    # optimize and plot
-    m.optimize('tnc', max_f_eval = 1000)
-    m.plot()
-    print(m)
+    if optimize:
+        m.optimize('tnc', max_f_eval = 1000)
+    if plot:
+        m.plot()
+
+    print m
    return(m)

-def tuto_kernel_overview():
+def tuto_kernel_overview(optimize=True, plot=True):
    """The detailed explanations of the commands used in this file can be found in the tutorial section"""
    ker1 = GPy.kern.rbf(1)  # Equivalent to ker1 = GPy.kern.rbf(input_dim=1, variance=1., lengthscale=1.)
    ker2 = GPy.kern.rbf(input_dim=1, variance = .75, lengthscale=2.)
    ker3 = GPy.kern.rbf(1, .5, .5)
-    
+
    print ker2

-    ker1.plot()
-    ker2.plot()
-    ker3.plot()
+    if plot:
+        ker1.plot()
+        ker2.plot()
+        ker3.plot()

    k1 = GPy.kern.rbf(1,1.,2.)
    k2 = GPy.kern.Matern32(1, 0.5, 0.2)
@ -77,8 +82,8 @@ def tuto_kernel_overview():

    # Sum of kernels
    k_add = k1.add(k2)                          # By default, tensor=False
-    k_addtens = k1.add(k2,tensor=True)    
-    
+    k_addtens = k1.add(k2,tensor=True)
+
    k1 = GPy.kern.rbf(1,1.,2)
    k2 = GPy.kern.periodic_Matern52(1,variance=1e3, lengthscale=1, period = 1.5, lower=-5., upper = 5)

@ -102,7 +107,7 @@ def tuto_kernel_overview():
    k.unconstrain('white')
    k.constrain_bounded('white',lower=1e-5,upper=.5)
    print k
-    
+
    k_cst = GPy.kern.bias(1,variance=1.)
    k_mat = GPy.kern.Matern52(1,variance=1., lengthscale=3)
    Kanova = (k_cst + k_mat).prod(k_cst + k_mat,tensor=True)
@ -114,30 +119,32 @@ def tuto_kernel_overview():

    # Create GP regression model
    m = GPy.models.GPRegression(X, Y, Kanova)
-    fig = pb.figure(figsize=(5,5))
-    ax = fig.add_subplot(111)
-    m.plot(ax=ax)
-   
-    pb.figure(figsize=(20,3))
-    pb.subplots_adjust(wspace=0.5)
-    axs = pb.subplot(1,5,1)
-    m.plot(ax=axs)
-    pb.subplot(1,5,2)
-    pb.ylabel("=   ",rotation='horizontal',fontsize='30')
-    axs = pb.subplot(1,5,3)
-    m.plot(ax=axs, which_parts=[False,True,False,False])
-    pb.ylabel("cst          +",rotation='horizontal',fontsize='30')
-    axs = pb.subplot(1,5,4)
-    m.plot(ax=axs, which_parts=[False,False,True,False])
-    pb.ylabel("+   ",rotation='horizontal',fontsize='30')
-    axs = pb.subplot(1,5,5)
-    pb.ylabel("+   ",rotation='horizontal',fontsize='30')
-    m.plot(ax=axs, which_parts=[False,False,False,True])
+
+    if plot:
+        fig = pb.figure(figsize=(5,5))
+        ax = fig.add_subplot(111)
+        m.plot(ax=ax)
+
+        pb.figure(figsize=(20,3))
+        pb.subplots_adjust(wspace=0.5)
+        axs = pb.subplot(1,5,1)
+        m.plot(ax=axs)
+        pb.subplot(1,5,2)
+        pb.ylabel("=   ",rotation='horizontal',fontsize='30')
+        axs = pb.subplot(1,5,3)
+        m.plot(ax=axs, which_parts=[False,True,False,False])
+        pb.ylabel("cst          +",rotation='horizontal',fontsize='30')
+        axs = pb.subplot(1,5,4)
+        m.plot(ax=axs, which_parts=[False,False,True,False])
+        pb.ylabel("+   ",rotation='horizontal',fontsize='30')
+        axs = pb.subplot(1,5,5)
+        pb.ylabel("+   ",rotation='horizontal',fontsize='30')
+        m.plot(ax=axs, which_parts=[False,False,False,True])

    return(m)


-def model_interaction():
+def model_interaction(optimize=True, plot=True):
    X = np.random.randn(20,1)
    Y = np.sin(X) + np.random.randn(*X.shape)*0.01 + 5.
    k = GPy.kern.rbf(1) + GPy.kern.bias(1)
--- a/GPy/gpy_config.cfg
+++ b/GPy/gpy_config.cfg
@ -2,6 +2,12 @@

 [parallel]
 # Enable openmp support. This speeds up some computations, depending on the number
-# of cores available. Setting up a compiler with openmp support can be difficult on 
+# of cores available. Setting up a compiler with openmp support can be difficult on
 # some platforms, hence this option.
 openmp=False
+
+[anaconda]
+# If you have an Anaconda Python installation, specify it here (set MKL to True if the MKL optimizations are installed).
+installed = False
+location = None
+MKL = False
--- a/GPy/inference/latent_function_inference/dtcvar.py
+++ b/GPy/inference/latent_function_inference/dtcvar.py
@ -35,10 +35,9 @@ class DTCVar(object):
    def inference(self, Kmm, Kmn, Knn_diag, likelihood, Y):

        num_inducing, num_data = Kmn.shape
-        const_jitter = np.eye(num_inducing) * self.const_jitter

        #factor Kmm # TODO: cache?
-        _Lm = jitchol(Kmm + _const_jitter)
+        _Lm = jitchol(Kmm)

        # The rather complex computations of A
        if has_uncertain_inputs:
--- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py
+++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py
@ -22,15 +22,15 @@ class ExactGaussianInference(object):

    def get_YYTfactor(self, Y):
        """
-        find a matrix L which satisfies LLT = YYT. 
+        find a matrix L which satisfies LL^T = YY^T.

-        Note that L may have fewer columns than Y.
+        Note that L may have fewer columns than Y; when N > D, L = Y.
        """
        N, D = Y.shape
        if (N>D):
            return Y
        else:
-            #if Y in self.cache, return self.Cache[Y], else stor Y in cache and return L.
+            #if Y in self.cache, return self.Cache[Y], else store Y in cache and return L.
            raise NotImplementedError, 'TODO' #TODO

    def inference(self, K, likelihood, Y, Y_metadata=None):
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@ -292,8 +292,7 @@ except ImportError:
 if sympy_available:
    from parts.sympykern import spkern
    from sympy.parsing.sympy_parser import parse_expr
-    from GPy.util.symbolic import sinc
-    
+
    def rbf_sympy(input_dim, ARD=False, variance=1., lengthscale=1.):
        """
        Radial Basis Function covariance.
@ -337,27 +336,6 @@ if sympy_available:
            f =  scale_i*scale_j*sp.exp(-dist/(2*(shared_lengthscale**2 + lengthscale_i*lengthscale_j)))
        return kern(input_dim, [spkern(input_dim, f, output_dim=output_dim, name='eq_sympy')])

-    def sinc(input_dim, ARD=False, variance=1., lengthscale=1.):
-        """
-        TODO: Not clear why this isn't working, suggests argument of sinc is not a number.
-        sinc covariance funciton
-        """
-        X = sp.symbols('x_:' + str(input_dim))
-        Z = sp.symbols('z_:' + str(input_dim))
-        variance = sp.var('variance',positive=True)
-        if ARD:
-            lengthscales = [sp.var('lengthscale_%i' % i, positive=True) for i in range(input_dim)]
-            dist_string = ' + '.join(['(x_%i-z_%i)**2/lengthscale_%i**2' % (i, i, i) for i in range(input_dim)])
-            dist = parse_expr(dist_string)
-            f =  variance*sinc(sp.pi*sp.sqrt(dist))
-        else:
-            lengthscale = sp.var('lengthscale',positive=True)
-            dist_string = ' + '.join(['(x_%i-z_%i)**2' % (i, i) for i in range(input_dim)])
-            dist = parse_expr(dist_string)
-            f =  variance*sinc(sp.pi*sp.sqrt(dist)/lengthscale)
-            
-        return kern(input_dim, [spkern(input_dim, f, name='sinc')])
-
    def sympykern(input_dim, k=None, output_dim=1, name=None, param=None):
        """
        A base kernel object, where all the hard work in done by sympy.
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@ -44,16 +44,12 @@ class kern(Parameterized):
        for p in self._parameters_:
            assert isinstance(p, Kernpart), "bad kernel part"

-        
-        
-        #Parameterized_old.__init__(self)
-
    def parameters_changed(self):
        [p.parameters_changed() for p in self._parameters_]

    def connect_input(self, Xparam):
        [p.connect_input(Xparam) for p in self._parameters_]
-                
+
    def getstate(self):
        """
        Get the current state of the class,
@ -152,11 +148,11 @@ class kern(Parameterized):
 #         Apply the transformations of the kernel so that the returned vector
 #         represents the gradient in the transformed space (i.e. that given by
 #         get_params_transformed())
-# 
+#
 #         :param g: the gradient vector for the current model, usually created by dK_dtheta
 #         """
 #         x = self._get_params()
-#         [np.place(g, index, g[index] * constraint.gradfactor(x[index])) 
+#         [np.place(g, index, g[index] * constraint.gradfactor(x[index]))
 #          for constraint, index in self.constraints.iteritems() if constraint is not __fixed__]
 # #         for constraint, index in self.constraints.iteritems():
 # #             if constraint != __fixed__:
@ -221,10 +217,10 @@ class kern(Parameterized):
 #             newkern.fixed_indices = self.fixed_indices + [self.num_params + x for x in other.fixed_indices]
 #             newkern.fixed_values = self.fixed_values + other.fixed_values
 #             newkern.tied_indices = self.tied_indices + [self.num_params + x for x in other.tied_indices]
-        [newkern._add_constrain(param, transform, warning=False) 
+        [newkern._add_constrain(param, transform, warning=False)
         for param, transform in itertools.izip(
-                *itertools.chain(self.constraints.iteritems(), 
-                                 other.constraints.iteritems()))] 
+                *itertools.chain(self.constraints.iteritems(),
+                                 other.constraints.iteritems()))]
        newkern._fixes_ = ((self._fixes_ or 0) + (other._fixes_ or 0)) or None

        return newkern
@ -269,7 +265,7 @@ class kern(Parameterized):
        return newkern

 #     def _follow_constrains(self, K1, K2):
-# 
+#
 #         # Build the array that allows to go from the initial indices of the param to the new ones
 #         K1_param = []
 #         n = 0
@ -286,21 +282,21 @@ class kern(Parameterized):
 #             for p2 in K2_param:
 #                 index_param += p1 + p2
 #         index_param = np.array(index_param)
-# 
+#
 #         # Get the ties and constrains of the kernels before the multiplication
 #         prev_ties = K1.tied_indices + [arr + K1.num_params for arr in K2.tied_indices]
-# 
+#
 #         prev_constr_ind = [K1.constrained_indices] + [K1.num_params + i for i in K2.constrained_indices]
 #         prev_constr = K1.constraints + K2.constraints
-# 
+#
 #         # prev_constr_fix = K1.fixed_indices + [arr + K1.num_params for arr in K2.fixed_indices]
 #         # prev_constr_fix_values = K1.fixed_values + K2.fixed_values
-# 
+#
 #         # follow the previous ties
 #         for arr in prev_ties:
 #             for j in arr:
 #                 index_param[np.where(index_param == j)[0]] = arr[0]
-# 
+#
 #         # ties and constrains
 #         for i in range(K1.num_params + K2.num_params):
 #             index = np.where(index_param == i)[0]
@ -308,22 +304,22 @@ class kern(Parameterized):
 #                 self.tie_params(index)
 #         for i, t in zip(prev_constr_ind, prev_constr):
 #             self.constrain(np.where(index_param == i)[0], t)
-# 
+#
 #     def _get_params(self):
 #         return np.hstack(self._parameters_)
 #         return np.hstack([p._get_params() for p in self._parameters_])
-  
+
 #     def _set_params(self, x):
 #         import ipdb;ipdb.set_trace()
 #         [p._set_params(x[s]) for p, s in zip(self._parameters_, self._param_slices_)]
-  
+
 #     def _get_param_names(self):
 #         # this is a bit nasty: we want to distinguish between parts with the same name by appending a count
 #         part_names = np.array([k.name for k in self._parameters_], dtype=np.str)
 #         counts = [np.sum(part_names == ni) for i, ni in enumerate(part_names)]
 #         cum_counts = [np.sum(part_names[i:] == ni) for i, ni in enumerate(part_names)]
 #         names = [name + '_' + str(cum_count) if count > 1 else name for name, count, cum_count in zip(part_names, counts, cum_counts)]
-#  
+#
 #         return sum([[name + '_' + n for n in k._get_param_names()] for name, k in zip(names, self._parameters_)], [])

    def K(self, X, X2=None, which_parts='all'):
@ -349,6 +345,13 @@ class kern(Parameterized):
            [p.K(X[:, i_s], X2[:, i_s], target=target) for p, i_s, part_i_used in zip(self._parameters_, self.input_slices, which_parts) if part_i_used]
        return target

+    def update_gradients_full(self, dL_dK, X):
+        pass
+    def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z):
+        pass
+    def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z):
+        pass
+
    def dK_dtheta(self, dL_dK, X, X2=None):
        """
        Compute the gradient of the covariance function with respect to the parameters.
@ -382,7 +385,7 @@ class kern(Parameterized):
        :type X2: np.ndarray (num_inducing x input_dim)"""

        target = np.zeros_like(X)
-        if X2 is None: 
+        if X2 is None:
            [p.dK_dX(dL_dK, X[:, i_s], None, target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
        else:
            [p.dK_dX(dL_dK, X[:, i_s], X2[:, i_s], target[:, i_s]) for p, i_s in zip(self._parameters_, self.input_slices)]
@ -450,7 +453,7 @@ class kern(Parameterized):
    def psi2(self, Z, mu, S):
        """
        Computer the psi2 statistics for the covariance function.
-        
+
        :param Z: np.ndarray of inducing inputs (num_inducing x input_dim)
        :param mu, S: np.ndarrays of means and variances (each num_samples x input_dim)
        :returns psi2: np.ndarray (num_samples,num_inducing,num_inducing)
@ -470,7 +473,7 @@ class kern(Parameterized):
                p1.psi1(Z[:, i_s1], mu[:, i_s1], S[:, i_s1], tmp1)
                tmp2 = np.zeros((mu.shape[0], Z.shape[0]))
                p2.psi1(Z[:, i_s2], mu[:, i_s2], S[:, i_s2], tmp2)
-    
+
                prod = np.multiply(tmp1, tmp2)
                crossterms += prod[:, :, None] + prod[:, None, :]

@ -598,7 +601,7 @@ class Kern_check_model(Model):
                dL_dK = np.ones((X.shape[0], X.shape[0]))
            else:
                dL_dK = np.ones((X.shape[0], X2.shape[0]))
-        
+
        self.kernel=kernel
        self.X = X
        self.X2 = X2
@ -613,13 +616,13 @@ class Kern_check_model(Model):
            return False
        else:
            return True
-        
+
    def _get_params(self):
        return self.kernel._get_params()
- 
+
    def _get_param_names(self):
        return self.kernel._get_param_names()
- 
+
    def _set_params(self, x):
        self.kernel._set_params(x)

@ -628,7 +631,7 @@ class Kern_check_model(Model):

    def _log_likelihood_gradients(self):
        raise NotImplementedError, "This needs to be implemented to use the kern_check_model class."
-    
+
 class Kern_check_dK_dtheta(Kern_check_model):
    """This class allows gradient checks for the gradient of a kernel with respect to parameters. """
    def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
@ -643,7 +646,7 @@ class Kern_check_dKdiag_dtheta(Kern_check_model):
        Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None)
        if dL_dK==None:
            self.dL_dK = np.ones((self.X.shape[0]))
-        
+
    def log_likelihood(self):
        return (self.dL_dK*self.kernel.Kdiag(self.X)).sum()

@ -660,7 +663,7 @@ class Kern_check_dK_dX(Kern_check_model):

    def _get_param_names(self):
        return ['X_'  +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
-                
+
    def _get_params(self):
        return self.X.flatten()

@ -682,7 +685,7 @@ class Kern_check_dKdiag_dX(Kern_check_model):

    def _get_param_names(self):
        return ['X_'  +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
-                
+
    def _get_params(self):
        return self.X.flatten()

@ -690,7 +693,10 @@ class Kern_check_dKdiag_dX(Kern_check_model):
        self.X=x.reshape(self.X.shape)

 def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False):
-    """This function runs on kernels to check the correctness of their implementation. It checks that the covariance function is positive definite for a randomly generated data set.
+    """
+    This function runs on kernels to check the correctness of their
+    implementation. It checks that the covariance function is positive definite
+    for a randomly generated data set.

    :param kern: the kernel to be tested.
    :type kern: GPy.kern.Kernpart
--- a/GPy/likelihoods/gaussian.py
+++ b/GPy/likelihoods/gaussian.py
@ -54,7 +54,7 @@ class Gaussian(Likelihood):

    def _gradients(self, partial):
        """
-        Return the derivative of the log marginal likelihood wrt self.variance, 
+        Return the derivative of the log marginal likelihood wrt self.variance,
        given the appropriate partial derivative
        """
        return np.sum(partial)
@ -82,9 +82,13 @@ class Gaussian(Likelihood):

    def predictive_values(self, mu, var, full_cov=False):
        if full_cov:
-            low, up = mu - np.diag(var)[:,None], mu + np.diag(var)[:,None]
+            var += np.eye(var.shape[0])*self.variance
+            d = 2*np.sqrt(np.diag(var))
+            low, up = mu - d, mu + d
        else:
-            low, up = mu - var, mu + var
+            var += self.variance
+            d = 2*np.sqrt(var)
+            low, up = mu - d, mu + d
        return mu, var, low, up

    def predictive_mean(self, mu, sigma):
--- a/GPy/mappings/kernel.py
+++ b/GPy/mappings/kernel.py
@ -57,4 +57,4 @@ class Kernel(Mapping):
        return np.hstack((self._df_dA.flatten(), self._df_dbias))

    def df_dX(self, dL_df, X):
-        return self.kern.dK_dX((dL_df[:, None, :]*self.A[None, :, :]).sum(2), X, self.X) 
+        return self.kern.dK_dX((dL_df[:, None, :]*self.A[None, :, :]).sum(2), X, self.X)
--- a/GPy/models/init.py
+++ b/GPy/models/init.py
@ -6,7 +6,6 @@ from gp_classification import GPClassification
 from sparse_gp_regression import SparseGPRegression
 from svigp_regression import SVIGPRegression
 from sparse_gp_classification import SparseGPClassification
-from fitc_classification import FITCClassification
 from gplvm import GPLVM
 from bcgplvm import BCGPLVM
 from sparse_gplvm import SparseGPLVM
--- a/GPy/models/fitc_classification.py
+++ b/GPy/models/fitc_classification.py
@ -1,47 +0,0 @@
-# Copyright (c) 2013, Ricardo Andrade
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-from ..core import FITC
-from .. import likelihoods
-from .. import kern
-from ..likelihoods import likelihood
-
-class FITCClassification(FITC):
-    """
-    FITC approximation for classification
-
-    This is a thin wrapper around the FITC class, with a set of sensible defaults
-
-    :param X: input observations
-    :param Y: observed values
-    :param likelihood: a GPy likelihood, defaults to Binomial with probit link function
-    :param kernel: a GPy kernel, defaults to rbf+white
-    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
-    :type normalize_X: False|True
-    :param normalize_Y:  whether to normalize the input data before computing (predictions will be in original scales)
-    :type normalize_Y: False|True
-    :rtype: model object
-
-    """
-
-    def __init__(self, X, Y=None, likelihood=None, kernel=None, normalize_X=False, normalize_Y=False, Z=None, num_inducing=10):
-        if kernel is None:
-            kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1],1e-3)
-
-        if likelihood is None:
-            noise_model = likelihoods.binomial()
-            likelihood = likelihoods.EP(Y, noise_model)
-        elif Y is not None:
-            if not all(Y.flatten() == likelihood.data.flatten()):
-                raise Warning, 'likelihood.data and Y are different.'
-
-        if Z is None:
-            i = np.random.permutation(X.shape[0])[:num_inducing]
-            Z = X[i].copy()
-        else:
-            assert Z.shape[1]==X.shape[1]
-
-        FITC.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X)
-        self.ensure_default_constraints()
--- a/GPy/testing/bgplvm_tests.py
+++ b/GPy/testing/bgplvm_tests.py
@ -4,7 +4,7 @@
 import unittest
 import numpy as np
 import GPy
-from GPy.models.bayesian_gplvm import BayesianGPLVM
+from ..models import BayesianGPLVM

 class BGPLVMTests(unittest.TestCase):
    def test_bias_kern(self):
--- a/GPy/testing/examples_tests.py
+++ b/GPy/testing/examples_tests.py
@ -10,6 +10,7 @@ import os
 import random
 from nose.tools import nottest
 import sys
+import itertools

 class ExamplesTests(unittest.TestCase):
    def _checkgrad(self, Model):
@ -18,29 +19,27 @@ class ExamplesTests(unittest.TestCase):
    def _model_instance(self, Model):
        self.assertTrue(isinstance(Model, GPy.models))

-"""
-def model_instance_generator(model):
-    def check_model_returned(self):
-        self._model_instance(model)
-    return check_model_returned
-
-def checkgrads_generator(model):
-    def model_checkgrads(self):
-        self._checkgrad(model)
-    return model_checkgrads
-"""
-
 def model_checkgrads(model):
    model.randomize()
-    #assert model.checkgrad()
-    return model.checkgrad()
+    # NOTE: a step of 1e-4 should be acceptable for more peaked models
+    return model.checkgrad(step=1e-4)

 def model_instance(model):
-    #assert isinstance(model, GPy.core.model)
-    return isinstance(model, GPy.core.model)
+    return isinstance(model, GPy.core.model.Model)
+
+def flatten_nested(lst):
+    result = []
+    for element in lst:
+        if hasattr(element, '__iter__'):
+            result.extend(flatten_nested(element))
+        else:
+            result.append(element)
+    return result

@nottest
 def test_models():
+    optimize=False
+    plot=True
    examples_path = os.path.dirname(GPy.examples.__file__)
    # Load modules
    failing_models = {}
@ -54,29 +53,36 @@ def test_models():
        print "After"
        print functions
        for example in functions:
-            if example[0] in ['oil', 'silhouette', 'GPLVM_oil_100']:
-                print "SKIPPING"
-                continue
+            if example[0] in ['epomeo_gpx']:
+                #These are the edge cases that we might want to handle specially
+                if example[0] == 'epomeo_gpx' and not GPy.util.datasets.gpxpy_available:
+                    print "Skipping as gpxpy is not available to parse GPS"
+                    continue

            print "Testing example: ", example[0]
            # Generate model
+
            try:
-                model = example[1]()
+                models = [ example[1](optimize=optimize, plot=plot) ]
+                #If more than one model returned, flatten them
+                models = flatten_nested(models)
            except Exception as e:
                failing_models[example[0]] = "Cannot make model: \n{e}".format(e=e)
            else:
-                print model
+                print models
                model_checkgrads.description = 'test_checkgrads_%s' % example[0]
                try:
-                    if not model_checkgrads(model):
-                        failing_models[model_checkgrads.description] = False
+                    for model in models:
+                        if not model_checkgrads(model):
+                            failing_models[model_checkgrads.description] = False
                except Exception as e:
                    failing_models[model_checkgrads.description] = e

                model_instance.description = 'test_instance_%s' % example[0]
                try:
-                    if not model_instance(model):
-                        failing_models[model_instance.description] = False
+                    for model in models:
+                        if not model_instance(model):
+                            failing_models[model_instance.description] = False
                except Exception as e:
                    failing_models[model_instance.description] = e

--- a/GPy/testing/gp_transformation_tests.py
+++ b/GPy/testing/gp_transformation_tests.py
@ -0,0 +1,61 @@
+from nose.tools import with_setup
+from GPy.models import GradientChecker
+from GPy.likelihoods.noise_models import gp_transformations
+import inspect
+import unittest
+import numpy as np
+
+class TestTransformations(object):
+    """
+    Generic transformations checker
+    """
+    def setUp(self):
+        N = 30
+        self.fs = [np.random.rand(N, 1), float(np.random.rand(1))]
+
+
+    def tearDown(self):
+        self.fs = None
+
+    def test_transformations(self):
+        self.setUp()
+        transformations = [gp_transformations.Identity(),
+                           gp_transformations.Log(),
+                           gp_transformations.Probit(),
+                           gp_transformations.Log_ex_1(),
+                           gp_transformations.Reciprocal(),
+                           ]
+
+        for transformation in transformations:
+            for f in self.fs:
+                yield self.t_dtransf_df, transformation, f
+                yield self.t_d2transf_df2, transformation, f
+                yield self.t_d3transf_df3, transformation, f
+
+    @with_setup(setUp, tearDown)
+    def t_dtransf_df(self, transformation, f):
+        print "\n{}".format(inspect.stack()[0][3])
+        grad = GradientChecker(transformation.transf, transformation.dtransf_df, f, 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d2transf_df2(self, transformation, f):
+        print "\n{}".format(inspect.stack()[0][3])
+        grad = GradientChecker(transformation.dtransf_df, transformation.d2transf_df2, f, 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d3transf_df3(self, transformation, f):
+        print "\n{}".format(inspect.stack()[0][3])
+        grad = GradientChecker(transformation.d2transf_df2, transformation.d3transf_df3, f, 'f')
+        grad.randomize()
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+#if __name__ == "__main__":
+    #print "Running unit tests"
+    #unittest.main()
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@ -17,16 +17,11 @@ except ImportError:
 class KernelTests(unittest.TestCase):
    def test_kerneltie(self):
        K = GPy.kern.rbf(5, ARD=True)
-        K.rbf.lengthscale[0].tie_to(K.rbf.lengthscale[2])
-        K.rbf.lengthscale[1].tie_to(K.rbf.lengthscale[3])
-        K.rbf.lengthscale[2].constrain_fixed()
+        K.tie_params('.*[01]')
+        K.constrain_fixed('2')
        X = np.random.rand(5,5)
        Y = np.ones((5,1))
        m = GPy.models.GPRegression(X,Y,K)
-        #self.assertRaises(RuntimeError, lambda: m.kern.rbf.lengthscale[3].tie_to(m.kern.rbf.lengthscale[1]))
-        #self.assertRaises(RuntimeError, lambda: m.kern.rbf.lengthscale[3].tie_to(m.kern.rbf.lengthscale[0]))
-        #self.assertRaises(RuntimeError, lambda: m.kern.rbf.lengthscale.tie_to(m.kern.rbf.lengthscale))
-        import ipdb;ipdb.set_trace()
        self.assertTrue(m.checkgrad())

    def test_rbfkernel(self):
@ -39,12 +34,14 @@ class KernelTests(unittest.TestCase):
            self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))

    def test_eq_sympykernel(self):
-        kern = GPy.kern.eq_sympy(5, 3, output_ind=4)
-        self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
+        if SYMPY_AVAILABLE:
+            kern = GPy.kern.eq_sympy(5, 3)
+            self.assertTrue(GPy.kern.kern_test(kern, output_ind=4, verbose=verbose))

-    def test_sinckernel(self):
-        kern = GPy.kern.sinc(5)
-        self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
+    def test_ode1_eqkernel(self):
+        if SYMPY_AVAILABLE:
+            kern = GPy.kern.ode1_eq(3)
+            self.assertTrue(GPy.kern.kern_test(kern, output_ind=1, verbose=verbose, X_positive=True))

    def test_rbf_invkernel(self):
        kern = GPy.kern.rbf_inv(5)
@ -83,7 +80,7 @@ class KernelTests(unittest.TestCase):
        self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))

    def test_heterokernel(self):
-        kern = GPy.kern.hetero(5, mapping=GPy.mappings.Linear(5, 1), transform=GPy.core.transformations.Logexp())
+        kern = GPy.kern.hetero(5, mapping=GPy.mappings.Linear(5, 1), transform=GPy.core.transformations.logexp())
        self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))

    def test_mlpkernel(self):
@ -120,15 +117,5 @@ class KernelTests(unittest.TestCase):


 if __name__ == "__main__":
-#     K = GPy.kern.rbf(5, ARD=True)
-#     K.rbf.lengthscale[0].tie_to(K.rbf.lengthscale[2])
-#     K.rbf.lengthscale[1].tie_to(K.rbf.lengthscale[3])
-#     K.rbf.lengthscale[2].constrain_fixed()
-#     
-#     K.rbf.lengthscale[2:].tie_to(K.rbf.variance)
-#     X = np.random.rand(5,5)
-#     Y = np.ones((5,1))
-#     m = GPy.models.GPRegression(X,Y,K)
-    
    print "Running unit tests, please be (very) patient..."
    unittest.main()
--- a/GPy/testing/likelihood_tests.py
+++ b/GPy/testing/likelihood_tests.py
@ -0,0 +1,686 @@
+import numpy as np
+import unittest
+import GPy
+from GPy.models import GradientChecker
+import functools
+import inspect
+from GPy.likelihoods.noise_models import gp_transformations
+from functools import partial
+#np.random.seed(300)
+np.random.seed(7)
+
+def dparam_partial(inst_func, *args):
+    """
+    Wrap a bound method so it can be gradient-checked with respect to a
+    parameter of its owning object rather than one of its own arguments.
+
+    The returned callable takes a single ``param`` value, pushes it into the
+    method's owner via ``_set_params`` and then calls the method with the
+    fixed ``args``.
+
+    :param inst_func: bound method of the object whose parameters are varied
+    :param args: fixed positional arguments forwarded to ``inst_func`` on
+                 every call (for example the f and Y held constant while the
+                 parameter is tweaked)
+    :returns: a ``functools.partial`` whose only remaining positional
+              argument is the parameter value
+    """
+    def param_func(param, inst_func, args):
+        # im_self is the instance a (Python 2) bound method is attached to;
+        # its parameters are overwritten before the method is evaluated.
+        inst_func.im_self._set_params(param)
+        return inst_func(*args)
+    return functools.partial(param_func, inst_func=inst_func, args=args)
+
+def dparam_checkgrad(func, dfunc, params, args, constraints=None, randomize=False, verbose=False):
+    """
+    Gradient-check ``dfunc`` against ``func`` with respect to a parameter of
+    the object both bound methods belong to, one output element at a time.
+
+    checkgrad expects a f: R^N -> R^1 and df: R^N -> R^N
+    However if we are holding other parameters fixed and moving something else
+    We need to check the gradient of each of the fixed parameters
+    (f and y for example) separately,  whilst moving another parameter.
+    Otherwise f: gives back R^N and
+              df: gives back R^NxM where M is
+    The number of parameters and N is the number of data
+    Need to take a slice out from f and a slice out of df
+
+    :param func: bound method evaluated after the parameter has been set
+    :param dfunc: bound method expected to return the derivative of ``func``
+                  with respect to the parameter
+    :param params: iterable of parameter values; each one is checked in turn
+    :param args: positional arguments passed unchanged to ``func`` and ``dfunc``
+    :param constraints: optional callables invoked as ``constraint('p', grad)``
+                        on every checker that is built
+    :param randomize: if True, call ``randomize()`` on each checker first
+    :param verbose: if True, print each checker and run a verbose checkgrad
+                    before the asserting one
+    :returns: True only if ``checkgrad()`` succeeded for every element checked
+    """
+    #print "\n{} likelihood: {} vs {}".format(func.im_self.__class__.__name__,
+                                           #func.__name__, dfunc.__name__)
+    partial_f = dparam_partial(func, *args)
+    partial_df = dparam_partial(dfunc, *args)
+    gradchecking = True
+    for param in params:
+        # Evaluate both once just to learn how many elements each returns.
+        fnum = np.atleast_1d(partial_f(param)).shape[0]
+        dfnum = np.atleast_1d(partial_df(param)).shape[0]
+        for fixed_val in range(dfnum):
+            #dlik and dlik_dvar gives back 1 value for each
+            # Clamp to the last element of f when df has more entries than f.
+            f_ind = min(fnum, fixed_val+1) - 1
+            # NOTE(review): this debug print runs regardless of `verbose`.
+            print "fnum: {} dfnum: {} f_ind: {} fixed_val: {}".format(fnum, dfnum, f_ind, fixed_val)
+            #Make grad checker with this param moving, note that set_params is NOT being called
+            #The parameter is being set directly with __setattr__
+            # NOTE(review): the two lines above disagree with dparam_partial,
+            # which does call _set_params on every evaluation - confirm intent.
+            grad = GradientChecker(lambda x: np.atleast_1d(partial_f(x))[f_ind],
+                                   lambda x : np.atleast_1d(partial_df(x))[fixed_val],
+                                   param, 'p')
+            #This is not general for more than one param...
+            if constraints is not None:
+                for constraint in constraints:
+                    constraint('p', grad)
+            if randomize:
+                grad.randomize()
+            if verbose:
+                print grad
+                grad.checkgrad(verbose=1)
+            # A single failure marks the whole run as failed, but the loop
+            # keeps going so every element is still exercised.
+            if not grad.checkgrad():
+                gradchecking = False
+
+    return gradchecking
+
+
+from nose.tools import with_setup
+class TestNoiseModels(object):
+    """
+    Generic model checker
+    """
+    def setUp(self):
+        """
+        Build the random inputs, latent values and target vectors shared by
+        the generated noise-model checks.
+        """
+        self.N = 5
+        self.D = 3
+        self.X = np.random.rand(self.N, self.D)*10
+
+        self.real_std = 0.1
+        noise = np.random.randn(*self.X[:, 0].shape)*self.real_std
+        # Real-valued targets: noisy sine of the first input column, as a column vector.
+        self.Y = (np.sin(self.X[:, 0]*2*np.pi) + noise)[:, None]
+        self.f = np.random.rand(self.N, 1)
+        # Alternative targets: 0/1 labels, strictly positive values and
+        # non-negative integer-valued values.
+        self.binary_Y = np.asarray(np.random.rand(self.N) > 0.5, dtype=np.int)[:, None]
+        self.positive_Y = np.exp(self.Y.copy())
+        tmp = np.round(self.X[:, 0]*3-3)[:, None] + np.random.randint(0,3, self.X.shape[0])[:, None]
+        self.integer_Y = np.where(tmp > 0, tmp, 0)
+
+        # NOTE(review): this value is overwritten two lines below and never used.
+        self.var = 0.2
+
+        self.var = np.random.rand(1)
+
+        #Make a bigger step as lower bound can be quite curved
+        self.step = 1e-3
+
+    def tearDown(self):
+        """Drop the references to the data arrays created in setUp."""
+        self.Y = None
+        self.f = None
+        self.X = None
+
+    def test_noise_models(self):
+        """
+        Generator test: for every configured noise model, yield one
+        ``(check_method, args...)`` tuple per derivative / inference check.
+
+        setUp and tearDown are called by hand here; tearDown only runs once
+        the generator has been exhausted.
+        """
+        self.setUp()
+
+        ####################################################
+        # Constraint wrappers so we can just list them off #
+        ####################################################
+        def constrain_negative(regex, model):
+            model.constrain_negative(regex)
+
+        def constrain_positive(regex, model):
+            model.constrain_positive(regex)
+
+        def constrain_bounded(regex, model, lower, upper):
+            """
+            Used like: partial(constrain_bounded, lower=0, upper=1)
+            """
+            model.constrain_bounded(regex, lower, upper)
+
+        # NOTE(review): the schema description below is out of step with the
+        # code: the loop reads the key "constraints" (not "constrain"), also
+        # honours optional "Y" and "f" entries, and the "ep" flag refers to
+        # EP rather than Laplace.
+        """
+        Dictionary where we nest models we would like to check
+            Name: {
+                "model": model_instance,
+                "grad_params": {
+                    "names": [names_of_params_we_want, to_grad_check],
+                    "vals": [values_of_params, to_start_at],
+                    "constrain": [constraint_wrappers, listed_here]
+                    },
+                "laplace": boolean_of_whether_model_should_work_for_laplace,
+                "ep": boolean_of_whether_model_should_work_for_laplace,
+                "link_f_constraints": [constraint_wrappers, listed_here]
+                }
+        """
+        noise_models = {"Student_t_default": {
+                            "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [self.var],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Student_t_1_var": {
+                            "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [1.0],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Student_t_small_var": {
+                            "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [0.01],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Student_t_large_var": {
+                            "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [10.0],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Student_t_approx_gauss": {
+                            "model": GPy.likelihoods.student_t(deg_free=1000, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [self.var],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Student_t_log": {
+                            "model": GPy.likelihoods.student_t(gp_link=gp_transformations.Log(), deg_free=5, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [self.var],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
+                        "Gaussian_default": {
+                            "model": GPy.likelihoods.gaussian(variance=self.var, D=self.D, N=self.N),
+                            "grad_params": {
+                                "names": ["noise_model_variance"],
+                                "vals": [self.var],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True,
+                            "ep": True
+                            },
+                        #"Gaussian_log": {
+                            #"model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Log(), variance=self.var, D=self.D, N=self.N),
+                            #"grad_params": {
+                                #"names": ["noise_model_variance"],
+                                #"vals": [self.var],
+                                #"constraints": [constrain_positive]
+                                #},
+                            #"laplace": True
+                            #},
+                        #"Gaussian_probit": {
+                            #"model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Probit(), variance=self.var, D=self.D, N=self.N),
+                            #"grad_params": {
+                                #"names": ["noise_model_variance"],
+                                #"vals": [self.var],
+                                #"constraints": [constrain_positive]
+                                #},
+                            #"laplace": True
+                            #},
+                        #"Gaussian_log_ex": {
+                            #"model": GPy.likelihoods.gaussian(gp_link=gp_transformations.Log_ex_1(), variance=self.var, D=self.D, N=self.N),
+                            #"grad_params": {
+                                #"names": ["noise_model_variance"],
+                                #"vals": [self.var],
+                                #"constraints": [constrain_positive]
+                                #},
+                            #"laplace": True
+                            #},
+                        "Bernoulli_default": {
+                            "model": GPy.likelihoods.bernoulli(),
+                            "link_f_constraints": [partial(constrain_bounded, lower=0, upper=1)],
+                            "laplace": True,
+                            "Y": self.binary_Y,
+                            "ep": True
+                            },
+                        "Exponential_default": {
+                            "model": GPy.likelihoods.exponential(),
+                            "link_f_constraints": [constrain_positive],
+                            "Y": self.positive_Y,
+                            "laplace": True,
+                        },
+                        "Poisson_default": {
+                            "model": GPy.likelihoods.poisson(),
+                            "link_f_constraints": [constrain_positive],
+                            "Y": self.integer_Y,
+                            "laplace": True,
+                            "ep": False #Should work though...
+                        },
+                        "Gamma_default": {
+                            "model": GPy.likelihoods.gamma(),
+                            "link_f_constraints": [constrain_positive],
+                            "Y": self.positive_Y,
+                            "laplace": True
+                        }
+                    }
+
+        for name, attributes in noise_models.iteritems():
+            model = attributes["model"]
+            # Unpack the optional entries, falling back to empty lists, the
+            # shared self.Y / self.f, and False for the inference flags.
+            if "grad_params" in attributes:
+                params = attributes["grad_params"]
+                param_vals = params["vals"]
+                param_names= params["names"]
+                param_constraints = params["constraints"]
+            else:
+                params = []
+                param_vals = []
+                param_names = []
+                # NOTE(review): this rebinds the local helper function
+                # constrain_positive to a list. The dict above already holds
+                # references to the function, so later entries still work, but
+                # the assignment looks unintended.
+                constrain_positive = []
+                param_constraints = [] # ??? TODO: Saul to Fix.
+            if "link_f_constraints" in attributes:
+                link_f_constraints = attributes["link_f_constraints"]
+            else:
+                link_f_constraints = []
+            if "Y" in attributes:
+                Y = attributes["Y"].copy()
+            else:
+                Y = self.Y.copy()
+            if "f" in attributes:
+                f = attributes["f"].copy()
+            else:
+                f = self.f.copy()
+            if "laplace" in attributes:
+                laplace = attributes["laplace"]
+            else:
+                laplace = False
+            if "ep" in attributes:
+                ep = attributes["ep"]
+            else:
+                ep = False
+
+            if len(param_vals) > 1:
+                raise NotImplementedError("Cannot support multiple params in likelihood yet!")
+
+            #Required by all
+            #Normal derivatives
+            yield self.t_logpdf, model, Y, f
+            yield self.t_dlogpdf_df, model, Y, f
+            yield self.t_d2logpdf_df2, model, Y, f
+            #Link derivatives
+            yield self.t_dlogpdf_dlink, model, Y, f, link_f_constraints
+            yield self.t_d2logpdf_dlink2, model, Y, f, link_f_constraints
+            if laplace:
+                #Laplace only derivatives
+                yield self.t_d3logpdf_df3, model, Y, f
+                yield self.t_d3logpdf_dlink3, model, Y, f, link_f_constraints
+                #Params
+                yield self.t_dlogpdf_dparams, model, Y, f, param_vals, param_constraints
+                yield self.t_dlogpdf_df_dparams, model, Y, f, param_vals, param_constraints
+                yield self.t_d2logpdf2_df2_dparams, model, Y, f, param_vals, param_constraints
+                #Link params
+                yield self.t_dlogpdf_link_dparams, model, Y, f, param_vals, param_constraints
+                yield self.t_dlogpdf_dlink_dparams, model, Y, f, param_vals, param_constraints
+                yield self.t_d2logpdf2_dlink2_dparams, model, Y, f, param_vals, param_constraints
+
+                #laplace likelihood gradcheck
+                yield self.t_laplace_fit_rbf_white, model, self.X, Y, f, self.step, param_vals, param_names, param_constraints
+            if ep:
+                #ep likelihood gradcheck
+                yield self.t_ep_fit_rbf_white, model, self.X, Y, f, self.step, param_vals, param_names, param_constraints
+
+
+        self.tearDown()
+
+    #############
+    # dpdf_df's #
+    #############
+    @with_setup(setUp, tearDown)
+    def t_logpdf(self, model, Y, f):
+        """Check that ``model.pdf(f, Y)`` equals ``exp(model.logpdf(f, Y))``."""
+        # inspect.stack()[0][3] is the name of the currently running function.
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        print model._get_params()
+        np.testing.assert_almost_equal(
+                               model.pdf(f.copy(), Y.copy()),
+                               np.exp(model.logpdf(f.copy(), Y.copy()))
+                               )
+
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_df(self, model, Y, f):
+        """
+        Gradient-check ``model.dlogpdf_df`` against ``model.logpdf`` as
+        functions of f, with y fixed to Y.
+        """
+        print "\n{}".format(inspect.stack()[0][3])
+        self.description = "\n{}".format(inspect.stack()[0][3])
+        logpdf = functools.partial(model.logpdf, y=Y)
+        dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y)
+        grad = GradientChecker(logpdf, dlogpdf_df, f.copy(), 'g')
+        grad.randomize()
+        # First call is only for its printed report; the assert below re-runs it.
+        grad.checkgrad(verbose=1)
+        print model
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d2logpdf_df2(self, model, Y, f):
+        """
+        Gradient-check ``model.d2logpdf_df2`` against ``model.dlogpdf_df`` as
+        functions of f, with y fixed to Y.
+        """
+        print "\n{}".format(inspect.stack()[0][3])
+        dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y)
+        d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y)
+        grad = GradientChecker(dlogpdf_df, d2logpdf_df2, f.copy(), 'g')
+        grad.randomize()
+        # First call is only for its printed report; the assert below re-runs it.
+        grad.checkgrad(verbose=1)
+        print model
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d3logpdf_df3(self, model, Y, f):
+        """
+        Gradient-check ``model.d3logpdf_df3`` against ``model.d2logpdf_df2``
+        as functions of f, with y fixed to Y.
+        """
+        print "\n{}".format(inspect.stack()[0][3])
+        d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y)
+        d3logpdf_df3 = functools.partial(model.d3logpdf_df3, y=Y)
+        grad = GradientChecker(d2logpdf_df2, d3logpdf_df3, f.copy(), 'g')
+        grad.randomize()
+        # First call is only for its printed report; the assert below re-runs it.
+        grad.checkgrad(verbose=1)
+        print model
+        assert grad.checkgrad()
+
+    ##############
+    # df_dparams #
+    ##############
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_dparams(self, model, Y, f, params, param_constraints):
+        """
+        Check ``model.dlogpdf_dtheta`` against ``model.logpdf`` with respect
+        to the likelihood parameter, holding f and Y fixed.
+        """
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        assert (
+                dparam_checkgrad(model.logpdf, model.dlogpdf_dtheta,
+                    params, args=(f, Y), constraints=param_constraints,
+                    randomize=True, verbose=True)
+                )
+
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_df_dparams(self, model, Y, f, params, param_constraints):
+        """
+        Check ``model.dlogpdf_df_dtheta`` against ``model.dlogpdf_df`` with
+        respect to the likelihood parameter, holding f and Y fixed.
+        """
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        assert (
+                dparam_checkgrad(model.dlogpdf_df, model.dlogpdf_df_dtheta,
+                    params, args=(f, Y), constraints=param_constraints,
+                    randomize=True, verbose=True)
+                )
+
+    @with_setup(setUp, tearDown)
+    def t_d2logpdf2_df2_dparams(self, model, Y, f, params, param_constraints):
+        """
+        Check ``model.d2logpdf_df2_dtheta`` against ``model.d2logpdf_df2``
+        with respect to the likelihood parameter, holding f and Y fixed.
+        """
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        assert (
+                dparam_checkgrad(model.d2logpdf_df2, model.d2logpdf_df2_dtheta,
+                    params, args=(f, Y), constraints=param_constraints,
+                    randomize=True, verbose=True)
+                )
+
+    ################
+    # dpdf_dlink's #
+    ################
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_dlink(self, model, Y, f, link_f_constraints):
+        """
+        Gradient-check ``model.dlogpdf_dlink`` against ``model.logpdf_link``
+        with y fixed to Y, after applying ``link_f_constraints`` to the
+        checker's 'g' parameter.
+        """
+        print "\n{}".format(inspect.stack()[0][3])
+        logpdf = functools.partial(model.logpdf_link, y=Y)
+        dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y)
+        grad = GradientChecker(logpdf, dlogpdf_dlink, f.copy(), 'g')
+
+        #Apply constraints to link_f values
+        for constraint in link_f_constraints:
+            constraint('g', grad)
+
+        grad.randomize()
+        print grad
+        # First call is only for its printed report; the assert below re-runs it.
+        grad.checkgrad(verbose=1)
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d2logpdf_dlink2(self, model, Y, f, link_f_constraints):
+        """
+        Gradient-check ``model.d2logpdf_dlink2`` against
+        ``model.dlogpdf_dlink`` with y fixed to Y, after applying
+        ``link_f_constraints`` to the checker's 'g' parameter.
+        """
+        print "\n{}".format(inspect.stack()[0][3])
+        dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y)
+        d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y)
+        grad = GradientChecker(dlogpdf_dlink, d2logpdf_dlink2, f.copy(), 'g')
+
+        #Apply constraints to link_f values
+        for constraint in link_f_constraints:
+            constraint('g', grad)
+
+        grad.randomize()
+        # First call is only for its printed report; the assert below re-runs it.
+        grad.checkgrad(verbose=1)
+        print grad
+        assert grad.checkgrad()
+
+    @with_setup(setUp, tearDown)
+    def t_d3logpdf_dlink3(self, model, Y, f, link_f_constraints):
+        """
+        Gradient-check ``model.d3logpdf_dlink3`` against
+        ``model.d2logpdf_dlink2`` with y fixed to Y, after applying
+        ``link_f_constraints`` to the checker's 'g' parameter.
+        """
+        print "\n{}".format(inspect.stack()[0][3])
+        d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y)
+        d3logpdf_dlink3 = functools.partial(model.d3logpdf_dlink3, y=Y)
+        grad = GradientChecker(d2logpdf_dlink2, d3logpdf_dlink3, f.copy(), 'g')
+
+        #Apply constraints to link_f values
+        for constraint in link_f_constraints:
+            constraint('g', grad)
+
+        grad.randomize()
+        # First call is only for its printed report; the assert below re-runs it.
+        grad.checkgrad(verbose=1)
+        print grad
+        assert grad.checkgrad()
+
+    #################
+    # dlink_dparams #
+    #################
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_link_dparams(self, model, Y, f, params, param_constraints):
+        """
+        Check ``model.dlogpdf_link_dtheta`` against ``model.logpdf_link`` with
+        respect to the likelihood parameter, holding f and Y fixed.
+        Unlike the non-link parameter checks, the checker is not randomized.
+        """
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        assert (
+                dparam_checkgrad(model.logpdf_link, model.dlogpdf_link_dtheta,
+                    params, args=(f, Y), constraints=param_constraints,
+                    randomize=False, verbose=True)
+                )
+
+    @with_setup(setUp, tearDown)
+    def t_dlogpdf_dlink_dparams(self, model, Y, f, params, param_constraints):
+        """
+        Check ``model.dlogpdf_dlink_dtheta`` against ``model.dlogpdf_dlink``
+        with respect to the likelihood parameter, holding f and Y fixed.
+        Unlike the non-link parameter checks, the checker is not randomized.
+        """
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        assert (
+                dparam_checkgrad(model.dlogpdf_dlink, model.dlogpdf_dlink_dtheta,
+                    params, args=(f, Y), constraints=param_constraints,
+                    randomize=False, verbose=True)
+                )
+
+    @with_setup(setUp, tearDown)
+    def t_d2logpdf2_dlink2_dparams(self, model, Y, f, params, param_constraints):
+        """
+        Check ``model.d2logpdf_dlink2_dtheta`` against
+        ``model.d2logpdf_dlink2`` with respect to the likelihood parameter,
+        holding f and Y fixed.
+        Unlike the non-link parameter checks, the checker is not randomized.
+        """
+        print "\n{}".format(inspect.stack()[0][3])
+        print model
+        assert (
+                dparam_checkgrad(model.d2logpdf_dlink2, model.d2logpdf_dlink2_dtheta,
+                    params, args=(f, Y), constraints=param_constraints,
+                    randomize=False, verbose=True)
+                )
+
+    ################
+    # laplace test #
+    ################
+    @with_setup(setUp, tearDown)
+    def t_laplace_fit_rbf_white(self, model, X, Y, f, step, param_vals, param_names, constraints):
+        """
+        Build a GPRegression model with an rbf + white kernel and a Laplace
+        likelihood wrapping ``model``, then assert that its gradients check
+        out with the given finite-difference ``step``.
+
+        ``f`` is accepted but not used in the body.
+        """
+        print "\n{}".format(inspect.stack()[0][3])
+        #Normalize
+        # NOTE(review): divides by Y.max(); assumes the maximum is nonzero.
+        Y = Y/Y.max()
+        white_var = 1e-6
+        kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+        laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), model)
+        m = GPy.models.GPRegression(X.copy(), Y.copy(), kernel, likelihood=laplace_likelihood)
+        m.ensure_default_constraints()
+        m.constrain_fixed('white', white_var)
+
+        # Set each named likelihood parameter to its start value and apply
+        # the constraint wrapper at the same position in `constraints`.
+        for param_num in range(len(param_names)):
+            name = param_names[param_num]
+            m[name] = param_vals[param_num]
+            constraints[param_num](name, m)
+
+        print m
+        m.randomize()
+        #m.optimize(max_iters=8)
+        print m
+        # First call is only for its printed report; the assert below re-runs it.
+        m.checkgrad(verbose=1, step=step)
+        #if not m.checkgrad(step=step):
+            #m.checkgrad(verbose=1, step=step)
+            #import ipdb; ipdb.set_trace()
+            #NOTE this test appears to be stochastic for some likelihoods (student t?)
+            # appears to all be working in test mode right now...
+        assert m.checkgrad(step=step)
+
+    ###########
+    # EP test #
+    ###########
+    @with_setup(setUp, tearDown)
+    def t_ep_fit_rbf_white(self, model, X, Y, f, step, param_vals, param_names, constraints):
+        """
+        Build a GPRegression model with an rbf + white kernel and an EP
+        likelihood wrapping ``model``, then assert that its gradients check
+        out with the given finite-difference ``step``.
+
+        ``f`` is accepted but not used in the body.
+        """
+        print "\n{}".format(inspect.stack()[0][3])
+        #Normalize
+        # NOTE(review): divides by Y.max(); assumes the maximum is nonzero.
+        Y = Y/Y.max()
+        white_var = 1e-6
+        kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+        ep_likelihood = GPy.likelihoods.EP(Y.copy(), model)
+        m = GPy.models.GPRegression(X.copy(), Y.copy(), kernel, likelihood=ep_likelihood)
+        m.ensure_default_constraints()
+        m.constrain_fixed('white', white_var)
+
+        # Set each named likelihood parameter to its start value and apply
+        # the constraint wrapper at the same position in `constraints`.
+        for param_num in range(len(param_names)):
+            name = param_names[param_num]
+            m[name] = param_vals[param_num]
+            constraints[param_num](name, m)
+
+        m.randomize()
+        # First call is only for its printed report; the assert below re-runs it.
+        m.checkgrad(verbose=1, step=step)
+        print m
+        assert m.checkgrad(step=step)
+
+
+class LaplaceTests(unittest.TestCase):
+    """
+    Specific likelihood tests, not general enough for the above tests
+    """
+
+    def setUp(self):
+        """
+        Build random inputs, noisy-sine targets, latent values and the
+        Student-t and Gaussian likelihood objects used by the tests.
+        """
+        self.N = 5
+        self.D = 3
+        self.X = np.random.rand(self.N, self.D)*10
+
+        self.real_std = 0.1
+        noise = np.random.randn(*self.X[:, 0].shape)*self.real_std
+        self.Y = (np.sin(self.X[:, 0]*2*np.pi) + noise)[:, None]
+        self.f = np.random.rand(self.N, 1)
+
+        # NOTE(review): this value is overwritten two lines below and never used.
+        self.var = 0.2
+
+        self.var = np.random.rand(1)
+        self.stu_t = GPy.likelihoods.student_t(deg_free=5, sigma2=self.var)
+        self.gauss = GPy.likelihoods.gaussian(gp_transformations.Log(), variance=self.var, D=self.D, N=self.N)
+
+        #Make a bigger step as lower bound can be quite curved
+        # NOTE(review): despite the comment above, the step here is 1e-6,
+        # smaller than the 1e-3 used by TestNoiseModels.setUp.
+        self.step = 1e-6
+
+    def tearDown(self):
+        """Drop the references to the likelihoods and data created in setUp."""
+        self.stu_t = None
+        self.gauss = None
+        self.Y = None
+        self.f = None
+        self.X = None
+
+    def test_gaussian_d2logpdf_df2_2(self):
+        """
+        Gradient-check the Gaussian likelihood's ``d2logpdf_df2`` against its
+        ``dlogpdf_df`` on a two-point, one-dimensional data set.
+        """
+        print "\n{}".format(inspect.stack()[0][3])
+        # Discard the fixtures from setUp and rebuild smaller ones below;
+        # self.var from setUp is reused.
+        self.Y = None
+        self.gauss = None
+
+        self.N = 2
+        self.D = 1
+        self.X = np.linspace(0, self.D, self.N)[:, None]
+        self.real_std = 0.2
+        noise = np.random.randn(*self.X.shape)*self.real_std
+        self.Y = np.sin(self.X*2*np.pi) + noise
+        self.f = np.random.rand(self.N, 1)
+        self.gauss = GPy.likelihoods.gaussian(variance=self.var, D=self.D, N=self.N)
+
+        dlogpdf_df = functools.partial(self.gauss.dlogpdf_df, y=self.Y)
+        d2logpdf_df2 = functools.partial(self.gauss.d2logpdf_df2, y=self.Y)
+        grad = GradientChecker(dlogpdf_df, d2logpdf_df2, self.f.copy(), 'g')
+        grad.randomize()
+        # First call is only for its printed report; the assert below re-runs it.
+        grad.checkgrad(verbose=1)
+        self.assertTrue(grad.checkgrad())
+
+    def test_laplace_log_likelihood(self):
+        """
+        Compare a standard GPRegression model (m1) with one whose likelihood
+        is a Laplace approximation around a Gaussian (m2): with identical
+        parameters their log likelihoods must agree to two decimals and both
+        must pass checkgrad.
+        """
+        # Flip to True to print the models and draw the diagnostic plots.
+        debug = False
+        real_std = 0.1
+        initial_var_guess = 0.5
+
+        #Start a function, any function
+        X = np.linspace(0.0, np.pi*2, 100)[:, None]
+        Y = np.sin(X) + np.random.randn(*X.shape)*real_std
+        Y = Y/Y.max()
+        #Yc = Y.copy()
+        #Yc[75:80] += 1
+        kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+        kernel2 = kernel1.copy()
+
+        # Reference model: GPRegression without an explicit likelihood argument.
+        m1 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel1)
+        m1.constrain_fixed('white', 1e-6)
+        m1['noise'] = initial_var_guess
+        m1.constrain_bounded('noise', 1e-4, 10)
+        m1.constrain_bounded('rbf', 1e-4, 10)
+        m1.ensure_default_constraints()
+        m1.randomize()
+
+        # Model under test: same data and kernel structure, Laplace likelihood.
+        gauss_distr = GPy.likelihoods.gaussian(variance=initial_var_guess, D=1, N=Y.shape[0])
+        laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), gauss_distr)
+        m2 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel2, likelihood=laplace_likelihood)
+        m2.ensure_default_constraints()
+        m2.constrain_fixed('white', 1e-6)
+        m2.constrain_bounded('rbf', 1e-4, 10)
+        m2.constrain_bounded('noise', 1e-4, 10)
+        m2.randomize()
+
+        if debug:
+            print m1
+            print m2
+        optimizer = 'scg'
+        print "Gaussian"
+        m1.optimize(optimizer, messages=debug)
+        print "Laplace Gaussian"
+        m2.optimize(optimizer, messages=debug)
+        if debug:
+            print m1
+            print m2
+
+        # Copy m1's parameters into m2 so both are compared at the same point.
+        m2._set_params(m1._get_params())
+
+        #Predict for training points to get posterior mean and variance
+        post_mean, post_var, _, _ = m1.predict(X)
+        post_mean_approx, post_var_approx, _, _ = m2.predict(X)
+
+        if debug:
+            import pylab as pb
+            pb.figure(5)
+            pb.title('posterior means')
+            pb.scatter(X, post_mean, c='g')
+            pb.scatter(X, post_mean_approx, c='r', marker='x')
+
+            pb.figure(6)
+            pb.title('plot_f')
+            m1.plot_f(fignum=6)
+            m2.plot_f(fignum=6)
+            fig, axes = pb.subplots(2, 1)
+            fig.suptitle('Covariance matricies')
+            a1 = pb.subplot(121)
+            a1.matshow(m1.likelihood.covariance_matrix)
+            a2 = pb.subplot(122)
+            a2.matshow(m2.likelihood.covariance_matrix)
+
+            pb.figure(8)
+            pb.scatter(X, m1.likelihood.Y, c='g')
+            pb.scatter(X, m2.likelihood.Y, c='r', marker='x')
+
+
+
+        #Check Y's are the same
+        np.testing.assert_almost_equal(Y, m2.likelihood.Y, decimal=5)
+        #Check marginals are the same
+        np.testing.assert_almost_equal(m1.log_likelihood(), m2.log_likelihood(), decimal=2)
+        #Check marginals are the same with random
+        m1.randomize()
+        m2._set_params(m1._get_params())
+        np.testing.assert_almost_equal(m1.log_likelihood(), m2.log_likelihood(), decimal=2)
+
+        #Check they are checkgradding
+        #m1.checkgrad(verbose=1)
+        #m2.checkgrad(verbose=1)
+        self.assertTrue(m1.checkgrad())
+        self.assertTrue(m2.checkgrad())
+
+# Script entry point: run the tests in this module through unittest.
+# NOTE(review): TestNoiseModels is a plain object with a generator-style test
+# (nose convention), so unittest.main() presumably only runs LaplaceTests -
+# confirm which runner is intended.
+if __name__ == "__main__":
+    print "Running unit tests"
+    unittest.main()
--- a/GPy/testing/parameter_testing.py
+++ b/GPy/testing/parameter_testing.py
@ -1,141 +0,0 @@
-'''
-Created on 4 Sep 2013
-
-@author: maxz
-'''
-import unittest
-from GPy.kern.constructors import rbf, linear, white
-import numpy
-from GPy.models.bayesian_gplvm import BayesianGPLVM
-from GPy.likelihoods.gaussian import Gaussian
-import pickle
-import os
-from GPy.core.parameterized import Parameterized
-from GPy.core.parameter import Param
-
-
-class Test(unittest.TestCase):
-    N, D, Q = 10, 6, 4
-    def setUp(self):
-        self.rbf_variance = numpy.random.rand()
-        self.rbf_lengthscale = numpy.random.rand(self.Q)
-        self.linear_variance = numpy.random.rand(self.Q)
-        self.noise_variance = numpy.random.rand(1)
-        self.kern = (rbf(self.Q, self.rbf_variance, self.rbf_lengthscale, ARD=True)
-                     + linear(self.Q, self.linear_variance, ARD=True)
-                     + white(self.Q, self.noise_variance))
-        self.X = numpy.random.rand(self.N, self.Q) + 10
-        self.X_variance = numpy.random.rand(self.N, self.Q) * .2
-
-        K = self.kern.K(self.X)
-
-        self.Y = numpy.random.multivariate_normal(numpy.zeros(self.N), K + numpy.eye(self.N) * .2, self.D).T
-        
-#         self.bgplvm = BayesianGPLVM(Gaussian(self.Y, variance=self.noise_variance), self.Q, self.X, self.X_variance, kernel=self.kern)
-#         self.bgplvm.ensure_default_constraints(warning=False)
-#         self.bgplvm.tie_params("noise_variance|white_variance")
-#         self.bgplvm.constrain_fixed("rbf_var", warning=False)
-        self.parameter = Parameterized([
-                                    Parameterized([
-                                                Param('X', self.X),
-                                                Param('X_variance', self.X_variance),
-                                                ]),
-                                    Param('iip', self.bgplvm.Z),
-                                    Parameterized([
-                                                Param('rbf_variance', self.rbf_variance),
-                                                Param('rbf_lengthscale', self.rbf_lengthscale)
-                                                ]),
-                                    Param('linear_variance', self.linear_variance),
-                                    Param('white_variance', self.noise_variance),
-                                    Param('noise_variance', self.noise_variance),
-                                     ])
-        
-        self.parameter['.*variance'].constrain_positive(False)
-        self.parameter['.*length'].constrain_positive(False)
-        self.parameter.white.tie_to(self.parameter.noise)
-        self.parameter.rbf_var.constrain_fixed(False)
-
-    def tearDown(self):
-        pass
-
-#     def testGrepParamNamesTest(self):
-#         assert(self.bgplvm.grep_param_names('X_\d') == self.parameter.grep_param_names('X_\d'))
-#         assert(self.bgplvm.grep_param_names('X_\d+_1') == self.parameter.grep_param_names('X_\d+_1'))
-#         assert(self.bgplvm.grep_param_names('X_\d_1') == self.parameter.grep_param_names('X_\d_1'))
-#         assert(self.bgplvm.grep_param_names('X_.+_1') == self.parameter.grep_param_names('X_.+_1'))
-#         assert(self.bgplvm.grep_param_names('X_1_1') == self.parameter.grep_param_names('X_1_1'))
-#         assert(self.bgplvm.grep_param_names('X') == self.parameter.grep_param_names('X'))
-#         assert(self.bgplvm.grep_param_names('rbf') == self.parameter.grep_param_names('rbf'))
-#         assert(self.bgplvm.grep_param_names('rbf_l.*_1') == self.parameter.grep_param_names('rbf_l.*_1'))
-#         assert(self.bgplvm.grep_param_names('l') == self.parameter.grep_param_names('l'))
-#         assert(self.bgplvm.grep_param_names('dont_match') == self.parameter.grep_param_names('dont_match'))
-#         assert(self.bgplvm.grep_param_names('.*') == self.parameter.grep_param_names('.*'))
-
-    def testGetParams(self):
-        assert(numpy.allclose(self.bgplvm._get_params(), self.parameter._get_params()))
-        assert(numpy.allclose(self.bgplvm._get_params_transformed(), self.parameter._get_params_transformed()))
-
-    def testSetParams(self):
-        self.bgplvm.randomize()
-        self.parameter._set_params(self.bgplvm._get_params())
-        assert(numpy.allclose(self.bgplvm._get_params(), self.parameter._get_params()))
-        assert(numpy.allclose(self.bgplvm._get_params_transformed(), self.parameter._get_params_transformed()))
-        self.bgplvm.randomize()
-        self.parameter._set_params_transformed(self.bgplvm._get_params_transformed())
-        assert(numpy.allclose(self.bgplvm._get_params(), self.parameter._get_params()))
-        assert(numpy.allclose(self.bgplvm._get_params_transformed(), self.parameter._get_params_transformed()))
-
-    def testSlicing(self):
-        assert(numpy.allclose(self.parameter.X[:,1], self.X[:,1]))
-        assert(numpy.allclose(self.parameter.X[:,1], self.X[:,1]))
-        assert(numpy.allclose(self.parameter.X_variance[1,1], self.X_variance[1,1]))
-        assert(numpy.allclose(self.parameter.X_variance[:], self.X_variance[:]))
-        assert(numpy.allclose(self.parameter.X[:,:][:,0:2][:,1], self.X[:,1]))
-        assert(numpy.allclose(self.parameter.X[:,1], self.X[:,1]))
-        assert(numpy.allclose(self.parameter.X_variance[1,1], self.X_variance[1,1]))
-        assert(numpy.allclose(self.parameter.X_variance[:], self.X_variance[:]))
-
-    def testSlicingSet(self):
-        self.parameter['.*variance'] = 1.
-        assert(numpy.alltrue(self.parameter['.*variance'] == 1.))
-        self.parameter.X[0,:3] = 2
-        assert(numpy.alltrue(self.parameter.X[0,:3] == 2))
-        X = self.parameter.X.copy()
-        self.parameter.X[[0,4,9],[0,1,3]] -= 1
-        assert(numpy.alltrue((X[[0,4,9],[0,1,3]] - 1) == self.parameter.X[[0,4,9],[0,1,3]]))
-        self.parameter[''] = 10
-        assert(numpy.alltrue(self.parameter[''] == 10))
-            
-    def testConstraints(self):
-        self.parameter[''].unconstrain()
-        self.parameter.X.constrain_positive()
-        self.parameter.X[:,numpy.s_[0::2]].unconstrain_positive()
-        assert(numpy.alltrue(self.parameter.constraints.indices()[0] == numpy.r_[1:self.N*self.Q:2]))
-
-    def testNdarrayFunc(self):
-        assert(numpy.alltrue(self.parameter.X * self.parameter.X == self.X * self.X))
-        assert(numpy.alltrue(self.parameter.X[0,:] * self.parameter.X[1,:] == self.X[0,:] * self.X[1,:]))
-
-    def testPickle(self):
-        fname = '/tmp/GPy_io_test.pickle'
-        m = self.parameter
-        m.X.fix()
-        self.parameter.pickle(fname)
-        with open(fname, 'r') as f:
-            m2 = pickle.load(f)
-        self.assertEqual(m.__str__(), m2.__str__()) 
-        self.assertEqual(m.X_v.__str__(), m2.X_v.__str__())
-        os.remove(fname)
-
-
-if __name__ == "__main__":
-    import sys;sys.argv = ['', 
-                           'Test.testSlicing',
-                           'Test.testGetParams',
-                           'Test.testNdarrayFunc',
-                           'Test.testSetParams',
-                           'Test.testConstraints',
-                           'Test.testSlicingSet',
-                           'Test.testPickle',
-                           ]
-    unittest.main()
--- a/GPy/testing/psi_stat_expectation_tests.py
+++ b/GPy/testing/psi_stat_expectation_tests.py
@ -27,9 +27,9 @@ def ard(p):
@testing.deepTest(__test__())
 class Test(unittest.TestCase):
    input_dim = 9
-    num_inducing = 4
-    N = 3
-    Nsamples = 5e3
+    num_inducing = 13
+    N = 300
+    Nsamples = 1e6

    def setUp(self):
        i_s_dim_list = [2,4,3]
@ -45,20 +45,29 @@ class Test(unittest.TestCase):
                                         input_slices = input_slices
                                         )
        self.kerns = (
-                    input_slice_kern,
-                    (GPy.kern.rbf(self.input_dim, ARD=True) +
-                     GPy.kern.linear(self.input_dim, ARD=True) +
-                     GPy.kern.bias(self.input_dim) +
-                     GPy.kern.white(self.input_dim)),
-                    (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
-                     GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
-                     GPy.kern.linear(self.input_dim, np.random.rand(self.input_dim), ARD=True) +
-                     GPy.kern.bias(self.input_dim) +
-                    GPy.kern.white(self.input_dim)),
-#                       GPy.kern.rbf(self.input_dim), GPy.kern.rbf(self.input_dim, ARD=True),
+#                     input_slice_kern,
+#                       (GPy.kern.rbf(self.input_dim, ARD=True) +
+#                        GPy.kern.linear(self.input_dim, ARD=True) +
+#                        GPy.kern.bias(self.input_dim) +
+#                        GPy.kern.white(self.input_dim)),
+                    (#GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True)
+                     GPy.kern.linear(self.input_dim, np.random.rand(self.input_dim), ARD=True)
+                     +GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True)
+#                      +GPy.kern.bias(self.input_dim)
+#                      +GPy.kern.white(self.input_dim)),
+                    ),
+#                     (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
+#                      GPy.kern.bias(self.input_dim, np.random.rand())),
+#         (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True)
+#          +GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True)
+#          #+GPy.kern.bias(self.input_dim, np.random.rand())
+#          #+GPy.kern.white(self.input_dim, np.random.rand())),
+#         ),
+#                     GPy.kern.white(self.input_dim, np.random.rand())),
+#                     GPy.kern.rbf(self.input_dim), GPy.kern.rbf(self.input_dim, ARD=True),
 #                       GPy.kern.linear(self.input_dim, ARD=False), GPy.kern.linear(self.input_dim, ARD=True),
 #                       GPy.kern.linear(self.input_dim) + GPy.kern.bias(self.input_dim),
-#                       GPy.kern.rbf(self.input_dim) + GPy.kern.bias(self.input_dim),
+#                     GPy.kern.rbf(self.input_dim) + GPy.kern.bias(self.input_dim),
 #                       GPy.kern.linear(self.input_dim) + GPy.kern.bias(self.input_dim) + GPy.kern.white(self.input_dim),
 #                       GPy.kern.rbf(self.input_dim) + GPy.kern.bias(self.input_dim) + GPy.kern.white(self.input_dim),
 #                       GPy.kern.bias(self.input_dim), GPy.kern.white(self.input_dim),
@ -79,7 +88,7 @@ class Test(unittest.TestCase):

    def test_psi1(self):
        for kern in self.kerns:
-            Nsamples = np.floor(self.Nsamples/300.)
+            Nsamples = np.floor(self.Nsamples/self.N)
            psi1 = kern.psi1(self.Z, self.q_x_mean, self.q_x_variance)
            K_ = np.zeros((Nsamples, self.num_inducing))
            diffs = []
@ -105,31 +114,31 @@ class Test(unittest.TestCase):

    def test_psi2(self):
        for kern in self.kerns:
-            Nsamples = self.Nsamples/10.
+            Nsamples = int(np.floor(self.Nsamples/self.N))
            psi2 = kern.psi2(self.Z, self.q_x_mean, self.q_x_variance)
            K_ = np.zeros((self.num_inducing, self.num_inducing))
            diffs = []
            for i, q_x_sample_stripe in enumerate(np.array_split(self.q_x_samples, self.Nsamples / Nsamples)):
                K = kern.K(q_x_sample_stripe, self.Z)
-                K = (K[:, :, None] * K[:, None, :]).mean(0)
-                K_ += K
-                diffs.append(((psi2 - (K_ / (i + 1)))**2).mean())
-            K_ /= self.Nsamples / Nsamples
+                K = (K[:, :, None] * K[:, None, :])
+                K_ += K.sum(0) / self.Nsamples
+                diffs.append(((psi2 - (K_*self.Nsamples/((i+1)*Nsamples)))**2).mean())
+            #K_ /= self.Nsamples / Nsamples
            msg = "psi2: {}".format("+".join([p.name + ard(p) for p in kern.parts]))
            try:
                import pylab
                pylab.figure(msg)
-                pylab.plot(diffs)
+                pylab.plot(diffs, marker='x', mew=.2)
 #                 print msg, np.allclose(psi2.squeeze(), K_, rtol=1e-1, atol=.1)
-                self.assertTrue(np.allclose(psi2.squeeze(), K_,
-                                            rtol=1e-1, atol=.1),
+                self.assertTrue(np.allclose(psi2.squeeze(), K_),
+                                            #rtol=1e-1, atol=.1),
                                msg=msg + ": not matching")
 #                 sys.stdout.write(".")
            except:
-#                 import ipdb;ipdb.set_trace()
 #                 kern.psi2(self.Z, self.q_x_mean, self.q_x_variance)
 #                 sys.stdout.write("E")
                print msg + ": not matching"
+                import ipdb;ipdb.set_trace()
                pass

 if __name__ == "__main__":
--- a/GPy/testing/psi_stat_gradient_tests.py
+++ b/GPy/testing/psi_stat_gradient_tests.py
@ -40,10 +40,9 @@ class PsiStatModel(Model):
        return self.kern.__getattribute__(self.which)(self.Z, self.X, self.X_variance).sum()
    def _log_likelihood_gradients(self):
        psimu, psiS = self.kern.__getattribute__("d" + self.which + "_dmuS")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance)
-        try:
-            psiZ = self.kern.__getattribute__("d" + self.which + "_dZ")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance)
-        except AttributeError:
-            psiZ = numpy.zeros(self.num_inducing * self.input_dim)
+        #psimu, psiS = numpy.ones(self.N * self.input_dim), numpy.ones(self.N * self.input_dim)
+        psiZ = self.kern.__getattribute__("d" + self.which + "_dZ")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance)
+        #psiZ = numpy.ones(self.num_inducing * self.input_dim)
        thetagrad = self.kern.__getattribute__("d" + self.which + "_dtheta")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance).flatten()
        return numpy.hstack((psimu.flatten(), psiS.flatten(), psiZ.flatten(), thetagrad))

@ -64,40 +63,54 @@ class DPsiStatTest(unittest.TestCase):

    def testPsi0(self):
        for k in self.kernels:
-            m = PsiStatModel('psi0', X=self.X, X_variance=self.X_var, Z=self.Z,
+            m = PsiStatModel('psi0', X=self.X, X_variance=self.X_var, Z=self.Z,\
                             num_inducing=self.num_inducing, kernel=k)
+            m.ensure_default_constraints()
+            m.randomize()
            assert m.checkgrad(), "{} x psi0".format("+".join(map(lambda x: x.name, k.parts)))
-
-#     def testPsi1(self):
-#         for k in self.kernels:
-#             m = PsiStatModel('psi1', X=self.X, X_variance=self.X_var, Z=self.Z,
-#                      num_inducing=self.num_inducing, kernel=k)
-#             assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k.parts)))
+        
+    def testPsi1(self):
+        """Check the psi1 gradients of a PsiStatModel built on each kernel in self.kernels."""
+        for kernel in self.kernels:
+            model = PsiStatModel('psi1', X=self.X, X_variance=self.X_var, Z=self.Z,
+                                 num_inducing=self.num_inducing, kernel=kernel)
+            model.ensure_default_constraints()
+            model.randomize()
+            # The failure label is only built when the gradient check fails.
+            assert model.checkgrad(), "{} x psi1".format("+".join(part.name for part in kernel.parts))

    def testPsi2_lin(self):
        k = self.kernels[0]
        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
-                     num_inducing=self.num_inducing, kernel=k)
+                 num_inducing=self.num_inducing, kernel=k)
+        m.ensure_default_constraints()
+        m.randomize()
        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))
    def testPsi2_lin_bia(self):
        k = self.kernels[3]
        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                     num_inducing=self.num_inducing, kernel=k)
+        m.ensure_default_constraints()
+        m.randomize()
        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))
    def testPsi2_rbf(self):
        k = self.kernels[1]
        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                     num_inducing=self.num_inducing, kernel=k)
+        m.ensure_default_constraints()
+        m.randomize()
        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))
    def testPsi2_rbf_bia(self):
        k = self.kernels[-1]
        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                     num_inducing=self.num_inducing, kernel=k)
+        m.ensure_default_constraints()
+        m.randomize()
        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))
    def testPsi2_bia(self):
        k = self.kernels[2]
        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                     num_inducing=self.num_inducing, kernel=k)
+        m.ensure_default_constraints()
+        m.randomize()
        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))


@ -116,9 +129,9 @@ if __name__ == "__main__":
 #         m.randomize()
 # #         self.assertTrue(m.checkgrad())
        numpy.random.seed(0)
-        input_dim = 5
-        N = 50
-        num_inducing = 10
+        input_dim = 3
+        N = 3
+        num_inducing = 2
        D = 15
        X = numpy.random.randn(N, input_dim)
        X_var = .5 * numpy.ones_like(X) + .1 * numpy.clip(numpy.random.randn(*X.shape), 0, 1)
@ -135,18 +148,35 @@ if __name__ == "__main__":
 #                      num_inducing=num_inducing, kernel=k)
 #             assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k.parts)))
 #
-#         m0 = PsiStatModel('psi0', X=X, X_variance=X_var, Z=Z,
-#                          num_inducing=num_inducing, kernel=GPy.kern.linear(input_dim))
+        m0 = PsiStatModel('psi0', X=X, X_variance=X_var, Z=Z,
+                         num_inducing=num_inducing, kernel=GPy.kern.rbf(input_dim)+GPy.kern.bias(input_dim))
 #         m1 = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
 #                          num_inducing=num_inducing, kernel=kernel)
 #         m1 = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
 #                          num_inducing=num_inducing, kernel=kernel)
 #         m2 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
 #                          num_inducing=num_inducing, kernel=GPy.kern.rbf(input_dim))
-        m3 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
-                         num_inducing=num_inducing, kernel=GPy.kern.linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)))
+#         m3 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
+#                          num_inducing=num_inducing, kernel=GPy.kern.linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)))
        # + GPy.kern.bias(input_dim))
-#         m4 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
-#                          num_inducing=num_inducing, kernel=GPy.kern.rbf(input_dim) + GPy.kern.bias(input_dim))
+#         m = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
+#                          num_inducing=num_inducing, 
+#                          kernel=(
+#             GPy.kern.rbf(input_dim, ARD=1) 
+#             +GPy.kern.linear(input_dim, ARD=1) 
+#             +GPy.kern.bias(input_dim))
+#                          )
+#         m.ensure_default_constraints()
+        m2 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
+                         num_inducing=num_inducing, kernel=(
+            GPy.kern.rbf(input_dim, numpy.random.rand(), numpy.random.rand(input_dim), ARD=1) 
+            #+GPy.kern.linear(input_dim, numpy.random.rand(input_dim), ARD=1) 
+            #+GPy.kern.rbf(input_dim, numpy.random.rand(), numpy.random.rand(input_dim), ARD=1) 
+            #+GPy.kern.rbf(input_dim, numpy.random.rand(), numpy.random.rand(), ARD=0) 
+            +GPy.kern.bias(input_dim)
+            +GPy.kern.white(input_dim)
+            )
+            )
+        m2.ensure_default_constraints()
    else:
        unittest.main()
--- a/GPy/testing/sparse_gplvm_tests.py
+++ b/GPy/testing/sparse_gplvm_tests.py
@ -4,7 +4,7 @@
 import unittest
 import numpy as np
 import GPy
-from GPy.models.sparse_gplvm import SparseGPLVM
+from ..models import SparseGPLVM

 class sparse_GPLVMTests(unittest.TestCase):
    def test_bias_kern(self):
--- a/GPy/testing/unit_tests.py
+++ b/GPy/testing/unit_tests.py
@ -163,14 +163,18 @@ class GradientTests(unittest.TestCase):
        rbflin = GPy.kern.rbf(2) + GPy.kern.linear(2)
        self.check_model(rbflin, model_type='SparseGPRegression', dimension=2)

+    #@unittest.expectedFailure
    def test_SparseGPRegression_rbf_linear_white_kern_2D_uncertain_inputs(self):
        ''' Testing the sparse GP regression with rbf, linear kernel on 2d data with uncertain inputs'''
        rbflin = GPy.kern.rbf(2) + GPy.kern.linear(2)
+        raise unittest.SkipTest("This is not implemented yet!")
        self.check_model(rbflin, model_type='SparseGPRegression', dimension=2, uncertain_inputs=1)

+    #@unittest.expectedFailure
    def test_SparseGPRegression_rbf_linear_white_kern_1D_uncertain_inputs(self):
        ''' Testing the sparse GP regression with rbf, linear kernel on 1d data with uncertain inputs'''
        rbflin = GPy.kern.rbf(1) + GPy.kern.linear(1)
+        raise unittest.SkipTest("This is not implemented yet!")
        self.check_model(rbflin, model_type='SparseGPRegression', dimension=1, uncertain_inputs=1)

    def test_GPLVM_rbf_bias_white_kern_2D(self):
@ -209,7 +213,7 @@ class GradientTests(unittest.TestCase):
        Z = np.linspace(0, 15, 4)[:, None]
        kernel = GPy.kern.rbf(1)
        m = GPy.models.SparseGPClassification(X,Y,kernel=kernel,Z=Z)
-        #distribution = GPy.likelihoods.likelihood_functions.Binomial()
+        #distribution = GPy.likelihoods.likelihood_functions.Bernoulli()
        #likelihood = GPy.likelihoods.EP(Y, distribution)
        #m = GPy.core.SparseGP(X, likelihood, kernel, Z)
        #m.ensure_default_constraints()
--- a/GPy/util/init.py
+++ b/GPy/util/init.py
@ -14,5 +14,20 @@ import visualize
 import decorators
 import classification
 import latent_space_visualizations
+# maps is optional: per the message below it needs the pyshp (shapefile) package.
+try:
+    import maps
+except ImportError:
+    # Keep the name defined so that looking at GPy.util.maps shows the hint
+    # instead of raising AttributeError.
+    maps = "warning: the maps module requires pyshp (shapefile). Install it to remove this message"
+
+# symbolic is only imported when sympy itself can be imported.
+try:
+    import sympy
+    _sympy_available = True
+    del sympy
+except ImportError:
+    _sympy_available = False
+
+if _sympy_available:
+    import symbolic

 import netpbmfile
--- a/GPy/util/block_matrices.py
+++ b/GPy/util/block_matrices.py
@ -0,0 +1,24 @@
+import numpy as np
+
+def get_blocks(A, blocksizes):
+    """
+    Partition the square matrix A into a grid of sub-blocks.
+
+    :param A: square two-dimensional array to split
+    :param blocksizes: sequence of block sizes; they must sum to A.shape[0]
+        and are used for both the row and the column partition
+    :returns: object array of shape (len(blocksizes), len(blocksizes)) whose
+        entry [i, j] is the slice of A covering row block i and column block j.
+        For a numpy array these slices are views, so modifying one in place
+        also modifies A.
+    :raises AssertionError: if A is not a square 2D array or the block sizes
+        do not sum to its side length
+    """
+    # Check the rank first so that a 1D input fails the assertion instead of
+    # raising IndexError on A.shape[1].
+    assert len(A.shape)==2 and (A.shape[0]==A.shape[1]), "can't blockify this non-square matrix"
+    N = np.sum(blocksizes)
+    assert A.shape[0] == N, "bad blocksizes"
+    num_blocks = len(blocksizes)
+    # Use the builtin object as dtype: the np.object alias for it is not
+    # available in current NumPy releases.
+    B = np.empty(shape=(num_blocks, num_blocks), dtype=object)
+    count_i = 0
+    for Bi, i in enumerate(blocksizes):
+        count_j = 0
+        for Bj, j in enumerate(blocksizes):
+            B[Bi, Bj] = A[count_i:count_i + i, count_j : count_j + j]
+            count_j += j
+        count_i += i
+    return B
+
+
+
+if __name__=='__main__':
+    # Small demo: split a 5x5 zero matrix into blocks of size 2 and 3, add 7
+    # to the top-left block in place and show the resulting block array.
+    A = np.zeros((5,5))
+    B = get_blocks(A,[2,3])
+    B[0,0] += 7
+    # A parenthesised single-argument print behaves the same on Python 2 and 3.
+    print(B)
--- a/GPy/util/config.py
+++ b/GPy/util/config.py
@ -8,8 +8,8 @@ config = ConfigParser.ConfigParser()
 home = os.getenv('HOME') or os.getenv('USERPROFILE')
 user_file = os.path.join(home,'.gpy_config.cfg')
 default_file = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..', 'gpy_config.cfg'))
-print user_file, os.path.isfile(user_file)
-print default_file, os.path.isfile(default_file)
+# print user_file, os.path.isfile(user_file)
+# print default_file, os.path.isfile(default_file)

 # 1. check if the user has a ~/.gpy_config.cfg
 if os.path.isfile(user_file):
--- a/GPy/util/data_resources.json
+++ b/GPy/util/data_resources.json
@ -0,0 +1,382 @@
+{
+   "rogers_girolami_data":{
+      "files":[
+         [
+            "firstcoursemldata.tar.gz"
+         ]
+      ],
+      "license":null,
+      "citation":"A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146",
+      "details":"Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.",
+      "urls":[
+         "https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/"
+      ],
+      "suffices":[
+         [
+            "?dl=1"
+         ]
+      ],
+      "size":21949154
+   },
+   "ankur_pose_data":{
+      "files":[
+         [
+            "ankurDataPoseSilhouette.mat"
+         ]
+      ],
+      "citation":"3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.",
+      "license":null,
+      "urls":[
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/ankur_pose_data/"
+      ],
+      "details":"Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more than the other, disambiguating the pose as to which way the individual is facing.",
+      "size":1
+   },
+   "osu_accad":{
+      "files":[
+         [
+            "swagger1TXT.ZIP",
+            "handspring1TXT.ZIP",
+            "quickwalkTXT.ZIP",
+            "run1TXT.ZIP",
+            "sprintTXT.ZIP",
+            "dogwalkTXT.ZIP",
+            "camper_04TXT.ZIP",
+            "dance_KB3_TXT.ZIP",
+            "per20_TXT.ZIP",
+            "perTWO07_TXT.ZIP",
+            "perTWO13_TXT.ZIP",
+            "perTWO14_TXT.ZIP",
+            "perTWO15_TXT.ZIP",
+            "perTWO16_TXT.ZIP"
+         ],
+         [
+            "connections.txt"
+         ]
+      ],
+      "license":"Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).",
+      "citation":"The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.",
+      "details":"Motion capture data of different motions from the Open Motion Data Project at Ohio State University.",
+      "urls":[
+         "http://accad.osu.edu/research/mocap/data/",
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/"
+      ],
+      "size":15922790
+   },
+   "isomap_face_data":{
+      "files":[
+         [
+            "face_data.mat"
+         ]
+      ],
+      "license":null,
+      "citation":"A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000",
+      "details":"Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.",
+      "urls":[
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/isomap_face_data/"
+      ],
+      "size":24229368
+   },
+   "boston_housing":{
+      "files":[
+         [
+            "Index",
+            "housing.data",
+            "housing.names"
+         ]
+      ],
+      "license":null,
+      "citation":"Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.",
+      "details":"The Boston Housing data relates house values in Boston to a range of input variables.",
+      "urls":[
+         "http://archive.ics.uci.edu/ml/machine-learning-databases/housing/"
+      ],
+      "size":51276
+   },
+   "cmu_mocap_full":{
+      "files":[
+         [
+            "allasfamc.zip"
+         ]
+      ],
+      "license":"From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.",
+      "citation":"Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.\nThe database was created with funding from NSF EIA-0196217.",
+      "details":"CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.",
+      "urls":[
+         "http://mocap.cs.cmu.edu/subjects"
+      ],
+      "size":null
+   },
+   "brendan_faces":{
+      "files":[
+         [
+            "frey_rawface.mat"
+         ]
+      ],
+      "license":null,
+      "citation":"Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.",
+      "details":"A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.",
+      "urls":[
+         "http://www.cs.nyu.edu/~roweis/data/"
+      ],
+      "size":1100584
+   },
+   "olympic_marathon_men":{
+      "files":[
+         [
+            "olympicMarathonTimes.csv"
+         ]
+      ],
+      "license":null,
+      "citation":null,
+      "details":"Olympic men's marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data",
+      "urls":[
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olympic_marathon_men/"
+      ],
+      "size":584
+   },
+   "pumadyn-32nm":{
+      "files":[
+         [
+            "pumadyn-32nm.tar.gz"
+         ]
+      ],
+      "license":"Data is made available by the Delve system at the University of Toronto",
+      "citation":"Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.",
+      "details":"Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.",
+      "urls":[
+         "ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/"
+      ],
+      "size":5861646
+   },
+   "ripley_prnn_data":{
+      "files":[
+         [
+            "Cushings.dat",
+            "README",
+            "crabs.dat",
+            "fglass.dat",
+            "fglass.grp",
+            "pima.te",
+            "pima.tr",
+            "pima.tr2",
+            "synth.te",
+            "synth.tr",
+            "viruses.dat",
+            "virus3.dat"
+         ]
+      ],
+      "license":null,
+      "citation":"Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7",
+      "details":"Data sets from Brian Ripley's Pattern Recognition and Neural Networks",
+      "urls":[
+         "http://www.stats.ox.ac.uk/pub/PRNN/"
+      ],
+      "size":93565
+   },
+   "three_phase_oil_flow":{
+      "files":[
+         [
+            "DataTrnLbls.txt",
+            "DataTrn.txt",
+            "DataTst.txt",
+            "DataTstLbls.txt",
+            "DataVdn.txt",
+            "DataVdnLbls.txt"
+         ]
+      ],
+      "license":null,
+      "citation":"Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593",
+      "details":"The three phase oil data used initially for demonstrating the Generative Topographic Mapping.",
+      "urls":[
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/three_phase_oil_flow/"
+      ],
+      "size":712796
+   },
+   "robot_wireless":{
+      "files":[
+         [
+            "uw-floor.txt"
+         ]
+      ],
+      "license":null,
+      "citation":"WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.",
+      "details":"Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.",
+      "urls":[
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/robot_wireless/"
+      ],
+      "size":284390
+   },
+   "xw_pen":{
+      "files":[
+         [
+            "xw_pen_15.csv"
+         ]
+      ],
+      "license":null,
+      "citation":"Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005",
+      "details":"Accelerometer pen data used for robust regression by Tipping and Lawrence.",
+      "urls":[
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/xw_pen/"
+      ],
+      "size":3410
+   },
+   "swiss_roll":{
+      "files":[
+         [
+            "swiss_roll_data.mat"
+         ]
+      ],
+      "license":null,
+      "citation":"A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000",
+      "details":"Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.",
+      "urls":[
+         "http://isomap.stanford.edu/"
+      ],
+      "size":800256
+   },
+   "osu_run1":{
+      "files":[
+         [
+            "run1TXT.ZIP"
+         ],
+         [
+            "connections.txt"
+         ]
+      ],
+      "license":"Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).",
+      "citation":"The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.",
+      "details":"Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.",
+      "urls":[
+         "http://accad.osu.edu/research/mocap/data/",
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/"
+      ],
+      "size":338103
+   },
+   "creep_rupture":{
+      "files":[
+         [
+            "creeprupt.tar"
+         ]
+      ],
+      "license":null,
+      "citation":"Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.",
+      "details":"Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.",
+      "urls":[
+         "http://www.msm.cam.ac.uk/map/data/tar/"
+      ],
+      "size":602797
+   },
+   "olivetti_faces":{
+      "files":[
+         [
+            "att_faces.zip"
+         ],
+         [
+            "olivettifaces.mat"
+         ]
+      ],
+      "license":null,
+      "citation":"Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994",
+      "details":"Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. ",
+      "urls":[
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/",
+         "http://www.cs.nyu.edu/~roweis/data/"
+      ],
+      "size":8561331
+   },
+   "olivetti_glasses":{
+      "files":[
+         [
+            "has_glasses.np"
+         ],
+         [
+            "olivettifaces.mat"
+         ]
+      ],
+      "license":null,
+      "citation":"Information recorded in olivetti_faces entry. Should be used from there.",
+      "details":"Information recorded in olivetti_faces entry. Should be used from there.",
+      "urls":[
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/",
+         "http://www.cs.nyu.edu/~roweis/data/"
+      ],
+      "size":4261047
+   },
+   "della_gatta":{
+      "files":[
+         [
+            "DellaGattadata.mat"
+         ]
+      ],
+      "license":null,
+      "citation":"Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008",
+      "details":"The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.",
+      "urls":[
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/della_gatta/"
+      ],
+      "size":3729650
+   },
+   "epomeo_gpx":{
+      "files":[
+         [
+            "endomondo_1.gpx",
+            "endomondo_2.gpx",
+            "garmin_watch_via_endomondo.gpx",
+            "viewranger_phone.gpx",
+            "viewranger_tablet.gpx"
+         ]
+      ],
+      "license":null,
+      "citation":"",
+      "details":"Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).",
+      "urls":[
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/epomeo_gpx/"
+      ],
+      "size":2031872
+   },
+   "mauna_loa":{
+      "files":[
+         [
+            "co2_mm_mlo.txt"
+         ]
+      ],
+      "license":"-------------------------------------------------------------------- USE OF NOAA ESRL DATA\n\n  These data are made freely available to the public and the scientific community in the belief that their wide dissemination will lead to greater understanding and new scientific insights. The availability of these data does not constitute publication of the data.  NOAA relies on the ethics and integrity of the user to insure that ESRL receives fair credit for their work.  If the data  are obtained for potential use in a publication or presentation,  ESRL should be informed at the outset of the nature of this work.   If the ESRL data are essential to the work, or if an important  result or conclusion depends on the ESRL data, co-authorship may be appropriate.  This should be discussed at an early stage in the work.  Manuscripts using the ESRL data should be sent to ESRL for review before they are submitted for publication so we can insure that the quality and limitations of the data are accurately represented.\n\n  Contact:   Pieter Tans (303 497 6678; pieter.tans@noaa.gov)\n\n  RECIPROCITY  Use of these data implies an agreement to reciprocate. Laboratories making similar measurements agree to make their own data available to the general public and to the scientific community in an equally complete and easily accessible form. Modelers are encouraged to make available to the community, upon request, their own tools used in the interpretation of the ESRL data, namely well documented model code, transport fields, and additional information necessary for other scientists to repeat the work and to run modified versions. Model availability includes collaborative support for new users of the models.\n --------------------------------------------------------------------\n\n     See www.esrl.noaa.gov/gmd/ccgg/trends/ for additional details.",
+      "citation":"Mauna Loa Data. Dr. Pieter Tans, NOAA/ESRL (www.esrl.noaa.gov/gmd/ccgg/trends/) and Dr. Ralph Keeling, Scripps Institution of Oceanography (scrippsco2.ucsd.edu/).",
+      "details":"The 'average' column contains the monthly mean CO2 mole fraction determined from daily averages.  The mole fraction of CO2, expressed as parts per million (ppm) is the number of molecules of CO2 in every one million molecules of dried air (water vapor removed).  If there are missing days concentrated either early or late in the month, the monthly mean is corrected to the middle of the month using the average seasonal cycle.  Missing months are denoted by -99.99. The 'interpolated' column includes average values from the preceding column and interpolated values where data are missing.  Interpolated values are computed in two steps.  First, we compute for each month the average seasonal cycle in a 7-year window around each monthly value.  In this way the seasonal cycle is allowed to change slowly over time.  We then determine the 'trend' value for each month by removing the seasonal cycle; this result is shown in the 'trend' column.  Trend values are linearly interpolated for missing months. The interpolated monthly mean is then the sum of the average seasonal cycle value and the trend value for the missing month.\n\nNOTE: In general, the data presented for the last year are subject to change, depending on recalibration of the reference gas mixtures used, and other quality control procedures. Occasionally, earlier years may also be changed for the same reasons.  Usually these changes are minor.\n\nCO2 expressed as a mole fraction in dry air, micromol/mol, abbreviated as ppm \n\n (-99.99 missing data;  -1 no data for daily means in month)",
+      "urls":[
+         "ftp://aftp.cmdl.noaa.gov/products/trends/co2/"
+      ],
+      "size":46779
+   },
+   "boxjenkins_airline":{
+      "files":[
+         [
+            "boxjenkins_airline.csv"
+         ]
+      ],
+      "license":"You may copy and redistribute the data. You may make derivative works from the data. You may use the data for commercial purposes. You may not sublicence the data when redistributing it. You may not redistribute the data under a different license. Source attribution on any use of this data: Must refer source.",
+      "citation":"Box & Jenkins (1976), in file: data/airpass, Description: International airline passengers: monthly totals in thousands. Jan 49 – Dec 60",
+      "details":"International airline passengers, monthly totals from January 1949 to December 1960.",
+      "urls":[
+                  "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/boxjenkins_airline/"
+      ],
+      "size":46779
+   },
+
+   "decampos_characters":{
+      "files":[
+         [
+            "characters.npy",
+            "digits.npy"
+         ]
+      ],
+      "license":null,
+      "citation":"T. de Campos, B. R. Babu, and M. Varma. Character recognition in natural images. VISAPP 2009.",
+      "details":"Examples of hand written digits taken from the de Campos et al paper on Character Recognition in Natural Images.",
+      "urls":[
+         "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/decampos_digits/"
+      ],
+      "size":2031872
+   }
+}
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@ -3,11 +3,10 @@ import numpy as np
 import GPy
 import scipy.io
 import cPickle as pickle
-import urllib as url
 import zipfile
 import tarfile
 import datetime
-
+import json
 ipython_available=True
 try:
    import IPython
@ -15,137 +14,28 @@ except ImportError:
    ipython_available=False


-import sys, urllib
+import sys, urllib2

-def reporthook(a,b,c): 
+def reporthook(a,b,c):
    # ',' at the end of the line is important!
    #print "% 3.1f%% of %d bytes\r" % (min(100, float(a * b) / c * 100), c),
    #you can also use sys.stdout.write
    sys.stdout.write("\r% 3.1f%% of %d bytes" % (min(100, float(a * b) / c * 100), c))
    sys.stdout.flush()
-     
+
 # Global variables
 data_path = os.path.join(os.path.dirname(__file__), 'datasets')
 default_seed = 10000
 overide_manual_authorize=False
 neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/'
-sam_url = 'http://www.cs.nyu.edu/~roweis/data/'
-cmu_url = 'http://mocap.cs.cmu.edu/subjects/'

-# Note: there may be a better way of storing data resources, for the
-# moment we are storing them in a dictionary.
-data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'],
-                                       'files' : [['ankurDataPoseSilhouette.mat']],
-                                       'license' : None,
-                                       'citation' : """3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.""",
-                                       'details' : """Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."""},
-                   
-                  'boston_housing' : {'urls' : ['http://archive.ics.uci.edu/ml/machine-learning-databases/housing/'],
-                                      'files' : [['Index', 'housing.data', 'housing.names']],
-                                      'citation' : """Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.""",
-                                      'details' : """The Boston Housing data relates house values in Boston to a range of input variables.""",
-                                      'license' : None,
-                                      'size' : 51276
-                                      },
-                  'brendan_faces' : {'urls' : [sam_url],
-                                     'files': [['frey_rawface.mat']],
-                                     'citation' : 'Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.',
-                                     'details' : """A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.""",
-                                     'license': None,
-                                     'size' : 1100584},
-                  'cmu_mocap_full' : {'urls' : ['http://mocap.cs.cmu.edu'],
-                                 'files' : [['allasfamc.zip']],
-                                 'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.
-The database was created with funding from NSF EIA-0196217.""",
-                                 'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""",
-                                 'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.""",
-                                 'size' : None},
-                  'creep_rupture' : {'urls' : ['http://www.msm.cam.ac.uk/map/data/tar/'],
-                                     'files' : [['creeprupt.tar']],
-                                     'citation' : 'Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.',
-                                     'details' : """Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.""",
-                                     'license' : None,
-                                     'size' : 602797},
-                  'della_gatta' : {'urls' : [neil_url + 'della_gatta/'],
-                                   'files': [['DellaGattadata.mat']],
-                                   'citation' : 'Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008',
-                                   'details': "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.",
-                                   'license':None,
-                                   'size':3729650},
-                  'epomeo_gpx' : {'urls' : [neil_url + 'epomeo_gpx/'],
-                                   'files': [['endomondo_1.gpx', 'endomondo_2.gpx', 'garmin_watch_via_endomondo.gpx','viewranger_phone.gpx','viewranger_tablet.gpx']],
-                                   'citation' : '',
-                                   'details': "Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).",
-                                   'license':None,
-                                   'size': 2031872},
-                  'three_phase_oil_flow': {'urls' : [neil_url + 'three_phase_oil_flow/'],
-                                           'files' : [['DataTrnLbls.txt', 'DataTrn.txt', 'DataTst.txt', 'DataTstLbls.txt', 'DataVdn.txt', 'DataVdnLbls.txt']],
-                                           'citation' : 'Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593',
-                                           'details' : """The three phase oil data used initially for demonstrating the Generative Topographic mapping.""",
-                                           'license' : None,
-                                           'size' : 712796},
-                  'rogers_girolami_data' : {'urls' : ['https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/'],
-                                            'files' : [['firstcoursemldata.tar.gz']],
-                                            'suffices' : [['?dl=1']],
-                                            'citation' : 'A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146',
-                                            'details' : """Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.""",
-                                            'license' : None,
-                                            'size' : 21949154},
-                  'olivetti_faces' : {'urls' : [neil_url + 'olivetti_faces/', sam_url],
-                                      'files' : [['att_faces.zip'], ['olivettifaces.mat']],
-                                            'citation' : 'Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994',
-                                            'details' : """Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. """,
-                                            'license': None,
-                                            'size' : 8561331},
-                  'olympic_marathon_men' : {'urls' : [neil_url + 'olympic_marathon_men/'],
-                                            'files' : [['olympicMarathonTimes.csv']],
-                                            'citation' : None,
-                                            'details' : """Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data""",
-                                            'license': None,
-                                            'size' : 584},
-                  'osu_run1' : {'urls': ['http://accad.osu.edu/research/mocap/data/', neil_url + 'stick/'],
-                                'files': [['run1TXT.ZIP'],['connections.txt']],
-                                'details' : "Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.",
-                                'citation' : 'The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.',
-                                'license' : 'Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).',
-                                'size': 338103},
-                  'osu_accad' : {'urls': ['http://accad.osu.edu/research/mocap/data/', neil_url + 'stick/'],
-                                'files': [['swagger1TXT.ZIP','handspring1TXT.ZIP','quickwalkTXT.ZIP','run1TXT.ZIP','sprintTXT.ZIP','dogwalkTXT.ZIP','camper_04TXT.ZIP','dance_KB3_TXT.ZIP','per20_TXT.ZIP','perTWO07_TXT.ZIP','perTWO13_TXT.ZIP','perTWO14_TXT.ZIP','perTWO15_TXT.ZIP','perTWO16_TXT.ZIP'],['connections.txt']],
-                                'details' : "Motion capture data of different motions from the Open Motion Data Project at Ohio State University.",
-                                'citation' : 'The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.',
-                                'license' : 'Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).',
-                                'size': 15922790},
-                  'pumadyn-32nm' : {'urls' : ['ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/'],
-                                    'files' : [['pumadyn-32nm.tar.gz']],
-                                    'details' : """Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.""",
-                                    'citation' : """Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.""",
-                                    'license' : """Data is made available by the Delve system at the University of Toronto""",
-                                    'size' : 5861646},
-                  'robot_wireless' : {'urls' : [neil_url + 'robot_wireless/'],
-                                      'files' : [['uw-floor.txt']],
-                                      'citation' : """WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.""",
-                                      'details' : """Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.""",
-                                      'license' : None,
-                                      'size' : 284390},
-                  'swiss_roll' : {'urls' : ['http://isomap.stanford.edu/'],
-                                  'files' : [['swiss_roll_data.mat']],
-                                  'details' : """Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""",
-                                  'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000',
-                                  'license' : None,
-                                  'size' : 800256},
-                  'ripley_prnn_data' : {'urls' : ['http://www.stats.ox.ac.uk/pub/PRNN/'],
-                                        'files' : [['Cushings.dat', 'README', 'crabs.dat', 'fglass.dat', 'fglass.grp', 'pima.te', 'pima.tr', 'pima.tr2', 'synth.te', 'synth.tr', 'viruses.dat', 'virus3.dat']],
-                                        'details' : """Data sets from Brian Ripley's Pattern Recognition and Neural Networks""",
-                                        'citation': """Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7""",
-                                        'license' : None,
-                                        'size' : 93565},
-                  'isomap_face_data' : {'urls' : [neil_url + 'isomap_face_data/'],
-                                        'files' : [['face_data.mat']],
-                                        'details' : """Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""",
-                                        'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000',
-                                        'license' : None,
-                                        'size' : 24229368},
-                  }
+# Read data resources from json file.
+# Don't do this when ReadTheDocs is scanning as it breaks things
+on_rtd = os.environ.get('READTHEDOCS', None) == 'True' #Checks if RTD is scanning
+if not (on_rtd):
+    path = os.path.join(os.path.dirname(__file__), 'data_resources.json')
+    json_data=open(path).read()
+    data_resources = json.loads(json_data)


 def prompt_user(prompt):
@ -158,14 +48,14 @@ def prompt_user(prompt):
        print(prompt)
        choice = raw_input().lower()
        # would like to test for exception here, but not sure if we can do that without importing IPython
-    except: 
+    except:
        print('Stdin is not implemented.')
        print('You need to set')
        print('overide_manual_authorize=True')
        print('to proceed with the download. Please set that variable and continue.')
        raise

-    
+
    if choice in yes:
        return True
    elif choice in no:
@ -183,7 +73,7 @@ def data_available(dataset_name=None):
            if not os.path.exists(os.path.join(data_path, dataset_name, file)):
                return False
    return True
-            
+
 def download_url(url, store_directory, save_name = None, messages = True, suffix=''):
    """Download a file from a url and save it to disk."""
    i = url.rfind('/')
@ -194,7 +84,21 @@ def download_url(url, store_directory, save_name = None, messages = True, suffix
    print "Downloading ", url, "->", os.path.join(store_directory, file)
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
-    urllib.urlretrieve(url+suffix, save_name, reporthook)
+    try:
+        response = urllib2.urlopen(url+suffix)
+    except urllib2.URLError, e:
+        if not hasattr(e, "code"):
+            raise
+        response = e
+        if response.code > 399 and response.code<500:
+            raise ValueError('Tried url ' + url + suffix + ' and received client error ' + str(response.code))
+        elif response.code > 499:
+            raise ValueError('Tried url ' + url + suffix + ' and received server error ' + str(response.code))
+    # NOTE: an HTTPError with a code below 400 falls through to here and its body is saved as-is; a stricter check of response.code could be added here, even for successes.
+    with open(save_name, 'wb') as f:
+        f.write(response.read())
+
+    #urllib.urlretrieve(url+suffix, save_name, reporthook)

 def authorize_download(dataset_name=None):
    """Check with the user that the are happy with terms and conditions for the data set."""
@ -243,18 +147,20 @@ def download_data(dataset_name=None):
            for file in files:
                download_url(os.path.join(url,file), dataset_name, dataset_name)
    return True
-                  
+
 def data_details_return(data, data_set):
    """Update the data component of the data dictionary with details drawn from the data_resources."""
    data.update(data_resources[data_set])
    return data

-    
+
 def cmu_urls_files(subj_motions, messages = True):
    '''
-    Find which resources are missing on the local disk for the requested CMU motion capture motions. 
+    Find which resources are missing on the local disk for the requested CMU motion capture motions.
    '''
-    
+    dr = data_resources['cmu_mocap_full']
+    cmu_url = dr['urls'][0]
+
    subjects_num = subj_motions[0]
    motions_num = subj_motions[1]

@ -274,15 +180,15 @@ def cmu_urls_files(subj_motions, messages = True):
            motions[i].append(curMot)

    all_skels = []
-    
+
    assert len(subjects) == len(motions)
-    
+
    all_motions = []
-            
+
    for i in range(len(subjects)):
        skel_dir = os.path.join(data_path, 'cmu_mocap')
        cur_skel_file = os.path.join(skel_dir, subjects[i] + '.asf')
-        
+
        url_required = False
        file_download = []
        if not os.path.exists(cur_skel_file):
@ -299,7 +205,7 @@ def cmu_urls_files(subj_motions, messages = True):
                url_required = True
                file_download.append(subjects[i] + '_' + motions[i][j] + '.amc')
        if url_required:
-            resource['urls'].append(cmu_url + subjects[i] + '/')
+            resource['urls'].append(cmu_url + '/' + subjects[i] + '/')
            resource['files'].append(file_download)
    return resource

@ -326,10 +232,10 @@ if gpxpy_available:
            points = [point for track in gpx.tracks for segment in track.segments for point in segment.points]
            data = [[(point.time-datetime.datetime(2013,8,21)).total_seconds(), point.latitude, point.longitude, point.elevation] for point in points]
            X.append(np.asarray(data)[::sample_every, :])
-            gpx_file.close()        
+            gpx_file.close()
        return data_details_return({'X' : X, 'info' : 'Data is an array containing time in seconds, latitude, longitude and elevation in that order.'}, data_set)

-del gpxpy_available
+#del gpxpy_available



@ -402,7 +308,7 @@ def oil(data_set='three_phase_oil_flow'):
    return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'Xtest' : Xtest, 'Xvalid': Xvalid, 'Yvalid': Yvalid}, data_set)
    #else:
    # throw an error
-    
+
 def oil_100(seed=default_seed, data_set = 'three_phase_oil_flow'):
    np.random.seed(seed=seed)
    data = oil()
@ -489,6 +395,18 @@ def silhouette(data_set='ankur_pose_data'):
    Ytest = mat_data['Z_test']
    return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest}, data_set)

+def decampos_digits(data_set='decampos_characters', which_digits=None):
+    """
+    Load digit images from the de Campos characters data set.
+
+    :param data_set: name of the data set, used for the download, the local directory and the resource lookup.
+    :param which_digits: indices selected along the first axis of the stored array (one per digit class), defaults to all ten digits 0-9.
+    :rtype: dictionary with 'Y' (one flattened image per row), 'lbls' (one integer label per row of Y), 'str_lbls' (string labels, one row per selected digit) and 'info', merged with the resource details of data_set.
+    """
+    # Build the default per call instead of sharing one mutable list between calls.
+    if which_digits is None:
+        which_digits = list(range(10))
+    if not data_available(data_set):
+        download_data(data_set)
+    path = os.path.join(data_path, data_set)
+    digits = np.load(os.path.join(path, 'digits.npy'))
+    digits = digits[which_digits,:,:,:]
+    num_classes, num_samples, height, width = digits.shape
+    # Collapse (class, sample) into rows and (height, width) into columns.
+    Y = digits.reshape((num_classes*num_samples, height*width))
+    lbls = np.array([[l]*num_samples for l in which_digits]).reshape(Y.shape[0], 1)
+    str_lbls = np.array([[str(l)]*num_samples for l in which_digits])
+    return data_details_return({'Y': Y, 'lbls': lbls, 'str_lbls' : str_lbls, 'info': 'Digits data set from the de Campos characters data'}, data_set)
+    
 def ripley_synth(data_set='ripley_prnn_data'):
    if not data_available(data_set):
        download_data(data_set)
@ -498,7 +416,36 @@ def ripley_synth(data_set='ripley_prnn_data'):
    test = np.genfromtxt(os.path.join(data_path, data_set, 'synth.te'), skip_header=1)
    Xtest = test[:, 0:2]
    ytest = test[:, 2:3]
-    return data_details_return({'X': X, 'y': y, 'Xtest': Xtest, 'ytest': ytest, 'info': 'Synthetic data generated by Ripley for a two class classification problem.'}, data_set)
+    return data_details_return({'X': X, 'Y': y, 'Xtest': Xtest, 'Ytest': ytest, 'info': 'Synthetic data generated by Ripley for a two class classification problem.'}, data_set)
+
+def mauna_loa(data_set='mauna_loa', num_train=543, refresh_data=False):
+    """
+    Load the Mauna Loa data from 'co2_mm_mlo.txt' and split it into training and test parts.
+
+    :param data_set: name of the data set (download key and local directory name).
+    :param num_train: number of leading valid rows used for training, the remainder forms the test set.
+    :param refresh_data: if True, download the data again even when a local copy is available.
+    :rtype: dictionary with 'X', 'Y', 'Xtest', 'Ytest' and 'info', merged with the resource details of data_set.
+    """
+    path = os.path.join(data_path, data_set)
+    if not data_available(data_set) or refresh_data:
+        download_data(data_set)
+    else:
+        print 'Using cached version of the data set, to use latest version set refresh_data to True'
+    data = np.loadtxt(os.path.join(path, 'co2_mm_mlo.txt'))
+    print 'Most recent data observation from month ', data[-1, 1], ' in year ', data[-1, 0]
+    # Rows whose fourth column holds -99.99 are dropped (presumably a missing-value marker -- confirm against the file header).
+    valid = data[:, 3]!=-99.99
+    inputs = data[valid, 2:3]
+    targets = data[valid, 3:4]
+    train = slice(None, num_train)
+    test = slice(num_train, None)
+    return data_details_return({'X': inputs[train], 'Y': targets[train], 'Xtest': inputs[test], 'Ytest': targets[test], 'info': "Mauna Loa data with " + str(num_train) + " values used as training points."}, data_set)
+    
+
+def boxjenkins_airline(data_set='boxjenkins_airline', num_train=96):
+    """
+    Load the Box & Jenkins airline passenger data from 'boxjenkins_airline.csv'.
+
+    Column 0 of the file is used as input X and column 1 as output Y.
+
+    :param data_set: name of the data set (download key and local directory name).
+    :param num_train: number of leading rows used for training, the remaining rows form the test set.
+    :rtype: dictionary with 'X', 'Y', 'Xtest', 'Ytest' and 'info', merged with the resource details of data_set.
+    """
+    path = os.path.join(data_path, data_set)
+    if not data_available(data_set):
+        download_data(data_set)
+    data = np.loadtxt(os.path.join(path, 'boxjenkins_airline.csv'), delimiter=',')
+    Y = data[:num_train, 1:2]
+    X = data[:num_train, 0:1]
+    Xtest = data[num_train:, 0:1]
+    Ytest = data[num_train:, 1:2]
+    return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'info': "Monthly airline passenger data from Box & Jenkins 1976."}, data_set)
+    

 def osu_run1(data_set='osu_run1', sample_every=4):
    path = os.path.join(data_path, data_set)
@ -547,7 +494,7 @@ def simulation_BGPLVM():
    Y = np.array(mat_data['Y'], dtype=float)
    S = np.array(mat_data['initS'], dtype=float)
    mu = np.array(mat_data['initMu'], dtype=float)
-    return data_details_return({'S': S, 'Y': Y, 'mu': mu}, mat_data)
+    #return data_details_return({'S': S, 'Y': Y, 'mu': mu}, data_set)
    return {'Y': Y, 'S': S,
            'mu' : mu,
            'info': "Simulated test dataset generated in MATLAB to compare BGPLVM between python and MATLAB"}
@ -591,6 +538,21 @@ def toy_linear_1d_classification(seed=default_seed):
    X = (np.r_[x1, x2])[:, None]
    return {'X': X, 'Y':  sample_class(2.*X), 'F': 2.*X, 'seed' : seed}

+def olivetti_glasses(data_set='olivetti_glasses', num_training=200, seed=default_seed):
+    """
+    Load the Olivetti faces with a binary glasses / no glasses label and split them at random.
+
+    :param data_set: name of the data set (download key and local directory name).
+    :param num_training: number of randomly chosen faces placed in the training set, the rest form the test set.
+    :param seed: seed passed to numpy's random number generator before the permutation is drawn.
+    :rtype: dictionary with 'X', 'Y', 'Xtest', 'Ytest', 'seed' and 'info'. The resource details merged in are those of 'olivetti_faces', not of data_set.
+    """
+    path = os.path.join(data_path, data_set)
+    if not data_available(data_set):
+        download_data(data_set)
+    has_glasses = np.load(os.path.join(path, 'has_glasses.np'))
+    # Entries equal to 'y' become 1, everything else 0, as a single column.
+    y = np.where(has_glasses=='y',1,0).reshape(-1,1)
+    faces = scipy.io.loadmat(os.path.join(path, 'olivettifaces.mat'))['faces'].T
+    np.random.seed(seed=seed)
+    index = np.random.permutation(faces.shape[0])
+    train_index = index[:num_training]
+    test_index = index[num_training:]
+    return data_details_return({'X': faces[train_index,:], 'Y': y[train_index,:], 'Xtest': faces[test_index,:], 'Ytest': y[test_index], 'seed' : seed, 'info': "ORL Faces with labels identifiying who is wearing glasses and who isn't. Data is randomly partitioned according to given seed. Presence or absence of glasses was labelled by James Hensman."}, 'olivetti_faces')
+    
 def olivetti_faces(data_set='olivetti_faces'):
    path = os.path.join(data_path, data_set)
    if not data_available(data_set):
@ -608,8 +570,16 @@ def olivetti_faces(data_set='olivetti_faces'):
    Y = np.asarray(Y)
    lbls = np.asarray(lbls)[:, None]
    return data_details_return({'Y': Y, 'lbls' : lbls, 'info': "ORL Faces processed to 64x64 images."}, data_set)
-    
-def download_rogers_girolami_data():
+
+def xw_pen(data_set='xw_pen'):
+    """
+    Load the pen tilt data from 'xw_pen_15.csv'.
+
+    :param data_set: name of the data set (download key and local directory name).
+    :rtype: dictionary with 'Y' (the loaded csv), 'X' (the integers 0..484 as a column) and 'info', merged with the resource details of data_set.
+    """
+    if not data_available(data_set):
+        download_data(data_set)
+    csv_file = os.path.join(data_path, data_set, 'xw_pen_15.csv')
+    tilt = np.loadtxt(csv_file, delimiter=',')
+    # The input is a fixed index of 485 time steps (presumably the number of rows in the csv -- confirm).
+    time_steps = np.arange(485)[:, None]
+    return data_details_return({'Y': tilt, 'X': time_steps, 'info': "Tilt data from a personalized digital assistant pen. Plot in original paper showed regression between time steps 175 and 275."}, data_set)
+
+
+def download_rogers_girolami_data(data_set='rogers_girolami_data'):
    if not data_available('rogers_girolami_data'):
        download_data(data_set)
        path = os.path.join(data_path, data_set)
@ -675,7 +645,7 @@ def olympic_marathon_men(data_set='olympic_marathon_men'):
    Y = olympics[:, 1:2]
    return data_details_return({'X': X, 'Y': Y}, data_set)

-def olympics():
+def olympic_sprints(data_set='rogers_girolami_data'):
    """All olympics sprint winning times for multiple output prediction."""
    X = np.zeros((0, 2))
    Y = np.zeros((0, 1))
@ -693,7 +663,18 @@ def olympics():
    data['X'] = X
    data['Y'] = Y
    data['info'] = "Olympics sprint event winning for men and women to 2008. Data is from Rogers and Girolami's First Course in Machine Learning."
-    return data
+    return data_details_return({
+        'X': X,
+        'Y': Y,
+        'info': "Olympics sprint event winning for men and women to 2008. Data is from Rogers and Girolami's First Course in Machine Learning.",
+        'output_info': {
+          0:'100m Men',
+          1:'100m Women',
+          2:'200m Men',
+          3:'200m Women',
+          4:'400m Men',
+          5:'400m Women'}
+        }, data_set)

 # def movielens_small(partNo=1,seed=default_seed):
 #     np.random.seed(seed=seed)
@ -786,15 +767,15 @@ def creep_data(data_set='creep_rupture'):
    X = all_data[:, features].copy()
    return data_details_return({'X': X, 'y': y}, data_set)

-def cmu_mocap_49_balance():
+def cmu_mocap_49_balance(data_set='cmu_mocap'):
    """Load CMU subject 49's one legged balancing motion that was used by Alvarez, Luengo and Lawrence at AISTATS 2009."""
    train_motions = ['18', '19']
    test_motions = ['20']
-    data = cmu_mocap('49', train_motions, test_motions, sample_every=4)
+    data = cmu_mocap('49', train_motions, test_motions, sample_every=4, data_set=data_set)
    data['info'] = "One legged balancing motions from CMU data base subject 49. As used in Alvarez, Luengo and Lawrence at AISTATS 2009. It consists of " + data['info']
    return data

-def cmu_mocap_35_walk_jog():
+def cmu_mocap_35_walk_jog(data_set='cmu_mocap'):
    """Load CMU subject 35's walking and jogging motions, the same data that was used by Taylor, Roweis and Hinton at NIPS 2007. but without their preprocessing. Also used by Lawrence at AISTATS 2007."""
    train_motions = ['01', '02', '03', '04', '05', '06',
                '07', '08', '09', '10', '11', '12',
@ -802,7 +783,7 @@ def cmu_mocap_35_walk_jog():
                '20', '21', '22', '23', '24', '25',
                '26', '28', '30', '31', '32', '33', '34']
    test_motions = ['18', '29']
-    data = cmu_mocap('35', train_motions, test_motions, sample_every=4)
+    data = cmu_mocap('35', train_motions, test_motions, sample_every=4, data_set=data_set)
    data['info'] = "Walk and jog data from CMU data base subject 35. As used in Tayor, Roweis and Hinton at NIPS 2007, but without their pre-processing (i.e. as used by Lawrence at AISTATS 2007). It consists of " + data['info']
    return data

@ -814,7 +795,7 @@ def cmu_mocap(subject, train_motions, test_motions=[], sample_every=4, data_set=
    # Make sure the data is downloaded.
    all_motions = train_motions + test_motions
    resource = cmu_urls_files(([subject], [all_motions]))
-    data_resources[data_set] = data_resources['cmu_mocap_full']
+    data_resources[data_set] = data_resources['cmu_mocap_full'].copy()
    data_resources[data_set]['files'] = resource['files']
    data_resources[data_set]['urls'] = resource['urls']
    if resource['urls']:
@ -884,3 +865,5 @@ def cmu_mocap(subject, train_motions, test_motions=[], sample_every=4, data_set=
    if sample_every != 1:
        info += ' Data is sub-sampled to every ' + str(sample_every) + ' frames.'
    return data_details_return({'Y': Y, 'lbls' : lbls, 'Ytest': Ytest, 'lblstest' : lblstest, 'info': info, 'skel': skel}, data_set)
+
+
--- a/GPy/util/datasets/data_resources_create.py
+++ b/GPy/util/datasets/data_resources_create.py
@ -0,0 +1,127 @@
+import json
+
+neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/'
+sam_url = 'http://www.cs.nyu.edu/~roweis/data/'
+cmu_url = 'http://mocap.cs.cmu.edu/subjects/'
+
+data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'],
+                                       'files' : [['ankurDataPoseSilhouette.mat']],
+                                       'license' : None,
+                                       'citation' : """3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.""",
+                                       'details' : """Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."""},
+
+                  'boston_housing' : {'urls' : ['http://archive.ics.uci.edu/ml/machine-learning-databases/housing/'],
+                                      'files' : [['Index', 'housing.data', 'housing.names']],
+                                      'citation' : """Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.""",
+                                      'details' : """The Boston Housing data relates house values in Boston to a range of input variables.""",
+                                      'license' : None,
+                                      'size' : 51276
+                                      },
+                  'brendan_faces' : {'urls' : [sam_url],
+                                     'files': [['frey_rawface.mat']],
+                                     'citation' : 'Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.',
+                                     'details' : """A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.""",
+                                     'license': None,
+                                     'size' : 1100584},
+                  'cmu_mocap_full' : {'urls' : ['http://mocap.cs.cmu.edu'],
+                                 'files' : [['allasfamc.zip']],
+                                 'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.
+The database was created with funding from NSF EIA-0196217.""",
+                                 'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""",
+                                 'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.""",
+                                 'size' : None},
+                  'creep_rupture' : {'urls' : ['http://www.msm.cam.ac.uk/map/data/tar/'],
+                                     'files' : [['creeprupt.tar']],
+                                     'citation' : 'Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.',
+                                     'details' : """Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.""",
+                                     'license' : None,
+                                     'size' : 602797},
+                  'della_gatta' : {'urls' : [neil_url + 'della_gatta/'],
+                                   'files': [['DellaGattadata.mat']],
+                                   'citation' : 'Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008',
+                                   'details': "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.",
+                                   'license':None,
+                                   'size':3729650},
+                  'epomeo_gpx' : {'urls' : [neil_url + 'epomeo_gpx/'],
+                                   'files': [['endomondo_1.gpx', 'endomondo_2.gpx', 'garmin_watch_via_endomondo.gpx','viewranger_phone.gpx','viewranger_tablet.gpx']],
+                                   'citation' : '',
+                                   'details': "Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).",
+                                   'license':None,
+                                   'size': 2031872},
+                  'three_phase_oil_flow': {'urls' : [neil_url + 'three_phase_oil_flow/'],
+                                           'files' : [['DataTrnLbls.txt', 'DataTrn.txt', 'DataTst.txt', 'DataTstLbls.txt', 'DataVdn.txt', 'DataVdnLbls.txt']],
+                                           'citation' : 'Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593',
+                                           'details' : """The three phase oil data used initially for demonstrating the Generative Topographic mapping.""",
+                                           'license' : None,
+                                           'size' : 712796},
+                  'rogers_girolami_data' : {'urls' : ['https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/'],
+                                            'files' : [['firstcoursemldata.tar.gz']],
+                                            'suffices' : [['?dl=1']],
+                                            'citation' : 'A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146',
+                                            'details' : """Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.""",
+                                            'license' : None,
+                                            'size' : 21949154},
+                  'olivetti_faces' : {'urls' : [neil_url + 'olivetti_faces/', sam_url],
+                                      'files' : [['att_faces.zip'], ['olivettifaces.mat']],
+                                            'citation' : 'Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994',
+                                            'details' : """Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. """,
+                                            'license': None,
+                                            'size' : 8561331},
+                  'olympic_marathon_men' : {'urls' : [neil_url + 'olympic_marathon_men/'],
+                                            'files' : [['olympicMarathonTimes.csv']],
+                                            'citation' : None,
+                                            'details' : """Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data""",
+                                            'license': None,
+                                            'size' : 584},
+                  'osu_run1' : {'urls': ['http://accad.osu.edu/research/mocap/data/', neil_url + 'stick/'],
+                                'files': [['run1TXT.ZIP'],['connections.txt']],
+                                'details' : "Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.",
+                                'citation' : 'The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.',
+                                'license' : 'Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).',
+                                'size': 338103},
+                  'osu_accad' : {'urls': ['http://accad.osu.edu/research/mocap/data/', neil_url + 'stick/'],
+                                'files': [['swagger1TXT.ZIP','handspring1TXT.ZIP','quickwalkTXT.ZIP','run1TXT.ZIP','sprintTXT.ZIP','dogwalkTXT.ZIP','camper_04TXT.ZIP','dance_KB3_TXT.ZIP','per20_TXT.ZIP','perTWO07_TXT.ZIP','perTWO13_TXT.ZIP','perTWO14_TXT.ZIP','perTWO15_TXT.ZIP','perTWO16_TXT.ZIP'],['connections.txt']],
+                                'details' : "Motion capture data of different motions from the Open Motion Data Project at Ohio State University.",
+                                'citation' : 'The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.',
+                                'license' : 'Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).',
+                                'size': 15922790},
+                  'pumadyn-32nm' : {'urls' : ['ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/'],
+                                    'files' : [['pumadyn-32nm.tar.gz']],
+                                    'details' : """Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.""",
+                                    'citation' : """Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.""",
+                                    'license' : """Data is made available by the Delve system at the University of Toronto""",
+                                    'size' : 5861646},
+                  'robot_wireless' : {'urls' : [neil_url + 'robot_wireless/'],
+                                      'files' : [['uw-floor.txt']],
+                                      'citation' : """WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.""",
+                                      'details' : """Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.""",
+                                      'license' : None,
+                                      'size' : 284390},
+                  'swiss_roll' : {'urls' : ['http://isomap.stanford.edu/'],
+                                  'files' : [['swiss_roll_data.mat']],
+                                  'details' : """Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""",
+                                  'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000',
+                                  'license' : None,
+                                  'size' : 800256},
+                  'ripley_prnn_data' : {'urls' : ['http://www.stats.ox.ac.uk/pub/PRNN/'],
+                                        'files' : [['Cushings.dat', 'README', 'crabs.dat', 'fglass.dat', 'fglass.grp', 'pima.te', 'pima.tr', 'pima.tr2', 'synth.te', 'synth.tr', 'viruses.dat', 'virus3.dat']],
+                                        'details' : """Data sets from Brian Ripley's Pattern Recognition and Neural Networks""",
+                                        'citation': """Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7""",
+                                        'license' : None,
+                                        'size' : 93565},
+                  'isomap_face_data' : {'urls' : [neil_url + 'isomap_face_data/'],
+                                        'files' : [['face_data.mat']],
+                                        'details' : """Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""",
+                                        'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000',
+                                        'license' : None,
+                                        'size' : 24229368},
+                  'xw_pen' : {'urls' : [neil_url + 'xw_pen/'],
+                                        'files' : [['xw_pen_15.csv']],
+                                        'details' : """Accelerometer pen data used for robust regression by Tipping and Lawrence.""",
+                                        'citation' : 'Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005',
+                                        'license' : None,
+                                        'size' : 3410}
+                  }
+
+# Write the resource table to data_resources.json in the current working directory.
+# The handle is not called `file` so that the Python 2 builtin of that name is not shadowed.
+with open('data_resources.json', 'w') as json_file:
+    json.dump(data_resources, json_file)
--- a/GPy/util/diag.py
+++ b/GPy/util/diag.py
@ -0,0 +1,114 @@
+'''
+.. module:: GPy.util.diag
+
+.. moduleauthor:: Max Zwiessele <ibinbei@gmail.com>
+
+'''
+__updated__ = '2013-12-03'
+
+import numpy as np
+
+def view(A, offset=0):
+    """
+    Get a view on the diagonal elements of a 2D array.
+
+    This is actually a view (!) on the diagonal of the array, so you can
+    in-place adjust the view. The view is built from the strides of A itself,
+    so it is also correct for non-contiguous inputs such as slices or
+    transposes of a larger array.
+
+    :param :class:`ndarray` A: 2 dimensional, square numpy array
+    :param int offset: view offset to give back (negative entries allowed)
+    :rtype: :class:`ndarray` view of diag(A)
+
+    >>> import numpy as np
+    >>> X = np.arange(9).reshape(3,3)
+    >>> view(X)
+    array([0, 4, 8])
+    >>> d = view(X)
+    >>> d += 2
+    >>> view(X)
+    array([ 2,  6, 10])
+    >>> view(X, offset=-1)
+    array([3, 7])
+    >>> subtract(X, 3, offset=-1)
+    array([[ 2,  1,  2],
+           [ 0,  6,  5],
+           [ 6,  4, 10]])
+    """
+    from numpy.lib.stride_tricks import as_strided
+    assert A.ndim == 2, "only implemented for 2 dimensions"
+    assert A.shape[0] == A.shape[1], "attempting to get the view of non-square matrix?!"
+    # One step along the diagonal moves one row down and one column right.
+    # Using A's own strides (rather than (n+1)*itemsize) keeps this valid
+    # when A is a slice or transpose of a larger buffer.
+    diag_strides = (A.strides[0] + A.strides[1], )
+    if offset > 0:
+        return as_strided(A[0, offset:], shape=(A.shape[0] - offset, ), strides=diag_strides)
+    elif offset < 0:
+        return as_strided(A[-offset:, 0], shape=(A.shape[0] + offset, ), strides=diag_strides)
+    else:
+        return as_strided(A, shape=(A.shape[0], ), strides=diag_strides)
+
+def _diag_ufunc(A,b,offset,func):
+    """
+    Apply func(diagonal, b) with the result written back into the diagonal view of A, then return A.
+    """
+    diagonal = view(A, offset)
+    # The third positional argument is the output array, so A is changed in place.
+    func(diagonal, b, diagonal)
+    return A
+
+def times(A, b, offset=0):
+    """
+    Multiply the (offset) diagonal of A by b in place (!).
+
+    Broadcasting is allowed, thus b can be scalar. If offset is not zero,
+    make sure b matches the length of that diagonal.
+
+    :param ndarray A: 2 dimensional array
+    :param ndarray-like b: either one dimensional or scalar
+    :param int offset: same as in view.
+    :rtype: A itself, with the diagonal adjusted inplace
+    """
+    return _diag_ufunc(A, b, offset=offset, func=np.multiply)
+multiply = times
+
+def divide(A, b, offset=0):
+    """
+    Divide the (offset) diagonal of A by b in place (!).
+
+    Broadcasting is allowed, thus b can be scalar. If offset is not zero,
+    make sure b matches the length of that diagonal.
+
+    :param ndarray A: 2 dimensional array
+    :param ndarray-like b: either one dimensional or scalar
+    :param int offset: same as in view.
+    :rtype: A itself, with the diagonal adjusted inplace
+    """
+    return _diag_ufunc(A, b, offset=offset, func=np.divide)
+
+def add(A, b, offset=0):
+    """
+    Add b to the (offset) diagonal of A in place (!).
+
+    Broadcasting is allowed, thus b can be scalar. If offset is not zero,
+    make sure b matches the length of that diagonal.
+
+    :param ndarray A: 2 dimensional array
+    :param ndarray-like b: either one dimensional or scalar
+    :param int offset: same as in view.
+    :rtype: A itself, with the diagonal adjusted inplace
+    """
+    return _diag_ufunc(A, b, offset=offset, func=np.add)
+
+def subtract(A, b, offset=0):
+    """
+    Subtract b from the (offset) diagonal of A in place (!).
+
+    Broadcasting is allowed, thus b can be scalar. If offset is not zero,
+    make sure b matches the length of that diagonal.
+
+    :param ndarray A: 2 dimensional array
+    :param ndarray-like b: either one dimensional or scalar
+    :param int offset: same as in view.
+    :rtype: A itself, with the diagonal adjusted inplace
+    """
+    return _diag_ufunc(A, b, offset=offset, func=np.subtract)
+        
+# When the module is executed as a script, run the doctests embedded in its docstrings.
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@ -12,19 +12,34 @@ import ctypes
 from ctypes import byref, c_char, c_int, c_double # TODO
 # import scipy.lib.lapack
 import scipy
+import warnings
+import os
+from config import *

 if np.all(np.float64((scipy.__version__).split('.')[:2]) >= np.array([0, 12])):
    import scipy.linalg.lapack as lapack
 else:
    from scipy.linalg.lapack import flapack as lapack

-try:
-    _blaslib = ctypes.cdll.LoadLibrary(np.core._dotblas.__file__) # @UndefinedVariable
-    _blas_available = True
-    assert hasattr('dsyrk_',_blaslib)
-    assert hasattr('dsyr_',_blaslib)
-except:
-    _blas_available = False
+
+# Locate the BLAS routines dsyrk and dsyr through ctypes and record in
+# _blas_available whether that succeeded.
+if config.getboolean('anaconda', 'installed') and config.getboolean('anaconda', 'MKL'):
+    # Anaconda with MKL: take the routines from mkl_rt.dll under the configured install location.
+    try:
+        anaconda_path = str(config.get('anaconda', 'location'))
+        mkl_rt = ctypes.cdll.LoadLibrary(os.path.join(anaconda_path, 'DLLs', 'mkl_rt.dll'))
+        dsyrk = mkl_rt.dsyrk
+        dsyr = mkl_rt.dsyr
+        _blas_available = True
+    except Exception as e:
+        # Still best effort (a bad config entry or a missing DLL must not break the import),
+        # but the failure is reported and KeyboardInterrupt/SystemExit are no longer swallowed.
+        _blas_available = False
+        warnings.warn("warning: caught this exception:" + str(e))
+else:
+    try:
+        _blaslib = ctypes.cdll.LoadLibrary(np.core._dotblas.__file__) # @UndefinedVariable
+        dsyrk = _blaslib.dsyrk_
+        dsyr = _blaslib.dsyr_
+        _blas_available = True
+    except (AttributeError, OSError) as e:
+        # AttributeError: numpy has no _dotblas module or the library lacks the symbol.
+        # OSError: the shared library itself could not be loaded.
+        _blas_available = False
+        warnings.warn("warning: caught this exception:" + str(e))

 def dtrtrs(A, B, lower=0, trans=0, unitdiag=0):
    """
@ -61,6 +76,14 @@ def dpotri(A, lower=0):
    """
    return lapack.dpotri(A, lower=lower)

+def pddet(A):
+    """
+    Log-determinant of a symmetric positive definite matrix.
+
+    Despite the name, the value returned is twice the sum of the logs of
+    the diagonal of ``jitchol(A)``, i.e. log|A| assuming jitchol returns a
+    Cholesky factor of A.
+
+    :param A: symmetric positive definite matrix, handed to jitchol
+    :returns: 2 * sum(log(diag(jitchol(A))))
+    """
+    chol_diag = np.diag(jitchol(A))
+    return 2*sum(np.log(chol_diag))
+
 def trace_dot(a, b):
    """
    Efficiently compute the trace of the matrix product of a and b
@ -205,7 +228,7 @@ def multiple_pdinv(A):
    return np.dstack(invs), np.array(halflogdets)


-def PCA(Y, input_dim):
+def pca(Y, input_dim):
    """
    Principal component analysis: maximum likelihood solution by SVD

@ -218,7 +241,7 @@ def PCA(Y, input_dim):

    """
    if not np.allclose(Y.mean(axis=0), 0.0):
-        print "Y is not zero mean, centering it locally (GPy.util.linalg.PCA)"
+        print "Y is not zero mean, centering it locally (GPy.util.linalg.pca)"

        # Y -= Y.mean(axis=0)

@ -229,6 +252,124 @@ def PCA(Y, input_dim):
    W *= v;
    return X, W.T

+def ppca(Y, Q, iterations=100):
+    """
+    EM implementation for probabilistic pca.
+
+    Non-finite entries of Y are masked before the data are centred.
+
+    :param array-like Y: Observed Data, N x D (must expose ``.shape``)
+    :param int Q: Dimensionality for reduced array
+    :param int iterations: number of iterations for EM
+    :returns: (X, W) with X of shape N x Q and W of shape D x Q
+    :raises ValueError: if a solve produces NaN or inf (asarray_chkfinite)
+    """
+    from numpy.ma import dot as madot
+    N, D = Y.shape
+    # Initialise W randomly
+    W = np.random.randn(D, Q) * 1e-3
+    Y = np.ma.masked_invalid(Y, copy=0)
+    mu = Y.mean(0)
+    Ycentered = Y - mu
+    # Define the latent estimate up front so the return below cannot hit an
+    # unbound name when no E-step completes (iterations == 0, or a
+    # LinAlgError raised by the very first solve).
+    exp_x = np.zeros((N, Q))
+    try:
+        for _ in range(iterations):
+            # E-step: latent positions given the current W
+            exp_x = np.asarray_chkfinite(np.linalg.solve(W.T.dot(W), madot(W.T, Ycentered.T))).T
+            # M-step: W given the current latent positions
+            W = np.asarray_chkfinite(np.linalg.solve(exp_x.T.dot(exp_x), madot(exp_x.T, Ycentered))).T
+    except np.linalg.LinAlgError:
+        # A singular system is treated as "converged": keep the last estimates.
+        pass
+    return np.asarray_chkfinite(exp_x), np.asarray_chkfinite(W)
+
+def ppca_missing_data_at_random(Y, Q, iters=100):
+    """
+    EM implementation of Probabilistic pca for when there is missing data.
+
+    Taken from <SheffieldML, https://github.com/SheffieldML>
+
+    .. math::
+        \\mathbf{Y} = \\mathbf{XW} + \\epsilon \\text{, where}
+        \\epsilon = \\mathcal{N}(0, \\sigma^2 \\mathbf{I})
+
+    Non-finite entries of Y are masked and treated as missing.
+
+    .. note:: ``debug`` is hard-wired to 1, so every iteration prints
+        progress, redraws a matplotlib figure and sleeps for 0.3 seconds.
+
+    :param array-like Y: observed data, N x D (must expose ``.shape``)
+    :param int Q: latent dimensionality
+    :param int iters: number of EM iterations
+    :returns: X, W, sigma^2
+    """
+    from numpy.ma import dot as madot
+    import diag
+    from GPy.util.subarray_and_sorting import common_subarrays
+    import time
+    debug = 1
+    # Initialise W randomly
+    N, D = Y.shape
+    W = np.random.randn(Q, D) * 1e-3
+    Y = np.ma.masked_invalid(Y, copy=1)
+    nu = 1.
+    #num_obs_i = 1./Y.count()
+    Ycentered = Y - Y.mean(0)
+
+    X = np.zeros((N,Q))
+    # Group rows (cs) and columns (cr) that share the same missingness mask.
+    cs = common_subarrays(Y.mask)
+    cr = common_subarrays(Y.mask, 1)
+    Sigma = np.zeros((N, Q, Q))
+    Sigma2 = np.zeros((N, Q, Q))
+    mu = np.zeros(D)
+    if debug:
+        import matplotlib.pyplot as pylab
+        fig = pylab.figure("FIT MISSING DATA");
+        ax = fig.gca()
+        ax.cla()
+        lines = pylab.plot(np.zeros((N,Q)).dot(W))
+    W2 = np.zeros((Q,D))
+
+    for i in range(iters):
+#         Sigma = np.linalg.solve(diag.add(madot(W,W.T), nu), diag.times(np.eye(Q),nu))
+#         exp_x = madot(madot(Ycentered, W.T),Sigma)/nu
+#         Ycentered = (Y - exp_x.dot(W).mean(0))
+#         #import ipdb;ipdb.set_trace()
+#         #Ycentered = mu
+#         W = np.linalg.solve(madot(exp_x.T,exp_x) + Sigma, madot(exp_x.T, Ycentered))
+#         nu = (((Ycentered - madot(exp_x, W))**2).sum(0) + madot(W.T,madot(Sigma,W)).sum(0)).sum()/N
+        # NOTE(review): Sigma2, X2, mu2, W2, nu2, nu3 and nu4 look like
+        # alternative (grouped/vectorised) formulations kept for comparison;
+        # none of them feeds the returned X, W or nu.
+        for csi, (mask, index) in enumerate(cs.iteritems()):
+            mask = ~np.array(mask)
+            Sigma2[index, :, :] = nu * np.linalg.inv(diag.add(W2[:,mask].dot(W2[:,mask].T), nu))
+            #X[index,:] = madot((Sigma[csi]/nu),madot(W,Ycentered[index].T))[:,0]
+        X2 = ((Sigma2/nu) * (madot(Ycentered,W2.T).base)[:,:,None]).sum(-1)
+        mu2 = (Y - X.dot(W)).mean(0)
+        # Per-row update of Sigma and X using only the observed columns of that row.
+        for n in range(N):
+            Sigma[n] = nu * np.linalg.inv(diag.add(W[:,~Y.mask[n]].dot(W[:,~Y.mask[n]].T), nu))
+            X[n, :] = (Sigma[n]/nu).dot(W[:,~Y.mask[n]].dot(Ycentered[n,~Y.mask[n]].T))
+        # Per-column mean over the observed rows of that column.
+        for d in range(D):
+            mu[d] = (Y[~Y.mask[:,d], d] - X[~Y.mask[:,d]].dot(W[:, d])).mean()
+        Ycentered = (Y - mu)
+        nu3 = 0.
+        for cri, (mask, index) in enumerate(cr.iteritems()):
+            mask = ~np.array(mask)
+            W2[:,index] = np.linalg.solve(X[mask].T.dot(X[mask]) + Sigma[mask].sum(0), madot(X[mask].T, Ycentered[mask,index]))[:,None]
+            W2[:,index] = np.linalg.solve(X.T.dot(X) + Sigma.sum(0), madot(X.T, Ycentered[:,index]))
+            #nu += (((Ycentered[mask,index] - X[mask].dot(W[:,index]))**2).sum(0) + W[:,index].T.dot(Sigma[mask].sum(0).dot(W[:,index])).sum(0)).sum()
+            nu3 += (((Ycentered[index] - X.dot(W[:,index]))**2).sum(0) + W[:,index].T.dot(Sigma.sum(0).dot(W[:,index])).sum(0)).sum()
+        nu3 /= N
+        nu = 0.
+        nu2 = 0.
+        W = np.zeros((Q,D))
+        # Per-column update of W and accumulation of nu over observed entries.
+        for j in range(D):
+            W[:,j] = np.linalg.solve(X[~Y.mask[:,j]].T.dot(X[~Y.mask[:,j]]) + Sigma[~Y.mask[:,j]].sum(0), madot(X[~Y.mask[:,j]].T, Ycentered[~Y.mask[:,j],j]))
+            nu2f = np.tensordot(W[:,j].T, Sigma[~Y.mask[:,j],:,:], [0,1]).dot(W[:,j])
+            nu2s = W[:,j].T.dot(Sigma[~Y.mask[:,j],:,:].sum(0).dot(W[:,j]))
+            nu2 += (((Ycentered[~Y.mask[:,j],j] - X[~Y.mask[:,j],:].dot(W[:,j]))**2) + nu2f).sum()
+            # NOTE: this inner ``i`` rebinds the outer loop variable; the
+            # outer ``range`` iteration is unaffected by the rebinding.
+            for i in range(N):
+                if not Y.mask[i,j]:
+                    nu += ((Ycentered[i,j] - X[i,:].dot(W[:,j]))**2) + W[:,j].T.dot(Sigma[i,:,:].dot(W[:,j]))
+        nu /= N
+        nu2 /= N
+        nu4 = (((Ycentered - X.dot(W))**2).sum(0) + W.T.dot(Sigma.sum(0).dot(W)).sum(0)).sum()/N
+        # The unconditional ``ipdb.set_trace()`` that used to sit here was
+        # removed: it halted every call in the debugger and made the
+        # function fail with ImportError wherever ipdb is not installed.
+        if debug:
+            #print Sigma[0]
+            print "nu:", nu, "sum(X):", X.sum()
+            pred_y = X.dot(W)
+            for x, l in zip(pred_y.T, lines):
+                l.set_ydata(x)
+            ax.autoscale_view()
+            ax.set_ylim(pred_y.min(), pred_y.max())
+            fig.canvas.draw()
+            time.sleep(.3)
+    return np.asarray_chkfinite(X), np.asarray_chkfinite(W), nu
+

 def tdot_numpy(mat, out=None):
    return np.dot(mat, mat.T, out)
@ -264,7 +405,7 @@ def tdot_blas(mat, out=None):
    BETA = c_double(0.0)
    C = out.ctypes.data_as(ctypes.c_void_p)
    LDC = c_int(np.max(out.strides) / 8)
-    _blaslib.dsyrk_(byref(UPLO), byref(TRANS), byref(N), byref(K),
+    dsyrk(byref(UPLO), byref(TRANS), byref(N), byref(K),
            byref(ALPHA), A, byref(LDA), byref(BETA), C, byref(LDC))

    symmetrify(out, upper=True)
@ -294,7 +435,7 @@ def DSYR_blas(A, x, alpha=1.):
    A_ = A.ctypes.data_as(ctypes.c_void_p)
    x_ = x.ctypes.data_as(ctypes.c_void_p)
    INCX = c_int(1)
-    _blaslib.dsyr_(byref(UPLO), byref(N), byref(ALPHA),
+    dsyr(byref(UPLO), byref(N), byref(ALPHA),
            x_, byref(INCX), A_, byref(LDA))
    symmetrify(A, upper=True)

@ -325,7 +466,7 @@ def symmetrify(A, upper=False):
    """
    N, M = A.shape
    assert N == M
-    
+
    c_contig_code = """
    int iN;
    for (int i=1; i<N; i++){
@ -406,3 +547,27 @@ def backsub_both_sides(L, X, transpose='left'):
    else:
        tmp, _ = lapack.dtrtrs(L, np.asfortranarray(X), lower=1, trans=0)
        return lapack.dtrtrs(L, np.asfortranarray(tmp.T), lower=1, trans=0)[0].T
+
+def PCA(Y, input_dim):
+    """
+    Principal component analysis: maximum likelihood solution by SVD.
+
+    Y itself is not modified; a centred copy is decomposed.
+
+    :param Y: NxD np.array of data
+    :param input_dim: int, dimension of projection
+
+    :rval X: - Nxinput_dim np.array of dimensionality reduced data
+    :rval W: - input_dimxD mapping from X to Y
+    """
+    col_means = Y.mean(axis=0)
+    if not np.allclose(col_means, 0.0):
+        print "Y is not zero mean, centering it locally (GPy.util.linalg.PCA)"
+
+    U, S, Vt = linalg.svd(Y - Y.mean(axis=0), full_matrices=False)
+    X = U[:, 0:input_dim]
+    W = np.dot(np.diag(S), Vt).T[:, 0:input_dim]
+    # Rescale so the columns of X have unit standard deviation and move the
+    # scale into W.
+    scale = X.std(axis=0)
+    X /= scale
+    W *= scale
+    return X, W.T
--- a/GPy/util/maps.py
+++ b/GPy/util/maps.py
@ -0,0 +1,161 @@
+import numpy as np
+import pylab as pb
+import matplotlib.patches as patches
+from matplotlib.patches import Polygon
+from matplotlib.collections import PatchCollection
+#from matplotlib import cm
+import shapefile
+import re
+
+pb.ion()
+
+def plot(shape_records,facecolor='w',edgecolor='k',linewidths=.5, ax=None,xlims=None,ylims=None):
+    """
+    Plot the geometry of a shapefile
+
+    :param shape_records: geometry and attributes list
+    :type shape_records: ShapeRecord object (output of a shapeRecords() method)
+    :param facecolor: color to be used to fill in polygons
+    :param edgecolor: color to be used for lines
+    :param linewidths: line width passed on to the patch collections
+    :param ax: axes to plot on; a new figure is created when None.
+    :type ax: axes handle
+    :param xlims: optional (min, max) overriding the x limits taken from the bounding boxes
+    :param ylims: optional (min, max) overriding the y limits taken from the bounding boxes
+    """
+    # Axes handle
+    if ax is None:
+        ax = pb.figure().add_subplot(111)
+
+    # One patch collection per shape record, one polygon per part
+    for srec in shape_records:
+        points = np.vstack(srec.shape.points)
+        # start offset of every part, closed off by the total number of points
+        bounds = list(srec.shape.parts) + [points.shape[0]]
+        polygs = [Polygon(points[start:stop]) for start, stop in zip(bounds[:-1], bounds[1:])]
+        ax.add_collection(PatchCollection(polygs,facecolor=facecolor,edgecolor=edgecolor, linewidths=linewidths))
+
+    # Plot limits: union of all bounding boxes unless overridden
+    _box = np.vstack([srec.shape.bbox for srec in shape_records])
+    minx,miny = np.min(_box[:,:2],0)
+    maxx,maxy = np.max(_box[:,2:],0)
+
+    if xlims is not None:
+        minx,maxx = xlims
+    if ylims is not None:
+        miny,maxy = ylims
+    ax.set_xlim(minx,maxx)
+    ax.set_ylim(miny,maxy)
+
+
+def string_match(sf,regex,field=2):
+    """
+    Return the geometry and attributes of a shapefile whose fields match a regular expression given
+
+    :param sf: shapefile
+    :type sf: shapefile object
+    :param regex: regular expression to match (searched with re.search)
+    :type regex: string
+    :param field: field number to be matched with the regex
+    :type field: integer
+    :returns: (index, shape_records) - positions of the matching records and the records themselves
+    """
+    matches = [(pos, srec) for pos, srec in enumerate(sf.shapeRecords())
+               if re.search(regex, srec.record[field]) is not None]
+    index = [pos for pos, _ in matches]
+    shape_records = [srec for _, srec in matches]
+    return index,shape_records
+
+def bbox_match(sf,bbox,inside_only=True):
+    """
+    Return the geometry and attributes of a shapefile that lie within (or intersect) a bounding box
+
+    :param sf: shapefile
+    :type sf: shapefile object
+    :param bbox: bounding box
+    :type bbox: list of floats [x_min,y_min,x_max,y_max]
+    :param inside_only: True if the objects returned are those that lie within the bbox and False if the objects returned are any that intersect the bbox
+    :type inside_only: Boolean
+    :returns: (index, shape_records) - positions of the matching records and the records themselves
+    """
+    A,B,C,D = bbox
+    index = []
+    shape_records = []
+    for pos, srec in enumerate(sf.shapeRecords()):
+        a,b,c,d = srec.shape.bbox
+        if inside_only:
+            # record box fully contained in the query box
+            keep = A <= a and B <= b and C >= c and D >= d
+        else:
+            # Two axis-aligned boxes intersect exactly when their extents
+            # overlap on both axes (touching edges count).  The previous
+            # corner/span case analysis also accepted boxes lying entirely
+            # to the left of or below the query box.
+            keep = a <= C and c >= A and b <= D and d >= B
+        if keep:
+            index.append(pos)
+            shape_records.append(srec)
+    return index,shape_records
+
+
+def plot_bbox(sf,bbox,inside_only=True):
+    """
+    Plot the geometry of a shapefile within a bbox
+
+    The records are selected with bbox_match and drawn with plot, using the
+    bounding box itself as the axis limits.
+
+    :param sf: shapefile
+    :type sf: shapefile object
+    :param bbox: bounding box
+    :type bbox: list of floats [x_min,y_min,x_max,y_max]
+    :param inside_only: True if the objects plotted are those that lie within the bbox and False if the objects plotted are any that intersect the bbox
+    :type inside_only: Boolean
+    """
+    _, shape_records = bbox_match(sf,bbox,inside_only)
+    x_min,y_min,x_max,y_max = bbox
+    plot(shape_records,xlims=[x_min,x_max],ylims=[y_min,y_max])
+
+def plot_string_match(sf,regex,field):
+    """
+    Plot the geometry of a shapefile whose fields match a regular expression given
+
+    The records are selected with string_match and drawn with plot.
+
+    :param sf: shapefile
+    :type sf: shapefile object
+    :param regex: regular expression to match
+    :type regex: string
+    :param field: field number to be matched with the regex
+    :type field: integer
+    """
+    _, matching_records = string_match(sf,regex,field)
+    plot(matching_records)
+
+
+def new_shape_string(sf,name,regex,field=2,type=shapefile.POINT):
+    """
+    Write the records of a shapefile that match a regular expression to a new shapefile
+
+    :param sf: shapefile to read from
+    :type sf: shapefile object
+    :param name: target handed to the writer's save()
+    :param regex: regular expression to match (see string_match)
+    :type regex: string
+    :param field: field number to be matched with the regex
+    :type field: integer
+    :param type: currently unused; the writer takes its shape type from sf
+    """
+    newshp = shapefile.Writer(shapeType = sf.shapeType)
+    newshp.autoBalance = 1
+
+    index,shape_records = string_match(sf,regex,field)
+
+    # NOTE(review): ``index`` holds positions of matching *records*, yet it is
+    # used here to pick entries of ``sf.fields`` - confirm this is intended.
+    selected_fields = [sf.fields[j] for j in index]
+    for f in selected_fields:
+        newshp.field(name=f[0],fieldType=f[1],size=f[2],decimal=f[3])
+
+    for sr in shape_records:
+        # every record is written as a single part holding all of its points
+        _parts = [list(sr.shape.points)]
+
+        newshp.line(parts=_parts)
+        newshp.records.append(sr.record)
+        print len(sr.record)
+
+    newshp.save(name)
+    print index
--- a/GPy/util/misc.py
+++ b/GPy/util/misc.py
@ -32,18 +32,6 @@ def chain_3(d3f_dg3, dg_dx, d2f_dg2, d2g_dx2, df_dg, d3g_dx3):
    """
    return d3f_dg3*(dg_dx**3) + 3*d2f_dg2*dg_dx*d2g_dx2 + df_dg*d3g_dx3

-### make a parameter to its corresponding array:
-def param_to_array(*param):
-    """
-    Convert an arbitrary number of parameters to :class:ndarray class objects. This is for
-    converting parameter objects to numpy arrays, when using scipy.weave.inline routine.
-    In scipy.weave.blitz there is no automatic array detection (even when the array inherits
-    from :class:ndarray)"""
-    assert len(param) > 0, "At least one parameter needed"
-    if len(param) == 1:
-        return param[0].view(np.ndarray)
-    return map(lambda x: x.view(np.ndarray), param)
-
 def opt_wrapper(m, **kwargs):
    """
    This function just wraps the optimization procedure of a GPy
@ -171,7 +159,6 @@ def fast_array_equal(A, B):
    elif ((A == None) and (B != None)) or ((A != None) and (B == None)):
        return False
    elif A.shape == B.shape:
-        A, B = param_to_array(A, B)
        if A.ndim == 2:
            N, D = [int(i) for i in A.shape]
            value = weave.inline(code2, support_code=support_code,
@ -187,12 +174,14 @@ def fast_array_equal(A, B):

    return value

-if __name__ == '__main__':
-    import pylab as plt
-    X = np.linspace(1,10, 100)[:, None]
-    X = X[np.random.permutation(X.shape[0])[:20]]
-    inducing = kmm_init(X, m = 5)
-    plt.figure()
-    plt.plot(X.flatten(), np.ones((X.shape[0],)), 'x')
-    plt.plot(inducing, 0.5* np.ones((len(inducing),)), 'o')
-    plt.ylim((0.0, 10.0))
+### make a parameter to its corresponding array:
+def param_to_array(*param):
+    """
+    Convert an arbitrary number of parameters to :class:ndarray class objects.
+
+    Each argument is re-viewed as a plain ``np.ndarray``. This is for
+    converting parameter objects to numpy arrays, when using the
+    scipy.weave.inline routine: in scipy.weave.blitz there is no automatic
+    array detection (even when the array inherits from :class:ndarray).
+
+    :param param: one or more objects offering ``.view``
+    :returns: a single ndarray view for one argument, otherwise a list of views
+    """
+    assert len(param) > 0, "At least one parameter needed"
+    if len(param) == 1:
+        (only,) = param
+        return only.view(np.ndarray)
+    return [p.view(np.ndarray) for p in param]
--- a/GPy/util/mocap.py
+++ b/GPy/util/mocap.py
@ -67,14 +67,14 @@ class tree:
        for i in range(len(self.vertices)):
            if self.vertices[i].id == id:
                return i
-        raise Error, 'Reverse look up of id failed.'
+        raise ValueError('Reverse look up of id failed.')

    def get_index_by_name(self, name):
        """Give the index associated with a given vertex name."""
        for i in range(len(self.vertices)):
            if self.vertices[i].name == name:
                return i
-        raise Error, 'Reverse look up of name failed.'
+        raise ValueError('Reverse look up of name failed.')

    def order_vertices(self):
        """Order vertices in the graph such that parents always have a lower index than children."""
@ -433,6 +433,8 @@ class acclaim_skeleton(skeleton):
        lin = self.read_line(fid)
        while lin != ':DEGREES':
            lin = self.read_line(fid)
+            if lin == '':
+                raise ValueError('Could not find :DEGREES in ' + fid.name)

        counter = 0
        lin = self.read_line(fid)
@ -443,9 +445,9 @@ class acclaim_skeleton(skeleton):
                if frame_no:
                    counter += 1
                    if counter != frame_no:
-                        raise Error, 'Unexpected frame number.'
+                        raise ValueError('Unexpected frame number.')
                else:
-                    raise Error, 'Single bone name  ...'
+                    raise ValueError('Single bone name  ...')
            else:
                ind = self.get_index_by_name(parts[0])
                bones[ind].append(np.array([float(channel) for channel in parts[1:]]))
@ -573,7 +575,7 @@ class acclaim_skeleton(skeleton):
                        return
                    lin = self.read_line(fid)
            else:
-                raise Error, 'Unrecognised file format'
+                raise ValueError('Unrecognised file format')
            self.finalize()
            
    def read_units(self, fid):
--- a/GPy/util/plot_latent.py
+++ b/GPy/util/plot_latent.py
@ -20,8 +20,8 @@ def most_significant_input_dimensions(model, which_indices):
        input_1, input_2 = which_indices
    return input_1, input_2

-def plot_latent(model, labels=None, which_indices=None, 
-                resolution=50, ax=None, marker='o', s=40, 
+def plot_latent(model, labels=None, which_indices=None,
+                resolution=50, ax=None, marker='o', s=40,
                fignum=None, plot_inducing=False, legend=True,
                aspect='auto', updates=False):
    """
@ -38,11 +38,9 @@ def plot_latent(model, labels=None, which_indices=None,

    input_1, input_2 = most_significant_input_dimensions(model, which_indices)

-    X = np.asarray(model.X)
-    
    # first, plot the output variance as a function of the latent space
-    Xtest, xx, yy, xmin, xmax = util.plot.x_frame2D(X[:, [input_1, input_2]], resolution=resolution)
-    Xtest_full = np.zeros((Xtest.shape[0], X.shape[1]))
+    Xtest, xx, yy, xmin, xmax = util.plot.x_frame2D(model.X[:, [input_1, input_2]], resolution=resolution)
+    Xtest_full = np.zeros((Xtest.shape[0], model.X.shape[1]))

    def plot_function(x):
        Xtest_full[:, [input_1, input_2]] = x
@ -50,10 +48,10 @@ def plot_latent(model, labels=None, which_indices=None,
        var = var[:, :1]
        return np.log(var)
    view = ImshowController(ax, plot_function,
-                            tuple(X[:, [input_1, input_2]].min(0)) + tuple(X[:, [input_1, input_2]].max(0)),
+                            tuple(model.X[:, [input_1, input_2]].min(0)) + tuple(model.X[:, [input_1, input_2]].max(0)),
                            resolution, aspect=aspect, interpolation='bilinear',
                            cmap=pb.cm.binary)
-    
+
 #     ax.imshow(var.reshape(resolution, resolution).T,
 #               extent=[xmin[0], xmax[0], xmin[1], xmax[1]], cmap=pb.cm.binary, interpolation='bilinear', origin='lower')

@ -76,11 +74,11 @@ def plot_latent(model, labels=None, which_indices=None,

        index = np.nonzero(labels == ul)[0]
        if model.input_dim == 1:
-            x = X[index, input_1]
+            x = model.X[index, input_1]
            y = np.zeros(index.size)
        else:
-            x = X[index, input_1]
-            y = X[index, input_2]
+            x = model.X[index, input_1]
+            y = model.X[index, input_2]
        ax.scatter(x, y, marker=m, s=s, color=util.plot.Tango.nextMedium(), label=this_label)

    ax.set_xlabel('latent dimension %i' % input_1)
@ -102,8 +100,8 @@ def plot_latent(model, labels=None, which_indices=None,
        raw_input('Enter to continue')
    return ax

-def plot_magnification(model, labels=None, which_indices=None, 
-                resolution=60, ax=None, marker='o', s=40, 
+def plot_magnification(model, labels=None, which_indices=None,
+                resolution=60, ax=None, marker='o', s=40,
                fignum=None, plot_inducing=False, legend=True,
                aspect='auto', updates=False):
    """
@ -119,17 +117,16 @@ def plot_magnification(model, labels=None, which_indices=None,
        labels = np.ones(model.num_data)

    input_1, input_2 = most_significant_input_dimensions(model, which_indices)
-    X = np.asarray(model.X)
-    
+
    # first, plot the output variance as a function of the latent space
-    Xtest, xx, yy, xmin, xmax = util.plot.x_frame2D(X[:, [input_1, input_2]], resolution=resolution)
-    Xtest_full = np.zeros((Xtest.shape[0], X.shape[1]))
+    Xtest, xx, yy, xmin, xmax = util.plot.x_frame2D(model.X[:, [input_1, input_2]], resolution=resolution)
+    Xtest_full = np.zeros((Xtest.shape[0], model.X.shape[1]))
    def plot_function(x):
        Xtest_full[:, [input_1, input_2]] = x
        mf=model.magnification(Xtest_full)
        return mf
    view = ImshowController(ax, plot_function,
-                            tuple(X.min(0)[:, [input_1, input_2]]) + tuple(X.max(0)[:, [input_1, input_2]]),
+                            tuple(model.X.min(0)[:, [input_1, input_2]]) + tuple(model.X.max(0)[:, [input_1, input_2]]),
                            resolution, aspect=aspect, interpolation='bilinear',
                            cmap=pb.cm.gray)

@ -152,11 +149,11 @@ def plot_magnification(model, labels=None, which_indices=None,

        index = np.nonzero(labels == ul)[0]
        if model.input_dim == 1:
-            x = X[index, input_1]
+            x = model.X[index, input_1]
            y = np.zeros(index.size)
        else:
-            x = X[index, input_1]
-            y = X[index, input_2]
+            x = model.X[index, input_1]
+            y = model.X[index, input_2]
        ax.scatter(x, y, marker=m, s=s, color=util.plot.Tango.nextMedium(), label=this_label)

    ax.set_xlabel('latent dimension %i' % input_1)
--- a/GPy/util/subarray_and_sorting.py
+++ b/GPy/util/subarray_and_sorting.py
@ -0,0 +1,56 @@
+'''
+.. module:: GPy.util.subarray_and_sorting
+
+.. moduleauthor:: Max Zwiessele <ibinbei@gmail.com>
+
+'''
+__updated__ = '2013-12-02'
+
+import numpy as np
+
+def common_subarrays(X, axis=0):
+    """
+    Find common subarrays of 2 dimensional X, where axis is the axis to apply the search over.
+    Common subarrays are returned as a dictionary of <subarray, [index]> pairs, where
+    the subarray is a tuple representing the subarray and the index is the index
+    for the subarray in X, where index is the index to the remaining axis.
+
+    :param :class:`np.ndarray` X: 2d array to check for common subarrays in
+    :param int axis: axis to apply subarray detection over.
+        When axis is 0, rows are compared; otherwise columns are compared.
+    :returns: defaultdict(list) mapping tuple(subarray) to the list of indices
+        (in increasing order) at which that subarray occurs
+
+    Examples:
+    =========
+
+    In a 2d array:
+    >>> import numpy as np
+    >>> X = np.zeros((3,6), dtype=bool)
+    >>> X[[1,1,1],[0,4,5]] = 1; X[1:,[2,3]] = 1
+    >>> X
+    array([[False, False, False, False, False, False],
+           [ True, False,  True,  True,  True,  True],
+           [False, False,  True,  True, False, False]], dtype=bool)
+    >>> d = common_subarrays(X,axis=1)
+    >>> len(d)
+    3
+    >>> X[:, d[tuple(X[:,0])]]
+    array([[False, False, False],
+           [ True,  True,  True],
+           [False, False, False]], dtype=bool)
+    >>> d[tuple(X[:,4])] == d[tuple(X[:,0])] == [0, 4, 5]
+    True
+    >>> d[tuple(X[:,1])]
+    [1]
+    """
+    from collections import defaultdict
+    assert X.ndim == 2 and axis in (0,1), "Only implemented for 2D arrays"
+    subarrays = defaultdict(list)
+    # Walk the rows (axis=0) or the columns (axis=1) explicitly.  The former
+    # np.apply_along_axis call was used only for its side effect and handed
+    # back lists of varying length, which apply_along_axis cannot store.
+    slices = X if axis == 0 else X.T
+    for position, sub in enumerate(slices):
+        subarrays[tuple(sub)].append(position)
+    return subarrays
+
+if __name__ == '__main__':
+    # Executed as a script: run the doctests embedded in this module.
+    from doctest import testmod
+    testmod()
--- a/GPy/util/symbolic.py
+++ b/GPy/util/symbolic.py
@ -1,4 +1,4 @@
-from sympy import Function, S, oo, I, cos, sin, asin, log, erf,pi,exp, sqrt
+from sympy import Function, S, oo, I, cos, sin, asin, log, erf, pi, exp, sqrt, sign


 class ln_diff_erf(Function):
@ -10,7 +10,7 @@ class ln_diff_erf(Function):
            return -2*exp(-x1**2)/(sqrt(pi)*(erf(x0)-erf(x1)))
        elif argindex == 1:
            x0, x1 = self.args
-            return 2*exp(-x0**2)/(sqrt(pi)*(erf(x0)-erf(x1)))
+            return 2.*exp(-x0**2)/(sqrt(pi)*(erf(x0)-erf(x1)))
        else:
            raise ArgumentIndexError(self, argindex)
        
@ -19,25 +19,209 @@ class ln_diff_erf(Function):
        if x0.is_Number and x1.is_Number:            
            return log(erf(x0)-erf(x1))

-class sim_h(Function):
+class dh_dd_i(Function):
    nargs = 5
-
-    def fdiff(self, argindex=5):
-        t, tprime, d_i, d_j, l = self.args
-        if argindex == 5:
-            return -2*sin(t)
-        elif argindex == 4:
-            return 2*exp(d_i)
-        elif argindex == 3:
-            return 4*exp(d_j)
-        elif argindex == 2:
-            return 3*exp(l)
-        elif argindex == 1:
-            return 2*d_j
-            
    @classmethod
    def eval(cls, t, tprime, d_i, d_j, l):
-        return exp((d_j/2*l)**2)/(d_i+d_j)*(exp(-d_j*(tprime - t))*(erf((tprime-t)/l - d_j/2*l) + erf(t/l + d_j/2*l)) - exp(-(d_j*tprime + d_i))*(erf(tprime/l - d_j/2*l) + erf(d_j/2*l)))
+        if (t.is_Number
+            and tprime.is_Number
+            and d_i.is_Number
+            and d_j.is_Number
+            and l.is_Number):
+
+            diff_t = (t-tprime)
+            l2 = l*l
+            h = h(t, tprime, d_i, d_j, l)
+            half_l_di = 0.5*l*d_i
+            arg_1 = half_l_di + tprime/l
+            arg_2 = half_l_di - (t-tprime)/l
+            ln_part_1 = ln_diff_erf(arg_1, arg_2)
+            arg_1 = half_l_di 
+            arg_2 = half_l_di - t/l
+            sign_val = sign(t/l)
+            ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l)
+
+            base = ((0.5*d_i*l2*(d_i+d_j)-1)*h 
+                    + (-diff_t*sign_val*exp(half_l_di*half_l_di
+                                          -d_i*diff_t
+                                          +ln_part_1)
+                       +t*sign_val*exp(half_l_di*half_l_di
+                                          -d_i*t-d_j*tprime
+                                          +ln_part_2))
+                    + l/sqrt(pi)*(-exp(-diff_t*diff_t/l2)
+                                     +exp(-tprime*tprime/l2-d_i*t)
+                                     +exp(-t*t/l2-d_j*tprime)
+                                     -exp(-(d_i*t + d_j*tprime))))
+            return base/(d_i+d_j)
+
+class dh_dd_j(Function):
+    """
+    Symbolic function standing for the derivative of ``h`` with respect to
+    its fourth argument, d_j (see ``h.fdiff``).
+
+    ``eval`` only produces a value when all five arguments are sympy
+    Numbers; otherwise it returns None and the call stays unevaluated.
+    """
+    nargs = 5
+    @classmethod
+    def eval(cls, t, tprime, d_i, d_j, l):
+        if (t.is_Number
+            and tprime.is_Number
+            and d_i.is_Number
+            and d_j.is_Number
+            and l.is_Number):
+            half_l_di = 0.5*l*d_i
+            # Keep the value under its own name: assigning to ``h`` inside
+            # this method made ``h`` a local variable, so the call itself
+            # raised UnboundLocalError.
+            h_val = h(t, tprime, d_i, d_j, l)
+            sign_val = sign(t/l)
+            ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l)
+            base = tprime*sign_val*exp(half_l_di*half_l_di-(d_i*t+d_j*tprime)+ln_part_2)-h_val
+            return base/(d_i+d_j)
+    
+class dh_dl(Function):
+    """
+    Symbolic function standing for the derivative of ``h`` with respect to
+    its fifth argument, l (see ``h.fdiff``).
+
+    ``eval`` only produces a value when all five arguments are sympy
+    Numbers; otherwise it returns None and the call stays unevaluated.
+    """
+    nargs = 5
+    @classmethod
+    def eval(cls, t, tprime, d_i, d_j, l):
+        if (t.is_Number
+            and tprime.is_Number
+            and d_i.is_Number
+            and d_j.is_Number
+            and l.is_Number):
+
+            diff_t = (t-tprime)
+            l2 = l*l
+            # Keep the value under its own name: assigning to ``h`` inside
+            # this method made ``h`` a local variable, so the call itself
+            # raised UnboundLocalError.
+            h_val = h(t, tprime, d_i, d_j, l)
+            return 0.5*d_i*d_i*l*h_val + 2./(sqrt(pi)*(d_i+d_j))*((-diff_t/l2-d_i/2.)*exp(-diff_t*diff_t/l2)+(-tprime/l2+d_i/2.)*exp(-tprime*tprime/l2-d_i*t)-(-t/l2-d_i/2.)*exp(-t*t/l2-d_j*tprime)-d_i/2.*exp(-(d_i*t+d_j*tprime)))
+
+class dh_dt(Function):
+    """
+    Symbolic function standing for the derivative of ``h`` with respect to
+    its first argument, t (see ``h.fdiff``).
+
+    ``eval`` only acts when all five arguments are sympy Numbers: it gives
+    NaN if any of them is NaN, and otherwise builds the same expression as
+    ``h.eval`` and calls ``.diff(t)`` on it.
+    """
+    nargs = 5
+    @classmethod
+    def eval(cls, t, tprime, d_i, d_j, l):
+        args = (t, tprime, d_i, d_j, l)
+        if not all(arg.is_Number for arg in args):
+            return
+        if any(arg is S.NaN for arg in args):
+            return S.NaN
+
+        half_l_di = 0.5*l*d_i
+        ln_part_1 = ln_diff_erf(half_l_di + tprime/l, half_l_di - (t-tprime)/l)
+        sign_val = sign(t/l)
+        ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l)
+
+        first = sign_val*exp(half_l_di*half_l_di
+                             - d_i*(t-tprime)
+                             + ln_part_1
+                             - log(d_i + d_j))
+        second = sign_val*exp(half_l_di*half_l_di
+                              - d_i*t - d_j*tprime
+                              + ln_part_2
+                              - log(d_i + d_j))
+        # NOTE(review): in this branch t is a sympy Number, so this asks for
+        # a derivative with respect to a numeric value - confirm intent.
+        return (first - second).diff(t)
+
+class dh_dtprime(Function):
+    """
+    Symbolic function standing for the derivative of ``h`` with respect to
+    its second argument, tprime (see ``h.fdiff``).
+
+    ``eval`` only acts when all five arguments are sympy Numbers: it gives
+    NaN if any of them is NaN, and otherwise builds the same expression as
+    ``h.eval`` and calls ``.diff(tprime)`` on it.
+    """
+    nargs = 5
+    @classmethod
+    def eval(cls, t, tprime, d_i, d_j, l):
+        args = (t, tprime, d_i, d_j, l)
+        if not all(arg.is_Number for arg in args):
+            return
+        if any(arg is S.NaN for arg in args):
+            return S.NaN
+
+        half_l_di = 0.5*l*d_i
+        ln_part_1 = ln_diff_erf(half_l_di + tprime/l, half_l_di - (t-tprime)/l)
+        sign_val = sign(t/l)
+        ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l)
+
+        first = sign_val*exp(half_l_di*half_l_di
+                             - d_i*(t-tprime)
+                             + ln_part_1
+                             - log(d_i + d_j))
+        second = sign_val*exp(half_l_di*half_l_di
+                              - d_i*t - d_j*tprime
+                              + ln_part_2
+                              - log(d_i + d_j))
+        # NOTE(review): in this branch tprime is a sympy Number, so this asks
+        # for a derivative with respect to a numeric value - confirm intent.
+        return (first - second).diff(tprime)
+
+
+class h(Function):
+    """
+    Symbolic function h(t, tprime, d_i, d_j, l).
+
+    Derivatives are not taken from the evaluated expression; ``fdiff`` hands
+    each argument position to a dedicated function (dh_dt, dh_dtprime,
+    dh_dd_i, dh_dd_j, dh_dl).  ``eval`` only produces a value when all five
+    arguments are sympy Numbers.
+    """
+    nargs = 5
+    def fdiff(self, argindex=5):
+        """
+        Return the derivative with respect to argument number ``argindex``
+        (1-based).
+
+        :raises ArgumentIndexError: if argindex is not in 1..5
+        """
+        t, tprime, d_i, d_j, l = self.args
+        if argindex == 1:
+            return dh_dt(t, tprime, d_i, d_j, l)
+        elif argindex == 2:
+            return dh_dtprime(t, tprime, d_i, d_j, l)
+        elif argindex == 3:
+            return dh_dd_i(t, tprime, d_i, d_j, l)
+        elif argindex == 4:
+            return dh_dd_j(t, tprime, d_i, d_j, l)
+        elif argindex == 5:
+            return dh_dl(t, tprime, d_i, d_j, l)
+        else:
+            # Fail loudly instead of silently returning None for an index
+            # outside 1..5.
+            from sympy.core.function import ArgumentIndexError
+            raise ArgumentIndexError(self, argindex)
+
+    @classmethod
+    def eval(cls, t, tprime, d_i, d_j, l):
+        # putting in the is_Number stuff forces it to look for a fdiff method for derivative. If it's left out, then when asking for self.diff, it just does the diff on the eval symbolic terms directly. We want to avoid that because we are looking to ensure everything is numerically stable. Maybe it's because of the if statement that this happens?
+        if (t.is_Number
+            and tprime.is_Number
+            and d_i.is_Number
+            and d_j.is_Number
+            and l.is_Number):
+            if (t is S.NaN
+                or tprime is S.NaN
+                or d_i is S.NaN
+                or d_j is S.NaN
+                or l is S.NaN):
+                return S.NaN
+            else:
+                half_l_di = 0.5*l*d_i
+                arg_1 = half_l_di + tprime/l
+                arg_2 = half_l_di - (t-tprime)/l
+                ln_part_1 = ln_diff_erf(arg_1, arg_2)
+                sign_val = sign(t/l)
+                ln_part_2 = ln_diff_erf(half_l_di, half_l_di - t/l)
+
+                # Everything is kept inside the exponent (log of the erf
+                # difference, log of d_i + d_j) rather than multiplied
+                # outside it.
+                return (sign_val*exp(half_l_di*half_l_di
+                                        - d_i*(t-tprime)
+                                        + ln_part_1
+                                        - log(d_i + d_j))
+                        - sign_val*exp(half_l_di*half_l_di
+                                          - d_i*t - d_j*tprime
+                                          + ln_part_2
+                                          - log(d_i + d_j)))
+
+                # Previous direct formulation, kept for reference:
+                # return (exp((d_j/2.*l)**2)/(d_i+d_j)
+                #         *(exp(-d_j*(tprime - t))
+                #           *(erf((tprime-t)/l - d_j/2.*l)
+                #             + erf(t/l + d_j/2.*l))
+                #           - exp(-(d_j*tprime + d_i))
+                #           *(erf(tprime/l - d_j/2.*l)
+                #             + erf(d_j/2.*l))))

 class erfc(Function):
    nargs = 1
@ -53,52 +237,3 @@ class erfcx(Function):
    def eval(cls, arg):
        return erfc(arg)*exp(arg*arg)

-class sinc_grad(Function):
-    nargs = 1
-    
-    def fdiff(self, argindex=1):
-        if argindex==1:
-            # Strictly speaking this should be computed separately, as it won't work when x=0. See http://calculus.subwiki.org/wiki/Sinc_function
-            return ((2-x*x)*sin(self.args[0]) - 2*x*cos(x))/(x*x*x)
-        else:
-            raise ArgumentIndexError(self, argindex)
-
-    
-    @classmethod
-    def eval(cls, x):
-        if x.is_Number:
-            if x is S.NaN:
-                return S.NaN
-            elif x is S.Zero:
-                return S.Zero
-            else:
-                return (x*cos(x) - sin(x))/(x*x)
-            
-class sinc(Function):
-    
-    nargs = 1
-    
-    def fdiff(self, argindex=1):
-        if argindex==1:
-            return sinc_grad(self.args[0])
-        else:
-            raise ArgumentIndexError(self, argindex)
-
-    
-    @classmethod
-    def eval(cls, arg):
-        if arg.is_Number:
-            if arg is S.NaN:
-                return S.NaN
-            elif arg is S.Zero:
-                return S.One
-            else:
-                return sin(arg)/arg
-
-        if arg.func is asin:
-            x = arg.args[0]
-            return x / arg
-
-    def _eval_is_real(self):
-        return self.args[0].is_real
-
--- a/GPy/util/univariate_Gaussian.py
+++ b/GPy/util/univariate_Gaussian.py
@ -13,24 +13,32 @@ def std_norm_cdf(x):
    Cumulative standard Gaussian distribution
    Based on Abramowitz, M. and Stegun, I. (1970)
    """
+    #Generalize for many x
+    x = np.asarray(x).copy()
+    cdf_x = np.zeros_like(x)
+    N = x.size
    support_code = "#include <math.h>"
    code = """

-    double sign = 1.0;
-    if (x < 0.0){
-        sign = -1.0;
-        x = -x;
+    double sign, t, erf;
+    for (int i=0; i<N; i++){
+        sign = 1.0;
+        if (x[i] < 0.0){
+            sign = -1.0;
+            x[i] = -x[i];
+        }
+        x[i] = x[i]/sqrt(2.0);
+
+        t = 1.0/(1.0 +  0.3275911*x[i]);
+
+        erf = 1. - exp(-x[i]*x[i])*t*(0.254829592 + t*(-0.284496736 + t*(1.421413741 + t*(-1.453152027 + t*(1.061405429)))));
+
+        //return_val = 0.5*(1.0 + sign*erf);
+        cdf_x[i] = 0.5*(1.0 + sign*erf);
    }
-    x = x/sqrt(2.0);
-
-    double t = 1.0/(1.0 +  0.3275911*x);
-
-    double erf = 1. - exp(-x*x)*t*(0.254829592 + t*(-0.284496736 + t*(1.421413741 + t*(-1.453152027 + t*(1.061405429)))));
-
-    return_val = 0.5*(1.0 + sign*erf);
    """
-    x = float(x)
-    return weave.inline(code,arg_names=['x'],support_code=support_code)
+    weave.inline(code, arg_names=['x', 'cdf_x', 'N'], support_code=support_code)
+    return cdf_x

 def inv_std_norm_cdf(x):
    """
--- a/GPy/util/visualize.py
+++ b/GPy/util/visualize.py
@ -4,7 +4,7 @@ import GPy
 import numpy as np
 import matplotlib as mpl
 import time
-from PIL import Image
+import Image
 try:
    import visual
    visual_available = True
@ -92,7 +92,7 @@ class lvm(matplotlib_show):
        :param latent_axes: the axes where the latent visualization should be plotted.
        """
        if vals == None:
-            vals = np.asarray(model.X[0])
+            vals = model.X[0]

        matplotlib_show.__init__(self, vals, axes=latent_axes)

@ -137,6 +137,7 @@ class lvm(matplotlib_show):
        pass

    def on_click(self, event):
+        print 'click!'
        if event.inaxes!=self.latent_axes: return
        self.move_on = not self.move_on
        self.called = True
@ -171,21 +172,21 @@ class lvm_subplots(lvm):
    latent_axes is a np array of dimension np.ceil(input_dim/2),
    one for each pair of the latent dimensions.
    """
-    def __init__(self, vals, model, data_visualize, latent_axes=None, sense_axes=None):
-        self.nplots = int(np.ceil(model.input_dim/2.))+1
+    def __init__(self, vals, Model, data_visualize, latent_axes=None, sense_axes=None):
+        self.nplots = int(np.ceil(Model.input_dim/2.))+1
        assert len(latent_axes)==self.nplots
        if vals==None:
-            vals = np.asarray(model.X[0, :])
+            vals = Model.X[0, :]
        self.latent_values = vals 

        for i, axis in enumerate(latent_axes):
            if i == self.nplots-1:
-                if self.nplots*2!=model.input_dim:
+                if self.nplots*2!=Model.input_dim:
                    latent_index = [i*2, i*2]
-                lvm.__init__(self, self.latent_vals, model, data_visualize, axis, sense_axes, latent_index=latent_index)
+                lvm.__init__(self, self.latent_vals, Model, data_visualize, axis, sense_axes, latent_index=latent_index)
            else:
                latent_index = [i*2, i*2+1]
-                lvm.__init__(self, self.latent_vals, model, data_visualize, axis, latent_index=latent_index)
+                lvm.__init__(self, self.latent_vals, Model, data_visualize, axis, latent_index=latent_index)



@ -459,17 +460,17 @@ class mocap_data_show(matplotlib_show):
        self.axes.set_ylim(self.y_lim)
        self.axes.set_zlim(self.z_lim)

-class stick_show(mocap_data_show_vpython):
+class stick_show(mocap_data_show):
    """Show a three dimensional point cloud as a figure. Connect elements of the figure together using the matrix connect."""
-    def __init__(self, vals, connect=None, scene=None):
-        mocap_data_show_vpython.__init__(self, vals, scene=scene, connect=connect, radius=0.04)
+    def __init__(self, vals, connect=None, axes=None):
+        mocap_data_show.__init__(self, vals, axes=axes, connect=connect)

    def process_values(self):
        self.vals = self.vals.reshape((3, self.vals.shape[1]/3)).T

-class skeleton_show(mocap_data_show_vpython):
+class skeleton_show(mocap_data_show):
    """data_show class for visualizing motion capture data encoded as a skeleton with angles."""
-    def __init__(self, vals, skel, scene=None, padding=0):
+    def __init__(self, vals, skel, axes=None, padding=0):
        """data_show class for visualizing motion capture data encoded as a skeleton with angles.
        :param vals: set of modeled angles to use for printing in the axis when it's first created.
        :type vals: np.array
@ -481,7 +482,7 @@ class skeleton_show(mocap_data_show_vpython):
        self.skel = skel
        self.padding = padding
        connect = skel.connection_matrix()
-        mocap_data_show_vpython.__init__(self, vals, scene=scene, connect=connect, radius=0.4)
+        mocap_data_show.__init__(self, vals, axes=axes, connect=connect)
    def process_values(self):
        """Takes a set of angles and converts them to the x,y,z coordinates in the internal prepresentation of the class, ready for plotting.

--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@ -222,7 +222,7 @@ class TanhWarpingFunction_d(WarpingFunction):
        """


-        mpsi = psi.coSpy()
+        mpsi = psi.copy()
        d = psi[-1]
        mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)