Merge branch 'devel' of https://github.com/SheffieldML/GPy into devel

2026-06-05 14:55:15 +02:00 · 2013-12-10 17:12:23 +00:00 · 2013-12-10 17:12:23 +00:00 · 9b32fd47ee
commit 9b32fd47ee
parent bab477f149 4a751fd2da
56 changed files with 1934 additions and 1657 deletions
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@ -16,7 +16,7 @@ class GPBase(Model):
    def __init__(self, X, likelihood, kernel, normalize_X=False):
        if len(X.shape)==1:
            X = X.reshape(-1,1)
-            warning.warn("One dimension output (N,) being reshaped to (N,1)")
+            warnings.warn("One dimension output (N,) being reshaped to (N,1)")
        self.X = X
        assert len(self.X.shape) == 2, "too many dimensions for X input"
        self.num_data, self.input_dim = self.X.shape
@ -76,7 +76,7 @@ class GPBase(Model):
        :type noise_model: integer.
        :returns: Ysim: set of simulations, a Numpy array (N x samples).
        """
-        Ysim = self.posterior_samples_f(X, size, which_parts=which_parts, full_cov=True)
+        Ysim = self.posterior_samples_f(X, size, which_parts=which_parts)
        if isinstance(self.likelihood,Gaussian):
            noise_std = np.sqrt(self.likelihood._get_params())
            Ysim += np.random.normal(0,noise_std,Ysim.shape)
@ -107,7 +107,7 @@ class GPBase(Model):
            levels=20, samples=0, fignum=None, ax=None, resolution=None,
            plot_raw=False,
            linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
-        """ 
+        """
        Plot the posterior of the GP.
          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
          - In two dimsensions, a contour-plot shows the mean predicted function
@ -176,8 +176,8 @@ class GPBase(Model):
                upper = m + 2*np.sqrt(v)
                Y = self.likelihood.Y
            else:
-                m, v, lower, upper = self.predict(Xgrid, which_parts=which_parts,sampling=False) #Compute the exact mean
-                m_, v_, lower, upper = self.predict(Xgrid, which_parts=which_parts,sampling=True,num_samples=15000) #Apporximate the percentiles
+                m, v, lower, upper = self.predict(Xgrid, which_parts=which_parts, sampling=False) #Compute the exact mean
+                m_, v_, lower, upper = self.predict(Xgrid, which_parts=which_parts, sampling=True, num_samples=15000) #Apporximate the percentiles
                Y = self.likelihood.data
            for d in which_data_ycols:
                gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
@ -185,7 +185,7 @@ class GPBase(Model):

            #optionally plot some samples
            if samples: #NOTE not tested with fixed_inputs
-                Ysim = self.posterior_samples(Xgrid, samples, which_parts=which_parts, full_cov=True)
+                Ysim = self.posterior_samples(Xgrid, samples, which_parts=which_parts)
                for yi in Ysim.T:
                    ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
                    #ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@ -453,7 +453,12 @@ class Model(Parameterized):

        if not verbose:
            # just check the global ratio
-            dx = step * np.sign(np.random.uniform(-1, 1, x.size))
+
+            #choose a random direction to find the linear approximation in
+            if x.size==2:
+                dx = step * np.ones(2) # random direction for 2 parameters can fail dure to symmetry
+            else:
+                dx = step * np.sign(np.random.uniform(-1, 1, x.size))

            # evaulate around the point x
            f1, g1 = self.objective_and_gradients(x + dx)
--- a/GPy/core/svigp.py
+++ b/GPy/core/svigp.py
@ -31,7 +31,6 @@ class SVIGP(GPBase):

    """

-
    def __init__(self, X, likelihood, kernel, Z, q_u=None, batchsize=10, X_variance=None):
        GPBase.__init__(self, X, likelihood, kernel, normalize_X=False)
        self.batchsize=batchsize
@ -433,7 +432,7 @@ class SVIGP(GPBase):
            else:
                return mu, diag_var[:,None]

-    def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
+    def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False, sampling=False, num_samples=15000):
        # normalize X values
        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
        if X_variance_new is not None:
@ -443,7 +442,7 @@ class SVIGP(GPBase):
        mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts)

        # now push through likelihood
-        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
+        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, sampling=sampling, num_samples=num_samples)

        return mean, var, _025pm, _975pm

--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@ -6,12 +6,11 @@
 Gaussian Processes classification
 """
 import pylab as pb
-import numpy as np
 import GPy

 default_seed = 10000

-def oil(num_inducing=50, max_iters=100, kernel=None):
+def oil(num_inducing=50, max_iters=100, kernel=None, optimize=True, plot=True):
    """
    Run a Gaussian process classification on the three phase oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.

@ -25,7 +24,7 @@ def oil(num_inducing=50, max_iters=100, kernel=None):
    Ytest[Ytest.flatten()==-1] = 0

    # Create GP model
-    m = GPy.models.SparseGPClassification(X, Y,kernel=kernel,num_inducing=num_inducing)
+    m = GPy.models.SparseGPClassification(X, Y, kernel=kernel, num_inducing=num_inducing)

    # Contrain all parameters to be positive
    m.tie_params('.*len')
@ -33,15 +32,16 @@ def oil(num_inducing=50, max_iters=100, kernel=None):
    m.update_likelihood_approximation()

    # Optimize
-    m.optimize(max_iters=max_iters)
+    if optimize:
+        m.optimize(max_iters=max_iters)
    print(m)

    #Test
    probs = m.predict(Xtest)[0]
-    GPy.util.classification.conf_matrix(probs,Ytest)
+    GPy.util.classification.conf_matrix(probs, Ytest)
    return m

-def toy_linear_1d_classification(seed=default_seed):
+def toy_linear_1d_classification(seed=default_seed, optimize=True, plot=True):
    """
    Simple 1D classification example using EP approximation

@ -58,21 +58,23 @@ def toy_linear_1d_classification(seed=default_seed):
    m = GPy.models.GPClassification(data['X'], Y)

    # Optimize
-    #m.update_likelihood_approximation()
-    # Parameters optimization:
-    #m.optimize()
-    #m.update_likelihood_approximation()
-    m.pseudo_EM()
+    if optimize:
+        #m.update_likelihood_approximation()
+        # Parameters optimization:
+        #m.optimize()
+        #m.update_likelihood_approximation()
+        m.pseudo_EM()

    # Plot
-    fig, axes = pb.subplots(2,1)
-    m.plot_f(ax=axes[0])
-    m.plot(ax=axes[1])
-    print(m)
+    if plot:
+        fig, axes = pb.subplots(2, 1)
+        m.plot_f(ax=axes[0])
+        m.plot(ax=axes[1])

+    print m
    return m

-def toy_linear_1d_classification_laplace(seed=default_seed):
+def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=True):
    """
    Simple 1D classification example using Laplace approximation

@ -90,24 +92,25 @@ def toy_linear_1d_classification_laplace(seed=default_seed):

    # Model definition
    m = GPy.models.GPClassification(data['X'], Y, likelihood=laplace_likelihood)
-
    print m
+
    # Optimize
-    #m.update_likelihood_approximation()
-    # Parameters optimization:
-    m.optimize('bfgs', messages=1)
-    #m.pseudo_EM()
+    if optimize:
+        #m.update_likelihood_approximation()
+        # Parameters optimization:
+        m.optimize('bfgs', messages=1)
+        #m.pseudo_EM()

    # Plot
-    fig, axes = pb.subplots(2,1)
-    m.plot_f(ax=axes[0])
-    m.plot(ax=axes[1])
-    print(m)
+    if plot:
+        fig, axes = pb.subplots(2, 1)
+        m.plot_f(ax=axes[0])
+        m.plot(ax=axes[1])

+    print m
    return m

-
-def sparse_toy_linear_1d_classification(num_inducing=10,seed=default_seed):
+def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, optimize=True, plot=True):
    """
    Sparse 1D classification example

@ -121,24 +124,26 @@ def sparse_toy_linear_1d_classification(num_inducing=10,seed=default_seed):
    Y[Y.flatten() == -1] = 0

    # Model definition
-    m = GPy.models.SparseGPClassification(data['X'], Y,num_inducing=num_inducing)
-    m['.*len']= 4.
+    m = GPy.models.SparseGPClassification(data['X'], Y, num_inducing=num_inducing)
+    m['.*len'] = 4.

    # Optimize
-    #m.update_likelihood_approximation()
-    # Parameters optimization:
-    #m.optimize()
-    m.pseudo_EM()
+    if optimize:
+        #m.update_likelihood_approximation()
+        # Parameters optimization:
+        #m.optimize()
+        m.pseudo_EM()

    # Plot
-    fig, axes = pb.subplots(2,1)
-    m.plot_f(ax=axes[0])
-    m.plot(ax=axes[1])
-    print(m)
+    if plot:
+        fig, axes = pb.subplots(2, 1)
+        m.plot_f(ax=axes[0])
+        m.plot(ax=axes[1])

+    print m
    return m

-def toy_heaviside(seed=default_seed):
+def toy_heaviside(seed=default_seed, optimize=True, plot=True):
    """
    Simple 1D classification example using a heavy side gp transformation

@ -153,24 +158,26 @@ def toy_heaviside(seed=default_seed):

    # Model definition
    noise_model = GPy.likelihoods.bernoulli(GPy.likelihoods.noise_models.gp_transformations.Heaviside())
-    likelihood = GPy.likelihoods.EP(Y,noise_model)
+    likelihood = GPy.likelihoods.EP(Y, noise_model)
    m = GPy.models.GPClassification(data['X'], likelihood=likelihood)

    # Optimize
-    m.update_likelihood_approximation()
-    # Parameters optimization:
-    m.optimize()
-    #m.pseudo_EM()
+    if optimize:
+        m.update_likelihood_approximation()
+        # Parameters optimization:
+        m.optimize()
+        #m.pseudo_EM()

    # Plot
-    fig, axes = pb.subplots(2,1)
-    m.plot_f(ax=axes[0])
-    m.plot(ax=axes[1])
-    print(m)
+    if plot:
+        fig, axes = pb.subplots(2, 1)
+        m.plot_f(ax=axes[0])
+        m.plot(ax=axes[1])

+    print m
    return m

-def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=None):
+def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=None, optimize=True, plot=True):
    """
    Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.

@ -187,7 +194,7 @@ def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=
    Y[Y.flatten()==-1] = 0

    if model_type == 'Full':
-        m = GPy.models.GPClassification(data['X'], Y,kernel=kernel)
+        m = GPy.models.GPClassification(data['X'], Y, kernel=kernel)

    elif model_type == 'DTC':
        m = GPy.models.SparseGPClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing)
@ -197,8 +204,11 @@ def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=
        m = GPy.models.FITCClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing)
        m['.*len'] = 3.

-    m.pseudo_EM()
-    print(m)
-    m.plot()
+    if optimize:
+        m.pseudo_EM()

+    if plot:
+        m.plot()
+
+    print m
    return m
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@ -1,99 +1,93 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
+import numpy as _np
+default_seed = _np.random.seed(123344)

-import numpy as np
-from matplotlib import pyplot as plt, cm
+def bgplvm_test_model(seed=default_seed, optimize=False, verbose=1, plot=False):
+    """
+    model for testing purposes. Samples from a GP with rbf kernel and learns
+    the samples with a new kernel. Normally not for optimization, just model cheking
+    """
+    from GPy.likelihoods.gaussian import Gaussian
+    import GPy

-import GPy
-from GPy.core.transformations import logexp
-from GPy.likelihoods.gaussian import Gaussian
-from GPy.models import BayesianGPLVM
-
-default_seed = np.random.seed(123344)
-
-def BGPLVM(seed=default_seed):
-    N = 13
+    num_inputs = 13
    num_inducing = 5
-    Q = 6
-    D = 25
+    if plot:
+        output_dim = 1
+        input_dim = 2
+    else:
+        input_dim = 2
+        output_dim = 25
+
    # generate GPLVM-like data
-    X = np.random.rand(N, Q)
-    lengthscales = np.random.rand(Q)
-    k = (GPy.kern.rbf(Q, .5, lengthscales, ARD=True)
-         + GPy.kern.white(Q, 0.01))
+    X = _np.random.rand(num_inputs, input_dim)
+    lengthscales = _np.random.rand(input_dim)
+    k = (GPy.kern.rbf(input_dim, .5, lengthscales, ARD=True)
+         + GPy.kern.white(input_dim, 0.01))
    K = k.K(X)
-    Y = np.random.multivariate_normal(np.zeros(N), K, D).T
+    Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, output_dim).T
    lik = Gaussian(Y, normalize=True)

-    # k = GPy.kern.rbf_inv(Q, .5, np.ones(Q) * 2., ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q)
-    # k = GPy.kern.linear(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
-    # k = GPy.kern.rbf(Q, ARD = False)  + GPy.kern.white(Q, 0.00001)
-    # k = GPy.kern.rbf(Q, .5, np.ones(Q) * 2., ARD=True) + GPy.kern.rbf(Q, .3, np.ones(Q) * .2, ARD=True)
-    k = GPy.kern.rbf(Q, .5, np.ones(Q) * 2., ARD=True) + GPy.kern.linear(Q, np.ones(Q) * .2, ARD=True)
-    # k = GPy.kern.rbf(Q, .5, 2., ARD=0) + GPy.kern.rbf(Q, .3, .2, ARD=0)
+    k = GPy.kern.rbf_inv(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim)
+    # k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
+    # k = GPy.kern.rbf(input_dim, ARD = False)  + GPy.kern.white(input_dim, 0.00001)
+    # k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.rbf(input_dim, .3, _np.ones(input_dim) * .2, ARD=True)
+    # k = GPy.kern.rbf(input_dim, .5, 2., ARD=0) + GPy.kern.rbf(input_dim, .3, .2, ARD=0)
+    # k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.linear(input_dim, _np.ones(input_dim) * .2, ARD=True)

-    m = GPy.models.BayesianGPLVM(lik, Q, kernel=k, num_inducing=num_inducing)
+    m = GPy.models.BayesianGPLVM(lik, input_dim, kernel=k, num_inducing=num_inducing)
    m.lengthscales = lengthscales
-    # m.constrain_positive('(rbf|bias|noise|white|S)')
-    # m.constrain_fixed('S', 1)

-    # pb.figure()
-    # m.plot()
-    # pb.title('PCA initialisation')
-    # pb.figure()
-    # m.optimize(messages = 1)
-    # m.plot()
-    # pb.title('After optimisation')
-    # m.randomize()
-    # m.checkgrad(verbose=1)
+    if plot:
+        import matplotlib.pyplot as pb
+        m.plot()
+        pb.title('PCA initialisation')
+
+    if optimize:
+        m.optimize('scg', messages=verbose)
+        if plot:
+            m.plot()
+            pb.title('After optimisation')

    return m

-def GPLVM_oil_100(optimize=True):
+def gplvm_oil_100(optimize=True, verbose=1, plot=True):
+    import GPy
    data = GPy.util.datasets.oil_100()
    Y = data['X']
-
    # create simple GP model
    kernel = GPy.kern.rbf(6, ARD=True) + GPy.kern.bias(6)
    m = GPy.models.GPLVM(Y, 6, kernel=kernel)
    m.data_labels = data['Y'].argmax(axis=1)
-
-    # optimize
-    if optimize:
-        m.optimize('scg', messages=1)
-
-    # plot
-    print(m)
-    m.plot_latent(labels=m.data_labels)
+    if optimize: m.optimize('scg', messages=verbose)
+    if plot: m.plot_latent(labels=m.data_labels)
    return m

-def sparseGPLVM_oil(optimize=True, N=100, Q=6, num_inducing=15, max_iters=50):
-    np.random.seed(0)
+def sparse_gplvm_oil(optimize=True, verbose=0, plot=True, N=100, Q=6, num_inducing=15, max_iters=50):
+    import GPy
+    _np.random.seed(0)
    data = GPy.util.datasets.oil()
-
    Y = data['X'][:N]
    Y = Y - Y.mean(0)
    Y /= Y.std(0)
-
-    # create simple GP model
+    # Create the model
    kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q)
    m = GPy.models.SparseGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing)
-    m.data_labels = data['Y'].argmax(axis=1)
+    m.data_labels = data['Y'][:N].argmax(axis=1)

-    # optimize
-    if optimize:
-        m.optimize('scg', messages=1, max_iters=max_iters)
-
-    # plot
-    print(m)
-    # m.plot_latent(labels=m.data_labels)
+    if optimize: m.optimize('scg', messages=verbose, max_iters=max_iters)
+    if plot:
+        m.plot_latent(labels=m.data_labels)
+        m.kern.plot_ARD()
    return m

-def swiss_roll(optimize=True, N=1000, num_inducing=15, Q=4, sigma=.2, plot=False):
+def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=15, Q=4, sigma=.2):
+    import GPy
    from GPy.util.datasets import swiss_roll_generated
-    from GPy.core.transformations import logexp_clipped
+    from GPy.models import BayesianGPLVM

-    data = swiss_roll_generated(N=N, sigma=sigma)
+    data = swiss_roll_generated(num_samples=N, sigma=sigma)
    Y = data['Y']
    Y -= Y.mean()
    Y /= Y.std()
@ -106,119 +100,98 @@ def swiss_roll(optimize=True, N=1000, num_inducing=15, Q=4, sigma=.2, plot=False
        iso = Isomap().fit(Y)
        X = iso.embedding_
        if Q > 2:
-            X = np.hstack((X, np.random.randn(N, Q - 2)))
+            X = _np.hstack((X, _np.random.randn(N, Q - 2)))
    except ImportError:
-        X = np.random.randn(N, Q)
+        X = _np.random.randn(N, Q)

    if plot:
-        from mpl_toolkits import mplot3d
-        import pylab
-        fig = pylab.figure("Swiss Roll Data")
+        import matplotlib.pyplot as plt
+        from mpl_toolkits.mplot3d import Axes3D  # @UnusedImport
+        fig = plt.figure("Swiss Roll Data")
        ax = fig.add_subplot(121, projection='3d')
        ax.scatter(*Y.T, c=c)
        ax.set_title("Swiss Roll")

        ax = fig.add_subplot(122)
        ax.scatter(*X.T[:2], c=c)
-        ax.set_title("Initialization")
-
+        ax.set_title("BGPLVM init")

    var = .5
-    S = (var * np.ones_like(X) + np.clip(np.random.randn(N, Q) * var ** 2,
+    S = (var * _np.ones_like(X) + _np.clip(_np.random.randn(N, Q) * var ** 2,
                                         - (1 - var),
                                         (1 - var))) + .001
-    Z = np.random.permutation(X)[:num_inducing]
+    Z = _np.random.permutation(X)[:num_inducing]

-    kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2)) + GPy.kern.white(Q, np.exp(-2))
+    kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2))

    m = BayesianGPLVM(Y, Q, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel)
    m.data_colors = c
    m.data_t = t
-
-    m['rbf_lengthscale'] = 1. # X.var(0).max() / X.var(0)
    m['noise_variance'] = Y.var() / 100.
-    m['bias_variance'] = 0.05

    if optimize:
-        m.optimize('scg', messages=1)
+        m.optimize('scg', messages=verbose, max_iters=2e3)
+
+    if plot:
+        fig = plt.figure('fitted')
+        ax = fig.add_subplot(111)
+        s = m.input_sensitivity().argsort()[::-1][:2]
+        ax.scatter(*m.X.T[s], c=c)
+
    return m

-def BGPLVM_oil(optimize=True, N=200, Q=7, num_inducing=40, max_iters=1000, plot=False, **k):
-    np.random.seed(0)
+def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, max_iters=1000, **k):
+    import GPy
+    from GPy.likelihoods import Gaussian
+    from matplotlib import pyplot as plt
+
+    _np.random.seed(0)
    data = GPy.util.datasets.oil()

-    # create simple GP model
-    kernel = GPy.kern.rbf_inv(Q, 1., [.1] * Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2))
-
+    kernel = GPy.kern.rbf_inv(Q, 1., [.1] * Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2))
    Y = data['X'][:N]
    Yn = Gaussian(Y, normalize=True)
-#     Yn = Y - Y.mean(0)
-#     Yn /= Yn.std(0)
-
    m = GPy.models.BayesianGPLVM(Yn, Q, kernel=kernel, num_inducing=num_inducing, **k)
    m.data_labels = data['Y'][:N].argmax(axis=1)
-
-    # m.constrain('variance|leng', logexp_clipped())
-    # m['.*lengt'] = m.X.var(0).max() / m.X.var(0)
    m['noise'] = Yn.Y.var() / 100.

-
-    # optimize
    if optimize:
-        m.constrain_fixed('noise')
-        m.optimize('scg', messages=1, max_iters=200, gtol=.05)
-        m.constrain_positive('noise')
-        m.constrain_bounded('white', 1e-7, 1)
-        m.optimize('scg', messages=1, max_iters=max_iters, gtol=.05)
+        m.optimize('scg', messages=verbose, max_iters=max_iters, gtol=.05)

    if plot:
        y = m.likelihood.Y[0, :]
        fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
-        plt.sca(latent_axes)
-        m.plot_latent()
+        m.plot_latent(ax=latent_axes)
        data_show = GPy.util.visualize.vector_show(y)
-        lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :], m, data_show, latent_axes=latent_axes) # , sense_axes=sense_axes)
+        lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :], # @UnusedVariable
+            m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
        raw_input('Press enter to finish')
        plt.close(fig)
    return m

-def oil_100():
-    data = GPy.util.datasets.oil_100()
-    m = GPy.models.GPLVM(data['X'], 2)
-
-    # optimize
-    m.optimize(messages=1, max_iters=2)
-
-    # plot
-    print(m)
-    # m.plot_latent(labels=data['Y'].argmax(axis=1))
-    return m
-
-
-
 def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
-    x = np.linspace(0, 4 * np.pi, N)[:, None]
-    s1 = np.vectorize(lambda x: np.sin(x))
-    s2 = np.vectorize(lambda x: np.cos(x))
-    s3 = np.vectorize(lambda x:-np.exp(-np.cos(2 * x)))
-    sS = np.vectorize(lambda x: np.sin(2 * x))
+    x = _np.linspace(0, 4 * _np.pi, N)[:, None]
+    s1 = _np.vectorize(lambda x: _np.sin(x))
+    s2 = _np.vectorize(lambda x: _np.cos(x))
+    s3 = _np.vectorize(lambda x:-_np.exp(-_np.cos(2 * x)))
+    sS = _np.vectorize(lambda x: _np.sin(2 * x))

    s1 = s1(x)
    s2 = s2(x)
    s3 = s3(x)
    sS = sS(x)

-    S1 = np.hstack([s1, sS])
-    S2 = np.hstack([s2, s3, sS])
-    S3 = np.hstack([s3, sS])
+    S1 = _np.hstack([s1, sS])
+    S2 = _np.hstack([s2, s3, sS])
+    S3 = _np.hstack([s3, sS])

-    Y1 = S1.dot(np.random.randn(S1.shape[1], D1))
-    Y2 = S2.dot(np.random.randn(S2.shape[1], D2))
-    Y3 = S3.dot(np.random.randn(S3.shape[1], D3))
+    Y1 = S1.dot(_np.random.randn(S1.shape[1], D1))
+    Y2 = S2.dot(_np.random.randn(S2.shape[1], D2))
+    Y3 = S3.dot(_np.random.randn(S3.shape[1], D3))

-    Y1 += .3 * np.random.randn(*Y1.shape)
-    Y2 += .2 * np.random.randn(*Y2.shape)
-    Y3 += .25 * np.random.randn(*Y3.shape)
+    Y1 += .3 * _np.random.randn(*Y1.shape)
+    Y2 += .2 * _np.random.randn(*Y2.shape)
+    Y3 += .25 * _np.random.randn(*Y3.shape)

    Y1 -= Y1.mean(0)
    Y2 -= Y2.mean(0)
@ -233,6 +206,7 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):

    if plot_sim:
        import pylab
+        import matplotlib.cm as cm
        import itertools
        fig = pylab.figure("MRD Simulation Data", figsize=(8, 6))
        fig.clf()
@ -243,95 +217,81 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
        ax.legend()
        for i, Y in enumerate(Ylist):
            ax = fig.add_subplot(2, len(Ylist), len(Ylist) + 1 + i)
-            ax.imshow(Y, aspect='auto', cmap=cm.gray) # @UndefinedVariable
+            ax.imshow(Y, aspect='auto', cmap=cm.gray)
            ax.set_title("Y{}".format(i + 1))
        pylab.draw()
        pylab.tight_layout()

    return slist, [S1, S2, S3], Ylist

-def bgplvm_simulation_matlab_compare():
-    from GPy.util.datasets import simulation_BGPLVM
-    sim_data = simulation_BGPLVM()
-    Y = sim_data['Y']
-    S = sim_data['S']
-    mu = sim_data['mu']
-    num_inducing, [_, Q] = 3, mu.shape
+# def bgplvm_simulation_matlab_compare():
+#     from GPy.util.datasets import simulation_BGPLVM
+#     from GPy import kern
+#     from GPy.models import BayesianGPLVM
+#
+#     sim_data = simulation_BGPLVM()
+#     Y = sim_data['Y']
+#     mu = sim_data['mu']
+#     num_inducing, [_, Q] = 3, mu.shape
+#
+#     k = kern.linear(Q, ARD=True) + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2))
+#     m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k,
+#                        _debug=False)
+#     m.auto_scale_factor = True
+#     m['noise'] = Y.var() / 100.
+#     m['linear_variance'] = .01
+#     return m

-    from GPy.models import mrd
-    from GPy import kern
-    reload(mrd); reload(kern)
-    k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2))
-    m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k,
-#                        X=mu,
-#                        X_variance=S,
-                       _debug=False)
-    m.auto_scale_factor = True
-    m['noise'] = Y.var() / 100.
-    m['linear_variance'] = .01
-    return m
-
-def bgplvm_simulation(optimize='scg',
-                      plot=True,
+def bgplvm_simulation(optimize=True, verbose=1,
+                      plot=True, plot_sim=False,
                      max_iters=2e4,
-                      plot_sim=False):
-#     from GPy.core.transformations import logexp_clipped
-    D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 3, 10
-    slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
-
-    from GPy.models import mrd
+                      ):
    from GPy import kern
-    reload(mrd); reload(kern)
+    from GPy.models import BayesianGPLVM

+    D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 3, 10
+    _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
    Y = Ylist[0]
-
-    k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2)) # + kern.bias(Q)
+    k = kern.linear(Q, ARD=True) + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
    m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k)
-
-    # m.constrain('variance|noise', logexp_clipped())
    m['noise'] = Y.var() / 100.

    if optimize:
        print "Optimizing model:"
-        m.optimize(optimize, max_iters=max_iters,
-                   messages=True, gtol=.05)
+        m.optimize('scg', messages=verbose, max_iters=max_iters,
+                   gtol=.05)
    if plot:
        m.plot_X_1d("BGPLVM Latent Space 1D")
        m.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
    return m

-def mrd_simulation(optimize=True, plot=True, plot_sim=True, **kw):
-    D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
-    slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
+def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
+    from GPy import kern
+    from GPy.models import MRD
+    from GPy.likelihoods import Gaussian

+    D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
+    _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
    likelihood_list = [Gaussian(x, normalize=True) for x in Ylist]

-    from GPy.models import mrd
-    from GPy import kern
-
-    reload(mrd); reload(kern)
-
-    k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2))
-    m = mrd.MRD(likelihood_list, input_dim=Q, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw)
+    k = kern.linear(Q, ARD=True) + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2))
+    m = MRD(likelihood_list, input_dim=Q, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw)
    m.ensure_default_constraints()

    for i, bgplvm in enumerate(m.bgplvms):
        m['{}_noise'.format(i)] = bgplvm.likelihood.Y.var() / 500.

-
-    # DEBUG
-    # np.seterr("raise")
-
    if optimize:
        print "Optimizing Model:"
-        m.optimize(messages=1, max_iters=8e3, gtol=.1)
+        m.optimize(messages=verbose, max_iters=8e3, gtol=.1)
    if plot:
        m.plot_X_1d("MRD Latent Space 1D")
        m.plot_scales("MRD Scales")
    return m

-def brendan_faces():
-    from GPy import kern
+def brendan_faces(optimize=True, verbose=True, plot=True):
+    import GPy
+
    data = GPy.util.datasets.brendan_faces()
    Q = 2
    Y = data['Y']
@ -343,18 +303,20 @@ def brendan_faces():
    # optimize
    m.constrain('rbf|noise|white', GPy.core.transformations.logexp_clipped())

-    m.optimize('scg', messages=1, max_iters=1000)
+    if optimize: m.optimize('scg', messages=verbose, max_iters=1000)

-    ax = m.plot_latent(which_indices=(0, 1))
-    y = m.likelihood.Y[0, :]
-    data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, order='F', invert=False, scale=False)
-    lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
-    raw_input('Press enter to finish')
+    if plot:
+        ax = m.plot_latent(which_indices=(0, 1))
+        y = m.likelihood.Y[0, :]
+        data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, order='F', invert=False, scale=False)
+        GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+        raw_input('Press enter to finish')

    return m

-def olivetti_faces():
-    from GPy import kern
+def olivetti_faces(optimize=True, verbose=True, plot=True):
+    import GPy
+
    data = GPy.util.datasets.olivetti_faces()
    Q = 2
    Y = data['Y']
@ -362,153 +324,145 @@ def olivetti_faces():
    Yn /= Yn.std()

    m = GPy.models.GPLVM(Yn, Q)
-    m.optimize('scg', messages=1, max_iters=1000)
-
-    ax = m.plot_latent(which_indices=(0, 1))
-    y = m.likelihood.Y[0, :]
-    data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False)
-    lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
-    raw_input('Press enter to finish')
+    if optimize: m.optimize('scg', messages=verbose, max_iters=1000)
+    if plot:
+        ax = m.plot_latent(which_indices=(0, 1))
+        y = m.likelihood.Y[0, :]
+        data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False)
+        GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+        raw_input('Press enter to finish')

    return m

-def stick_play(range=None, frame_rate=15):
-
+def stick_play(range=None, frame_rate=15, optimize=False, verbose=True, plot=True):
+    import GPy
    data = GPy.util.datasets.osu_run1()
    # optimize
    if range == None:
        Y = data['Y'].copy()
    else:
        Y = data['Y'][range[0]:range[1], :].copy()
-    y = Y[0, :]
-    data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
-    GPy.util.visualize.data_play(Y, data_show, frame_rate)
+    if plot:
+        y = Y[0, :]
+        data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
+        GPy.util.visualize.data_play(Y, data_show, frame_rate)
    return Y

-def stick(kernel=None):
+def stick(kernel=None, optimize=True, verbose=True, plot=True):
+    from matplotlib import pyplot as plt
+    import GPy
+
    data = GPy.util.datasets.osu_run1()
    # optimize
    m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel)
-    m.optimize(messages=1, max_f_eval=10000)
-    if GPy.util.visualize.visual_available:
+    if optimize: m.optimize(messages=verbose, max_f_eval=10000)
+    if plot and GPy.util.visualize.visual_available:
        plt.clf
        ax = m.plot_latent()
        y = m.likelihood.Y[0, :]
        data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
-        lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+        GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
        raw_input('Press enter to finish')

    return m

-def bcgplvm_linear_stick(kernel=None):
+def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True):
+    from matplotlib import pyplot as plt
+    import GPy
+
    data = GPy.util.datasets.osu_run1()
    # optimize
    mapping = GPy.mappings.Linear(data['Y'].shape[1], 2)
    m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
-    m.optimize(messages=1, max_f_eval=10000)
-    if GPy.util.visualize.visual_available:
+    if optimize: m.optimize(messages=verbose, max_f_eval=10000)
+    if plot and GPy.util.visualize.visual_available:
        plt.clf
        ax = m.plot_latent()
        y = m.likelihood.Y[0, :]
        data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
-        lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+        GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
        raw_input('Press enter to finish')

    return m

-def bcgplvm_stick(kernel=None):
+def bcgplvm_stick(kernel=None, optimize=True, verbose=True, plot=True):
+    from matplotlib import pyplot as plt
+    import GPy
+
    data = GPy.util.datasets.osu_run1()
    # optimize
    back_kernel=GPy.kern.rbf(data['Y'].shape[1], lengthscale=5.)
    mapping = GPy.mappings.Kernel(X=data['Y'], output_dim=2, kernel=back_kernel)
    m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
-    m.optimize(messages=1, max_f_eval=10000)
-    if GPy.util.visualize.visual_available:
+    if optimize: m.optimize(messages=verbose, max_f_eval=10000)
+    if plot and GPy.util.visualize.visual_available:
        plt.clf
        ax = m.plot_latent()
        y = m.likelihood.Y[0, :]
        data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
-        lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+        GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
        raw_input('Press enter to finish')

    return m

-def robot_wireless():
+def robot_wireless(optimize=True, verbose=True, plot=True):
+    from matplotlib import pyplot as plt
+    import GPy
+
    data = GPy.util.datasets.robot_wireless()
    # optimize
    m = GPy.models.GPLVM(data['Y'], 2)
-    m.optimize(messages=1, max_f_eval=10000)
+    if optimize: m.optimize(messages=verbose, max_f_eval=10000)
    m._set_params(m._get_params())
-    plt.clf
-    ax = m.plot_latent()
+    if plot:
+        m.plot_latent()

    return m

-def stick_bgplvm(model=None):
+def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
+    from GPy.models import BayesianGPLVM
+    from matplotlib import pyplot as plt
+    import GPy
+
    data = GPy.util.datasets.osu_run1()
    Q = 6
-    kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2)) + GPy.kern.white(Q, np.exp(-2))
+    kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2))
    m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel)
    # optimize
    m.ensure_default_constraints()
-    m.optimize('scg', messages=1, max_iters=200, xtol=1e-300, ftol=1e-300)
+    if optimize: m.optimize('scg', messages=verbose, max_iters=200, xtol=1e-300, ftol=1e-300)
    m._set_params(m._get_params())
-    plt.clf, (latent_axes, sense_axes) = plt.subplots(1, 2)
-    plt.sca(latent_axes)
-    m.plot_latent()
-    y = m.likelihood.Y[0, :].copy()
-    data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
-    lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
-    raw_input('Press enter to finish')
+    if plot:
+        plt.clf, (latent_axes, sense_axes) = plt.subplots(1, 2)
+        plt.sca(latent_axes)
+        m.plot_latent()
+        y = m.likelihood.Y[0, :].copy()
+        data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
+        GPy.util.visualize.lvm_dimselect(m.X[0, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
+        raw_input('Press enter to finish')

    return m


-def cmu_mocap(subject='35', motion=['01'], in_place=True):
+def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose=True, plot=True):
+    import GPy

    data = GPy.util.datasets.cmu_mocap(subject, motion)
-    Y = data['Y']
    if in_place:
        # Make figure move in place.
        data['Y'][:, 0:3] = 0.0
+
    m = GPy.models.GPLVM(data['Y'], 2, normalize_Y=True)

-    # optimize
-    m.optimize(messages=1, max_f_eval=10000)
+    if optimize:
+        m.optimize(messages=verbose, max_f_eval=10000)

-    ax = m.plot_latent()
-    y = m.likelihood.Y[0, :]
-    data_show = GPy.util.visualize.skeleton_show(y[None, :], data['skel'])
-    lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
-    raw_input('Press enter to finish')
-    lvm_visualizer.close()
+    if plot:
+        ax = m.plot_latent()
+        y = m.likelihood.Y[0, :]
+        data_show = GPy.util.visualize.skeleton_show(y[None, :], data['skel'])
+        lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
+        raw_input('Press enter to finish')
+        lvm_visualizer.close()

    return m
-
-# def BGPLVM_oil():
-#     data = GPy.util.datasets.oil()
-#     Y, X = data['Y'], data['X']
-#     X -= X.mean(axis=0)
-#     X /= X.std(axis=0)
-#
-#     Q = 10
-#     num_inducing = 30
-#
-#     kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q)
-#     m = GPy.models.BayesianGPLVM(X, Q, kernel=kernel, num_inducing=num_inducing)
-#     # m.scale_factor = 100.0
-#     m.constrain_positive('(white|noise|bias|X_variance|rbf_variance|rbf_length)')
-#     from sklearn import cluster
-#     km = cluster.KMeans(num_inducing, verbose=10)
-#     Z = km.fit(m.X).cluster_centers_
-#     # Z = GPy.util.misc.kmm_init(m.X, num_inducing)
-#     m.set('iip', Z)
-#     m.set('bias', 1e-4)
-#     # optimize
-#
-#     import pdb; pdb.set_trace()
-#     m.optimize('tnc', messages=1)
-#     print m
-#     m.plot_latent(labels=data['Y'].argmax(axis=1))
-#     return m
-
--- a/GPy/examples/laplace_approximations.py
+++ b/GPy/examples/laplace_approximations.py
@ -1,296 +0,0 @@
-import GPy
-import numpy as np
-import matplotlib.pyplot as plt
-from GPy.util import datasets
-np.random.seed(1)
-
-def student_t_approx():
-    """
-    Example of regressing with a student t likelihood
-    """
-    real_std = 0.1
-    #Start a function, any function
-    X = np.linspace(0.0, np.pi*2, 100)[:, None]
-    Y = np.sin(X) + np.random.randn(*X.shape)*real_std
-    Yc = Y.copy()
-
-    X_full = np.linspace(0.0, np.pi*2, 500)[:, None]
-    Y_full = np.sin(X_full)
-
-    Y = Y/Y.max()
-
-    #Slightly noisy data
-    Yc[75:80] += 1
-
-    #Very noisy data
-    #Yc[10] += 100
-    #Yc[25] += 10
-    #Yc[23] += 10
-    #Yc[26] += 1000
-    #Yc[24] += 10
-    #Yc = Yc/Yc.max()
-
-    #Add student t random noise to datapoints
-    deg_free = 5
-    print "Real noise: ", real_std
-    initial_var_guess = 0.5
-
-    #t_rv = t(deg_free, loc=0, scale=real_var)
-    #noise = t_rvrvs(size=Y.shape)
-    #Y += noise
-
-    plt.figure(1)
-    plt.suptitle('Gaussian likelihood')
-    # Kernel object
-    kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
-    kernel2 = kernel1.copy()
-    kernel3 = kernel1.copy()
-    kernel4 = kernel1.copy()
-    kernel5 = kernel1.copy()
-    kernel6 = kernel1.copy()
-
-    print "Clean Gaussian"
-    #A GP should completely break down due to the points as they get a lot of weight
-    # create simple GP model
-    m = GPy.models.GPRegression(X, Y, kernel=kernel1)
-    # optimize
-    m.ensure_default_constraints()
-    m.constrain_fixed('white', 1e-4)
-    m.randomize()
-    m.optimize()
-    # plot
-    ax = plt.subplot(211)
-    m.plot(ax=ax)
-    plt.plot(X_full, Y_full)
-    plt.ylim(-1.5, 1.5)
-    plt.title('Gaussian clean')
-    print m
-
-    #Corrupt
-    print "Corrupt Gaussian"
-    m = GPy.models.GPRegression(X, Yc, kernel=kernel2)
-    m.ensure_default_constraints()
-    m.constrain_fixed('white', 1e-4)
-    m.randomize()
-    m.optimize()
-    ax = plt.subplot(212)
-    m.plot(ax=ax)
-    plt.plot(X_full, Y_full)
-    plt.ylim(-1.5, 1.5)
-    plt.title('Gaussian corrupt')
-    print m
-
-    plt.figure(2)
-    plt.suptitle('Student-t likelihood')
-    edited_real_sd = initial_var_guess
-
-    print "Clean student t, rasm"
-    t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
-    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
-    m = GPy.models.GPRegression(X, Y.copy(), kernel6, likelihood=stu_t_likelihood)
-    m.ensure_default_constraints()
-    m.constrain_positive('t_noise')
-    m.constrain_fixed('white', 1e-4)
-    m.randomize()
-    #m.update_likelihood_approximation()
-    m.optimize()
-    print(m)
-    ax = plt.subplot(211)
-    m.plot(ax=ax)
-    plt.plot(X_full, Y_full)
-    plt.ylim(-1.5, 1.5)
-    plt.title('Student-t rasm clean')
-
-    print "Corrupt student t, rasm"
-    t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
-    corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution)
-    m = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood)
-    m.ensure_default_constraints()
-    m.constrain_positive('t_noise')
-    m.constrain_fixed('white', 1e-4)
-    m.randomize()
-    for a in range(1):
-        m.randomize()
-        m_start = m.copy()
-        print m
-        m.optimize('scg', messages=1)
-    print(m)
-    ax = plt.subplot(212)
-    m.plot(ax=ax)
-    plt.plot(X_full, Y_full)
-    plt.ylim(-1.5, 1.5)
-    plt.title('Student-t rasm corrupt')
-
-    return m
-
-def boston_example():
-    import sklearn
-    from sklearn.cross_validation import KFold
-    optimizer='bfgs'
-    messages=0
-    data = datasets.boston_housing()
-    degrees_freedoms = [3, 5, 8, 10]
-    X = data['X'].copy()
-    Y = data['Y'].copy()
-    X = X-X.mean(axis=0)
-    X = X/X.std(axis=0)
-    Y = Y-Y.mean()
-    Y = Y/Y.std()
-    num_folds = 10
-    kf = KFold(len(Y), n_folds=num_folds, indices=True)
-    num_models = len(degrees_freedoms) + 3 #3 for baseline, gaussian, gaussian laplace approx
-    score_folds = np.zeros((num_models, num_folds))
-    pred_density = score_folds.copy()
-
-    def rmse(Y, Ystar):
-        return np.sqrt(np.mean((Y-Ystar)**2))
-
-    for n, (train, test) in enumerate(kf):
-        X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
-        print "Fold {}".format(n)
-
-        noise = 1e-1 #np.exp(-2)
-        rbf_len = 0.5
-        data_axis_plot = 4
-        plot = False
-        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
-        kernelgp = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
-
-        #Baseline
-        score_folds[0, n] = rmse(Y_test, np.mean(Y_train))
-
-        #Gaussian GP
-        print "Gauss GP"
-        mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp.copy())
-        mgp.ensure_default_constraints()
-        mgp.constrain_fixed('white', 1e-5)
-        mgp['rbf_len'] = rbf_len
-        mgp['noise'] = noise
-        print mgp
-        mgp.optimize(optimizer=optimizer, messages=messages)
-        Y_test_pred = mgp.predict(X_test)
-        score_folds[1, n] = rmse(Y_test, Y_test_pred[0])
-        pred_density[1, n] = np.mean(mgp.log_predictive_density(X_test, Y_test))
-        print mgp
-        print pred_density
-        if plot:
-            plt.figure()
-            plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
-            plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
-            plt.title('GP gauss')
-
-        print "Gaussian Laplace GP"
-        N, D = Y_train.shape
-        g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=noise, N=N, D=D)
-        g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution)
-        mg = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=g_likelihood)
-        mg.ensure_default_constraints()
-        mg.constrain_positive('noise_variance')
-        mg.constrain_fixed('white', 1e-5)
-        mg['rbf_len'] = rbf_len
-        mg['noise'] = noise
-        print mg
-        try:
-            mg.optimize(optimizer=optimizer, messages=messages)
-        except Exception:
-            print "Blew up"
-        Y_test_pred = mg.predict(X_test)
-        score_folds[2, n] = rmse(Y_test, Y_test_pred[0])
-        pred_density[2, n] = np.mean(mg.log_predictive_density(X_test, Y_test))
-        print pred_density
-        print mg
-        if plot:
-            plt.figure()
-            plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
-            plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
-            plt.title('Lap gauss')
-
-        for stu_num, df in enumerate(degrees_freedoms):
-            #Student T
-            print "Student-T GP {}df".format(df)
-            t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=df, sigma2=noise)
-            stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
-            mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
-            mstu_t.ensure_default_constraints()
-            mstu_t.constrain_fixed('white', 1e-5)
-            mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
-            mstu_t['rbf_len'] = rbf_len
-            mstu_t['t_noise'] = noise
-            print mstu_t
-            try:
-                mstu_t.optimize(optimizer=optimizer, messages=messages)
-            except Exception:
-                print "Blew up"
-            Y_test_pred = mstu_t.predict(X_test)
-            score_folds[3+stu_num, n] = rmse(Y_test, Y_test_pred[0])
-            pred_density[3+stu_num, n] = np.mean(mstu_t.log_predictive_density(X_test, Y_test))
-            print pred_density
-            print mstu_t
-            if plot:
-                plt.figure()
-                plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
-                plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
-                plt.title('Stu t {}df'.format(df))
-
-    print "Average scores: {}".format(np.mean(score_folds, 1))
-    print "Average pred density: {}".format(np.mean(pred_density, 1))
-
-    #Plotting
-    stu_t_legends = ['Student T, df={}'.format(df) for df in degrees_freedoms]
-    legends = ['Baseline', 'Gaussian', 'Laplace Approx Gaussian'] + stu_t_legends
-
-    #Plot boxplots for RMSE density
-    fig = plt.figure()
-    ax=fig.add_subplot(111)
-    plt.title('RMSE')
-    bp = ax.boxplot(score_folds.T, notch=0, sym='+', vert=1, whis=1.5)
-    plt.setp(bp['boxes'], color='black')
-    plt.setp(bp['whiskers'], color='black')
-    plt.setp(bp['fliers'], color='red', marker='+')
-    xtickNames = plt.setp(ax, xticklabels=legends)
-    plt.setp(xtickNames, rotation=45, fontsize=8)
-    ax.set_ylabel('RMSE')
-    ax.set_xlabel('Distribution')
-    #Make grid and put it below boxes
-    ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
-              alpha=0.5)
-    ax.set_axisbelow(True)
-
-    #Plot boxplots for predictive density
-    fig = plt.figure()
-    ax=fig.add_subplot(111)
-    plt.title('Predictive density')
-    bp = ax.boxplot(pred_density[1:,:].T, notch=0, sym='+', vert=1, whis=1.5)
-    plt.setp(bp['boxes'], color='black')
-    plt.setp(bp['whiskers'], color='black')
-    plt.setp(bp['fliers'], color='red', marker='+')
-    xtickNames = plt.setp(ax, xticklabels=legends[1:])
-    plt.setp(xtickNames, rotation=45, fontsize=8)
-    ax.set_ylabel('Mean Log probability P(Y*|Y)')
-    ax.set_xlabel('Distribution')
-    #Make grid and put it below boxes
-    ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
-              alpha=0.5)
-    ax.set_axisbelow(True)
-    return mstu_t
-
-def precipitation_example():
-    import sklearn
-    from sklearn.cross_validation import KFold
-    data = datasets.boston_housing()
-    X = data['X'].copy()
-    Y = data['Y'].copy()
-    X = X-X.mean(axis=0)
-    X = X/X.std(axis=0)
-    Y = Y-Y.mean()
-    Y = Y/Y.std()
-    import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
-    num_folds = 10
-    kf = KFold(len(Y), n_folds=num_folds, indices=True)
-    score_folds = np.zeros((4, num_folds))
-    def rmse(Y, Ystar):
-        return np.sqrt(np.mean((Y-Ystar)**2))
-    #for train, test in kf:
-    for n, (train, test) in enumerate(kf):
-        X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
-        print "Fold {}".format(n)
--- a/GPy/examples/non_gaussian.py
+++ b/GPy/examples/non_gaussian.py
@ -0,0 +1,286 @@
+import GPy
+import numpy as np
+import matplotlib.pyplot as plt
+from GPy.util import datasets
+
+def student_t_approx(optimize=True, plot=True):
+    """
+    Example of regressing with a student t likelihood using Laplace
+    """
+    real_std = 0.1
+    #Start a function, any function
+    X = np.linspace(0.0, np.pi*2, 100)[:, None]
+    Y = np.sin(X) + np.random.randn(*X.shape)*real_std
+    Y = Y/Y.max()
+    Yc = Y.copy()
+
+    X_full = np.linspace(0.0, np.pi*2, 500)[:, None]
+    Y_full = np.sin(X_full)
+    Y_full = Y_full/Y_full.max()
+
+    #Slightly noisy data
+    Yc[75:80] += 1
+
+    #Very noisy data
+    #Yc[10] += 100
+    #Yc[25] += 10
+    #Yc[23] += 10
+    #Yc[26] += 1000
+    #Yc[24] += 10
+    #Yc = Yc/Yc.max()
+
+    #Add student t random noise to datapoints
+    deg_free = 5
+    print "Real noise: ", real_std
+    initial_var_guess = 0.5
+    edited_real_sd = initial_var_guess
+
+    # Kernel object
+    kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+    kernel2 = kernel1.copy()
+    kernel3 = kernel1.copy()
+    kernel4 = kernel1.copy()
+
+    #Gaussian GP model on clean data
+    m1 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel1)
+    # optimize
+    m1.ensure_default_constraints()
+    m1.constrain_fixed('white', 1e-5)
+    m1.randomize()
+
+    #Gaussian GP model on corrupt data
+    m2 = GPy.models.GPRegression(X, Yc.copy(), kernel=kernel2)
+    m2.ensure_default_constraints()
+    m2.constrain_fixed('white', 1e-5)
+    m2.randomize()
+
+    #Student t GP model on clean data
+    t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
+    stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
+    m3 = GPy.models.GPRegression(X, Y.copy(), kernel3, likelihood=stu_t_likelihood)
+    m3.ensure_default_constraints()
+    m3.constrain_bounded('t_noise', 1e-6, 10.)
+    m3.constrain_fixed('white', 1e-5)
+    m3.randomize()
+
+    #Student t GP model on corrupt data
+    t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
+    corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution)
+    m4 = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood)
+    m4.ensure_default_constraints()
+    m4.constrain_bounded('t_noise', 1e-6, 10.)
+    m4.constrain_fixed('white', 1e-5)
+    m4.randomize()
+
+    if optimize:
+        optimizer='scg'
+        print "Clean Gaussian"
+        m1.optimize(optimizer, messages=1)
+        print "Corrupt Gaussian"
+        m2.optimize(optimizer, messages=1)
+        print "Clean student t"
+        m3.optimize(optimizer, messages=1)
+        print "Corrupt student t"
+        m4.optimize(optimizer, messages=1)
+
+    if plot:
+        plt.figure(1)
+        plt.suptitle('Gaussian likelihood')
+        ax = plt.subplot(211)
+        m1.plot(ax=ax)
+        plt.plot(X_full, Y_full)
+        plt.ylim(-1.5, 1.5)
+        plt.title('Gaussian clean')
+
+        ax = plt.subplot(212)
+        m2.plot(ax=ax)
+        plt.plot(X_full, Y_full)
+        plt.ylim(-1.5, 1.5)
+        plt.title('Gaussian corrupt')
+
+        plt.figure(2)
+        plt.suptitle('Student-t likelihood')
+        ax = plt.subplot(211)
+        m3.plot(ax=ax)
+        plt.plot(X_full, Y_full)
+        plt.ylim(-1.5, 1.5)
+        plt.title('Student-t rasm clean')
+
+        ax = plt.subplot(212)
+        m4.plot(ax=ax)
+        plt.plot(X_full, Y_full)
+        plt.ylim(-1.5, 1.5)
+        plt.title('Student-t rasm corrupt')
+
+    return m1, m2, m3, m4
+
+def boston_example(optimize=True, plot=True):
+    import sklearn
+    from sklearn.cross_validation import KFold
+    optimizer='bfgs'
+    messages=0
+    data = datasets.boston_housing()
+    degrees_freedoms = [3, 5, 8, 10]
+    X = data['X'].copy()
+    Y = data['Y'].copy()
+    X = X-X.mean(axis=0)
+    X = X/X.std(axis=0)
+    Y = Y-Y.mean()
+    Y = Y/Y.std()
+    num_folds = 10
+    kf = KFold(len(Y), n_folds=num_folds, indices=True)
+    num_models = len(degrees_freedoms) + 3 #3 for baseline, gaussian, gaussian laplace approx
+    score_folds = np.zeros((num_models, num_folds))
+    pred_density = score_folds.copy()
+
+    def rmse(Y, Ystar):
+        return np.sqrt(np.mean((Y-Ystar)**2))
+
+    for n, (train, test) in enumerate(kf):
+        X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
+        print "Fold {}".format(n)
+
+        noise = 1e-1 #np.exp(-2)
+        rbf_len = 0.5
+        data_axis_plot = 4
+        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
+        kernelgp = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
+
+        #Baseline
+        score_folds[0, n] = rmse(Y_test, np.mean(Y_train))
+
+        #Gaussian GP
+        print "Gauss GP"
+        mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp.copy())
+        mgp.ensure_default_constraints()
+        mgp.constrain_fixed('white', 1e-5)
+        mgp['rbf_len'] = rbf_len
+        mgp['noise'] = noise
+        print mgp
+        if optimize:
+            mgp.optimize(optimizer=optimizer, messages=messages)
+        Y_test_pred = mgp.predict(X_test)
+        score_folds[1, n] = rmse(Y_test, Y_test_pred[0])
+        pred_density[1, n] = np.mean(mgp.log_predictive_density(X_test, Y_test))
+        print mgp
+        print pred_density
+
+        print "Gaussian Laplace GP"
+        N, D = Y_train.shape
+        g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=noise, N=N, D=D)
+        g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution)
+        mg = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=g_likelihood)
+        mg.ensure_default_constraints()
+        mg.constrain_positive('noise_variance')
+        mg.constrain_fixed('white', 1e-5)
+        mg['rbf_len'] = rbf_len
+        mg['noise'] = noise
+        print mg
+        if optimize:
+            mg.optimize(optimizer=optimizer, messages=messages)
+        Y_test_pred = mg.predict(X_test)
+        score_folds[2, n] = rmse(Y_test, Y_test_pred[0])
+        pred_density[2, n] = np.mean(mg.log_predictive_density(X_test, Y_test))
+        print pred_density
+        print mg
+
+        for stu_num, df in enumerate(degrees_freedoms):
+            #Student T
+            print "Student-T GP {}df".format(df)
+            t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=df, sigma2=noise)
+            stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
+            mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
+            mstu_t.ensure_default_constraints()
+            mstu_t.constrain_fixed('white', 1e-5)
+            mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
+            mstu_t['rbf_len'] = rbf_len
+            mstu_t['t_noise'] = noise
+            print mstu_t
+            if optimize:
+                mstu_t.optimize(optimizer=optimizer, messages=messages)
+            Y_test_pred = mstu_t.predict(X_test)
+            score_folds[3+stu_num, n] = rmse(Y_test, Y_test_pred[0])
+            pred_density[3+stu_num, n] = np.mean(mstu_t.log_predictive_density(X_test, Y_test))
+            print pred_density
+            print mstu_t
+
+    if plot:
+        plt.figure()
+        plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+        plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+        plt.title('GP gauss')
+
+        plt.figure()
+        plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+        plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+        plt.title('Lap gauss')
+
+        plt.figure()
+        plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
+        plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x')
+        plt.title('Stu t {}df'.format(df))
+
+    print "Average scores: {}".format(np.mean(score_folds, 1))
+    print "Average pred density: {}".format(np.mean(pred_density, 1))
+
+    if plot:
+        #Plotting
+        stu_t_legends = ['Student T, df={}'.format(df) for df in degrees_freedoms]
+        legends = ['Baseline', 'Gaussian', 'Laplace Approx Gaussian'] + stu_t_legends
+
+        #Plot boxplots for RMSE density
+        fig = plt.figure()
+        ax=fig.add_subplot(111)
+        plt.title('RMSE')
+        bp = ax.boxplot(score_folds.T, notch=0, sym='+', vert=1, whis=1.5)
+        plt.setp(bp['boxes'], color='black')
+        plt.setp(bp['whiskers'], color='black')
+        plt.setp(bp['fliers'], color='red', marker='+')
+        xtickNames = plt.setp(ax, xticklabels=legends)
+        plt.setp(xtickNames, rotation=45, fontsize=8)
+        ax.set_ylabel('RMSE')
+        ax.set_xlabel('Distribution')
+        #Make grid and put it below boxes
+        ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
+                alpha=0.5)
+        ax.set_axisbelow(True)
+
+        #Plot boxplots for predictive density
+        fig = plt.figure()
+        ax=fig.add_subplot(111)
+        plt.title('Predictive density')
+        bp = ax.boxplot(pred_density[1:,:].T, notch=0, sym='+', vert=1, whis=1.5)
+        plt.setp(bp['boxes'], color='black')
+        plt.setp(bp['whiskers'], color='black')
+        plt.setp(bp['fliers'], color='red', marker='+')
+        xtickNames = plt.setp(ax, xticklabels=legends[1:])
+        plt.setp(xtickNames, rotation=45, fontsize=8)
+        ax.set_ylabel('Mean Log probability P(Y*|Y)')
+        ax.set_xlabel('Distribution')
+        #Make grid and put it below boxes
+        ax.yaxis.grid(True, linestyle='-', which='major', color='lightgrey',
+                alpha=0.5)
+        ax.set_axisbelow(True)
+    return mstu_t
+
+#def precipitation_example():
+    #import sklearn
+    #from sklearn.cross_validation import KFold
+    #data = datasets.boston_housing()
+    #X = data['X'].copy()
+    #Y = data['Y'].copy()
+    #X = X-X.mean(axis=0)
+    #X = X/X.std(axis=0)
+    #Y = Y-Y.mean()
+    #Y = Y/Y.std()
+    #import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
+    #num_folds = 10
+    #kf = KFold(len(Y), n_folds=num_folds, indices=True)
+    #score_folds = np.zeros((4, num_folds))
+    #def rmse(Y, Ystar):
+        #return np.sqrt(np.mean((Y-Ystar)**2))
+    ##for train, test in kf:
+    #for n, (train, test) in enumerate(kf):
+        #X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test]
+        #print "Fold {}".format(n)
+
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@ -1,7 +1,6 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-
 """
 Gaussian Processes regression examples
 """
@ -9,88 +8,105 @@ import pylab as pb
 import numpy as np
 import GPy

-def coregionalization_toy2(max_iters=100):
+def olympic_marathon_men(optimize=True, plot=True):
+    """Run a standard Gaussian process regression on the Olympic marathon data."""
+    data = GPy.util.datasets.olympic_marathon_men()
+
+    # create simple GP Model
+    m = GPy.models.GPRegression(data['X'], data['Y'])
+
+    # set the lengthscale to be something sensible (defaults to 1)
+    m['rbf_lengthscale'] = 10
+
+    if optimize:
+        m.optimize('bfgs', max_iters=200)
+    if plot:
+        m.plot(plot_limits=(1850, 2050))
+
+    return m
+
+def coregionalization_toy2(optimize=True, plot=True):
    """
    A simple demonstration of coregionalization on two sinusoidal functions.
    """
+    #build a design matrix with a column of integers indicating the output
    X1 = np.random.rand(50, 1) * 8
    X2 = np.random.rand(30, 1) * 5
    index = np.vstack((np.zeros_like(X1), np.ones_like(X2)))
    X = np.hstack((np.vstack((X1, X2)), index))
+
+    #build a suitable set of observed variables
    Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
    Y2 = np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + 2.
    Y = np.vstack((Y1, Y2))

+    #build the kernel
    k1 = GPy.kern.rbf(1) + GPy.kern.bias(1)
    k2 = GPy.kern.coregionalize(2,1)
-    k = k1**k2 #k = k1.prod(k2,tensor=True)
+    k = k1**k2
    m = GPy.models.GPRegression(X, Y, kernel=k)
    m.constrain_fixed('.*rbf_var', 1.)
-    # m.constrain_positive('.*kappa')
-    m.optimize('sim', messages=1, max_iters=max_iters)

-    pb.figure()
-    Xtest1 = np.hstack((np.linspace(0, 9, 100)[:, None], np.zeros((100, 1))))
-    Xtest2 = np.hstack((np.linspace(0, 9, 100)[:, None], np.ones((100, 1))))
-    mean, var, low, up = m.predict(Xtest1)
-    GPy.util.plot.gpplot(Xtest1[:, 0], mean, low, up)
-    mean, var, low, up = m.predict(Xtest2)
-    GPy.util.plot.gpplot(Xtest2[:, 0], mean, low, up)
-    pb.plot(X1[:, 0], Y1[:, 0], 'rx', mew=2)
-    pb.plot(X2[:, 0], Y2[:, 0], 'gx', mew=2)
+    if optimize:
+        m.optimize('bfgs', max_iters=100)
+
+    if plot:
+        m.plot(fixed_inputs=[(1,0)])
+        m.plot(fixed_inputs=[(1,1)], ax=pb.gca())
+
    return m

-def coregionalization_toy(max_iters=100):
-    """
-    A simple demonstration of coregionalization on two sinusoidal functions.
-    """
-    X1 = np.random.rand(50, 1) * 8
-    X2 = np.random.rand(30, 1) * 5
-    X = np.vstack((X1, X2))
-    Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
-    Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
-    Y = np.vstack((Y1, Y2))
+#FIXME: Needs recovering once likelihoods are consolidated
+#def coregionalization_toy(optimize=True, plot=True):
+#    """
+#    A simple demonstration of coregionalization on two sinusoidal functions.
+#    """
+#    X1 = np.random.rand(50, 1) * 8
+#    X2 = np.random.rand(30, 1) * 5
+#    X = np.vstack((X1, X2))
+#    Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
+#    Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
+#    Y = np.vstack((Y1, Y2))
+#
+#    k1 = GPy.kern.rbf(1)
+#    m = GPy.models.GPMultioutputRegression(X_list=[X1,X2],Y_list=[Y1,Y2],kernel_list=[k1])
+#    m.constrain_fixed('.*rbf_var', 1.)
+#    m.optimize(max_iters=100)
+#
+#    fig, axes = pb.subplots(2,1)
+#    m.plot(fixed_inputs=[(1,0)],ax=axes[0])
+#    m.plot(fixed_inputs=[(1,1)],ax=axes[1])
+#    axes[0].set_title('Output 0')
+#    axes[1].set_title('Output 1')
+#    return m

-    k1 = GPy.kern.rbf(1)
-    m = GPy.models.GPMultioutputRegression(X_list=[X1,X2],Y_list=[Y1,Y2],kernel_list=[k1])
-    m.constrain_fixed('.*rbf_var', 1.)
-    m.optimize(max_iters=max_iters)
-
-    fig, axes = pb.subplots(2,1)
-    m.plot(fixed_inputs=[(1,0)],ax=axes[0])
-    m.plot(fixed_inputs=[(1,1)],ax=axes[1])
-    axes[0].set_title('Output 0')
-    axes[1].set_title('Output 1')
-    return m
-
-def coregionalization_sparse(max_iters=100):
+def coregionalization_sparse(optimize=True, plot=True):
    """
    A simple demonstration of coregionalization on two sinusoidal functions using sparse approximations.
    """
-    X1 = np.random.rand(500, 1) * 8
-    X2 = np.random.rand(300, 1) * 5
-    index = np.vstack((np.zeros_like(X1), np.ones_like(X2)))
-    X = np.hstack((np.vstack((X1, X2)), index))
-    Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
-    Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
-    Y = np.vstack((Y1, Y2))
+    #fetch the data from the non sparse examples
+    m = coregionalization_toy2(optimize=False, plot=False)
+    X, Y = m.X, m.likelihood.Y

-    k1 = GPy.kern.rbf(1)
+    #construct a model
+    m = GPy.models.SparseGPRegression(X,Y)
+    m.constrain_fixed('iip_\d+_1') # don't optimize the inducing input indexes

-    m = GPy.models.SparseGPMultioutputRegression(X_list=[X1,X2],Y_list=[Y1,Y2],kernel_list=[k1],num_inducing=5)
-    m.constrain_fixed('.*rbf_var',1.)
-    #m.optimize(messages=1)
-    m.optimize_restarts(5, robust=True, messages=1, max_iters=max_iters, optimizer='bfgs')
+    if optimize:
+        m.optimize('bfgs', max_iters=100, messages=1)
+
+    if plot:
+        m.plot(fixed_inputs=[(1,0)])
+        m.plot(fixed_inputs=[(1,1)], ax=pb.gca())

-    fig, axes = pb.subplots(2,1)
-    m.plot_single_output(output=0,ax=axes[0],plot_limits=(-1,9))
-    m.plot_single_output(output=1,ax=axes[1],plot_limits=(-1,9))
-    axes[0].set_title('Output 0')
-    axes[1].set_title('Output 1')
    return m

-def epomeo_gpx(max_iters=100):
-    """Perform Gaussian process regression on the latitude and longitude data from the Mount Epomeo runs. Requires gpxpy to be installed on your system to load in the data."""
+def epomeo_gpx(max_iters=200, optimize=True, plot=True):
+    """
+    Perform Gaussian process regression on the latitude and longitude data
+    from the Mount Epomeo runs. Requires gpxpy to be installed on your system
+    to load in the data.
+    """
    data = GPy.util.datasets.epomeo_gpx()
    num_data_list = []
    for Xpart in data['X']:
@ -119,14 +135,16 @@ def epomeo_gpx(max_iters=100):
    m.constrain_fixed('.*rbf_var', 1.)
    m.constrain_fixed('iip')
    m.constrain_bounded('noise_variance', 1e-3, 1e-1)
-#     m.optimize_restarts(5, robust=True, messages=1, max_iters=max_iters, optimizer='bfgs')
    m.optimize(max_iters=max_iters,messages=True)

    return m

-
-def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=10000, max_iters=300):
-    """Show an example of a multimodal error surface for Gaussian process regression. Gene 939 has bimodal behaviour where the noisy mode is higher."""
+def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=10000, max_iters=300, optimize=True, plot=True):
+    """
+    Show an example of a multimodal error surface for Gaussian process
+    regression. Gene 939 has bimodal behaviour where the noisy mode is
+    higher.
+    """

    # Contour over a range of length scales and signal/noise ratios.
    length_scales = np.linspace(0.1, 60., resolution)
@ -139,13 +157,14 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
    data['Y'] = data['Y'] - np.mean(data['Y'])

    lls = GPy.examples.regression._contour_data(data, length_scales, log_SNRs, GPy.kern.rbf)
-    pb.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=pb.cm.jet)
-    ax = pb.gca()
-    pb.xlabel('length scale')
-    pb.ylabel('log_10 SNR')
+    if plot:
+        pb.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=pb.cm.jet)
+        ax = pb.gca()
+        pb.xlabel('length scale')
+        pb.ylabel('log_10 SNR')

-    xlim = ax.get_xlim()
-    ylim = ax.get_ylim()
+        xlim = ax.get_xlim()
+        ylim = ax.get_ylim()

    # Now run a few optimizations
    models = []
@ -162,25 +181,31 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
        optim_point_y[0] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);

        # optimize
-        m.optimize('scg', xtol=1e-6, ftol=1e-6, max_iters=max_iters)
+        if optimize:
+            m.optimize('scg', xtol=1e-6, ftol=1e-6, max_iters=max_iters)

        optim_point_x[1] = m['rbf_lengthscale']
        optim_point_y[1] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);

-        pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1] - optim_point_x[0], optim_point_y[1] - optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k')
+        if plot:
+            pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1] - optim_point_x[0], optim_point_y[1] - optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k')
        models.append(m)

-    ax.set_xlim(xlim)
-    ax.set_ylim(ylim)
+    if plot:
+        ax.set_xlim(xlim)
+        ax.set_ylim(ylim)
    return m # (models, lls)

 def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf):
-    """Evaluate the GP objective function for a given data set for a range of signal to noise ratios and a range of lengthscales.
+    """
+    Evaluate the GP objective function for a given data set for a range of
+    signal to noise ratios and a range of lengthscales.

    :data_set: A data set from the utils.datasets director.
    :length_scales: a list of length scales to explore for the contour plot.
    :log_SNRs: a list of base 10 logarithm signal to noise ratios to explore for the contour plot.
-    :kernel: a kernel to use for the 'signal' portion of the data."""
+    :kernel: a kernel to use for the 'signal' portion of the data.
+    """

    lls = []
    total_var = np.var(data['Y'])
@ -203,79 +228,58 @@ def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf):
    return np.array(lls)


-def olympic_100m_men(max_iters=100, kernel=None):
+def olympic_100m_men(optimize=True, plot=True):
    """Run a standard Gaussian process regression on the Rogers and Girolami olympics data."""
    data = GPy.util.datasets.olympic_100m_men()

    # create simple GP Model
-    m = GPy.models.GPRegression(data['X'], data['Y'], kernel)
+    m = GPy.models.GPRegression(data['X'], data['Y'])

    # set the lengthscale to be something sensible (defaults to 1)
-    if kernel==None:
-        m['rbf_lengthscale'] = 10
+    m['rbf_lengthscale'] = 10

-    # optimize
-    m.optimize(max_iters=max_iters)
+    if optimize:
+        m.optimize('bfgs', max_iters=200)

-    # plot
-    m.plot(plot_limits=(1850, 2050))
-    print(m)
+    if plot:
+        m.plot(plot_limits=(1850, 2050))
    return m

-def olympic_marathon_men(max_iters=100, kernel=None):
-    """Run a standard Gaussian process regression on the Olympic marathon data."""
-    data = GPy.util.datasets.olympic_marathon_men()
-
-    # create simple GP Model
-    m = GPy.models.GPRegression(data['X'], data['Y'], kernel)
-
-    # set the lengthscale to be something sensible (defaults to 1)
-    if kernel==None:
-        m['rbf_lengthscale'] = 10
-
-    # optimize
-    m.optimize(max_iters=max_iters)
-
-    # plot
-    m.plot(plot_limits=(1850, 2050))
-    print(m)
-    return m
-
-def toy_rbf_1d(optimizer='tnc', max_nb_eval_optim=100):
+def toy_rbf_1d(optimize=True, plot=True):
    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
    data = GPy.util.datasets.toy_rbf_1d()

    # create simple GP Model
    m = GPy.models.GPRegression(data['X'], data['Y'])

-    # optimize
-    m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
-    # plot
-    m.plot()
-    print(m)
+    if optimize:
+        m.optimize('bfgs')
+    if plot:
+        m.plot()
+
    return m

-def toy_rbf_1d_50(max_iters=100):
+def toy_rbf_1d_50(optimize=True, plot=True):
    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
    data = GPy.util.datasets.toy_rbf_1d_50()

    # create simple GP Model
    m = GPy.models.GPRegression(data['X'], data['Y'])

-    # optimize
-    m.optimize(max_iters=max_iters)
+    if optimize:
+        m.optimize('bfgs')
+    if plot:
+        m.plot()

-    # plot
-    m.plot()
-    print(m)
    return m

-def toy_poisson_rbf_1d(optimizer='bfgs', max_nb_eval_optim=100):
+
+def toy_poisson_rbf_1d(optimize=True, plot=True):
    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
    x_len = 400
    X = np.linspace(0, 10, x_len)[:, None]
    f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.rbf(1).K(X))
-    Y = np.array([np.random.poisson(np.exp(f)) for f in f_true])[:,None]
+    Y = np.array([np.random.poisson(np.exp(f)) for f in f_true]).reshape(x_len,1)

    noise_model = GPy.likelihoods.poisson()
    likelihood = GPy.likelihoods.EP(Y,noise_model)
@ -283,15 +287,16 @@ def toy_poisson_rbf_1d(optimizer='bfgs', max_nb_eval_optim=100):
    # create simple GP Model
    m = GPy.models.GPRegression(X, Y, likelihood=likelihood)

-    # optimize
-    m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
-    # plot
-    m.plot()
-    print(m)
+    if optimize:
+        m.optimize('bfgs')
+    if plot:
+        m.plot()
+
    return m

-def toy_poisson_rbf_1d_laplace(optimizer='bfgs', max_nb_eval_optim=100):
+def toy_poisson_rbf_1d_laplace(optimize=True, plot=True):
    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
+    optimizer='scg'
    x_len = 30
    X = np.linspace(0, 10, x_len)[:, None]
    f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.rbf(1).K(X))
@ -303,18 +308,16 @@ def toy_poisson_rbf_1d_laplace(optimizer='bfgs', max_nb_eval_optim=100):
    # create simple GP Model
    m = GPy.models.GPRegression(X, Y, likelihood=likelihood)

-    # optimize
-    m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
-    # plot
-    m.plot()
-    # plot the real underlying rate function
-    pb.plot(X, np.exp(f_true), '--k', linewidth=2)
-    print(m)
+    if optimize:
+        m.optimize(optimizer)
+    if plot:
+        m.plot()
+        # plot the real underlying rate function
+        pb.plot(X, np.exp(f_true), '--k', linewidth=2)
+
    return m

-
-
-def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4):
+def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize=True, plot=True):
    # Create an artificial dataset where the values in the targets (Y)
    # only depend in dimensions 1 and 3 of the inputs (X). Run ARD to
    # see if this dependency can be recovered
@ -344,13 +347,16 @@ def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4):
    # len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25
    # m.set_prior('.*lengthscale',len_prior)

-    m.optimize(optimizer='scg', max_iters=max_iters, messages=1)
+    if optimize:
+        m.optimize(optimizer='scg', max_iters=max_iters, messages=1)

-    m.kern.plot_ARD()
-    print(m)
+    if plot:
+        m.kern.plot_ARD()
+
+    print m
    return m

-def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4):
+def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize=True, plot=True):
    # Create an artificial dataset where the values in the targets (Y)
    # only depend in dimensions 1 and 3 of the inputs (X). Run ARD to
    # see if this dependency can be recovered
@ -381,13 +387,16 @@ def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4):
    # len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25
    # m.set_prior('.*lengthscale',len_prior)

-    m.optimize(optimizer='scg', max_iters=max_iters, messages=1)
+    if optimize:
+        m.optimize(optimizer='scg', max_iters=max_iters, messages=1)

-    m.kern.plot_ARD()
-    print(m)
+    if plot:
+        m.kern.plot_ARD()
+
+    print m
    return m

-def robot_wireless(max_iters=100, kernel=None):
+def robot_wireless(max_iters=100, kernel=None, optimize=True, plot=True):
    """Predict the location of a robot given wirelss signal strength readings."""
    data = GPy.util.datasets.robot_wireless()

@ -395,20 +404,24 @@ def robot_wireless(max_iters=100, kernel=None):
    m = GPy.models.GPRegression(data['Y'], data['X'], kernel=kernel)

    # optimize
-    m.optimize(messages=True, max_iters=max_iters)
+    if optimize:
+        m.optimize(messages=True, max_iters=max_iters)
+
    Xpredict = m.predict(data['Ytest'])[0]
-    pb.plot(data['Xtest'][:, 0], data['Xtest'][:, 1], 'r-')
-    pb.plot(Xpredict[:, 0], Xpredict[:, 1], 'b-')
-    pb.axis('equal')
-    pb.title('WiFi Localization with Gaussian Processes')
-    pb.legend(('True Location', 'Predicted Location'))
+    if plot:
+        pb.plot(data['Xtest'][:, 0], data['Xtest'][:, 1], 'r-')
+        pb.plot(Xpredict[:, 0], Xpredict[:, 1], 'b-')
+        pb.axis('equal')
+        pb.title('WiFi Localization with Gaussian Processes')
+        pb.legend(('True Location', 'Predicted Location'))

    sse = ((data['Xtest'] - Xpredict)**2).sum()
-    print(m)
+
+    print m
    print('Sum of squares error on test data: ' + str(sse))
    return m

-def silhouette(max_iters=100):
+def silhouette(max_iters=100, optimize=True, plot=True):
    """Predict the pose of a figure given a silhouette. This is a task from Agarwal and Triggs 2004 ICML paper."""
    data = GPy.util.datasets.silhouette()

@ -416,12 +429,13 @@ def silhouette(max_iters=100):
    m = GPy.models.GPRegression(data['X'], data['Y'])

    # optimize
-    m.optimize(messages=True, max_iters=max_iters)
+    if optimize:
+        m.optimize(messages=True, max_iters=max_iters)

-    print(m)
+    print m
    return m

-def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100):
+def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, optimize=True, plot=True):
    """Run a 1D example of a sparse GP regression."""
    # sample inputs and outputs
    X = np.random.uniform(-3., 3., (num_samples, 1))
@ -430,14 +444,17 @@ def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100):
    rbf = GPy.kern.rbf(1)
    # create simple GP Model
    m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing)
-
-
    m.checkgrad(verbose=1)
-    m.optimize('tnc', messages=1, max_iters=max_iters)
-    m.plot()
+
+    if optimize:
+        m.optimize('tnc', messages=1, max_iters=max_iters)
+
+    if plot:
+        m.plot()
+
    return m

-def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100):
+def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100, optimize=True, plot=True):
    """Run a 2D example of a sparse GP regression."""
    X = np.random.uniform(-3., 3., (num_samples, 2))
    Y = np.sin(X[:, 0:1]) * np.sin(X[:, 1:2]) + np.random.randn(num_samples, 1) * 0.05
@ -453,13 +470,18 @@ def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100):

    m.checkgrad()

-    # optimize and plot
-    m.optimize('tnc', messages=1, max_iters=max_iters)
-    m.plot()
-    print(m)
+    # optimize
+    if optimize:
+        m.optimize('tnc', messages=1, max_iters=max_iters)
+
+    # plot
+    if plot:
+        m.plot()
+
+    print m
    return m

-def uncertain_inputs_sparse_regression(max_iters=100):
+def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
    """Run a 1D example of a sparse GP regression with uncertain inputs."""
    fig, axes = pb.subplots(1, 2, figsize=(12, 5))

@ -474,18 +496,23 @@ def uncertain_inputs_sparse_regression(max_iters=100):

    # create simple GP Model - no input uncertainty on this one
    m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z)
-    m.optimize('scg', messages=1, max_iters=max_iters)
-    m.plot(ax=axes[0])
-    axes[0].set_title('no input uncertainty')

+    if optimize:
+        m.optimize('scg', messages=1, max_iters=max_iters)
+
+    if plot:
+        m.plot(ax=axes[0])
+        axes[0].set_title('no input uncertainty')
+    print m

    # the same Model with uncertainty
    m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z, X_variance=S)
-    m.optimize('scg', messages=1, max_iters=max_iters)
-    m.plot(ax=axes[1])
-    axes[1].set_title('with input uncertainty')
-    print(m)
-
-    fig.canvas.draw()
+    if optimize:
+        m.optimize('scg', messages=1, max_iters=max_iters)
+    if plot:
+        m.plot(ax=axes[1])
+        axes[1].set_title('with input uncertainty')
+        fig.canvas.draw()

+    print m
    return m
--- a/GPy/examples/stochastic.py
+++ b/GPy/examples/stochastic.py
@ -5,7 +5,7 @@ import pylab as pb
 import numpy as np
 import GPy

-def toy_1d():
+def toy_1d(optimize=True, plot=True):
    N = 2000
    M = 20

@ -20,22 +20,18 @@ def toy_1d():

    m.param_steplength = 1e-4

-    fig = pb.figure()
-    ax = fig.add_subplot(111)
-    def cb():
-        ax.cla()
-        m.plot(ax=ax,Z_height=-3)
-        ax.set_ylim(-3,3)
-        fig.canvas.draw()
+    if plot:
+        fig = pb.figure()
+        ax = fig.add_subplot(111)
+        def cb(foo):
+            ax.cla()
+            m.plot(ax=ax,Z_height=-3)
+            ax.set_ylim(-3,3)
+            fig.canvas.draw()

-    m.optimize(500, callback=cb, callback_interval=1)
+    if optimize:
+        m.optimize(500, callback=cb, callback_interval=1)

-    m.plot_traces()
+    if plot:
+        m.plot_traces()
    return m
-
-
-
-
-
-
-
--- a/GPy/examples/tutorials.py
+++ b/GPy/examples/tutorials.py
@ -11,7 +11,7 @@ pb.ion()
 import numpy as np
 import GPy

-def tuto_GP_regression():
+def tuto_GP_regression(optimize=True, plot=True):
    """The detailed explanations of the commands used in this file can be found in the tutorial section"""

    X = np.random.uniform(-3.,3.,(20,1))
@ -22,7 +22,8 @@ def tuto_GP_regression():
    m = GPy.models.GPRegression(X, Y, kernel)

    print m
-    m.plot()
+    if plot:
+        m.plot()

    m.constrain_positive('')

@ -31,9 +32,9 @@ def tuto_GP_regression():
    m.constrain_bounded('.*lengthscale',1.,10. )
    m.constrain_fixed('.*noise',0.0025)

-    m.optimize()
-
-    m.optimize_restarts(num_restarts = 10)
+    if optimize:
+        m.optimize()
+        m.optimize_restarts(num_restarts = 10)

    #######################################################
    #######################################################
@ -51,22 +52,26 @@ def tuto_GP_regression():
    m.constrain_positive('')

    # optimize and plot
-    m.optimize('tnc', max_f_eval = 1000)
-    m.plot()
-    print(m)
+    if optimize:
+        m.optimize('tnc', max_f_eval = 1000)
+    if plot:
+        m.plot()
+
+    print m
    return(m)

-def tuto_kernel_overview():
+def tuto_kernel_overview(optimize=True, plot=True):
    """The detailed explanations of the commands used in this file can be found in the tutorial section"""
    ker1 = GPy.kern.rbf(1)  # Equivalent to ker1 = GPy.kern.rbf(input_dim=1, variance=1., lengthscale=1.)
    ker2 = GPy.kern.rbf(input_dim=1, variance = .75, lengthscale=2.)
    ker3 = GPy.kern.rbf(1, .5, .5)
-    
+
    print ker2

-    ker1.plot()
-    ker2.plot()
-    ker3.plot()
+    if plot:
+        ker1.plot()
+        ker2.plot()
+        ker3.plot()

    k1 = GPy.kern.rbf(1,1.,2.)
    k2 = GPy.kern.Matern32(1, 0.5, 0.2)
@ -77,8 +82,8 @@ def tuto_kernel_overview():

    # Sum of kernels
    k_add = k1.add(k2)                          # By default, tensor=False
-    k_addtens = k1.add(k2,tensor=True)    
-    
+    k_addtens = k1.add(k2,tensor=True)
+
    k1 = GPy.kern.rbf(1,1.,2)
    k2 = GPy.kern.periodic_Matern52(1,variance=1e3, lengthscale=1, period = 1.5, lower=-5., upper = 5)

@ -102,7 +107,7 @@ def tuto_kernel_overview():
    k.unconstrain('white')
    k.constrain_bounded('white',lower=1e-5,upper=.5)
    print k
-    
+
    k_cst = GPy.kern.bias(1,variance=1.)
    k_mat = GPy.kern.Matern52(1,variance=1., lengthscale=3)
    Kanova = (k_cst + k_mat).prod(k_cst + k_mat,tensor=True)
@ -114,30 +119,32 @@ def tuto_kernel_overview():

    # Create GP regression model
    m = GPy.models.GPRegression(X, Y, Kanova)
-    fig = pb.figure(figsize=(5,5))
-    ax = fig.add_subplot(111)
-    m.plot(ax=ax)
-   
-    pb.figure(figsize=(20,3))
-    pb.subplots_adjust(wspace=0.5)
-    axs = pb.subplot(1,5,1)
-    m.plot(ax=axs)
-    pb.subplot(1,5,2)
-    pb.ylabel("=   ",rotation='horizontal',fontsize='30')
-    axs = pb.subplot(1,5,3)
-    m.plot(ax=axs, which_parts=[False,True,False,False])
-    pb.ylabel("cst          +",rotation='horizontal',fontsize='30')
-    axs = pb.subplot(1,5,4)
-    m.plot(ax=axs, which_parts=[False,False,True,False])
-    pb.ylabel("+   ",rotation='horizontal',fontsize='30')
-    axs = pb.subplot(1,5,5)
-    pb.ylabel("+   ",rotation='horizontal',fontsize='30')
-    m.plot(ax=axs, which_parts=[False,False,False,True])
+
+    if plot:
+        fig = pb.figure(figsize=(5,5))
+        ax = fig.add_subplot(111)
+        m.plot(ax=ax)
+
+        pb.figure(figsize=(20,3))
+        pb.subplots_adjust(wspace=0.5)
+        axs = pb.subplot(1,5,1)
+        m.plot(ax=axs)
+        pb.subplot(1,5,2)
+        pb.ylabel("=   ",rotation='horizontal',fontsize='30')
+        axs = pb.subplot(1,5,3)
+        m.plot(ax=axs, which_parts=[False,True,False,False])
+        pb.ylabel("cst          +",rotation='horizontal',fontsize='30')
+        axs = pb.subplot(1,5,4)
+        m.plot(ax=axs, which_parts=[False,False,True,False])
+        pb.ylabel("+   ",rotation='horizontal',fontsize='30')
+        axs = pb.subplot(1,5,5)
+        pb.ylabel("+   ",rotation='horizontal',fontsize='30')
+        m.plot(ax=axs, which_parts=[False,False,False,True])

    return(m)


-def model_interaction():
+def model_interaction(optimize=True, plot=True):
    X = np.random.randn(20,1)
    Y = np.sin(X) + np.random.randn(*X.shape)*0.01 + 5.
    k = GPy.kern.rbf(1) + GPy.kern.bias(1)
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@ -487,12 +487,11 @@ class kern(Parameterized):
                p1.psi1(Z, mu, S, psi11)
                Mu, Sigma = p1._crossterm_mu_S(Z, mu, S)
                Mu, Sigma = Mu.reshape(NM,self.input_dim), Sigma.reshape(NM,self.input_dim)
-                
+
                p2.psi1(Z, Mu, Sigma, psi12)
                eK2 = psi12.reshape(N, M, M)
                crossterms = eK2 * (psi11[:, :, None] + psi11[:, None, :])
                target += crossterms
-                #import ipdb;ipdb.set_trace()
            else:
                raise NotImplementedError, "psi2 cannot be computed for this kernel"
        return target        
@ -540,15 +539,15 @@ class kern(Parameterized):
                    # turn around to have rbf in front
                    p1, p2 = self.parts[i2], self.parts[i1]
                    ps1, ps2 = self.param_slices[i2], self.param_slices[i1]  
-                
+
                N, M = mu.shape[0], Z.shape[0]; NM=N*M

                psi11 = np.zeros((N, M))
                p1.psi1(Z, mu, S, psi11)
-                
+
                Mu, Sigma = p1._crossterm_mu_S(Z, mu, S)
                Mu, Sigma = Mu.reshape(NM,self.input_dim), Sigma.reshape(NM,self.input_dim)
-                
+
                tmp1 = np.zeros_like(target[ps1])
                tmp2 = np.zeros_like(target[ps2])
 #                 for n in range(N):
@ -559,7 +558,7 @@ class kern(Parameterized):
 #                             Mu, Sigma= Mu.reshape(N,M,self.input_dim), Sigma.reshape(N,M,self.input_dim)
 #                             p2.dpsi1_dtheta((dL_dpsi2[n:n+1,m:m+1,m_prime:m_prime+1]*(psi11[n:n+1,m_prime:m_prime+1]))[0], Z[m:m+1], Mu[n:n+1,m], Sigma[n:n+1,m], target[ps2])
 #                             p2.dpsi1_dtheta((dL_dpsi2[n:n+1,m:m+1,m_prime:m_prime+1]*(psi11[n:n+1,m:m+1]))[0], Z[m_prime:m_prime+1], Mu[n:n+1, m_prime], Sigma[n:n+1, m_prime], target[ps2])#Z[m_prime:m_prime+1], Mu[n+m:(n+m)+1], Sigma[n+m:(n+m)+1], target[ps2])
-                
+
                if isinstance(p1, RBF) and isinstance(p2, RBF):
                    psi12 = np.zeros((N, M))
                    p2.psi1(Z, mu, S, psi12)
@ -571,11 +570,11 @@ class kern(Parameterized):
                if isinstance(p1, RBF) and isinstance(p2, Linear):
                    #import ipdb;ipdb.set_trace()
                    pass
-                
+
                p2.dpsi1_dtheta((dL_dpsi2*(psi11[:,:,None] + psi11[:,None,:])).reshape(NM,M), Z, Mu, Sigma, tmp2)
-                
+
                target[ps1] += tmp1
-                target[ps2] += tmp2                
+                target[ps2] += tmp2
            else:
                raise NotImplementedError, "psi2 cannot be computed for this kernel"

@ -615,17 +614,17 @@ class kern(Parameterized):
                psi11 = np.zeros((N, M))
                psi12 = np.zeros((NM, M))
                #psi12_t = np.zeros((N,M))
-                
+
                p1.psi1(Z, mu, S, psi11)
                Mu, Sigma = p1._crossterm_mu_S(Z, mu, S)
                Mu, Sigma = Mu.reshape(NM,self.input_dim), Sigma.reshape(NM,self.input_dim)
-                
+
                p2.psi1(Z, Mu, Sigma, psi12)
                tmp1 = np.zeros_like(target)
                p1.dpsi1_dZ((dL_dpsi2*psi12.reshape(N,M,M)).sum(1), Z, mu, S, tmp1)
                p1.dpsi1_dZ((dL_dpsi2*psi12.reshape(N,M,M)).sum(2), Z, mu, S, tmp1)
                target += tmp1
-                
+
                #p2.dpsi1_dtheta((dL_dpsi2*(psi11[:,:,None] + psi11[:,None,:])).reshape(NM,M), Z, Mu, Sigma, target)
                p2.dpsi1_dZ((dL_dpsi2*(psi11[:,:,None] + psi11[:,None,:])).reshape(NM,M), Z, Mu, Sigma, target)
            else:
@ -666,21 +665,21 @@ class kern(Parameterized):
                psi11 = np.zeros((N, M))
                psi12 = np.zeros((NM, M))
                #psi12_t = np.zeros((N,M))
-                
+
                p1.psi1(Z, mu, S, psi11)
                Mu, Sigma = p1._crossterm_mu_S(Z, mu, S)
                Mu, Sigma = Mu.reshape(NM,self.input_dim), Sigma.reshape(NM,self.input_dim)
-                
+
                p2.psi1(Z, Mu, Sigma, psi12)
                p1.dpsi1_dmuS((dL_dpsi2*psi12.reshape(N,M,M)).sum(1), Z, mu, S, target_mu, target_S)
                p1.dpsi1_dmuS((dL_dpsi2*psi12.reshape(N,M,M)).sum(2), Z, mu, S, target_mu, target_S)
-                
+
                #p2.dpsi1_dtheta((dL_dpsi2*(psi11[:,:,None] + psi11[:,None,:])).reshape(NM,M), Z, Mu, Sigma, target)
                p2.dpsi1_dmuS((dL_dpsi2*(psi11[:,:,None])).sum(1)*2, Z, Mu.reshape(N,M,self.input_dim).sum(1), Sigma.reshape(N,M,self.input_dim).sum(1), target_mu, target_S)
            else:
                raise NotImplementedError, "psi2 cannot be computed for this kernel"
        return target_mu, target_S
-    
+
    def plot(self, x=None, plot_limits=None, which_parts='all', resolution=None, *args, **kwargs):
        if which_parts == 'all':
            which_parts = [True] * self.num_parts
@ -737,15 +736,16 @@ class kern(Parameterized):
        else:
            raise NotImplementedError, "Cannot plot a kernel with more than two input dimensions"

-from GPy.core.model import Model
-
+from ..core.model import Model
 class Kern_check_model(Model):
    """This is a dummy model class used as a base class for checking that the gradients of a given kernel are implemented correctly. It enables checkgradient() to be called independently on a kernel."""
    def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
        num_samples = 20
        num_samples2 = 10
        if kernel==None:
+            import GPy
            kernel = GPy.kern.rbf(1)
+            del GPy
        if X==None:
            X = np.random.normal(size=(num_samples, kernel.input_dim))
        if dL_dK==None:
@ -753,14 +753,14 @@ class Kern_check_model(Model):
                dL_dK = np.ones((X.shape[0], X.shape[0]))
            else:
                dL_dK = np.ones((X.shape[0], X2.shape[0]))
-        
+
        self.kernel=kernel
        self.X = X
        self.X2 = X2
        self.dL_dK = dL_dK
        #self.constrained_indices=[]
        #self.constraints=[]
-        Model.__init__(self)
+        super(Kern_check_model, self).__init__()

    def is_positive_definite(self):
        v = np.linalg.eig(self.kernel.K(self.X))[0]
@ -768,7 +768,7 @@ class Kern_check_model(Model):
            return False
        else:
            return True
-        
+
    def _get_params(self):
        return self.kernel._get_params()

@ -783,7 +783,7 @@ class Kern_check_model(Model):

    def _log_likelihood_gradients(self):
        raise NotImplementedError, "This needs to be implemented to use the kern_check_model class."
-    
+
 class Kern_check_dK_dtheta(Kern_check_model):
    """This class allows gradient checks for the gradient of a kernel with respect to parameters. """
    def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
@ -798,7 +798,7 @@ class Kern_check_dKdiag_dtheta(Kern_check_model):
        Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None)
        if dL_dK==None:
            self.dL_dK = np.ones((self.X.shape[0]))
-        
+
    def log_likelihood(self):
        return (self.dL_dK*self.kernel.Kdiag(self.X)).sum()

@ -815,7 +815,7 @@ class Kern_check_dK_dX(Kern_check_model):

    def _get_param_names(self):
        return ['X_'  +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
-                
+
    def _get_params(self):
        return self.X.flatten()

@ -837,7 +837,7 @@ class Kern_check_dKdiag_dX(Kern_check_model):

    def _get_param_names(self):
        return ['X_'  +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
-                
+
    def _get_params(self):
        return self.X.flatten()

@ -861,13 +861,15 @@ def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False, X_positive=
        if X_positive:
            X = abs(X)
        if output_ind is not None:
-            X[:, output_ind] = np.random.randint(kern.parts[0].output_dim, X.shape[0])
+            assert(output_ind<kern.input_dim)
+            X[:, output_ind] = np.random.randint(low=0,high=kern.parts[0].output_dim, size=X.shape[0])
    if X2==None:
        X2 = np.random.randn(20, kern.input_dim)
        if X_positive:
            X2 = abs(X2)
        if output_ind is not None:
-            X2[:, output_ind] = np.random.randint(kern.parts[0].output_dim, X2.shape[0])
+            assert(output_ind<kern.input_dim)
+            X2[:, output_ind] = np.random.randint(low=0, high=kern.parts[0].output_dim, size=X2.shape[0])

    if verbose:
        print("Checking covariance function is positive definite.")
@ -961,3 +963,4 @@ def kern_test(kern, X=None, X2=None, output_ind=None, verbose=False, X_positive=
        return False

    return pass_checks
+del Model
--- a/GPy/kern/parts/sympy_helpers.cpp
+++ b/GPy/kern/parts/sympy_helpers.cpp
@ -80,7 +80,7 @@ double ln_diff_erf(double x0, double x1){
  else //x0 and x1 non-positive
    return log(erfcx(-x0)-erfcx(-x1)*exp(x0*x0 - x1*x1))-x0*x0;
 }
-
+// TODO: For all these computations of h things are very efficient at the moment. Need to recode sympykern to allow the precomputations to take place and all the gradients to be computed in one function. Not sure of best way forward for that yet. Neil
 double h(double t, double tprime, double d_i, double d_j, double l){
  // Compute the h function for the sim covariance.
  double half_l_di = 0.5*l*d_i;
@ -170,9 +170,27 @@ double dh_dl(double t, double tprime, double d_i, double d_j, double l){
 }

 double dh_dt(double t, double tprime, double d_i, double d_j, double l){
-  return 0.0;
+  // compute gradient of h function with respect to t.
+  double diff_t = t - tprime;
+  double half_l_di = 0.5*l*d_i;
+  double arg_1 = half_l_di + tprime/l;
+  double arg_2 = half_l_di - diff_t/l;
+  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
+  arg_2 = half_l_di - t/l;
+  double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
+  
+  return (d_i*exp(ln_part_2-d_i*t - d_j*tprime) - d_i*exp(ln_part_1-d_i*diff_t) + 2*exp(-d_i*diff_t - pow(half_l_di - diff_t/l, 2))/(sqrt(M_PI)*l) - 2*exp(-d_i*t - d_j*tprime - pow(half_l_di - t/l,2))/(sqrt(M_PI)*l))*exp(half_l_di*half_l_di)/(d_i + d_j);
 }

 double dh_dtprime(double t, double tprime, double d_i, double d_j, double l){
-  return 0.0;
+  // compute gradient of h function with respect to tprime.
+  double diff_t = t - tprime;
+  double half_l_di = 0.5*l*d_i;
+  double arg_1 = half_l_di + tprime/l;
+  double arg_2 = half_l_di - diff_t/l;
+  double ln_part_1 = ln_diff_erf(arg_1, arg_2);
+  arg_2 = half_l_di - t/l;
+  double ln_part_2 = ln_diff_erf(half_l_di, arg_2);
+
+  return (d_i*exp(ln_part_1-d_i*diff_t) + d_j*exp(ln_part_2-d_i*t - d_j*tprime) + (-2*exp(-pow(half_l_di - diff_t/l,2)) + 2*exp(-pow(half_l_di + tprime/l,2)))*exp(-d_i*diff_t)/(sqrt(M_PI)*l))*exp(half_l_di*half_l_di)/(d_i + d_j);
 }
--- a/GPy/likelihoods/laplace.py
+++ b/GPy/likelihoods/laplace.py
@ -15,6 +15,7 @@ import scipy as sp
 from likelihood import likelihood
 from ..util.linalg import mdot, jitchol, pddet, dpotrs
 from functools import partial as partial_func
+import warnings

 class Laplace(likelihood):
    """Laplace approximation to a posterior"""
@ -64,12 +65,12 @@ class Laplace(likelihood):
        self.YYT = None

        self.old_Ki_f = None
+        self.bad_fhat = False

-    def predictive_values(self, mu, var, full_cov):
+    def predictive_values(self,mu,var,full_cov,**noise_args):
        if full_cov:
-            raise NotImplementedError("Cannot make correlated predictions\
-                    with an Laplace likelihood")
-        return self.noise_model.predictive_values(mu, var)
+            raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
+        return self.noise_model.predictive_values(mu,var,**noise_args)

    def log_predictive_density(self, y_test, mu_star, var_star):
        """
@ -199,15 +200,16 @@ class Laplace(likelihood):
        Y_tilde = Wi*self.Ki_f + self.f_hat

        self.Wi_K_i = self.W12BiW12
-        self.ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
-        self.lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
-        self.y_Wi_Ki_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)
+        ln_det_Wi_K = pddet(self.Sigma_tilde + self.K)
+        lik = self.noise_model.logpdf(self.f_hat, self.data, extra_data=self.extra_data)
+        y_Wi_K_i_y = mdot(Y_tilde.T, self.Wi_K_i, Y_tilde)

-        Z_tilde = (+ self.lik
+        Z_tilde = (+ lik
                   - 0.5*self.ln_B_det
-                   + 0.5*self.ln_det_Wi_K
+                   + 0.5*ln_det_Wi_K
                   - 0.5*self.f_Ki_f
-                   + 0.5*self.y_Wi_Ki_i_y
+                   + 0.5*y_Wi_K_i_y
+                   + self.NORMAL_CONST
                  )

        #Convert to float as its (1, 1) and Z must be a scalar
@ -247,7 +249,10 @@ class Laplace(likelihood):
        #At this point get the hessian matrix (or vector as W is diagonal)
        self.W = -self.noise_model.d2logpdf_df2(self.f_hat, self.data, extra_data=self.extra_data)

-        #TODO: Could save on computation when using rasm by returning these, means it isn't just a "mode finder" though
+        if not self.noise_model.log_concave:
+            #print "Under 1e-10: {}".format(np.sum(self.W < 1e-6))
+            self.W[self.W < 1e-6] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
+
        self.W12BiW12, self.ln_B_det = self._compute_B_statistics(self.K, self.W, np.eye(self.N))

        self.Ki_f = self.Ki_f
@ -268,7 +273,7 @@ class Laplace(likelihood):
        :returns: (W12BiW12, ln_B_det)
        """
        if not self.noise_model.log_concave:
-            #print "Under 1e-10: {}".format(np.sum(W < 1e-10))
+            #print "Under 1e-10: {}".format(np.sum(W < 1e-6))
            W[W < 1e-6] = 1e-6  # FIXME-HACK: This is a hack since GPy can't handle negative variances which can occur
                                # If the likelihood is non-log-concave. We wan't to say that there is a negative variance
                                # To cause the posterior to become less certain than the prior and likelihood,
@ -278,16 +283,13 @@ class Laplace(likelihood):
        #W is diagonal so its sqrt is just the sqrt of the diagonal elements
        W_12 = np.sqrt(W)
        B = np.eye(self.N) + W_12*K*W_12.T
-        try:
-            L = jitchol(B)
-        except:
-            import ipdb; ipdb.set_trace()
+        L = jitchol(B)

-        W12BiW12 = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
+        W12BiW12a = W_12*dpotrs(L, np.asfortranarray(W_12*a), lower=1)[0]
        ln_B_det = 2*np.sum(np.log(np.diag(L)))
-        return W12BiW12, ln_B_det
+        return W12BiW12a, ln_B_det

-    def rasm_mode(self, K, MAX_ITER=30):
+    def rasm_mode(self, K, MAX_ITER=40):
        """
        Rasmussen's numerically stable mode finding
        For nomenclature see Rasmussen & Williams 2006
@ -302,9 +304,10 @@ class Laplace(likelihood):
        """
        #old_Ki_f = np.zeros((self.N, 1))

-        #Start f's at zero originally
-        if self.old_Ki_f is None:
-            old_Ki_f = np.zeros((self.N, 1))
+        #Start f's at zero originally of if we have gone off track, try restarting
+        if self.old_Ki_f is None or self.bad_fhat:
+            old_Ki_f = np.random.rand(self.N, 1)/50.0
+            #old_Ki_f = self.Y
            f = np.dot(K, old_Ki_f)
        else:
            #Start at the old best point
@ -318,7 +321,7 @@ class Laplace(likelihood):
            return -0.5*np.dot(Ki_f.T, f) + self.noise_model.logpdf(f, self.data, extra_data=self.extra_data)

        difference = np.inf
-        epsilon = 1e-5
+        epsilon = 1e-7
        #step_size = 1
        #rs = 0
        i = 0
@ -349,7 +352,8 @@ class Laplace(likelihood):
            #Find the stepsize that minimizes the objective function using a brent line search
            #The tolerance and maxiter matter for speed! Seems to be best to keep them low and make more full
            #steps than get this exact then make a step, if B was bigger it might be the other way around though
-            new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':5}).fun
+            #new_obj = sp.optimize.minimize_scalar(i_o, method='brent', tol=1e-4, options={'maxiter':5}).fun
+            new_obj = sp.optimize.brent(i_o, tol=1e-4, maxiter=10)
            f = self.tmp_f.copy()
            Ki_f = self.tmp_Ki_f.copy()

@ -380,14 +384,20 @@ class Laplace(likelihood):

            #difference = abs(new_obj - old_obj)
            #old_obj = new_obj.copy()
-            #difference = np.abs(np.sum(f - f_old))
-            difference = np.abs(np.sum(Ki_f - old_Ki_f))
+            difference = np.abs(np.sum(f - f_old)) + np.abs(np.sum(Ki_f - old_Ki_f))
+            #difference = np.abs(np.sum(Ki_f - old_Ki_f))/np.float(self.N)
            old_Ki_f = Ki_f.copy()
            i += 1

        self.old_Ki_f = old_Ki_f.copy()
+
+        #Warn of bad fits
        if difference > epsilon:
-            print "Not perfect f_hat fit difference: {}".format(difference)
+            self.bad_fhat = True
+            warnings.warn("Not perfect f_hat fit difference: {}".format(difference))
+        elif self.bad_fhat:
+            self.bad_fhat = False
+            warnings.warn("f_hat now perfect again")

        self.Ki_f = Ki_f
        return f
--- a/GPy/models.py
+++ b/GPy/models.py
@ -1,22 +1,31 @@
 '''
-Created on 14 Nov 2013
+GPy Models
+==========

-@author: maxz
+Implementations for common models used in GP regression and classification.
+The different models can be viewed in :mod:`GPy.models_modules`, which holds
+detailed explanations for the different models.
+
+:warning: This module is a convienince module for endusers to use. For developers 
+see :mod:`GPy.models_modules`, which holds the implementions for each model. 
 '''

-from _models.bayesian_gplvm import BayesianGPLVM
-from _models.gp_regression import GPRegression
-from _models.gp_classification import GPClassification#; _gp_classification = gp_classification ; del gp_classification 
-from _models.sparse_gp_regression import SparseGPRegression#; _sparse_gp_regression = sparse_gp_regression ; del sparse_gp_regression 
-from _models.svigp_regression import SVIGPRegression#; _svigp_regression = svigp_regression ; del svigp_regression 
-from _models.sparse_gp_classification import SparseGPClassification#; _sparse_gp_classification = sparse_gp_classification ; del sparse_gp_classification 
-from _models.fitc_classification import FITCClassification#; _fitc_classification = fitc_classification ; del fitc_classification 
-from _models.gplvm import GPLVM#; _gplvm = gplvm ; del gplvm 
-from _models.bcgplvm import BCGPLVM#; _bcgplvm = bcgplvm; del bcgplvm
-from _models.sparse_gplvm import SparseGPLVM#; _sparse_gplvm = sparse_gplvm ; del sparse_gplvm 
-from _models.warped_gp import WarpedGP#; _warped_gp = warped_gp ; del warped_gp 
-from _models.bayesian_gplvm import BayesianGPLVM#; _bayesian_gplvm = bayesian_gplvm ; del bayesian_gplvm 
-from _models.mrd import MRD#; _mrd = mrd; del mrd 
-from _models.gradient_checker import GradientChecker#; _gradient_checker = gradient_checker ; del gradient_checker 
-from _models.gp_multioutput_regression import GPMultioutputRegression#; _gp_multioutput_regression = gp_multioutput_regression ; del gp_multioutput_regression 
-from _models.sparse_gp_multioutput_regression import SparseGPMultioutputRegression#; _sparse_gp_multioutput_regression = sparse_gp_multioutput_regression ; del sparse_gp_multioutput_regression 
+__updated__ = '2013-11-28'
+
+from models_modules.bayesian_gplvm import BayesianGPLVM
+from models_modules.gp_regression import GPRegression
+from models_modules.gp_classification import GPClassification#; _gp_classification = gp_classification ; del gp_classification 
+from models_modules.sparse_gp_regression import SparseGPRegression#; _sparse_gp_regression = sparse_gp_regression ; del sparse_gp_regression 
+from models_modules.svigp_regression import SVIGPRegression#; _svigp_regression = svigp_regression ; del svigp_regression 
+from models_modules.sparse_gp_classification import SparseGPClassification#; _sparse_gp_classification = sparse_gp_classification ; del sparse_gp_classification 
+from models_modules.fitc_classification import FITCClassification#; _fitc_classification = fitc_classification ; del fitc_classification 
+from models_modules.gplvm import GPLVM#; _gplvm = gplvm ; del gplvm 
+from models_modules.bcgplvm import BCGPLVM#; _bcgplvm = bcgplvm; del bcgplvm
+from models_modules.sparse_gplvm import SparseGPLVM#; _sparse_gplvm = sparse_gplvm ; del sparse_gplvm 
+from models_modules.warped_gp import WarpedGP#; _warped_gp = warped_gp ; del warped_gp 
+from models_modules.bayesian_gplvm import BayesianGPLVM#; _bayesian_gplvm = bayesian_gplvm ; del bayesian_gplvm 
+from models_modules.mrd import MRD#; _mrd = mrd; del mrd 
+from models_modules.gradient_checker import GradientChecker#; _gradient_checker = gradient_checker ; del gradient_checker 
+from models_modules.gp_multioutput_regression import GPMultioutputRegression#; _gp_multioutput_regression = gp_multioutput_regression ; del gp_multioutput_regression 
+from models_modules.sparse_gp_multioutput_regression import SparseGPMultioutputRegression#; _sparse_gp_multioutput_regression = sparse_gp_multioutput_regression ; del sparse_gp_multioutput_regression 
+from models_modules.gradient_checker import GradientChecker
--- a/GPy/models_modules/init.py
+++ b/GPy/models_modules/init.py
--- a/GPy/models_modules/bayesian_gplvm.py
+++ b/GPy/models_modules/bayesian_gplvm.py
@ -12,6 +12,7 @@ from GPy.util import plot_latent, linalg
 from .gplvm import GPLVM
 from GPy.util.plot_latent import most_significant_input_dimensions
 from matplotlib import pyplot
+from GPy.core.model import Model

 class BayesianGPLVM(SparseGP, GPLVM):
    """
@ -285,6 +286,57 @@ class BayesianGPLVM(SparseGP, GPLVM):
        self.init = state.pop()
        SparseGP.setstate(self, state)

+class BayesianGPLVMWithMissingData(Model):
+    """
+    Bayesian Gaussian Process Latent Variable Model with missing data support.
+    NOTE: Missing data is assumed to be missing at random!
+    
+    This extension comes with a large memory and computing time deficiency.
+    Use only if fraction of missing data at random is higher than 60%.
+    Otherwise, try filtering data before using this extension.
+    
+    Y can hold missing data as given by `missing`, standard is :class:`~numpy.nan`.
+    
+    If likelihood is given for Y, this likelihood will be discarded, but the parameters
+    of the likelihood will be taken. Also every effort of creating the same likelihood
+    will be done.
+     
+    :param likelihood_or_Y: observed data (np.ndarray) or GPy.likelihood
+    :type likelihood_or_Y: :class:`~numpy.ndarray` | :class:`~GPy.likelihoods.likelihood.likelihood` instance
+    :param int input_dim: latent dimensionality
+    :param init: initialisation method for the latent space
+    :type init: 'PCA' | 'random'
+    """
+    def __init__(self, likelihood_or_Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
+                 Z=None, kernel=None, missing=np.nan, **kwargs):
+        if type(likelihood_or_Y) is np.ndarray:
+            likelihood = Gaussian(likelihood_or_Y)
+        else:
+            likelihood = likelihood_or_Y
+
+        if X == None:
+            X = self.initialise_latent(init, input_dim, likelihood.Y)
+        self.init = init
+
+        if X_variance is None:
+            X_variance = np.clip((np.ones_like(X) * 0.5) + .01 * np.random.randn(*X.shape), 0.001, 1)
+
+        if Z is None:
+            Z = np.random.permutation(X.copy())[:num_inducing]
+        assert Z.shape[1] == X.shape[1]
+
+        if kernel is None:
+            kernel = kern.rbf(input_dim) # + kern.white(input_dim)
+
+        SparseGP.__init__(self, X, likelihood, kernel, Z=Z, X_variance=X_variance, **kwargs)
+        self.ensure_default_constraints()
+
+    def _get_param_names(self):
+        X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
+        S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
+        return (X_names + S_names + SparseGP._get_param_names(self))
+
+    pass

 def latent_cost_and_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
    """
--- a/GPy/models_modules/bcgplvm.py
+++ b/GPy/models_modules/bcgplvm.py
--- a/GPy/models_modules/fitc_classification.py
+++ b/GPy/models_modules/fitc_classification.py
--- a/GPy/models_modules/gp_classification.py
+++ b/GPy/models_modules/gp_classification.py
--- a/GPy/models_modules/gp_multioutput_regression.py
+++ b/GPy/models_modules/gp_multioutput_regression.py
--- a/GPy/models_modules/gp_regression.py
+++ b/GPy/models_modules/gp_regression.py
--- a/GPy/models_modules/gplvm.py
+++ b/GPy/models_modules/gplvm.py
--- a/GPy/models_modules/gradient_checker.py
+++ b/GPy/models_modules/gradient_checker.py
@ -28,38 +28,37 @@ class GradientChecker(Model):
        :param df: Gradient of function to check
        :param x0:
            Initial guess for inputs x (if it has a shape (a,b) this will be reflected in the parameter names).
-            Can be a list of arrays, if takes a list of arrays. This list will be passed
+            Can be a list of arrays, if f takes a list of arrays. This list will be passed
            to f and df in the same order as given here.
-            If only one argument, make sure not to pass a list!!!
-
+            If f takes only one argument, make sure not to pass a list for x0!!!
        :type x0: [array-like] | array-like | float | int
-        :param names:
+        :param list names:
            Names to print, when performing gradcheck. If a list was passed to x0
            a list of names with the same length is expected.
-        :param args: Arguments passed as f(x, *args, **kwargs) and df(x, *args, **kwargs)
+        :param args kwargs: Arguments passed as f(x, *args, **kwargs) and df(x, *args, **kwargs)

        Examples:
        ---------
-            from GPy.models import GradientChecker
-            N, M, Q = 10, 5, 3
+        from GPy.models import GradientChecker
+        N, M, Q = 10, 5, 3

-            Sinusoid:
+        Sinusoid:

-                X = numpy.random.rand(N, Q)
-                grad = GradientChecker(numpy.sin,numpy.cos,X,'x')
-                grad.checkgrad(verbose=1)
+            X = numpy.random.rand(N, Q)
+            grad = GradientChecker(numpy.sin,numpy.cos,X,'sin_in')
+            grad.checkgrad(verbose=1)

-            Using GPy:
+        Using GPy:

-                X, Z = numpy.random.randn(N,Q), numpy.random.randn(M,Q)
-                kern = GPy.kern.linear(Q, ARD=True) + GPy.kern.rbf(Q, ARD=True)
-                grad = GradientChecker(kern.K,
-                                       lambda x: 2*kern.dK_dX(numpy.ones((1,1)), x),
-                                       x0 = X.copy(),
-                                       names='X')
-                grad.checkgrad(verbose=1)
-                grad.randomize()
-                grad.checkgrad(verbose=1)
+            X, Z = numpy.random.randn(N,Q), numpy.random.randn(M,Q)
+            kern = GPy.kern.linear(Q, ARD=True) + GPy.kern.rbf(Q, ARD=True)
+            grad = GradientChecker(kern.K,
+                                   lambda x: kern.dK_dX(numpy.ones((1,1)), x),
+                                   x0 = X.copy(),
+                                   names=['X_input'])
+            grad.checkgrad(verbose=1)
+            grad.randomize()
+            grad.checkgrad(verbose=1)
        """
        Model.__init__(self)
        if isinstance(x0, (list, tuple)) and names is None:
--- a/GPy/models_modules/mrd.py
+++ b/GPy/models_modules/mrd.py
--- a/GPy/models_modules/sparse_gp_classification.py
+++ b/GPy/models_modules/sparse_gp_classification.py
--- a/GPy/models_modules/sparse_gp_multioutput_regression.py
+++ b/GPy/models_modules/sparse_gp_multioutput_regression.py
--- a/GPy/models_modules/sparse_gp_regression.py
+++ b/GPy/models_modules/sparse_gp_regression.py
--- a/GPy/models_modules/sparse_gplvm.py
+++ b/GPy/models_modules/sparse_gplvm.py
@ -66,5 +66,5 @@ class SparseGPLVM(SparseGPRegression, GPLVM):
        pb.plot(mu[:, 0] , mu[:, 1], 'ko')

    def plot_latent(self, *args, **kwargs):
-        input_1, input_2 = GPLVM.plot_latent(*args, **kwargs)
-        pb.plot(m.Z[:, input_1], m.Z[:, input_2], '^w')
+        GPLVM.plot_latent(self, *args, **kwargs)
+        #pb.plot(self.Z[:, input_1], self.Z[:, input_2], '^w')
--- a/GPy/models_modules/svigp_regression.py
+++ b/GPy/models_modules/svigp_regression.py
--- a/GPy/models_modules/warped_gp.py
+++ b/GPy/models_modules/warped_gp.py
--- a/GPy/testing/examples_tests.py
+++ b/GPy/testing/examples_tests.py
@ -10,6 +10,7 @@ import os
 import random
 from nose.tools import nottest
 import sys
+import itertools

 class ExamplesTests(unittest.TestCase):
    def _checkgrad(self, Model):
@ -39,8 +40,19 @@ def model_instance(model):
    #assert isinstance(model, GPy.core.model)
    return isinstance(model, GPy.core.model.Model)

-@nottest
+def flatten_nested(lst):
+    result = []
+    for element in lst:
+        if hasattr(element, '__iter__'):
+            result.extend(flatten_nested(element))
+        else:
+            result.append(element)
+    return result
+
+#@nottest
 def test_models():
+    optimize=False
+    plot=True
    examples_path = os.path.dirname(GPy.examples.__file__)
    # Load modules
    failing_models = {}
@ -54,29 +66,34 @@ def test_models():
        print "After"
        print functions
        for example in functions:
-            if example[0] in ['oil', 'silhouette', 'GPLVM_oil_100', 'brendan_faces']:
-                print "SKIPPING"
-                continue
+            #if example[0] in ['oil', 'silhouette', 'GPLVM_oil_100', 'brendan_faces']:
+                #print "SKIPPING"
+                #continue

            print "Testing example: ", example[0]
            # Generate model
+
            try:
-                model = example[1]()
+                models = [ example[1](optimize=optimize, plot=plot) ]
+                #If more than one model returned, flatten them
+                models = flatten_nested(models)
            except Exception as e:
                failing_models[example[0]] = "Cannot make model: \n{e}".format(e=e)
            else:
-                print model
+                print models
                model_checkgrads.description = 'test_checkgrads_%s' % example[0]
                try:
-                    if not model_checkgrads(model):
-                        failing_models[model_checkgrads.description] = False
+                    for model in models:
+                        if not model_checkgrads(model):
+                            failing_models[model_checkgrads.description] = False
                except Exception as e:
                    failing_models[model_checkgrads.description] = e

                model_instance.description = 'test_instance_%s' % example[0]
                try:
-                    if not model_instance(model):
-                        failing_models[model_instance.description] = False
+                    for model in models:
+                        if not model_instance(model):
+                            failing_models[model_instance.description] = False
                except Exception as e:
                    failing_models[model_instance.description] = e

--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@ -36,7 +36,7 @@ class KernelTests(unittest.TestCase):
    def test_eq_sympykernel(self):
        if SYMPY_AVAILABLE:
            kern = GPy.kern.eq_sympy(5, 3)
-            self.assertTrue(GPy.kern.kern_test(kern, output_ind=3, verbose=verbose))
+            self.assertTrue(GPy.kern.kern_test(kern, output_ind=4, verbose=verbose))

    def test_ode1_eqkernel(self):
        if SYMPY_AVAILABLE:
--- a/GPy/testing/likelihoods_tests.py
+++ b/GPy/testing/likelihoods_tests.py
@ -6,6 +6,8 @@ import functools
 import inspect
 from GPy.likelihoods.noise_models import gp_transformations
 from functools import partial
+#np.random.seed(300)
+np.random.seed(7)

 def dparam_partial(inst_func, *args):
    """
@ -144,7 +146,7 @@ class TestNoiseModels(object):
                            "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
                            "grad_params": {
                                "names": ["t_noise"],
-                                "vals": [1],
+                                "vals": [1.0],
                                "constraints": [constrain_positive]
                                },
                            "laplace": True
@ -158,6 +160,15 @@ class TestNoiseModels(object):
                                },
                            "laplace": True
                            },
+                        "Student_t_large_var": {
+                            "model": GPy.likelihoods.student_t(deg_free=5, sigma2=self.var),
+                            "grad_params": {
+                                "names": ["t_noise"],
+                                "vals": [10.0],
+                                "constraints": [constrain_positive]
+                                },
+                            "laplace": True
+                            },
                        "Student_t_approx_gauss": {
                            "model": GPy.likelihoods.student_t(deg_free=1000, sigma2=self.var),
                            "grad_params": {
@ -315,9 +326,11 @@ class TestNoiseModels(object):
    def t_logpdf(self, model, Y, f):
        print "\n{}".format(inspect.stack()[0][3])
        print model
+        print model._get_params()
        np.testing.assert_almost_equal(
-                               np.log(model.pdf(f.copy(), Y.copy())),
-                               model.logpdf(f.copy(), Y.copy()))
+                               model.pdf(f.copy(), Y.copy()),
+                               np.exp(model.logpdf(f.copy(), Y.copy()))
+                               )

    @with_setup(setUp, tearDown)
    def t_dlogpdf_df(self, model, Y, f):
@ -363,7 +376,7 @@ class TestNoiseModels(object):
        assert (
                dparam_checkgrad(model.logpdf, model.dlogpdf_dtheta,
                    params, args=(f, Y), constraints=param_constraints,
-                    randomize=False, verbose=True)
+                    randomize=True, verbose=True)
                )

    @with_setup(setUp, tearDown)
@ -373,7 +386,7 @@ class TestNoiseModels(object):
        assert (
                dparam_checkgrad(model.dlogpdf_df, model.dlogpdf_df_dtheta,
                    params, args=(f, Y), constraints=param_constraints,
-                    randomize=False, verbose=True)
+                    randomize=True, verbose=True)
                )

    @with_setup(setUp, tearDown)
@ -383,7 +396,7 @@ class TestNoiseModels(object):
        assert (
                dparam_checkgrad(model.d2logpdf_df2, model.d2logpdf_df2_dtheta,
                    params, args=(f, Y), constraints=param_constraints,
-                    randomize=False, verbose=True)
+                    randomize=True, verbose=True)
                )

    ################
@ -478,7 +491,7 @@ class TestNoiseModels(object):
        print "\n{}".format(inspect.stack()[0][3])
        #Normalize
        Y = Y/Y.max()
-        white_var = 0.001
+        white_var = 1e-6
        kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
        laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), model)
        m = GPy.models.GPRegression(X.copy(), Y.copy(), kernel, likelihood=laplace_likelihood)
@ -490,12 +503,13 @@ class TestNoiseModels(object):
            m[name] = param_vals[param_num]
            constraints[param_num](name, m)

+        print m
        m.randomize()
-        m.optimize(max_iters=8)
+        #m.optimize(max_iters=8)
        print m
        m.checkgrad(verbose=1, step=step)
-        if not m.checkgrad(step=step):
-            m.checkgrad(verbose=1, step=step)
+        #if not m.checkgrad(step=step):
+            #m.checkgrad(verbose=1, step=step)
            #import ipdb; ipdb.set_trace()
            #NOTE this test appears to be stochastic for some likelihoods (student t?)
            # appears to all be working in test mode right now...
@ -509,7 +523,7 @@ class TestNoiseModels(object):
        print "\n{}".format(inspect.stack()[0][3])
        #Normalize
        Y = Y/Y.max()
-        white_var = 0.001
+        white_var = 1e-6
        kernel = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
        ep_likelihood = GPy.likelihoods.EP(Y.copy(), model)
        m = GPy.models.GPRegression(X.copy(), Y.copy(), kernel, likelihood=ep_likelihood)
@ -579,6 +593,95 @@ class LaplaceTests(unittest.TestCase):
        grad.checkgrad(verbose=1)
        self.assertTrue(grad.checkgrad())

+    #@unittest.skip('Not working yet, needs to be checked')
+    def test_laplace_log_likelihood(self):
+        debug = False
+        real_std = 0.1
+        initial_var_guess = 0.5
+
+        #Start a function, any function
+        X = np.linspace(0.0, np.pi*2, 100)[:, None]
+        Y = np.sin(X) + np.random.randn(*X.shape)*real_std
+        Y = Y/Y.max()
+        #Yc = Y.copy()
+        #Yc[75:80] += 1
+        kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
+        kernel2 = kernel1.copy()
+
+        m1 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel1)
+        m1.constrain_fixed('white', 1e-6)
+        m1['noise'] = initial_var_guess
+        m1.constrain_bounded('noise', 1e-4, 10)
+        m1.constrain_bounded('rbf', 1e-4, 10)
+        m1.ensure_default_constraints()
+        m1.randomize()
+
+        gauss_distr = GPy.likelihoods.gaussian(variance=initial_var_guess, D=1, N=Y.shape[0])
+        laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), gauss_distr)
+        m2 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel2, likelihood=laplace_likelihood)
+        m2.ensure_default_constraints()
+        m2.constrain_fixed('white', 1e-6)
+        m2.constrain_bounded('rbf', 1e-4, 10)
+        m2.constrain_bounded('noise', 1e-4, 10)
+        m2.randomize()
+
+        if debug:
+            print m1
+            print m2
+        optimizer = 'scg'
+        print "Gaussian"
+        m1.optimize(optimizer, messages=debug)
+        print "Laplace Gaussian"
+        m2.optimize(optimizer, messages=debug)
+        if debug:
+            print m1
+            print m2
+
+        m2._set_params(m1._get_params())
+
+        #Predict for training points to get posterior mean and variance
+        post_mean, post_var, _, _ = m1.predict(X)
+        post_mean_approx, post_var_approx, _, _ = m2.predict(X)
+
+        if debug:
+            import pylab as pb
+            pb.figure(5)
+            pb.title('posterior means')
+            pb.scatter(X, post_mean, c='g')
+            pb.scatter(X, post_mean_approx, c='r', marker='x')
+
+            pb.figure(6)
+            pb.title('plot_f')
+            m1.plot_f(fignum=6)
+            m2.plot_f(fignum=6)
+            fig, axes = pb.subplots(2, 1)
+            fig.suptitle('Covariance matricies')
+            a1 = pb.subplot(121)
+            a1.matshow(m1.likelihood.covariance_matrix)
+            a2 = pb.subplot(122)
+            a2.matshow(m2.likelihood.covariance_matrix)
+
+            pb.figure(8)
+            pb.scatter(X, m1.likelihood.Y, c='g')
+            pb.scatter(X, m2.likelihood.Y, c='r', marker='x')
+
+
+
+        #Check Y's are the same
+        np.testing.assert_almost_equal(Y, m2.likelihood.Y, decimal=5)
+        #Check marginals are the same
+        np.testing.assert_almost_equal(m1.log_likelihood(), m2.log_likelihood(), decimal=2)
+        #Check marginals are the same with random
+        m1.randomize()
+        m2._set_params(m1._get_params())
+        np.testing.assert_almost_equal(m1.log_likelihood(), m2.log_likelihood(), decimal=2)
+
+        #Check they are checkgradding
+        #m1.checkgrad(verbose=1)
+        #m2.checkgrad(verbose=1)
+        self.assertTrue(m1.checkgrad())
+        self.assertTrue(m2.checkgrad())
+
 if __name__ == "__main__":
    print "Running unit tests"
    unittest.main()
--- a/GPy/util/init.py
+++ b/GPy/util/init.py
@ -14,6 +14,15 @@ import visualize
 import decorators
 import classification
 import latent_space_visualizations
-import symbolic
+
+try:
+    import sympy
+    _sympy_available = True
+    del sympy
+except ImportError as e:
+    _sympy_available = False
+
+if _sympy_available:
+    import symbolic

 import netpbmfile
--- a/GPy/util/data_resources.json
+++ b/GPy/util/data_resources.json
@ -102,7 +102,7 @@
      "citation":"Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.\nThe database was created with funding from NSF EIA-0196217.",
      "details":"CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.",
      "urls":[
-         "http://mocap.cs.cmu.edu"
+         "http://mocap.cs.cmu.edu/subjects"
      ],
      "size":null
   },
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@ -3,7 +3,6 @@ import numpy as np
 import GPy
 import scipy.io
 import cPickle as pickle
-import urllib as url
 import zipfile
 import tarfile
 import datetime
@ -15,7 +14,7 @@ except ImportError:
    ipython_available=False


-import sys, urllib
+import sys, urllib2

 def reporthook(a,b,c):
    # ',' at the end of the line is important!
@ -82,7 +81,21 @@ def download_url(url, store_directory, save_name = None, messages = True, suffix
    print "Downloading ", url, "->", os.path.join(store_directory, file)
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
-    urllib.urlretrieve(url+suffix, save_name, reporthook)
+    try:
+        response = urllib2.urlopen(url+suffix)
+    except urllib2.URLError, e:
+        if not hasattr(e, "code"):
+            raise
+        response = e
+        if response.code > 399 and response.code<500:
+            raise ValueError('Tried url ' + url + suffix + ' and received client error ' + str(response.code))
+        elif response.code > 499:
+            raise ValueError('Tried url ' + url + suffix + ' and received server error ' + str(response.code))
+    # if we wanted to get more sophisticated maybe we should check the response code here again even for successes.
+    with open(save_name, 'wb') as f:
+        f.write(response.read())
+    
+    #urllib.urlretrieve(url+suffix, save_name, reporthook)

 def authorize_download(dataset_name=None):
    """Check with the user that the are happy with terms and conditions for the data set."""
@ -142,6 +155,8 @@ def cmu_urls_files(subj_motions, messages = True):
    '''
    Find which resources are missing on the local disk for the requested CMU motion capture motions.
    '''
+    dr = data_resources['cmu_mocap_full']
+    cmu_url = dr['urls'][0]

    subjects_num = subj_motions[0]
    motions_num = subj_motions[1]
@ -187,7 +202,7 @@ def cmu_urls_files(subj_motions, messages = True):
                url_required = True
                file_download.append(subjects[i] + '_' + motions[i][j] + '.amc')
        if url_required:
-            resource['urls'].append(cmu_url + subjects[i] + '/')
+            resource['urls'].append(cmu_url + '/' + subjects[i] + '/')
            resource['files'].append(file_download)
    return resource

@ -435,7 +450,7 @@ def simulation_BGPLVM():
    Y = np.array(mat_data['Y'], dtype=float)
    S = np.array(mat_data['initS'], dtype=float)
    mu = np.array(mat_data['initMu'], dtype=float)
-    return data_details_return({'S': S, 'Y': Y, 'mu': mu}, data_set)
+    #return data_details_return({'S': S, 'Y': Y, 'mu': mu}, data_set)
    return {'Y': Y, 'S': S,
            'mu' : mu,
            'info': "Simulated test dataset generated in MATLAB to compare BGPLVM between python and MATLAB"}
@ -594,11 +609,11 @@ def olympic_sprints(data_set='rogers_girolami_data'):
        'Y': Y,
        'info': "Olympics sprint event winning for men and women to 2008. Data is from Rogers and Girolami's First Course in Machine Learning.",
        'output_info': {
-          0:'100m Men', 
-          1:'100m Women', 
-          2:'200m Men', 
-          3:'200m Women', 
-          4:'400m Men', 
+          0:'100m Men',
+          1:'100m Women',
+          2:'200m Men',
+          3:'200m Women',
+          4:'400m Men',
          5:'400m Women'}
        }, data_set)

@ -693,15 +708,15 @@ def creep_data(data_set='creep_rupture'):
    X = all_data[:, features].copy()
    return data_details_return({'X': X, 'y': y}, data_set)

-def cmu_mocap_49_balance():
+def cmu_mocap_49_balance(data_set='cmu_mocap'):
    """Load CMU subject 49's one legged balancing motion that was used by Alvarez, Luengo and Lawrence at AISTATS 2009."""
    train_motions = ['18', '19']
    test_motions = ['20']
-    data = cmu_mocap('49', train_motions, test_motions, sample_every=4)
+    data = cmu_mocap('49', train_motions, test_motions, sample_every=4, data_set=data_set)
    data['info'] = "One legged balancing motions from CMU data base subject 49. As used in Alvarez, Luengo and Lawrence at AISTATS 2009. It consists of " + data['info']
    return data

-def cmu_mocap_35_walk_jog():
+def cmu_mocap_35_walk_jog(data_set='cmu_mocap'):
    """Load CMU subject 35's walking and jogging motions, the same data that was used by Taylor, Roweis and Hinton at NIPS 2007. but without their preprocessing. Also used by Lawrence at AISTATS 2007."""
    train_motions = ['01', '02', '03', '04', '05', '06',
                '07', '08', '09', '10', '11', '12',
@ -709,7 +724,7 @@ def cmu_mocap_35_walk_jog():
                '20', '21', '22', '23', '24', '25',
                '26', '28', '30', '31', '32', '33', '34']
    test_motions = ['18', '29']
-    data = cmu_mocap('35', train_motions, test_motions, sample_every=4)
+    data = cmu_mocap('35', train_motions, test_motions, sample_every=4, data_set=data_set)
    data['info'] = "Walk and jog data from CMU data base subject 35. As used in Tayor, Roweis and Hinton at NIPS 2007, but without their pre-processing (i.e. as used by Lawrence at AISTATS 2007). It consists of " + data['info']
    return data

@ -721,7 +736,7 @@ def cmu_mocap(subject, train_motions, test_motions=[], sample_every=4, data_set=
    # Make sure the data is downloaded.
    all_motions = train_motions + test_motions
    resource = cmu_urls_files(([subject], [all_motions]))
-    data_resources[data_set] = data_resources['cmu_mocap_full']
+    data_resources[data_set] = data_resources['cmu_mocap_full'].copy()
    data_resources[data_set]['files'] = resource['files']
    data_resources[data_set]['urls'] = resource['urls']
    if resource['urls']:
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@ -12,6 +12,7 @@ import ctypes
 from ctypes import byref, c_char, c_int, c_double # TODO
 # import scipy.lib.lapack
 import scipy
+import warnings

 if np.all(np.float64((scipy.__version__).split('.')[:2]) >= np.array([0, 12])):
    import scipy.linalg.lapack as lapack
@ -25,6 +26,9 @@ try:
    assert hasattr(_blaslib, 'dsyr_')
 except AssertionError:
    _blas_available = False
+except AttributeError as e:
+    _blas_available = False
+    warnings.warn("warning: caught this exception:" + str(e))

 def dtrtrs(A, B, lower=0, trans=0, unitdiag=0):
    """
--- a/GPy/util/mocap.py
+++ b/GPy/util/mocap.py
@ -67,14 +67,14 @@ class tree:
        for i in range(len(self.vertices)):
            if self.vertices[i].id == id:
                return i
-        raise Error, 'Reverse look up of id failed.'
+        raise ValueError('Reverse look up of id failed.')

    def get_index_by_name(self, name):
        """Give the index associated with a given vertex name."""
        for i in range(len(self.vertices)):
            if self.vertices[i].name == name:
                return i
-        raise Error, 'Reverse look up of name failed.'
+        raise ValueError('Reverse look up of name failed.')

    def order_vertices(self):
        """Order vertices in the graph such that parents always have a lower index than children."""
@ -433,6 +433,8 @@ class acclaim_skeleton(skeleton):
        lin = self.read_line(fid)
        while lin != ':DEGREES':
            lin = self.read_line(fid)
+            if lin == '':
+                raise ValueError('Could not find :DEGREES in ' + fid.name)

        counter = 0
        lin = self.read_line(fid)
@ -443,9 +445,9 @@ class acclaim_skeleton(skeleton):
                if frame_no:
                    counter += 1
                    if counter != frame_no:
-                        raise Error, 'Unexpected frame number.'
+                        raise ValueError('Unexpected frame number.')
                else:
-                    raise Error, 'Single bone name  ...'
+                    raise ValueError('Single bone name  ...')
            else:
                ind = self.get_index_by_name(parts[0])
                bones[ind].append(np.array([float(channel) for channel in parts[1:]]))
@ -573,7 +575,7 @@ class acclaim_skeleton(skeleton):
                        return
                    lin = self.read_line(fid)
            else:
-                raise Error, 'Unrecognised file format'
+                raise ValueError('Unrecognised file format')
            self.finalize()
            
    def read_units(self, fid):