diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py
index ce3bd6d8..3f3eb447 100644
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@@ -3,6 +3,11 @@
 """
 Gaussian Processes classification examples
 """
+try:
+    import matplotlib.pyplot as plt
+except ImportError:
+    MPL_AVAILABLE = False
+
 import GPy
 
 default_seed = 10000
@@ -78,9 +83,7 @@ def toy_linear_1d_classification(seed=default_seed, optimize=True, plot=True):
         # m.pseudo_EM()
 
     # Plot
-    if plot:
-        from matplotlib import pyplot as plt
-
+    if MPL_AVAILABLE and plot:
         fig, axes = plt.subplots(2, 1)
         m.plot_f(ax=axes[0])
         m.plot(ax=axes[1])
@@ -117,15 +120,12 @@ def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=
 
     # Optimize
     if optimize:
-        try:
-            m.optimize("scg", messages=1)
-        except Exception as e:
-            return m
+        m.optimize("scg", messages=True)
+
+    return m
 
     # Plot
-    if plot:
-        from matplotlib import pyplot as plt
-
+    if MPL_AVAILABLE and plot:
         fig, axes = plt.subplots(2, 1)
         m.plot_f(ax=axes[0])
         m.plot(ax=axes[1])
@@ -162,9 +162,7 @@ def sparse_toy_linear_1d_classification(
         m.optimize()
 
     # Plot
-    if plot:
-        from matplotlib import pyplot as plt
-
+    if MPL_AVAILABLE and plot:
         fig, axes = plt.subplots(2, 1)
         m.plot_f(ax=axes[0])
         m.plot(ax=axes[1])
@@ -207,9 +205,7 @@ def sparse_toy_linear_1d_classification_uncertain_input(
         m.optimize()
 
     # Plot
-    if plot:
-        from matplotlib import pyplot as plt
-
+    if MPL_AVAILABLE and plot:
         fig, axes = plt.subplots(2, 1)
         m.plot_f(ax=axes[0])
         m.plot(ax=axes[1])
@@ -259,9 +255,7 @@ def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True):
         print(m)
 
     # Plot
-    if plot:
-        from matplotlib import pyplot as plt
-
+    if MPL_AVAILABLE and plot:
         fig, axes = plt.subplots(2, 1)
         m.plot_f(ax=axes[0])
         m.plot(ax=axes[1])
@@ -314,7 +308,7 @@ def crescent_data(
     if optimize:
         m.optimize(messages=1)
 
-    if plot:
+    if MPL_AVAILABLE and plot:
         m.plot()
 
     print(m)
diff --git a/GPy/examples/non_gaussian.py b/GPy/examples/non_gaussian.py
index cb86c83c..c685c4db 100644
--- a/GPy/examples/non_gaussian.py
+++ b/GPy/examples/non_gaussian.py
@@ -7,8 +7,8 @@ from GPy.util import datasets
 
 try:
     import matplotlib.pyplot as plt
-except:
-    pass
+except ImportError:
+    MPL_AVAILABLE = False
 
 
 def student_t_approx(optimize=True, plot=True):
@@ -102,7 +102,7 @@ def student_t_approx(optimize=True, plot=True):
         print("Corrupt student t")
         m4.optimize(optimizer, messages=1)
 
-    if plot:
+    if MPL_AVAILABLE and plot:
         plt.figure(1)
         plt.suptitle("Gaussian likelihood")
         ax = plt.subplot(211)
@@ -251,7 +251,7 @@ def boston_example(optimize=True, plot=True):
             print(pred_density)
             print(mstu_t)
 
-    if plot:
+    if MPL_AVAILABLE and plot:
         plt.figure()
         plt.scatter(X_test[:, data_axis_plot], Y_test_pred[0])
         plt.scatter(X_test[:, data_axis_plot], Y_test, c="r", marker="x")
@@ -270,7 +270,7 @@ def boston_example(optimize=True, plot=True):
     print("Average scores: {}".format(np.mean(score_folds, 1)))
     print("Average pred density: {}".format(np.mean(pred_density, 1)))
 
-    if plot:
+    if MPL_AVAILABLE and plot:
         # Plotting
         stu_t_legends = ["Student T, df={}".format(df) for df in degrees_freedoms]
         legends = ["Baseline", "Gaussian", "Laplace Approx Gaussian"] + stu_t_legends
diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py
index 9ade6552..fb1c9323 100644
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@@ -5,9 +5,10 @@
 Gaussian Processes regression examples
 """
 try:
-    from matplotlib import pyplot as pb
-except:
-    pass
+    import matplotlib.pyplot as plt
+except ImportError:
+    MPL_AVAILABLE = False
+
 import numpy as np
 import GPy
 
@@ -29,7 +30,7 @@ def olympic_marathon_men(optimize=True, plot=True):
 
     if optimize:
         m.optimize("bfgs", max_iters=200)
-    if plot:
+    if MPL_AVAILABLE and plot:
         m.plot(plot_limits=(1850, 2050))
 
     return m
@@ -52,7 +53,7 @@ def coregionalization_toy(optimize=True, plot=True):
     if optimize:
         m.optimize("bfgs", max_iters=100)
 
-    if plot:
+    if MPL_AVAILABLE and plot:
         slices = GPy.util.multioutput.get_slices([X1, X2])
         m.plot(
             fixed_inputs=[(1, 0)],
@@ -63,15 +64,14 @@ def coregionalization_toy(optimize=True, plot=True):
             fixed_inputs=[(1, 1)],
             which_data_rows=slices[1],
             Y_metadata={"output_index": 1},
-            ax=pb.gca(),
+            ax=plt.gca(),
         )
     return m
 
 
 def coregionalization_sparse(optimize=True, plot=True):
-    """
-    A simple demonstration of coregionalization on two sinusoidal functions using sparse approximations.
-    """
+    """A simple demonstration of coregionalization on two sinusoidal
+    functions using sparse approximations. """
     # build a design matrix with a column of integers indicating the output
     X1 = np.random.rand(50, 1) * 8
     X2 = np.random.rand(30, 1) * 5
@@ -85,7 +85,7 @@ def coregionalization_sparse(optimize=True, plot=True):
     if optimize:
         m.optimize("bfgs", max_iters=100)
 
-    if plot:
+    if MPL_AVAILABLE and plot:
         slices = GPy.util.multioutput.get_slices([X1, X2])
         m.plot(
             fixed_inputs=[(1, 0)],
@@ -96,9 +96,9 @@ def coregionalization_sparse(optimize=True, plot=True):
             fixed_inputs=[(1, 1)],
             which_data_rows=slices[1],
             Y_metadata={"output_index": 1},
-            ax=pb.gca(),
+            ax=plt.gca(),
         )
-        pb.ylim(-3,)
+        plt.ylim(-3,)
 
     return m
 
@@ -187,11 +187,11 @@ def multiple_optima(
     lls = GPy.examples.regression._contour_data(
         data, length_scales, log_SNRs, GPy.kern.RBF
     )
-    if plot:
-        pb.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=pb.cm.jet)
-        ax = pb.gca()
-        pb.xlabel("length scale")
-        pb.ylabel("log_10 SNR")
+    if MPL_AVAILABLE and plot:
+        plt.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=plt.cm.jet)
+        ax = plt.gca()
+        plt.xlabel("length scale")
+        plt.ylabel("log_10 SNR")
 
         xlim = ax.get_xlim()
         ylim = ax.get_ylim()
@@ -202,7 +202,9 @@ def multiple_optima(
     optim_point_y = np.empty(2)
     np.random.seed(seed=seed)
     for i in range(0, model_restarts):
-        # kern = GPy.kern.RBF(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.))
+        # kern = GPy.kern.RBF(
+        #     1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.)
+        # )
         kern = GPy.kern.RBF(
             1, variance=np.random.uniform(1e-3, 1), lengthscale=np.random.uniform(5, 50)
         )
@@ -219,8 +221,8 @@ def multiple_optima(
         optim_point_x[1] = m.rbf.lengthscale
         optim_point_y[1] = np.log10(m.rbf.variance) - np.log10(m.likelihood.variance)
 
-        if plot:
-            pb.arrow(
+        if MPL_AVAILABLE and plot:
+            plt.arrow(
                 optim_point_x[0],
                 optim_point_y[0],
                 optim_point_x[1] - optim_point_x[0],
@@ -233,7 +235,7 @@ def multiple_optima(
             )
         models.append(m)
 
-    if plot:
+    if MPL_AVAILABLE and plot:
         ax.set_xlim(xlim)
         ax.set_ylim(ylim)
     return m  # (models, lls)
@@ -289,7 +291,7 @@ def olympic_100m_men(optimize=True, plot=True):
     if optimize:
         m.optimize("bfgs", max_iters=200)
 
-    if plot:
+    if MPL_AVAILABLE and plot:
         m.plot(plot_limits=(1850, 2050))
     return m
 
@@ -308,14 +310,16 @@ def toy_rbf_1d(optimize=True, plot=True):
 
     if optimize:
         m.optimize("bfgs")
-    if plot:
+    if MPL_AVAILABLE and plot:
         m.plot()
 
     return m
 
 
 def toy_rbf_1d_50(optimize=True, plot=True):
-    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
+    """Run a simple demonstration of a standard Gaussian process fitting
+it to data sampled from an RBF covariance."""
+
     try:
         import pods
     except ImportError:
@@ -328,7 +332,7 @@ def toy_rbf_1d_50(optimize=True, plot=True):
 
     if optimize:
         m.optimize("bfgs")
-    if plot:
+    if MPL_AVAILABLE and plot:
         m.plot()
 
     return m
@@ -353,10 +357,10 @@ def toy_poisson_rbf_1d_laplace(optimize=True, plot=True):
 
     if optimize:
         m.optimize(optimizer)
-    if plot:
+    if MPL_AVAILABLE and plot:
         m.plot()
         # plot the real underlying rate function
-        pb.plot(X, np.exp(f_true), "--k", linewidth=2)
+        plt.plot(X, np.exp(f_true), "--k", linewidth=2)
 
     return m
 
@@ -396,7 +400,7 @@ def toy_ARD(
     if optimize:
         m.optimize(optimizer="scg", max_iters=max_iters)
 
-    if plot:
+    if MPL_AVAILABLE and plot:
         m.kern.plot_ARD()
 
     return m
@@ -438,7 +442,7 @@ def toy_ARD_sparse(
     if optimize:
         m.optimize(optimizer="scg", max_iters=max_iters)
 
-    if plot:
+    if MPL_AVAILABLE and plot:
         m.kern.plot_ARD()
 
     return m
@@ -461,12 +465,12 @@ def robot_wireless(max_iters=100, kernel=None, optimize=True, plot=True):
         m.optimize(max_iters=max_iters)
 
     Xpredict = m.predict(data["Ytest"])[0]
-    if plot:
-        pb.plot(data["Xtest"][:, 0], data["Xtest"][:, 1], "r-")
-        pb.plot(Xpredict[:, 0], Xpredict[:, 1], "b-")
-        pb.axis("equal")
-        pb.title("WiFi Localization with Gaussian Processes")
-        pb.legend(("True Location", "Predicted Location"))
+    if MPL_AVAILABLE and plot:
+        plt.plot(data["Xtest"][:, 0], data["Xtest"][:, 1], "r-")
+        plt.plot(Xpredict[:, 0], Xpredict[:, 1], "b-")
+        plt.axis("equal")
+        plt.title("WiFi Localization with Gaussian Processes")
+        plt.legend(("True Location", "Predicted Location"))
 
     sse = ((data["Xtest"] - Xpredict) ** 2).sum()
 
@@ -517,7 +521,7 @@ def sparse_GP_regression_1D(
     if optimize:
         m.optimize("tnc", max_iters=max_iters)
 
-    if plot:
+    if MPL_AVAILABLE and plot:
         m.plot()
 
     return m
@@ -550,7 +554,7 @@ def sparse_GP_regression_2D(
         m.optimize("tnc", messages=1, max_iters=max_iters)
 
     # plot
-    if plot:
+    if MPL_AVAILABLE and plot:
         m.plot()
 
     print(m)
@@ -559,7 +563,8 @@ def sparse_GP_regression_2D(
 
 def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
     """Run a 1D example of a sparse GP regression with uncertain inputs."""
-    fig, axes = pb.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)
+    if MPL_AVAILABLE and plot:
+        fig, axes = plt.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)
 
     # sample inputs and outputs
     S = np.ones((20, 1))
@@ -575,7 +580,7 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
     if optimize:
         m.optimize("scg", messages=1, max_iters=max_iters)
 
-    if plot:
+    if MPL_AVAILABLE and plot:
         m.plot(ax=axes[0])
         axes[0].set_title("no input uncertainty")
     print(m)
@@ -584,7 +589,7 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
     m = GPy.models.SparseGPRegression(X, Y, kernel=GPy.kern.RBF(1), Z=Z, X_variance=S)
     if optimize:
         m.optimize("scg", messages=1, max_iters=max_iters)
-    if plot:
+    if MPL_AVAILABLE and plot:
         m.plot(ax=axes[1])
         axes[1].set_title("with input uncertainty")
         fig.canvas.draw()
@@ -610,7 +615,7 @@ def simple_mean_function(max_iters=100, optimize=True, plot=True):
     m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
     if optimize:
         m.optimize(max_iters=max_iters)
-    if plot:
+    if MPL_AVAILABLE and plot:
         m.plot(plot_limits=(-10, 15))
     return m
 
@@ -633,12 +638,12 @@ def parametric_mean_function(max_iters=100, optimize=True, plot=True):
     m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
     if optimize:
         m.optimize(max_iters=max_iters)
-    if plot:
+    if MPL_AVAILABLE and plot:
         m.plot()
     return m
 
 
-def warped_gp_cubic_sine(max_iters=100):
+def warped_gp_cubic_sine(max_iters=100, plot=True):
     """
     A test replicating the cubic sine regression problem from
     Snelson's paper.
@@ -652,7 +657,7 @@ def warped_gp_cubic_sine(max_iters=100):
     warp_k = GPy.kern.RBF(1)
     warp_f = GPy.util.warping_functions.TanhFunction(n_terms=2)
     warp_m = GPy.models.WarpedGP(X, Y, kernel=warp_k, warping_function=warp_f)
-    warp_m[".*\.d"].constrain_fixed(1.0)
+    warp_m[".*\\.d"].constrain_fixed(1.0)
     m = GPy.models.GPRegression(X, Y)
     m.optimize_restarts(
         parallel=False, robust=True, num_restarts=5, max_iters=max_iters
@@ -666,35 +671,18 @@ def warped_gp_cubic_sine(max_iters=100):
     print(warp_m)
     print(warp_m[".*warp.*"])
 
-    warp_m.predict_in_warped_space = False
-    warp_m.plot(title="Warped GP - Latent space")
-    warp_m.predict_in_warped_space = True
-    warp_m.plot(title="Warped GP - Warped space")
-    m.plot(title="Standard GP")
-    warp_m.plot_warping()
-    pb.show()
+    if MPL_AVAILABLE and plot:
+        warp_m.predict_in_warped_space = False
+        warp_m.plot(title="Warped GP - Latent space")
+        warp_m.predict_in_warped_space = True
+        warp_m.plot(title="Warped GP - Warped space")
+        m.plot(title="Standard GP")
+        warp_m.plot_warping()
+        plt.show()
     return warp_m
 
 
-def multioutput_gp_with_derivative_observations():
-    def plot_gp_vs_real(
-        m, x, yreal, size_inputs, title, fixed_input=1, xlim=[0, 11], ylim=[-1.5, 3]
-    ):
-        fig, ax = pb.subplots()
-        ax.set_title(title)
-        pb.plot(x, yreal, "r", label="Real function")
-        rows = (
-            slice(0, size_inputs[0])
-            if fixed_input == 0
-            else slice(size_inputs[0], size_inputs[0] + size_inputs[1])
-        )
-        m.plot(
-            fixed_inputs=[(1, fixed_input)],
-            which_data_rows=rows,
-            xlim=xlim,
-            ylim=ylim,
-            ax=ax,
-        )
+def multioutput_gp_with_derivative_observations(plot=True):
 
     f = lambda x: np.sin(x) + 0.1 * (x - 2.0) ** 2 - 0.005 * x ** 3
     fd = lambda x: np.cos(x) + 0.2 * (x - 2.0) - 0.015 * x ** 2
@@ -735,27 +723,47 @@ def multioutput_gp_with_derivative_observations():
     # Optimize the model
     m.optimize(messages=0, ipython_notebook=False)
 
-    # Plot the model, the syntax is same as for multioutput models:
-    plot_gp_vs_real(
-        m,
-        xpred,
-        ydpred_true,
-        [x.shape[0], xd.shape[0]],
-        title="Latent function derivatives",
-        fixed_input=1,
-        xlim=[0, 11],
-        ylim=[-1.5, 3],
-    )
-    plot_gp_vs_real(
-        m,
-        xpred,
-        ypred_true,
-        [x.shape[0], xd.shape[0]],
-        title="Latent function",
-        fixed_input=0,
-        xlim=[0, 11],
-        ylim=[-1.5, 3],
-    )
+    if MPL_AVAILABLE and plot:
+        def plot_gp_vs_real(
+            m, x, yreal, size_inputs, title, fixed_input=1, xlim=[0, 11], ylim=[-1.5, 3]
+        ):
+            fig, ax = plt.subplots()
+            ax.set_title(title)
+            plt.plot(x, yreal, "r", label="Real function")
+            rows = (
+                slice(0, size_inputs[0])
+                if fixed_input == 0
+                else slice(size_inputs[0], size_inputs[0] + size_inputs[1])
+            )
+            m.plot(
+                fixed_inputs=[(1, fixed_input)],
+                which_data_rows=rows,
+                xlim=xlim,
+                ylim=ylim,
+                ax=ax,
+            )
+        
+        # Plot the model, the syntax is same as for multioutput models:
+        plot_gp_vs_real(
+            m,
+            xpred,
+            ydpred_true,
+            [x.shape[0], xd.shape[0]],
+            title="Latent function derivatives",
+            fixed_input=1,
+            xlim=[0, 11],
+            ylim=[-1.5, 3],
+        )
+        plot_gp_vs_real(
+            m,
+            xpred,
+            ypred_true,
+            [x.shape[0], xd.shape[0]],
+            title="Latent function",
+            fixed_input=0,
+            xlim=[0, 11],
+            ylim=[-1.5, 3],
+        )
 
     # making predictions for the values:
     mu, var = m.predict_noiseless(Xnew=[xpred, np.empty((0, 1))])