diff --git a/GPy/examples/__init__.py b/GPy/examples/__init__.py
index 4e9e984e..8c7a5b65 100644
--- a/GPy/examples/__init__.py
+++ b/GPy/examples/__init__.py
@@ -1,6 +1,11 @@
 # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
+"""
+Examples for GPy.
 
+The examples in this package usually depend on `pods <https://github.com/sods/ods>`_ so make sure 
+you have that installed before running examples.
+"""
 from . import classification
 from . import regression
 from . import dimensionality_reduction
diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py
index ae4e9ba3..af1c8c7a 100644
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@@ -1,7 +1,5 @@
 # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
 """
 Gaussian Processes classification examples
 """
diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py
index c4464db9..dd0cfa25 100644
--- a/GPy/kern/__init__.py
+++ b/GPy/kern/__init__.py
@@ -1,3 +1,11 @@
+"""
+Kernel module: the package in which all kernels live.
+
+.. automodule:: GPy.kern.src
+   :members:
+   :private-members:
+"""
+from . import _src as src
 from ._src.kern import Kern
 from ._src.rbf import RBF
 from ._src.linear import Linear, LinearFull
@@ -20,5 +28,4 @@ from ._src.splitKern import SplitKern,DEtime
 from ._src.splitKern import DEtime as DiffGenomeKern
 from ._src.spline import Spline
 from ._src.eq_ode2 import EQ_ODE2
-from ._src.basis_funcs import LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel
-
+from ._src.basis_funcs import LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel
\ No newline at end of file
diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py
index 49f2e2fb..049d2814 100644
--- a/GPy/kern/_src/kern.py
+++ b/GPy/kern/_src/kern.py
@@ -1,6 +1,5 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-
 import sys
 import numpy as np
 from ...core.parameterization.parameterized import Parameterized
diff --git a/GPy/plotting/__init__.py b/GPy/plotting/__init__.py
index 2a297f28..1787d707 100644
--- a/GPy/plotting/__init__.py
+++ b/GPy/plotting/__init__.py
@@ -55,7 +55,8 @@ if config.get('plotting', 'library') is not 'none':
     bayesian_gplvm_minibatch.BayesianGPLVMMiniBatch.plot_steepest_gradient_map = gpy_plot.latent_plots.plot_steepest_gradient_map
     
     from ..kern import Kern
-    #Kern.plot_covariance = gpy_plot.kern_plots.plot_kern
+    Kern.plot_covariance = gpy_plot.kernel_plots.plot_covariance
+    Kern.plot_ARD = gpy_plot.kernel_plots.plot_ARD  # own name, so plot_covariance is not overwritten
     
     # Variational plot!
 
diff --git a/GPy/plotting/abstract_plotting_library.py b/GPy/plotting/abstract_plotting_library.py
index b7b0c1aa..dd59ef2e 100644
--- a/GPy/plotting/abstract_plotting_library.py
+++ b/GPy/plotting/abstract_plotting_library.py
@@ -57,7 +57,7 @@ class AbstractPlottingLibrary(object):
         return self.__defaults
         #===============================================================================
     
-    def get_new_canvas(self, xlabel=None, ylabel=None, zlabel=None, title=None, projection='2d', legend=True, **kwargs):
+    def get_new_canvas(self, projection='2d', legend=True, **kwargs):
         """
         Return a canvas, kwargupdate for your plotting library. 
         
@@ -67,19 +67,14 @@ class AbstractPlottingLibrary(object):
         
         the kwargs are plotting library specific kwargs!
 
-        :param bool plot_3d: whether to plot in 3d.
-        :param xlabel: the label to put on the xaxis
-        :param ylabel: the label to put on the yaxis
-        :param zlabel: the label to put on the zaxis (if plotting in 3d)
-        :param title: the title of the plot
-        :param legend: whether to put a legend on
+        :param {'2d'|'3d'} projection: The projection to use.
 
         E.g. in matplotlib this means it deletes references to ax, as
         plotting is done on the axis itself and is not a kwarg. 
         """
         raise NotImplementedError("Implement all plot functions in AbstractPlottingLibrary in order to use your own plotting library")
 
-    def show_canvas(self, canvas, plots, xlim=None, ylim=None, zlim=None, **kwargs):
+    def show_canvas(self, canvas, plots, xlabel=None, ylabel=None, zlabel=None, title=None, xlim=None, ylim=None, zlim=None, **kwargs):
         """
         Show the canvas given. 
         plots is a dictionary with the plots
@@ -87,6 +82,11 @@ class AbstractPlottingLibrary(object):
         
         the kwargs are plotting library specific kwargs!
 
+        :param xlabel: the label to put on the xaxis
+        :param ylabel: the label to put on the yaxis
+        :param zlabel: the label to put on the zaxis (if plotting in 3d)
+        :param title: the title of the plot
+        :param legend: if True, plot a legend; if an int, use that many columns in the legend
         :param (float, float) xlim: the limits for the xaxis
         :param (float, float) ylim: the limits for the yaxis
         :param (float, float) zlim: the limits for the zaxis (if plotting in 3d)
@@ -185,6 +185,8 @@ class AbstractPlottingLibrary(object):
         Just ignore the plot_function, if you do not have the option
         to have interactive changes.
         
+        The origin of the image shown is (0,0), such that X[0,0] gets plotted at [0,0] of the image!
+        
         the kwargs are plotting library specific kwargs!
         """
         raise NotImplementedError("Implement all plot functions in AbstractPlottingLibrary in order to use your own plotting library")
@@ -211,6 +213,7 @@ class AbstractPlottingLibrary(object):
         :param str label: the label for the heatmap
         :param plot_function: the function, which generates new data for given input locations X
         :param int resolution: the resolution of the interactive plot redraw - this is only needed when giving a plot_function
+        :return: a list of both the heatmap and annotation plots [heatmap, annotation] 
         """
         raise NotImplementedError("Implement all plot functions in AbstractPlottingLibrary in order to use your own plotting library")
 
diff --git a/GPy/plotting/gpy_plot/kernel_plots.py b/GPy/plotting/gpy_plot/kernel_plots.py
index 6f457a3a..26eb4b8c 100644
--- a/GPy/plotting/gpy_plot/kernel_plots.py
+++ b/GPy/plotting/gpy_plot/kernel_plots.py
@@ -68,15 +68,7 @@ def plot_ARD(kernel, filtering=None, **kwargs):
     ax.set_xlim(-.5, kernel.input_dim - .5)
     add_bar_labels(fig, ax, [bars[-1]], bottom=bottom-last_bottom)
 
-    if legend:
-        if title is '':
-            mode = 'expand'
-            if len(bars) > 1:
-                mode = 'expand'
-            ax.legend(bbox_to_anchor=(0., 1.02, 1., 1.02), loc=3,
-                      ncol=len(bars), mode=mode, borderaxespad=0.)
-            fig.tight_layout(rect=(0, 0, 1, .9))
-        else:
-            ax.legend()
+    return dict(barplots=bars)
 
-    return dict(barplots=bars)
\ No newline at end of file
+def plot_covariance(kernel=None, **kwargs):  # placeholder: takes the kernel so it can be called as a Kern method
+    pass
\ No newline at end of file
diff --git a/GPy/plotting/gpy_plot/latent_plots.py b/GPy/plotting/gpy_plot/latent_plots.py
index 06f3f226..00037709 100644
--- a/GPy/plotting/gpy_plot/latent_plots.py
+++ b/GPy/plotting/gpy_plot/latent_plots.py
@@ -29,19 +29,23 @@
 #===============================================================================
 import numpy as np
 from . import pl
-from .plot_util import get_x_y_var, get_free_dims, get_which_data_ycols,\
+from .plot_util import get_x_y_var, get_which_data_ycols,\
     get_which_data_rows, update_not_existing_kwargs, helper_predict_with_model,\
-    helper_for_plot_data
-import itertools
-from GPy.plotting.gpy_plot.plot_util import scatter_label_generator, subsample_X
+    helper_for_plot_data, scatter_label_generator, subsample_X,\
+    find_best_layout_for_subplots
 
 def _wait_for_updates(view, updates):
-    if updates:
-        clear = raw_input('yes or enter to deactivate updates - otherwise still do updates - use plots[imshow].deactivate() to clear')
-        if clear.lower() in 'yes' or clear == '':
+    try:
+        if updates:
+            clear = raw_input('yes or enter to deactivate updates - otherwise still do updates - use plots[imshow].deactivate() to clear')
+            if clear.lower() in 'yes' or clear == '':
+                view.deactivate()
+        else:
             view.deactivate()
-    else:
-        view.deactivate()
+    except AttributeError:
+        # No updateable view:
+        pass
+            
 
 def plot_prediction_fit(self, plot_limits=None,
         which_data_rows='all', which_data_ycols='all', 
@@ -160,12 +164,11 @@ def _plot_magnification(self, canvas, input_1, input_2, Xgrid,
         Xtest_full = np.zeros((x.shape[0], Xgrid.shape[1]))
         Xtest_full[:, [input_1, input_2]] = x
         mf = self.predict_magnification(Xtest_full, kern=kern, mean=mean, covariance=covariance)
-        return mf.reshape(resolution, resolution).T[::-1, :]
-
+        return mf.reshape(resolution, resolution).T
     imshow_kwargs = update_not_existing_kwargs(imshow_kwargs, pl.defaults.magnification)
     Y = plot_function(Xgrid[:, [input_1, input_2]])
     view = pl.imshow(canvas, Y, 
-                     (xmin[0], xmin[1], xmax[1], xmax[1]), 
+                     (xmin[0], xmax[0], xmin[1], xmax[1]), 
                      None, plot_function, resolution,
                      vmin=Y.min(), vmax=Y.max(), 
                      **imshow_kwargs)
@@ -177,8 +180,7 @@ def plot_magnification(self, labels=None, which_indices=None,
                 updates=False, 
                 mean=True, covariance=True, 
                 kern=None, num_samples=1000,
-                imshow_kwargs=None,
-                **scatter_kwargs):
+                scatter_kwargs=None, **imshow_kwargs):
     """
     Plot the magnification factor of the GP on the inputs. This is the 
     density of the GP as a gray scale.
@@ -199,12 +201,16 @@ def plot_magnification(self, labels=None, which_indices=None,
     :param kwargs: the kwargs for the scatter plots
     """
     input_1, input_2 = self.get_most_significant_input_dimensions(which_indices)
-    canvas, scatter_kwargs = pl.get_new_canvas(xlabel='latent dimension %i' % input_1, ylabel='latent dimension %i' % input_2, **scatter_kwargs)
+    canvas, imshow_kwargs = pl.get_new_canvas(**imshow_kwargs)
     X, _, _, _, _, Xgrid, _, _, xmin, xmax, resolution = helper_for_plot_data(self, plot_limits, (input_1, input_2), None, resolution)    
-    scatters = _plot_latent_scatter(self, canvas, X, input_1, input_2, labels, marker, num_samples, **scatter_kwargs)
-    if imshow_kwargs is None: imshow_kwargs = {}
+    scatters = _plot_latent_scatter(self, canvas, X, input_1, input_2, labels, marker, num_samples, **scatter_kwargs or {})
     view = _plot_magnification(self, canvas, input_1, input_2, Xgrid, xmin, xmax, resolution, mean, covariance, kern, **imshow_kwargs)
-    plots = pl.show_canvas(canvas, dict(scatter=scatters, imshow=view), legend=legend and (labels is not None), xlim=(xmin[0], xmax[0]), ylim=(xmin[1], xmax[1]))
+    if (legend is True) and (labels is not None):
+        legend = find_best_layout_for_subplots(len(np.unique(labels)))[1]
+    plots = pl.show_canvas(canvas, dict(scatter=scatters, imshow=view), 
+                           legend=legend, 
+                           xlim=(xmin[0], xmax[0]), ylim=(xmin[1], xmax[1]),
+                           xlabel='latent dimension %i' % input_1, ylabel='latent dimension %i' % input_2)
     _wait_for_updates(view, updates)
     return plots
 
@@ -219,12 +225,12 @@ def _plot_latent(self, canvas, input_1, input_2, Xgrid,
         Xtest_full = np.zeros((x.shape[0], Xgrid.shape[1]))
         Xtest_full[:, [input_1, input_2]] = x
         mf = np.log(self.predict(Xtest_full, kern=kern)[1])
-        return mf.reshape(resolution, resolution).T[::-1, :]
+        return mf.reshape(resolution, resolution).T
 
     imshow_kwargs = update_not_existing_kwargs(imshow_kwargs, pl.defaults.latent)
-    Y = plot_function(Xgrid[:, [input_1, input_2]]).reshape(resolution, resolution).T[::-1, :]
+    Y = plot_function(Xgrid[:, [input_1, input_2]]).reshape(resolution, resolution).T
     view = pl.imshow(canvas, Y, 
-                     (xmin[0], xmin[1], xmax[1], xmax[1]), 
+                     (xmin[0], xmax[0], xmin[1], xmax[1]), 
                      None, plot_function, resolution,
                      vmin=Y.min(), vmax=Y.max(), 
                      **imshow_kwargs)
@@ -236,7 +242,7 @@ def plot_latent(self, labels=None, which_indices=None,
                 updates=False, 
                 kern=None, marker='<>^vsd', 
                 num_samples=1000,
-                imshow_kwargs=None, **scatter_kwargs):
+                scatter_kwargs=None, **imshow_kwargs):
     """
     Plot the latent space of the GP on the inputs. This is the 
     density of the GP posterior as a grey scale and the 
@@ -256,12 +262,16 @@ def plot_latent(self, labels=None, which_indices=None,
     :param scatter_kwargs: the kwargs for the scatter plots
     """
     input_1, input_2 = self.get_most_significant_input_dimensions(which_indices)
-    canvas, scatter_kwargs = pl.get_new_canvas(xlabel='latent dimension %i' % input_1, ylabel='latent dimension %i' % input_2, **scatter_kwargs)
+    canvas, imshow_kwargs = pl.get_new_canvas(**imshow_kwargs)
     X, _, _, _, _, Xgrid, _, _, xmin, xmax, resolution = helper_for_plot_data(self, plot_limits, (input_1, input_2), None, resolution)    
-    scatters = _plot_latent_scatter(self, canvas, X, input_1, input_2, labels, marker, num_samples, **scatter_kwargs)
-    if imshow_kwargs is None: imshow_kwargs = {}
+    scatters = _plot_latent_scatter(self, canvas, X, input_1, input_2, labels, marker, num_samples, **scatter_kwargs or {})
     view = _plot_latent(self, canvas, input_1, input_2, Xgrid, xmin, xmax, resolution, kern, **imshow_kwargs)
-    plots = pl.show_canvas(canvas, dict(scatter=scatters, imshow=view), legend=legend and (labels is not None), xlim=(xmin[0], xmax[0]), ylim=(xmin[1], xmax[1]))
+    if (legend is True) and (labels is not None):
+        legend = find_best_layout_for_subplots(len(np.unique(labels)))[1]
+    plots = pl.show_canvas(canvas, dict(scatter=scatters, imshow=view), 
+                           legend=legend, 
+                           xlim=(xmin[0], xmax[0]), ylim=(xmin[1], xmax[1]),
+                           xlabel='latent dimension %i' % input_1, ylabel='latent dimension %i' % input_2)
     _wait_for_updates(view, updates)
     return plots
 
@@ -275,14 +285,14 @@ def _plot_steepest_gradient_map(self, canvas, input_1, input_2, Xgrid,
         Xgrid[:, [input_1, input_2]] = x
         dmu_dX = self.predictive_gradients(Xgrid, kern=kern)[0].sum(1)
         argmax = np.argmax(dmu_dX, 1).astype(int)
-        return dmu_dX.max(1).reshape(resolution, resolution).T[::-1, :], np.array(output_labels)[argmax].reshape(resolution, resolution)
+        return dmu_dX.max(1).reshape(resolution, resolution).T, np.array(output_labels)[argmax].reshape(resolution, resolution)
     Y, annotation = plot_function(Xgrid[:, [input_1, input_2]])
     annotation_kwargs = update_not_existing_kwargs(annotation_kwargs or {}, pl.defaults.annotation)
     imshow_kwargs = update_not_existing_kwargs(imshow_kwargs or {}, pl.defaults.gradient)
-    annotation = pl.annotation_heatmap(canvas, Y, annotation, (xmin[0], xmin[1], xmax[1], xmax[1]), 
+    imshow, annotation = pl.annotation_heatmap(canvas, Y, annotation, (xmin[0], xmax[0], xmin[1], xmax[1]), 
                        None, plot_function, resolution, imshow_kwargs=imshow_kwargs, **annotation_kwargs)
     imshow_kwargs = update_not_existing_kwargs(imshow_kwargs, pl.defaults.gradient)
-    return dict(annotation=annotation)
+    return dict(heatmap=imshow, annotation=annotation)
 
 def plot_steepest_gradient_map(self, output_labels=None, data_labels=None, which_indices=None,
                 resolution=15, legend=True,
@@ -300,7 +310,7 @@ def plot_steepest_gradient_map(self, output_labels=None, data_labels=None, which
     :param array-like labels: a label for each data point (row) of the inputs
     :param (int, int) which_indices: which input dimensions to plot against each other
     :param int resolution: the resolution at which we predict the magnification factor
-    :param bool legend: whether to plot the legend on the figure
+    :param bool legend: whether to plot the legend on the figure; if an int, use that many legend columns
     :param plot_limits: the plot limits for the plot
     :type plot_limits: (xmin, xmax, ymin, ymax) or ((xmin, xmax), (ymin, ymax))
     :param bool updates: if possible, make interactive updates using the specific library you are using
@@ -312,12 +322,16 @@ def plot_steepest_gradient_map(self, output_labels=None, data_labels=None, which
     :param scatter_kwargs: the kwargs for the scatter plots
     """
     input_1, input_2 = self.get_most_significant_input_dimensions(which_indices)
-    canvas, imshow_kwargs = pl.get_new_canvas(xlabel='latent dimension %i' % input_1, ylabel='latent dimension %i' % input_2, **imshow_kwargs)
+    canvas, imshow_kwargs = pl.get_new_canvas(**imshow_kwargs)
     X, _, _, _, _, Xgrid, _, _, xmin, xmax, resolution = helper_for_plot_data(self, plot_limits, (input_1, input_2), None, resolution)    
-    scatters = _plot_latent_scatter(self, canvas, X, input_1, input_2, data_labels, marker, num_samples, **scatter_kwargs or {})
-    view = _plot_steepest_gradient_map(self, canvas, input_1, input_2, Xgrid, xmin, xmax, resolution, output_labels, kern, annotation_kwargs=annotation_kwargs, **imshow_kwargs)
-    plots = pl.show_canvas(canvas, dict(scatter=scatters, imshow=view), legend=legend and (data_labels is not None), xlim=(xmin[0], xmax[0]), ylim=(xmin[1], xmax[1]))
-    _wait_for_updates(view['annotation'], updates)
+    plots = dict(scatter=_plot_latent_scatter(self, canvas, X, input_1, input_2, data_labels, marker, num_samples, **scatter_kwargs or {}))
+    plots.update(_plot_steepest_gradient_map(self, canvas, input_1, input_2, Xgrid, xmin, xmax, resolution, output_labels, kern, annotation_kwargs=annotation_kwargs, **imshow_kwargs))
+    if (legend is True) and (data_labels is not None):
+        legend = find_best_layout_for_subplots(len(np.unique(data_labels)))[1]
+    pl.show_canvas(canvas, plots, legend=legend, 
+                           xlim=(xmin[0], xmax[0]), ylim=(xmin[1], xmax[1]),
+                           xlabel='latent dimension %i' % input_1, ylabel='latent dimension %i' % input_2)
+    _wait_for_updates(plots['annotation'], updates)
     return plots
 
 
diff --git a/GPy/plotting/gpy_plot/plot_util.py b/GPy/plotting/gpy_plot/plot_util.py
index 51d05081..fd1255b8 100644
--- a/GPy/plotting/gpy_plot/plot_util.py
+++ b/GPy/plotting/gpy_plot/plot_util.py
@@ -32,6 +32,17 @@ import numpy as np
 from scipy import sparse
 import itertools
 
+
+def find_best_layout_for_subplots(num_subplots):
+    """Return a (rows, cols) grid such that rows * cols >= num_subplots."""
+    rows, cols = 1, 1
+    while rows * cols < num_subplots:
+        if cols - rows in (0, 1):  # widen until the grid is two columns ahead of its rows
+            cols += 1
+        elif cols - rows == 2:  # then trade one column for an additional row
+            rows, cols = rows + 1, cols - 1
+    return rows, cols
+
 def helper_predict_with_model(self, Xgrid, plot_raw, apply_link, percentiles, which_data_ycols, predict_kw, samples=0):
     """
     Make the right decisions for prediction with a model 
diff --git a/GPy/plotting/matplot_dep/controllers/axis_event_controller.py b/GPy/plotting/matplot_dep/controllers/axis_event_controller.py
index 042afb59..55750cde 100644
--- a/GPy/plotting/matplot_dep/controllers/axis_event_controller.py
+++ b/GPy/plotting/matplot_dep/controllers/axis_event_controller.py
@@ -14,7 +14,7 @@ class AxisEventController(object):
         return self
     def deactivate(self):
         for cb_class in self.ax.callbacks.callbacks.values():
-            for cb_num in cb_class.keys():
+            for cb_num in list(cb_class.keys()):  # snapshot ids before disconnecting (py2/py3 safe)
                 self.ax.callbacks.disconnect(cb_num)
     def activate(self):
         self.ax.callbacks.connect('xlim_changed', self.xlim_changed)
@@ -98,7 +98,7 @@ class BufferedAxisChangedController(AxisChangedController):
         """
         super(BufferedAxisChangedController, self).__init__(ax, update_lim=update_lim)
         self.plot_function = plot_function
-        xmin, ymin, xmax, ymax = plot_limits#self._x_lim # self._compute_buffered(*self._x_lim)
+        xmin, xmax, ymin, ymax = plot_limits#self._x_lim # self._compute_buffered(*self._x_lim)
         # imshow acts on the limits of the plot, this is why we need to override the limits here, to make sure the right plot limits are used:
         self._x_lim = xmin, xmax
         self._y_lim = ymin, ymax
diff --git a/GPy/plotting/matplot_dep/controllers/imshow_controller.py b/GPy/plotting/matplot_dep/controllers/imshow_controller.py
index 093d7859..63ac743b 100644
--- a/GPy/plotting/matplot_dep/controllers/imshow_controller.py
+++ b/GPy/plotting/matplot_dep/controllers/imshow_controller.py
@@ -23,14 +23,21 @@ class ImshowController(BufferedAxisChangedController):
         super(ImshowController, self).__init__(ax, plot_function, plot_limits, resolution, update_lim, **kwargs)
 
     def _init_view(self, canvas, X, xmin, xmax, ymin, ymax, vmin=None, vmax=None, **kwargs):
-        return canvas.imshow(X, extent=(xmin, xmax,
-                                    ymin, ymax),
+        xoffset, yoffset = self._offsets(xmin, xmax, ymin, ymax)
+        return canvas.imshow(X, extent=(xmin-xoffset, xmax+xoffset, 
+                                        ymin-yoffset, ymax+yoffset),
                              vmin=vmin, vmax=vmax,
-                         **kwargs)
+                             **kwargs)
 
     def update_view(self, view, X, xmin, xmax, ymin, ymax):
         view.set_data(X)
-        view.set_extent((xmin, xmax, ymin, ymax))
+        xoffset, yoffset = self._offsets(xmin, xmax, ymin, ymax)
+        view.set_extent((xmin-xoffset, xmax+xoffset, 
+                         ymin-yoffset, ymax+yoffset))
+
+    def _offsets(self, xmin, xmax, ymin, ymax):  # half-cell (x, y) offsets; float divisor avoids int truncation
+        return (xmax - xmin) / (2. * self.resolution), (ymax - ymin) / (2. * self.resolution)
+
 
 class ImAnnotateController(ImshowController):
     def __init__(self, ax, plot_function, plot_limits, resolution=20, update_lim=.99, imshow_kwargs=None, **kwargs):
@@ -65,7 +72,4 @@ class ImAnnotateController(ImshowController):
             text.set_x(x + xoffset)
             text.set_y(y + yoffset)
             text.set_text("{}".format(X[1][j, i]))
-        return view
-
-    def _offsets(self, xmin, xmax, ymin, ymax):
-        return (xmax - xmin) / (2 * self.resolution), (ymax - ymin) / (2 * self.resolution)
+        return view
\ No newline at end of file
diff --git a/GPy/plotting/matplot_dep/defaults.py b/GPy/plotting/matplot_dep/defaults.py
index 7333e450..f67b6e14 100644
--- a/GPy/plotting/matplot_dep/defaults.py
+++ b/GPy/plotting/matplot_dep/defaults.py
@@ -69,7 +69,7 @@ ard = dict(edgecolor='k', linewidth=1.2)
 
 # Input plots:
 latent = dict(aspect='auto', cmap='Greys', interpolation='bicubic')
-gradient = dict(aspect='auto', cmap='RdBu', interpolation='nearest')
+gradient = dict(aspect='auto', cmap='RdBu', interpolation='nearest', alpha=.7)
 magnification = dict(aspect='auto', cmap='Greys', interpolation='bicubic')
 latent_scatter = dict(s=40, linewidth=.2, edgecolor='k', alpha=.9)
-annotation = dict(fontdict=dict(family='sans-serif', weight='light', fontsize=9), zorder=.3)
\ No newline at end of file
+annotation = dict(fontdict=dict(family='sans-serif', weight='light', fontsize=9), zorder=.3, alpha=.7)
\ No newline at end of file
diff --git a/GPy/plotting/matplot_dep/plot_definitions.py b/GPy/plotting/matplot_dep/plot_definitions.py
index 76e557a4..c1084c91 100644
--- a/GPy/plotting/matplot_dep/plot_definitions.py
+++ b/GPy/plotting/matplot_dep/plot_definitions.py
@@ -35,13 +35,14 @@ from . import defaults
 from matplotlib.colors import LinearSegmentedColormap
 from .controllers import ImshowController, ImAnnotateController
 import itertools
+from GPy.plotting.matplot_dep.util import legend_ontop
 
 class MatplotlibPlots(AbstractPlottingLibrary):
     def __init__(self):
         super(MatplotlibPlots, self).__init__()
         self._defaults = defaults.__dict__
     
-    def get_new_canvas(self, xlabel=None, ylabel=None, zlabel=None, title=None, projection='2d', **kwargs):
+    def get_new_canvas(self, projection='2d', **kwargs):
         if projection == '3d':
             from mpl_toolkits.mplot3d import Axes3D
         elif projection == '2d':
@@ -57,24 +58,23 @@ class MatplotlibPlots(AbstractPlottingLibrary):
         else:
             ax = plt.figure().add_subplot(111, projection=projection)
             
+        return ax, kwargs
+    
+    def show_canvas(self, ax, plots, xlabel=None, ylabel=None, zlabel=None, title=None, xlim=None, ylim=None, zlim=None, legend=False, **kwargs):
+        ax.autoscale_view()
+        ax.set_xlim(xlim)
+        ax.set_ylim(ylim)
         if xlabel is not None: ax.set_xlabel(xlabel)
         if ylabel is not None: ax.set_ylabel(ylabel)
         if zlabel is not None: ax.set_zlabel(zlabel)
         if title is not None: ax.set_title(title)
-        return ax, kwargs
-    
-    def show_canvas(self, ax, plots, xlim=None, ylim=None, zlim=None, legend=False, **kwargs):
-        try:
-            ax.autoscale_view()
-            ax.set_xlim(xlim)
-            ax.set_ylim(ylim)
-            if legend:
-                ax.legend()
-            if zlim is not None:
-                ax.set_zlim(zlim)
-            ax.figure.canvas.draw()
-        except:
-            pass
+        fontdict=dict(family='sans-serif', weight='light', size=9)
+        if legend >= 1:
+            #ax.legend(prop=fontdict)
+            legend_ontop(ax, ncol=legend)
+        if zlim is not None:
+            ax.set_zlim(zlim)
+        ax.figure.canvas.draw()
         return plots
     
     def scatter(self, ax, X, Y, Z=None, color=Tango.colorsHex['mediumBlue'], label=None, marker='o', **kwargs):
@@ -119,27 +119,32 @@ class MatplotlibPlots(AbstractPlottingLibrary):
             return ax.errorbar(X, Y, Z, yerr=error, ecolor=color, label=label, **kwargs)
         return ax.errorbar(X, Y, yerr=error, ecolor=color, label=label, **kwargs)
     
-    def imshow(self, ax, X, extent=None, label=None, plot_function=None, resolution=None, vmin=None, vmax=None, **kwargs):
+    def imshow(self, ax, X, extent=None, label=None, plot_function=None, resolution=None, vmin=None, vmax=None, **imshow_kwargs):
+        if 'origin' not in imshow_kwargs:
+            imshow_kwargs['origin'] = 'lower'
         if plot_function is not None:
-            return ImshowController(ax, plot_function, extent, resolution=resolution, vmin=vmin, vmax=vmax, **kwargs)
-        return ax.imshow(X, label=label, extent=extent, vmin=vmin, vmax=vmax, **kwargs)
+            return ImshowController(ax, plot_function, extent, resolution=resolution, vmin=vmin, vmax=vmax, **imshow_kwargs)
+        return ax.imshow(X, label=label, extent=extent, vmin=vmin, vmax=vmax, **imshow_kwargs)
     
     def annotation_heatmap(self, ax, X, annotation, extent, label=None, plot_function=None, resolution=None, imshow_kwargs=None, **annotation_kwargs):
+        imshow_kwargs = imshow_kwargs or {}
+        if 'origin' not in imshow_kwargs:
+            imshow_kwargs['origin'] = 'lower'
         if plot_function is not None:
             return ImAnnotateController(ax, plot_function, extent, resolution=resolution, imshow_kwargs=imshow_kwargs or {}, **annotation_kwargs)
         if ('ha' not in annotation_kwargs) and ('horizontalalignment' not in annotation_kwargs):
             annotation_kwargs['ha'] = 'center'
         if ('va' not in annotation_kwargs) and ('verticalalignment' not in annotation_kwargs):
             annotation_kwargs['va'] = 'center'
+        imshow = self.imshow(ax, X, extent, label, None, resolution, **imshow_kwargs)
         xmin, xmax, ymin, ymax = extent
-        self.imshow(X, extent, label, None, resolution, **imshow_kwargs or {})
-        xoffset, yoffset = (xmax - xmin) / (2 * self.resolution), (ymax - ymin) / (2 * self.resolution)
-        xlin = np.linspace(xmin, xmax, self.resolution, endpoint=False)
-        ylin = np.linspace(ymin, ymax, self.resolution, endpoint=False)
+        xoffset, yoffset = (xmax - xmin) / (2. * resolution), (ymax - ymin) / (2. * resolution)
+        xlin = np.linspace(xmin, xmax, resolution, endpoint=False)
+        ylin = np.linspace(ymin, ymax, resolution, endpoint=False)
         annotations = []
         for [i, x], [j, y] in itertools.product(enumerate(xlin), enumerate(ylin[::-1])):
             annotations.append(ax.text(x + xoffset, y + yoffset, "{}".format(annotation[j, i]), **annotation_kwargs))
-        return annotations
+        return [imshow, annotations]
     
     def contour(self, ax, X, Y, C, levels=20, label=None, **kwargs):
         return ax.contour(X, Y, C, levels=np.linspace(C.min(), C.max(), levels), label=label, **kwargs)
diff --git a/GPy/plotting/matplot_dep/util.py b/GPy/plotting/matplot_dep/util.py
index eff9edf7..562c7a71 100644
--- a/GPy/plotting/matplot_dep/util.py
+++ b/GPy/plotting/matplot_dep/util.py
@@ -31,6 +31,18 @@
 from matplotlib import pyplot as plt
 import numpy as np
 
+def legend_ontop(ax, mode='expand', ncol=3, fontdict=None):
+    from mpl_toolkits.axes_grid1 import make_axes_locatable
+    handles, labels = ax.get_legend_handles_labels()
+    divider = make_axes_locatable(ax)
+    cax = divider.append_axes("top", "5%", pad="1%")
+    lgd = cax.legend(handles, labels, bbox_to_anchor=(0., 0., 1., 1.), loc=3,
+            ncol=ncol, mode=mode, borderaxespad=0., prop=fontdict or {})
+    cax.set_axis_off()
+    #lgd = cax.legend(bbox_to_anchor=(0., 1.02, 1., 1.02), loc=3,
+    #        ncol=ncol, mode=mode, borderaxespad=0., prop=fontdict or {})
+    return lgd
+
 def removeRightTicks(ax=None):
     ax = ax or plt.gca()
     for i, line in enumerate(ax.get_yticklines()):
diff --git a/GPy/testing/plotting_tests/baseline/gp_2d_data.png b/GPy/testing/plotting_tests/baseline/gp_2d_data.png
index e16283d4..64414880 100644
Binary files a/GPy/testing/plotting_tests/baseline/gp_2d_data.png and b/GPy/testing/plotting_tests/baseline/gp_2d_data.png differ
diff --git a/GPy/testing/plotting_tests/baseline/gp_2d_mean.png b/GPy/testing/plotting_tests/baseline/gp_2d_mean.png
index 2b9161be..d668ebc3 100644
Binary files a/GPy/testing/plotting_tests/baseline/gp_2d_mean.png and b/GPy/testing/plotting_tests/baseline/gp_2d_mean.png differ
diff --git a/GPy/testing/plotting_tests/baseline/gp_3d_data.png b/GPy/testing/plotting_tests/baseline/gp_3d_data.png
index 0bbf7173..a7fa15b3 100644
Binary files a/GPy/testing/plotting_tests/baseline/gp_3d_data.png and b/GPy/testing/plotting_tests/baseline/gp_3d_data.png differ
diff --git a/GPy/testing/plotting_tests/baseline/gp_3d_mean.png b/GPy/testing/plotting_tests/baseline/gp_3d_mean.png
index df7fddb9..4fa611a1 100644
Binary files a/GPy/testing/plotting_tests/baseline/gp_3d_mean.png and b/GPy/testing/plotting_tests/baseline/gp_3d_mean.png differ
diff --git a/GPy/testing/plotting_tests/baseline/gp_class_.png b/GPy/testing/plotting_tests/baseline/gp_class_.png
index a41d3c01..7569660f 100644
Binary files a/GPy/testing/plotting_tests/baseline/gp_class_.png and b/GPy/testing/plotting_tests/baseline/gp_class_.png differ
diff --git a/GPy/testing/plotting_tests/baseline/gp_class_link.png b/GPy/testing/plotting_tests/baseline/gp_class_link.png
index a41d3c01..7569660f 100644
Binary files a/GPy/testing/plotting_tests/baseline/gp_class_link.png and b/GPy/testing/plotting_tests/baseline/gp_class_link.png differ
diff --git a/GPy/testing/plotting_tests/baseline/gp_class_raw.png b/GPy/testing/plotting_tests/baseline/gp_class_raw.png
index 5a3fe51a..af1242a9 100644
Binary files a/GPy/testing/plotting_tests/baseline/gp_class_raw.png and b/GPy/testing/plotting_tests/baseline/gp_class_raw.png differ
diff --git a/GPy/testing/plotting_tests/baseline/gp_class_raw_link.png b/GPy/testing/plotting_tests/baseline/gp_class_raw_link.png
index 692d8292..a81f23c3 100644
Binary files a/GPy/testing/plotting_tests/baseline/gp_class_raw_link.png and b/GPy/testing/plotting_tests/baseline/gp_class_raw_link.png differ
diff --git a/GPy/testing/plotting_tests/baseline/gp_conf.png b/GPy/testing/plotting_tests/baseline/gp_conf.png
index 20ade891..4a5716c2 100644
Binary files a/GPy/testing/plotting_tests/baseline/gp_conf.png and b/GPy/testing/plotting_tests/baseline/gp_conf.png differ
diff --git a/GPy/testing/plotting_tests/baseline/gp_data.png b/GPy/testing/plotting_tests/baseline/gp_data.png
index c78a8df1..1f143b05 100644
Binary files a/GPy/testing/plotting_tests/baseline/gp_data.png and b/GPy/testing/plotting_tests/baseline/gp_data.png differ
diff --git a/GPy/testing/plotting_tests/baseline/gp_density.png b/GPy/testing/plotting_tests/baseline/gp_density.png
index 67974360..8f46208e 100644
Binary files a/GPy/testing/plotting_tests/baseline/gp_density.png and b/GPy/testing/plotting_tests/baseline/gp_density.png differ
diff --git a/GPy/testing/plotting_tests/baseline/gp_error.png b/GPy/testing/plotting_tests/baseline/gp_error.png
index 38c65afc..a334bfc2 100644
Binary files a/GPy/testing/plotting_tests/baseline/gp_error.png and b/GPy/testing/plotting_tests/baseline/gp_error.png differ
diff --git a/GPy/testing/plotting_tests/baseline/gp_mean.png b/GPy/testing/plotting_tests/baseline/gp_mean.png
index 9168abe8..7bee55da 100644
Binary files a/GPy/testing/plotting_tests/baseline/gp_mean.png and b/GPy/testing/plotting_tests/baseline/gp_mean.png differ
diff --git a/GPy/testing/plotting_tests/baseline/gp_samples.png b/GPy/testing/plotting_tests/baseline/gp_samples.png
index 5addd077..10b3e589 100644
Binary files a/GPy/testing/plotting_tests/baseline/gp_samples.png and b/GPy/testing/plotting_tests/baseline/gp_samples.png differ
diff --git a/GPy/testing/plotting_tests/baseline/gplvm_gradient.png b/GPy/testing/plotting_tests/baseline/gplvm_gradient.png
deleted file mode 100644
index 1576d940..00000000
Binary files a/GPy/testing/plotting_tests/baseline/gplvm_gradient.png and /dev/null differ
diff --git a/GPy/testing/plotting_tests/baseline/gplvm_latent.png b/GPy/testing/plotting_tests/baseline/gplvm_latent.png
deleted file mode 100644
index acb56fe4..00000000
Binary files a/GPy/testing/plotting_tests/baseline/gplvm_latent.png and /dev/null differ
diff --git a/GPy/testing/plotting_tests/baseline/gplvm_magnification.png b/GPy/testing/plotting_tests/baseline/gplvm_magnification.png
deleted file mode 100644
index b2c18f4f..00000000
Binary files a/GPy/testing/plotting_tests/baseline/gplvm_magnification.png and /dev/null differ
diff --git a/GPy/testing/plotting_tests/baseline/sparse_gp_class_.png b/GPy/testing/plotting_tests/baseline/sparse_gp_class_.png
index f39cd024..aaf92882 100644
Binary files a/GPy/testing/plotting_tests/baseline/sparse_gp_class_.png and b/GPy/testing/plotting_tests/baseline/sparse_gp_class_.png differ
diff --git a/GPy/testing/plotting_tests/baseline/sparse_gp_class_link.png b/GPy/testing/plotting_tests/baseline/sparse_gp_class_link.png
index f39cd024..aaf92882 100644
Binary files a/GPy/testing/plotting_tests/baseline/sparse_gp_class_link.png and b/GPy/testing/plotting_tests/baseline/sparse_gp_class_link.png differ
diff --git a/GPy/testing/plotting_tests/baseline/sparse_gp_class_raw.png b/GPy/testing/plotting_tests/baseline/sparse_gp_class_raw.png
index c188bb4e..7327e67c 100644
Binary files a/GPy/testing/plotting_tests/baseline/sparse_gp_class_raw.png and b/GPy/testing/plotting_tests/baseline/sparse_gp_class_raw.png differ
diff --git a/GPy/testing/plotting_tests/baseline/sparse_gp_class_raw_link.png b/GPy/testing/plotting_tests/baseline/sparse_gp_class_raw_link.png
index 453941fc..93175e37 100644
Binary files a/GPy/testing/plotting_tests/baseline/sparse_gp_class_raw_link.png and b/GPy/testing/plotting_tests/baseline/sparse_gp_class_raw_link.png differ
diff --git a/GPy/testing/plotting_tests/baseline/sparse_gp_inducing.png b/GPy/testing/plotting_tests/baseline/sparse_gp_inducing.png
index 02a20eb2..38517df2 100644
Binary files a/GPy/testing/plotting_tests/baseline/sparse_gp_inducing.png and b/GPy/testing/plotting_tests/baseline/sparse_gp_inducing.png differ
diff --git a/doc/source/.#conf.py b/doc/source/.#conf.py
deleted file mode 120000
index 307d8733..00000000
--- a/doc/source/.#conf.py
+++ /dev/null
@@ -1 +0,0 @@
-maxz@maxz-sitran.8058:1442579222
\ No newline at end of file
diff --git a/doc/source/conf.py b/doc/source/conf.py
index c2418a34..a02da5e4 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -36,6 +36,7 @@ extensions = [
     'sphinx.ext.viewcode',
 ]
 
+#----- Autodoc
 import sys
 try:
     from unittest.mock import MagicMock
@@ -52,6 +53,19 @@ MOCK_MODULES = ['scipy.linalg.blas', 'blas', 'scipy.optimize', 'scipy.optimize.l
                 'nose', 'nose.tools']
 sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
 
+autodoc_default_flags = ['members',
+                         #'undoc-members',
+                         #'private-members',
+                         #'special-members',
+                         #'inherited-members',
+                         'show-inheritance']
+autodoc_member_order = 'groupwise'
+add_function_parentheses = False
+add_module_names = False
+modindex_common_prefix = ['GPy.']
+show_authors = True
+
+# ------ Sphinx
 # Add any paths that contain templates here, relative to this directory.
 templates_path = ['_templates']
 
@@ -119,7 +133,7 @@ exclude_patterns = []
 #show_authors = False
 
 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+#pygments_style = 'sphinx'
 
 # A list of ignored prefixes for module index sorting.
 #modindex_common_prefix = []
@@ -135,12 +149,12 @@ todo_include_todos = False
 
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
-html_theme = 'alabaster'
+html_theme = 'sphinx_rtd_theme'
 
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
 # documentation.
-#html_theme_options = {}
+#html_theme_options = dict(sidebarwidth='20}
 
 # Add any paths that contain custom themes here, relative to this directory.
 #html_theme_path = []
@@ -180,20 +194,22 @@ html_static_path = ['_static']
 #html_use_smartypants = True
 
 # Custom sidebar templates, maps document names to template names.
-#html_sidebars = {}
-
+#html_sidebars = {
+#   '**': ['globaltoc.html', 'localtoc.html', 'sourcelink.html', 'searchbox.html'],
+#   'using/windows': ['windowssidebar.html', 'searchbox.html'],
+#}
 # Additional templates that should be rendered to pages, maps page names to
 # template names.
 #html_additional_pages = {}
 
 # If false, no module index is generated.
-#html_domain_indices = True
+#html_domain_indices = False
 
 # If false, no index is generated.
-#html_use_index = True
+#html_use_index = False
 
 # If true, the index is split into individual pages for each letter.
-#html_split_index = False
+html_split_index = True
 
 # If true, links to the reST sources are added to the pages.
 #html_show_sourcelink = True
diff --git a/doc/source/index.rst b/doc/source/index.rst
index c35f1e3f..7841408e 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -13,13 +13,30 @@ This documentation is mostly aimed at developers interacting closely with the co
 
 The code can be found on our `Github project page <https://github.com/SheffieldML/GPy>`_. It is open source and provided under the BSD license.
 
+For developers:
+
+- `Writing new kernels <tuto_creating_new_kernels.html>`_
+- `Writing new models <tuto_creating_new_models.html>`_
+- `Parameterization handles <tuto_parameterized.html>`_
+
 Contents:
 
 .. toctree::
-   :maxdepth: 4
-
-   GPy
+   :maxdepth: 1
 
+   GPy.models
+   GPy.kern
+   GPy.likelihoods
+   GPy.mappings
+   GPy.examples
+   GPy.util
+   GPy.plotting.gpy_plot
+   GPy.plotting.matplot_dep
+   GPy.core
+   GPy.core.parameterization
+   GPy.inference.optimization
+   GPy.inference.latent_function_inference
+   GPy.inference.mcmc
 	      
 Indices and tables
 ==================
diff --git a/doc/source/tuto_creating_new_kernels.rst b/doc/source/tuto_creating_new_kernels.rst
new file mode 100644
index 00000000..7b833892
--- /dev/null
+++ b/doc/source/tuto_creating_new_kernels.rst
@@ -0,0 +1,236 @@
+********************
+Creating new kernels
+********************
+
+We will see in this tutorial how to create new kernels in GPy. We will also give details on how to implement each function of the kernel and illustrate with a running example: the rational quadratic kernel. 
+
+Structure of a kernel in GPy
+============================
+
+In GPy a kernel object is made of a list of kernpart objects, which correspond to symmetric positive definite functions. More precisely, the kernel should be understood as the sum of the kernparts. In order to implement a new covariance, the following steps must be followed:
+
+    1. implement the new covariance as a :py:class:`GPy.kern._src.kern.Kern` object
+    2. update the :py:mod:`GPy.kern._src` file
+
+These two steps are detailed below.
+
+Implementing a Kern object
+==============================
+
+We advise the reader to start with copy-pasting an existing kernel and
+to modify the new file. We will now give a description of the various
+functions that can be found in a Kern object, some of which are
+mandatory for the new kernel to work.
+
+Header
+~~~~~~
+
+The header is similar to all kernels: ::
+
+    from .kern import Kern
+    import numpy as np
+
+    class RationalQuadratic(Kern):
+
+:py:func:`GPy.kern._src.kern.Kern.__init__` ``(self, input_dim, param1, param2, *args)``
+~~~~~~~~~~~~~~~~~~~
+    
+The implementation of this function is mandatory.
+
+For all Kerns the first parameter ``input_dim`` corresponds to the
+dimension of the input space, and the following parameters stand for
+the parameterization of the kernel.
+
+You have to call ``super(<class_name>, self).__init__(input_dim,
+name)`` to make sure the input dimension and name of the kernel are
+stored in the right place. These attributes are available as
+``self.input_dim`` and ``self.name`` at runtime.  Parameterization is
+done by adding :py:class:`~GPy.core.parameterization.param.Param`
+objects to ``self`` and use them as normal numpy ``array-like`` s in
+your code. The parameters have to be added by calling
+:py:func:`~GPy.core.parameterization.parameterized.Parameterized.link_parameters`
+``(*parameters)`` with the
+:py:class:`~GPy.core.parameterization.param.Param` objects as
+arguments::
+
+    def __init__(self,input_dim,variance=1.,lengthscale=1.,power=1.):
+        super(RationalQuadratic, self).__init__(input_dim, 'rat_quad')
+        assert input_dim == 1, "For this kernel we assume input_dim=1"
+        self.variance = Param('variance', variance)
+        self.lengthscale = Param('lengthscale', lengthscale)
+        self.power = Param('power', power)
+        self.link_parameters(self.variance, self.lengthscale, self.power)
+
+From now on you can use the parameters ``self.variance,
+self.lengthscale, self.power`` as normal numpy ``array-like`` s in your
+code. Updates from the optimization routine will be done
+automatically.
+
+:py:func:`~GPy.core.parameterization.parameter_core.Parameterizable.parameters_changed` ``(self)``
+~~~~~~~~~~~~~~~~~~~
+
+The implementation of this function is optional.
+
+This function serves as a callback for each optimization iteration. If
+one optimization step was successful and the parameters (added by
+:py:func:`~GPy.core.parameterization.parameterized.Parameterized.link_parameters`
+``(*parameters)``) have changed, this callback function will be called so
+that any precomputations for the kernel can be updated. Do not implement the
+gradient updates here, as those are being done by the model enclosing
+the kernel::
+
+    def parameters_changed(self):
+        # nothing to do here
+        pass
+
+
+:py:func:`~GPy.kern._src.kern.Kern.K` ``(self,X,X2)``
+~~~~~~~~~~~~~~~~~~~
+
+The implementation of this function is mandatory.
+
+This function is used to compute the covariance matrix associated with
+the inputs X, X2 (np.arrays with an arbitrary number of lines (say
+:math:`n_1`, :math:`n_2`) and ``self.input_dim`` columns). ::
+
+    def K(self,X,X2):
+        if X2 is None: X2 = X
+        dist2 = np.square((X-X2.T)/self.lengthscale)
+        return self.variance*(1 + dist2/2.)**(-self.power)
+
+:py:func:`~GPy.kern._src.kern.Kern.Kdiag` ``(self,X)``
+~~~~~~~~~~~~~~~~~~~
+
+The implementation of this function is mandatory.
+
+This function is similar to ``K`` but it computes only the values of
+the kernel on the diagonal. Thus, the returned value is a 1-dimensional
+np.array of length :math:`n`. ::
+
+    def Kdiag(self,X):
+        return self.variance*np.ones(X.shape[0])
+
+:py:func:`~GPy.kern._src.kern.Kern.update_gradients_full` ``(self, dL_dK, X, X2=None)``
+~~~~~~~~~~~~~~~~~~~
+
+This function is required for the optimization of the parameters.
+
+Computes the gradients and sets them on the parameters of this model.
+For example, if the kernel is parameterized by
+:math:`\sigma^2, \theta`, then
+
+.. math::
+
+   \frac{\partial L}{\partial\sigma^2}
+    = \frac{\partial L}{\partial K} \frac{\partial K}{\partial\sigma^2}
+
+is added to the gradient of :math:`\sigma^2`: ``self.variance.gradient = <gradient>``
+and
+
+.. math::
+
+   \frac{\partial L}{\partial\theta}
+    = \frac{\partial L}{\partial K} \frac{\partial K}{\partial\theta}
+
+to :math:`\theta`. ::
+	  
+    def update_gradients_full(self, dL_dK, X, X2):
+        if X2 is None: X2 = X
+        dist2 = np.square((X-X2.T)/self.lengthscale)
+
+        dvar = (1 + dist2/2.)**(-self.power)
+        dl = self.power * self.variance * dist2 / self.lengthscale * (1 + dist2/2.)**(-self.power-1)
+        dp = - self.variance * np.log(1 + dist2/2.) * (1 + dist2/2.)**(-self.power)
+
+        self.variance.gradient = np.sum(dvar*dL_dK)
+        self.lengthscale.gradient = np.sum(dl*dL_dK)
+        self.power.gradient = np.sum(dp*dL_dK)
+
+
+:py:func:`~GPy.kern._src.kern.Kern.update_gradients_diag` ``(self,dL_dKdiag,X)``
+~~~~~~~~~~~~~~~~~~~
+    
+This function is required for BGPLVM, sparse models and uncertain inputs.
+
+As previously, the gradient
+
+.. math::
+
+   \frac{\partial L}{\partial Kdiag}
+    \frac{\partial Kdiag}{\partial param}
+
+is set to each ``param``. ::
+
+    def update_gradients_diag(self, dL_dKdiag, X):
+        self.variance.gradient = np.sum(dL_dKdiag)
+        # here self.lengthscale and self.power have no influence on Kdiag, so their gradients are not touched
+
+:py:func:`~GPy.kern._src.kern.Kern.gradients_X` ``(self,dL_dK, X, X2)``
+~~~~~~~~~~~~~~~~~~~
+
+This function is required for GPLVM, BGPLVM, sparse models and uncertain inputs.
+
+Computes the derivative of the likelihood with respect to the inputs
+``X`` (a :math:`n \times q` np.array). The result is returned by the
+function which is a :math:`n \times q` np.array. ::
+
+    def gradients_X(self,dL_dK,X,X2):
+        """derivative of the covariance matrix with respect to X."""
+        if X2 is None: X2 = X
+        dist2 = np.square((X-X2.T)/self.lengthscale)
+
+        dX = -self.variance*self.power * (X-X2.T)/self.lengthscale**2 *  (1 + dist2/2.)**(-self.power-1)
+        return np.sum(dL_dK*dX,1)[:,None]
+
+:py:func:`~GPy.kern._src.kern.Kern.gradients_X_diag` ``(self,dL_dKdiag,X)``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    
+This function is required for BGPLVM, sparse models and uncertain
+inputs. As for ``dKdiag_dtheta``,
+
+.. math::
+
+   \frac{\partial L}{\partial Kdiag} \frac{\partial Kdiag}{\partial X}
+
+is added to each element of target. ::
+
+    def gradients_X_diag(self,dL_dKdiag,X):
+        # no diagonal gradients
+        pass
+
+**Second order derivatives**
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+These functions are required for the magnification factor and are the same as the first order gradients for X, but
+as the second order derivatives:
+
+.. math:: \frac{\partial^2 K}{\partial X\partial X2}
+
+- :py:func:`GPy.kern._src.kern.gradients_XX` ``(self,dL_dK, X, X2)``
+- :py:func:`GPy.kern._src.kern.gradients_XX_diag` ``(self,dL_dKdiag, X)``
+	
+**Psi statistics**
+~~~~~~~~~~~~~
+
+The psi statistics and their derivatives are required for BGPLVM and
+GPs with uncertain inputs only. The expressions are as follows:
+
+- `psi0(self, Z, variational_posterior)`
+   .. math::
+
+     \psi_0 = \sum_{i=0}^{n}E_{q(X)}[k(X_i, X_i)]
+
+- `psi1(self, Z, variational_posterior)`
+   .. math::
+
+      \psi_1^{n,m} = E_{q(X)}[k(X_n, Z_m)]
+	
+- `psi2(self, Z, variational_posterior)`
+   .. math::
+
+      \psi_2^{m,m'} = \sum_{i=0}^{n}E_{q(X)}[ k(Z_m, X_i) k(X_i, Z_{m'})]
+	
+- `psi2n(self, Z, variational_posterior)`
+   .. math::
+
+      \psi_2^{n,m,m'} = E_{q(X)}[ k(Z_m, X_n) k(X_n, Z_{m'})]
diff --git a/doc/source/tuto_creating_new_models.rst b/doc/source/tuto_creating_new_models.rst
new file mode 100644
index 00000000..07f6194f
--- /dev/null
+++ b/doc/source/tuto_creating_new_models.rst
@@ -0,0 +1,100 @@
+.. _creating_new_models:
+
+*******************
+Creating new Models
+*******************
+
+In GPy all models inherit from the base class :py:class:`~GPy.core.parameterized.Parameterized`. :py:class:`~GPy.core.parameterized.Parameterized` is a class which allows for parameterization of objects. All it holds is functionality for tying, bounding and fixing of parameters. It also provides the functionality of searching and manipulating parameters by regular expression syntax. See :py:class:`~GPy.core.parameterized.Parameterized` for more information. 
+
+The :py:class:`~GPy.core.model.Model` class provides parameter introspection, objective function and optimization.
+
+In order to fully use all functionality of
+:py:class:`~GPy.core.model.Model` some methods need to be implemented
+/ overridden. And the model needs to be  told its parameters, such
+that it can provide optimized parameter distribution and handling. 
+In order to explain the functionality of those methods
+we will use a wrapper to the scipy ``rosen`` function, which holds
+input parameters :math:`\mathbf{X}`. Where
+:math:`\mathbf{X}\in\mathbb{R}^{N\times 1}`.
+
+Obligatory methods
+==================
+
+:py:func:`~GPy.core.model.Model.__init__` :
+	Initialize the model with the given parameters. These need to
+	be added to the model by calling
+	`self.add_parameter(<param>)`, where param needs to be a
+	parameter handle (see :ref:`parameterized` for details)::
+	
+		self.X = GPy.Param("input", X)
+		self.add_parameter(self.X)
+		
+:py:meth:`~GPy.core.model.Model.log_likelihood` :
+	Returns the log-likelihood of the new model. For our example
+	this is just the call to ``rosen`` and as we want to minimize
+	it, we need to negate the objective::
+
+		return -scipy.optimize.rosen(self.X)
+
+:py:meth:`~GPy.core.model.Model.parameters_changed` :
+    Updates the internal state of the model and sets the gradient of
+    each parameter handle in the hierarchy with respect to the
+    log_likelihood. Thus here we need to set the negative derivative of
+    the rosenbrock function for the parameters. In this case it is the
+    gradient for self.X::
+
+ 		self.X.gradient = -scipy.optimize.rosen_der(self.X)
+
+
+Here is the full code for the `Rosen` class::
+
+  from GPy import Model, Param
+  import scipy
+  class Rosen(Model):
+      def __init__(self, X, name='rosenbrock'):
+          super(Rosen, self).__init__(name=name)
+          self.X = Param("input", X)
+	  self.add_parameter(self.X)
+      def log_likelihood(self):
+          return -scipy.optimize.rosen(self.X)
+      def parameters_changed(self):
+          self.X.gradient = -scipy.optimize.rosen_der(self.X)
+
+In order to test the newly created model, we can check the gradients
+and optimize a standard rosenbrock run::
+
+  >>> m = Rosen(np.array([-1,-1]))
+  >>> print m
+  Name                 : rosenbrock
+  Log-likelihood       : -404.0
+  Number of Parameters : 2
+  Parameters:
+    rosenbrock.  |  Value  |  Constraint  |  Prior  |  Tied to
+    input        |   (2,)  |              |         |         
+  >>> m.checkgrad(verbose=True)
+             Name           |     Ratio     |  Difference   |  Analytical   |   Numerical   
+  ------------------------------------------------------------------------------------------
+   rosenbrock.input[[0]]    |   1.000000    |   0.000000    |  -804.000000  |  -804.000000  
+   rosenbrock.input[[1]]    |   1.000000    |   0.000000    |  -400.000000  |  -400.000000  
+  >>> m.optimize()
+  >>> print m
+  Name                 : rosenbrock
+  Log-likelihood       : -6.52150088871e-15
+  Number of Parameters : 2
+  Parameters:
+    rosenbrock.  |  Value  |  Constraint  |  Prior  |  Tied to
+    input        |   (2,)  |              |         |         
+  >>> print m.input
+    Index  |  rosenbrock.input  |  Constraint  |   Prior   |  Tied to
+     [0]   |        0.99999994  |              |           |    N/A    
+     [1]   |        0.99999987  |              |           |    N/A    
+  >>> print m.gradient
+  [ -1.91169809e-06,   1.01852309e-06]
+  
+This is the optimum for the 2D Rosenbrock function, as expected, and
+the gradients of the inputs are almost zero.
+
+Optional methods
+================
+
+Currently none.
diff --git a/doc/source/tuto_parameterized.rst b/doc/source/tuto_parameterized.rst
new file mode 100644
index 00000000..507ec109
--- /dev/null
+++ b/doc/source/tuto_parameterized.rst
@@ -0,0 +1,23 @@
+.. _parameterized:
+
+*************************
+Parameterization handling
+*************************
+
+Parameterization in GPy is done through so-called parameter handles. The parameter handles are handles to parameters of a model of any kind. A parameter handle can be constrained, fixed, randomized and others. All parameters in GPy have a name, with which they can be accessed in the model. The most common way of accessing a parameter programmatically though, is by variable name. 
+
+Parameter handles
+=================
+
+A parameter handle in GPy is a handle on a parameter, as the name suggests. A parameter can be constrained, fixed, randomized and more (See e.g. `working with models`). This gives the freedom to the model to handle parameter distribution and model updates as efficiently as possible. All parameter handles share a common memory space, which is just a flat numpy array, stored in the highest parent of a model hierarchy.
+In the following we will introduce and elucidate the different parameter handles which exist in GPy.
+
+:py:class:`~GPy.core.parameterization.parameterized.Parameterized`
+======================================================================
+
+A parameterized object itself holds parameter handles and is just a summarization of the parameters below. It can use those parameters to change the internal state of the model and GPy ensures that those parameters always hold the right value when in an optimization routine or any other update.
+
+:py:class:`~GPy.core.parameterization.param.Param`
+==================================================
+
+The lowest level of parameter is a numpy array. This Param class inherits all functionality of a numpy array and can simply be used as if it were a numpy array. These parameters can be accessed in the same way as a numpy array is indexed.
diff --git a/setup.py b/setup.py
index 60ae2a7f..8b134c24 100644
--- a/setup.py
+++ b/setup.py
@@ -99,23 +99,32 @@ setup(name = 'GPy',
       keywords = "machine-learning gaussian-processes kernels",
       url = "http://sheffieldml.github.com/GPy/",
       ext_modules = ext_mods,
-      packages = ["GPy.models",
+      packages = ["GPy",
+                  "GPy.core",
+                  "GPy.core.parameterization", 
+                  "GPy.kern",
+                  "GPy.kern._src",
+                  "GPy.kern._src.psi_comp", 
+                  "GPy.models",
+                  "GPy.inference",
                   "GPy.inference.optimization",
                   "GPy.inference.mcmc",
-                  "GPy.inference",
                   "GPy.inference.latent_function_inference",
-                  "GPy.likelihoods", "GPy.mappings",
-                  "GPy.examples", "GPy.core.parameterization",
-                  "GPy.core", "GPy.testing",
-                  "GPy", "GPy.util", "GPy.kern",
-                  "GPy.kern._src.psi_comp", "GPy.kern._src",
+                  "GPy.likelihoods", 
+                  "GPy.mappings",
+                  "GPy.examples",
+                  "GPy.testing",
+                  "GPy.util", 
+                  "GPy.plotting",
+                  "GPy.plotting.gpy_plot",
                   "GPy.plotting.matplot_dep.controllers",
-                  "GPy.plotting.matplot_dep", "GPy.plotting",
-                  "GPy.plotting.gpy_plot"],
+                  "GPy.plotting.matplot_dep", 
+                  ],
       package_dir={'GPy': 'GPy'},
       package_data = {'GPy': ['defaults.cfg', 'installation.cfg',
                               'util/data_resources.json',
                               'util/football_teams.json',
+                              'plotting/plotting_tests/baseline/*.png'
                               ]},
       include_package_data = True,
       py_modules = ['GPy.__init__'],