Merge branch 'master' into new_warping

2026-06-11 15:15:15 +02:00 · 2013-03-13 13:50:00 +00:00 · 2013-03-13 13:50:00 +00:00 · d057825f43
commit d057825f43
parent 794f918eb0 e812368a87
76 changed files with 2781 additions and 999 deletions
--- a/GPy/init.py
+++ b/GPy/init.py
@ -9,3 +9,10 @@ import util
 import examples
 from core import priors
 import likelihoods
+import testing
+from numpy.testing import Tester
+from nose.tools import nottest
+
+@nottest
+def tests():
+    Tester(testing).test(verbose=10)
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@ -121,9 +121,6 @@ class model(parameterised):
        else:
            raise AttributeError, "no parameter matches %s"%name

-
-
-
    def log_prior(self):
        """evaluate the prior"""
        return np.sum([p.lnpdf(x) for p, x in zip(self.priors,self._get_params()) if p is not None])
@ -135,12 +132,11 @@ class model(parameterised):
        [np.put(ret,i,p.lnpdf_grad(xx)) for i,(p,xx) in enumerate(zip(self.priors,x)) if not p is None]
        return ret

-    def _log_likelihood_gradients_transformed(self):
+    def _transform_gradients(self, g):
        """
-        Use self.log_likelihood_gradients and self.prior_gradients to get the gradients of the model.
-        Adjust the gradient for constraints and ties, return.
+        Takes a list of gradients and return an array of transformed gradients (positive/negative/tied/and so on)
        """
-        g = self._log_likelihood_gradients() + self._log_prior_gradients()
+
        x = self._get_params()
        g[self.constrained_positive_indices] = g[self.constrained_positive_indices]*x[self.constrained_positive_indices]
        g[self.constrained_negative_indices] = g[self.constrained_negative_indices]*x[self.constrained_negative_indices]
@ -152,6 +148,7 @@ class model(parameterised):
        else:
            return g

+
    def randomize(self):
        """
        Randomize the model.
@ -241,6 +238,27 @@ class model(parameterised):
                        print "Warning! constraining %s postive"%name


+    def objective_function(self, x):
+        """
+        The objective function passed to the optimizer. It combines the likelihood and the priors.
+        """
+        self._set_params_transformed(x)
+        return -self.log_likelihood() - self.log_prior()
+
+    def objective_function_gradients(self, x):
+        """
+        Gets the gradients from the likelihood and the priors.
+        """
+        self._set_params_transformed(x)
+        LL_gradients = self._transform_gradients(self._log_likelihood_gradients())
+        prior_gradients = self._transform_gradients(self._log_prior_gradients())
+        return -LL_gradients - prior_gradients
+
+    def objective_and_gradients(self, x):
+        obj_f = self.objective_function(x)
+        obj_grads = self.objective_function_gradients(x)
+        return obj_f, obj_grads
+
    def optimize(self, optimizer=None, start=None, **kwargs):
        """
        Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
@ -254,22 +272,12 @@ class model(parameterised):
        if optimizer is None:
            optimizer = self.preferred_optimizer

-        def f(x):
-            self._set_params_transformed(x)
-            return -self.log_likelihood()-self.log_prior()
-        def fp(x):
-            self._set_params_transformed(x)
-            return -self._log_likelihood_gradients_transformed()
-        def f_fp(x):
-            self._set_params_transformed(x)
-            return -self.log_likelihood()-self.log_prior(),-self._log_likelihood_gradients_transformed()
-
        if start == None:
            start = self._get_params_transformed()

        optimizer = optimization.get_optimizer(optimizer)
        opt = optimizer(start, model = self, **kwargs)
-        opt.run(f_fp=f_fp, f=f, fp=fp)
+        opt.run(f_fp=self.objective_and_gradients, f=self.objective_function, fp=self.objective_function_gradients)
        self.optimization_runs.append(opt)

        self._set_params_transformed(opt.x_opt)
@ -357,12 +365,9 @@ class model(parameterised):
            dx = step*np.sign(np.random.uniform(-1,1,x.size))

            #evaulate around the point x
-            self._set_params_transformed(x+dx)
-            f1,g1 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed()
-            self._set_params_transformed(x-dx)
-            f2,g2 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed()
-            self._set_params_transformed(x)
-            gradient = self._log_likelihood_gradients_transformed()
+            f1, g1 = self.objective_and_gradients(x+dx)
+            f2, g2 = self.objective_and_gradients(x-dx)
+            gradient = self.objective_function_gradients(x)

            numerical_gradient = (f1-f2)/(2*dx)
            global_ratio = (f1-f2)/(2*np.dot(dx,gradient))
@ -398,14 +403,10 @@ class model(parameterised):
            for i in param_list:
                xx = x.copy()
                xx[i] += step
-                self._set_params_transformed(xx)
-                f1,g1 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed()[i]
+                f1, g1 = self.objective_and_gradients(xx)
                xx[i] -= 2.*step
-                self._set_params_transformed(xx)
-                f2,g2 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed()[i]
-                self._set_params_transformed(x)
-                gradient = self._log_likelihood_gradients_transformed()[i]
-
+                f2, g2 = self.objective_and_gradients(xx)
+                gradient = self.objective_function_gradients(x)[i]

                numerical_gradient = (f1-f2)/(2*step)
                ratio = (f1-f2)/(2*step*gradient)
--- a/GPy/core/parameterised.py
+++ b/GPy/core/parameterised.py
@ -56,7 +56,7 @@ class parameterised(object):
        return copy.deepcopy(self)


-    def tie_param(self, which):
+    def tie_params(self, which):
        matches = self.grep_param_names(which)
        assert matches.size > 0, "need at least something to tie together"
        if len(self.tied_indices):
--- a/GPy/examples/BGPLVM_demo.py
+++ b/GPy/examples/BGPLVM_demo.py
@ -1,37 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import numpy as np
-import pylab as pb
-import GPy
-np.random.seed(123344)
-
-N = 10
-M = 3
-Q = 2
-D = 4
-#generate GPLVM-like data
-X = np.random.rand(N, Q)
-k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
-K = k.K(X)
-Y = np.random.multivariate_normal(np.zeros(N),K,D).T
-
-k = GPy.kern.linear(Q, ARD = True) + GPy.kern.white(Q)
-# k = GPy.kern.rbf(Q) + GPy.kern.rbf(Q) + GPy.kern.white(Q)
-# k = GPy.kern.rbf(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
-# k = GPy.kern.rbf(Q, ARD = False)  + GPy.kern.white(Q, 0.00001)
-
-m = GPy.models.Bayesian_GPLVM(Y, Q, kernel = k,  M=M)
-m.constrain_positive('(rbf|bias|noise|white|S)')
-# m.constrain_fixed('S', 1)
-
-# pb.figure()
-# m.plot()
-# pb.title('PCA initialisation')
-# pb.figure()
-# m.optimize(messages = 1)
-# m.plot()
-# pb.title('After optimisation')
-m.ensure_default_constraints()
-m.randomize()
-m.checkgrad(verbose = 1)
--- a/GPy/examples/init.py
+++ b/GPy/examples/init.py
@ -1,8 +1,8 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-# Please don't delete this without explaining to Neil the right way of doing this. I want to be able to run:
-# GPy.examples.regression.toy_rbf_1D() from ipython having imported GPy, and this seems to be the way to do it!
 import classification
 import regression
-import unsupervised
+import dimensionality_reduction
+import non_gaussian
+import tutorials
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@ -10,8 +10,7 @@ import numpy as np
 import GPy

 default_seed=10000
-
-def crescent_data(model_type='Full', inducing=10, seed=default_seed): #FIXME
+def crescent_data(seed=default_seed): #FIXME
    """Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.

    :param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
@ -31,11 +30,8 @@ def crescent_data(model_type='Full', inducing=10, seed=default_seed): #FIXME
    likelihood = GPy.likelihoods.EP(data['Y'],distribution)


-    if model_type=='Full':
-        m = GPy.models.GP(data['X'],likelihood,kernel)
-    else:
-        # create sparse GP EP model
-        m = GPy.models.sparse_GP_EP(data['X'],likelihood=likelihood,inducing=inducing,ep_proxy=model_type)
+    m = GPy.models.GP(data['X'],likelihood,kernel)
+    m.ensure_default_constraints()

    m.update_likelihood_approximation()
    print(m)
@ -65,7 +61,7 @@ def oil():

    # Contrain all parameters to be positive
    m.constrain_positive('')
-    m.tie_param('lengthscale')
+    m.tie_params('lengthscale')
    m.update_likelihood_approximation()

    # Optimize
@ -94,16 +90,13 @@ def toy_linear_1d_classification(seed=default_seed):

    # Model definition
    m = GPy.models.GP(data['X'],likelihood=likelihood,kernel=kernel)
+    m.ensure_default_constraints()

    # Optimize
-    """
-    EPEM runs a loop that consists of two steps:
-    1) EP likelihood approximation:
-        m.update_likelihood_approximation()
-    2) Parameters optimization:
-        m.optimize()
-    """
-    m.EPEM()
+    m.update_likelihood_approximation()
+    # Parameters optimization:
+    m.optimize()
+    #m.EPEM() #FIXME

    # Plot
    pb.subplot(211)
@ -113,3 +106,81 @@ def toy_linear_1d_classification(seed=default_seed):
    print(m)

    return m
+
+def sparse_toy_linear_1d_classification(seed=default_seed):
+    """
+    Simple 1D classification example
+    :param seed : seed value for data generation (default is 4).
+    :type seed: int
+    """
+
+    data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
+    Y = data['Y'][:, 0:1]
+    Y[Y == -1] = 0
+
+    # Kernel object
+    kernel = GPy.kern.rbf(1) + GPy.kern.white(1)
+
+    # Likelihood object
+    distribution = GPy.likelihoods.likelihood_functions.probit()
+    likelihood = GPy.likelihoods.EP(Y,distribution)
+
+    Z = np.random.uniform(data['X'].min(),data['X'].max(),(10,1))
+
+    # Model definition
+    m = GPy.models.sparse_GP(data['X'],likelihood=likelihood,kernel=kernel,Z=Z,normalize_X=False)
+    m.set('len',2.)
+
+    m.ensure_default_constraints()
+    # Optimize
+    m.update_likelihood_approximation()
+    # Parameters optimization:
+    m.optimize()
+    #m.EPEM() #FIXME
+
+    # Plot
+    pb.subplot(211)
+    m.plot_f()
+    pb.subplot(212)
+    m.plot()
+    print(m)
+
+    return m
+
+def sparse_crescent_data(inducing=10, seed=default_seed):
+    """Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
+
+    :param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
+    :param seed : seed value for data generation.
+    :type seed: int
+    :param inducing : number of inducing variables (only used for 'FITC' or 'DTC').
+    :type inducing: int
+    """
+
+    data = GPy.util.datasets.crescent_data(seed=seed)
+
+    # Kernel object
+    kernel = GPy.kern.rbf(data['X'].shape[1]) + GPy.kern.white(data['X'].shape[1])
+
+    # Likelihood object
+    distribution = GPy.likelihoods.likelihood_functions.probit()
+    likelihood = GPy.likelihoods.EP(data['Y'],distribution)
+
+    sample = np.random.randint(0,data['X'].shape[0],inducing)
+    Z = data['X'][sample,:]
+    #Z = (np.random.random_sample(2*inducing)*(data['X'].max()-data['X'].min())+data['X'].min()).reshape(inducing,-1)
+
+    # create sparse GP EP model
+    m = GPy.models.sparse_GP(data['X'],likelihood=likelihood,kernel=kernel,Z=Z)
+    m.ensure_default_constraints()
+    m.set('len',10.)
+
+    m.update_likelihood_approximation()
+
+    # optimize
+    m.optimize()
+    print(m)
+
+    # plot
+    m.plot()
+    return m
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@ -0,0 +1,57 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+import pylab as pb
+import GPy
+
+default_seed = np.random.seed(123344)
+
+def BGPLVM(seed = default_seed):
+    N = 10
+    M = 3
+    Q = 2
+    D = 4
+    #generate GPLVM-like data
+    X = np.random.rand(N, Q)
+    k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
+    K = k.K(X)
+    Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+
+    k = GPy.kern.linear(Q, ARD = True) + GPy.kern.white(Q)
+    # k = GPy.kern.rbf(Q) + GPy.kern.rbf(Q) + GPy.kern.white(Q)
+    # k = GPy.kern.rbf(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
+    # k = GPy.kern.rbf(Q, ARD = False)  + GPy.kern.white(Q, 0.00001)
+
+    m = GPy.models.Bayesian_GPLVM(Y, Q, kernel = k,  M=M)
+    m.constrain_positive('(rbf|bias|noise|white|S)')
+    # m.constrain_fixed('S', 1)
+
+    # pb.figure()
+    # m.plot()
+    # pb.title('PCA initialisation')
+    # pb.figure()
+    # m.optimize(messages = 1)
+    # m.plot()
+    # pb.title('After optimisation')
+    m.ensure_default_constraints()
+    m.randomize()
+    m.checkgrad(verbose = 1)
+
+    return m
+
+def GPLVM_oil_100():
+    data = GPy.util.datasets.oil_100()
+
+    # create simple GP model
+    kernel = GPy.kern.rbf(6, ARD = True) + GPy.kern.bias(6)
+    m = GPy.models.GPLVM(data['X'], 6, kernel = kernel)
+
+    # optimize
+    m.ensure_default_constraints()
+    m.optimize(messages=1)
+
+    # plot
+    print(m)
+    m.plot_latent(labels=data['Y'].argmax(axis=1))
+    return m
--- a/GPy/examples/non_gaussian.py
+++ b/GPy/examples/non_gaussian.py
@ -11,7 +11,7 @@ import GPy

 default_seed=10000

-def  toy_1d(seed=default_seed):
+def  toy_poisson_1d(seed=default_seed):
    """
    Simple 1D classification example
    :param seed : seed value for data generation (default is 4).
--- a/GPy/examples/oil_flow_demo.py
+++ b/GPy/examples/oil_flow_demo.py
@ -1,57 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import cPickle as pickle
-import numpy as np
-import pylab as pb
-import GPy
-import pylab as plt
-np.random.seed(3)
-
-def plot_oil(X, theta, labels, label):
-    plt.figure()
-    X = X[:,np.argsort(theta)[:2]]
-    flow_type = (X[labels[:,0]==1])
-    plt.plot(flow_type[:,0], flow_type[:,1], 'rx')
-    flow_type = (X[labels[:,1]==1])
-    plt.plot(flow_type[:,0], flow_type[:,1], 'gx')
-    flow_type = (X[labels[:,2]==1])
-    plt.plot(flow_type[:,0], flow_type[:,1], 'bx')
-    plt.title(label)
-
-data = pickle.load(open('../../../GPy_assembla/datasets/oil_flow_3classes.pickle', 'r'))
-
-Y = data['DataTrn']
-N, D = Y.shape
-selected = np.random.permutation(N)[:350]
-labels = data['DataTrnLbls'][selected]
-Y = Y[selected]
-N, D = Y.shape
-Y -= Y.mean(axis=0)
-# Y /= Y.std(axis=0)
-
-Q = 5
-k = GPy.kern.linear(Q, ARD = True) + GPy.kern.white(Q)
-m = GPy.models.Bayesian_GPLVM(Y, Q, kernel = k, M = 20)
-m.constrain_positive('(rbf|bias|S|linear|white|noise)')
-
-# m.unconstrain('noise')
-# m.constrain_fixed('noise_precision', 50.0)
-# m.unconstrain('white')
-# m.constrain_bounded('white', 1e-6, 10.0)
-# plot_oil(m.X, np.array([1,1]), labels, 'PCA initialization')
-m.optimize(messages = True)
-# m.optimize('tnc', messages = True)
-# plot_oil(m.X, m.kern.parts[0].lengthscale, labels, 'B-GPLVM')
-# # pb.figure()
-# m.plot()
-# pb.title('PCA initialisation')
-# pb.figure()
-# m.optimize(messages = 1)
-# m.plot()
-# pb.title('After optimisation')
-# m = GPy.models.GPLVM(Y, Q)
-# m.constrain_positive('(white|rbf|bias|noise)')
-# m.optimize()
-# plot_oil(m.X, np.array([1,1]), labels, 'GPLVM')
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@ -41,10 +41,6 @@ def rogers_girolami_olympics():
    print(m)
    return m

-def della_gatta_TRP63_gene_expression(number=942):
-    """Run a standard Gaussian process regression on the della Gatta et al TRP63 Gene Expression data set for a given gene number."""
-
-
 def toy_rbf_1d_50():
    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
    data = GPy.util.datasets.toy_rbf_1d_50()
@ -75,6 +71,113 @@ def silhouette():
    print(m)
    return m

+def coregionalisation_toy2():
+    """
+    A simple demonstration of coregionalisation on two sinusoidal functions
+    """
+    X1 = np.random.rand(50,1)*8
+    X2 = np.random.rand(30,1)*5
+    index = np.vstack((np.zeros_like(X1),np.ones_like(X2)))
+    X = np.hstack((np.vstack((X1,X2)),index))
+    Y1 = np.sin(X1) + np.random.randn(*X1.shape)*0.05
+    Y2 = np.sin(X2) + np.random.randn(*X2.shape)*0.05 + 2.
+    Y = np.vstack((Y1,Y2))
+
+    k1 = GPy.kern.rbf(1) + GPy.kern.bias(1)
+    k2 = GPy.kern.coregionalise(2,1)
+    k = k1.prod_orthogonal(k2)
+    m = GPy.models.GP_regression(X,Y,kernel=k)
+    m.constrain_fixed('rbf_var',1.)
+    m.constrain_positive('kappa')
+    m.ensure_default_constraints()
+    m.optimize('sim',max_f_eval=5000,messages=1)
+    #m.optimize()
+
+    pb.figure()
+    Xtest1 = np.hstack((np.linspace(0,9,100)[:,None],np.zeros((100,1))))
+    Xtest2 = np.hstack((np.linspace(0,9,100)[:,None],np.ones((100,1))))
+    mean, var,low,up = m.predict(Xtest1)
+    GPy.util.plot.gpplot(Xtest1[:,0],mean,low,up)
+    mean, var,low,up = m.predict(Xtest2)
+    GPy.util.plot.gpplot(Xtest2[:,0],mean,low,up)
+    pb.plot(X1[:,0],Y1[:,0],'rx',mew=2)
+    pb.plot(X2[:,0],Y2[:,0],'gx',mew=2)
+    return m
+
+def coregionalisation_toy():
+    """
+    A simple demonstration of coregionalisation on two sinusoidal functions
+    """
+    X1 = np.random.rand(50,1)*8
+    X2 = np.random.rand(30,1)*5
+    index = np.vstack((np.zeros_like(X1),np.ones_like(X2)))
+    X = np.hstack((np.vstack((X1,X2)),index))
+    Y1 = np.sin(X1) + np.random.randn(*X1.shape)*0.05
+    Y2 = -np.sin(X2) + np.random.randn(*X2.shape)*0.05
+    Y = np.vstack((Y1,Y2))
+
+    k1 = GPy.kern.rbf(1)
+    k2 = GPy.kern.coregionalise(2,2)
+    k = k1.prod_orthogonal(k2)
+    m = GPy.models.GP_regression(X,Y,kernel=k)
+    m.constrain_fixed('rbf_var',1.)
+    m.constrain_positive('kappa')
+    m.ensure_default_constraints()
+    m.optimize()
+
+    pb.figure()
+    Xtest1 = np.hstack((np.linspace(0,9,100)[:,None],np.zeros((100,1))))
+    Xtest2 = np.hstack((np.linspace(0,9,100)[:,None],np.ones((100,1))))
+    mean, var,low,up = m.predict(Xtest1)
+    GPy.util.plot.gpplot(Xtest1[:,0],mean,low,up)
+    mean, var,low,up = m.predict(Xtest2)
+    GPy.util.plot.gpplot(Xtest2[:,0],mean,low,up)
+    pb.plot(X1[:,0],Y1[:,0],'rx',mew=2)
+    pb.plot(X2[:,0],Y2[:,0],'gx',mew=2)
+    return m
+
+
+def coregionalisation_sparse():
+    """
+    A simple demonstration of coregionalisation on two sinusoidal functions
+    """
+    X1 = np.random.rand(500,1)*8
+    X2 = np.random.rand(300,1)*5
+    index = np.vstack((np.zeros_like(X1),np.ones_like(X2)))
+    X = np.hstack((np.vstack((X1,X2)),index))
+    Y1 = np.sin(X1) + np.random.randn(*X1.shape)*0.05
+    Y2 = -np.sin(X2) + np.random.randn(*X2.shape)*0.05
+    Y = np.vstack((Y1,Y2))
+
+    M = 40
+    Z = np.hstack((np.random.rand(M,1)*8,np.random.randint(0,2,M)[:,None]))
+
+    k1 = GPy.kern.rbf(1)
+    k2 = GPy.kern.coregionalise(2,2)
+    k = k1.prod_orthogonal(k2) + GPy.kern.white(2,0.001)
+
+    m = GPy.models.sparse_GP_regression(X,Y,kernel=k,Z=Z)
+    m.scale_factor = 10000.
+    m.constrain_fixed('rbf_var',1.)
+    m.constrain_positive('kappa')
+    m.constrain_fixed('iip')
+    m.ensure_default_constraints()
+    m.optimize_restarts(5,robust=True,messages=1)
+
+    pb.figure()
+    Xtest1 = np.hstack((np.linspace(0,9,100)[:,None],np.zeros((100,1))))
+    Xtest2 = np.hstack((np.linspace(0,9,100)[:,None],np.ones((100,1))))
+    mean, var,low,up = m.predict(Xtest1)
+    GPy.util.plot.gpplot(Xtest1[:,0],mean,low,up)
+    mean, var,low,up = m.predict(Xtest2)
+    GPy.util.plot.gpplot(Xtest2[:,0],mean,low,up)
+    pb.plot(X1[:,0],Y1[:,0],'rx',mew=2)
+    pb.plot(X2[:,0],Y2[:,0],'gx',mew=2)
+    y = pb.ylim()[0]
+    pb.plot(Z[:,0][Z[:,1]==0],np.zeros(np.sum(Z[:,1]==0))+y,'r|',mew=2)
+    pb.plot(Z[:,0][Z[:,1]==1],np.zeros(np.sum(Z[:,1]==1))+y,'g|',mew=2)
+    return m
+

 def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000):
    """Show an example of a multimodal error surface for Gaussian process regression. Gene 939 has bimodal behaviour where the noisey mode is higher."""
@ -91,7 +194,7 @@ def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000
    # Remove the mean (no bias kernel to ensure signal/noise is in RBF/white)
    data['Y'] = data['Y'] - np.mean(data['Y'])

-    lls = GPy.examples.regression.contour_data(data, length_scales, log_SNRs, GPy.kern.rbf)
+    lls = GPy.examples.regression._contour_data(data, length_scales, log_SNRs, GPy.kern.rbf)
    pb.contour(length_scales, log_SNRs, np.exp(lls), 20)
    ax = pb.gca()
    pb.xlabel('length scale')
@ -99,7 +202,7 @@ def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000

    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
-    
+
    # Now run a few optimizations
    models = []
    optim_point_x = np.empty(2)
@ -107,18 +210,18 @@ def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000
    np.random.seed(seed=seed)
    for i in range(0, model_restarts):
        kern = GPy.kern.rbf(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.)) + GPy.kern.white(1,variance=np.random.exponential(1.))
-        
+
        m = GPy.models.GP_regression(data['X'],data['Y'], kernel=kern)
        optim_point_x[0] = m.get('rbf_lengthscale')
        optim_point_y[0] = np.log10(m.get('rbf_variance')) - np.log10(m.get('white_variance'));
-        
+
        # optimize
        m.ensure_default_constraints()
        m.optimize(xtol=1e-6,ftol=1e-6)

        optim_point_x[1] = m.get('rbf_lengthscale')
        optim_point_y[1] = np.log10(m.get('rbf_variance')) - np.log10(m.get('white_variance'));
-        
+
        pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1]-optim_point_x[0], optim_point_y[1]-optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k')
        models.append(m)

@ -126,7 +229,7 @@ def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000
    ax.set_ylim(ylim)
    return (models, lls)

-def contour_data(data, length_scales, log_SNRs, signal_kernel_call=GPy.kern.rbf):
+def _contour_data(data, length_scales, log_SNRs, signal_kernel_call=GPy.kern.rbf):
    """Evaluate the GP objective function for a given data set for a range of signal to noise ratios and a range of lengthscales.

    :data_set: A data set from the utils.datasets director.
@ -152,7 +255,7 @@ def contour_data(data, length_scales, log_SNRs, signal_kernel_call=GPy.kern.rbf)
            total_var = (np.dot(np.dot(data['Y'].T,GPy.util.linalg.pdinv(K)[0]), data['Y'])/data['Y'].shape[0])[0,0]
            noise_var *= total_var
            signal_var *= total_var
-            
+
            kernel = signal_kernel_call(1, variance=signal_var, lengthscale=length_scale) + GPy.kern.white(1, variance=noise_var)

            model = GPy.models.GP_regression(data['X'], data['Y'], kernel=kernel)
@ -160,3 +263,71 @@ def contour_data(data, length_scales, log_SNRs, signal_kernel_call=GPy.kern.rbf)
            length_scale_lls.append(model.log_likelihood())
        lls.append(length_scale_lls)
    return np.array(lls)
+
+def sparse_GP_regression_1D(N = 400, M = 5):
+    """Run a 1D example of a sparse GP regression."""
+    # sample inputs and outputs
+    X = np.random.uniform(-3.,3.,(N,1))
+    Y = np.sin(X)+np.random.randn(N,1)*0.05
+    # construct kernel
+    rbf =  GPy.kern.rbf(1)
+    noise = GPy.kern.white(1)
+    kernel = rbf + noise
+    # create simple GP model
+    m = GPy.models.sparse_GP_regression(X, Y, kernel, M=M)
+
+    m.constrain_positive('(variance|lengthscale|precision)')
+
+    m.checkgrad(verbose=1)
+    m.optimize('tnc', messages = 1)
+    m.plot()
+    return m
+
+def sparse_GP_regression_2D(N = 400, M = 50):
+    """Run a 2D example of a sparse GP regression."""
+    X = np.random.uniform(-3.,3.,(N,2))
+    Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(N,1)*0.05
+
+    # construct kernel
+    rbf =  GPy.kern.rbf(2)
+    noise = GPy.kern.white(2)
+    kernel = rbf + noise
+
+    # create simple GP model
+    m = GPy.models.sparse_GP_regression(X,Y,kernel, M = M)
+
+    # contrain all parameters to be positive (but not inducing inputs)
+    m.constrain_positive('(variance|lengthscale|precision)')
+    m.set('len',2.)
+
+    m.checkgrad()
+
+    # optimize and plot
+    pb.figure()
+    m.optimize('tnc', messages = 1)
+    m.plot()
+    print(m)
+    return m
+
+def uncertain_inputs_sparse_regression():
+    """Run a 1D example of a sparse GP regression with uncertain inputs."""
+    # sample inputs and outputs
+    S = np.ones((20,1))
+    X = np.random.uniform(-3.,3.,(20,1))
+    Y = np.sin(X)+np.random.randn(20,1)*0.05
+    likelihood = GPy.likelihoods.Gaussian(Y)
+    Z = np.random.uniform(-3.,3.,(7,1))
+
+    k = GPy.kern.rbf(1) + GPy.kern.white(1)
+
+    # create simple GP model
+    m = GPy.models.sparse_GP(X, likelihood, kernel=k, Z=Z, X_uncertainty=S)
+
+    # contrain all parameters to be positive
+    m.constrain_positive('(variance|prec)')
+
+    # optimize and plot
+    m.optimize('tnc', max_f_eval = 1000, messages=1)
+    m.plot()
+    print(m)
+    return m
--- a/GPy/examples/sparse_GPLVM_demo.py
+++ b/GPy/examples/sparse_GPLVM_demo.py
@ -1,30 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-import pylab as pb
-import GPy
-np.random.seed(1)
-print "sparse GPLVM with RBF kernel"
-
-N = 100
-M = 8
-Q = 1
-D = 2
-#generate GPLVM-like data
-X = np.random.rand(N, Q)
-k = GPy.kern.rbf(Q, 1.0, 2.0) + GPy.kern.white(Q, 0.00001)
-K = k.K(X)
-Y = np.random.multivariate_normal(np.zeros(N),K,D).T
-
-m = GPy.models.sparse_GPLVM(Y, Q, M=M)
-m.constrain_positive('(rbf|bias|noise|white)')
-
-pb.figure()
-m.plot()
-pb.title('PCA initialisation')
-pb.figure()
-m.optimize(messages = 1)
-m.plot()
-pb.title('After optimisation')
--- a/GPy/examples/sparse_GP_regression_demo.py
+++ b/GPy/examples/sparse_GP_regression_demo.py
@ -1,64 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-"""
-Sparse Gaussian Processes regression with an RBF kernel
-"""
-import pylab as pb
-import numpy as np
-import GPy
-np.random.seed(2)
-pb.ion()
-N = 400
-M = 5
-
-######################################
-## 1 dimensional example
-
-# sample inputs and outputs
-X = np.random.uniform(-3.,3.,(N,1))
-Y = np.sin(X)+np.random.randn(N,1)*0.05
-
-# construct kernel
-rbf =  GPy.kern.rbf(1)
-noise = GPy.kern.white(1)
-kernel = rbf + noise
-
-# create simple GP model
-m = GPy.models.sparse_GP_regression(X, Y, kernel, M=M)
-
-m.constrain_positive('(variance|lengthscale|precision)')
-
-m.checkgrad(verbose=1)
-m.optimize('tnc', messages = 1)
-m.plot()
-
-######################################
-## 2 dimensional example
-
-# # sample inputs and outputs
-# X = np.random.uniform(-3.,3.,(N,2))
-# Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(N,1)*0.05
-
-# # construct kernel
-# rbf =  GPy.kern.rbf(2)
-# noise = GPy.kern.white(2)
-# kernel = rbf + noise
-
-# # create simple GP model
-# m2 = GPy.models.sparse_GP_regression(X,Y,kernel, M = 50)
-# create simple GP model
-
-# # contrain all parameters to be positive (but not inducing inputs)
-# m2.constrain_positive('(variance|lengthscale|precision)')
-
-# #check gradient FIXME unit test please
-# m2.checkgrad()
-
-# # optimize and plot
-# pb.figure()
-# m2.optimize('tnc', messages = 1)
-# m2.plot()
-# print(m2)
--- a/GPy/examples/sparse_ep_fix.py
+++ b/GPy/examples/sparse_ep_fix.py
@ -1,60 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-"""
-Sparse Gaussian Processes regression with an RBF kernel
-"""
-import pylab as pb
-import numpy as np
-import GPy
-np.random.seed(2)
-pb.ion()
-N = 500
-M = 5
-
-pb.close('all')
-######################################
-## 1 dimensional example
-
-# sample inputs and outputs
-X = np.random.uniform(-3.,3.,(N,1))
-#Y = np.sin(X)+np.random.randn(N,1)*0.05
-F = np.sin(X)+np.random.randn(N,1)*0.05
-Y = np.ones([F.shape[0],1])
-Y[F<0] = -1
-likelihood = GPy.inference.likelihoods.probit(Y)
-
-# construct kernel
-rbf =  GPy.kern.rbf(1)
-noise = GPy.kern.white(1)
-kernel = rbf + noise
-
-# create simple GP model
-#m = GPy.models.sparse_GP(X,Y=None, kernel=kernel, M=M,likelihood= likelihood)
-
-# contrain all parameters to be positive
-#m.constrain_fixed('prec',100.)
-m = GPy.models.sparse_GP(X, Y, kernel, M=M)
-m.ensure_default_constraints()
-#if not isinstance(m.likelihood,GPy.inference.likelihoods.gaussian):
-#    m.approximate_likelihood()
-print m.checkgrad()
-m.optimize('tnc', messages = 1)
-m.plot(samples=3)
-print m
-
-n = GPy.models.sparse_GP(X,Y=None, kernel=kernel, M=M,likelihood= likelihood)
-n.ensure_default_constraints()
-if not isinstance(n.likelihood,GPy.inference.likelihoods.gaussian):
-    n.approximate_likelihood()
-print n.checkgrad()
-pb.figure()
-n.plot()
-
-"""
-m = GPy.models.sparse_GP_regression(X, Y, kernel, M=M)
-m.ensure_default_constraints()
-print m.checkgrad()
-"""
--- a/GPy/examples/tutorials.py
+++ b/GPy/examples/tutorials.py
@ -0,0 +1,200 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+"""
+Code of Tutorials
+"""
+
+import pylab as pb
+pb.ion()
+import numpy as np
+import GPy
+
+def tuto_GP_regression():
+    """The detailed explanations of the commands used in this file can be found in the tutorial section"""
+
+    X = np.random.uniform(-3.,3.,(20,1))
+    Y = np.sin(X) + np.random.randn(20,1)*0.05
+
+    kernel = GPy.kern.rbf(D=1, variance=1., lengthscale=1.)
+
+    m = GPy.models.GP_regression(X,Y,kernel)
+
+    print m
+    m.plot()
+
+    m.constrain_positive('')
+
+    m.unconstrain('')                            # Required to remove the previous constrains
+    m.constrain_positive('rbf_variance')
+    m.constrain_bounded('lengthscale',1.,10. )
+    m.constrain_fixed('noise',0.0025)
+
+    m.optimize()
+
+    m.optimize_restarts(Nrestarts = 10)
+
+    ###########################
+    #  2-dimensional example  #
+    ###########################
+
+    # sample inputs and outputs
+    X = np.random.uniform(-3.,3.,(50,2))
+    Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(50,1)*0.05
+
+    # define kernel
+    ker = GPy.kern.Matern52(2,ARD=True) + GPy.kern.white(2)
+
+    # create simple GP model
+    m = GPy.models.GP_regression(X,Y,ker)
+
+    # contrain all parameters to be positive
+    m.constrain_positive('')
+
+    # optimize and plot
+    pb.figure()
+    m.optimize('tnc', max_f_eval = 1000)
+
+    m.plot()
+    print(m)
+
+
+def tuto_kernel_overview():
+    """The detailed explanations of the commands used in this file can be found in the tutorial section"""
+    pb.ion()
+
+    ker1 = GPy.kern.rbf(1)  # Equivalent to ker1 = GPy.kern.rbf(D=1, variance=1., lengthscale=1.)
+    ker2 = GPy.kern.rbf(D=1, variance = .75, lengthscale=2.)
+    ker3 = GPy.kern.rbf(1, .5, .5)
+
+    print ker2
+    ker1.plot()
+    ker2.plot()
+    ker3.plot()
+
+    k1 = GPy.kern.rbf(1,1.,2.)
+    k2 = GPy.kern.Matern32(1, 0.5, 0.2)
+
+    # Product of kernels
+    k_prod = k1.prod(k2)
+    k_prodorth = k1.prod_orthogonal(k2)
+
+    # Sum of kernels
+    k_add = k1.add(k2)
+    k_addorth = k1.add_orthogonal(k2)
+
+    pb.figure(figsize=(8,8))
+    pb.subplot(2,2,1)
+    k_prod.plot()
+    pb.title('prod')
+    pb.subplot(2,2,2)
+    k_prodorth.plot()
+    pb.title('prod_orthogonal')
+    pb.subplot(2,2,3)
+    k_add.plot()
+    pb.title('add')
+    pb.subplot(2,2,4)
+    k_addorth.plot()
+    pb.title('add_orthogonal')
+    pb.subplots_adjust(wspace=0.3, hspace=0.3)
+
+    k1 = GPy.kern.rbf(1,1.,2)
+    k2 = GPy.kern.periodic_Matern52(1,variance=1e3, lengthscale=1, period = 1.5, lower=-5., upper = 5)
+
+    k = k1 * k2  # equivalent to k = k1.prod(k2)
+    print k
+
+    # Simulate sample paths
+    X = np.linspace(-5,5,501)[:,None]
+    Y = np.random.multivariate_normal(np.zeros(501),k.K(X),1)
+
+    # plot
+    pb.figure(figsize=(10,4))
+    pb.subplot(1,2,1)
+    k.plot()
+    pb.subplot(1,2,2)
+    pb.plot(X,Y.T)
+    pb.ylabel("Sample path")
+    pb.subplots_adjust(wspace=0.3)
+
+    k = (k1+k2)*(k1+k2)
+    print k.parts[0].name, '\n', k.parts[1].name, '\n', k.parts[2].name, '\n', k.parts[3].name
+
+    k1 = GPy.kern.rbf(1)
+    k2 = GPy.kern.Matern32(1)
+    k3 = GPy.kern.white(1)
+
+    k = k1 + k2 + k3
+    print k
+
+    k.constrain_positive('var')
+    k.constrain_fixed(np.array([1]),1.75)
+    k.tie_params('len')
+    k.unconstrain('white')
+    k.constrain_bounded('white',lower=1e-5,upper=.5)
+    print k
+
+    k_cst = GPy.kern.bias(1,variance=1.)
+    k_mat = GPy.kern.Matern52(1,variance=1., lengthscale=3)
+    Kanova = (k_cst + k_mat).prod_orthogonal(k_cst + k_mat)
+    print Kanova
+
+    # sample inputs and outputs
+    X = np.random.uniform(-3.,3.,(40,2))
+    Y = 0.5*X[:,:1] + 0.5*X[:,1:] + 2*np.sin(X[:,:1]) * np.sin(X[:,1:])
+
+    # Create GP regression model
+    m = GPy.models.GP_regression(X,Y,Kanova)
+    pb.figure(figsize=(5,5))
+    m.plot()
+
+    pb.figure(figsize=(20,3))
+    pb.subplots_adjust(wspace=0.5)
+    pb.subplot(1,5,1)
+    m.plot()
+    pb.subplot(1,5,2)
+    pb.ylabel("=   ",rotation='horizontal',fontsize='30')
+    pb.subplot(1,5,3)
+    m.plot(which_functions=[False,True,False,False])
+    pb.ylabel("cst          +",rotation='horizontal',fontsize='30')
+    pb.subplot(1,5,4)
+    m.plot(which_functions=[False,False,True,False])
+    pb.ylabel("+   ",rotation='horizontal',fontsize='30')
+    pb.subplot(1,5,5)
+    pb.ylabel("+   ",rotation='horizontal',fontsize='30')
+    m.plot(which_functions=[False,False,False,True])
+
+    ker1 = GPy.kern.rbf(D=1)  # Equivalent to ker1 = GPy.kern.rbf(D=1, variance=1., lengthscale=1.)
+    ker2 = GPy.kern.rbf(D=1, variance = .75, lengthscale=3.)
+    ker3 = GPy.kern.rbf(1, .5, .25)
+
+    ker1.plot()
+    ker2.plot()
+    ker3.plot()
+    #pb.savefig("Figures/tuto_kern_overview_basicdef.png")
+
+    kernels = [GPy.kern.rbf(1), GPy.kern.exponential(1), GPy.kern.Matern32(1), GPy.kern.Matern52(1),  GPy.kern.Brownian(1), GPy.kern.bias(1), GPy.kern.linear(1), GPy.kern.spline(1), GPy.kern.periodic_exponential(1), GPy.kern.periodic_Matern32(1), GPy.kern.periodic_Matern52(1), GPy.kern.white(1)]
+    kernel_names = ["GPy.kern.rbf", "GPy.kern.exponential", "GPy.kern.Matern32", "GPy.kern.Matern52", "GPy.kern.Brownian", "GPy.kern.bias", "GPy.kern.linear", "GPy.kern.spline", "GPy.kern.periodic_exponential", "GPy.kern.periodic_Matern32", "GPy.kern.periodic_Matern52", "GPy.kern.white"]
+
+    pb.figure(figsize=(16,12))
+    pb.subplots_adjust(wspace=.5, hspace=.5)
+    for i, kern in enumerate(kernels):
+       pb.subplot(3,4,i+1)
+       kern.plot(x=7.5,plot_limits=[0.00001,15.])
+       pb.title(kernel_names[i]+ '\n')
+
+    # actual plot for the noise
+    i = 11
+    X = np.linspace(0.,15.,201)
+    WN = 0*X
+    WN[100] = 1.
+    pb.subplot(3,4,i+1)
+    pb.plot(X,WN,'b')
+
+def model_interaction():
+    X = np.random.randn(20,1)
+    Y = np.sin(X) + np.random.randn(*X.shape)*0.01 + 5.
+    k = GPy.kern.rbf(1) + GPy.kern.bias(1)
+    return GPy.models.GP_regression(X,Y,kernel=k)
+
--- a/GPy/examples/uncertain_input_GP_regression_demo.py
+++ b/GPy/examples/uncertain_input_GP_regression_demo.py
@ -1,27 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import pylab as pb
-import numpy as np
-import GPy
-pb.ion()
-pb.close('all')
-
-
-# sample inputs and outputs
-S = np.ones((20,1))
-X = np.random.uniform(-3.,3.,(20,1))
-Y = np.sin(X)+np.random.randn(20,1)*0.05
-
-k = GPy.kern.rbf(1) + GPy.kern.white(1)
-
-# create simple GP model
-m = GPy.models.sparse_GP_regression(X,Y,X_uncertainty=S,kernel=k)
-
-# contrain all parameters to be positive
-m.constrain_positive('(variance|prec)')
-
-# optimize and plot
-m.optimize('tnc', max_f_eval = 1000, messages=1)
-m.plot()
-print(m)
--- a/GPy/examples/uncollapsed_GP_demo.py
+++ b/GPy/examples/uncollapsed_GP_demo.py
@ -1,32 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-"""
-Sparse Gaussian Processes regression with an RBF kernel, 
-using the uncollapsed sparse GP (where the distribution of the 
-inducing points is explicitley represented)
-"""
-import pylab as pb
-import numpy as np
-import GPy
-np.random.seed(2)
-pb.ion()
-N = 500
-M = 20
-
-# sample inputs and outputs
-X = np.random.uniform(-3.,3.,(N,1))
-Y = np.sin(X)+np.random.randn(N,1)*0.05
-
-kernel = GPy.kern.rbf(1) + GPy.kern.white(1)
-
-# create simple GP model
-m = GPy.models.uncollapsed_sparse_GP(X, Y, kernel=kernel, M=M)#, X_uncertainty=np.zeros_like(X)+0.01)
-
-# contrain all parameters to be positive
-m.ensure_default_constraints()
-m.checkgrad()
-# optimize and plot
-m.plot()
--- a/GPy/examples/unsupervised.py
+++ b/GPy/examples/unsupervised.py
@ -1,25 +0,0 @@
-"""
-Usupervised learning with Gaussian Processes.
-"""
-import pylab as pb
-import numpy as np
-import GPy
-
-
-######################################
-## Oil data subsampled to 100 points.
-def oil_100():
-    data = GPy.util.datasets.oil_100()
-
-    # create simple GP model
-    m = GPy.models.GPLVM(data['X'], 2)
-
-
-    # optimize
-    m.ensure_default_constraints()
-    m.optimize()
-
-    # plot
-    print(m)
-    return m
-
--- a/GPy/inference/SCG.py
+++ b/GPy/inference/SCG.py
@ -0,0 +1,141 @@
+#Copyright I. Nabney, N.Lawrence and James Hensman (1996 - 2012)
+
+#Scaled Conjuagte Gradients, originally in Matlab as part of the Netlab toolbox by I. Nabney, converted to python N. Lawrence and given a pythonic interface by James Hensman
+
+#      THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
+#      HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+#      EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
+#      NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+#      MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+#      PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+#      REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
+#      DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+#      EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+#      (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+#      OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#      DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+#      HOWEVER CAUSED AND ON ANY THEORY OF
+#      LIABILITY, WHETHER IN CONTRACT, STRICT
+#      LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+#      OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#      OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+#      POSSIBILITY OF SUCH DAMAGE.
+
+
+import numpy as np
+
+def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xtol=1e-6, ftol=1e-6):
+    """
+    Optimisation through Scaled Conjugate Gradients (SCG)
+
+    f: the objective function
+    gradf : the gradient function (should return a 1D np.ndarray)
+    x : the initial condition
+
+    Returns
+    x the optimal value for x
+    flog : a list of all the objective values
+
+    """
+
+    sigma0 = 1.0e-4
+    fold = f(x, *optargs)	# Initial function value.
+    function_eval = 1
+    fnow = fold
+    gradnew = gradf(x, *optargs)	# Initial gradient.
+    gradold = gradnew.copy()
+    d = -gradnew				# Initial search direction.
+    success = True				# Force calculation of directional derivs.
+    nsuccess = 0				# nsuccess counts number of successes.
+    beta = 1.0				# Initial scale parameter.
+    betamin = 1.0e-15 			# Lower bound on scale.
+    betamax = 1.0e100			# Upper bound on scale.
+    status = "Not converged"
+
+    flog = [fold]
+
+    iteration = 0
+
+    # Main optimization loop.
+    while iteration < maxiters:
+
+        # Calculate first and second directional derivatives.
+        if success:
+            mu = np.dot(d, gradnew)
+            if mu >= 0:
+                d = -gradnew
+                mu = np.dot(d, gradnew)
+            kappa = np.dot(d, d)
+            sigma = sigma0/np.sqrt(kappa)
+            xplus = x + sigma*d
+            gplus = gradf(xplus, *optargs)
+            theta = np.dot(d, (gplus - gradnew))/sigma
+
+        # Increase effective curvature and evaluate step size alpha.
+        delta = theta + beta*kappa
+        if delta <= 0:
+            delta = beta*kappa
+            beta = beta - theta/kappa
+
+        alpha = - mu/delta
+
+        # Calculate the comparison ratio.
+        xnew = x + alpha*d
+        fnew = f(xnew, *optargs)
+        function_eval += 1
+
+        if function_eval >= max_f_eval:
+            status = "Maximum number of function evaluations exceeded"
+            return x, flog, function_eval, status
+
+        Delta = 2.*(fnew - fold)/(alpha*mu)
+        if Delta  >= 0.:
+            success = True
+            nsuccess += 1
+            x = xnew
+            fnow = fnew
+        else:
+            success = False
+            fnow = fold
+
+        # Store relevant variables
+        flog.append(fnow)		# Current function value
+
+        iteration += 1
+        if display:
+            print 'Iteration:', iteration, ' Objective:', fnow, '  Scale:', beta
+
+        if success:
+            # Test for termination
+            if np.max(np.abs(alpha*d)) < xtol or np.max(np.abs(fnew-fold)) < ftol:
+                return x, flog, function_eval, status
+
+            else:
+                # Update variables for new position
+                fold = fnew
+                gradold = gradnew
+                gradnew = gradf(x, *optargs)
+                # If the gradient is zero then we are done.
+                if np.dot(gradnew,gradnew) == 0:
+                    return x, flog, function_eval, status
+
+        # Adjust beta according to comparison ratio.
+        if Delta < 0.25:
+            beta = min(4.0*beta, betamax)
+        if Delta > 0.75:
+            beta = max(0.5*beta, betamin)
+
+        # Update search direction using Polak-Ribiere formula, or re-start
+        # in direction of negative gradient after nparams steps.
+        if nsuccess == x.size:
+            d = -gradnew
+            nsuccess = 0
+        elif success:
+            gamma = np.dot(gradold - gradnew,gradnew)/(mu)
+            d = gamma*d - gradnew
+
+    # If we get here, then we haven't terminated in the given number of
+    # iterations.
+    status = "maxiter exceeded"
+
+    return x, flog, function_eval, status
--- a/GPy/inference/optimization.py
+++ b/GPy/inference/optimization.py
@ -1,18 +1,18 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

-
+import pdb
+import pylab as pb
+import datetime as dt
 from scipy import optimize
+import numpy as np

 try:
    import rasmussens_minimize as rasm
    rasm_available = True
 except ImportError:
    rasm_available = False
-
-import pdb
-import pylab as pb
-import datetime as dt
+from SCG import SCG

 class Optimizer():
    """
@ -29,7 +29,7 @@ class Optimizer():
    :rtype: optimizer object.

    """
-    def __init__(self, x_init, messages=False, model = None, max_f_eval=1e4, ftol=None, gtol=None, xtol=None):
+    def __init__(self, x_init, messages=False, model = None, max_f_eval=1e4, max_iters = 1e3, ftol=None, gtol=None, xtol=None):
        self.opt_name = None
        self.x_init = x_init
        self.messages = messages
@ -38,6 +38,7 @@ class Optimizer():
        self.funct_eval = None
        self.status = None
        self.max_f_eval = int(max_f_eval)
+        self.max_iters = int(max_iters)
        self.trace = None
        self.time = "Not available"
        self.xtol = xtol
@ -63,12 +64,13 @@ class Optimizer():
            pb.xlabel('Iteration')
            pb.ylabel('f(x)')

-    def diagnostics(self):
-        print "Optimizer: \t\t\t\t %s" % self.opt_name
-        print "f(x_opt): \t\t\t\t %.3f" % self.f_opt
-        print "Number of function evaluations: \t %d" % self.funct_eval
-        print "Optimization status: \t\t\t %s" % self.status
-        print "Time elapsed: \t\t\t\t %s" % self.time
+    def __str__(self):
+        diagnostics = "Optimizer: \t\t\t\t %s\n" % self.opt_name
+        diagnostics += "f(x_opt): \t\t\t\t %.3f\n" % self.f_opt
+        diagnostics += "Number of function evaluations: \t %d\n" % self.funct_eval
+        diagnostics += "Optimization status: \t\t\t %s\n" % self.status
+        diagnostics += "Time elapsed: \t\t\t\t %s\n" % self.time
+        return diagnostics

 class opt_tnc(Optimizer):
    def __init__(self, *args, **kwargs):
@ -161,7 +163,6 @@ class opt_simplex(Optimizer):
        self.f_opt = opt_result[1]
        self.funct_eval = opt_result[3]
        self.status = statuses[opt_result[4]]
-
        self.trace = None


@ -196,13 +197,28 @@ class opt_rasm(Optimizer):

        self.trace = opt_result[1]

+class opt_SCG(Optimizer):
+    def __init__(self, *args, **kwargs):
+        Optimizer.__init__(self, *args, **kwargs)
+        self.opt_name = "Scaled Conjugate Gradients"
+
+    def opt(self, f_fp = None, f = None, fp = None):
+        assert not f is None
+        assert not fp is None
+        opt_result = SCG(f,fp,self.x_init, display=self.messages, maxiters=self.max_iters, max_f_eval=self.max_f_eval, xtol=self.xtol, ftol=self.ftol)
+        self.x_opt = opt_result[0]
+        self.trace = opt_result[1]
+        self.f_opt = self.trace[-1]
+        self.funct_eval = opt_result[2]
+        self.status = opt_result[3]
+
 def get_optimizer(f_min):
-    # import rasmussens_minimize as rasm
    from SGD import opt_SGD
-    
+
    optimizers = {'fmin_tnc': opt_tnc,
          'simplex': opt_simplex,
          'lbfgsb': opt_lbfgsb,
+          'scg': opt_SCG,
          'sgd': opt_SGD}

    if rasm_available:
--- a/GPy/kern/Matern32.py
+++ b/GPy/kern/Matern32.py
@ -76,7 +76,7 @@ class Matern32(kernpart):
        """Compute the diagonal of the covariance matrix associated to X."""
        np.add(target,self.variance,target)

-    def dK_dtheta(self,partial,X,X2,target):
+    def dK_dtheta(self,dL_dK,X,X2,target):
        """derivative of the covariance matrix with respect to the parameters."""
        if X2 is None: X2 = X
        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
@ -84,29 +84,29 @@ class Matern32(kernpart):
        invdist = 1./np.where(dist!=0.,dist,np.inf)
        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscale**3
        #dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
-        target[0] += np.sum(dvar*partial)
+        target[0] += np.sum(dvar*dL_dK)
        if self.ARD == True:
            dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
            #dl = self.variance*dvar[:,:,None]*dist2M*invdist[:,:,None]
-            target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
+            target[1:] += (dl*dL_dK[:,:,None]).sum(0).sum(0)
        else:
            dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist)) * dist2M.sum(-1)*invdist
            #dl = self.variance*dvar*dist2M.sum(-1)*invdist
-            target[1] += np.sum(dl*partial)
+            target[1] += np.sum(dl*dL_dK)

-    def dKdiag_dtheta(self,partial,X,target):
+    def dKdiag_dtheta(self,dL_dKdiag,X,target):
        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
-        target[0] += np.sum(partial)
+        target[0] += np.sum(dL_dKdiag)

-    def dK_dX(self,partial,X,X2,target):
+    def dK_dX(self,dL_dK,X,X2,target):
        """derivative of the covariance matrix with respect to X."""
        if X2 is None: X2 = X
        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None]
        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
        dK_dX = - np.transpose(3*self.variance*dist*np.exp(-np.sqrt(3)*dist)*ddist_dX,(1,0,2))
-        target += np.sum(dK_dX*partial.T[:,:,None],0)
+        target += np.sum(dK_dX*dL_dK.T[:,:,None],0)

-    def dKdiag_dX(self,partial,X,target):
+    def dKdiag_dX(self,dL_dKdiag,X,target):
        pass

    def Gram_matrix(self,F,F1,F2,lower,upper):
--- a/GPy/kern/Matern52.py
+++ b/GPy/kern/Matern52.py
@ -74,7 +74,7 @@ class Matern52(kernpart):
        """Compute the diagonal of the covariance matrix associated to X."""
        np.add(target,self.variance,target)

-    def dK_dtheta(self,partial,X,X2,target):
+    def dK_dtheta(self,dL_dK,X,X2,target):
        """derivative of the covariance matrix with respect to the parameters."""
        if X2 is None: X2 = X
        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
@ -82,29 +82,29 @@ class Matern52(kernpart):
        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscale**3
        dvar = (1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist)
        dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
-        target[0] += np.sum(dvar*partial)
+        target[0] += np.sum(dvar*dL_dK)
        if self.ARD:
            dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
            #dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
-            target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
+            target[1:] += (dl*dL_dK[:,:,None]).sum(0).sum(0)
        else:
            dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist)) * dist2M.sum(-1)*invdist
            #dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist)) * dist2M.sum(-1)*invdist
-            target[1] += np.sum(dl*partial)
+            target[1] += np.sum(dl*dL_dK)

-    def dKdiag_dtheta(self,X,target):
+    def dKdiag_dtheta(self,dL_dKdiag,X,target):
        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
-        target[0] += np.sum(partial)
+        target[0] += np.sum(dL_dKdiag)

-    def dK_dX(self,partial,X,X2,target):
+    def dK_dX(self,dL_dK,X,X2,target):
        """derivative of the covariance matrix with respect to X."""
        if X2 is None: X2 = X
        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None]
        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
        dK_dX = -  np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2))
-        target += np.sum(dK_dX*partial.T[:,:,None],0)
+        target += np.sum(dK_dX*dL_dK.T[:,:,None],0)

-    def dKdiag_dX(self,partial,X,target):
+    def dKdiag_dX(self,dL_dKdiag,X,target):
        pass

    def Gram_matrix(self,F,F1,F2,F3,lower,upper):
--- a/GPy/kern/init.py
+++ b/GPy/kern/init.py
@ -2,5 +2,5 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)


-from constructors import rbf, Matern32, Matern52, exponential, linear, white, bias, finite_dimensional, spline, Brownian, rbf_sympy, sympykern, periodic_exponential, periodic_Matern32, periodic_Matern52, product, product_orthogonal
+from constructors import rbf, Matern32, Matern52, exponential, linear, white, bias, finite_dimensional, spline, Brownian, rbf_sympy, sympykern, periodic_exponential, periodic_Matern32, periodic_Matern52, prod, prod_orthogonal, symmetric, coregionalise, rational_quadratic
 from kern import kern
--- a/GPy/kern/bias.py
+++ b/GPy/kern/bias.py
@ -35,16 +35,17 @@ class bias(kernpart):
    def Kdiag(self,X,target):
        target += self.variance

-    def dK_dtheta(self,partial,X,X2,target):
-        target += partial.sum()
+    def dK_dtheta(self,dL_dKdiag,X,X2,target):
+        target += dL_dKdiag.sum()

-    def dKdiag_dtheta(self,partial,X,target):
-        target += partial.sum()

-    def dK_dX(self, partial,X, X2, target):
+    def dKdiag_dtheta(self,dL_dKdiag,X,target):
+        target += dL_dKdiag.sum()
+
+    def dK_dX(self, dL_dK,X, X2, target):
        pass

-    def dKdiag_dX(self,partial,X,target):
+    def dKdiag_dX(self,dL_dKdiag,X,target):
        pass

    #---------------------------------------#
@ -60,30 +61,29 @@ class bias(kernpart):
    def psi2(self, Z, mu, S, target):
        target += self.variance**2

-    def dpsi0_dtheta(self, partial, Z, mu, S, target):
-        target += partial.sum()
+    def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S, target):
+        target += dL_dpsi0.sum()

-    def dpsi1_dtheta(self, partial, Z, mu, S, target):
-        target += partial.sum()
+    def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target):
+        target += dL_dpsi1.sum()

-    def dpsi2_dtheta(self, partial, Z, mu, S, target):
-        target += 2.*self.variance*partial.sum()
+    def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target):
+        target += 2.*self.variance*dL_dpsi2.sum()

-    
-    def dpsi0_dZ(self, partial, Z, mu, S, target):
+    def dpsi0_dZ(self, dL_dpsi0, Z, mu, S, target):
        pass

-    def dpsi0_dmuS(self, partial, Z, mu, S, target_mu, target_S):
+    def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S, target_mu, target_S):
        pass

-    def dpsi1_dZ(self, partial, Z, mu, S, target):
+    def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target):
        pass

-    def dpsi1_dmuS(self, partial, Z, mu, S, target_mu, target_S):
+    def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S):
        pass

-    def dpsi2_dZ(self, partial, Z, mu, S, target):
+    def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target):
        pass

-    def dpsi2_dmuS(self, partial, Z, mu, S, target_mu, target_S):
+    def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
        pass
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@ -18,8 +18,11 @@ from Brownian import Brownian as Brownianpart
 from periodic_exponential import periodic_exponential as periodic_exponentialpart
 from periodic_Matern32 import periodic_Matern32 as periodic_Matern32part
 from periodic_Matern52 import periodic_Matern52 as periodic_Matern52part
-from product import product as productpart
-from product_orthogonal import product_orthogonal as product_orthogonalpart
+from prod import prod as prodpart
+from prod_orthogonal import prod_orthogonal as prod_orthogonalpart
+from symmetric import symmetric as symmetric_part
+from coregionalise import coregionalise as coregionalise_part
+from rational_quadratic import rational_quadratic as rational_quadraticpart
 #TODO these s=constructors are not as clean as we'd like. Tidy the code up
 #using meta-classes to make the objects construct properly wthout them.

@ -243,7 +246,7 @@ def periodic_Matern52(D,variance=1., lengthscale=None, period=2*np.pi,n_freq=10,
    part = periodic_Matern52part(D,variance, lengthscale, period, n_freq, lower, upper)
    return kern(D, [part])

-def product(k1,k2):
+def prod(k1,k2):
    """
     Construct a product kernel over D from two kernels over D

@ -251,10 +254,10 @@ def product(k1,k2):
    :type k1, k2: kernpart
    :rtype: kernel object
    """
-    part = productpart(k1,k2)
+    part = prodpart(k1,k2)
    return kern(k1.D, [part])

-def product_orthogonal(k1,k2):
+def prod_orthogonal(k1,k2):
    """
     Construct a product kernel over D1 x D2 from a kernel over D1 and another over D2.

@ -262,5 +265,34 @@ def product_orthogonal(k1,k2):
    :type k1, k2: kernpart
    :rtype: kernel object
    """
-    part = product_orthogonalpart(k1,k2)
+    part = prod_orthogonalpart(k1,k2)
    return kern(k1.D+k2.D, [part])
+
+def symmetric(k):
+    """
+    Construct a symmetrical kernel from an existing kernel
+    """
+    k_ = k.copy()
+    k_.parts = [symmetric_part(p) for p in k.parts]
+    return k_
+
+def coregionalise(Nout,R=1, W=None, kappa=None):
+    p = coregionalise_part(Nout,R,W,kappa)
+    return kern(1,[p])
+
+
+def rational_quadratic(D,variance=1., lengthscale=1., power=1.):
+    """
+     Construct rational quadratic kernel.
+
+    :param D: the number of input dimensions
+    :type D: int (D=1 is the only value currently supported)
+    :param variance: the variance :math:`\sigma^2`
+    :type variance: float
+    :param lengthscale: the lengthscale :math:`\ell`
+    :type lengthscale: float
+    :rtype: kern object
+
+    """
+    part = rational_quadraticpart(D,variance, lengthscale, power)
+    return kern(D, [part])
--- a/GPy/kern/coregionalise.py
+++ b/GPy/kern/coregionalise.py
@ -0,0 +1,90 @@
+# Copyright (c) 2012, James Hensman and Ricardo Andrade
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from kernpart import kernpart
+import numpy as np
+from GPy.util.linalg import mdot, pdinv
+import pdb
+
+class coregionalise(kernpart):
+    """
+    Kernel for Intrisec Corregionalization Models
+    """
+    def __init__(self,Nout,R=1, W=None, kappa=None):
+        self.D = 1
+        self.name = 'coregion'
+        self.Nout = Nout
+        self.R = R
+        if W is None:
+            self.W = np.ones((self.Nout,self.R))
+        else:
+            assert W.shape==(self.Nout,self.R)
+            self.W = W
+        if kappa is None:
+            kappa = np.ones(self.Nout)
+        else:
+            assert kappa.shape==(self.Nout,)
+        self.kappa = kappa
+        self.Nparam = self.Nout*(self.R + 1)
+        self._set_params(np.hstack([self.W.flatten(),self.kappa]))
+
+    def _get_params(self):
+        return np.hstack([self.W.flatten(),self.kappa])
+
+    def _set_params(self,x):
+        assert x.size == self.Nparam
+        self.kappa = x[-self.Nout:]
+        self.W = x[:-self.Nout].reshape(self.Nout,self.R)
+        self.B = np.dot(self.W,self.W.T) + np.diag(self.kappa)
+
+    def _get_param_names(self):
+        return sum([['W%i_%i'%(i,j) for j in range(self.R)] for i in range(self.Nout)],[]) + ['kappa_%i'%i for i in range(self.Nout)]
+
+    def K(self,index,index2,target):
+        index = np.asarray(index,dtype=np.int)
+        if index2 is None:
+            index2 = index
+        else:
+            index2 = np.asarray(index2,dtype=np.int)
+        ii,jj = np.meshgrid(index,index2)
+        ii,jj = ii.T, jj.T
+        target += self.B[ii,jj]
+
+    def Kdiag(self,index,target):
+        target += np.diag(self.B)[np.asarray(index,dtype=np.int).flatten()]
+
+    def dK_dtheta(self,dL_dK,index,index2,target):
+        index = np.asarray(index,dtype=np.int)
+        if index2 is None:
+            index2 = index
+        else:
+            index2 = np.asarray(index2,dtype=np.int)
+        ii,jj = np.meshgrid(index,index2)
+        ii,jj = ii.T, jj.T
+
+        dL_dK_small = np.zeros_like(self.B)
+        for i in range(self.Nout):
+            for j in range(self.Nout):
+                tmp = np.sum(dL_dK[(ii==i)*(jj==j)])
+                dL_dK_small[i,j] = tmp
+
+        dkappa = np.diag(dL_dK_small)
+        dL_dK_small += dL_dK_small.T
+        dW = (self.W[:,None,:]*dL_dK_small[:,:,None]).sum(0)
+
+        target += np.hstack([dW.flatten(),dkappa])
+
+    def dKdiag_dtheta(self,dL_dKdiag,index,target):
+        index = np.asarray(index,dtype=np.int).flatten()
+        dL_dKdiag_small = np.zeros(self.Nout)
+        for i in range(self.Nout):
+            dL_dKdiag_small[i] += np.sum(dL_dKdiag[index==i])
+        dW = 2.*self.W*dL_dKdiag_small[:,None]
+        dkappa = dL_dKdiag_small
+        target += np.hstack([dW.flatten(),dkappa])
+
+    def dK_dX(self,dL_dK,X,X2,target):
+        pass
+
+
+
--- a/GPy/kern/exponential.py
+++ b/GPy/kern/exponential.py
@ -74,35 +74,35 @@ class exponential(kernpart):
        """Compute the diagonal of the covariance matrix associated to X."""
        np.add(target,self.variance,target)

-    def dK_dtheta(self,partial,X,X2,target):
+    def dK_dtheta(self,dL_dK,X,X2,target):
        """derivative of the covariance matrix with respect to the parameters."""
        if X2 is None: X2 = X
        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
        invdist = 1./np.where(dist!=0.,dist,np.inf)
        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscale**3
        dvar = np.exp(-dist)
-        target[0] += np.sum(dvar*partial)
+        target[0] += np.sum(dvar*dL_dK)
        if self.ARD == True:
            dl = self.variance*dvar[:,:,None]*dist2M*invdist[:,:,None]
-            target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
+            target[1:] += (dl*dL_dK[:,:,None]).sum(0).sum(0)
        else:
            dl = self.variance*dvar*dist2M.sum(-1)*invdist
-            target[1] += np.sum(dl*partial)
+            target[1] += np.sum(dl*dL_dK)

-    def dKdiag_dtheta(self,partial,X,target):
+    def dKdiag_dtheta(self,dL_dKdiag,X,target):
        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
        #NB: derivative of diagonal elements wrt lengthscale is 0
-        target[0] += np.sum(partial)
+        target[0] += np.sum(dL_dKdiag)

-    def dK_dX(self,partial,X,X2,target):
+    def dK_dX(self,dL_dK,X,X2,target):
        """derivative of the covariance matrix with respect to X."""
        if X2 is None: X2 = X
        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None]
        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
        dK_dX = - np.transpose(self.variance*np.exp(-dist)*ddist_dX,(1,0,2))
-        target += np.sum(dK_dX*partial.T[:,:,None],0)
+        target += np.sum(dK_dX*dL_dK.T[:,:,None],0)

-    def dKdiag_dX(self,partial,X,target):
+    def dKdiag_dX(self,dL_dKdiag,X,target):
        pass

    def Gram_matrix(self,F,F1,lower,upper):
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@ -7,8 +7,8 @@ import pylab as pb
 from ..core.parameterised import parameterised
 from kernpart import kernpart
 import itertools
-from product_orthogonal import product_orthogonal
-from product import product
+from prod_orthogonal import prod_orthogonal
+from prod import prod

 class kern(parameterised):
    def __init__(self,D,parts=[], input_slices=None):
@ -161,7 +161,7 @@ class kern(parameterised):
        K1 = self.copy()
        K2 = other.copy()

-        newkernparts = [product(k1,k2) for k1, k2 in itertools.product(K1.parts,K2.parts)]
+        newkernparts = [prod(k1,k2) for k1, k2 in itertools.product(K1.parts,K2.parts)]

        slices = []
        for sl1, sl2 in itertools.product(K1.input_slices,K2.input_slices):
@ -183,7 +183,7 @@ class kern(parameterised):
        K1 = self.copy()
        K2 = other.copy()

-        newkernparts = [product_orthogonal(k1,k2) for k1, k2 in itertools.product(K1.parts,K2.parts)]
+        newkernparts = [prod_orthogonal(k1,k2) for k1, k2 in itertools.product(K1.parts,K2.parts)]

        slices = []
        for sl1, sl2 in itertools.product(K1.input_slices,K2.input_slices):
@ -237,7 +237,7 @@ class kern(parameterised):
        for i in range(K1.Nparam + K2.Nparam):
            index = np.where(index_param==i)[0]
            if index.size > 1:
-                self.tie_param(index)
+                self.tie_params(index)
        for i in prev_constr_pos:
            self.constrain_positive(np.where(index_param==i)[0])
        for i in prev_constr_neg:
@ -271,10 +271,10 @@ class kern(parameterised):
        [p.K(X[s1,i_s],X2[s2,i_s],target=target[s1,s2]) for p,i_s,s1,s2 in zip(self.parts,self.input_slices,slices1,slices2)]
        return target

-    def dK_dtheta(self,partial,X,X2=None,slices1=None,slices2=None):
+    def dK_dtheta(self,dL_dK,X,X2=None,slices1=None,slices2=None):
        """
-        :param partial: An array of partial derivaties, dL_dK
-        :type partial: Np.ndarray (N x M)
+        :param dL_dK: An array of dL_dK derivaties, dL_dK
+        :type dL_dK: Np.ndarray (N x M)
        :param X: Observed data inputs
        :type X: np.ndarray (N x D)
        :param X2: Observed dara inputs (optional, defaults to X)
@ -288,16 +288,16 @@ class kern(parameterised):
        if X2 is None:
            X2 = X
        target = np.zeros(self.Nparam)
-        [p.dK_dtheta(partial[s1,s2],X[s1,i_s],X2[s2,i_s],target[ps]) for p,i_s,ps,s1,s2 in zip(self.parts, self.input_slices, self.param_slices, slices1, slices2)]
+        [p.dK_dtheta(dL_dK[s1,s2],X[s1,i_s],X2[s2,i_s],target[ps]) for p,i_s,ps,s1,s2 in zip(self.parts, self.input_slices, self.param_slices, slices1, slices2)]

        return self._transform_gradients(target)

-    def dK_dX(self,partial,X,X2=None,slices1=None,slices2=None):
+    def dK_dX(self,dL_dK,X,X2=None,slices1=None,slices2=None):
        if X2 is None:
            X2 = X
        slices1, slices2 = self._process_slices(slices1,slices2)
        target = np.zeros_like(X)
-        [p.dK_dX(partial[s1,s2],X[s1,i_s],X2[s2,i_s],target[s1,i_s]) for p, i_s, s1, s2 in zip(self.parts, self.input_slices, slices1, slices2)]
+        [p.dK_dX(dL_dK[s1,s2],X[s1,i_s],X2[s2,i_s],target[s1,i_s]) for p, i_s, s1, s2 in zip(self.parts, self.input_slices, slices1, slices2)]
        return target

    def Kdiag(self,X,slices=None):
@ -307,20 +307,20 @@ class kern(parameterised):
        [p.Kdiag(X[s,i_s],target=target[s]) for p,i_s,s in zip(self.parts,self.input_slices,slices)]
        return target

-    def dKdiag_dtheta(self,partial,X,slices=None):
+    def dKdiag_dtheta(self,dL_dKdiag,X,slices=None):
        assert X.shape[1]==self.D
-        assert len(partial.shape)==1
-        assert partial.size==X.shape[0]
+        assert len(dL_dKdiag.shape)==1
+        assert dL_dKdiag.size==X.shape[0]
        slices = self._process_slices(slices,False)
        target = np.zeros(self.Nparam)
-        [p.dKdiag_dtheta(partial[s],X[s,i_s],target[ps]) for p,i_s,s,ps in zip(self.parts,self.input_slices,slices,self.param_slices)]
+        [p.dKdiag_dtheta(dL_dKdiag[s],X[s,i_s],target[ps]) for p,i_s,s,ps in zip(self.parts,self.input_slices,slices,self.param_slices)]
        return self._transform_gradients(target)

-    def dKdiag_dX(self, partial, X, slices=None):
+    def dKdiag_dX(self, dL_dKdiag, X, slices=None):
        assert X.shape[1]==self.D
        slices = self._process_slices(slices,False)
        target = np.zeros_like(X)
-        [p.dKdiag_dX(partial[s],X[s,i_s],target[s,i_s]) for p,i_s,s in zip(self.parts,self.input_slices,slices)]
+        [p.dKdiag_dX(dL_dKdiag[s],X[s,i_s],target[s,i_s]) for p,i_s,s in zip(self.parts,self.input_slices,slices)]
        return target

    def psi0(self,Z,mu,S,slices=None):
@ -329,16 +329,16 @@ class kern(parameterised):
        [p.psi0(Z,mu[s],S[s],target[s]) for p,s in zip(self.parts,slices)]
        return target

-    def dpsi0_dtheta(self,partial,Z,mu,S,slices=None):
+    def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,slices=None):
        slices = self._process_slices(slices,False)
        target = np.zeros(self.Nparam)
-        [p.dpsi0_dtheta(partial[s],Z,mu[s],S[s],target[ps]) for p,ps,s in zip(self.parts, self.param_slices,slices)]
+        [p.dpsi0_dtheta(dL_dpsi0[s],Z,mu[s],S[s],target[ps]) for p,ps,s in zip(self.parts, self.param_slices,slices)]
        return self._transform_gradients(target)

-    def dpsi0_dmuS(self,partial,Z,mu,S,slices=None):
+    def dpsi0_dmuS(self,dL_dpsi0,Z,mu,S,slices=None):
        slices = self._process_slices(slices,False)
        target_mu,target_S = np.zeros_like(mu),np.zeros_like(S)
-        [p.dpsi0_dmuS(partial,Z,mu[s],S[s],target_mu[s],target_S[s]) for p,s in zip(self.parts,slices)]
+        [p.dpsi0_dmuS(dL_dpsi0,Z,mu[s],S[s],target_mu[s],target_S[s]) for p,s in zip(self.parts,slices)]
        return target_mu,target_S

    def psi1(self,Z,mu,S,slices1=None,slices2=None):
@ -348,96 +348,163 @@ class kern(parameterised):
        [p.psi1(Z[s2],mu[s1],S[s1],target[s1,s2]) for p,s1,s2 in zip(self.parts,slices1,slices2)]
        return target

-    def dpsi1_dtheta(self,partial,Z,mu,S,slices1=None,slices2=None):
+    def dpsi1_dtheta(self,dL_dpsi1,Z,mu,S,slices1=None,slices2=None):
        """N,M,(Ntheta)"""
        slices1, slices2 = self._process_slices(slices1,slices2)
        target = np.zeros((self.Nparam))
-        [p.dpsi1_dtheta(partial[s2,s1],Z[s2,i_s],mu[s1,i_s],S[s1,i_s],target[ps]) for p,ps,s1,s2,i_s in zip(self.parts, self.param_slices,slices1,slices2,self.input_slices)]
+        [p.dpsi1_dtheta(dL_dpsi1[s2,s1],Z[s2,i_s],mu[s1,i_s],S[s1,i_s],target[ps]) for p,ps,s1,s2,i_s in zip(self.parts, self.param_slices,slices1,slices2,self.input_slices)]
        return self._transform_gradients(target)

-    def dpsi1_dZ(self,partial,Z,mu,S,slices1=None,slices2=None):
+    def dpsi1_dZ(self,dL_dpsi1,Z,mu,S,slices1=None,slices2=None):
        """N,M,Q"""
        slices1, slices2 = self._process_slices(slices1,slices2)
        target = np.zeros_like(Z)
-        [p.dpsi1_dZ(partial[s2,s1],Z[s2,i_s],mu[s1,i_s],S[s1,i_s],target[s2,i_s]) for p,i_s,s1,s2 in zip(self.parts,self.input_slices,slices1,slices2)]
+        [p.dpsi1_dZ(dL_dpsi1[s2,s1],Z[s2,i_s],mu[s1,i_s],S[s1,i_s],target[s2,i_s]) for p,i_s,s1,s2 in zip(self.parts,self.input_slices,slices1,slices2)]
        return target

-    def dpsi1_dmuS(self,partial,Z,mu,S,slices1=None,slices2=None):
+    def dpsi1_dmuS(self,dL_dpsi1,Z,mu,S,slices1=None,slices2=None):
        """return shapes are N,M,Q"""
        slices1, slices2 = self._process_slices(slices1,slices2)
        target_mu, target_S = np.zeros((2,mu.shape[0],mu.shape[1]))
-        [p.dpsi1_dmuS(partial[s2,s1],Z[s2,i_s],mu[s1,i_s],S[s1,i_s],target_mu[s1,i_s],target_S[s1,i_s]) for p,i_s,s1,s2 in zip(self.parts,self.input_slices,slices1,slices2)]
+        [p.dpsi1_dmuS(dL_dpsi1[s2,s1],Z[s2,i_s],mu[s1,i_s],S[s1,i_s],target_mu[s1,i_s],target_S[s1,i_s]) for p,i_s,s1,s2 in zip(self.parts,self.input_slices,slices1,slices2)]
        return target_mu, target_S

    def psi2(self,Z,mu,S,slices1=None,slices2=None):
        """
-        :Z: np.ndarray of inducing inputs (M x Q)
-        : mu, S: np.ndarrays of means and variacnes (each N x Q)
-        :returns psi2: np.ndarray (N,M,M,Q) """
+        :param Z: np.ndarray of inducing inputs (M x Q)
+        :param mu, S: np.ndarrays of means and variances (each N x Q)
+        :returns psi2: np.ndarray (N,M,M)
+        """
        target = np.zeros((mu.shape[0],Z.shape[0],Z.shape[0]))
        slices1, slices2 = self._process_slices(slices1,slices2)
        [p.psi2(Z[s2,i_s],mu[s1,i_s],S[s1,i_s],target[s1,s2,s2]) for p,i_s,s1,s2 in zip(self.parts,self.input_slices,slices1,slices2)]

+        #compute the "cross" terms
+        for p1, p2 in itertools.combinations(self.parts,2):
+            #white doesn;t combine with anything
+            if p1.name=='white' or p2.name=='white':
+                pass
+            #rbf X bias
+            elif p1.name=='bias' and p2.name=='rbf':
+                target += p1.variance*(p2._psi1[:,:,None]+p2._psi1[:,None,:])
+            elif p2.name=='bias' and p1.name=='rbf':
+                target += p2.variance*(p1._psi1[:,:,None]+p1._psi1[:,None,:])
+            #linear X bias
+            elif p1.name=='bias' and p2.name=='linear':
+                tmp = np.zeros((mu.shape[0],Z.shape[0]))
+                p2.psi1(Z,mu,S,tmp)
+                target += p1.variance*(tmp[:,:,None] + tmp[:,None,:])
+            elif p2.name=='bias' and p1.name=='linear':
+                tmp = np.zeros((mu.shape[0],Z.shape[0]))
+                p1.psi1(Z,mu,S,tmp)
+                target += p2.variance*(tmp[:,:,None] + tmp[:,None,:])
+            #rbf X linear
+            elif p1.name=='linear' and p2.name=='rbf':
+                raise NotImplementedError #TODO
+            elif p2.name=='linear' and p1.name=='rbf':
+                raise NotImplementedError #TODO
+            else:
+                raise NotImplementedError, "psi2 cannot be computed for this kernel"
+        return target

-
-        # "crossterms". Here we are recomputing psi1 for white (we don't need to), but it's
-        # not really expensive, since it's just a matrix of zeroes.
-        # psi1_matrices = [np.zeros((mu.shape[0], Z.shape[0])) for p in self.parts]
-        # [p.psi1(Z[s2],mu[s1],S[s1],psi1_target[s1,s2]) for p,s1,s2,psi1_target in zip(self.parts,slices1,slices2, psi1_matrices)]
-
-        crossterms = 0.0
-        # for 3 kernels this returns something like
-        # [(0,1), (0,2), (1,2)]
-        # in theory, we should also account for (1,0), (2,0) and so on, but
-        # the transpose deals exactly with that
-        # for a,b in itertools.combinations(psi1_matrices, 2):
-        #     tmp = np.multiply(a,b)
-        #     crossterms += tmp[:,None,:] + tmp[:, :,None]
-
-        return target + crossterms
-
-    def dpsi2_dtheta(self,partial,partial1,Z,mu,S,slices1=None,slices2=None):
+    def dpsi2_dtheta(self,dL_dpsi2,Z,mu,S,slices1=None,slices2=None):
        """Returns shape (N,M,M,Ntheta)"""
        slices1, slices2 = self._process_slices(slices1,slices2)
        target = np.zeros(self.Nparam)
-        [p.dpsi2_dtheta(partial[s1,s2,s2],Z[s2,i_s],mu[s1,i_s],S[s1,i_s],target[ps]) for p,i_s,s1,s2,ps in zip(self.parts,self.input_slices,slices1,slices2,self.param_slices)]
+        [p.dpsi2_dtheta(dL_dpsi2[s1,s2,s2],Z[s2,i_s],mu[s1,i_s],S[s1,i_s],target[ps]) for p,i_s,s1,s2,ps in zip(self.parts,self.input_slices,slices1,slices2,self.param_slices)]

-        # # "crossterms"
-        # # 1. get all the psi1 statistics
-        # psi1_matrices = [np.zeros((mu.shape[0], Z.shape[0])) for p in self.parts]
-        # [p.psi1(Z[s2],mu[s1],S[s1],psi1_target[s1,s2]) for p,s1,s2,psi1_target in zip(self.parts,slices1,slices2, psi1_matrices)]
+        #compute the "cross" terms
+        #TODO: better looping
+        for i1, i2 in itertools.combinations(range(len(self.parts)),2):
+            p1,p2 = self.parts[i1], self.parts[i2]
+            ipsl1, ipsl2 = self.input_slices[i1], self.input_slices[i2]
+            ps1, ps2 = self.param_slices[i1], self.param_slices[i2]

-        # partial1 = np.ones_like(partial1)
-        # # 2. get all the dpsi1/dtheta gradients
-        # psi1_gradients = [np.zeros(self.Nparam) for p in self.parts]
-        # [p.dpsi1_dtheta(partial1[s2,s1],Z[s2,i_s],mu[s1,i_s],S[s1,i_s],psi1g_target[ps]) for p,ps,s1,s2,i_s,psi1g_target in zip(self.parts, self.param_slices,slices1,slices2,self.input_slices,psi1_gradients)]
+            #white doesn;t combine with anything
+            if p1.name=='white' or p2.name=='white':
+                pass
+            #rbf X bias
+            elif p1.name=='bias' and p2.name=='rbf':
+                p2.dpsi1_dtheta(dL_dpsi2.sum(1)*p1.variance*2.,Z,mu,S,target[ps2])
+                p1.dpsi1_dtheta(dL_dpsi2.sum(1)*p2._psi1*2.,Z,mu,S,target[ps1])
+            elif p2.name=='bias' and p1.name=='rbf':
+                p1.dpsi1_dtheta(dL_dpsi2.sum(1)*p2.variance*2.,Z,mu,S,target[ps1])
+                p2.dpsi1_dtheta(dL_dpsi2.sum(1)*p1._psi1*2.,Z,mu,S,target[ps2])
+            #linear X bias
+            elif p1.name=='bias' and p2.name=='linear':
+                p2.dpsi1_dtheta(dL_dpsi2.sum(1)*p1.variance*2., Z, mu, S, target[ps1])
+            elif p2.name=='bias' and p1.name=='linear':
+                p1.dpsi1_dtheta(dL_dpsi2.sum(1)*p2.variance*2., Z, mu, S, target[ps1])
+            #rbf X linear
+            elif p1.name=='linear' and p2.name=='rbf':
+                raise NotImplementedError #TODO
+            elif p2.name=='linear' and p1.name=='rbf':
+                raise NotImplementedError #TODO
+            else:
+                raise NotImplementedError, "psi2 cannot be computed for this kernel"

-
-        # # 3. multiply them somehow
-        # for a,b in itertools.combinations(range(len(psi1_matrices)), 2):
-
-        #     tmp = (psi1_gradients[a][None, None] * psi1_matrices[b][:,:, None])
-        #     # target += (tmp[None] + tmp[:,None]).sum(0).sum(0).sum(0)
-        #     # gne = (psi1_gradients[a].sum()*psi1_matrices[b].sum())
-        #     # target += gne
-        #     #target += (gne[None] + gne[:, None]).sum(0)
-        #     target += (partial.sum(0)[:,:,None] * (tmp[:, None] + tmp[:,:,None]).sum(0)).sum(0).sum(0)
        return self._transform_gradients(target)

-    def dpsi2_dZ(self,partial,Z,mu,S,slices1=None,slices2=None):
+    def dpsi2_dZ(self,dL_dpsi2,Z,mu,S,slices1=None,slices2=None):
        slices1, slices2 = self._process_slices(slices1,slices2)
        target = np.zeros_like(Z)
-        [p.dpsi2_dZ(partial[s1,s2,s2],Z[s2,i_s],mu[s1,i_s],S[s1,i_s],target[s2,i_s]) for p,i_s,s1,s2 in zip(self.parts,self.input_slices,slices1,slices2)]
+        [p.dpsi2_dZ(dL_dpsi2[s1,s2,s2],Z[s2,i_s],mu[s1,i_s],S[s1,i_s],target[s2,i_s]) for p,i_s,s1,s2 in zip(self.parts,self.input_slices,slices1,slices2)]
+
+        #compute the "cross" terms
+        for p1, p2 in itertools.combinations(self.parts,2):
+            #white doesn;t combine with anything
+            if p1.name=='white' or p2.name=='white':
+                pass
+            #rbf X bias
+            elif p1.name=='bias' and p2.name=='rbf':
+                p2.dpsi1_dX(dL_dpsi2.sum(1).T*p1.variance,Z,mu,S,target)
+            elif p2.name=='bias' and p1.name=='rbf':
+                p1.dpsi1_dZ(dL_dpsi2.sum(1).T*p2.variance,Z,mu,S,target)
+            #linear X bias
+            elif p1.name=='bias' and p2.name=='linear':
+                p2.dpsi1_dZ(dL_dpsi2.sum(1).T*p1.variance, Z, mu, S, target)
+            elif p2.name=='bias' and p1.name=='linear':
+                p1.dpsi1_dZ(dL_dpsi2.sum(1).T*p2.variance, Z, mu, S, target)
+            #rbf X linear
+            elif p1.name=='linear' and p2.name=='rbf':
+                raise NotImplementedError #TODO
+            elif p2.name=='linear' and p1.name=='rbf':
+                raise NotImplementedError #TODO
+            else:
+                raise NotImplementedError, "psi2 cannot be computed for this kernel"
+

        return target

-    def dpsi2_dmuS(self,partial,Z,mu,S,slices1=None,slices2=None):
+    def dpsi2_dmuS(self,dL_dpsi2,Z,mu,S,slices1=None,slices2=None):
        """return shapes are N,M,M,Q"""
        slices1, slices2 = self._process_slices(slices1,slices2)
        target_mu, target_S = np.zeros((2,mu.shape[0],mu.shape[1]))
-        [p.dpsi2_dmuS(partial[s1,s2,s2],Z[s2,i_s],mu[s1,i_s],S[s1,i_s],target_mu[s1,i_s],target_S[s1,i_s]) for p,i_s,s1,s2 in zip(self.parts,self.input_slices,slices1,slices2)]
+        [p.dpsi2_dmuS(dL_dpsi2[s1,s2,s2],Z[s2,i_s],mu[s1,i_s],S[s1,i_s],target_mu[s1,i_s],target_S[s1,i_s]) for p,i_s,s1,s2 in zip(self.parts,self.input_slices,slices1,slices2)]
+
+        #compute the "cross" terms
+        for p1, p2 in itertools.combinations(self.parts,2):
+            #white doesn;t combine with anything
+            if p1.name=='white' or p2.name=='white':
+                pass
+            #rbf X bias
+            elif p1.name=='bias' and p2.name=='rbf':
+                p2.dpsi1_dmuS(dL_dpsi2.sum(1).T*p1.variance*2.,Z,mu,S,target_mu,target_S)
+            elif p2.name=='bias' and p1.name=='rbf':
+                p1.dpsi1_dmuS(dL_dpsi2.sum(1).T*p2.variance*2.,Z,mu,S,target_mu,target_S)
+            #linear X bias
+            elif p1.name=='bias' and p2.name=='linear':
+                p2.dpsi1_dmuS(dL_dpsi2.sum(1).T*p1.variance*2., Z, mu, S, target_mu, target_S)
+            elif p2.name=='bias' and p1.name=='linear':
+                p1.dpsi1_dmuS(dL_dpsi2.sum(1).T*p2.variance*2., Z, mu, S, target_mu, target_S)
+            #rbf X linear
+            elif p1.name=='linear' and p2.name=='rbf':
+                raise NotImplementedError #TODO
+            elif p2.name=='linear' and p1.name=='rbf':
+                raise NotImplementedError #TODO
+            else:
+                raise NotImplementedError, "psi2 cannot be computed for this kernel"

-        #TODO: there are some extra terms to compute here!
        return target_mu, target_S

    def plot(self, x = None, plot_limits=None,which_functions='all',resolution=None,*args,**kwargs):
--- a/GPy/kern/kernpart.py
+++ b/GPy/kern/kernpart.py
@ -26,31 +26,31 @@ class kernpart(object):
        raise NotImplementedError
    def Kdiag(self,X,target):
        raise NotImplementedError
-    def dK_dtheta(self,partial,X,X2,target):
+    def dK_dtheta(self,dL_dK,X,X2,target):
        raise NotImplementedError
-    def dKdiag_dtheta(self,partial,X,target):
+    def dKdiag_dtheta(self,dL_dKdiag,X,target):
        raise NotImplementedError
    def psi0(self,Z,mu,S,target):
        raise NotImplementedError
-    def dpsi0_dtheta(self,partial,Z,mu,S,target):
+    def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target):
        raise NotImplementedError
-    def dpsi0_dmuS(self,partial,Z,mu,S,target_mu,target_S):
+    def dpsi0_dmuS(self,dL_dpsi0,Z,mu,S,target_mu,target_S):
        raise NotImplementedError
    def psi1(self,Z,mu,S,target):
        raise NotImplementedError
    def dpsi1_dtheta(self,Z,mu,S,target):
        raise NotImplementedError
-    def dpsi1_dZ(self,partial,Z,mu,S,target):
+    def dpsi1_dZ(self,dL_dpsi1,Z,mu,S,target):
        raise NotImplementedError
-    def dpsi1_dmuS(self,partial,Z,mu,S,target_mu,target_S):
+    def dpsi1_dmuS(self,dL_dpsi1,Z,mu,S,target_mu,target_S):
        raise NotImplementedError
    def psi2(self,Z,mu,S,target):
        raise NotImplementedError
-    def dpsi2_dZ(self,partial,Z,mu,S,target):
+    def dpsi2_dZ(self,dL_dpsi2,Z,mu,S,target):
        raise NotImplementedError
-    def dpsi2_dtheta(self,partial,Z,mu,S,target):
+    def dpsi2_dtheta(self,dL_dpsi2,Z,mu,S,target):
        raise NotImplementedError
-    def dpsi2_dmuS(self,partial,Z,mu,S,target_mu,target_S):
+    def dpsi2_dmuS(self,dL_dpsi2,Z,mu,S,target_mu,target_S):
        raise NotImplementedError
    def dK_dX(self,X,X2,target):
        raise NotImplementedError
--- a/GPy/kern/linear.py
+++ b/GPy/kern/linear.py
@ -74,16 +74,23 @@ class linear(kernpart):
    def Kdiag(self,X,target):
        np.add(target,np.sum(self.variances*np.square(X),-1),target)

-    def dK_dtheta(self,partial,X,X2,target):
+    def dK_dtheta(self,dL_dK,X,X2,target):
        if self.ARD:
            product = X[:,None,:]*X2[None,:,:]
-            target += (partial[:,:,None]*product).sum(0).sum(0)
+            target += (dL_dK[:,:,None]*product).sum(0).sum(0)
        else:
            self._K_computations(X, X2)
-            target += np.sum(self._dot_product*partial)
+            target += np.sum(self._dot_product*dL_dK)

-    def dK_dX(self,partial,X,X2,target):
-        target += (((X2[:, None, :] * self.variances)) * partial[:,:, None]).sum(0)
+    def dKdiag_dtheta(self,dL_dKdiag, X, target):
+        tmp = dL_dKdiag[:,None]*X**2
+        if self.ARD:
+            target += tmp.sum(0)
+        else:
+            target += tmp.sum()
+
+    def dK_dX(self,dL_dK,X,X2,target):
+        target += (((X2[:, None, :] * self.variances)) * dL_dK[:,:, None]).sum(0)

    #---------------------------------------#
    #             PSI statistics            #
@ -93,33 +100,33 @@ class linear(kernpart):
        self._psi_computations(Z,mu,S)
        target += np.sum(self.variances*self.mu2_S,1)

-    def dpsi0_dtheta(self,partial,Z,mu,S,target):
+    def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target):
        self._psi_computations(Z,mu,S)
-        tmp = partial[:, None] * self.mu2_S
+        tmp = dL_dpsi0[:, None] * self.mu2_S
        if self.ARD:
            target += tmp.sum(0)
        else:
            target += tmp.sum()

-    def dpsi0_dmuS(self,partial, Z,mu,S,target_mu,target_S):
-        target_mu += partial[:, None] * (2.0*mu*self.variances)
-        target_S += partial[:, None] * self.variances
+    def dpsi0_dmuS(self,dL_dpsi0, Z,mu,S,target_mu,target_S):
+        target_mu += dL_dpsi0[:, None] * (2.0*mu*self.variances)
+        target_S += dL_dpsi0[:, None] * self.variances

    def psi1(self,Z,mu,S,target):
        """the variance, it does nothing"""
        self.K(mu,Z,target)

-    def dpsi1_dtheta(self,partial,Z,mu,S,target):
+    def dpsi1_dtheta(self,dL_dpsi1,Z,mu,S,target):
        """the variance, it does nothing"""
-        self.dK_dtheta(partial,mu,Z,target)
+        self.dK_dtheta(dL_dpsi1,mu,Z,target)

-    def dpsi1_dmuS(self,partial,Z,mu,S,target_mu,target_S):
+    def dpsi1_dmuS(self,dL_dpsi1,Z,mu,S,target_mu,target_S):
        """Do nothing for S, it does not affect psi1"""
        self._psi_computations(Z,mu,S)
-        target_mu += (partial.T[:,:, None]*(Z*self.variances)).sum(1)
+        target_mu += (dL_dpsi1.T[:,:, None]*(Z*self.variances)).sum(1)

-    def dpsi1_dZ(self,partial,Z,mu,S,target):
-        self.dK_dX(partial.T,Z,mu,target)
+    def dpsi1_dZ(self,dL_dpsi1,Z,mu,S,target):
+        self.dK_dX(dL_dpsi1.T,Z,mu,target)

    def psi2(self,Z,mu,S,target):
        """
@ -128,26 +135,27 @@ class linear(kernpart):
        self._psi_computations(Z,mu,S)
        psi2 = self.ZZ*np.square(self.variances)*self.mu2_S[:, None, None, :]
        target += psi2.sum(-1)
+        #TODO: this could be faster using np.tensordot

-    def dpsi2_dtheta(self,partial,Z,mu,S,target):
+    def dpsi2_dtheta(self,dL_dpsi2,Z,mu,S,target):
        self._psi_computations(Z,mu,S)
-        tmp = (partial[:,:,:,None]*(2.*self.ZZ*self.mu2_S[:,None,None,:]*self.variances))
+        tmp = (dL_dpsi2[:,:,:,None]*(2.*self.ZZ*self.mu2_S[:,None,None,:]*self.variances))
        if self.ARD:
            target += tmp.sum(0).sum(0).sum(0)
        else:
            target += tmp.sum()

-    def dpsi2_dmuS(self,partial,Z,mu,S,target_mu,target_S):
+    def dpsi2_dmuS(self,dL_dpsi2,Z,mu,S,target_mu,target_S):
        """Think N,M,M,Q """
        self._psi_computations(Z,mu,S)
        tmp = self.ZZ*np.square(self.variances) # M,M,Q
-        target_mu += (partial[:,:,:,None]*tmp*2.*mu[:,None,None,:]).sum(1).sum(1)
-        target_S += (partial[:,:,:,None]*tmp).sum(1).sum(1)
+        target_mu += (dL_dpsi2[:,:,:,None]*tmp*2.*mu[:,None,None,:]).sum(1).sum(1)
+        target_S += (dL_dpsi2[:,:,:,None]*tmp).sum(1).sum(1)

-    def dpsi2_dZ(self,partial,Z,mu,S,target):
+    def dpsi2_dZ(self,dL_dpsi2,Z,mu,S,target):
        self._psi_computations(Z,mu,S)
        mu2_S = np.sum(self.mu2_S,0)# Q,
-        target += (partial[:,:,:,None] * (self.mu2_S[:,None,None,:]*(Z*np.square(self.variances)[None,:])[None,None,:,:])).sum(0).sum(1)
+        target += (dL_dpsi2[:,:,:,None] * (self.mu2_S[:,None,None,:]*(Z*np.square(self.variances)[None,:])[None,None,:,:])).sum(0).sum(1)

    #---------------------------------------#
    #            Precomputations            #
--- a/GPy/kern/periodic_Matern32.py
+++ b/GPy/kern/periodic_Matern32.py
@ -1,6 +1,11 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
 from kernpart import kernpart
 import numpy as np
 from GPy.util.linalg import mdot, pdinv
+from GPy.util.decorators import silence_errors

 class periodic_Matern32(kernpart):
    """
@ -39,12 +44,16 @@ class periodic_Matern32(kernpart):
        def f(x):
            return alpha*np.cos(omega*x+phase)
        return f
+
+    @silence_errors
    def _cos_factorization(self,alpha,omega,phase):
        r1 = np.sum(alpha*np.cos(phase),axis=1)[:,None]
        r2 = np.sum(alpha*np.sin(phase),axis=1)[:,None]
        r =  np.sqrt(r1**2 + r2**2)
        psi = np.where(r1 != 0, (np.arctan(r2/r1) + (r1<0.)*np.pi),np.arcsin(r2))
        return r,omega[:,0:1], psi
+
+    @silence_errors
    def _int_computation(self,r1,omega1,phi1,r2,omega2,phi2):
        Gint1 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + 1./(omega1-omega2.T)*( np.sin((omega1-omega2.T)*self.upper+phi1-phi2.T) - np.sin((omega1-omega2.T)*self.lower+phi1-phi2.T) )
        Gint2 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) +  np.cos(phi1-phi2.T)*(self.upper-self.lower)
@ -55,6 +64,7 @@ class periodic_Matern32(kernpart):
    def _get_params(self):
        """return the value of the parameters."""
        return np.hstack((self.variance,self.lengthscale,self.period))
+    
    def _set_params(self,x):
        """set the value of the parameters."""
        assert x.size==3
@ -101,7 +111,8 @@ class periodic_Matern32(kernpart):
        FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
        np.add(target,np.diag(mdot(FX,self.Gi,FX.T)),target)

-    def dK_dtheta(self,partial,X,X2,target):
+    @silence_errors
+    def dK_dtheta(self,dL_dK,X,X2,target):
        """derivative of the covariance matrix with respect to the parameters (shape is NxMxNparam)"""
        if X2 is None: X2 = X
        FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
@ -166,8 +177,72 @@ class periodic_Matern32(kernpart):
        dK_dper = mdot(dFX_dper,self.Gi,FX2.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX2.T) + mdot(FX,self.Gi,dFX2_dper.T)

        # np.add(target[:,:,0],dK_dvar, target[:,:,0])
-        target[0] += np.sum(dK_dvar*partial)
+        target[0] += np.sum(dK_dvar*dL_dK)
        #np.add(target[:,:,1],dK_dlen, target[:,:,1])
-        target[1] += np.sum(dK_dlen*partial)
+        target[1] += np.sum(dK_dlen*dL_dK)
        #np.add(target[:,:,2],dK_dper, target[:,:,2])
-        target[2] += np.sum(dK_dper*partial)
+        target[2] += np.sum(dK_dper*dL_dK)
+
+    @silence_errors
+    def dKdiag_dtheta(self,dL_dKdiag,X,target):
+        """derivative of the diagonal covariance matrix with respect to the parameters"""
+        FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
+
+        La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega, self.a[2]*self.basis_omega**2))
+        Lo = np.column_stack((self.basis_omega,self.basis_omega,self.basis_omega))
+        Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2,self.basis_phi+np.pi))
+        r,omega,phi =  self._cos_factorization(La,Lo,Lp)
+        Gint = self._int_computation( r,omega,phi, r,omega,phi)
+
+        Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
+        F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
+
+        #dK_dvar
+        dK_dvar = 1./self.variance*mdot(FX,self.Gi,FX.T)
+
+        #dK_dlen
+        da_dlen = [-6/self.lengthscale**3,-2*np.sqrt(3)/self.lengthscale**2,0.]
+        db_dlen = [0.,2*self.lengthscale/3.]
+        dLa_dlen =  np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)),da_dlen[1]*self.basis_omega,da_dlen[2]*self.basis_omega**2))
+        r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
+        dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
+        dGint_dlen = dGint_dlen + dGint_dlen.T
+        dG_dlen = self.lengthscale**2/(4*np.sqrt(3))*Gint + self.lengthscale**3/(12*np.sqrt(3))*dGint_dlen + db_dlen[0]*np.dot(Flower,Flower.T) + db_dlen[1]*np.dot(F1lower,F1lower.T)
+        dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX.T)
+
+        #dK_dper
+        dFX_dper  = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
+
+        dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period, -self.a[2]*self.basis_omega**3/self.period))
+        dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi,self.basis_phi+np.pi*3/2))
+        r1,omega1,phi1 =  self._cos_factorization(dLa_dper,Lo,dLp_dper)
+
+        IPPprim1 =  self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2)  +  1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
+        IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2)  +  1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
+        IPPprim2 =  self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2)  + self.upper*np.cos(phi-phi1.T))
+        IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2)  + self.lower*np.cos(phi-phi1.T))
+        IPPprim = np.where(np.isnan(IPPprim1),IPPprim2,IPPprim1)
+
+        IPPint1 =  1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi)  +  1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
+        IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi)  +  1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
+        IPPint2 =  1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi)  + 1./2*self.upper**2*np.cos(phi-phi1.T)
+        IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi)  + 1./2*self.lower**2*np.cos(phi-phi1.T)
+        IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
+
+        dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period, -2*self.a[2]*self.basis_omega**2/self.period))
+        dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi))
+        r2,omega2,phi2 =  self._cos_factorization(dLa_dper2,Lo[:,0:2],dLp_dper2)
+
+        dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) +  self._int_computation(r2,omega2,phi2, r,omega,phi)
+        dGint_dper = dGint_dper + dGint_dper.T
+
+        dFlower_dper  = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
+        dF1lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower)+self._cos(-self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
+
+        dG_dper = 1./self.variance*(self.lengthscale**3/(12*np.sqrt(3))*dGint_dper + self.b[0]*(np.dot(dFlower_dper,Flower.T)+np.dot(Flower,dFlower_dper.T)) + self.b[1]*(np.dot(dF1lower_dper,F1lower.T)+np.dot(F1lower,dF1lower_dper.T)))
+
+        dK_dper = 2* mdot(dFX_dper,self.Gi,FX.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX.T)
+
+        target[0] += np.sum(np.diag(dK_dvar)*dL_dKdiag)
+        target[1] += np.sum(np.diag(dK_dlen)*dL_dKdiag)
+        target[2] += np.sum(np.diag(dK_dper)*dL_dKdiag)
--- a/GPy/kern/periodic_Matern52.py
+++ b/GPy/kern/periodic_Matern52.py
@ -1,6 +1,11 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
 from kernpart import kernpart
 import numpy as np
 from GPy.util.linalg import mdot, pdinv
+from GPy.util.decorators import silence_errors

 class periodic_Matern52(kernpart):
    """
@ -40,13 +45,14 @@ class periodic_Matern52(kernpart):
            return alpha*np.cos(omega*x+phase)
        return f

+    @silence_errors
    def _cos_factorization(self,alpha,omega,phase):
        r1 = np.sum(alpha*np.cos(phase),axis=1)[:,None]
        r2 = np.sum(alpha*np.sin(phase),axis=1)[:,None]
        r =  np.sqrt(r1**2 + r2**2)
        psi = np.where(r1 != 0, (np.arctan(r2/r1) + (r1<0.)*np.pi),np.arcsin(r2))
        return r,omega[:,0:1], psi
-    
+
    def _int_computation(self,r1,omega1,phi1,r2,omega2,phi2):
        Gint1 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + 1./(omega1-omega2.T)*( np.sin((omega1-omega2.T)*self.upper+phi1-phi2.T) - np.sin((omega1-omega2.T)*self.lower+phi1-phi2.T) )
        Gint2 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) +  np.cos(phi1-phi2.T)*(self.upper-self.lower)
@ -57,6 +63,7 @@ class periodic_Matern52(kernpart):
    def _get_params(self):
        """return the value of the parameters."""
        return np.hstack((self.variance,self.lengthscale,self.period))
+    
    def _set_params(self,x):
        """set the value of the parameters."""
        assert x.size==3
@ -105,7 +112,8 @@ class periodic_Matern52(kernpart):
        FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
        np.add(target,np.diag(mdot(FX,self.Gi,FX.T)),target)

-    def dK_dtheta(self,partial,X,X2,target):
+    @silence_errors
+    def dK_dtheta(self,dL_dK,X,X2,target):
        """derivative of the covariance matrix with respect to the parameters (shape is NxMxNparam)"""
        if X2 is None: X2 = X
        FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
@ -178,8 +186,80 @@ class periodic_Matern52(kernpart):
        dK_dper = mdot(dFX_dper,self.Gi,FX2.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX2.T) + mdot(FX,self.Gi,dFX2_dper.T)

        # np.add(target[:,:,0],dK_dvar, target[:,:,0])
-        target[0] += np.sum(dK_dvar*partial)
+        target[0] += np.sum(dK_dvar*dL_dK)
        #np.add(target[:,:,1],dK_dlen, target[:,:,1])
-        target[1] += np.sum(dK_dlen*partial)
+        target[1] += np.sum(dK_dlen*dL_dK)
        #np.add(target[:,:,2],dK_dper, target[:,:,2])
-        target[2] += np.sum(dK_dper*partial)
+        target[2] += np.sum(dK_dper*dL_dK)
+
+    @silence_errors
+    def dKdiag_dtheta(self,dL_dKdiag,X,target):
+        """derivative of the diagonal of the covariance matrix with respect to the parameters"""
+        FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
+
+        La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)), self.a[1]*self.basis_omega, self.a[2]*self.basis_omega**2, self.a[3]*self.basis_omega**3))
+        Lo = np.column_stack((self.basis_omega, self.basis_omega, self.basis_omega, self.basis_omega))
+        Lp = np.column_stack((self.basis_phi, self.basis_phi+np.pi/2, self.basis_phi+np.pi, self.basis_phi+np.pi*3/2))
+        r,omega,phi =  self._cos_factorization(La,Lo,Lp)
+        Gint = self._int_computation( r,omega,phi, r,omega,phi)
+
+        Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
+        F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
+        F2lower = np.array(self._cos(self.basis_alpha*self.basis_omega**2,self.basis_omega,self.basis_phi+np.pi)(self.lower))[:,None]
+
+        #dK_dvar
+        dK_dvar = 1./self.variance*mdot(FX,self.Gi,FX2.T)
+
+        #dK_dlen
+        da_dlen = [-3*self.a[0]/self.lengthscale, -2*self.a[1]/self.lengthscale, -self.a[2]/self.lengthscale, 0.]
+        db_dlen = [0., 4*self.b[1]/self.lengthscale, 2*self.b[2]/self.lengthscale, 2*self.b[3]/self.lengthscale, 2*self.b[4]/self.lengthscale]
+        dLa_dlen =  np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)), da_dlen[1]*self.basis_omega, da_dlen[2]*self.basis_omega**2, da_dlen[3]*self.basis_omega**3))
+        r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
+        dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
+        dGint_dlen = dGint_dlen + dGint_dlen.T
+        dlower_terms_dlen = db_dlen[0]*np.dot(Flower,Flower.T) + db_dlen[1]*np.dot(F2lower,F2lower.T) + db_dlen[2]*np.dot(F1lower,F1lower.T) + db_dlen[3]*np.dot(F2lower,Flower.T) + db_dlen[4]*np.dot(Flower,F2lower.T)
+        dG_dlen = 15*self.lengthscale**4/(400*np.sqrt(5))*Gint + 3*self.lengthscale**5/(400*np.sqrt(5))*dGint_dlen + dlower_terms_dlen
+        dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX.T)
+
+        #dK_dper
+        dFX_dper  = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
+
+        dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period, -self.a[2]*self.basis_omega**3/self.period, -self.a[3]*self.basis_omega**4/self.period))
+        dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi,self.basis_phi+np.pi*3/2,self.basis_phi))
+        r1,omega1,phi1 =  self._cos_factorization(dLa_dper,Lo,dLp_dper)
+
+        IPPprim1 =  self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2)  +  1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
+        IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2)  +  1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
+        IPPprim2 =  self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2)  + self.upper*np.cos(phi-phi1.T))
+        IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2)  + self.lower*np.cos(phi-phi1.T))
+        IPPprim = np.where(np.isnan(IPPprim1),IPPprim2,IPPprim1)
+
+        IPPint1 =  1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi)  +  1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
+        IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi)  +  1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
+        IPPint2 =  1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi)  + .5*self.upper**2*np.cos(phi-phi1.T)
+        IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi)  + .5*self.lower**2*np.cos(phi-phi1.T)
+        IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
+
+        dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period, -2*self.a[2]*self.basis_omega**2/self.period, -3*self.a[3]*self.basis_omega**3/self.period))
+        dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2, self.basis_phi+np.pi, self.basis_phi+np.pi*3/2))
+        r2,omega2,phi2 =  self._cos_factorization(dLa_dper2,Lo[:,0:2],dLp_dper2)
+
+        dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) +  self._int_computation(r2,omega2,phi2, r,omega,phi)
+        dGint_dper = dGint_dper + dGint_dper.T
+
+        dFlower_dper  = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
+        dF1lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower)+self._cos(-self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
+        dF2lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**3/self.period,self.basis_omega,self.basis_phi+np.pi*3/2)(self.lower) + self._cos(-2*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower))[:,None]
+
+        dlower_terms_dper  = self.b[0] * (np.dot(dFlower_dper,Flower.T) + np.dot(Flower.T,dFlower_dper))
+        dlower_terms_dper += self.b[1] * (np.dot(dF2lower_dper,F2lower.T) + np.dot(F2lower,dF2lower_dper.T)) - 4*self.b[1]/self.period*np.dot(F2lower,F2lower.T)
+        dlower_terms_dper += self.b[2] * (np.dot(dF1lower_dper,F1lower.T) + np.dot(F1lower,dF1lower_dper.T)) - 2*self.b[2]/self.period*np.dot(F1lower,F1lower.T)
+        dlower_terms_dper += self.b[3] * (np.dot(dF2lower_dper,Flower.T) + np.dot(F2lower,dFlower_dper.T)) - 2*self.b[3]/self.period*np.dot(F2lower,Flower.T)
+        dlower_terms_dper += self.b[4] * (np.dot(dFlower_dper,F2lower.T) + np.dot(Flower,dF2lower_dper.T)) - 2*self.b[4]/self.period*np.dot(Flower,F2lower.T)
+
+        dG_dper = 1./self.variance*(3*self.lengthscale**5/(400*np.sqrt(5))*dGint_dper + 0.5*dlower_terms_dper)
+        dK_dper = 2*mdot(dFX_dper,self.Gi,FX.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX.T)
+
+        target[0] += np.sum(np.diag(dK_dvar)*dL_dKdiag)
+        target[1] += np.sum(np.diag(dK_dlen)*dL_dKdiag)
+        target[2] += np.sum(np.diag(dK_dper)*dL_dKdiag)
--- a/GPy/kern/periodic_exponential.py
+++ b/GPy/kern/periodic_exponential.py
@ -1,6 +1,11 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
 from kernpart import kernpart
 import numpy as np
 from GPy.util.linalg import mdot, pdinv
+from GPy.util.decorators import silence_errors

 class periodic_exponential(kernpart):
    """
@ -40,6 +45,7 @@ class periodic_exponential(kernpart):
            return alpha*np.cos(omega*x+phase)
        return f

+    @silence_errors
    def _cos_factorization(self,alpha,omega,phase):
        r1 = np.sum(alpha*np.cos(phase),axis=1)[:,None]
        r2 = np.sum(alpha*np.sin(phase),axis=1)[:,None]
@ -57,6 +63,7 @@ class periodic_exponential(kernpart):
    def _get_params(self):
        """return the value of the parameters."""
        return np.hstack((self.variance,self.lengthscale,self.period))
+    
    def _set_params(self,x):
        """set the value of the parameters."""
        assert x.size==3
@ -101,7 +108,8 @@ class periodic_exponential(kernpart):
        FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
        np.add(target,np.diag(mdot(FX,self.Gi,FX.T)),target)

-    def dK_dtheta(self,partial,X,X2,target):
+    @silence_errors
+    def dK_dtheta(self,dL_dK,X,X2,target):
        """derivative of the covariance matrix with respect to the parameters (shape is NxMxNparam)"""
        if X2 is None: X2 = X
        FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
@ -162,9 +170,67 @@ class periodic_exponential(kernpart):

        dK_dper = mdot(dFX_dper,self.Gi,FX2.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX2.T) + mdot(FX,self.Gi,dFX2_dper.T)

-        # np.add(target[:,:,0],dK_dvar, target[:,:,0])
-        target[0] += np.sum(dK_dvar*partial)
-        #np.add(target[:,:,1],dK_dlen, target[:,:,1])
-        target[1] += np.sum(dK_dlen*partial)
-        #np.add(target[:,:,2],dK_dper, target[:,:,2])
-        target[2] += np.sum(dK_dper*partial)
+        target[0] += np.sum(dK_dvar*dL_dK)
+        target[1] += np.sum(dK_dlen*dL_dK)
+        target[2] += np.sum(dK_dper*dL_dK)
+
+    @silence_errors
+    def dKdiag_dtheta(self,dL_dKdiag,X,target):
+        """derivative of the diagonal of the covariance matrix with respect to the parameters"""
+        FX  = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
+
+        La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega))
+        Lo = np.column_stack((self.basis_omega,self.basis_omega))
+        Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2))
+        r,omega,phi =  self._cos_factorization(La,Lo,Lp)
+        Gint = self._int_computation( r,omega,phi, r,omega,phi)
+
+        Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
+
+        #dK_dvar
+        dK_dvar = 1./self.variance*mdot(FX,self.Gi,FX.T)
+
+        #dK_dlen
+        da_dlen = [-1./self.lengthscale**2,0.]
+        dLa_dlen =  np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)),da_dlen[1]*self.basis_omega))
+        r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
+        dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
+        dGint_dlen = dGint_dlen + dGint_dlen.T
+        dG_dlen = 1./2*Gint + self.lengthscale/2*dGint_dlen
+        dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX.T)
+
+        #dK_dper
+        dFX_dper  = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
+
+        dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period))
+        dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi))
+        r1,omega1,phi1 =  self._cos_factorization(dLa_dper,Lo,dLp_dper)
+
+        IPPprim1 =  self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2)  +  1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
+        IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2)  +  1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
+        IPPprim2 =  self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2)  + self.upper*np.cos(phi-phi1.T))
+        IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2)  + self.lower*np.cos(phi-phi1.T))
+        IPPprim = np.where(np.isnan(IPPprim1),IPPprim2,IPPprim1)
+
+        IPPint1 =  1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi)  +  1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
+        IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi)  +  1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
+        IPPint2 =  1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi)  + 1./2*self.upper**2*np.cos(phi-phi1.T)
+        IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi)  + 1./2*self.lower**2*np.cos(phi-phi1.T)
+        IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
+
+        dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period))
+        dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2))
+        r2,omega2,phi2 = dLa_dper2.T,Lo[:,0:1],dLp_dper2.T
+
+        dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) + self._int_computation(r2,omega2,phi2, r,omega,phi)
+        dGint_dper = dGint_dper + dGint_dper.T
+
+        dFlower_dper  = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
+
+        dG_dper = 1./self.variance*(self.lengthscale/2*dGint_dper + self.b[0]*(np.dot(dFlower_dper,Flower.T)+np.dot(Flower,dFlower_dper.T)))
+
+        dK_dper = 2*mdot(dFX_dper,self.Gi,FX.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX.T)
+
+        target[0] += np.sum(np.diag(dK_dvar)*dL_dKdiag)
+        target[1] += np.sum(np.diag(dK_dlen)*dL_dKdiag)
+        target[2] += np.sum(np.diag(dK_dper)*dL_dKdiag)
--- a/GPy/kern/product.py
+++ b/GPy/kern/product.py
@ -6,7 +6,7 @@ import numpy as np
 import hashlib
 #from scipy import integrate # This may not be necessary (Nicolas, 20th Feb)

-class product(kernpart):
+class prod(kernpart):
    """
    Computes the product of 2 kernels that are defined on the same space

@ -55,7 +55,7 @@ class product(kernpart):
        self.k2.Kdiag(X,target2)
        target += target1 * target2

-    def dK_dtheta(self,partial,X,X2,target):
+    def dK_dtheta(self,dL_dK,X,X2,target):
        """derivative of the covariance matrix with respect to the parameters."""
        if X2 is None: X2 = X
        K1 = np.zeros((X.shape[0],X2.shape[0]))
@ -65,13 +65,13 @@ class product(kernpart):

        k1_target = np.zeros(self.k1.Nparam)
        k2_target = np.zeros(self.k2.Nparam)
-        self.k1.dK_dtheta(partial*K2, X, X2, k1_target)
-        self.k2.dK_dtheta(partial*K1, X, X2, k2_target)
+        self.k1.dK_dtheta(dL_dK*K2, X, X2, k1_target)
+        self.k2.dK_dtheta(dL_dK*K1, X, X2, k2_target)

        target[:self.k1.Nparam] += k1_target
        target[self.k1.Nparam:] += k2_target

-    def dK_dX(self,partial,X,X2,target):
+    def dK_dX(self,dL_dK,X,X2,target):
        """derivative of the covariance matrix with respect to X."""
        if X2 is None: X2 = X
        K1 = np.zeros((X.shape[0],X2.shape[0]))
@ -79,19 +79,19 @@ class product(kernpart):
        self.k1.K(X,X2,K1)
        self.k2.K(X,X2,K2)

-        self.k1.dK_dX(partial*K2, X, X2, target)
-        self.k2.dK_dX(partial*K1, X, X2, target)
+        self.k1.dK_dX(dL_dK*K2, X, X2, target)
+        self.k2.dK_dX(dL_dK*K1, X, X2, target)

-    def dKdiag_dX(self,partial,X,target):
+    def dKdiag_dX(self,dL_dKdiag,X,target):
        target1 = np.zeros((X.shape[0],))
        target2 = np.zeros((X.shape[0],))
        self.k1.Kdiag(X,target1)
        self.k2.Kdiag(X,target2)

-        self.k1.dKdiag_dX(partial*target2, X, target)
-        self.k2.dKdiag_dX(partial*target1, X, target)
+        self.k1.dKdiag_dX(dL_dKdiag*target2, X, target)
+        self.k2.dKdiag_dX(dL_dKdiag*target1, X, target)

-    def dKdiag_dtheta(self,partial,X,target):
+    def dKdiag_dtheta(self,dL_dKdiag,X,target):
        """Compute the diagonal of the covariance matrix associated to X."""
        target1 = np.zeros((X.shape[0],))
        target2 = np.zeros((X.shape[0],))
@ -100,8 +100,8 @@ class product(kernpart):

        k1_target = np.zeros(self.k1.Nparam)
        k2_target = np.zeros(self.k2.Nparam)
-        self.k1.dKdiag_dtheta(partial*target2, X, k1_target)
-        self.k2.dKdiag_dtheta(partial*target1, X, k2_target)
+        self.k1.dKdiag_dtheta(dL_dKdiag*target2, X, k1_target)
+        self.k2.dKdiag_dtheta(dL_dKdiag*target1, X, k2_target)

        target[:self.k1.Nparam] += k1_target
        target[self.k1.Nparam:] += k2_target
--- a/GPy/kern/product_orthogonal.py
+++ b/GPy/kern/product_orthogonal.py
@ -6,7 +6,7 @@ import numpy as np
 import hashlib
 #from scipy import integrate # This may not be necessary (Nicolas, 20th Feb)

-class product_orthogonal(kernpart):
+class prod_orthogonal(kernpart):
    """
    Computes the product of 2 kernels

@ -36,41 +36,44 @@ class product_orthogonal(kernpart):
    def _get_param_names(self):
        """return parameter names."""
        return [self.k1.name + '_' + param_name for param_name in self.k1._get_param_names()] + [self.k2.name + '_' + param_name for param_name in self.k2._get_param_names()]
-    
+
    def K(self,X,X2,target):
        """Compute the covariance matrix between X and X2."""
        if X2 is None: X2 = X
-        target1 = np.zeros((X.shape[0],X2.shape[0]))
-        target2 = np.zeros((X.shape[0],X2.shape[0]))
-        self.k1.K(X[:,0:self.k1.D],X2[:,0:self.k1.D],target1)
+        target1 = np.zeros_like(target)
+        target2 = np.zeros_like(target)
+        self.k1.K(X[:,:self.k1.D],X2[:,:self.k1.D],target1)
        self.k2.K(X[:,self.k1.D:],X2[:,self.k1.D:],target2)
        target += target1 * target2

-    def Kdiag(self,X,target):
-        """Compute the diagonal of the covariance matrix associated to X."""
-        target1 = np.zeros((X.shape[0],))
-        target2 = np.zeros((X.shape[0],))
-        self.k1.Kdiag(X[:,0:self.k1.D],target1)
-        self.k2.Kdiag(X[:,self.k1.D:],target2)
-        target += target1 * target2
-
-    def dK_dtheta(self,partial,X,X2,target):
+    def dK_dtheta(self,dL_dK,X,X2,target):
        """derivative of the covariance matrix with respect to the parameters."""
        if X2 is None: X2 = X
        K1 = np.zeros((X.shape[0],X2.shape[0]))
        K2 = np.zeros((X.shape[0],X2.shape[0]))
-        self.k1.K(X[:,0:self.k1.D],X2[:,0:self.k1.D],K1)
+        self.k1.K(X[:,:self.k1.D],X2[:,:self.k1.D],K1)
        self.k2.K(X[:,self.k1.D:],X2[:,self.k1.D:],K2)

-        k1_target = np.zeros(self.k1.Nparam)
-        k2_target = np.zeros(self.k2.Nparam)
-        self.k1.dK_dtheta(partial*K2, X[:,:self.k1.D], X2[:,:self.k1.D], k1_target)
-        self.k2.dK_dtheta(partial*K1, X[:,self.k1.D:], X2[:,self.k1.D:], k2_target)
+        self.k1.dK_dtheta(dL_dK*K2, X[:,:self.k1.D], X2[:,:self.k1.D], target[:self.k1.Nparam])
+        self.k2.dK_dtheta(dL_dK*K1, X[:,self.k1.D:], X2[:,self.k1.D:], target[self.k1.Nparam:])

-        target[:self.k1.Nparam] += k1_target
-        target[self.k1.Nparam:] += k2_target
+    def Kdiag(self,X,target):
+        """Compute the diagonal of the covariance matrix associated to X."""
+        target1 = np.zeros(X.shape[0])
+        target2 = np.zeros(X.shape[0])
+        self.k1.Kdiag(X[:,:self.k1.D],target1)
+        self.k2.Kdiag(X[:,self.k1.D:],target2)
+        target += target1 * target2

-    def dK_dX(self,partial,X,X2,target):
+    def dKdiag_dtheta(self,dL_dKdiag,X,target):
+        K1 = np.zeros(X.shape[0])
+        K2 = np.zeros(X.shape[0])
+        self.k1.Kdiag(X[:,:self.k1.D],K1)
+        self.k2.Kdiag(X[:,self.k1.D:],K2)
+        self.k1.dKdiag_dtheta(dL_dKdiag*K2,X[:,:self.k1.D],target[:self.k1.Nparam])
+        self.k2.dKdiag_dtheta(dL_dKdiag*K1,X[:,self.k1.D:],target[self.k1.Nparam:])
+
+    def dK_dX(self,dL_dK,X,X2,target):
        """derivative of the covariance matrix with respect to X."""
        if X2 is None: X2 = X
        K1 = np.zeros((X.shape[0],X2.shape[0]))
@ -78,8 +81,15 @@ class product_orthogonal(kernpart):
        self.k1.K(X[:,0:self.k1.D],X2[:,0:self.k1.D],K1)
        self.k2.K(X[:,self.k1.D:],X2[:,self.k1.D:],K2)

-        self.k1.dK_dX(partial*K2, X[:,:self.k1.D], X2[:,:self.k1.D], target)
-        self.k2.dK_dX(partial*K1, X[:,self.k1.D:], X2[:,self.k1.D:], target)
+        self.k1.dK_dX(dL_dK*K2, X[:,:self.k1.D], X2[:,:self.k1.D], target)
+        self.k2.dK_dX(dL_dK*K1, X[:,self.k1.D:], X2[:,self.k1.D:], target)
+
+    def dKdiag_dX(self, dL_dKdiag, X, target):
+        K1 = np.zeros(X.shape[0])
+        K2 = np.zeros(X.shape[0])
+        self.k1.Kdiag(X[:,0:self.k1.D],K1)
+        self.k2.Kdiag(X[:,self.k1.D:],K2)
+
+        self.k1.dK_dX(dL_dKdiag*K2, X[:,:self.k1.D], target)
+        self.k2.dK_dX(dL_dKdiag*K1, X[:,self.k1.D:], target)

-    def dKdiag_dX(self,X,target):
-        pass
--- a/GPy/kern/rational_quadratic.py
+++ b/GPy/kern/rational_quadratic.py
@ -0,0 +1,80 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+from kernpart import kernpart
+import numpy as np
+
+class rational_quadratic(kernpart):
+    """
+    rational quadratic kernel
+
+    .. math::
+
+       k(r) = \sigma^2 \\bigg( 1 + \\frac{r^2}{2 \ell^2} \\bigg)^{- \\alpha} \ \ \ \ \  \\text{ where  } r^2 = (x-y)^2
+
+    :param D: the number of input dimensions
+    :type D: int (D=1 is the only value currently supported)
+    :param variance: the variance :math:`\sigma^2`
+    :type variance: float
+    :param lengthscale: the lengthscale :math:`\ell`
+    :type lengthscale: float
+    :param power: the power :math:`\\alpha`
+    :type power: float
+    :rtype: kernpart object
+
+    """
+    def __init__(self,D,variance=1.,lengthscale=1.,power=1.):
+        assert D == 1, "For this kernel we assume D=1"
+        self.D = D
+        self.Nparam = 3
+        self.name = 'rat_quad'
+        self.variance = variance
+        self.lengthscale = lengthscale
+        self.power = power
+
+    def _get_params(self):
+        return np.hstack((self.variance,self.lengthscale,self.power))
+
+    def _set_params(self,x):
+        self.variance = x[0]
+        self.lengthscale = x[1]
+        self.power = x[2]
+
+    def _get_param_names(self):
+        return ['variance','lengthscale','power']
+
+    def K(self,X,X2,target):
+        if X2 is None: X2 = X
+        dist2 = np.square((X-X2.T)/self.lengthscale)
+        target += self.variance*(1 + dist2/2.)**(-self.power)
+
+    def Kdiag(self,X,target):
+        target += self.variance
+
+    def dK_dtheta(self,dL_dK,X,X2,target):
+        if X2 is None: X2 = X
+        dist2 = np.square((X-X2.T)/self.lengthscale)
+
+        dvar = (1 + dist2/2.)**(-self.power)
+        dl = self.power * self.variance * dist2 * self.lengthscale**(-3) * (1 + dist2/2./self.power)**(-self.power-1)
+        dp = - self.variance * np.log(1 + dist2/2.) * (1 + dist2/2.)**(-self.power)
+
+        target[0] += np.sum(dvar*dL_dK)
+        target[1] += np.sum(dl*dL_dK)
+        target[2] += np.sum(dp*dL_dK)
+
+    def dKdiag_dtheta(self,dL_dKdiag,X,target):
+        target[0] += np.sum(dL_dKdiag)
+        # here self.lengthscale and self.power have no influence on Kdiag so target[1:] are unchanged
+
+    def dK_dX(self,dL_dK,X,X2,target):
+        """derivative of the covariance matrix with respect to X."""
+        if X2 is None: X2 = X
+        dist2 = np.square((X-X2.T)/self.lengthscale)
+
+        dX = -self.variance*self.power * (X-X2.T)/self.lengthscale**2 *  (1 + dist2/2./self.lengthscale)**(-self.power-1)
+        target += np.sum(dL_dK*dX,1)[:,np.newaxis]
+
+    def dKdiag_dX(self,dL_dKdiag,X,target):
+        pass
--- a/GPy/kern/rbf.py
+++ b/GPy/kern/rbf.py
@ -12,7 +12,7 @@ class rbf(kernpart):

    .. math::

-       k(r) = \sigma^2 \exp(- \frac{1}{2}r^2) \ \ \ \ \  \\text{ where  } r^2 = \sum_{i=1}^d \frac{ (x_i-x^\prime_i)^2}{\ell_i^2}}
+       k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \ \ \ \ \  \\text{ where  } r^2 = \sum_{i=1}^d \\frac{ (x_i-x^\prime_i)^2}{\ell_i^2}

    where \ell_i is the lengthscale, \sigma^2 the variance and d the dimensionality of the input.

@ -82,27 +82,27 @@ class rbf(kernpart):
    def Kdiag(self,X,target):
        np.add(target,self.variance,target)

-    def dK_dtheta(self,partial,X,X2,target):
+    def dK_dtheta(self,dL_dK,X,X2,target):
        self._K_computations(X,X2)
-        target[0] += np.sum(self._K_dvar*partial)
+        target[0] += np.sum(self._K_dvar*dL_dK)
        if self.ARD == True:
            dl = self._K_dvar[:,:,None]*self.variance*self._K_dist2/self.lengthscale
-            target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
+            target[1:] += (dl*dL_dK[:,:,None]).sum(0).sum(0)
        else:
-            target[1] += np.sum(self._K_dvar*self.variance*(self._K_dist2.sum(-1))/self.lengthscale*partial)
-        #np.sum(self._K_dvar*self.variance*self._K_dist2/self.lengthscale*partial)
+            target[1] += np.sum(self._K_dvar*self.variance*(self._K_dist2.sum(-1))/self.lengthscale*dL_dK)
+        #np.sum(self._K_dvar*self.variance*self._K_dist2/self.lengthscale*dL_dK)

-    def dKdiag_dtheta(self,partial,X,target):
+    def dKdiag_dtheta(self,dL_dKdiag,X,target):
        #NB: derivative of diagonal elements wrt lengthscale is 0
-        target[0] += np.sum(partial)
+        target[0] += np.sum(dL_dKdiag)

-    def dK_dX(self,partial,X,X2,target):
+    def dK_dX(self,dL_dK,X,X2,target):
        self._K_computations(X,X2)
        _K_dist = X[:,None,:]-X2[None,:,:]
        dK_dX = np.transpose(-self.variance*self._K_dvar[:,:,np.newaxis]*_K_dist/self.lengthscale2,(1,0,2))
-        target += np.sum(dK_dX*partial.T[:,:,None],0)
+        target += np.sum(dK_dX*dL_dK.T[:,:,None],0)

-    def dKdiag_dX(self,partial,X,target):
+    def dKdiag_dX(self,dL_dKdiag,X,target):
        pass


@ -113,69 +113,69 @@ class rbf(kernpart):
    def psi0(self,Z,mu,S,target):
        target += self.variance

-    def dpsi0_dtheta(self,partial,Z,mu,S,target):
-        target[0] += np.sum(partial)
+    def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target):
+        target[0] += np.sum(dL_dpsi0)

-    def dpsi0_dmuS(self,partial,Z,mu,S,target_mu,target_S):
+    def dpsi0_dmuS(self,dL_dpsi0,Z,mu,S,target_mu,target_S):
        pass

    def psi1(self,Z,mu,S,target):
        self._psi_computations(Z,mu,S)
        target += self._psi1

-    def dpsi1_dtheta(self,partial,Z,mu,S,target):
+    def dpsi1_dtheta(self,dL_dpsi1,Z,mu,S,target):
        self._psi_computations(Z,mu,S)
        denom_deriv = S[:,None,:]/(self.lengthscale**3+self.lengthscale*S[:,None,:])
        d_length = self._psi1[:,:,None]*(self.lengthscale*np.square(self._psi1_dist/(self.lengthscale2+S[:,None,:])) + denom_deriv)
-        target[0] += np.sum(partial*self._psi1/self.variance)
-        dpsi1_dlength = d_length*partial[:,:,None]
+        target[0] += np.sum(dL_dpsi1*self._psi1/self.variance)
+        dpsi1_dlength = d_length*dL_dpsi1[:,:,None]
        if not self.ARD:
            target[1] += dpsi1_dlength.sum()
        else:
            target[1:] += dpsi1_dlength.sum(0).sum(0)

-    def dpsi1_dZ(self,partial,Z,mu,S,target):
+    def dpsi1_dZ(self,dL_dpsi1,Z,mu,S,target):
        self._psi_computations(Z,mu,S)
        denominator = (self.lengthscale2*(self._psi1_denom))
        dpsi1_dZ = - self._psi1[:,:,None] * ((self._psi1_dist/denominator))
-        target += np.sum(partial.T[:,:,None] * dpsi1_dZ, 0)
+        target += np.sum(dL_dpsi1.T[:,:,None] * dpsi1_dZ, 0)

-    def dpsi1_dmuS(self,partial,Z,mu,S,target_mu,target_S):
+    def dpsi1_dmuS(self,dL_dpsi1,Z,mu,S,target_mu,target_S):
        self._psi_computations(Z,mu,S)
        tmp = self._psi1[:,:,None]/self.lengthscale2/self._psi1_denom
-        target_mu += np.sum(partial.T[:, :, None]*tmp*self._psi1_dist,1)
-        target_S += np.sum(partial.T[:, :, None]*0.5*tmp*(self._psi1_dist_sq-1),1)
+        target_mu += np.sum(dL_dpsi1.T[:, :, None]*tmp*self._psi1_dist,1)
+        target_S += np.sum(dL_dpsi1.T[:, :, None]*0.5*tmp*(self._psi1_dist_sq-1),1)

    def psi2(self,Z,mu,S,target):
        self._psi_computations(Z,mu,S)
        target += self._psi2

-    def dpsi2_dtheta(self,partial,Z,mu,S,target):
+    def dpsi2_dtheta(self,dL_dpsi2,Z,mu,S,target):
        """Shape N,M,M,Ntheta"""
        self._psi_computations(Z,mu,S)
        d_var = 2.*self._psi2/self.variance
        d_length = self._psi2[:,:,:,None]*(0.5*self._psi2_Zdist_sq*self._psi2_denom + 2.*self._psi2_mudist_sq + 2.*S[:,None,None,:]/self.lengthscale2)/(self.lengthscale*self._psi2_denom)

-        target[0] += np.sum(partial*d_var)
-        dpsi2_dlength = d_length*partial[:,:,:,None]
+        target[0] += np.sum(dL_dpsi2*d_var)
+        dpsi2_dlength = d_length*dL_dpsi2[:,:,:,None]
        if not self.ARD:
            target[1] += dpsi2_dlength.sum()
        else:
            target[1:] += dpsi2_dlength.sum(0).sum(0).sum(0)
-            
-    def dpsi2_dZ(self,partial,Z,mu,S,target):
+
+    def dpsi2_dZ(self,dL_dpsi2,Z,mu,S,target):
        self._psi_computations(Z,mu,S)
        term1 = 0.5*self._psi2_Zdist/self.lengthscale2 # M, M, Q
        term2 = self._psi2_mudist/self._psi2_denom/self.lengthscale2 # N, M, M, Q
        dZ = self._psi2[:,:,:,None] * (term1[None] + term2)
-        target += (partial[:,:,:,None]*dZ).sum(0).sum(0)
+        target += (dL_dpsi2[:,:,:,None]*dZ).sum(0).sum(0)

-    def dpsi2_dmuS(self,partial,Z,mu,S,target_mu,target_S):
+    def dpsi2_dmuS(self,dL_dpsi2,Z,mu,S,target_mu,target_S):
        """Think N,M,M,Q """
        self._psi_computations(Z,mu,S)
        tmp = self._psi2[:,:,:,None]/self.lengthscale2/self._psi2_denom
-        target_mu += (partial[:,:,:,None]*-tmp*2.*self._psi2_mudist).sum(1).sum(1)
-        target_S += (partial[:,:,:,None]*tmp*(2.*self._psi2_mudist_sq-1)).sum(1).sum(1)
+        target_mu += (dL_dpsi2[:,:,:,None]*-tmp*2.*self._psi2_mudist).sum(1).sum(1)
+        target_S += (dL_dpsi2[:,:,:,None]*tmp*(2.*self._psi2_mudist_sq-1)).sum(1).sum(1)


    #---------------------------------------#
--- a/GPy/kern/symmetric.py
+++ b/GPy/kern/symmetric.py
@ -0,0 +1,92 @@
+# Copyright (c) 2012 James Hensman
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from kernpart import kernpart
+import numpy as np
+
+class symmetric(kernpart):
+    """
+    Symmetrical kernels
+
+    :param k: the kernel to symmetrify
+    :type k: kernpart
+    :param transform: the transform to use in symmetrification (allows symmetry on specified axes)
+    :type transform: A numpy array (D x D) specifiying the transform
+    :rtype: kernpart
+
+    """
+    def __init__(self,k,transform=None):
+        if transform is None:
+            transform = np.eye(k.D)*-1.
+        assert transform.shape == (k.D, k.D)
+        self.transform = transform
+        self.D = k.D
+        self.Nparam = k.Nparam
+        self.name = k.name + '_symm'
+        self.k = k
+        self._set_params(k._get_params())
+
+    def _get_params(self):
+        """return the value of the parameters."""
+        return self.k._get_params()
+
+    def _set_params(self,x):
+        """set the value of the parameters."""
+        self.k._set_params(x)
+
+    def _get_param_names(self):
+        """return parameter names."""
+        return self.k._get_param_names()
+
+    def K(self,X,X2,target):
+        """Compute the covariance matrix between X and X2."""
+        AX = np.dot(X,self.transform)
+        if X2 is None:
+            X2 = X
+            AX2 = AX
+        else:
+            AX2 = np.dot(X2, self.transform)
+        self.k.K(X,X2,target)
+        self.k.K(AX,X2,target)
+        self.k.K(X,AX2,target)
+        self.k.K(AX,AX2,target)
+
+    def dK_dtheta(self,dL_dK,X,X2,target):
+        """derivative of the covariance matrix with respect to the parameters."""
+        AX = np.dot(X,self.transform)
+        if X2 is None:
+            X2 = X
+            ZX2 = AX
+        else:
+            AX2 = np.dot(X2, self.transform)
+        self.k.dK_dtheta(dL_dK,X,X2,target)
+        self.k.dK_dtheta(dL_dK,AX,X2,target)
+        self.k.dK_dtheta(dL_dK,X,AX2,target)
+        self.k.dK_dtheta(dL_dK,AX,AX2,target)
+
+
+    def dK_dX(self,dL_dK,X,X2,target):
+        """derivative of the covariance matrix with respect to X."""
+        AX = np.dot(X,self.transform)
+        if X2 is None:
+            X2 = X
+            ZX2 = AX
+        else:
+            AX2 = np.dot(X2, self.transform)
+        self.k.dK_dX(dL_dK, X, X2, target)
+        self.k.dK_dX(dL_dK, AX, X2, target)
+        self.k.dK_dX(dL_dK, X, AX2, target)
+        self.k.dK_dX(dL_dK, AX ,AX2, target)
+
+    def Kdiag(self,X,target):
+        """Compute the diagonal of the covariance matrix associated to X."""
+        foo = np.zeros((X.shape[0],X.shape[0]))
+        self.K(X,X,foo)
+        target += np.diag(foo)
+
+    def dKdiag_dX(self,dL_dKdiag,X,target):
+        raise NotImplementedError
+
+    def dKdiag_dtheta(self,dL_dKdiag,X,target):
+        """Compute the diagonal of the covariance matrix associated to X."""
+        raise NotImplementedError
--- a/GPy/kern/white.py
+++ b/GPy/kern/white.py
@ -37,50 +37,50 @@ class white(kernpart):
    def Kdiag(self,X,target):
        target += self.variance

-    def dK_dtheta(self,partial,X,X2,target):
+    def dK_dtheta(self,dL_dK,X,X2,target):
        if X.shape==X2.shape:
            if np.all(X==X2):
-                target += np.trace(partial)
+                target += np.trace(dL_dK)

-    def dKdiag_dtheta(self,partial,X,target):
-        target += np.sum(partial)
+    def dKdiag_dtheta(self,dL_dKdiag,X,target):
+        target += np.sum(dL_dKdiag)

-    def dK_dX(self,partial,X,X2,target):
+    def dK_dX(self,dL_dK,X,X2,target):
        pass

-    def dKdiag_dX(self,partial,X,target):
+    def dKdiag_dX(self,dL_dKdiag,X,target):
        pass

    def psi0(self,Z,mu,S,target):
        target += self.variance

-    def dpsi0_dtheta(self,partial,Z,mu,S,target):
-        target += partial.sum()
+    def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target):
+        target += dL_dpsi0.sum()

-    def dpsi0_dmuS(self,partial,Z,mu,S,target_mu,target_S):
+    def dpsi0_dmuS(self,dL_dpsi0,Z,mu,S,target_mu,target_S):
        pass

    def psi1(self,Z,mu,S,target):
        pass

-    def dpsi1_dtheta(self,partial,Z,mu,S,target):
+    def dpsi1_dtheta(self,dL_dpsi1,Z,mu,S,target):
        pass

-    def dpsi1_dZ(self,partial,Z,mu,S,target):
+    def dpsi1_dZ(self,dL_dpsi1,Z,mu,S,target):
        pass

-    def dpsi1_dmuS(self,partial,Z,mu,S,target_mu,target_S):
+    def dpsi1_dmuS(self,dL_dpsi1,Z,mu,S,target_mu,target_S):
        pass

    def psi2(self,Z,mu,S,target):
        pass

-    def dpsi2_dZ(self,partial,Z,mu,S,target):
+    def dpsi2_dZ(self,dL_dpsi2,Z,mu,S,target):
        pass

-    def dpsi2_dtheta(self,partial,Z,mu,S,target):
+    def dpsi2_dtheta(self,dL_dpsi2,Z,mu,S,target):
        pass

-    def dpsi2_dmuS(self,partial,Z,mu,S,target_mu,target_S):
+    def dpsi2_dmuS(self,dL_dpsi2,Z,mu,S,target_mu,target_S):
        pass

--- a/GPy/likelihoods/EP.py
+++ b/GPy/likelihoods/EP.py
@ -17,7 +17,7 @@ class EP(likelihood):
        self.epsilon = epsilon
        self.eta, self.delta = power_ep
        self.data = data
-        self.N = self.data.size
+        self.N, self.D = self.data.shape
        self.is_heteroscedastic = True
        self.Nparams = 0

@ -29,7 +29,7 @@ class EP(likelihood):
        #initial values for the GP variables
        self.Y = np.zeros((self.N,1))
        self.covariance_matrix = np.eye(self.N)
-        self.precision = np.ones(self.N)
+        self.precision = np.ones(self.N)[:,None]
        self.Z = 0
        self.YYT = None

@ -54,18 +54,14 @@ class EP(likelihood):

        self.Y =  mu_tilde[:,None]
        self.YYT = np.dot(self.Y,self.Y.T)
-        self.precision = self.tau_tilde
-        self.covariance_matrix = np.diag(1./self.precision)
+        self.covariance_matrix = np.diag(1./self.tau_tilde)
+        self.precision = self.tau_tilde[:,None]

    def fit_full(self,K):
        """
        The expectation-propagation algorithm.
        For nomenclature see Rasmussen & Williams 2006.
        """
-        #Prior distribution parameters: p(f|X) = N(f|0,K)
-
-        self.tau_tilde = np.zeros(self.N)
-        self.v_tilde = np.zeros(self.N)
        #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
        mu = np.zeros(self.N)
        Sigma = K.copy()
@ -114,7 +110,7 @@ class EP(likelihood):
            Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*K
            B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K
            L = jitchol(B)
-            V,info = linalg.flapack.dtrtrs(L,Sroot_tilde_K,lower=1)
+            V,info = linalg.lapack.flapack.dtrtrs(L,Sroot_tilde_K,lower=1)
            Sigma = K - np.dot(V.T,V)
            mu = np.dot(Sigma,self.v_tilde)
            epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
@ -124,13 +120,14 @@ class EP(likelihood):

        return self._compute_GP_variables()

-    def fit_DTC(self, Knn_diag, Kmn, Kmm):
+    #def fit_DTC(self, Knn_diag, Kmn, Kmm):
+    def fit_DTC(self, Kmm, Kmn):
        """
        The expectation-propagation algorithm with sparse pseudo-input.
        For nomenclature see ... 2013.
        """

-        #TODO: this doesn;t work with uncertain inputs!
+        #TODO: this doesn't work with uncertain inputs!

        """
        Prior approximation parameters:
@ -158,12 +155,12 @@ class EP(likelihood):
        sigma_ = 1./tau_
        mu_ = v_/tau_
        """
-        tau_ = np.empty(self.N,dtype=float)
-        v_ = np.empty(self.N,dtype=float)
+        self.tau_ = np.empty(self.N,dtype=float)
+        self.v_ = np.empty(self.N,dtype=float)

        #Initial values - Marginal moments
        z = np.empty(self.N,dtype=float)
-        Z_hat = np.empty(self.N,dtype=float)
+        self.Z_hat = np.empty(self.N,dtype=float)
        phi = np.empty(self.N,dtype=float)
        mu_hat = np.empty(self.N,dtype=float)
        sigma2_hat = np.empty(self.N,dtype=float)
@ -172,49 +169,50 @@ class EP(likelihood):
        epsilon_np1 = 1
        epsilon_np2 = 1
       	self.iterations = 0
-        np1 = [tau_tilde.copy()]
-        np2 = [v_tilde.copy()]
+        np1 = [self.tau_tilde.copy()]
+        np2 = [self.v_tilde.copy()]
        while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
            update_order = np.random.permutation(self.N)
            for i in update_order:
                #Cavity distribution parameters
-                tau_[i] = 1./Sigma_diag[i] - self.eta*tau_tilde[i]
-                v_[i] = mu[i]/Sigma_diag[i] - self.eta*v_tilde[i]
+                self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
+                self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
                #Marginal moments
-                Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(self.data[i],tau_[i],v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(self.data[i],self.tau_[i],self.v_[i])
                #Site parameters update
-                Delta_tau = delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
+                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
-                tau_tilde[i] = tau_tilde[i] + Delta_tau
-                v_tilde[i] = v_tilde[i] + Delta_v
+                self.tau_tilde[i] = self.tau_tilde[i] + Delta_tau
+                self.v_tilde[i] = self.v_tilde[i] + Delta_v
                #Posterior distribution parameters update
                LLT = LLT + np.outer(Kmn[:,i],Kmn[:,i])*Delta_tau
                L = jitchol(LLT)
-                V,info = linalg.flapack.dtrtrs(L,Kmn,lower=1)
+                V,info = linalg.lapack.flapack.dtrtrs(L,Kmn,lower=1)
                Sigma_diag = np.sum(V*V,-2)
                si = np.sum(V.T*V[:,i],-1)
                mu = mu + (Delta_v-Delta_tau*mu[i])*si
                self.iterations += 1
            #Sigma recomputation with Cholesky decompositon
-            LLT0 = LLT0 + np.dot(Kmn*tau_tilde[None,:],Kmn.T)
+            LLT0 = LLT0 + np.dot(Kmn*self.tau_tilde[None,:],Kmn.T)
            L = jitchol(LLT)
-            V,info = linalg.flapack.dtrtrs(L,Kmn,lower=1)
-            V2,info = linalg.flapack.dtrtrs(L.T,V,lower=0)
+            V,info = linalg.lapack.flapack.dtrtrs(L,Kmn,lower=1)
+            V2,info = linalg.lapack.flapack.dtrtrs(L.T,V,lower=0)
            Sigma_diag = np.sum(V*V,-2)
-            Knmv_tilde = np.dot(Kmn,v_tilde)
+            Knmv_tilde = np.dot(Kmn,self.v_tilde)
            mu = np.dot(V2.T,Knmv_tilde)
-            epsilon_np1 = sum((tau_tilde-np1[-1])**2)/self.N
-            epsilon_np2 = sum((v_tilde-np2[-1])**2)/self.N
-            np1.append(tau_tilde.copy())
-            np2.append(v_tilde.copy())
+            epsilon_np1 = sum((self.tau_tilde-np1[-1])**2)/self.N
+            epsilon_np2 = sum((self.v_tilde-np2[-1])**2)/self.N
+            np1.append(self.tau_tilde.copy())
+            np2.append(self.v_tilde.copy())

        self._compute_GP_variables()

-    def fit_FITC(self, Knn_diag, Kmn):
+    def fit_FITC(self, Kmm, Kmn, Knn_diag):
        """
        The expectation-propagation algorithm with sparse pseudo-input.
        For nomenclature see Naish-Guzman and Holden, 2008.
        """
+        M = Kmm.shape[0]

        """
        Prior approximation parameters:
@ -235,7 +233,7 @@ class EP(likelihood):
        mu = w + P*gamma
        """
        self.w = np.zeros(self.N)
-        self.gamma = np.zeros(self.M)
+        self.gamma = np.zeros(M)
        mu = np.zeros(self.N)
        P = P0.copy()
        R = R0.copy()
@ -271,7 +269,7 @@ class EP(likelihood):
                self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
                self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
                #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(data[i],self.tau_[i],self.v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(self.data[i],self.tau_[i],self.v_[i])
                #Site parameters update
                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
@ -281,10 +279,10 @@ class EP(likelihood):
                dtd1 = Delta_tau*Diag[i] + 1.
                dii = Diag[i]
                Diag[i] = dii - (Delta_tau * dii**2.)/dtd1
-                pi_ = P[i,:].reshape(1,self.M)
+                pi_ = P[i,:].reshape(1,M)
                P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_
                Rp_i = np.dot(R,pi_.T)
-                RTR = np.dot(R.T,np.dot(np.eye(self.M) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R))
+                RTR = np.dot(R.T,np.dot(np.eye(M) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R))
                R = jitchol(RTR).T
                self.w[i] = self.w[i] + (Delta_v - Delta_tau*self.w[i])*dii/dtd1
                self.gamma = self.gamma + (Delta_v - Delta_tau*mu[i])*np.dot(RTR,P[i,:].T)
@ -296,8 +294,8 @@ class EP(likelihood):
            Diag = Diag0/(1.+ Diag0 * self.tau_tilde)
            P = (Diag / Diag0)[:,None] * P0
            RPT0 = np.dot(R0,P0.T)
-            L = jitchol(np.eye(self.M) + np.dot(RPT0,(1./Diag0 - Diag/(Diag0**2))[:,None]*RPT0.T))
-            R,info = linalg.flapack.dtrtrs(L,R0,lower=1)
+            L = jitchol(np.eye(M) + np.dot(RPT0,(1./Diag0 - Diag/(Diag0**2))[:,None]*RPT0.T))
+            R,info = linalg.lapack.flapack.dtrtrs(L,R0,lower=1)
            RPT = np.dot(R,P.T)
            Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
            self.w = Diag * self.v_tilde
--- a/GPy/likelihoods/Gaussian.py
+++ b/GPy/likelihoods/Gaussian.py
@ -8,7 +8,7 @@ class Gaussian(likelihood):
        self.Z = 0. # a correction factor which accounts for the approximation made
        N, self.D = data.shape

-        #normalisation
+        #normaliztion
        if normalize:
            self._mean = data.mean(0)[None,:]
            self._std = data.std(0)[None,:]
@ -45,7 +45,7 @@ class Gaussian(likelihood):

    def predictive_values(self,mu,var):
        """
-        Un-normalise the prediction and add the likelihood variance, then return the 5%, 95% interval
+        Un-normalize the prediction and add the likelihood variance, then return the 5%, 95% interval
        """
        mean = mu*self._std + self._mean
        true_var = (var + self._variance)*self._std**2
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@ -37,8 +37,8 @@ class probit(likelihood_function):
        :param tau_i: precision of the cavity distribution (float)
        :param v_i: mean/variance of the cavity distribution (float)
        """
-        # TODO: some version of assert np.sum(np.abs(Y)-1) == 0, "Output values must be either -1 or 1"
-        if data_i == 0: data_i = -1 #NOTE Binary classification works better classes {-1,1}, 1D-plotting works better with classes {0,1}.
+        if data_i == 0: data_i = -1 #NOTE Binary classification algorithm works better with classes {-1,1}, 1D-plotting works better with classes {0,1}.
+        # TODO: some version of assert
        z = data_i*v_i/np.sqrt(tau_i**2 + tau_i)
        Z_hat = stats.norm.cdf(z)
        phi = stats.norm.pdf(z)
--- a/GPy/models/Bayesian_GPLVM.py
+++ b/GPy/models/Bayesian_GPLVM.py
@ -41,7 +41,7 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):

    def _get_param_names(self):
        X_names = sum([['X_%i_%i'%(n,q) for q in range(self.Q)] for n in range(self.N)],[])
-        S_names = sum([['S_%i_%i'%(n,q) for q in range(self.Q)] for n in range(self.N)],[])
+        S_names = sum([['X_variance_%i_%i'%(n,q) for q in range(self.Q)] for n in range(self.N)],[])
        return (X_names + S_names + sparse_GP._get_param_names(self))

    def _get_params(self):
@ -83,3 +83,7 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):

    def _log_likelihood_gradients(self):
        return np.hstack((self.dL_dmuS().flatten(), sparse_GP._log_likelihood_gradients(self)))
+
+    def plot_latent(self, *args, **kwargs):
+        input_1, input_2 = GPLVM.plot_latent(self, *args, **kwargs)
+        pb.plot(self.Z[:, input_1], self.Z[:, input_2], '^w')
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@ -30,7 +30,6 @@ class GP(model):
    .. Note:: Multiple independent outputs are allowed using columns of Y

    """
-    #FIXME normalize vs normalise
    def __init__(self, X, likelihood, kernel, normalize_X=False, Xslices=None):

        # parse arguments
@ -41,7 +40,7 @@ class GP(model):
        assert isinstance(kernel, kern.kern)
        self.kern = kernel

-        #here's some simple normalisation for the inputs
+        #here's some simple normalization for the inputs
        if normalize_X:
            self._Xmean = X.mean(0)[None,:]
            self._Xstd = X.std(0)[None,:]
@ -60,13 +59,19 @@ class GP(model):

        model.__init__(self)

+    def dL_dZ(self):
+        """
+        TODO: one day we might like to learn Z by gradient methods?
+        """
+        return np.zeros_like(self.Z)
+
    def _set_params(self,p):
        self.kern._set_params_transformed(p[:self.kern.Nparam])
        #self.likelihood._set_params(p[self.kern.Nparam:])               # test by Nicolas
        self.likelihood._set_params(p[self.kern.Nparam_transformed():])    # test by Nicolas


-        self.K = self.kern.K(self.X,slices1=self.Xslices)
+        self.K = self.kern.K(self.X,slices1=self.Xslices,slices2=self.Xslices)
        self.K += self.likelihood.covariance_matrix

        self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
@ -123,12 +128,12 @@ class GP(model):

        For the likelihood parameters, pass in alpha = K^-1 y
        """
-        return np.hstack((self.kern.dK_dtheta(partial=self.dL_dK,X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
+        return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK,X=self.X,slices1=self.Xslices,slices2=self.Xslices), self.likelihood._gradients(partial=np.diag(self.dL_dK))))

    def _raw_predict(self,_Xnew,slices=None, full_cov=False):
        """
        Internal helper function for making predictions, does not account
-        for normalisation or likelihood
+        for normalization or likelihood
        """
        Kx = self.kern.K(self.X,_Xnew, slices1=self.Xslices,slices2=slices)
        mu = np.dot(np.dot(Kx.T,self.Ki),self.likelihood.Y)
@ -166,10 +171,10 @@ class GP(model):
             - If a list of booleans, specifying which kernel parts are active

           If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew.
-           This is to allow for different normalisations of the output dimensions.
+           This is to allow for different normalizations of the output dimensions.

        """
-        #normalise X values
+        #normalize X values
        Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
        mu, var = self._raw_predict(Xnew, slices, full_cov)

@ -181,7 +186,7 @@ class GP(model):

    def plot_f(self, samples=0, plot_limits=None, which_data='all', which_functions='all', resolution=None, full_cov=False):
        """
-        Plot the GP's view of the world, where the data is normalised and the likelihood is Gaussian
+        Plot the GP's view of the world, where the data is normalized and the likelihood is Gaussian

        :param samples: the number of a posteriori samples to plot
        :param which_data: which if the training data to plot (default all)
@ -197,7 +202,7 @@ class GP(model):
          - In higher dimensions, we've no implemented this yet !TODO!

        Can plot only part of the data and part of the posterior functions using which_data and which_functions
-        Plot the data's view of the world, with non-normalised values and GP predictions passed through the likelihood
+        Plot the data's view of the world, with non-normalized values and GP predictions passed through the likelihood
        """
        if which_functions=='all':
            which_functions = [True]*self.kern.Nparts
@ -215,7 +220,7 @@ class GP(model):
                Ysim = np.random.multivariate_normal(m.flatten(),v,samples)
                gpplot(Xnew,m,m-2*np.sqrt(np.diag(v)[:,None]),m+2*np.sqrt(np.diag(v))[:,None])
                for i in range(samples):
-                    pb.plot(Xnew,Ysim[i,:],Tango.coloursHex['darkBlue'],linewidth=0.25)
+                    pb.plot(Xnew,Ysim[i,:],Tango.colorsHex['darkBlue'],linewidth=0.25)
            pb.plot(self.X[which_data],self.likelihood.Y[which_data],'kx',mew=1.5)
            pb.xlim(xmin,xmax)
            ymin,ymax = min(np.append(self.likelihood.Y,m-2*np.sqrt(np.diag(v)[:,None]))), max(np.append(self.likelihood.Y,m+2*np.sqrt(np.diag(v)[:,None])))
@ -236,7 +241,12 @@ class GP(model):
        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"

-    def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None,full_cov=False):
+    def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None,levels=20):
+        """
+        TODO: Docstrings!
+        :param levels: for 2D plotting, the number of contour levels to use
+
+        """
        # TODO include samples
        if which_functions=='all':
            which_functions = [True]*self.kern.Nparts
@ -258,6 +268,8 @@ class GP(model):
            if hasattr(self,'Z'):
                Zu = self.Z*self._Xstd + self._Xmean
                pb.plot(Zu,Zu*0+pb.ylim()[0],'r|',mew=1.5,markersize=12)
+                if self.has_uncertain_inputs:
+                    pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten()))

        elif self.X.shape[1]==2: #FIXME
            resolution = resolution or 50
@ -265,10 +277,13 @@ class GP(model):
            x, y = np.linspace(xmin[0],xmax[0],resolution), np.linspace(xmin[1],xmax[1],resolution)
            m, var, lower, upper = self.predict(Xnew, slices=which_functions)
            m = m.reshape(resolution,resolution).T
-            pb.contour(x,y,m,vmin=m.min(),vmax=m.max(),cmap=pb.cm.jet)
+            pb.contour(x,y,m,levels,vmin=m.min(),vmax=m.max(),cmap=pb.cm.jet)
            Yf = self.likelihood.Y.flatten()
            pb.scatter(self.X[:,0], self.X[:,1], 40, Yf, cmap=pb.cm.jet,vmin=m.min(),vmax=m.max(), linewidth=0.)
            pb.xlim(xmin[0],xmax[0])
            pb.ylim(xmin[1],xmax[1])
+            if hasattr(self,'Z'):
+                pb.plot(self.Z[:,0],self.Z[:,1],'wo')
+
        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
--- a/GPy/models/GPLVM.py
+++ b/GPy/models/GPLVM.py
@ -10,6 +10,7 @@ from ..core import model
 from ..util.linalg import pdinv, PCA
 from GP import GP
 from ..likelihoods import Gaussian
+from .. import util

 class GPLVM(GP):
    """
@ -59,5 +60,66 @@ class GPLVM(GP):
        mu, var, upper, lower = self.predict(Xnew)
        pb.plot(mu[:,0], mu[:,1],'k',linewidth=1.5)

-    def plot_latent(self):
-        raise NotImplementedError
+    def plot_latent(self,labels=None, which_indices=None, resolution=50):
+        """
+        :param labels: a np.array of size self.N containing labels for the points (can be number, strings, etc)
+        :param resolution: the resolution of the grid on which to evaluate the predictive variance
+        """
+
+        util.plot.Tango.reset()
+        
+        if labels is None:
+            labels = np.ones(self.N)
+        if which_indices is None:
+            if self.Q==1:
+                input_1 = 0
+                input_2 = None
+            if self.Q==2:
+                input_1, input_2 = 0,1
+            else:
+                #try to find a linear of RBF kern in the kernel
+                k = [p for p in self.kern.parts if p.name in ['rbf','linear']]
+                if (not len(k)==1) or (not k[0].ARD):
+                    raise ValueError, "cannot Atomatically determine which dimensions to plot, please pass 'which_indices'"
+                k = k[0]
+                if k.name=='rbf':
+                    input_1, input_2 = np.argsort(k.lengthscale)[:2]
+                elif k.name=='linear':
+                    input_1, input_2 = np.argsort(k.variances)[::-1][:2]
+
+        #first, plot the output variance as a function of the latent space
+        Xtest, xx,yy,xmin,xmax = util.plot.x_frame2D(self.X[:,[input_1, input_2]],resolution=resolution)
+	Xtest_full = np.zeros((Xtest.shape[0], self.X.shape[1]))
+	Xtest_full[:, :2] = Xtest
+	mu, var, low, up = self.predict(Xtest_full)
+	var = var[:, :2]
+        pb.imshow(var.reshape(resolution,resolution).T[::-1,:],extent=[xmin[0],xmax[0],xmin[1],xmax[1]],cmap=pb.cm.binary,interpolation='bilinear')
+
+
+        for i,ul in enumerate(np.unique(labels)):
+            if type(ul) is np.string_:
+                this_label = ul
+            elif type(ul) is np.int64:
+                this_label = 'class %i'%ul
+            else:
+                this_label = 'class %i'%i
+
+            index = np.nonzero(labels==ul)[0]
+            if self.Q==1:
+                x = self.X[index,input_1]
+                y = np.zeros(index.size)
+            else:
+                x = self.X[index,input_1]
+                y = self.X[index,input_2]
+            pb.plot(x,y,marker='o',color=util.plot.Tango.nextMedium(),mew=0,label=this_label,linewidth=0)
+
+        pb.xlabel('latent dimension %i'%input_1)
+        pb.ylabel('latent dimension %i'%input_2)
+
+        if not np.all(labels==1.):
+            pb.legend(loc=0,numpoints=1)
+
+        pb.xlim(xmin[0],xmax[0])
+        pb.ylim(xmin[1],xmax[1])
+
+        return input_1, input_2
--- a/GPy/models/init.py
+++ b/GPy/models/init.py
@ -10,4 +10,4 @@ from GPLVM import GPLVM
 from warped_GP import warpedGP
 from sparse_GPLVM import sparse_GPLVM
 from uncollapsed_sparse_GP import uncollapsed_sparse_GP
-from BGPLVM import Bayesian_GPLVM
+from Bayesian_GPLVM import Bayesian_GPLVM
--- a/GPy/models/sparse_GP.py
+++ b/GPy/models/sparse_GP.py
@ -3,7 +3,7 @@

 import numpy as np
 import pylab as pb
-from ..util.linalg import mdot, jitchol, chol_inv, pdinv
+from ..util.linalg import mdot, jitchol, chol_inv, pdinv, trace_dot
 from ..util.plot import gpplot
 from .. import kern
 from GP import GP
@ -54,37 +54,44 @@ class sparse_GP(GP):

        GP.__init__(self, X, likelihood, kernel=kernel, normalize_X=normalize_X, Xslices=Xslices)

-        #normalise X uncertainty also
+        #normalize X uncertainty also
        if self.has_uncertain_inputs:
            self.X_uncertainty /= np.square(self._Xstd)

-    def _computations(self):
-        # TODO find routine to multiply triangular matrices
-        #TODO: slices for psi statistics (easy enough)
-
-        sf = self.scale_factor
-        sf2 = sf**2

+    def _compute_kernel_matrices(self):
        # kernel computations, using BGPLVM notation
        self.Kmm = self.kern.K(self.Z)
        if self.has_uncertain_inputs:
            self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty)
            self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
            self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
-            if self.likelihood.is_heteroscedastic:
-                self.psi2_beta_scaled = (self.psi2*(self.likelihood.precision.reshape(self.N,1,1)/sf2)).sum(0)
-                #TODO: what is the likelihood is heterscedatic and there are multiple independent outputs?
-            else:
-                self.psi2_beta_scaled = (self.psi2*(self.likelihood.precision/sf2)).sum(0)
        else:
            self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices)
            self.psi1 = self.kern.K(self.Z,self.X)
-            if self.likelihood.is_heteroscedastic:
-                tmp = self.psi1*(np.sqrt(self.likelihood.precision.reshape(self.N,1))/sf)
+            self.psi2 = None
+
+    def _computations(self):
+        #TODO: find routine to multiply triangular matrices
+        #TODO: slices for psi statistics (easy enough)
+
+        sf = self.scale_factor
+        sf2 = sf**2
+
+        #The rather complex computations of psi2_beta_scaled
+        if self.likelihood.is_heteroscedastic:
+            assert self.likelihood.D == 1 #TODO: what if the likelihood is heterscedatic and there are multiple independent outputs?
+            if self.has_uncertain_inputs:
+                self.psi2_beta_scaled = (self.psi2*(self.likelihood.precision.flatten().reshape(self.N,1,1)/sf2)).sum(0)
+            else:
+                tmp = self.psi1*(np.sqrt(self.likelihood.precision.flatten().reshape(1,self.N))/sf)
+                self.psi2_beta_scaled = np.dot(tmp,tmp.T)
+        else:
+            if self.has_uncertain_inputs:
+                self.psi2_beta_scaled = (self.psi2*(self.likelihood.precision/sf2)).sum(0)
            else:
                tmp = self.psi1*(np.sqrt(self.likelihood.precision)/sf)
-            self.psi2_beta_scaled = np.dot(tmp,tmp.T)
-            self.psi2 = self.psi1.T[:,:,None]*self.psi1.T[:,None,:] # TODO: remove me for efficiency and stability
+                self.psi2_beta_scaled = np.dot(tmp,tmp.T)

        self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)

@ -95,28 +102,44 @@ class sparse_GP(GP):
        self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)

        self.psi1V = np.dot(self.psi1, self.V)
-        self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
-        self.C = mdot(self.Lmi.T, self.Bi, self.Lmi)
-        self.E = mdot(self.C, self.psi1VVpsi1/sf2, self.C.T)
+        tmp = np.dot(self.Lmi.T, self.LBi.T)
+        self.C = np.dot(tmp,tmp.T)
+        self.Cpsi1V = np.dot(self.C,self.psi1V)
+        self.Cpsi1VVpsi1 = np.dot(self.Cpsi1V,self.psi1V.T)
+        self.E = np.dot(self.Cpsi1VVpsi1,self.C)/sf2

-        # Compute dL_dpsi # FIXME: this is untested for the het. case
-        self.dL_dpsi0 = - 0.5 * self.D * self.likelihood.precision * np.ones(self.N)
-        self.dL_dpsi1 = mdot(self.V, self.psi1V.T,self.C).T
+        # Compute dL_dpsi # FIXME: this is untested for the heterscedastic + uncertin inputs case
+        self.dL_dpsi0 = - 0.5 * self.D * (self.likelihood.precision * np.ones([self.N,1])).flatten()
+        #self.dL_dpsi1 = mdot(self.V, self.psi1V.T,self.C).T
+        self.dL_dpsi1 = np.dot(self.Cpsi1V,self.V.T)
        if self.likelihood.is_heteroscedastic:
-            self.dL_dpsi2 = 0.5 * self.likelihood.precision[:,None,None] * self.D * self.Kmmi[None,:,:] # dB
-            self.dL_dpsi2 += - 0.5 * self.likelihood.precision[:,None,None]/sf2 * self.D * self.C[None,:,:] # dC
-            self.dL_dpsi2 += - 0.5 * self.likelihood.precision[:,None,None]* self.E[None,:,:] # dD
+            if self.has_uncertain_inputs:
+                self.dL_dpsi2 = 0.5 * self.likelihood.precision[:,None,None] * self.D * self.Kmmi[None,:,:] # dB
+                self.dL_dpsi2 += - 0.5 * self.likelihood.precision[:,None,None]/sf2 * self.D * self.C[None,:,:] # dC
+                self.dL_dpsi2 += - 0.5 * self.likelihood.precision[:,None,None]* self.E[None,:,:] # dD
+            else:
+                self.dL_dpsi1 += mdot(self.Kmmi,self.psi1*self.likelihood.precision.flatten().reshape(1,self.N)) #dB
+                self.dL_dpsi1 += -mdot(self.C,self.psi1*self.likelihood.precision.flatten().reshape(1,self.N)/sf2) #dC
+                self.dL_dpsi1 += -mdot(self.E,self.psi1*self.likelihood.precision.flatten().reshape(1,self.N)) #dD
+                self.dL_dpsi2 = None
+
        else:
            self.dL_dpsi2 = 0.5 * self.likelihood.precision * self.D * self.Kmmi # dB
            self.dL_dpsi2 += - 0.5 * self.likelihood.precision/sf2 * self.D * self.C # dC
            self.dL_dpsi2 += - 0.5 * self.likelihood.precision * self.E # dD
-            #repeat for each of the N psi_2 matrices
-            self.dL_dpsi2 = np.repeat(self.dL_dpsi2[None,:,:],self.N,axis=0)
+            if self.has_uncertain_inputs:
+                #repeat for each of the N psi_2 matrices
+                self.dL_dpsi2 = np.repeat(self.dL_dpsi2[None,:,:],self.N,axis=0)
+            else:
+                self.dL_dpsi1 += 2.*np.dot(self.dL_dpsi2,self.psi1)
+                self.dL_dpsi2 = None
+

        # Compute dL_dKmm
        self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi)*sf2 # dB
-        self.dL_dKmm += -0.5 * self.D * (- self.C/sf2 - 2.*mdot(self.C, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC
-        self.dL_dKmm +=  np.dot(np.dot(self.E*sf2, self.psi2_beta_scaled) - np.dot(self.C, self.psi1VVpsi1), self.Kmmi) + 0.5*self.E # dD
+        #self.dL_dKmm += -0.5 * self.D * (- self.C/sf2 - 2.*mdot(self.C, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC
+        #self.dL_dKmm +=  np.dot(np.dot(self.E*sf2, self.psi2_beta_scaled) - self.Cpsi1VVpsi1, self.Kmmi) + 0.5*self.E # dD
+        self.dL_dKmm += 0.5*(self.D*(self.C/sf2 -self.Kmmi) + self.E) + np.dot(np.dot(self.D*self.C + self.E*sf2,self.psi2_beta_scaled) - self.Cpsi1VVpsi1,self.Kmmi) # d(C+D)

        #the partial derivative vector for the likelihood
        if self.likelihood.Nparams ==0:
@ -130,18 +153,31 @@ class sparse_GP(GP):
            #self.partial_for_likelihood += -np.diag(np.dot((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) , self.psi1VVpsi1 ))*self.likelihood.precision #dD
        else:
            #likelihood is not heterscedatic
-            beta = self.likelihood.precision
-            dbeta =   0.5 * self.N*self.D/beta - 0.5 * np.sum(np.square(self.likelihood.Y))
-            dbeta += - 0.5 * self.D * (self.psi0.sum() - np.trace(self.A)/beta*sf2)
-            dbeta += - 0.5 * self.D * np.sum(self.Bi*self.A)/beta
-            dbeta += np.sum((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) * self.psi1VVpsi1 )/beta
-            self.partial_for_likelihood = -dbeta*self.likelihood.precision**2
+            self.partial_for_likelihood =   - 0.5 * self.N*self.D*self.likelihood.precision + 0.5 * np.sum(np.square(self.likelihood.Y))*self.likelihood.precision**2
+            self.partial_for_likelihood += 0.5 * self.D * (self.psi0.sum()*self.likelihood.precision**2 - np.trace(self.A)*self.likelihood.precision*sf2)
+            self.partial_for_likelihood += 0.5 * self.D * trace_dot(self.Bi,self.A)*self.likelihood.precision
+            self.partial_for_likelihood += self.likelihood.precision*(0.5*trace_dot(self.psi2_beta_scaled,self.E*sf2) - np.trace(self.Cpsi1VVpsi1))


+
+    def log_likelihood(self):
+        """ Compute the (lower bound on the) log marginal likelihood """
+        sf2 = self.scale_factor**2
+        if self.likelihood.is_heteroscedastic:
+            A = -0.5*self.N*self.D*np.log(2.*np.pi) +0.5*np.sum(np.log(self.likelihood.precision)) -0.5*np.sum(self.V*self.likelihood.Y)
+            B = -0.5*self.D*(np.sum(self.likelihood.precision.flatten()*self.psi0) - np.trace(self.A)*sf2)
+        else:
+            A = -0.5*self.N*self.D*(np.log(2.*np.pi) + np.log(self.likelihood._variance)) -0.5*self.likelihood.precision*self.likelihood.trYYT
+            B = -0.5*self.D*(np.sum(self.likelihood.precision*self.psi0) - np.trace(self.A)*sf2)
+        C = -0.5*self.D * (self.B_logdet + self.M*np.log(sf2))
+        D = 0.5*np.trace(self.Cpsi1VVpsi1)
+        return A+B+C+D
+
    def _set_params(self, p):
        self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
        self.kern._set_params(p[self.Z.size:self.Z.size+self.kern.Nparam])
        self.likelihood._set_params(p[self.Z.size+self.kern.Nparam:])
+        self._compute_kernel_matrices()
        self._computations()

    def _get_params(self):
@ -150,17 +186,20 @@ class sparse_GP(GP):
    def _get_param_names(self):
        return sum([['iip_%i_%i'%(i,j) for j in range(self.Z.shape[1])] for i in range(self.Z.shape[0])],[]) + GP._get_param_names(self)

-    def log_likelihood(self):
-        """ Compute the (lower bound on the) log marginal likelihood """
-        sf2 = self.scale_factor**2
-        if self.likelihood.is_heteroscedastic:
-            A = -0.5*self.N*self.D*np.log(2.*np.pi) +0.5*np.sum(np.log(self.likelihood.precision)) -0.5*np.sum(self.V*self.likelihood.Y)
+    def update_likelihood_approximation(self):
+        """
+        Approximates a non-gaussian likelihood using Expectation Propagation
+
+        For a Gaussian (or direct: TODO) likelihood, no iteration is required:
+        this function does nothing
+        """
+        if self.has_uncertain_inputs:
+            raise NotImplementedError, "EP approximation not implemented for uncertain inputs"
        else:
-            A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.likelihood.precision)) -0.5*self.likelihood.precision*self.likelihood.trYYT
-        B = -0.5*self.D*(np.sum(self.likelihood.precision*self.psi0) - np.trace(self.A)*sf2)
-        C = -0.5*self.D * (self.B_logdet + self.M*np.log(sf2))
-        D = +0.5*np.sum(self.psi1VVpsi1 * self.C)
-        return A+B+C+D
+            self.likelihood.fit_DTC(self.Kmm,self.psi1)
+            #self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0)
+            self._set_params(self._get_params()) # update the GP
+

    def _log_likelihood_gradients(self):
        return np.hstack((self.dL_dZ().flatten(), self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))
@ -173,15 +212,9 @@ class sparse_GP(GP):
        if self.has_uncertain_inputs:
            dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty)
            dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
-            dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.dL_dpsi1.T, self.Z,self.X, self.X_uncertainty)
+            dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z,self.X, self.X_uncertainty)
        else:
-            #re-cast computations in psi2 back to psi1:
-            #dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2.sum(0),self.psi1)
-            if not self.likelihood.is_heteroscedastic:
-                dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2[0,:,:],self.psi1)
-            else:
-                raise NotImplementedError, "TODO"
-            dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X)
+            dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1,self.Z,self.X)
            dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X)

        return dL_dtheta
@ -195,16 +228,11 @@ class sparse_GP(GP):
            dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1,self.Z,self.X, self.X_uncertainty)
            dL_dZ += 2.*self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # 'stripes'
        else:
-            #re-cast computations in psi2 back to psi1:
-            if not self.likelihood.is_heteroscedastic:
-                dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2[0,:,:],self.psi1)
-            else:
-                raise NotImplementedError, "TODO"
-            dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X)
+            dL_dZ += self.kern.dK_dX(self.dL_dpsi1,self.Z,self.X)
        return dL_dZ

    def _raw_predict(self, Xnew, slices, full_cov=False):
-        """Internal helper function for making predictions, does not account for normalisation"""
+        """Internal helper function for making predictions, does not account for normalization"""

        Kx = self.kern.K(self.Z, Xnew)
        mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V)
@ -216,14 +244,3 @@ class sparse_GP(GP):
            var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0)

        return mu,var[:,None]
-
-    def plot(self, *args, **kwargs):
-        """
-        Plot the fitted model: just call the GP plot function and then add inducing inputs
-        """
-        GP.plot(self,*args,**kwargs)
-        if self.Q==1:
-            if self.has_uncertain_inputs:
-                pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten()))
-        if self.Q==2:
-            pb.plot(self.Z[:,0],self.Z[:,1],'wo')
--- a/GPy/models/sparse_GPLVM.py
+++ b/GPy/models/sparse_GPLVM.py
@ -42,11 +42,8 @@ class sparse_GPLVM(sparse_GP_regression, GPLVM):
        return sparse_GP_regression.log_likelihood(self)

    def dL_dX(self):
-        #dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
-	dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2[0,:,:],self.psi1)
-
        dL_dX = self.kern.dKdiag_dX(self.dL_dpsi0,self.X)
-        dL_dX += self.kern.dK_dX(dL_dpsi1.T,self.X,self.Z)
+        dL_dX += self.kern.dK_dX(self.dL_dpsi1.T,self.X,self.Z)

        return dL_dX

@ -58,3 +55,7 @@ class sparse_GPLVM(sparse_GP_regression, GPLVM):
        #passing Z without a small amout of jitter will induce the white kernel where we don;t want it!
        mu, var, upper, lower = sparse_GP_regression.predict(self, self.Z+np.random.randn(*self.Z.shape)*0.0001)
        pb.plot(mu[:, 0] , mu[:, 1], 'ko')
+
+    def plot_latent(self, *args, **kwargs):
+        input_1, input_2 = GPLVM.plot_latent(*args, **kwargs)
+        pb.plot(m.Z[:, input_1], m.Z[:, input_2], '^w')
--- a/GPy/models/uncollapsed_sparse_GP.py
+++ b/GPy/models/uncollapsed_sparse_GP.py
@ -93,7 +93,7 @@ class uncollapsed_sparse_GP(sparse_GP):
        return A+B+C+D+E

    def _raw_predict(self, Xnew, slices,full_cov=False):
-        """Internal helper function for making predictions, does not account for normalisation"""
+        """Internal helper function for making predictions, does not account for normalization"""
        Kx = self.kern.K(Xnew,self.Z)
        mu = mdot(Kx,self.Kmmi,self.q_u_expectation[0])

--- a/GPy/notes.txt
+++ b/GPy/notes.txt
@ -1,42 +1,61 @@
-Fails in weird ways if you pass a integer as the input instead of a double to the kernel.
+the predict method for GP_regression returns a covariance matrix which is a bad idea as this takes a lot to compute, it's also confusing for first time users. Should only be returned if the user explicitly requests it. 
+FIXED

-The Matern kernels (at least the 52) still is working in the ARD manner which means it wouldn't run for very large input dimension. Needs to be fixed to match the RBF.
-
-Implementing new covariances is too complicated at the moment. We need a barebones example of what to implement and where. Commenting in the covariance matrices needs to be improved. It's not clear to a user what all the psi parts are for. Maybe we need a cut down and simplified example to help with this (perhaps a cut down version of the RBF?). And then we should provide a simple list of what you need to do to get a new kernel going.
-
-Missing kernels: polynomial, rational quadratic.
-
-Kernel implementations are far to obscure. Need to be easily readable for a first time user.
-
-Need an implementation of scaled conjugate gradients for the optimizers.
-
-Need an implementation of gradient descent for the optimizers (works well with GP-LVM for small random initializations)
-
-Need Carl Rasmussen's permission to add his conjugate gradients algorithm. In fact, we can just provide a hook for it, and post a separate python implementation of his algorithm.
-
-Change get_param and set_param to get_params and set_params
-
-Get constrain param by default inside model creation.
-
-Randomize doesn't seem to cover a wide enough range for restarts ... try it for a model where inputs are widely spaced apart and length scale is too short. Sampling from N(0,1) is too conservative. Dangerous for people who naively use restarts. Since we have the model we could maybe come up with some sensible heuristics for setting these things. Maybe we should also consider having '.initialize()'. If we can't do this well we should disable the restart method.
-
-
-Tolerances for optimizers, do we need to introduce some standardization? At the moment does each have its own defaults?
-
-Do all optimizers work only in terms of function evaluations? Do we need to check for one that uses iterations?
+When computing kernel.K for kernels like rbf, you can't compute a version with rbf.K(X) you have to do rbf.K(X, X)
+FIXED

 Change Youter to YYT (Youter doesn't mean anything for matrices).
+FIXED
+
+Change get_param and set_param to get_params and set_params
+FIXED
+
+Fails in weird ways if you pass a integer as the input instead of a double to the kernel.
+FIXED
+
+The Matern kernels (at least the 52) still is working in the ARD manner which means it wouldn't run for very large input dimension. Needs to be fixed to match the RBF.
+FIXED
+
+Implementing new covariances is too complicated at the moment. We need a barebones example of what to implement and where. Commenting in the covariance matrices needs to be improved. It's not clear to a user what all the psi parts are for. Maybe we need a cut down and simplified example to help with this (perhaps a cut down version of the RBF?). And then we should provide a simple list of what you need to do to get a new kernel going.
+TODO, a priority for this release
+
+Missing kernels: polynomial, rational quadratic.
+TODO, should be straightforward when the above is fixed.
+
+Need an implementation of scaled conjugate gradients for the optimizers.
+UPSTREAM: scipy are tidying up the optimize module. let's wait for their next release. 
+
+Need an implementation of gradient descent for the optimizers (works well with GP-LVM for small random initializations)
+As above. 
+
+Need Carl Rasmussen's permission to add his conjugate gradients algorithm. In fact, we can just provide a hook for it, and post a separate python implementation of his algorithm.
+Any word from Carl yet?
+
+Get constrain param by default inside model creation.
+Well, we have ensure_default_constraints. There are some techinical difficulties in doing it inside model creation, so perhaps this is something for a later release.

 Bug when running classification.crescent_data()
+TODO.
+
+Do all optimizers work only in terms of function evaluations? Do we need to check for one that uses iterations?
+Upstream: Waiting for the new scipy, where the optimisers have been unified. Obviously it's be much better to be able to specify a unified set of args. 
+
+Tolerances for optimizers, do we need to introduce some standardization? At the moment does each have its own defaults?
+Upstream, as above

 A dictionary for parameter storage? So we can go through names easily?
+Wontfix. Dictionaries bring up all kinds of problems since they're not ordered. it's easy enough to do:
+for val, name in zip(m._get_params(), m._get_param_names()): foobar

 A flag on covariance functions that indicates when they are not associated with an underlying function (like white noise or a coregionalization matrix).
+TODO, agree this would be helpful.

 Diagonal noise covariance function
+TODO this is now straightforward using the likelihood framework, or as a kern. NF also requires a similar kind of kern function (a fixed form kernel)

 Long term: automatic Lagrange multiplier calculation for optimizers: constrain two parameters in an unusual way and the model automatically does the Lagrangian. Also augment the parameters with new ones, so define data variance to be white noise plus RBF variance and optimize over that and signal to noise ratio ... for example constrain the sum of variances to equal the known variance of the data.

-When computing kernel.K for kernels like rbf, you can't compute a version with rbf.K(X) you have to do rbf.K(X, X)
+Randomize doesn't seem to cover a wide enough range for restarts ... try it for a model where inputs are widely spaced apart and length scale is too short. Sampling from N(0,1) is too conservative. Dangerous for people who naively use restarts. Since we have the model we could maybe come up with some sensible heuristics for setting these things. Maybe we should also consider having '.initialize()'. If we can't do this well we should disable the restart method.
+Excellent proposal, but lots of work: suggest leaving for the next release?
+

-the predict method for GP_regression returns a covariance matrix which is a bad idea as this takes a lot to compute, it's also confusing for first time users. Should only be returned if the user explicitly requests it. 
--- a/GPy/testing/init.py
+++ b/GPy/testing/init.py
--- a/GPy/testing/bgplvm_tests.py
+++ b/GPy/testing/bgplvm_tests.py
@ -12,9 +12,10 @@ class BGPLVMTests(unittest.TestCase):
        k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
        K = k.K(X)
        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        Y -= Y.mean(axis=0)
        k = GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
        m = GPy.models.Bayesian_GPLVM(Y, Q, kernel = k,  M=M)
-        m.constrain_positive('(rbf|bias|noise|white|S)')
+        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())

@ -24,9 +25,10 @@ class BGPLVMTests(unittest.TestCase):
        k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
        K = k.K(X)
        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        Y -= Y.mean(axis=0)
        k = GPy.kern.linear(Q) + GPy.kern.white(Q, 0.00001)
        m = GPy.models.Bayesian_GPLVM(Y, Q, kernel = k,  M=M)
-        m.constrain_positive('(linear|bias|noise|white|S)')
+        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())

@ -36,13 +38,41 @@ class BGPLVMTests(unittest.TestCase):
        k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
        K = k.K(X)
        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        Y -= Y.mean(axis=0)
        k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
        m = GPy.models.Bayesian_GPLVM(Y, Q, kernel = k,  M=M)
-        m.constrain_positive('(rbf|bias|noise|white|S)')
+        m.ensure_default_constraints()
        m.randomize()
        self.assertTrue(m.checkgrad())

-        
+    def test_rbf_bias_kern(self):
+        N, M, Q, D = 10, 3, 2, 4
+        X = np.random.rand(N, Q)
+        k = GPy.kern.rbf(Q) +  GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
+        K = k.K(X)
+        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        Y -= Y.mean(axis=0)
+        k = GPy.kern.rbf(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
+        m = GPy.models.Bayesian_GPLVM(Y, Q, kernel = k,  M=M)
+        m.ensure_default_constraints()
+        m.randomize()
+        self.assertTrue(m.checkgrad())
+
+    #@unittest.skip('psi2 cross terms are NotImplemented for this combination')
+    def test_linear_bias_kern(self):
+        N, M, Q, D = 10, 3, 2, 4
+        X = np.random.rand(N, Q)
+        k = GPy.kern.linear(Q) +  GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
+        K = k.K(X)
+        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        Y -= Y.mean(axis=0)
+        k = GPy.kern.linear(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
+        m = GPy.models.Bayesian_GPLVM(Y, Q, kernel = k,  M=M)
+        m.ensure_default_constraints()
+        m.randomize()
+        self.assertTrue(m.checkgrad())
+
+
 if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
    unittest.main()
--- a/GPy/testing/examples_tests.py
+++ b/GPy/testing/examples_tests.py
@ -0,0 +1,81 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import unittest
+import numpy as np
+import GPy
+import inspect
+import pkgutil
+import os
+import random
+from nose.tools import nottest
+
+class ExamplesTests(unittest.TestCase):
+    def _checkgrad(self, model):
+        self.assertTrue(model.checkgrad())
+
+    def _model_instance(self, model):
+        self.assertTrue(isinstance(model, GPy.models))
+
+"""
+def model_instance_generator(model):
+    def check_model_returned(self):
+        self._model_instance(model)
+    return check_model_returned
+
+def checkgrads_generator(model):
+    def model_checkgrads(self):
+        self._checkgrad(model)
+    return model_checkgrads
+"""
+
+def model_checkgrads(model):
+    model.randomize()
+    assert model.checkgrad()
+
+
+def model_instance(model):
+    assert isinstance(model, GPy.core.model)
+
+@nottest
+def test_models():
+    examples_path = os.path.dirname(GPy.examples.__file__)
+    #Load modules
+    for loader, module_name, is_pkg in pkgutil.iter_modules([examples_path]):
+        #Load examples
+        module_examples = loader.find_module(module_name).load_module(module_name)
+        print "MODULE", module_examples
+        print "Before"
+        print inspect.getmembers(module_examples, predicate=inspect.isfunction)
+        functions = [ func for func in inspect.getmembers(module_examples, predicate=inspect.isfunction) if func[0].startswith('_') is False ][::-1]
+        print "After"
+        print functions
+        for example in functions:
+            if example[0] in ['oil', 'silhouette', 'GPLVM_oil_100']:
+                print "SKIPPING"
+                continue
+
+            print "Testing example: ", example[0]
+            #Generate model
+            model = example[1]()
+            print model
+
+            #Create tests for instance check
+            """
+            test = model_instance_generator(model)
+            test.__name__ = 'test_instance_%s' % example[0]
+            setattr(ExamplesTests, test.__name__, test)
+
+            #Create tests for checkgrads check
+            test = checkgrads_generator(model)
+            test.__name__ = 'test_checkgrads_%s' % example[0]
+            setattr(ExamplesTests, test.__name__, test)
+            """
+            model_checkgrads.description = 'test_checkgrads_%s' % example[0]
+            yield model_checkgrads, model
+            model_instance.description = 'test_instance_%s' % example[0]
+            yield model_instance, model
+
+if __name__ == "__main__":
+    print "Running unit tests, please be (very) patient..."
+    unittest.main()
--- a/GPy/testing/gplvm_tests.py
+++ b/GPy/testing/gplvm_tests.py
@ -0,0 +1,47 @@
+# Copyright (c) 2012, Nicolo Fusi
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import unittest
+import numpy as np
+import GPy
+
+class GPLVMTests(unittest.TestCase):
+    def test_bias_kern(self):
+        N, M, Q, D = 10, 3, 2, 4
+        X = np.random.rand(N, Q)
+        k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
+        K = k.K(X)
+        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        k = GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
+        m = GPy.models.GPLVM(Y, Q, kernel = k)
+        m.ensure_default_constraints()
+        m.randomize()
+        self.assertTrue(m.checkgrad())
+
+    def test_linear_kern(self):
+        N, M, Q, D = 10, 3, 2, 4
+        X = np.random.rand(N, Q)
+        k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
+        K = k.K(X)
+        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        k = GPy.kern.linear(Q) + GPy.kern.white(Q, 0.00001)
+        m = GPy.models.GPLVM(Y, Q, kernel = k)
+        m.ensure_default_constraints()
+        m.randomize()
+        self.assertTrue(m.checkgrad())
+
+    def test_rbf_kern(self):
+        N, M, Q, D = 10, 3, 2, 4
+        X = np.random.rand(N, Q)
+        k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
+        K = k.K(X)
+        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
+        m = GPy.models.GPLVM(Y, Q, kernel = k)
+        m.ensure_default_constraints()
+        m.randomize()
+        self.assertTrue(m.checkgrad())
+
+if __name__ == "__main__":
+    print "Running unit tests, please be (very) patient..."
+    unittest.main()
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@ -8,14 +8,29 @@ import GPy
 class KernelTests(unittest.TestCase):
    def test_kerneltie(self):
        K = GPy.kern.rbf(5, ARD=True)
-        K.tie_param('[01]')
+        K.tie_params('[01]')
        K.constrain_fixed('2')
        X = np.random.rand(5,5)
        Y = np.ones((5,1))
        m = GPy.models.GP_regression(X,Y,K)
-        print m
        self.assertTrue(m.checkgrad())

+    def test_coregionalisation(self):
+        X1 = np.random.rand(50,1)*8
+        X2 = np.random.rand(30,1)*5
+        index = np.vstack((np.zeros_like(X1),np.ones_like(X2)))
+        X = np.hstack((np.vstack((X1,X2)),index))
+        Y1 = np.sin(X1) + np.random.randn(*X1.shape)*0.05
+        Y2 = np.sin(X2) + np.random.randn(*X2.shape)*0.05 + 2.
+        Y = np.vstack((Y1,Y2))
+
+        k1 = GPy.kern.rbf(1) + GPy.kern.bias(1)
+        k2 = GPy.kern.coregionalise(2,1)
+        k = k1.prod_orthogonal(k2)
+        m = GPy.models.GP_regression(X,Y,kernel=k)
+        self.assertTrue(m.checkgrad())
+
+


 if __name__ == "__main__":
--- a/GPy/testing/sparse_gplvm_tests.py
+++ b/GPy/testing/sparse_gplvm_tests.py
@ -0,0 +1,48 @@
+# Copyright (c) 2012, Nicolo Fusi, James Hensman
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import unittest
+import numpy as np
+import GPy
+
+class sparse_GPLVMTests(unittest.TestCase):
+    def test_bias_kern(self):
+        N, M, Q, D = 10, 3, 2, 4
+        X = np.random.rand(N, Q)
+        k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
+        K = k.K(X)
+        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        k = GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
+        m = GPy.models.sparse_GPLVM(Y, Q, kernel = k, M=M)
+        m.ensure_default_constraints()
+        m.randomize()
+        self.assertTrue(m.checkgrad())
+
+    @unittest.skip('linear kernels do not have dKdiag_dX')
+    def test_linear_kern(self):
+        N, M, Q, D = 10, 3, 2, 4
+        X = np.random.rand(N, Q)
+        k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
+        K = k.K(X)
+        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        k = GPy.kern.linear(Q) + GPy.kern.white(Q, 0.00001)
+        m = GPy.models.sparse_GPLVM(Y, Q, kernel = k, M=M)
+        m.ensure_default_constraints()
+        m.randomize()
+        self.assertTrue(m.checkgrad())
+
+    def test_rbf_kern(self):
+        N, M, Q, D = 10, 3, 2, 4
+        X = np.random.rand(N, Q)
+        k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
+        K = k.K(X)
+        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
+        k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
+        m = GPy.models.sparse_GPLVM(Y, Q, kernel = k, M=M)
+        m.ensure_default_constraints()
+        m.randomize()
+        self.assertTrue(m.checkgrad())
+
+if __name__ == "__main__":
+    print "Running unit tests, please be (very) patient..."
+    unittest.main()
--- a/GPy/testing/unit_tests.py
+++ b/GPy/testing/unit_tests.py
@ -157,13 +157,28 @@ class GradientTests(unittest.TestCase):
    def test_GP_EP_probit(self):
        N = 20
        X = np.hstack([np.random.normal(5,2,N/2),np.random.normal(10,2,N/2)])[:,None]
-        Y = np.hstack([np.ones(N/2),np.repeat(-1,N/2)])[:,None]
+        Y = np.hstack([np.ones(N/2),np.zeros(N/2)])[:,None]
        kernel = GPy.kern.rbf(1)
        distribution = GPy.likelihoods.likelihood_functions.probit()
        likelihood = GPy.likelihoods.EP(Y, distribution)
        m = GPy.models.GP(X, likelihood, kernel)
        m.ensure_default_constraints()
-        self.assertTrue(m.EPEM)
+        m.update_likelihood_approximation()
+        self.assertTrue(m.checkgrad())
+        #self.assertTrue(m.EPEM)
+
+    def test_sparse_EP_DTC_probit(self):
+        N = 20
+        X = np.hstack([np.random.normal(5,2,N/2),np.random.normal(10,2,N/2)])[:,None]
+        Y = np.hstack([np.ones(N/2),np.zeros(N/2)])[:,None]
+        Z = np.linspace(0,15,4)[:,None]
+        kernel = GPy.kern.rbf(1)
+        distribution = GPy.likelihoods.likelihood_functions.probit()
+        likelihood = GPy.likelihoods.EP(Y, distribution)
+        m = GPy.models.sparse_GP(X, likelihood, kernel,Z)
+        m.ensure_default_constraints()
+        m.update_likelihood_approximation()
+        self.assertTrue(m.checkgrad())

    @unittest.skip("FITC will be broken for a while")
    def test_generalized_FITC(self):
@ -177,17 +192,6 @@ class GradientTests(unittest.TestCase):
        m.approximate_likelihood()
        self.assertTrue(m.checkgrad())

-    def test_warped_GP(self):
-        xmin, xmax = 1, 2.5*np.pi
-        b, C, SNR = 1, 0, 0.1
-        X = np.linspace(xmin, xmax, 500)
-        y  = b*X + C + 1*np.sin(X)
-        y += 0.05*np.random.randn(len(X))
-        X, y = X[:, None], y[:, None]
-        m = GPy.models.warpedGP(X, y, warping_terms = 3)
-        m.constrain_positive('(tanh_a|tanh_b|rbf|white|bias)')
-        self.assertTrue(m.checkgrad())
-

 if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
--- a/GPy/util/Tango.py
+++ b/GPy/util/Tango.py
@ -25,7 +25,7 @@ def fewerXticks(ax=None,divideby=2):
    ax.set_xticks(ax.get_xticks()[::divideby])


-coloursHex = {\
+colorsHex = {\
 "Aluminium6":"#2e3436",\
 "Aluminium5":"#555753",\
 "Aluminium4":"#888a85",\
@ -54,9 +54,9 @@ coloursHex = {\
 "mediumButter":"#edd400",\
 "darkButter":"#c4a000"}

-darkList = [coloursHex['darkBlue'],coloursHex['darkRed'],coloursHex['darkGreen'], coloursHex['darkOrange'], coloursHex['darkButter'], coloursHex['darkPurple'], coloursHex['darkChocolate'], coloursHex['Aluminium6']]
-mediumList = [coloursHex['mediumBlue'], coloursHex['mediumRed'],coloursHex['mediumGreen'], coloursHex['mediumOrange'], coloursHex['mediumButter'], coloursHex['mediumPurple'], coloursHex['mediumChocolate'], coloursHex['Aluminium5']]
-lightList = [coloursHex['lightBlue'], coloursHex['lightRed'],coloursHex['lightGreen'], coloursHex['lightOrange'], coloursHex['lightButter'], coloursHex['lightPurple'], coloursHex['lightChocolate'], coloursHex['Aluminium4']]
+darkList = [colorsHex['darkBlue'],colorsHex['darkRed'],colorsHex['darkGreen'], colorsHex['darkOrange'], colorsHex['darkButter'], colorsHex['darkPurple'], colorsHex['darkChocolate'], colorsHex['Aluminium6']]
+mediumList = [colorsHex['mediumBlue'], colorsHex['mediumRed'],colorsHex['mediumGreen'], colorsHex['mediumOrange'], colorsHex['mediumButter'], colorsHex['mediumPurple'], colorsHex['mediumChocolate'], colorsHex['Aluminium5']]
+lightList = [colorsHex['lightBlue'], colorsHex['lightRed'],colorsHex['lightGreen'], colorsHex['lightOrange'], colorsHex['lightButter'], colorsHex['lightPurple'], colorsHex['lightChocolate'], colorsHex['Aluminium4']]

 def currentDark():
    return darkList[-1]
@ -76,85 +76,85 @@ def nextLight():
    return lightList[-1]

 def reset():
-    while not darkList[0]==coloursHex['darkBlue']:
+    while not darkList[0]==colorsHex['darkBlue']:
        darkList.append(darkList.pop(0))
-    while not mediumList[0]==coloursHex['mediumBlue']:
+    while not mediumList[0]==colorsHex['mediumBlue']:
        mediumList.append(mediumList.pop(0))
-    while not lightList[0]==coloursHex['lightBlue']:
+    while not lightList[0]==colorsHex['lightBlue']:
        lightList.append(lightList.pop(0))

 def setLightFigures():
-    mpl.rcParams['axes.edgecolor']=coloursHex['Aluminium6']
-    mpl.rcParams['axes.facecolor']=coloursHex['Aluminium2']
-    mpl.rcParams['axes.labelcolor']=coloursHex['Aluminium6']
-    mpl.rcParams['figure.edgecolor']=coloursHex['Aluminium6']
-    mpl.rcParams['figure.facecolor']=coloursHex['Aluminium2']
-    mpl.rcParams['grid.color']=coloursHex['Aluminium6']
-    mpl.rcParams['savefig.edgecolor']=coloursHex['Aluminium2']
-    mpl.rcParams['savefig.facecolor']=coloursHex['Aluminium2']
-    mpl.rcParams['text.color']=coloursHex['Aluminium6']
-    mpl.rcParams['xtick.color']=coloursHex['Aluminium6']
-    mpl.rcParams['ytick.color']=coloursHex['Aluminium6']
+    mpl.rcParams['axes.edgecolor']=colorsHex['Aluminium6']
+    mpl.rcParams['axes.facecolor']=colorsHex['Aluminium2']
+    mpl.rcParams['axes.labelcolor']=colorsHex['Aluminium6']
+    mpl.rcParams['figure.edgecolor']=colorsHex['Aluminium6']
+    mpl.rcParams['figure.facecolor']=colorsHex['Aluminium2']
+    mpl.rcParams['grid.color']=colorsHex['Aluminium6']
+    mpl.rcParams['savefig.edgecolor']=colorsHex['Aluminium2']
+    mpl.rcParams['savefig.facecolor']=colorsHex['Aluminium2']
+    mpl.rcParams['text.color']=colorsHex['Aluminium6']
+    mpl.rcParams['xtick.color']=colorsHex['Aluminium6']
+    mpl.rcParams['ytick.color']=colorsHex['Aluminium6']

 def setDarkFigures():
-    mpl.rcParams['axes.edgecolor']=coloursHex['Aluminium2']
-    mpl.rcParams['axes.facecolor']=coloursHex['Aluminium6']
-    mpl.rcParams['axes.labelcolor']=coloursHex['Aluminium2']
-    mpl.rcParams['figure.edgecolor']=coloursHex['Aluminium2']
-    mpl.rcParams['figure.facecolor']=coloursHex['Aluminium6']
-    mpl.rcParams['grid.color']=coloursHex['Aluminium2']
-    mpl.rcParams['savefig.edgecolor']=coloursHex['Aluminium6']
-    mpl.rcParams['savefig.facecolor']=coloursHex['Aluminium6']
-    mpl.rcParams['text.color']=coloursHex['Aluminium2']
-    mpl.rcParams['xtick.color']=coloursHex['Aluminium2']
-    mpl.rcParams['ytick.color']=coloursHex['Aluminium2']
+    mpl.rcParams['axes.edgecolor']=colorsHex['Aluminium2']
+    mpl.rcParams['axes.facecolor']=colorsHex['Aluminium6']
+    mpl.rcParams['axes.labelcolor']=colorsHex['Aluminium2']
+    mpl.rcParams['figure.edgecolor']=colorsHex['Aluminium2']
+    mpl.rcParams['figure.facecolor']=colorsHex['Aluminium6']
+    mpl.rcParams['grid.color']=colorsHex['Aluminium2']
+    mpl.rcParams['savefig.edgecolor']=colorsHex['Aluminium6']
+    mpl.rcParams['savefig.facecolor']=colorsHex['Aluminium6']
+    mpl.rcParams['text.color']=colorsHex['Aluminium2']
+    mpl.rcParams['xtick.color']=colorsHex['Aluminium2']
+    mpl.rcParams['ytick.color']=colorsHex['Aluminium2']

 def hex2rgb(hexcolor):
    hexcolor = [hexcolor[1+2*i:1+2*(i+1)] for i in range(3)]
    r,g,b = [int(n,16) for n in hexcolor]
    return (r,g,b)

-coloursRGB = dict([(k,hex2rgb(i)) for k,i in coloursHex.items()])
+colorsRGB = dict([(k,hex2rgb(i)) for k,i in colorsHex.items()])

-cdict_RB = {'red' :((0.,coloursRGB['mediumRed'][0]/256.,coloursRGB['mediumRed'][0]/256.),
-                     (.5,coloursRGB['mediumPurple'][0]/256.,coloursRGB['mediumPurple'][0]/256.),
-                     (1.,coloursRGB['mediumBlue'][0]/256.,coloursRGB['mediumBlue'][0]/256.)),
-            'green':((0.,coloursRGB['mediumRed'][1]/256.,coloursRGB['mediumRed'][1]/256.),
-                     (.5,coloursRGB['mediumPurple'][1]/256.,coloursRGB['mediumPurple'][1]/256.),
-                     (1.,coloursRGB['mediumBlue'][1]/256.,coloursRGB['mediumBlue'][1]/256.)),
-            'blue':((0.,coloursRGB['mediumRed'][2]/256.,coloursRGB['mediumRed'][2]/256.),
-                      (.5,coloursRGB['mediumPurple'][2]/256.,coloursRGB['mediumPurple'][2]/256.),
-                      (1.,coloursRGB['mediumBlue'][2]/256.,coloursRGB['mediumBlue'][2]/256.))}
+cdict_RB = {'red' :((0.,colorsRGB['mediumRed'][0]/256.,colorsRGB['mediumRed'][0]/256.),
+                     (.5,colorsRGB['mediumPurple'][0]/256.,colorsRGB['mediumPurple'][0]/256.),
+                     (1.,colorsRGB['mediumBlue'][0]/256.,colorsRGB['mediumBlue'][0]/256.)),
+            'green':((0.,colorsRGB['mediumRed'][1]/256.,colorsRGB['mediumRed'][1]/256.),
+                     (.5,colorsRGB['mediumPurple'][1]/256.,colorsRGB['mediumPurple'][1]/256.),
+                     (1.,colorsRGB['mediumBlue'][1]/256.,colorsRGB['mediumBlue'][1]/256.)),
+            'blue':((0.,colorsRGB['mediumRed'][2]/256.,colorsRGB['mediumRed'][2]/256.),
+                      (.5,colorsRGB['mediumPurple'][2]/256.,colorsRGB['mediumPurple'][2]/256.),
+                      (1.,colorsRGB['mediumBlue'][2]/256.,colorsRGB['mediumBlue'][2]/256.))}

-cdict_BGR = {'red' :((0.,coloursRGB['mediumBlue'][0]/256.,coloursRGB['mediumBlue'][0]/256.),
-                     (.5,coloursRGB['mediumGreen'][0]/256.,coloursRGB['mediumGreen'][0]/256.),
-                     (1.,coloursRGB['mediumRed'][0]/256.,coloursRGB['mediumRed'][0]/256.)),
-            'green':((0.,coloursRGB['mediumBlue'][1]/256.,coloursRGB['mediumBlue'][1]/256.),
-                     (.5,coloursRGB['mediumGreen'][1]/256.,coloursRGB['mediumGreen'][1]/256.),
-                     (1.,coloursRGB['mediumRed'][1]/256.,coloursRGB['mediumRed'][1]/256.)),
-            'blue':((0.,coloursRGB['mediumBlue'][2]/256.,coloursRGB['mediumBlue'][2]/256.),
-                      (.5,coloursRGB['mediumGreen'][2]/256.,coloursRGB['mediumGreen'][2]/256.),
-                      (1.,coloursRGB['mediumRed'][2]/256.,coloursRGB['mediumRed'][2]/256.))}
+cdict_BGR = {'red' :((0.,colorsRGB['mediumBlue'][0]/256.,colorsRGB['mediumBlue'][0]/256.),
+                     (.5,colorsRGB['mediumGreen'][0]/256.,colorsRGB['mediumGreen'][0]/256.),
+                     (1.,colorsRGB['mediumRed'][0]/256.,colorsRGB['mediumRed'][0]/256.)),
+            'green':((0.,colorsRGB['mediumBlue'][1]/256.,colorsRGB['mediumBlue'][1]/256.),
+                     (.5,colorsRGB['mediumGreen'][1]/256.,colorsRGB['mediumGreen'][1]/256.),
+                     (1.,colorsRGB['mediumRed'][1]/256.,colorsRGB['mediumRed'][1]/256.)),
+            'blue':((0.,colorsRGB['mediumBlue'][2]/256.,colorsRGB['mediumBlue'][2]/256.),
+                      (.5,colorsRGB['mediumGreen'][2]/256.,colorsRGB['mediumGreen'][2]/256.),
+                      (1.,colorsRGB['mediumRed'][2]/256.,colorsRGB['mediumRed'][2]/256.))}


-cdict_Alu = {'red' :((0./5,coloursRGB['Aluminium1'][0]/256.,coloursRGB['Aluminium1'][0]/256.),
-                     (1./5,coloursRGB['Aluminium2'][0]/256.,coloursRGB['Aluminium2'][0]/256.),
-                     (2./5,coloursRGB['Aluminium3'][0]/256.,coloursRGB['Aluminium3'][0]/256.),
-                     (3./5,coloursRGB['Aluminium4'][0]/256.,coloursRGB['Aluminium4'][0]/256.),
-                     (4./5,coloursRGB['Aluminium5'][0]/256.,coloursRGB['Aluminium5'][0]/256.),
-                     (5./5,coloursRGB['Aluminium6'][0]/256.,coloursRGB['Aluminium6'][0]/256.)),
-           'green' :((0./5,coloursRGB['Aluminium1'][1]/256.,coloursRGB['Aluminium1'][1]/256.),
-                     (1./5,coloursRGB['Aluminium2'][1]/256.,coloursRGB['Aluminium2'][1]/256.),
-                     (2./5,coloursRGB['Aluminium3'][1]/256.,coloursRGB['Aluminium3'][1]/256.),
-                     (3./5,coloursRGB['Aluminium4'][1]/256.,coloursRGB['Aluminium4'][1]/256.),
-                     (4./5,coloursRGB['Aluminium5'][1]/256.,coloursRGB['Aluminium5'][1]/256.),
-                     (5./5,coloursRGB['Aluminium6'][1]/256.,coloursRGB['Aluminium6'][1]/256.)),
-            'blue' :((0./5,coloursRGB['Aluminium1'][2]/256.,coloursRGB['Aluminium1'][2]/256.),
-                     (1./5,coloursRGB['Aluminium2'][2]/256.,coloursRGB['Aluminium2'][2]/256.),
-                     (2./5,coloursRGB['Aluminium3'][2]/256.,coloursRGB['Aluminium3'][2]/256.),
-                     (3./5,coloursRGB['Aluminium4'][2]/256.,coloursRGB['Aluminium4'][2]/256.),
-                     (4./5,coloursRGB['Aluminium5'][2]/256.,coloursRGB['Aluminium5'][2]/256.),
-                     (5./5,coloursRGB['Aluminium6'][2]/256.,coloursRGB['Aluminium6'][2]/256.))}
+cdict_Alu = {'red' :((0./5,colorsRGB['Aluminium1'][0]/256.,colorsRGB['Aluminium1'][0]/256.),
+                     (1./5,colorsRGB['Aluminium2'][0]/256.,colorsRGB['Aluminium2'][0]/256.),
+                     (2./5,colorsRGB['Aluminium3'][0]/256.,colorsRGB['Aluminium3'][0]/256.),
+                     (3./5,colorsRGB['Aluminium4'][0]/256.,colorsRGB['Aluminium4'][0]/256.),
+                     (4./5,colorsRGB['Aluminium5'][0]/256.,colorsRGB['Aluminium5'][0]/256.),
+                     (5./5,colorsRGB['Aluminium6'][0]/256.,colorsRGB['Aluminium6'][0]/256.)),
+           'green' :((0./5,colorsRGB['Aluminium1'][1]/256.,colorsRGB['Aluminium1'][1]/256.),
+                     (1./5,colorsRGB['Aluminium2'][1]/256.,colorsRGB['Aluminium2'][1]/256.),
+                     (2./5,colorsRGB['Aluminium3'][1]/256.,colorsRGB['Aluminium3'][1]/256.),
+                     (3./5,colorsRGB['Aluminium4'][1]/256.,colorsRGB['Aluminium4'][1]/256.),
+                     (4./5,colorsRGB['Aluminium5'][1]/256.,colorsRGB['Aluminium5'][1]/256.),
+                     (5./5,colorsRGB['Aluminium6'][1]/256.,colorsRGB['Aluminium6'][1]/256.)),
+            'blue' :((0./5,colorsRGB['Aluminium1'][2]/256.,colorsRGB['Aluminium1'][2]/256.),
+                     (1./5,colorsRGB['Aluminium2'][2]/256.,colorsRGB['Aluminium2'][2]/256.),
+                     (2./5,colorsRGB['Aluminium3'][2]/256.,colorsRGB['Aluminium3'][2]/256.),
+                     (3./5,colorsRGB['Aluminium4'][2]/256.,colorsRGB['Aluminium4'][2]/256.),
+                     (4./5,colorsRGB['Aluminium5'][2]/256.,colorsRGB['Aluminium5'][2]/256.),
+                     (5./5,colorsRGB['Aluminium6'][2]/256.,colorsRGB['Aluminium6'][2]/256.))}
 # cmap_Alu = mpl.colors.LinearSegmentedColormap('TangoAluminium',cdict_Alu,256)
 # cmap_BGR = mpl.colors.LinearSegmentedColormap('TangoRedBlue',cdict_BGR,256)
 # cmap_RB = mpl.colors.LinearSegmentedColormap('TangoRedBlue',cdict_RB,256)
--- a/GPy/util/init.py
+++ b/GPy/util/init.py
@ -10,3 +10,4 @@ import Tango
 import misc
 import warping_functions
 import datasets
+import decorators
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@ -46,7 +46,7 @@ def oil_100(seed=default_seed):
    return {'X': X, 'Y': Y, 'info': "Subsample of the oil data extracting 100 values randomly without replacement."}

 def pumadyn(seed=default_seed):
-    # Data is variance 1, no need to normalise.
+    # Data is variance 1, no need to normalize.
    data = np.loadtxt(os.path.join(data_path, 'pumadyn-32nm/Dataset.data.gz'))
    indices = np.random.permutation(data.shape[0])
    indicesTrain = indices[0:7168]
--- a/GPy/util/decorators.py
+++ b/GPy/util/decorators.py
@ -0,0 +1,16 @@
+import numpy as np
+from functools import wraps
+
+def silence_errors(f):
+    """
+    This wraps a function and it silences numpy errors that
+    happen during the execution. After the function has exited, it restores
+    the previous state of the warnings.
+    """
+    @wraps(f)
+    def wrapper(*args, **kwds):
+        status = np.seterr(all='ignore')
+        result = f(*args, **kwds)
+        np.seterr(**status)
+        return result
+    return wrapper
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@ -11,9 +11,15 @@ import re
 import pdb
 import cPickle
 import types
-import scipy.lib.lapack.flapack
+#import scipy.lib.lapack.flapack
 import scipy as sp

+def trace_dot(a,b):
+    """
+    efficiently compute the trace of the matrix product of a and b
+    """
+    return np.sum(a*b)
+
 def mdot(*args):
   """Multiply all the arguments using matrix product rules.
   The output is equivalent to multiplying the arguments one by one
@ -101,7 +107,7 @@ def chol_inv(L):

    """

-    return linalg.flapack.dtrtri(L, lower = True)[0]
+    return linalg.lapack.flapack.dtrtri(L, lower = True)[0]


 def multiple_pdinv(A):
@ -118,7 +124,7 @@ def multiple_pdinv(A):
    N = A.shape[-1]
    chols = [jitchol(A[:,:,i]) for i in range(N)]
    halflogdets = [np.sum(np.log(np.diag(L[0]))) for L in chols]
-    invs = [linalg.flapack.dpotri(L[0],True)[0] for L in chols]
+    invs = [linalg.lapack.flapack.dpotri(L[0],True)[0] for L in chols]
    invs = [np.triu(I)+np.triu(I,1).T for I in invs]
    return np.dstack(invs),np.array(halflogdets)

--- a/GPy/util/plot.py
+++ b/GPy/util/plot.py
@ -6,7 +6,7 @@ import Tango
 import pylab as pb
 import numpy as np

-def gpplot(x,mu,lower,upper,edgecol=Tango.coloursHex['darkBlue'],fillcol=Tango.coloursHex['lightBlue'],axes=None,**kwargs):
+def gpplot(x,mu,lower,upper,edgecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue'],axes=None,**kwargs):
    if axes is None:
        axes = pb.gca()
    mu = mu.flatten()
--- a/README.md
+++ b/README.md
@ -4,4 +4,7 @@ GPy
 A Gaussian processes framework in python

 * [Online documentation](https://gpy.readthedocs.org/en/latest/)
-* [Unit tests (Travis-CI)](https://travis-ci.org/SheffieldML/GPy)
+* [Unit tests (Travis-CI)](https://travis-ci.org/SheffieldML/GPy)
+
+
+Continuous integration status: ![CI status](https://travis-ci.org/SheffieldML/GPy.png)
--- a/doc/Figures/tick.png
+++ b/doc/Figures/tick.png
--- a/doc/GPy.examples.rst
+++ b/doc/GPy.examples.rst
@ -9,14 +9,6 @@ examples Package
    :undoc-members:
    :show-inheritance:

-:mod:`BGPLVM_demo` Module
-------------------------
-
-.. automodule:: GPy.examples.BGPLVM_demo
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
 :mod:`classification` Module
 ----------------------------

@ -25,18 +17,18 @@ examples Package
    :undoc-members:
    :show-inheritance:

-:mod:`oil_flow_demo` Module
---------------------------
+:mod:`dimensionality_reduction` Module
+--------------------------------------

-.. automodule:: GPy.examples.oil_flow_demo
+.. automodule:: GPy.examples.dimensionality_reduction
    :members:
    :undoc-members:
    :show-inheritance:

-:mod:`poisson` Module
---------------------
+:mod:`non_gaussian` Module
+--------------------------

-.. automodule:: GPy.examples.poisson
+.. automodule:: GPy.examples.non_gaussian
    :members:
    :undoc-members:
    :show-inheritance:
@ -49,50 +41,10 @@ examples Package
    :undoc-members:
    :show-inheritance:

-:mod:`sparse_GPLVM_demo` Module
-------------------------------
+:mod:`tutorials` Module
+-----------------------

-.. automodule:: GPy.examples.sparse_GPLVM_demo
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-:mod:`sparse_GP_regression_demo` Module
---------------------------------------
-
-.. automodule:: GPy.examples.sparse_GP_regression_demo
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-:mod:`sparse_ep_fix` Module
---------------------------
-
-.. automodule:: GPy.examples.sparse_ep_fix
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-:mod:`uncertain_input_GP_regression_demo` Module
------------------------------------------------
-
-.. automodule:: GPy.examples.uncertain_input_GP_regression_demo
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-:mod:`uncollapsed_GP_demo` Module
---------------------------------
-
-.. automodule:: GPy.examples.uncollapsed_GP_demo
-    :members:
-    :undoc-members:
-    :show-inheritance:
-
-:mod:`unsupervised` Module
--------------------------
-
-.. automodule:: GPy.examples.unsupervised
+.. automodule:: GPy.examples.tutorials
    :members:
    :undoc-members:
    :show-inheritance:
--- a/doc/GPy.kern.rst
+++ b/doc/GPy.kern.rst
@ -49,6 +49,14 @@ kern Package
    :undoc-members:
    :show-inheritance:

+:mod:`coregionalise` Module
+---------------------------
+
+.. automodule:: GPy.kern.coregionalise
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 :mod:`exponential` Module
 -------------------------

@ -113,18 +121,26 @@ kern Package
    :undoc-members:
    :show-inheritance:

-:mod:`product` Module
---------------------
+:mod:`prod` Module
+------------------

-.. automodule:: GPy.kern.product
+.. automodule:: GPy.kern.prod
    :members:
    :undoc-members:
    :show-inheritance:

-:mod:`product_orthogonal` Module
+:mod:`prod_orthogonal` Module
+-----------------------------
+
+.. automodule:: GPy.kern.prod_orthogonal
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`rational_quadratic` Module
 --------------------------------

-.. automodule:: GPy.kern.product_orthogonal
+.. automodule:: GPy.kern.rational_quadratic
    :members:
    :undoc-members:
    :show-inheritance:
@ -145,6 +161,14 @@ kern Package
    :undoc-members:
    :show-inheritance:

+:mod:`symmetric` Module
+-----------------------
+
+.. automodule:: GPy.kern.symmetric
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 :mod:`sympykern` Module
 -----------------------

--- a/doc/GPy.models.rst
+++ b/doc/GPy.models.rst
@ -9,10 +9,10 @@ models Package
    :undoc-members:
    :show-inheritance:

-:mod:`BGPLVM` Module
--------------------
+:mod:`Bayesian_GPLVM` Module
+----------------------------

-.. automodule:: GPy.models.BGPLVM
+.. automodule:: GPy.models.Bayesian_GPLVM
    :members:
    :undoc-members:
    :show-inheritance:
--- a/doc/GPy.rst
+++ b/doc/GPy.rst
@ -20,5 +20,6 @@ Subpackages
    GPy.kern
    GPy.likelihoods
    GPy.models
+    GPy.testing
    GPy.util

--- a/doc/GPy.testing.rst
+++ b/doc/GPy.testing.rst
@ -0,0 +1,59 @@
+testing Package
+===============
+
+:mod:`bgplvm_tests` Module
+--------------------------
+
+.. automodule:: GPy.testing.bgplvm_tests
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`examples_tests` Module
+----------------------------
+
+.. automodule:: GPy.testing.examples_tests
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`gplvm_tests` Module
+-------------------------
+
+.. automodule:: GPy.testing.gplvm_tests
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`kernel_tests` Module
+--------------------------
+
+.. automodule:: GPy.testing.kernel_tests
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`prior_tests` Module
+-------------------------
+
+.. automodule:: GPy.testing.prior_tests
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`sparse_gplvm_tests` Module
+--------------------------------
+
+.. automodule:: GPy.testing.sparse_gplvm_tests
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
+:mod:`unit_tests` Module
+------------------------
+
+.. automodule:: GPy.testing.unit_tests
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
--- a/doc/index.rst
+++ b/doc/index.rst
@ -8,9 +8,9 @@ Welcome to GPy's documentation!
 For a quick start, you can have a look at one of the tutorials:

 * `Basic Gaussian process regression <tuto_GP_regression.html>`_  
+* `Interacting with models <tuto_interacting_with_models.html>`_
 * `A kernel overview <tuto_kernel_overview.html>`_ 
-* Advanced GP regression (Forthcoming)
-* Writting kernels (Forthcoming)
+* `Writing new kernels <tuto_creating_new_kernels.html>`_

 You may also be interested by some examples in the GPy/examples folder.

--- a/doc/kernel_implementation.rst
+++ b/doc/kernel_implementation.rst
@ -0,0 +1,49 @@
+
+***************************
+List of implemented kernels
+***************************
+
+The following table shows the implemented kernels in GPy and gives the details of the implemented function for each kernel.
+
+==================== =========== =====  ===========  ======  ======= =========== =============== ======= =========== ====== ====== =======
+NAME                  Dimension   ARD   get/set      K       Kdiag   dK_dtheta   dKdiag_dtheta   dK_dX   dKdiag_dX   psi0   psi1   psi2
+==================== =========== =====  ===========  ======  ======= =========== =============== ======= =========== ====== ====== =======
+bias                 n                  |tick|       |tick|  |tick|  |tick|      |tick|          |tick|  |tick|      |tick| |tick| |tick|
+-------------------- ----------- -----  -----------  ------  ------- ----------- --------------- ------- ----------- ------ ------ -------
+Brownian             1                  |tick|       |tick|  |tick|  |tick|      |tick|          |tick|  |tick|                 
+-------------------- ----------- -----  -----------  ------  ------- ----------- --------------- ------- ----------- ------ ------ -------
+exponential          n           yes    |tick|       |tick|  |tick|  |tick|      |tick|          |tick|  |tick|
+-------------------- ----------- -----  -----------  ------  ------- ----------- --------------- ------- ----------- ------ ------ -------
+finite_dimensional   n                  |tick|       |tick|  |tick|  |tick|      |tick| 
+-------------------- ----------- -----  -----------  ------  ------- ----------- --------------- ------- ----------- ------ ------ -------
+linear               n           yes    |tick|       |tick|  |tick|  |tick|      |tick|          |tick|              |tick| |tick| |tick|
+-------------------- ----------- -----  -----------  ------  ------- ----------- --------------- ------- ----------- ------ ------ -------
+Matern32             n           yes    |tick|       |tick|  |tick|  |tick|      |tick|          |tick|  |tick|        
+-------------------- ----------- -----  -----------  ------  ------- ----------- --------------- ------- ----------- ------ ------ -------
+Matern52             n           yes    |tick|       |tick|  |tick|  |tick|      |tick|          |tick|  |tick|
+-------------------- ----------- -----  -----------  ------  ------- ----------- --------------- ------- ----------- ------ ------ -------
+periodic_exponential 1                  |tick|       |tick|  |tick|  |tick|      |tick|
+-------------------- ----------- -----  -----------  ------  ------- ----------- --------------- ------- ----------- ------ ------ -------
+periodic_Matern32    1                  |tick|       |tick|  |tick|  |tick|      |tick|
+-------------------- ----------- -----  -----------  ------  ------- ----------- --------------- ------- ----------- ------ ------ -------
+periodic_Matern52    1                  |tick|       |tick|  |tick|  |tick|      |tick|
+-------------------- ----------- -----  -----------  ------  ------- ----------- --------------- ------- ----------- ------ ------ -------
+rational quadratic   1                  |tick|       |tick|  |tick|  |tick|      |tick|          |tick|  |tick|                           
+-------------------- ----------- -----  -----------  ------  ------- ----------- --------------- ------- ----------- ------ ------ -------
+rbf                  n           yes    |tick|       |tick|  |tick|  |tick|      |tick|          |tick|  |tick|      |tick| |tick| |tick|
+-------------------- ----------- -----  -----------  ------  ------- ----------- --------------- ------- ----------- ------ ------ -------
+spline               1                  |tick|       |tick|  |tick|  |tick|      |tick|                  |tick|     
+-------------------- ----------- -----  -----------  ------  ------- ----------- --------------- ------- ----------- ------ ------ -------
+white                n                  |tick|       |tick|  |tick|  |tick|      |tick|          |tick|  |tick|      |tick| |tick| |tick|
+==================== =========== =====  ===========  ======  ======= =========== =============== ======= =========== ====== ====== =======
+
+Depending on the use, all functions may not be required
+
+    * ``get/set, K, Kdiag``: compulsory
+    * ``dK_dtheta``: necessary to optimize the model
+    * ``dKdiag_dtheta``: sparse models, BGPLVM, GPs with uncertain inputs
+    * ``dK_dX``: sparse models, GPLVM, BGPLVM, GPs with uncertain inputs
+    * ``dKdiag_dX``: sparse models, BGPLVM, GPs with uncertain inputs
+    * ``psi0, psi1, psi2``: BGPLVM, GPs with uncertain inputs
+
+..  |tick| image:: Figures/tick.png
--- a/doc/tuto_GP_regression.rst
+++ b/doc/tuto_GP_regression.rst
@ -2,7 +2,7 @@
 Gaussian process regression tutorial
 *************************************

-We will see in this tutorial the basics for building a 1 dimensional and a 2 dimensional Gaussian process regression model, also known as a kriging model.
+We will see in this tutorial the basics for building a 1 dimensional and a 2 dimensional Gaussian process regression model, also known as a kriging model. The code shown in this tutorial can be obtained at GPy/examples/tutorials.py, or by running ``GPy.examples.tutorials.tuto_GP_regression()``.

 We first import the libraries we will need: ::

--- a/doc/tuto_creating_new_kernels.rst
+++ b/doc/tuto_creating_new_kernels.rst
@ -0,0 +1,183 @@
+********************
+Creating new kernels
+********************
+
+We will see in this tutorial how to create new kernels in GPy. We will also give details on how to implement each function of the kernel and illustrate with a running example: the rational quadratic kernel. 
+
+Structure of a kernel in GPy
+============================
+
+In GPy a kernel object is made of a list of kernpart objects, which correspond to symetric positive definite functions. More precisely, the kernel should be understood as the sum of the kernparts. In order to implement a new covariance, the following steps must be followed
+
+    1. implement the new covariance as a kernpart object
+    2. update the constructors that allow to use the kernpart as a kern object
+    3. update the __init__.py file
+
+Theses three steps are detailed below.
+
+Implementing a kernpart object
+==============================
+
+We advise the reader to start with copy-pasting an existing kernel and to modify the new file. We will now give a description of the various functions that can be found in a kernpart object.
+
+**Header**
+
+The header is similar to all kernels: ::
+
+    from kernpart import kernpart
+    import numpy as np
+
+    class rational_quadratic(kernpart):
+
+**__init__(self,D, param1, param2, ...)**
+
+The implementation of this function in mandatory.
+
+For all kernparts the first parameter ``D`` corresponds to the dimension of the input space, and the following parameters stand for the parameterization of the kernel.
+
+The following attributes are compulsory: ``self.D`` (the dimension, integer), ``self.name`` (name of the kernel, string), ``self.Nparam`` (number of parameters, integer). ::
+
+    def __init__(self,D,variance=1.,lengthscale=1.,power=1.):
+        assert D == 1, "For this kernel we assume D=1"
+        self.D = D
+        self.Nparam = 3
+        self.name = 'rat_quad'
+        self.variance = variance
+        self.lengthscale = lengthscale
+        self.power = power
+
+**_get_params(self)**
+
+The implementation of this function in mandatory.
+
+This function returns a one dimensional array of length ``self.Nparam`` containing the value of the parameters. ::
+
+    def _get_params(self):
+        return np.hstack((self.variance,self.lengthscale,self.power))
+
+**_set_params(self,x)**
+
+The implementation of this function in mandatory.
+
+The input is a one dimensional array of length ``self.Nparam`` containing the value of the parameters. The function has no output but it updates the values of the attribute associated to the parameters (such as ``self.variance``, ``self.lengthscale``, ...). ::
+
+    def _set_params(self,x):
+        self.variance = x[0]
+        self.lengthscale = x[1]
+        self.power = x[2]
+
+**_get_param_names(self)**
+
+The implementation of this function in mandatory.
+
+It returns a list of strings of length ``self.Nparam`` corresponding to the parameter names. ::
+
+    def _get_param_names(self):
+        return ['variance','lengthscale','power']
+
+**K(self,X,X2,target)**
+
+The implementation of this function in mandatory.
+
+This function is used to compute the covariance matrix associated with the inputs X, X2 (np.arrays with arbitrary number of line (say :math:`n_1`, :math:`n_2`) and ``self.D`` columns). This function does not returns anything but it adds the :math:`n_1 \times n_2` covariance matrix to the kernpart to the object ``target`` (a :math:`n_1 \times n_2` np.array). This trick allows to compute the covariance matrix of a kernel containing many kernparts with a limited memory use. ::
+
+    def K(self,X,X2,target):
+        if X2 is None: X2 = X
+        dist2 = np.square((X-X2.T)/self.lengthscale)
+        target += self.variance*(1 + dist2/2.)**(-self.power)
+
+**Kdiag(self,X,target)**
+
+The implementation of this function in mandatory.
+
+This function is similar to ``K`` but it computes only the values of the kernel on the diagonal. Thus, ``target`` is a 1-dimensional np.array of length :math:`n_1`. ::
+
+    def Kdiag(self,X,target):
+        target += self.variance    
+
+
+**dK_dtheta(self,dL_dK,X,X2,target)**
+
+This function is required for the optimization of the parameters.
+
+Computes the derivative of the likelihood. As previously, the values are added to the object target which is a 1-dimensional np.array of length ``self.Nparam``. For example, if the kernel is parameterized by :math:`\sigma^2,\ \theta`, then :math:`\frac{dL}{d\sigma^2} = \frac{dL}{d K} \frac{dK}{d\sigma^2}` is added to the first element of target and :math:`\frac{dL}{d\theta} = \frac{dL}{d K} \frac{dK}{d\theta}` to the second. ::
+
+    def dK_dtheta(self,dL_dK,X,X2,target):
+        if X2 is None: X2 = X
+        dist2 = np.square((X-X2.T)/self.lengthscale)
+
+        dvar = (1 + dist2/2.)**(-self.power)
+        dl = self.power * self.variance * dist2 * self.lengthscale**(-3) * (1 + dist2/2./self.power)**(-self.power-1)
+        dp = - self.variance * np.log(1 + dist2/2.) * (1 + dist2/2.)**(-self.power)
+
+        target[0] += np.sum(dvar*dL_dK)
+        target[1] += np.sum(dl*dL_dK)
+        target[2] += np.sum(dp*dL_dK)
+
+
+**dKdiag_dtheta(self,dL_dKdiag,X,target)**
+
+This function is required for BGPLVM, sparse models and uncertain inputs.
+
+As previously, target is an ``self.Nparam`` array and :math:`\frac{dL}{d Kdiag} \frac{dKdiag}{dparam}` is added to each element. ::
+
+    def dKdiag_dtheta(self,dL_dKdiag,X,target):
+        target[0] += np.sum(dL_dKdiag)
+        # here self.lengthscale and self.power have no influence on Kdiag so target[1:] are unchanged
+
+**dK_dX(self,dL_dK,X,X2,target)**
+
+This function is required for GPLVM, BGPLVM, sparse models and uncertain inputs.
+
+Computes the derivative of the likelihood with respect to the inputs ``X`` (a :math:`n \times D` np.array). The result is added to target which is a :math:`n \times D` np.array. ::
+
+    def dK_dX(self,dL_dK,X,X2,target):
+        """derivative of the covariance matrix with respect to X."""
+        if X2 is None: X2 = X
+        dist2 = np.square((X-X2.T)/self.lengthscale)
+
+        dX = -self.variance*self.power * (X-X2.T)/self.lengthscale**2 *  (1 + dist2/2./self.lengthscale)**(-self.power-1)
+        target += np.sum(dL_dK*dX,1)[:,np.newaxis]
+
+**dKdiag_dX(self,dL_dKdiag,X,target)**
+
+This function is required for BGPLVM, sparse models and uncertain inputs. As for ``dKdiag_dtheta``, :math:`\frac{dL}{d Kdiag} \frac{dKdiag}{dX}` is added to each element of target. ::
+
+    def dKdiag_dX(self,dL_dKdiag,X,target):
+        pass
+
+**Psi statistics**
+
+The psi statistics and their derivatives are required for BGPLVM and GPS with uncertain inputs.
+
+The expressions of the psi statistics are:
+
+TODO
+
+For the rational quadratic we have:
+
+TODO
+
+Update the constructor 
+======================
+
+Once the required functions have been implemented as a kernpart object, the file GPy/kern/constructors.py has to be updated to allow to build a kernel based on the kernpart object.
+
+The following line should be added in the preamble of the file::
+
+    from rational_quadratic import rational_quadratic as rational_quadratic_part
+
+as well as the following block ::
+
+    def rational_quadratic(D,variance=1., lengthscale=1., power=1.):
+        part = rational_quadraticpart(D,variance, lengthscale, power)
+        return kern(D, [part])
+
+
+Update initialization
+=====================
+
+The last step is to update the list of kernels imported from constructor in GPy/kern/__init__.py.
+
+
+
--- a/doc/tuto_interacting_with_models.rst
+++ b/doc/tuto_interacting_with_models.rst
@ -0,0 +1,217 @@
+*************************************
+Interacting with models
+*************************************
+
+The GPy model class has a set of features which are 
+designed to make it simple to explore the parameter 
+space of the model. By default, the scipy optimisers 
+are used to fit GPy models (via model.optimize()), 
+for which we provide mechanisms for 'free' optimisation: 
+GPy can ensure that naturally positive parameters 
+(such as variances) remain positive. But these mechanisms 
+are much more powerful than simple reparameterisation, 
+as we shall see. 
+
+Along this tutorial we'll use a sparse GP regression model 
+as example. This example can be in ``GPy.examples.regression``.  
+All of the examples included in GPy return an instance 
+of a model class, and therefore they can be called in 
+the following way: ::
+
+    import pylab as pb
+    pb.ion()
+    import GPy
+    m = GPy.examples.regression.sparse_GP_regression_1D()
+
+Examining the model using print
+===============================
+To see the current state of the model parameters, 
+and the model's (marginal) likelihood just print the model ::
+
+    print m
+
+The first thing displayed on the screen is the log-likelihood 
+value of the model with its current parameters. Below the 
+log-likelihood, a table with all the model's parameters 
+is shown. For each parameter, the table contains the name 
+of the parameter, the current value, and in case there are 
+defined: constraints, ties and prior distrbutions associated. ::
+
+	Log-likelihood: 6.309e+02
+
+	     Name        |   Value   |  Constraints  |  Ties  |  Prior  
+	------------------------------------------------------------------
+	    iip_0_0      |  -1.4671  |               |        |         
+	    iip_1_0      |  2.6378   |               |        |         
+	    iip_2_0      |  -0.0396  |               |        |         
+	    iip_3_0      |  -2.6372  |               |        |         
+	    iip_4_0      |  1.4704   |               |        |         
+	 rbf_variance    |  1.5672   |     (+ve)     |        |         
+	rbf_lengthscale  |  2.5625   |     (+ve)     |        |         
+	white_variance   |  0.0000   |     (+ve)     |        |         
+	noise_variance   |  0.0022   |     (+ve)     |        |         
+
+In this case the kernel parameters (``rbf_variance``, 
+``rbf_lengthscale`` and ``white_variance``) as well as 
+the noise parameter (``noise_variance``), are constrained 
+to be positive, while the inducing inputs have not 
+constraints associated. Also there are no ties or prior defined.
+
+Setting and fetching parameters by name
+=======================================
+Another way to interact with the model's parameters is through
+the functions ``_get_param_names()``, ``_get_params()`` and 
+``_set_params()``.
+
+``_get_param_names()`` returns a list of the parameters names ::
+
+	['iip_0_0',
+	 'iip_1_0',
+	 'iip_2_0',
+	 'iip_3_0',
+	 'iip_4_0',
+	 'rbf_variance',
+	 'rbf_lengthscale',
+	 'white_variance',
+	 'noise_variance']
+
+``_get_params()`` returns an array of the parameters values ::
+
+	array([ -1.46705227e+00,   2.63782176e+00,  -3.96422982e-02,
+		-2.63715255e+00,   1.47038653e+00,   1.56724596e+00,
+		 2.56248679e+00,   2.20963633e-10,   2.18379922e-03])
+
+``_set_params()`` takes an array as input and substitutes 
+the current values of the parameters for those of the array. For example,
+we can define a new array of values and change the parameters as follows: ::
+
+	new_params = np.array([1.,2.,3.,4.,1.,1.,1.,1.,1.])
+	m._set_params(new_params)
+
+If we call the function ``_get_params()`` again, we will obtain the new
+parameters we have just set.
+
+Parameters can be also set by name using the function ``_set()``. For example,
+lets change the lengthscale to .5: ::
+
+	m.set('rbf_lengthscale',.5)
+
+``_set()`` function accepts regular expression as it first
+input, and therefore all parameters matching that regular 
+expression are set to the given value. In this case rather 
+than passing as second output a single value, we can also 
+use a list of arrays. For example, lets change the inducing 
+inputs: ::
+
+	m.set('iip',np.arange(-4,0))
+
+Getting the model's likelihood and gradients
+===========================================
+Appart form the printing the model,  the marginal 
+log-likelihood can be obtained by using the function
+``log_likelihood()``. Also, the log-likelihood gradients
+wrt. each parameter can be obtained with the funcion
+``_log_likelihood_gradients()``. ::
+
+    m.log_likelihood()
+    -791.15371409346153
+
+    m._log_likelihood_gradients()
+    array([  7.08278455e-03,   1.37118783e+01,   2.66948031e+00,
+             3.50184014e+00,   7.08278455e-03,  -1.43501702e+02,
+	     6.10662266e+01,  -2.18472649e+02,   2.14663691e+02])
+
+Removing the model's constraints
+================================
+When we initially call the example, it was optimized and hence the
+log-likelihood gradients were close to zero. However, since
+we have been changing the parameters, the gradients are far from zero now.
+Next we are going to show how to optimize the model setting different 
+restrictions on the parameters. 
+
+Once a constrain has been set on a parameter, it is not possible to
+define a new constraint for it unless we explicitly remove the previous
+one. The command to remove the constraints is ``unconstrain()``, and
+just as the ``set()`` command, it also accepts regular expression.
+In this case we will remove all the constraints: ::
+
+	m.unconstrain('')
+
+Constraining and optimising the model
+=====================================
+A requisite needed for some parameters, such as variances,
+is to be positive. This is constraint is easily set 
+with the function ``constrain_positive()``. Regular expressions
+are also accepted. ::
+
+    m.constrain_positive('var')
+
+For convenience, GPy also provides a catch all function 
+which ensures that anything which appears to require 
+positivity is constrianed appropriately::
+
+    m.ensure_default_constraints()
+
+Fixing parameters
+=================
+Parameters values can be fixed using ``constrain_fixed()``. 
+For example we can define the first inducing input to be 
+fixed on zero: ::
+
+    m.constrain_fixed('iip_0',0)
+	
+Bounding parameters
+===================
+Defining bounding constraints is an easily task in GPy too,
+it only requires to use the function ``constrain_bounded()``.
+For example, lets bound inducing inputs 2 and 3 to have
+values between -4 and -1: ::
+
+    m.constrain_bounded('iip_(1|2)',-4,-1)
+
+Tying Parameters
+================
+The values of two or more parameters can be tied together,
+so that they share the same value during optimization.
+The function to do so is ``tie_params()``. For the example
+we are using, it doesn't make sense to tie parameters together,
+however for the sake of the example we will tie the white noise
+and the variance together. See `A kernel overview <tuto_kernel_overview.html>`_.
+for a proper use of the tying capabilities.::
+
+    m.tie_params('e_var')
+
+Optimizing the model
+====================
+Once we have finished defining the constraints, 
+we can now optimize the model with the function
+``optimize``.::
+
+    m.optimize()
+
+We can print again the model and check the new results.
+The table now shows that ``iip_0_0`` is fixed, ``iip_1_0`` 
+and ``iip_2_0`` are bounded and the kernel parameters are constrained to
+be positive. In addition the table now indicates that
+white_variance and noise_variance are tied together.::
+
+	Log-likelihood: 9.967e+01
+
+  	     Name        |   Value   |  Constraints  |  Ties  |  Prior  
+	------------------------------------------------------------------
+	    iip_0_0      |  0.0000   |     Fixed     |        |         
+	    iip_1_0      |  -2.8834  |   (-4, -1)    |        |         
+	    iip_2_0      |  -1.9152  |   (-4, -1)    |        |         
+	    iip_3_0      |  1.5034   |               |        |         
+	    iip_4_0      |  -1.0162  |               |        |         
+	 rbf_variance    |  0.0158   |     (+ve)     |        |         
+	rbf_lengthscale  |  0.9760   |     (+ve)     |        |         
+	white_variance   |  0.0049   |     (+ve)     |  (0)   |         
+	noise_variance   |  0.0049   |     (+ve)     |  (0)   |         
+
+
+Further Reading
+===============
+All of the mechansiams for dealing with parameters are baked right into GPy.core.model, from which all of the classes in GPy.models inherrit. To learn how to construct your own model, you might want to read ??link?? creating_new_models. 
+
+By deafult, GPy uses the tnc optimizer (from scipy.optimize.tnc). To use other optimisers, and to control the setting of those optimisers, as well as other funky features like automated restarts and diagnostics, you can read the optimization tutorial ??link??.
--- a/doc/tuto_kernel_overview.rst
+++ b/doc/tuto_kernel_overview.rst
@ -2,6 +2,7 @@
 ****************************
 tutorial : A kernel overview
 ****************************
+The aim of this tutorial is to give a better understanding of the kernel objects in GPy and to list the ones that are already implemented. The code shown in this tutorial can be obtained at GPy/examples/tutorials.py or by running ``GPy.examples.tutorials.tuto_kernel_overview()``.

 First we import the libraries we will need ::

@ -38,7 +39,7 @@ return::
 Implemented kernels
 ===================

-Many kernels are already implemented in GPy. Here is a summary of most of them:
+Many kernels are already implemented in GPy. The following figure gives a summary of most of them (a comprehensive list can be list can be found `here <kernel_implementation.html>`_):

 .. figure::  Figures/tuto_kern_overview_allkern.png
    :align:  center
@ -132,7 +133,7 @@ Various constrains can be applied to the parameters of a kernel
    * ``constrain_fixed`` to fix the value of a parameter (the value will not be modified during optimisation)
    * ``constrain_positive`` to make sure the parameter is greater than 0.
    * ``constrain_bounded`` to impose the parameter to be in a given range.
-    * ``tie_param`` to impose the value of two (or more) parameters to be equal.
+    * ``tie_params`` to impose the value of two (or more) parameters to be equal.

 When calling one of these functions, the parameters to constrain can either by specified by a regular expression that matches its name or by a number that corresponds to the rank of the parameter. Here is an example ::

@ -145,7 +146,7 @@ When calling one of these functions, the parameters to constrain can either by s

    k.constrain_positive('var')
    k.constrain_fixed(np.array([1]),1.75)
-    k.tie_param('len')
+    k.tie_params('len')
    k.unconstrain('white')
    k.constrain_bounded('white',lower=1e-5,upper=.5)
    print k
--- a/setup.py
+++ b/setup.py
@ -2,8 +2,7 @@
 # -*- coding: utf-8 -*-

 import os
-from numpy.distutils.core import Extension, setup
-#from sphinx.setup_command import BuildDoc
+from setuptools import setup

 # Version number
 version = '0.2'
@ -13,12 +12,12 @@ def read(fname):

 setup(name = 'GPy',
      version = version,
-      author = 'James Hensman, Nicolo Fusi, Ricardo Andrade, Nicolas Durrande, Alan Saul, Neil D. Lawrence',
+      author = read('AUTHORS.txt'),
      author_email = "james.hensman@gmail.com",
      description = ("The Gaussian Process Toolbox"),
      license = "BSD 3-clause",
      keywords = "machine-learning gaussian-processes kernels",
-      url = "http://ml.sheffield.ac.uk/GPy/",
+      url = "http://sheffieldml.github.com/GPy/",
      packages = ['GPy', 'GPy.core', 'GPy.kern', 'GPy.util', 'GPy.models', 'GPy.inference', 'GPy.examples', 'GPy.likelihoods'],
      package_dir={'GPy': 'GPy'},
      package_data = {'GPy': ['GPy/examples']},
@ -26,14 +25,12 @@ setup(name = 'GPy',
      long_description=read('README.md'),
      #ext_modules =  [Extension(name = 'GPy.kern.lfmUpsilonf2py',
      #          sources = ['GPy/kern/src/lfmUpsilonf2py.f90'])],
-      install_requires=['sympy', 'numpy>=1.6', 'scipy>=0.9','matplotlib>=1.1'],
+      install_requires=['sympy', 'numpy>=1.6', 'scipy>=0.9','matplotlib>=1.1', 'nose'],
      extras_require = {
        'docs':['Sphinx', 'ipython'],
      },
      #setup_requires=['sphinx'],
      #cmdclass = {'build_sphinx': BuildDoc},
      classifiers=[
-      "Development Status :: 1 - Alpha",
-      "Topic :: Machine Learning",
      "License :: OSI Approved :: BSD License"],
      )