diff --git a/GPy/core/model.py b/GPy/core/model.py index b6cedbaf..703e615d 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -121,9 +121,6 @@ class model(parameterised): else: raise AttributeError, "no parameter matches %s"%name - - - def log_prior(self): """evaluate the prior""" return np.sum([p.lnpdf(x) for p, x in zip(self.priors,self._get_params()) if p is not None]) @@ -135,12 +132,11 @@ class model(parameterised): [np.put(ret,i,p.lnpdf_grad(xx)) for i,(p,xx) in enumerate(zip(self.priors,x)) if not p is None] return ret - def _log_likelihood_gradients_transformed(self): + def _transform_gradients(self, g): """ - Use self.log_likelihood_gradients and self.prior_gradients to get the gradients of the model. - Adjust the gradient for constraints and ties, return. + Takes a list of gradients and returns an array of transformed gradients (positive/negative/tied, and so on). """ - g = self._log_likelihood_gradients() + self._log_prior_gradients() + x = self._get_params() g[self.constrained_positive_indices] = g[self.constrained_positive_indices]*x[self.constrained_positive_indices] g[self.constrained_negative_indices] = g[self.constrained_negative_indices]*x[self.constrained_negative_indices] @@ -152,6 +148,7 @@ class model(parameterised): else: return g + def randomize(self): """ Randomize the model. @@ -241,6 +238,27 @@ class model(parameterised): print "Warning! constraining %s postive"%name + def objective_function(self, x): + """ + The objective function passed to the optimizer. It combines the likelihood and the priors. + """ + self._set_params_transformed(x) + return -self.log_likelihood() - self.log_prior() + + def objective_function_gradients(self, x): + """ + Gets the transformed gradients from the likelihood and the priors. + """ + self._set_params_transformed(x) + LL_gradients = self._transform_gradients(self._log_likelihood_gradients()) + prior_gradients = self._transform_gradients(self._log_prior_gradients()) + return -LL_gradients - prior_gradients + + def objective_and_gradients(self, x): + obj_f = self.objective_function(x) + obj_grads = self.objective_function_gradients(x) + return obj_f, obj_grads + def optimize(self, optimizer=None, start=None, **kwargs): """ Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
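The three objective_* methods above give any optimizer a single entry point: each call first writes x back into the model (via _set_params_transformed) and then evaluates. As an illustration of the API only (not GPy code), here is how they could be driven directly, assuming m is an already-constructed GPy model; optimize() below does the same thing through GPy's own optimizer wrappers:

    from scipy import optimize

    x0 = m._get_params_transformed()    # current parameters, in the transformed space
    # fmin_tnc accepts a function returning (objective, gradient) and returns (x, nfeval, rc)
    x_opt, nfeval, rc = optimize.fmin_tnc(m.objective_and_gradients, x0)
    m._set_params_transformed(x_opt)    # write the optimum back into the model
    print m.log_likelihood()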
@@ -254,22 +272,12 @@ class model(parameterised): if optimizer is None: optimizer = self.preferred_optimizer - def f(x): - self._set_params_transformed(x) - return -self.log_likelihood()-self.log_prior() - def fp(x): - self._set_params_transformed(x) - return -self._log_likelihood_gradients_transformed() - def f_fp(x): - self._set_params_transformed(x) - return -self.log_likelihood()-self.log_prior(),-self._log_likelihood_gradients_transformed() - if start == None: start = self._get_params_transformed() optimizer = optimization.get_optimizer(optimizer) opt = optimizer(start, model = self, **kwargs) - opt.run(f_fp=f_fp, f=f, fp=fp) + opt.run(f_fp=self.objective_and_gradients, f=self.objective_function, fp=self.objective_function_gradients) self.optimization_runs.append(opt) self._set_params_transformed(opt.x_opt) @@ -357,12 +365,9 @@ class model(parameterised): dx = step*np.sign(np.random.uniform(-1,1,x.size)) #evaulate around the point x - self._set_params_transformed(x+dx) - f1,g1 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed() - self._set_params_transformed(x-dx) - f2,g2 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed() - self._set_params_transformed(x) - gradient = self._log_likelihood_gradients_transformed() + f1, g1 = self.objective_and_gradients(x+dx) + f2, g2 = self.objective_and_gradients(x-dx) + gradient = self.objective_function_gradients(x) numerical_gradient = (f1-f2)/(2*dx) global_ratio = (f1-f2)/(2*np.dot(dx,gradient)) @@ -398,14 +403,10 @@ class model(parameterised): for i in param_list: xx = x.copy() xx[i] += step - self._set_params_transformed(xx) - f1,g1 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed()[i] + f1, g1 = self.objective_and_gradients(xx) xx[i] -= 2.*step - self._set_params_transformed(xx) - f2,g2 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed()[i] - self._set_params_transformed(x) - gradient = self._log_likelihood_gradients_transformed()[i] - + f2, g2 = self.objective_and_gradients(xx) + gradient = self.objective_function_gradients(x)[i] numerical_gradient = (f1-f2)/(2*step) ratio = (f1-f2)/(2*step*gradient) diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py index 592299d8..031cc915 100644 --- a/GPy/examples/classification.py +++ b/GPy/examples/classification.py @@ -11,7 +11,7 @@ import GPy default_seed=10000 -def crescent_data(model_type='Full', inducing=10, seed=default_seed): #FIXME +def crescent_data(seed=default_seed): #FIXME """Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood. :param model_type: type of model to fit ['Full', 'FITC', 'DTC']. 
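The rewritten checkgrad above compares the analytic gradients against central differences of the objective. A self-contained sketch of that numerical scheme (generic code, independent of GPy):

    import numpy as np

    def numerical_gradient(f, x, step=1e-6):
        # central differences: df/dx_i ~ (f(x + h*e_i) - f(x - h*e_i)) / (2*h)
        g = np.zeros_like(x)
        for i in range(x.size):
            e = np.zeros_like(x)
            e[i] = step
            g[i] = (f(x + e) - f(x - e)) / (2. * step)
        return g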
@@ -31,11 +31,8 @@ def crescent_data(model_type='Full', inducing=10, seed=default_seed): #FIXME likelihood = GPy.likelihoods.EP(data['Y'],distribution) - if model_type=='Full': - m = GPy.models.GP(data['X'],likelihood,kernel) - else: - # create sparse GP EP model - m = GPy.models.sparse_GP_EP(data['X'],likelihood=likelihood,inducing=inducing,ep_proxy=model_type) + m = GPy.models.GP(data['X'],likelihood,kernel) + m.ensure_default_constraints() m.update_likelihood_approximation() print(m) @@ -94,16 +91,13 @@ def toy_linear_1d_classification(seed=default_seed): # Model definition m = GPy.models.GP(data['X'],likelihood=likelihood,kernel=kernel) + m.ensure_default_constraints() # Optimize - """ - EPEM runs a loop that consists of two steps: - 1) EP likelihood approximation: - m.update_likelihood_approximation() - 2) Parameters optimization: - m.optimize() - """ - m.EPEM() + m.update_likelihood_approximation() + # Parameter optimization: + m.optimize() + #m.EPEM() #FIXME # Plot pb.subplot(211) diff --git a/GPy/examples/oil_flow_demo.py b/GPy/examples/oil_flow_demo.py index 71fb1bd3..1e9f4f5a 100644 --- a/GPy/examples/oil_flow_demo.py +++ b/GPy/examples/oil_flow_demo.py @@ -41,7 +41,7 @@ m.constrain_positive('(rbf|bias|S|linear|white|noise)') # m.unconstrain('white') # m.constrain_bounded('white', 1e-6, 10.0) # plot_oil(m.X, np.array([1,1]), labels, 'PCA initialization') -m.optimize(messages = True) +#m.optimize(messages = True) # m.optimize('tnc', messages = True) # plot_oil(m.X, m.kern.parts[0].lengthscale, labels, 'B-GPLVM') # # pb.figure() diff --git a/GPy/examples/sparse_ep_fix.py b/GPy/examples/sparse_ep_fix.py index defcb4eb..acbd506c 100644 --- a/GPy/examples/sparse_ep_fix.py +++ b/GPy/examples/sparse_ep_fix.py @@ -10,51 +10,85 @@ import pylab as pb import numpy as np import GPy np.random.seed(2) -pb.ion() N = 500 M = 5 -pb.close('all') -###################################### -## 1 dimensional example +default_seed=10000 -# sample inputs and outputs -X = np.random.uniform(-3.,3.,(N,1)) -#Y = np.sin(X)+np.random.randn(N,1)*0.05 -F = np.sin(X)+np.random.randn(N,1)*0.05 -Y = np.ones([F.shape[0],1]) -Y[F<0] = -1 -likelihood = GPy.inference.likelihoods.probit(Y) +def crescent_data(inducing=10, seed=default_seed): + """Run a Gaussian process classification on the crescent data. The demonstration builds a sparse GP classification model and uses EP to approximate the likelihood. -# construct kernel -rbf = GPy.kern.rbf(1) -noise = GPy.kern.white(1) -kernel = rbf + noise + :param seed : seed value for data generation. + :type seed: int + :param inducing : number of inducing variables. + :type inducing: int + """ -# create simple GP model -#m = GPy.models.sparse_GP(X,Y=None, kernel=kernel, M=M,likelihood= likelihood) -# contrain all parameters to be positive -#m.constrain_fixed('prec',100.)
-m = GPy.models.sparse_GP(X, Y, kernel, M=M) -m.ensure_default_constraints() -#if not isinstance(m.likelihood,GPy.inference.likelihoods.gaussian): -# m.approximate_likelihood() -print m.checkgrad() -m.optimize('tnc', messages = 1) -m.plot(samples=3) -print m + # Kernel object + kernel = GPy.kern.rbf(data['X'].shape[1]) + GPy.kern.white(data['X'].shape[1]) -n = GPy.models.sparse_GP(X,Y=None, kernel=kernel, M=M,likelihood= likelihood) -n.ensure_default_constraints() -if not isinstance(n.likelihood,GPy.inference.likelihoods.gaussian): - n.approximate_likelihood() -print n.checkgrad() -pb.figure() -n.plot() + # Likelihood object + distribution = GPy.likelihoods.likelihood_functions.probit() + likelihood = GPy.likelihoods.EP(data['Y'],distribution) + + sample = np.random.randint(0,data['X'].shape[0],inducing) + Z = data['X'][sample,:] + #Z = (np.random.random_sample(2*inducing)*(data['X'].max()-data['X'].min())+data['X'].min()).reshape(inducing,-1) + + # create sparse GP EP model + m = GPy.models.sparse_GP(data['X'],likelihood=likelihood,kernel=kernel,Z=Z) + m.ensure_default_constraints() + + m.update_likelihood_approximation() + print(m) + + # optimize + m.optimize() + print(m) + + # plot + m.plot() + return m + + +def toy_linear_1d_classification(seed=default_seed): + """ + Simple 1D classification example + :param seed : seed value for data generation (default is 10000). + :type seed: int + """ + + data = GPy.util.datasets.toy_linear_1d_classification(seed=seed) + Y = data['Y'][:, 0:1] + Y[Y == -1] = 0 + + # Kernel object + kernel = GPy.kern.rbf(1) + + # Likelihood object + distribution = GPy.likelihoods.likelihood_functions.probit() + likelihood = GPy.likelihoods.EP(Y,distribution) + + Z = np.random.uniform(data['X'].min(),data['X'].max(),(10,1)) + + # Model definition + m = GPy.models.sparse_GP(data['X'],likelihood=likelihood,kernel=kernel,Z=Z) + + m.ensure_default_constraints() + # Optimize + m.update_likelihood_approximation() + # Parameter optimization: + m.optimize() + #m.EPEM() #FIXME + + # Plot + pb.subplot(211) + m.plot_f() + pb.subplot(212) + m.plot() + print(m) + + return m -""" -m = GPy.models.sparse_GP_regression(X, Y, kernel, M=M) -m.ensure_default_constraints() -print m.checkgrad() -""" diff --git a/GPy/examples/tuto_GP_regression.py b/GPy/examples/tuto_GP_regression.py new file mode 100644 index 00000000..b3953de0 --- /dev/null +++ b/GPy/examples/tuto_GP_regression.py @@ -0,0 +1,56 @@ +# The detailed explanations of the commands used in this file can be found in the tutorial section + +import pylab as pb +pb.ion() +import numpy as np +import GPy + +X = np.random.uniform(-3.,3.,(20,1)) +Y = np.sin(X) + np.random.randn(20,1)*0.05 + +kernel = GPy.kern.rbf(D=1, variance=1., lengthscale=1.) + +m = GPy.models.GP_regression(X,Y,kernel) + +print m +m.plot() + +m.constrain_positive('') + +m.unconstrain('') # Required to remove the previous constraints +m.constrain_positive('rbf_variance') +m.constrain_bounded('lengthscale',1.,10.
) +m.constrain_fixed('noise',0.0025) + +m.optimize() + +m.optimize_restarts(Nrestarts = 10) + +########################### +# 2-dimensional example # +########################### + +import pylab as pb +pb.ion() +import numpy as np +import GPy + +# sample inputs and outputs +X = np.random.uniform(-3.,3.,(50,2)) +Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(50,1)*0.05 + +# define kernel +ker = GPy.kern.Matern52(2,ARD=True) + GPy.kern.white(2) + +# create simple GP model +m = GPy.models.GP_regression(X,Y,ker) + +# constrain all parameters to be positive +m.constrain_positive('') + +# optimize and plot +pb.figure() +m.optimize('tnc', max_f_eval = 1000) + +m.plot() +print(m) diff --git a/GPy/examples/tuto_kernel_overview.py b/GPy/examples/tuto_kernel_overview.py new file mode 100644 index 00000000..ebd19d76 --- /dev/null +++ b/GPy/examples/tuto_kernel_overview.py @@ -0,0 +1,139 @@ +# The detailed explanations of the commands used in this file can be found in the tutorial section + +import pylab as pb +import numpy as np +import GPy +pb.ion() + +ker1 = GPy.kern.rbf(1) # Equivalent to ker1 = GPy.kern.rbf(D=1, variance=1., lengthscale=1.) +ker2 = GPy.kern.rbf(D=1, variance = .75, lengthscale=2.) +ker3 = GPy.kern.rbf(1, .5, .5) + +print ker2 +ker1.plot() +ker2.plot() +ker3.plot() + +k1 = GPy.kern.rbf(1,1.,2.) +k2 = GPy.kern.Matern32(1, 0.5, 0.2) + +# Product of kernels +k_prod = k1.prod(k2) +k_prodorth = k1.prod_orthogonal(k2) + +# Sum of kernels +k_add = k1.add(k2) +k_addorth = k1.add_orthogonal(k2) + +pb.figure(figsize=(8,8)) +pb.subplot(2,2,1) +k_prod.plot() +pb.title('prod') +pb.subplot(2,2,2) +k_prodorth.plot() +pb.title('prod_orthogonal') +pb.subplot(2,2,3) +k_add.plot() +pb.title('add') +pb.subplot(2,2,4) +k_addorth.plot() +pb.title('add_orthogonal') +pb.subplots_adjust(wspace=0.3, hspace=0.3) + +k1 = GPy.kern.rbf(1,1.,2) +k2 = GPy.kern.periodic_Matern52(1,variance=1e3, lengthscale=1, period = 1.5, lower=-5., upper = 5) + +k = k1 * k2 # equivalent to k = k1.prod(k2) +print k + +# Simulate sample paths +X = np.linspace(-5,5,501)[:,None] +Y = np.random.multivariate_normal(np.zeros(501),k.K(X),1) + +# plot +pb.figure(figsize=(10,4)) +pb.subplot(1,2,1) +k.plot() +pb.subplot(1,2,2) +pb.plot(X,Y.T) +pb.ylabel("Sample path") +pb.subplots_adjust(wspace=0.3) + +k = (k1+k2)*(k1+k2) +print k.parts[0].name, '\n', k.parts[1].name, '\n', k.parts[2].name, '\n', k.parts[3].name + +k1 = GPy.kern.rbf(1) +k2 = GPy.kern.Matern32(1) +k3 = GPy.kern.white(1) + +k = k1 + k2 + k3 +print k + +k.constrain_positive('var') +k.constrain_fixed(np.array([1]),1.75) +k.tie_param('len') +k.unconstrain('white') +k.constrain_bounded('white',lower=1e-5,upper=.5) +print k + +k_cst = GPy.kern.bias(1,variance=1.)
+k_mat = GPy.kern.Matern52(1,variance=1., lengthscale=3) +Kanova = (k_cst + k_mat).prod_orthogonal(k_cst + k_mat) +print Kanova + +# sample inputs and outputs +X = np.random.uniform(-3.,3.,(40,2)) +Y = 0.5*X[:,:1] + 0.5*X[:,1:] + 2*np.sin(X[:,:1]) * np.sin(X[:,1:]) + +# Create GP regression model +m = GPy.models.GP_regression(X,Y,Kanova) +pb.figure(figsize=(5,5)) +m.plot() + +pb.figure(figsize=(20,3)) +pb.subplots_adjust(wspace=0.5) +pb.subplot(1,5,1) +m.plot() +pb.subplot(1,5,2) +pb.ylabel("= ",rotation='horizontal',fontsize='30') +pb.subplot(1,5,3) +m.plot(which_functions=[False,True,False,False]) +pb.ylabel("cst +",rotation='horizontal',fontsize='30') +pb.subplot(1,5,4) +m.plot(which_functions=[False,False,True,False]) +pb.ylabel("+ ",rotation='horizontal',fontsize='30') +pb.subplot(1,5,5) +pb.ylabel("+ ",rotation='horizontal',fontsize='30') +m.plot(which_functions=[False,False,False,True]) + +import pylab as pb +import numpy as np +import GPy +pb.ion() + +ker1 = GPy.kern.rbf(D=1) # Equivalent to ker1 = GPy.kern.rbf(D=1, variance=1., lengthscale=1.) +ker2 = GPy.kern.rbf(D=1, variance = .75, lengthscale=3.) +ker3 = GPy.kern.rbf(1, .5, .25) + +ker1.plot() +ker2.plot() +ker3.plot() +#pb.savefig("Figures/tuto_kern_overview_basicdef.png") + +kernels = [GPy.kern.rbf(1), GPy.kern.exponential(1), GPy.kern.Matern32(1), GPy.kern.Matern52(1), GPy.kern.Brownian(1), GPy.kern.bias(1), GPy.kern.linear(1), GPy.kern.spline(1), GPy.kern.periodic_exponential(1), GPy.kern.periodic_Matern32(1), GPy.kern.periodic_Matern52(1), GPy.kern.white(1)] +kernel_names = ["GPy.kern.rbf", "GPy.kern.exponential", "GPy.kern.Matern32", "GPy.kern.Matern52", "GPy.kern.Brownian", "GPy.kern.bias", "GPy.kern.linear", "GPy.kern.spline", "GPy.kern.periodic_exponential", "GPy.kern.periodic_Matern32", "GPy.kern.periodic_Matern52", "GPy.kern.white"] + +pb.figure(figsize=(16,12)) +pb.subplots_adjust(wspace=.5, hspace=.5) +for i, kern in enumerate(kernels): + pb.subplot(3,4,i+1) + kern.plot(x=7.5,plot_limits=[0.00001,15.]) + pb.title(kernel_names[i]+ '\n') + +# actual plot for the noise +i = 11 +X = np.linspace(0.,15.,201) +WN = 0*X +WN[100] = 1. 
+pb.subplot(3,4,i+1) +pb.plot(X,WN,'b') diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index 625f6080..132fad41 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -2,5 +2,5 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) -from constructors import rbf, Matern32, Matern52, exponential, linear, white, bias, finite_dimensional, spline, Brownian, rbf_sympy, sympykern, periodic_exponential, periodic_Matern32, periodic_Matern52, product, product_orthogonal, symmetric, coregionalise +from constructors import rbf, Matern32, Matern52, exponential, linear, white, bias, finite_dimensional, spline, Brownian, rbf_sympy, sympykern, periodic_exponential, periodic_Matern32, periodic_Matern52, prod, prod_orthogonal, symmetric, coregionalise from kern import kern diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py index 9b58c282..b848821b 100644 --- a/GPy/kern/constructors.py +++ b/GPy/kern/constructors.py @@ -18,8 +18,8 @@ from Brownian import Brownian as Brownianpart from periodic_exponential import periodic_exponential as periodic_exponentialpart from periodic_Matern32 import periodic_Matern32 as periodic_Matern32part from periodic_Matern52 import periodic_Matern52 as periodic_Matern52part -from product import product as productpart -from product_orthogonal import product_orthogonal as product_orthogonalpart +from prod import prod as prodpart +from prod_orthogonal import prod_orthogonal as prod_orthogonalpart from symmetric import symmetric as symmetric_part from coregionalise import coregionalise as coregionalise_part #TODO these s=constructors are not as clean as we'd like. Tidy the code up @@ -245,7 +245,7 @@ def periodic_Matern52(D,variance=1., lengthscale=None, period=2*np.pi,n_freq=10, part = periodic_Matern52part(D,variance, lengthscale, period, n_freq, lower, upper) return kern(D, [part]) -def product(k1,k2): +def prod(k1,k2): """ Construct a product kernel over D from two kernels over D @@ -253,10 +253,10 @@ def product(k1,k2): :type k1, k2: kernpart :rtype: kernel object """ - part = productpart(k1,k2) + part = prodpart(k1,k2) return kern(k1.D, [part]) -def product_orthogonal(k1,k2): +def prod_orthogonal(k1,k2): """ Construct a product kernel over D1 x D2 from a kernel over D1 and another over D2. 
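The renamed constructors keep their old behaviour: prod multiplies two kernels on a shared input space, while prod_orthogonal multiplies kernels defined on separate blocks of input dimensions (D1 + D2). A short sketch using the kern-level methods exercised in the tutorial script above (the rbf/Matern32 choice is arbitrary):

    import GPy

    k1 = GPy.kern.rbf(1)
    k2 = GPy.kern.Matern32(1)
    k_same = k1.prod(k2)              # product on the same 1-d input space
    k_split = k1.prod_orthogonal(k2)  # product over D1 x D2, i.e. 2-d inputs
    print k_same
    print k_split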
@@ -264,7 +264,7 @@ :type k1, k2: kernpart :rtype: kernel object """ - part = product_orthogonalpart(k1,k2) + part = prod_orthogonalpart(k1,k2) return kern(k1.D+k2.D, [part]) def symmetric(k): diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py index c1f5eca9..f1a5bd45 100644 --- a/GPy/kern/kern.py +++ b/GPy/kern/kern.py @@ -7,8 +7,8 @@ import pylab as pb from ..core.parameterised import parameterised from kernpart import kernpart import itertools -from product_orthogonal import product_orthogonal -from product import product +from prod_orthogonal import prod_orthogonal +from prod import prod class kern(parameterised): def __init__(self,D,parts=[], input_slices=None): @@ -161,7 +161,7 @@ class kern(parameterised): K1 = self.copy() K2 = other.copy() - newkernparts = [product(k1,k2) for k1, k2 in itertools.product(K1.parts,K2.parts)] + newkernparts = [prod(k1,k2) for k1, k2 in itertools.product(K1.parts,K2.parts)] slices = [] for sl1, sl2 in itertools.product(K1.input_slices,K2.input_slices): @@ -183,7 +183,7 @@ class kern(parameterised): K1 = self.copy() K2 = other.copy() - newkernparts = [product_orthogonal(k1,k2) for k1, k2 in itertools.product(K1.parts,K2.parts)] + newkernparts = [prod_orthogonal(k1,k2) for k1, k2 in itertools.product(K1.parts,K2.parts)] slices = [] for sl1, sl2 in itertools.product(K1.input_slices,K2.input_slices): @@ -371,16 +371,17 @@ class kern(parameterised): def psi2(self,Z,mu,S,slices1=None,slices2=None): """ - :Z: np.ndarray of inducing inputs (M x Q) - : mu, S: np.ndarrays of means and variacnes (each N x Q) - :returns psi2: np.ndarray (N,M,M,Q) """ + :param Z: np.ndarray of inducing inputs (M x Q) + :param mu, S: np.ndarrays of means and variances (each N x Q) + :returns psi2: np.ndarray (N,M,M) + """ target = np.zeros((mu.shape[0],Z.shape[0],Z.shape[0])) slices1, slices2 = self._process_slices(slices1,slices2) [p.psi2(Z[s2,i_s],mu[s1,i_s],S[s1,i_s],target[s1,s2,s2]) for p,i_s,s1,s2 in zip(self.parts,self.input_slices,slices1,slices2)] #compute the "cross" terms for p1, p2 in itertools.combinations(self.parts,2): - #white doesn;t compine with anything + #white doesn't combine with anything if p1.name=='white' or p2.name=='white': pass #rbf X bias @@ -396,25 +397,7 @@ class kern(parameterised): else: raise NotImplementedError, "psi2 cannot be computed for this kernel" - - - - - # "crossterms". Here we are recomputing psi1 for white (we don't need to), but it's - # not really expensive, since it's just a matrix of zeroes.
- # psi1_matrices = [np.zeros((mu.shape[0], Z.shape[0])) for p in self.parts] - # [p.psi1(Z[s2],mu[s1],S[s1],psi1_target[s1,s2]) for p,s1,s2,psi1_target in zip(self.parts,slices1,slices2, psi1_matrices)] - - crossterms = 0.0 - # for 3 kernels this returns something like - # [(0,1), (0,2), (1,2)] - # in theory, we should also account for (1,0), (2,0) and so on, but - # the transpose deals exactly with that - # for a,b in itertools.combinations(psi1_matrices, 2): - # tmp = np.multiply(a,b) - # crossterms += tmp[:,None,:] + tmp[:, :,None] - - return target + crossterms + return target def dpsi2_dtheta(self,dL_dpsi2,partial1,Z,mu,S,slices1=None,slices2=None): """Returns shape (N,M,M,Ntheta)""" @@ -429,7 +412,7 @@ class kern(parameterised): ipsl1, ipsl2 = self.input_slices[i1], self.input_slices[i2] ps1, ps2 = self.param_slices[i1], self.param_slices[i2] - #white doesn;t compine with anything + #white doesn't combine with anything if p1.name=='white' or p2.name=='white': pass #rbf X bias @@ -447,26 +430,6 @@ else: raise NotImplementedError, "psi2 cannot be computed for this kernel" - # # "crossterms" - # # 1. get all the psi1 statistics - # psi1_matrices = [np.zeros((mu.shape[0], Z.shape[0])) for p in self.parts] - # [p.psi1(Z[s2],mu[s1],S[s1],psi1_target[s1,s2]) for p,s1,s2,psi1_target in zip(self.parts,slices1,slices2, psi1_matrices)] - - # partial1 = np.ones_like(partial1) - # # 2. get all the dpsi1/dtheta gradients - # psi1_gradients = [np.zeros(self.Nparam) for p in self.parts] - # [p.dpsi1_dtheta(partial1[s2,s1],Z[s2,i_s],mu[s1,i_s],S[s1,i_s],psi1g_target[ps]) for p,ps,s1,s2,i_s,psi1g_target in zip(self.parts, self.param_slices,slices1,slices2,self.input_slices,psi1_gradients)] - - - # # 3. multiply them somehow - # for a,b in itertools.combinations(range(len(psi1_matrices)), 2): - - # tmp = (psi1_gradients[a][None, None] * psi1_matrices[b][:,:, None]) - # # target += (tmp[None] + tmp[:,None]).sum(0).sum(0).sum(0) - # # gne = (psi1_gradients[a].sum()*psi1_matrices[b].sum()) - # # target += gne - # #target += (gne[None] + gne[:, None]).sum(0) - # target += (partial.sum(0)[:,:,None] * (tmp[:, None] + tmp[:,:,None]).sum(0)).sum(0).sum(0) return self._transform_gradients(target) def dpsi2_dZ(self,dL_dpsi2,Z,mu,S,slices1=None,slices2=None): @@ -475,16 +438,15 @@ [p.dpsi2_dZ(dL_dpsi2[s1,s2,s2],Z[s2,i_s],mu[s1,i_s],S[s1,i_s],target[s2,i_s]) for p,i_s,s1,s2 in zip(self.parts,self.input_slices,slices1,slices2)] #compute the "cross" terms - #TODO: slices (need to iterate around the input slices also...) for p1, p2 in itertools.combinations(self.parts,2): - #white doesn;t compine with anything + #white doesn't combine with anything if p1.name=='white' or p2.name=='white': pass #rbf X bias elif p1.name=='bias' and p2.name=='rbf': - target += p2.dpsi1_dX(dL_dpsi2.sum(1)*p1.variance,Z,mu,S) + p2.dpsi1_dX(dL_dpsi2.sum(1)*p1.variance,Z,mu,S,target) elif p2.name=='bias' and p1.name=='rbf': - target += p1.dpsi1_dZ(dL_dpsi2.sum(2)*p2.variance,Z,mu,S) + p1.dpsi1_dZ(dL_dpsi2.sum(2)*p2.variance,Z,mu,S,target) #rbf X linear elif p1.name=='linear' and p2.name=='rbf': raise NotImplementedError #TODO elif p2.name=='linear' and p1.name=='rbf': raise NotImplementedError #TODO @@ -502,7 +464,24 @@ class kern(parameterised): target_mu, target_S = np.zeros((2,mu.shape[0],mu.shape[1])) [p.dpsi2_dmuS(dL_dpsi2[s1,s2,s2],Z[s2,i_s],mu[s1,i_s],S[s1,i_s],target_mu[s1,i_s],target_S[s1,i_s]) for p,i_s,s1,s2 in zip(self.parts,self.input_slices,slices1,slices2)] - #TODO: there are some extra terms to compute here!
+ #compute the "cross" terms + for p1, p2 in itertools.combinations(self.parts,2): + #white doesn;t combine with anything + if p1.name=='white' or p2.name=='white': + pass + #rbf X bias + elif p1.name=='bias' and p2.name=='rbf': + target += p2.dpsi1_dmuS(partial.sum(1)*p1.variance,Z,mu,S,target_mu,target_S) + elif p2.name=='bias' and p1.name=='rbf': + target += p1.dpsi1_dmuS(partial.sum(2)*p2.variance,Z,mu,S,target_mu,target_S) + #rbf X linear + elif p1.name=='linear' and p2.name=='rbf': + raise NotImplementedError #TODO + elif p2.name=='linear' and p1.name=='rbf': + raise NotImplementedError #TODO + else: + raise NotImplementedError, "psi2 cannot be computed for this kernel" + return target_mu, target_S def plot(self, x = None, plot_limits=None,which_functions='all',resolution=None,*args,**kwargs): diff --git a/GPy/kern/product.py b/GPy/kern/prod.py similarity index 99% rename from GPy/kern/product.py rename to GPy/kern/prod.py index 3bad51c1..6a59c220 100644 --- a/GPy/kern/product.py +++ b/GPy/kern/prod.py @@ -6,7 +6,7 @@ import numpy as np import hashlib #from scipy import integrate # This may not be necessary (Nicolas, 20th Feb) -class product(kernpart): +class prod(kernpart): """ Computes the product of 2 kernels that are defined on the same space diff --git a/GPy/kern/product_orthogonal.py b/GPy/kern/prod_orthogonal.py similarity index 99% rename from GPy/kern/product_orthogonal.py rename to GPy/kern/prod_orthogonal.py index b0112199..fc349da8 100644 --- a/GPy/kern/product_orthogonal.py +++ b/GPy/kern/prod_orthogonal.py @@ -6,7 +6,7 @@ import numpy as np import hashlib #from scipy import integrate # This may not be necessary (Nicolas, 20th Feb) -class product_orthogonal(kernpart): +class prod_orthogonal(kernpart): """ Computes the product of 2 kernels diff --git a/GPy/likelihoods/EP.py b/GPy/likelihoods/EP.py index 30b21d9b..f49ed275 100644 --- a/GPy/likelihoods/EP.py +++ b/GPy/likelihoods/EP.py @@ -17,7 +17,7 @@ class EP(likelihood): self.epsilon = epsilon self.eta, self.delta = power_ep self.data = data - self.N = self.data.size + self.N, self.D = self.data.shape self.is_heteroscedastic = True self.Nparams = 0 @@ -29,7 +29,7 @@ class EP(likelihood): #initial values for the GP variables self.Y = np.zeros((self.N,1)) self.covariance_matrix = np.eye(self.N) - self.precision = np.ones(self.N) + self.precision = np.ones(self.N)[:,None] self.Z = 0 self.YYT = None @@ -54,18 +54,14 @@ class EP(likelihood): self.Y = mu_tilde[:,None] self.YYT = np.dot(self.Y,self.Y.T) - self.precision = self.tau_tilde - self.covariance_matrix = np.diag(1./self.precision) + self.covariance_matrix = np.diag(1./self.tau_tilde) + self.precision = self.tau_tilde[:,None] def fit_full(self,K): """ The expectation-propagation algorithm. For nomenclature see Rasmussen & Williams 2006. """ - #Prior distribution parameters: p(f|X) = N(f|0,K) - - self.tau_tilde = np.zeros(self.N) - self.v_tilde = np.zeros(self.N) #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma) mu = np.zeros(self.N) Sigma = K.copy() @@ -124,13 +120,14 @@ class EP(likelihood): return self._compute_GP_variables() - def fit_DTC(self, Knn_diag, Kmn, Kmm): + #def fit_DTC(self, Knn_diag, Kmn, Kmm): + def fit_DTC(self, Kmm, Kmn): """ The expectation-propagation algorithm with sparse pseudo-input. For nomenclature see ... 2013. """ - #TODO: this doesn;t work with uncertain inputs! + #TODO: this doesn't work with uncertain inputs! 
""" Prior approximation parameters: @@ -158,12 +155,12 @@ class EP(likelihood): sigma_ = 1./tau_ mu_ = v_/tau_ """ - tau_ = np.empty(self.N,dtype=float) - v_ = np.empty(self.N,dtype=float) + self.tau_ = np.empty(self.N,dtype=float) + self.v_ = np.empty(self.N,dtype=float) #Initial values - Marginal moments z = np.empty(self.N,dtype=float) - Z_hat = np.empty(self.N,dtype=float) + self.Z_hat = np.empty(self.N,dtype=float) phi = np.empty(self.N,dtype=float) mu_hat = np.empty(self.N,dtype=float) sigma2_hat = np.empty(self.N,dtype=float) @@ -172,21 +169,21 @@ class EP(likelihood): epsilon_np1 = 1 epsilon_np2 = 1 self.iterations = 0 - np1 = [tau_tilde.copy()] - np2 = [v_tilde.copy()] + np1 = [self.tau_tilde.copy()] + np2 = [self.v_tilde.copy()] while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: update_order = np.random.permutation(self.N) for i in update_order: #Cavity distribution parameters - tau_[i] = 1./Sigma_diag[i] - self.eta*tau_tilde[i] - v_[i] = mu[i]/Sigma_diag[i] - self.eta*v_tilde[i] + self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i] + self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i] #Marginal moments - Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(self.data[i],tau_[i],v_[i]) + self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(self.data[i],self.tau_[i],self.v_[i]) #Site parameters update - Delta_tau = delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i]) + Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i]) Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i]) - tau_tilde[i] = tau_tilde[i] + Delta_tau - v_tilde[i] = v_tilde[i] + Delta_v + self.tau_tilde[i] = self.tau_tilde[i] + Delta_tau + self.v_tilde[i] = self.v_tilde[i] + Delta_v #Posterior distribution parameters update LLT = LLT + np.outer(Kmn[:,i],Kmn[:,i])*Delta_tau L = jitchol(LLT) @@ -196,25 +193,26 @@ class EP(likelihood): mu = mu + (Delta_v-Delta_tau*mu[i])*si self.iterations += 1 #Sigma recomputation with Cholesky decompositon - LLT0 = LLT0 + np.dot(Kmn*tau_tilde[None,:],Kmn.T) + LLT0 = LLT0 + np.dot(Kmn*self.tau_tilde[None,:],Kmn.T) L = jitchol(LLT) V,info = linalg.lapack.flapack.dtrtrs(L,Kmn,lower=1) V2,info = linalg.lapack.flapack.dtrtrs(L.T,V,lower=0) Sigma_diag = np.sum(V*V,-2) - Knmv_tilde = np.dot(Kmn,v_tilde) + Knmv_tilde = np.dot(Kmn,self.v_tilde) mu = np.dot(V2.T,Knmv_tilde) - epsilon_np1 = sum((tau_tilde-np1[-1])**2)/self.N - epsilon_np2 = sum((v_tilde-np2[-1])**2)/self.N - np1.append(tau_tilde.copy()) - np2.append(v_tilde.copy()) + epsilon_np1 = sum((self.tau_tilde-np1[-1])**2)/self.N + epsilon_np2 = sum((self.v_tilde-np2[-1])**2)/self.N + np1.append(self.tau_tilde.copy()) + np2.append(self.v_tilde.copy()) self._compute_GP_variables() - def fit_FITC(self, Knn_diag, Kmn): + def fit_FITC(self, Kmm, Kmn, Knn_diag): """ The expectation-propagation algorithm with sparse pseudo-input. For nomenclature see Naish-Guzman and Holden, 2008. 
""" + M = Kmm.shape[0] """ Prior approximation parameters: @@ -235,7 +233,7 @@ class EP(likelihood): mu = w + P*gamma """ self.w = np.zeros(self.N) - self.gamma = np.zeros(self.M) + self.gamma = np.zeros(M) mu = np.zeros(self.N) P = P0.copy() R = R0.copy() @@ -271,7 +269,7 @@ class EP(likelihood): self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i] self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i] #Marginal moments - self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(data[i],self.tau_[i],self.v_[i]) + self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(self.data[i],self.tau_[i],self.v_[i]) #Site parameters update Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i]) Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i]) @@ -281,10 +279,10 @@ class EP(likelihood): dtd1 = Delta_tau*Diag[i] + 1. dii = Diag[i] Diag[i] = dii - (Delta_tau * dii**2.)/dtd1 - pi_ = P[i,:].reshape(1,self.M) + pi_ = P[i,:].reshape(1,M) P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_ Rp_i = np.dot(R,pi_.T) - RTR = np.dot(R.T,np.dot(np.eye(self.M) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R)) + RTR = np.dot(R.T,np.dot(np.eye(M) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R)) R = jitchol(RTR).T self.w[i] = self.w[i] + (Delta_v - Delta_tau*self.w[i])*dii/dtd1 self.gamma = self.gamma + (Delta_v - Delta_tau*mu[i])*np.dot(RTR,P[i,:].T) @@ -296,7 +294,7 @@ class EP(likelihood): Diag = Diag0/(1.+ Diag0 * self.tau_tilde) P = (Diag / Diag0)[:,None] * P0 RPT0 = np.dot(R0,P0.T) - L = jitchol(np.eye(self.M) + np.dot(RPT0,(1./Diag0 - Diag/(Diag0**2))[:,None]*RPT0.T)) + L = jitchol(np.eye(M) + np.dot(RPT0,(1./Diag0 - Diag/(Diag0**2))[:,None]*RPT0.T)) R,info = linalg.lapack.flapack.dtrtrs(L,R0,lower=1) RPT = np.dot(R,P.T) Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1) diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 23881899..3e2a0361 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -37,8 +37,8 @@ class probit(likelihood_function): :param tau_i: precision of the cavity distribution (float) :param v_i: mean/variance of the cavity distribution (float) """ - # TODO: some version of assert np.sum(np.abs(Y)-1) == 0, "Output values must be either -1 or 1" - if data_i == 0: data_i = -1 #NOTE Binary classification works better classes {-1,1}, 1D-plotting works better with classes {0,1}. + if data_i == 0: data_i = -1 #NOTE Binary classification algorithm works better with classes {-1,1}, 1D-plotting works better with classes {0,1}. + # TODO: some version of assert z = data_i*v_i/np.sqrt(tau_i**2 + tau_i) Z_hat = stats.norm.cdf(z) phi = stats.norm.pdf(z) diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 1d985c33..5879a2bf 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -269,6 +269,8 @@ class GP(model): if hasattr(self,'Z'): Zu = self.Z*self._Xstd + self._Xmean pb.plot(Zu,Zu*0+pb.ylim()[0],'r|',mew=1.5,markersize=12) + if self.has_uncertain_inputs: + pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten())) elif self.X.shape[1]==2: #FIXME resolution = resolution or 50 @@ -281,5 +283,8 @@ class GP(model): pb.scatter(self.X[:,0], self.X[:,1], 40, Yf, cmap=pb.cm.jet,vmin=m.min(),vmax=m.max(), linewidth=0.) 
pb.xlim(xmin[0],xmax[0]) pb.ylim(xmin[1],xmax[1]) + if hasattr(self,'Z'): + pb.plot(self.Z[:,0],self.Z[:,1],'wo') + else: raise NotImplementedError, "Cannot define a frame with more than two input dimensions" diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py index 6932154d..ff00faea 100644 --- a/GPy/models/sparse_GP.py +++ b/GPy/models/sparse_GP.py @@ -72,7 +72,7 @@ class sparse_GP(GP): self.psi2 = None def _computations(self): - # TODO find routine to multiply triangular matrices + #TODO: find routine to multiply triangular matrices #TODO: slices for psi statistics (easy enough) sf = self.scale_factor @@ -82,9 +82,9 @@ if self.likelihood.is_heteroscedastic: assert self.likelihood.D == 1 #TODO: what is the likelihood is heterscedatic and there are multiple independent outputs? if self.has_uncertain_inputs: - self.psi2_beta_scaled = (self.psi2*(self.likelihood.precision.reshape(self.N,1,1)/sf2)).sum(0) + self.psi2_beta_scaled = (self.psi2*(self.likelihood.precision.flatten().reshape(self.N,1,1)/sf2)).sum(0) else: - tmp = self.psi1.T*(np.sqrt(self.likelihood.precision.reshape(1,self.N))/sf) + tmp = self.psi1*(np.sqrt(self.likelihood.precision.flatten().reshape(1,self.N))/sf) self.psi2_beta_scaled = np.dot(tmp,tmp.T) else: if self.has_uncertain_inputs: @@ -106,15 +106,19 @@ self.C = mdot(self.Lmi.T, self.Bi, self.Lmi) self.E = mdot(self.C, self.psi1VVpsi1/sf2, self.C.T) - # Compute dL_dpsi # FIXME: this is untested for the het. case - self.dL_dpsi0 = - 0.5 * self.D * self.likelihood.precision * np.ones(self.N) + # Compute dL_dpsi # FIXME: this is untested for the heteroscedastic + uncertain inputs case + self.dL_dpsi0 = - 0.5 * self.D * (self.likelihood.precision * np.ones([self.N,1])).flatten() self.dL_dpsi1 = mdot(self.V, self.psi1V.T,self.C).T if self.likelihood.is_heteroscedastic: - self.dL_dpsi2 = 0.5 * self.likelihood.precision[:,None,None] * self.D * self.Kmmi[None,:,:] # dB - self.dL_dpsi2 += - 0.5 * self.likelihood.precision[:,None,None]/sf2 * self.D * self.C[None,:,:] # dC - self.dL_dpsi2 += - 0.5 * self.likelihood.precision[:,None,None]* self.E[None,:,:] # dD - if not self.has_uncertain_inputs: - raise NotImplementedError, "TODO: recaste derivatibes in psi2 back into psi1" + if self.has_uncertain_inputs: + self.dL_dpsi2 = 0.5 * self.likelihood.precision[:,None,None] * self.D * self.Kmmi[None,:,:] # dB + self.dL_dpsi2 += - 0.5 * self.likelihood.precision[:,None,None]/sf2 * self.D * self.C[None,:,:] # dC + self.dL_dpsi2 += - 0.5 * self.likelihood.precision[:,None,None]* self.E[None,:,:] # dD + else: + self.dL_dpsi1 += mdot(self.Kmmi,self.psi1*self.likelihood.precision.flatten().reshape(1,self.N)) #dB + self.dL_dpsi1 += -mdot(self.C,self.psi1*self.likelihood.precision.flatten().reshape(1,self.N)/sf2) #dC + self.dL_dpsi1 += -mdot(self.E,self.psi1*self.likelihood.precision.flatten().reshape(1,self.N)) #dD + self.dL_dpsi2 = None else: self.dL_dpsi2 = 0.5 * self.likelihood.precision * self.D * self.Kmmi # dB @@ -166,14 +170,29 @@ def _get_param_names(self): return sum([['iip_%i_%i'%(i,j) for j in range(self.Z.shape[1])] for i in range(self.Z.shape[0])],[]) + GP._get_param_names(self) + def update_likelihood_approximation(self): + """ + Approximates a non-Gaussian likelihood using Expectation Propagation + + For a Gaussian (or direct: TODO) likelihood, no iteration is required: + this function does nothing + """ + if self.has_uncertain_inputs: + raise NotImplementedError, "EP approximation not implemented for
uncertain inputs" + else: + self.likelihood.fit_DTC(self.Kmm,self.psi1) + #self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0) + self._set_params(self._get_params()) # update the GP + def log_likelihood(self): """ Compute the (lower bound on the) log marginal likelihood """ sf2 = self.scale_factor**2 if self.likelihood.is_heteroscedastic: A = -0.5*self.N*self.D*np.log(2.*np.pi) +0.5*np.sum(np.log(self.likelihood.precision)) -0.5*np.sum(self.V*self.likelihood.Y) + B = -0.5*self.D*(np.sum(self.likelihood.precision.flatten()*self.psi0) - np.trace(self.A)*sf2) else: A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.likelihood.precision)) -0.5*self.likelihood.precision*self.likelihood.trYYT - B = -0.5*self.D*(np.sum(self.likelihood.precision*self.psi0) - np.trace(self.A)*sf2) + B = -0.5*self.D*(np.sum(self.likelihood.precision*self.psi0) - np.trace(self.A)*sf2) C = -0.5*self.D * (self.B_logdet + self.M*np.log(sf2)) D = +0.5*np.sum(self.psi1VVpsi1 * self.C) return A+B+C+D @@ -221,14 +240,3 @@ class sparse_GP(GP): var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0) return mu,var[:,None] - - def plot(self, *args, **kwargs): - """ - Plot the fitted model: just call the GP plot function and then add inducing inputs - """ - GP.plot(self,*args,**kwargs) - if self.Q==1: - if self.has_uncertain_inputs: - pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten())) - if self.Q==2: - pb.plot(self.Z[:,0],self.Z[:,1],'wo') diff --git a/GPy/models/sparse_GPLVM.py b/GPy/models/sparse_GPLVM.py index fe7c1c43..542fbe0e 100644 --- a/GPy/models/sparse_GPLVM.py +++ b/GPy/models/sparse_GPLVM.py @@ -43,7 +43,7 @@ class sparse_GPLVM(sparse_GP_regression, GPLVM): def dL_dX(self): dL_dX = self.kern.dKdiag_dX(self.dL_dpsi0,self.X) - dL_dX += self.kern.dK_dX(self.dL_dpsi1,self.X,self.Z) + dL_dX += self.kern.dK_dX(self.dL_dpsi1.T,self.X,self.Z) return dL_dX diff --git a/GPy/testing/gplvm_tests.py b/GPy/testing/gplvm_tests.py new file mode 100644 index 00000000..51828768 --- /dev/null +++ b/GPy/testing/gplvm_tests.py @@ -0,0 +1,47 @@ +# Copyright (c) 2012, Nicolo Fusi +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import unittest +import numpy as np +import GPy + +class GPLVMTests(unittest.TestCase): + def test_bias_kern(self): + N, M, Q, D = 10, 3, 2, 4 + X = np.random.rand(N, Q) + k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001) + K = k.K(X) + Y = np.random.multivariate_normal(np.zeros(N),K,D).T + k = GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001) + m = GPy.models.GPLVM(Y, Q, kernel = k) + m.ensure_default_constraints() + m.randomize() + self.assertTrue(m.checkgrad()) + + def test_linear_kern(self): + N, M, Q, D = 10, 3, 2, 4 + X = np.random.rand(N, Q) + k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001) + K = k.K(X) + Y = np.random.multivariate_normal(np.zeros(N),K,D).T + k = GPy.kern.linear(Q) + GPy.kern.white(Q, 0.00001) + m = GPy.models.GPLVM(Y, Q, kernel = k) + m.ensure_default_constraints() + m.randomize() + self.assertTrue(m.checkgrad()) + + def test_rbf_kern(self): + N, M, Q, D = 10, 3, 2, 4 + X = np.random.rand(N, Q) + k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001) + K = k.K(X) + Y = np.random.multivariate_normal(np.zeros(N),K,D).T + k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001) + m = GPy.models.GPLVM(Y, Q, kernel = k) + m.ensure_default_constraints() + m.randomize() + self.assertTrue(m.checkgrad()) + +if __name__ == "__main__": + print "Running unit tests, please be (very) patient..." 
+ unittest.main() diff --git a/GPy/testing/sparse_gplvm_tests.py b/GPy/testing/sparse_gplvm_tests.py new file mode 100644 index 00000000..35fa4fcf --- /dev/null +++ b/GPy/testing/sparse_gplvm_tests.py @@ -0,0 +1,48 @@ +# Copyright (c) 2012, Nicolo Fusi, James Hensman +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import unittest +import numpy as np +import GPy + +class sparse_GPLVMTests(unittest.TestCase): + def test_bias_kern(self): + N, M, Q, D = 10, 3, 2, 4 + X = np.random.rand(N, Q) + k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001) + K = k.K(X) + Y = np.random.multivariate_normal(np.zeros(N),K,D).T + k = GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001) + m = GPy.models.sparse_GPLVM(Y, Q, kernel = k, M=M) + m.ensure_default_constraints() + m.randomize() + self.assertTrue(m.checkgrad()) + + @unittest.skip('linear kernels do not have dKdiag_dX') + def test_linear_kern(self): + N, M, Q, D = 10, 3, 2, 4 + X = np.random.rand(N, Q) + k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001) + K = k.K(X) + Y = np.random.multivariate_normal(np.zeros(N),K,D).T + k = GPy.kern.linear(Q) + GPy.kern.white(Q, 0.00001) + m = GPy.models.sparse_GPLVM(Y, Q, kernel = k, M=M) + m.ensure_default_constraints() + m.randomize() + self.assertTrue(m.checkgrad()) + + def test_rbf_kern(self): + N, M, Q, D = 10, 3, 2, 4 + X = np.random.rand(N, Q) + k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001) + K = k.K(X) + Y = np.random.multivariate_normal(np.zeros(N),K,D).T + k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001) + m = GPy.models.sparse_GPLVM(Y, Q, kernel = k, M=M) + m.ensure_default_constraints() + m.randomize() + self.assertTrue(m.checkgrad()) + +if __name__ == "__main__": + print "Running unit tests, please be (very) patient..." + unittest.main() diff --git a/GPy/testing/unit_tests.py b/GPy/testing/unit_tests.py index 61fb15bb..5ec1d766 100644 --- a/GPy/testing/unit_tests.py +++ b/GPy/testing/unit_tests.py @@ -157,13 +157,28 @@ class GradientTests(unittest.TestCase): def test_GP_EP_probit(self): N = 20 X = np.hstack([np.random.normal(5,2,N/2),np.random.normal(10,2,N/2)])[:,None] - Y = np.hstack([np.ones(N/2),np.repeat(-1,N/2)])[:,None] + Y = np.hstack([np.ones(N/2),np.zeros(N/2)])[:,None] kernel = GPy.kern.rbf(1) distribution = GPy.likelihoods.likelihood_functions.probit() likelihood = GPy.likelihoods.EP(Y, distribution) m = GPy.models.GP(X, likelihood, kernel) m.ensure_default_constraints() - self.assertTrue(m.EPEM) + m.update_likelihood_approximation() + self.assertTrue(m.checkgrad()) + #self.assertTrue(m.EPEM) + + def test_sparse_EP_DTC_probit(self): + N = 20 + X = np.hstack([np.random.normal(5,2,N/2),np.random.normal(10,2,N/2)])[:,None] + Y = np.hstack([np.ones(N/2),np.zeros(N/2)])[:,None] + Z = np.linspace(0,15,4)[:,None] + kernel = GPy.kern.rbf(1) + distribution = GPy.likelihoods.likelihood_functions.probit() + likelihood = GPy.likelihoods.EP(Y, distribution) + m = GPy.models.sparse_GP(X, likelihood, kernel,Z) + m.ensure_default_constraints() + m.update_likelihood_approximation() + self.assertTrue(m.checkgrad()) @unittest.skip("FITC will be broken for a while") def test_generalized_FITC(self): diff --git a/doc/kernel_implementation.rst b/doc/kernel_implementation.rst new file mode 100644 index 00000000..57b37c8e --- /dev/null +++ b/doc/kernel_implementation.rst @@ -0,0 +1,15 @@ + +*************************** +List of implemented kernels +*************************** + +The :math:`\checkmark` symbol represents the functions that have been implemented for each kernel. + +..
|tick| image:: tick.png + + +====== =========== === ======= =========== =============== ======= =========== ====== ====== ======= + NAME get/set K Kdiag dK_dtheta dKdiag_dtheta dK_dX dKdiag_dX psi0 psi1 psi2 +====== =========== === ======= =========== =============== ======= =========== ====== ====== ======= +rbf :math:`\checkmark` :math:`\checkmark` +====== =========== === ======= =========== =============== ======= =========== ====== ====== ======= diff --git a/doc/tuto_GP_regression.rst b/doc/tuto_GP_regression.rst index 92b25bc0..9de79a8c 100644 --- a/doc/tuto_GP_regression.rst +++ b/doc/tuto_GP_regression.rst @@ -2,7 +2,7 @@ Gaussian process regression tutorial ************************************* -We will see in this tutorial the basics for building a 1 dimensional and a 2 dimensional Gaussian process regression model, also known as a kriging model. +We will see in this tutorial the basics for building a 1 dimensional and a 2 dimensional Gaussian process regression model, also known as a kriging model. The code shown in this tutorial can be found without the comments at GPy/examples/tuto_GP_regression.py. We first import the libraries we will need: :: diff --git a/doc/tuto_interacting_with_models.rst b/doc/tuto_interacting_with_models.rst new file mode 100644 index 00000000..370ffd95 --- /dev/null +++ b/doc/tuto_interacting_with_models.rst @@ -0,0 +1,60 @@ +************************************* +Interacting with models +************************************* + +The GPy model class has a set of features which are designed to make it simple to explore the parameter space of the model. By default, the scipy optimisers are used to fit GPy models (via model.optimize()), for which we provide mechanisms for 'free' optimisation: GPy can ensure that naturally positive parameters (such as variances) remain positive. But these mechanisms are much more powerful than simple reparameterisation, as we shall see. + +All of the examples included in GPy return an instance of a model class. We'll use GPy.examples.?? as an example:: + + import pylab as pb + pb.ion() + import GPy + m = GPy.examples.?? + +Examining the model using print +=============================== +To see the current state of the model parameters, and the model's (marginal) likelihood, just print the model:: + print m + +?? output + +Getting the model's likelihood and gradients +============================================ +foobar + +Setting and fetching parameters by name +======================================= +foobar + +Constraining and optimising the model +===================================== +A simple task in GPy is to ensure that the model's variances remain positive during optimisation. The model class has a function called constrain_positive(), which accepts a regex string. To constrain the model's variance to be positive:: + m.constrain_positive('variance') + print m + +Now we see that the variance of the model is constrained to be positive. GPy handles the effective change of gradients: see how m.objective_function_gradients has changed appropriately. + + +For convenience, we also provide a catch-all function which ensures that anything which appears to require positivity is constrained appropriately:: + m.ensure_default_constraints() + + +Fixing parameters +================= + + +Tying Parameters +================ + +Bounding parameters +=================== + + +Further Reading +=============== +All of the mechanisms for dealing with parameters are baked right into GPy.core.model, from which all of the classes in GPy.models inherit; a minimal sketch of such a subclass follows.
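+As a taste of what that inheritance buys, here is a minimal toy subclass. This sketch is illustrative only: the toy_model name and its quadratic likelihood are invented, and the constructor protocol is assumed; the overridden method names are the ones the model class itself uses::
+
+    import numpy as np
+    import GPy
+
+    class toy_model(GPy.core.model):
+        def __init__(self):
+            self.theta = np.ones(2)            # hypothetical parameters
+            GPy.core.model.__init__(self)
+        def _get_params(self):
+            return self.theta.copy()
+        def _set_params(self, x):
+            self.theta = x.copy()
+        def _get_param_names(self):
+            return ['theta_0', 'theta_1']
+        def log_likelihood(self):
+            # toy quadratic likelihood, maximised at theta = 1
+            return -0.5 * np.sum(np.square(self.theta - 1.))
+        def _log_likelihood_gradients(self):
+            return -(self.theta - 1.)
+
+Such a class picks up printing, constraints, checkgrad() and optimize() for free.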
To learn how to construct your own model, you might want to read ??link?? creating_new_models. + +By default, GPy uses the tnc optimizer (from scipy.optimize.tnc). To use other optimisers, and to control the setting of those optimisers, as well as other funky features like automated restarts and diagnostics, you can read the optimization tutorial ??link??. + + + diff --git a/doc/tuto_kernel_overview.rst b/doc/tuto_kernel_overview.rst index a8f5b53d..c420943b 100644 --- a/doc/tuto_kernel_overview.rst +++ b/doc/tuto_kernel_overview.rst @@ -2,6 +2,7 @@ **************************** tutorial : A kernel overview **************************** +The aim of this tutorial is to give a better understanding of the kernel objects in GPy and to list the ones that are already implemented. The code shown in this tutorial can be found without the comments at GPy/examples/tuto_kernel_overview.py. First we import the libraries we will need :: @@ -38,7 +39,7 @@ return:: Implemented kernels =================== -Many kernels are already implemented in GPy. Here is a summary of most of them: +Many kernels are already implemented in GPy. A comprehensive list can be found `here `_. The following figure gives a summary of most of them: .. figure:: Figures/tuto_kern_overview_allkern.png :align: center
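As a companion to the table in doc/kernel_implementation.rst, the listed functions can be probed directly on a kernel's parts. A rough sketch (the method names are taken from the table's columns; treating hasattr as a proxy for real support is an assumption):

    import GPy

    part = GPy.kern.rbf(1).parts[0]
    for fname in ('K', 'Kdiag', 'dK_dtheta', 'dKdiag_dtheta',
                  'dK_dX', 'dKdiag_dX', 'psi0', 'psi1', 'psi2'):
        print fname, hasattr(part, fname)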