Merged in real gpy

Alan Saul 2013-09-09 13:33:54 +01:00
commit 29358519d9
170 changed files with 30940 additions and 5 deletions

9
.gitignore vendored

@@ -9,7 +9,6 @@
dist
build
eggs
parts
bin
var
sdist
@@ -39,3 +38,11 @@ nosetests.xml
#bfgs optimiser leaves this lying around
iterate.dat
# Nosetests #
#############
*.noseids
# git merge files #
###################
*.orig

.travis.yml

@@ -7,15 +7,20 @@ virtualenv:
system_site_packages: true
# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
before_install:
- sudo apt-get install -qq python-scipy python-pip
- sudo apt-get install -qq python-matplotlib
# Workaround for a permissions issue with Travis virtual machine images
# that breaks Python's multiprocessing:
# https://github.com/travis-ci/travis-cookbooks/issues/155
- sudo rm -rf /dev/shm
- sudo ln -s /run/shm /dev/shm
install:
- pip install --upgrade numpy==1.7.1
- pip install sphinx
- pip install nose
- pip install . --use-mirrors
# command to run tests, e.g. python setup.py test
script:
- nosetests GPy/testing

7
AUTHORS.txt Normal file

@@ -0,0 +1,7 @@
James Hensman
Nicolo Fusi
Ricardo Andrade
Nicolas Durrande
Alan Saul
Max Zwiessele
Neil D. Lawrence

21
GPy/__init__.py Normal file

@@ -0,0 +1,21 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import core
import models
import mappings
import inference
import util
import examples
import likelihoods
import testing
from numpy.testing import Tester
from nose.tools import nottest
import kern
from core import priors
@nottest
def tests():
Tester(testing).test(verbose=10)
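A minimal usage sketch of the entry points this file exposes (assumes GPy and its dependencies are importable; everything shown comes from the imports above):

import GPy
print GPy.kern  # subpackages imported in __init__ become attributes of GPy
GPy.tests()     # wraps numpy.testing's Tester over GPy.testing with verbose output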

11
GPy/core/__init__.py Normal file

@@ -0,0 +1,11 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from model import *
from parameterized import *
import priors
from gp import GP
from sparse_gp import SparseGP
from fitc import FITC
from svigp import SVIGP
from mapping import *

26
GPy/core/domains.py Normal file

@@ -0,0 +1,26 @@
'''
Created on 4 Jun 2013
@author: maxz
(Hyper-)Parameter domains defined for :py:mod:`~GPy.core.priors` and :py:mod:`~GPy.kern`.
These domains specify the legitimate realm of the parameters to live in.
:const:`~GPy.core.domains.REAL` :
real domain, all values in the real numbers are allowed
:const:`~GPy.core.domains.POSITIVE`:
positive domain, only positive real values are allowed
:const:`~GPy.core.domains.NEGATIVE`:
the mirror of :const:`~GPy.core.domains.POSITIVE`: only negative real values are allowed
:const:`~GPy.core.domains.BOUNDED`:
only values within the bounded range are allowed,
the bounds are specified within the object carrying the bounded range
'''
REAL = 'real'
POSITIVE = "positive"
NEGATIVE = 'negative'
BOUNDED = 'bounded'
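Model.set_prior (see GPy/core/model.py below) compares a prior's domain against these constants with `is`, so a prior should reference the module constants rather than a fresh string. A small sketch with a hypothetical prior class:

from GPy.core import domains

class MyPrior(object):            # hypothetical; real priors live in GPy.core.priors
    domain = domains.POSITIVE     # identity matters: set_prior tests `what.domain is POSITIVE`

assert MyPrior.domain is domains.POSITIVE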

247
GPy/core/fitc.py Normal file

@@ -0,0 +1,247 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from ..util.linalg import mdot, jitchol, chol_inv, tdot, symmetrify, pdinv, dtrtrs
from ..util.plot import gpplot
from .. import kern
from scipy import stats
from sparse_gp import SparseGP
class FITC(SparseGP):
"""
sparse FITC approximation
:param X: inputs
:type X: np.ndarray (num_data x Q)
:param likelihood: a likelihood instance, containing the observed data
:type likelihood: GPy.likelihood.(Gaussian | EP)
:param kernel: the kernel (covariance function); see :py:mod:`~GPy.kern`
:type kernel: a GPy.kern.kern instance
:param Z: inducing inputs (optional, see note)
:type Z: np.ndarray (M x Q) | None
:param normalize_X: whether to normalize the data before computing (predictions will be in original scales)
:type normalize_X: bool
"""
def __init__(self, X, likelihood, kernel, Z, normalize_X=False):
SparseGP.__init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=normalize_X)
assert self.output_dim == 1, "FITC model is not defined for handling multiple outputs"
def update_likelihood_approximation(self):
"""
Approximates a non-Gaussian likelihood using Expectation Propagation
For a Gaussian likelihood, no iteration is required:
this function does nothing
"""
self.likelihood.restart()
self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0)
self._set_params(self._get_params())
def _compute_kernel_matrices(self):
# kernel computations, using BGPLVM notation
self.Kmm = self.kern.K(self.Z)
self.psi0 = self.kern.Kdiag(self.X)
self.psi1 = self.kern.K(self.Z, self.X)
self.psi2 = None
def _computations(self):
#factor Kmm
self.Lm = jitchol(self.Kmm)
self.Lmi,info = dtrtrs(self.Lm,np.eye(self.num_inducing),lower=1)
Lmipsi1 = np.dot(self.Lmi,self.psi1)
self.Qnn = np.dot(Lmipsi1.T,Lmipsi1).copy()
self.Diag0 = self.psi0 - np.diag(self.Qnn)
self.beta_star = self.likelihood.precision/(1. + self.likelihood.precision*self.Diag0[:,None]) #NOTE: beta_star contains Diag0 and the precision
self.V_star = self.beta_star * self.likelihood.Y
# The rather complex computations of self.A
tmp = self.psi1 * (np.sqrt(self.beta_star.flatten().reshape(1, self.num_data)))
tmp, _ = dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1)
self.A = tdot(tmp)
# factor B
self.B = np.eye(self.num_inducing) + self.A
self.LB = jitchol(self.B)
self.LBi = chol_inv(self.LB)
self.psi1V = np.dot(self.psi1, self.V_star)
Lmi_psi1V, info = dtrtrs(self.Lm, np.asfortranarray(self.psi1V), lower=1, trans=0)
self._LBi_Lmi_psi1V, _ = dtrtrs(self.LB, np.asfortranarray(Lmi_psi1V), lower=1, trans=0)
Kmmipsi1 = np.dot(self.Lmi.T,Lmipsi1)
b_psi1_Ki = self.beta_star * Kmmipsi1.T
Ki_pbp_Ki = np.dot(Kmmipsi1,b_psi1_Ki)
Kmmi = np.dot(self.Lmi.T,self.Lmi)
LBiLmi = np.dot(self.LBi,self.Lmi)
LBL_inv = np.dot(LBiLmi.T,LBiLmi)
VVT = np.outer(self.V_star,self.V_star)
VV_p_Ki = np.dot(VVT,Kmmipsi1.T)
Ki_pVVp_Ki = np.dot(Kmmipsi1,VV_p_Ki)
psi1beta = self.psi1*self.beta_star.T
H = self.Kmm + mdot(self.psi1,psi1beta.T)
LH = jitchol(H)
LHi = chol_inv(LH)
Hi = np.dot(LHi.T,LHi)
betapsi1TLmiLBi = np.dot(psi1beta.T,LBiLmi.T)
alpha = np.array([np.dot(a.T,a) for a in betapsi1TLmiLBi])[:,None]
gamma_1 = mdot(VVT,self.psi1.T,Hi)
pHip = mdot(self.psi1.T,Hi,self.psi1)
gamma_2 = mdot(self.beta_star*pHip,self.V_star)
gamma_3 = self.V_star * gamma_2
self._dL_dpsi0 = -0.5 * self.beta_star#dA_dpsi0: logdet(self.beta_star)
self._dL_dpsi0 += .5 * self.V_star**2 #dA_psi0: yT*beta_star*y
self._dL_dpsi0 += .5 *alpha #dC_dpsi0
self._dL_dpsi0 += 0.5*mdot(self.beta_star*pHip,self.V_star)**2 - self.V_star * mdot(self.V_star.T,pHip*self.beta_star).T #dD_dpsi0
self._dL_dpsi1 = b_psi1_Ki.copy() #dA_dpsi1: logdet(self.beta_star)
self._dL_dpsi1 += -np.dot(psi1beta.T,LBL_inv) #dC_dpsi1
self._dL_dpsi1 += gamma_1 - mdot(psi1beta.T,Hi,self.psi1,gamma_1) #dD_dpsi1
self._dL_dKmm = -0.5 * np.dot(Kmmipsi1,b_psi1_Ki) #dA_dKmm: logdet(self.beta_star)
self._dL_dKmm += .5*(LBL_inv - Kmmi) + mdot(LBL_inv,psi1beta,Kmmipsi1.T) #dC_dKmm
self._dL_dKmm += -.5 * mdot(Hi,self.psi1,gamma_1) #dD_dKmm
self._dpsi1_dtheta = 0
self._dpsi1_dX = 0
self._dKmm_dtheta = 0
self._dKmm_dX = 0
self._dpsi1_dX_jkj = 0
self._dpsi1_dtheta_jkj = 0
for i,V_n,alpha_n,gamma_n,gamma_k in zip(range(self.num_data),self.V_star,alpha,gamma_2,gamma_3):
K_pp_K = np.dot(Kmmipsi1[:,i:(i+1)],Kmmipsi1[:,i:(i+1)].T)
_dpsi1 = (-V_n**2 - alpha_n + 2.*gamma_k - gamma_n**2) * Kmmipsi1.T[i:(i+1),:]
_dKmm = .5*(V_n**2 + alpha_n + gamma_n**2 - 2.*gamma_k) * K_pp_K #Diag_dD_dKmm
self._dpsi1_dtheta += self.kern.dK_dtheta(_dpsi1,self.X[i:i+1,:],self.Z)
self._dKmm_dtheta += self.kern.dK_dtheta(_dKmm,self.Z)
self._dKmm_dX += 2.*self.kern.dK_dX(_dKmm ,self.Z)
self._dpsi1_dX += self.kern.dK_dX(_dpsi1.T,self.Z,self.X[i:i+1,:])
# the partial derivative vector for the likelihood
if self.likelihood.Nparams == 0:
# save computation here.
self.partial_for_likelihood = None
elif self.likelihood.is_heteroscedastic:
raise NotImplementedError, "heteroscedatic derivates not implemented."
else:
# likelihood is not heteroscedastic
dbstar_dnoise = self.likelihood.precision * (self.beta_star**2 * self.Diag0[:,None] - self.beta_star)
Lmi_psi1 = mdot(self.Lmi,self.psi1)
LBiLmipsi1 = np.dot(self.LBi,Lmi_psi1)
aux_0 = np.dot(self._LBi_Lmi_psi1V.T,LBiLmipsi1)
aux_1 = self.likelihood.Y.T * np.dot(self._LBi_Lmi_psi1V.T,LBiLmipsi1)
aux_2 = np.dot(LBiLmipsi1.T,self._LBi_Lmi_psi1V)
dA_dnoise = 0.5 * self.input_dim * (dbstar_dnoise/self.beta_star).sum() - 0.5 * self.input_dim * np.sum(self.likelihood.Y**2 * dbstar_dnoise)
dC_dnoise = -0.5 * np.sum(mdot(self.LBi.T,self.LBi,Lmi_psi1) * Lmi_psi1 * dbstar_dnoise.T)
dD_dnoise_1 = mdot(self.V_star.T,self.psi1.T,self.Lmi.T,self.LBi.T,self.LBi,self.Lmi,self.psi1,dbstar_dnoise*self.likelihood.Y)
dD_dnoise_2 = 0.5*mdot(self.V_star.T,self.psi1.T,Hi,self.psi1,dbstar_dnoise*self.psi1.T,Hi,self.psi1,self.V_star)
dD_dnoise = dD_dnoise_1 + dD_dnoise_2
self.partial_for_likelihood = dA_dnoise + dC_dnoise + dD_dnoise
def log_likelihood(self):
""" Compute the (lower bound on the) log marginal likelihood """
A = -0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.beta_star)) - 0.5 * np.sum(self.V_star * self.likelihood.Y)
C = -self.output_dim * (np.sum(np.log(np.diag(self.LB))))
D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V))
return A + C + D
def _log_likelihood_gradients(self):
return np.hstack((self.dL_dZ().flatten(), self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))
def dL_dtheta(self):
dL_dtheta = self.kern.dKdiag_dtheta(self._dL_dpsi0,self.X)
dL_dtheta += self.kern.dK_dtheta(self._dL_dpsi1,self.X,self.Z)
dL_dtheta += self.kern.dK_dtheta(self._dL_dKmm,X=self.Z)
dL_dtheta += self._dKmm_dtheta
dL_dtheta += self._dpsi1_dtheta
return dL_dtheta
def dL_dZ(self):
dL_dZ = self.kern.dK_dX(self._dL_dpsi1.T,self.Z,self.X)
dL_dZ += 2. * self.kern.dK_dX(self._dL_dKmm,X=self.Z)
dL_dZ += self._dpsi1_dX
dL_dZ += self._dKmm_dX
return dL_dZ
def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
assert X_variance_new is None, "FITC model is not defined for handling uncertain inputs."
if self.likelihood.is_heteroscedastic:
Iplus_Dprod_i = 1./(1.+ self.Diag0 * self.likelihood.precision.flatten())
self.Diag = self.Diag0 * Iplus_Dprod_i
self.P = Iplus_Dprod_i[:,None] * self.psi1.T
self.RPT0 = np.dot(self.Lmi,self.psi1)
self.L = np.linalg.cholesky(np.eye(self.num_inducing) + np.dot(self.RPT0,((1. - Iplus_Dprod_i)/self.Diag0)[:,None]*self.RPT0.T))
self.R,info = dtrtrs(self.L,self.Lmi,lower=1)
self.RPT = np.dot(self.R,self.P.T)
self.Sigma = np.diag(self.Diag) + np.dot(self.RPT.T,self.RPT)
self.w = self.Diag * self.likelihood.v_tilde
self.Gamma = np.dot(self.R.T, np.dot(self.RPT,self.likelihood.v_tilde))
self.mu = self.w + np.dot(self.P,self.Gamma)
"""
Make a prediction for the generalized FITC model
Arguments
---------
X : Input prediction data - Nx1 numpy array (floats)
"""
# q(u|f) = N(u| R0i*mu_u*f, R0i*C*R0i.T)
# Ci = I + (RPT0)Di(RPT0).T
# C = I - [RPT0] * (Diag0 + [RPT0].T*[RPT0])^-1 * [RPT0].T
# = I - [RPT0] * (Diag0 + self.Qnn)^-1 * [RPT0].T
# = I - [RPT0] * (U*U.T)^-1 * [RPT0].T
# = I - V.T * V
U = np.linalg.cholesky(np.diag(self.Diag0) + self.Qnn)
V,info = dtrtrs(U,self.RPT0.T,lower=1)
C = np.eye(self.num_inducing) - np.dot(V.T,V)
mu_u = np.dot(C,self.RPT0)*(1./self.Diag0[None,:])
#self.C = C
#self.RPT0 = np.dot(self.R0,self.Knm.T) P0.T
#self.mu_u = mu_u
#self.U = U
# q(u|y) = N(u| R0i*mu_H,R0i*Sigma_H*R0i.T)
mu_H = np.dot(mu_u,self.mu)
self.mu_H = mu_H
Sigma_H = C + np.dot(mu_u,np.dot(self.Sigma,mu_u.T))
# q(f_star|y) = N(f_star|mu_star,sigma2_star)
Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
KR0T = np.dot(Kx.T,self.Lmi.T)
mu_star = np.dot(KR0T,mu_H)
if full_cov:
Kxx = self.kern.K(Xnew,which_parts=which_parts)
var = Kxx + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.num_inducing),KR0T.T))
else:
Kxx = self.kern.Kdiag(Xnew,which_parts=which_parts)
var = (Kxx + np.sum(KR0T.T*np.dot(Sigma_H - np.eye(self.num_inducing),KR0T.T),0))[:,None]
return mu_star[:,None],var
else:
raise NotImplementedError, "Heteroscedastic case not implemented."
"""
Kx = self.kern.K(self.Z, Xnew)
mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V)
if full_cov:
Kxx = self.kern.K(Xnew)
var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) #NOTE this won't work for plotting
else:
Kxx = self.kern.Kdiag(Xnew)
var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0)
return mu,var[:,None]
"""

157
GPy/core/gp.py Normal file

@@ -0,0 +1,157 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from .. import kern
from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs
from ..likelihoods import EP
from gp_base import GPBase
class GP(GPBase):
"""
Gaussian Process model for regression and EP
:param X: input observations
:param kernel: a GPy kernel, defaults to rbf+white
:param likelihood: a GPy likelihood
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_X: bool
:rtype: model object
:param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1
:param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.]
:type powerep: list
.. Note:: Multiple independent outputs are allowed using columns of Y
"""
def __init__(self, X, likelihood, kernel, normalize_X=False):
GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
self._set_params(self._get_params())
def getstate(self):
return GPBase.getstate(self)
def setstate(self, state):
GPBase.setstate(self, state)
self._set_params(self._get_params())
def _set_params(self, p):
self.kern._set_params_transformed(p[:self.kern.num_params_transformed()])
self.likelihood._set_params(p[self.kern.num_params_transformed():])
self.K = self.kern.K(self.X)
self.K += self.likelihood.covariance_matrix
self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
# the gradient of the likelihood wrt the covariance matrix
if self.likelihood.YYT is None:
# alpha = np.dot(self.Ki, self.likelihood.Y)
alpha, _ = dpotrs(self.L, self.likelihood.Y, lower=1)
self.dL_dK = 0.5 * (tdot(alpha) - self.output_dim * self.Ki)
else:
# tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki)
tmp, _ = dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1)
tmp, _ = dpotrs(self.L, np.asfortranarray(tmp.T), lower=1)
self.dL_dK = 0.5 * (tmp - self.output_dim * self.Ki)
def _get_params(self):
return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params()))
def _get_param_names(self):
return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
def update_likelihood_approximation(self):
"""
Approximates a non-Gaussian likelihood using Expectation Propagation
For a Gaussian likelihood, no iteration is required:
this function does nothing
"""
self.likelihood.restart()
self.likelihood.fit_full(self.kern.K(self.X))
self._set_params(self._get_params()) # update the GP
def _model_fit_term(self):
"""
Computes the model fit using YYT if it's available
"""
if self.likelihood.YYT is None:
tmp, _ = dtrtrs(self.L, np.asfortranarray(self.likelihood.Y), lower=1)
return -0.5 * np.sum(np.square(tmp))
# return -0.5 * np.sum(np.square(np.dot(self.Li, self.likelihood.Y)))
else:
return -0.5 * np.sum(np.multiply(self.Ki, self.likelihood.YYT))
def log_likelihood(self):
"""
The log marginal likelihood of the GP.
For an EP model, it can be written as the log likelihood of a regression
model for a new variable Y* = v_tilde/tau_tilde, with a covariance
matrix K* = K + diag(1./tau_tilde) plus a normalization term.
"""
return (-0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) -
0.5 * self.output_dim * self.K_logdet + self._model_fit_term() + self.likelihood.Z)
def _log_likelihood_gradients(self):
"""
The gradient of all parameters.
Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
"""
return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
"""
Internal helper function for making predictions, does not account
for normalization or likelihood
"""
Kx = self.kern.K(_Xnew, self.X, which_parts=which_parts).T
# KiKx = np.dot(self.Ki, Kx)
KiKx, _ = dpotrs(self.L, np.asfortranarray(Kx), lower=1)
mu = np.dot(KiKx.T, self.likelihood.Y)
if full_cov:
Kxx = self.kern.K(_Xnew, which_parts=which_parts)
var = Kxx - np.dot(KiKx.T, Kx)
else:
Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
var = Kxx - np.sum(np.multiply(KiKx, Kx), 0)
var = var[:, None]
if stop:
debug_this # @UndefinedVariable
return mu, var
def predict(self, Xnew, which_parts='all', full_cov=False, likelihood_args=dict()):
"""
Predict the function(s) at the new point(s) Xnew.
Arguments
---------
:param Xnew: The points at which to make a prediction
:type Xnew: np.ndarray, Nnew x self.input_dim
:param which_parts: specifies which outputs kernel(s) to use in prediction
:type which_parts: ('all', list of bools)
:param full_cov: whether to return the full covariance matrix, or just the diagonal
:type full_cov: bool
:rtype: posterior mean, a Numpy array, Nnew x self.output_dim
:rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
:rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.output_dim
If full_cov and self.output_dim > 1, the return shape of var is Nnew x Nnew x self.output_dim. If self.output_dim == 1, the return shape is Nnew x Nnew.
This is to allow for different normalizations of the output dimensions.
"""
# normalize X values
Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts)
# now push through likelihood
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, **likelihood_args)
return mean, var, _025pm, _975pm
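A short regression sketch against core.GP as defined above; predict() returns the four values documented in its docstring. The likelihood and kernel constructors are assumptions about the wider API:

import numpy as np
import GPy

X = np.linspace(0., 1., 40)[:, None]
Y = np.sin(6 * X) + 0.05 * np.random.randn(40, 1)
likelihood = GPy.likelihoods.Gaussian(Y)           # assumed Gaussian likelihood constructor
kernel = GPy.kern.rbf(1) + GPy.kern.white(1)       # assumed kernel constructors
m = GPy.core.GP(X, likelihood, kernel, normalize_X=True)
m.ensure_default_constraints()
m.optimize()
Xnew = np.linspace(0., 1., 200)[:, None]
mean, var, lower, upper = m.predict(Xnew)          # mean, variance and 95% bounds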

209
GPy/core/gp_base.py Normal file

@@ -0,0 +1,209 @@
import numpy as np
from .. import kern
from ..util.plot import gpplot, Tango, x_frame1D, x_frame2D
import pylab as pb
from GPy.core.model import Model
class GPBase(Model):
"""
Gaussian process base model for holding shared behaviour between
sparse_GP and GP models.
"""
def __init__(self, X, likelihood, kernel, normalize_X=False):
self.X = X
assert len(self.X.shape) == 2
self.num_data, self.input_dim = self.X.shape
assert isinstance(kernel, kern.kern)
self.kern = kernel
self.likelihood = likelihood
assert self.X.shape[0] == self.likelihood.data.shape[0]
self.num_data, self.output_dim = self.likelihood.data.shape
if normalize_X:
self._Xoffset = X.mean(0)[None, :]
self._Xscale = X.std(0)[None, :]
self.X = (X.copy() - self._Xoffset) / self._Xscale
else:
self._Xoffset = np.zeros((1, self.input_dim))
self._Xscale = np.ones((1, self.input_dim))
super(GPBase, self).__init__()
# Model.__init__(self)
# All leaf nodes should call self._set_params(self._get_params()) at
# the end
def getstate(self):
"""
Get the current state of the class, here we return everything that is needed to recompute the model.
"""
return Model.getstate(self) + [self.X,
self.num_data,
self.input_dim,
self.kern,
self.likelihood,
self.output_dim,
self._Xoffset,
self._Xscale]
def setstate(self, state):
self._Xscale = state.pop()
self._Xoffset = state.pop()
self.output_dim = state.pop()
self.likelihood = state.pop()
self.kern = state.pop()
self.input_dim = state.pop()
self.num_data = state.pop()
self.X = state.pop()
Model.setstate(self, state)
def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None):
"""
Plot the GP's view of the world, where the data is normalized and the
likelihood is Gaussian.
Plot the posterior of the GP.
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
- In two dimensions, a contour-plot shows the mean predicted function
- In higher dimensions, we've not implemented this yet !TODO!
Can plot only part of the data and part of the posterior functions
using which_data and which_functions
:param samples: the number of a posteriori samples to plot
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
:param which_data: which of the training data to plot (default all)
:type which_data: 'all' or a slice object to slice self.X, self.Y
:param which_parts: which of the kernel functions to plot (additively)
:type which_parts: 'all', or list of bools
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
:type resolution: int
:param full_cov:
:type full_cov: bool
:param fignum: figure to plot on.
:type fignum: figure number
:param ax: axes to plot on.
:type ax: axes handle
"""
if which_data == 'all':
which_data = slice(None)
if ax is None:
fig = pb.figure(num=fignum)
ax = fig.add_subplot(111)
if self.X.shape[1] == 1:
Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
if samples == 0:
m, v = self._raw_predict(Xnew, which_parts=which_parts)
gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v), axes=ax)
ax.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
else:
m, v = self._raw_predict(Xnew, which_parts=which_parts, full_cov=True)
Ysim = np.random.multivariate_normal(m.flatten(), v, samples)
gpplot(Xnew, m, m - 2 * np.sqrt(np.diag(v)[:, None]), m + 2 * np.sqrt(np.diag(v))[:, None], axes=ax)
for i in range(samples):
ax.plot(Xnew, Ysim[i, :], Tango.colorsHex['darkBlue'], linewidth=0.25)
ax.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
ax.set_xlim(xmin, xmax)
ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
ax.set_ylim(ymin, ymax)
elif self.X.shape[1] == 2:
resolution = resolution or 50
Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits, resolution)
m, v = self._raw_predict(Xnew, which_parts=which_parts)
m = m.reshape(resolution, resolution).T
ax.contour(xx, yy, m, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
ax.scatter(self.X[:, 0], self.X[:, 1], 40, self.likelihood.Y, linewidth=0, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max()) # @UndefinedVariable
ax.set_xlim(xmin[0], xmax[0])
ax.set_ylim(xmin[1], xmax[1])
else:
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
"""
Plot the GP with noise where the likelihood is Gaussian.
Plot the posterior of the GP.
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
- In two dimensions, a contour-plot shows the mean predicted function
- In higher dimensions, we've not implemented this yet !TODO!
Can plot only part of the data and part of the posterior functions
using which_data and which_functions
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
:type plot_limits: np.array
:param which_data: which of the training data to plot (default all)
:type which_data: 'all' or a slice object to slice self.X, self.Y
:param which_parts: which of the kernel functions to plot (additively)
:type which_parts: 'all', or list of bools
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
:type resolution: int
:param levels: number of levels to plot in a contour plot.
:type levels: int
:param samples: the number of a posteriori samples to plot
:type samples: int
:param fignum: figure to plot on.
:type fignum: figure number
:param ax: axes to plot on.
:type ax: axes handle
:param fixed_inputs: a list of tuples [(i,v), (i,v)...], specifying that input index i should be set to value v.
:type fixed_inputs: a list of tuples
:param linecol: color of line to plot.
:type linecol:
:param fillcol: color of fill
:type fillcol:
:param levels: for 2D plotting, the number of contour levels to use
if ax is None, create a new figure
"""
# TODO include samples
if which_data == 'all':
which_data = slice(None)
if ax is None:
fig = pb.figure(num=fignum)
ax = fig.add_subplot(111)
plotdims = self.input_dim - len(fixed_inputs)
if plotdims == 1:
Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
fixed_dims = np.array([i for i,v in fixed_inputs])
freedim = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
Xnew, xmin, xmax = x_frame1D(Xu[:,freedim], plot_limits=plot_limits)
Xgrid = np.empty((Xnew.shape[0],self.input_dim))
Xgrid[:,freedim] = Xnew
for i,v in fixed_inputs:
Xgrid[:,i] = v
m, _, lower, upper = self.predict(Xgrid, which_parts=which_parts)
for d in range(m.shape[1]):
gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
ax.plot(Xu[which_data,freedim], self.likelihood.data[which_data, d], 'kx', mew=1.5)
ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
elif self.X.shape[1] == 2: # FIXME
resolution = resolution or 50
Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
m, _, lower, upper = self.predict(Xnew, which_parts=which_parts)
m = m.reshape(resolution, resolution).T
ax.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
Yf = self.likelihood.data.flatten()
ax.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) # @UndefinedVariable
ax.set_xlim(xmin[0], xmax[0])
ax.set_ylim(xmin[1], xmax[1])
else:
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
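Continuing the sketch from gp.py, the plotting helpers above would be driven like this (the fixed_inputs value is purely illustrative):

m.plot()                     # 1D: mean curve, 2-sigma band, data as crosses
m.plot_f(samples=3)          # latent-function view with 3 posterior draws
# with 2+ inputs, fix input index 1 at 0.0 and plot over the remaining free dimension:
# m.plot(fixed_inputs=[(1, 0.0)])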

190
GPy/core/mapping.py Normal file

@@ -0,0 +1,190 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from ..util.plot import Tango, x_frame1D, x_frame2D
from parameterized import Parameterized
import numpy as np
import pylab as pb
class Mapping(Parameterized):
"""
Base model for shared behavior between models that can act like a mapping.
"""
def __init__(self, input_dim, output_dim):
self.input_dim = input_dim
self.output_dim = output_dim
super(Mapping, self).__init__()
# Model.__init__(self)
# All leaf nodes should call self._set_params(self._get_params()) at
# the end
def f(self, X):
raise NotImplementedError
def df_dX(self, dL_df, X):
"""Evaluate derivatives of mapping outputs with respect to inputs.
:param dL_df: gradient of the objective with respect to the function.
:type dL_df: ndarray (num_data x output_dim)
:param X: the input locations where derivatives are to be evaluated.
:type X: ndarray (num_data x input_dim)
:returns: matrix containing gradients of the function with respect to the inputs.
"""
raise NotImplementedError
def df_dtheta(self, dL_df, X):
"""The gradient of the outputs of the multi-layer perceptron with respect to each of the parameters.
:param dL_df: gradient of the objective with respect to the function.
:type dL_df: ndarray (num_data x output_dim)
:param X: input locations where the function is evaluated.
:type X: ndarray (num_data x input_dim)
:returns: Matrix containing gradients with respect to parameters of each output for each input data.
:rtype: ndarray (num_params length)
"""
raise NotImplementedError
def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue']):
"""
Plot the mapping.
Plots the mapping associated with the model.
- In one dimension, the function is plotted.
- In two dimensions, a contour-plot shows the function
- In higher dimensions, we've not implemented this yet !TODO!
Can plot only part of the data and part of the posterior functions
using which_data and which_functions
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
:type plot_limits: np.array
:param which_data: which of the training data to plot (default all)
:type which_data: 'all' or a slice object to slice self.X, self.Y
:param which_parts: which of the kernel functions to plot (additively)
:type which_parts: 'all', or list of bools
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
:type resolution: int
:param levels: number of levels to plot in a contour plot.
:type levels: int
:param samples: the number of a posteriori samples to plot
:type samples: int
:param fignum: figure to plot on.
:type fignum: figure number
:param ax: axes to plot on.
:type ax: axes handle
:param fixed_inputs: a list of tuples [(i,v), (i,v)...], specifying that input index i should be set to value v.
:type fixed_inputs: a list of tuples
:param linecol: color of line to plot.
:type linecol:
:param levels: for 2D plotting, the number of contour levels to use
if ax is None, create a new figure
"""
# TODO include samples
if which_data == 'all':
which_data = slice(None)
if ax is None:
fig = pb.figure(num=fignum)
ax = fig.add_subplot(111)
plotdims = self.input_dim - len(fixed_inputs)
if plotdims == 1:
Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
fixed_dims = np.array([i for i,v in fixed_inputs])
freedim = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
Xnew, xmin, xmax = x_frame1D(Xu[:,freedim], plot_limits=plot_limits)
Xgrid = np.empty((Xnew.shape[0],self.input_dim))
Xgrid[:,freedim] = Xnew
for i,v in fixed_inputs:
Xgrid[:,i] = v
f = self.f(Xgrid) # Mapping defines f(), not predict()
for d in range(f.shape[1]):
ax.plot(Xnew, f[:, d], color=linecol)
elif self.X.shape[1] == 2:
resolution = resolution or 50
Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
f = self.f(Xnew) # Mapping defines f(), not predict()
m = f.reshape(resolution, resolution).T
ax.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
ax.set_xlim(xmin[0], xmax[0])
ax.set_ylim(xmin[1], xmax[1])
else:
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
from GPy.core.model import Model
class Mapping_check_model(Model):
"""This is a dummy model class used as a base class for checking that the gradients of a given mapping are implemented correctly. It enables checkgradient() to be called independently on each mapping."""
def __init__(self, mapping=None, dL_df=None, X=None):
num_samples = 20
if mapping is None:
import GPy
mapping = GPy.mappings.linear(1, 1)
if X is None:
X = np.random.randn(num_samples, mapping.input_dim)
if dL_df is None:
dL_df = np.ones((num_samples, mapping.output_dim))
self.mapping=mapping
self.X = X
self.dL_df = dL_df
self.num_params = self.mapping.num_params
Model.__init__(self)
def _get_params(self):
return self.mapping._get_params()
def _get_param_names(self):
return self.mapping._get_param_names()
def _set_params(self, x):
self.mapping._set_params(x)
def log_likelihood(self):
return (self.dL_df*self.mapping.f(self.X)).sum()
def _log_likelihood_gradients(self):
raise NotImplementedError, "This needs to be implemented to use the Mapping_check_model class."
class Mapping_check_df_dtheta(Mapping_check_model):
"""This class allows gradient checks for the gradient of a mapping with respect to parameters. """
def __init__(self, mapping=None, dL_df=None, X=None):
Mapping_check_model.__init__(self,mapping=mapping,dL_df=dL_df, X=X)
def _log_likelihood_gradients(self):
return self.mapping.df_dtheta(self.dL_df, self.X)
class Mapping_check_df_dX(Mapping_check_model):
"""This class allows gradient checks for the gradient of a mapping with respect to X. """
def __init__(self, mapping=None, dL_df=None, X=None):
Mapping_check_model.__init__(self,mapping=mapping,dL_df=dL_df, X=X)
if dL_df is None:
self.dL_df = np.ones((self.X.shape[0], self.mapping.output_dim))
self.num_params = self.X.shape[0]*self.mapping.input_dim
def _log_likelihood_gradients(self):
return self.mapping.df_dX(self.dL_df, self.X).flatten()
def _get_param_names(self):
return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
def _get_params(self):
return self.X.flatten()
def _set_params(self, x):
self.X=x.reshape(self.X.shape)
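The dummy-model wrappers above exist so that Model.checkgrad can exercise a mapping's gradients in isolation. A usage sketch, taking the linear mapping constructor the code itself references on trust:

import GPy
from GPy.core.mapping import Mapping_check_df_dtheta, Mapping_check_df_dX

mapping = GPy.mappings.linear(1, 1)                           # as referenced in Mapping_check_model
assert Mapping_check_df_dtheta(mapping=mapping).checkgrad()   # df/dtheta against numerical estimate
assert Mapping_check_df_dX(mapping=mapping).checkgrad()       # df/dX against numerical estimate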

581
GPy/core/model.py Normal file

@@ -0,0 +1,581 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from .. import likelihoods
from ..inference import optimization
from ..util.linalg import jitchol
from GPy.util.misc import opt_wrapper
from parameterized import Parameterized
import multiprocessing as mp
import numpy as np
from GPy.core.domains import POSITIVE, REAL
from numpy.linalg.linalg import LinAlgError
# import numdifftools as ndt
class Model(Parameterized):
_fail_count = 0 # Count of failed optimization steps (see objective)
_allowed_failures = 10 # number of allowed failures
def __init__(self):
Parameterized.__init__(self)
self.priors = None
self.optimization_runs = []
self.sampling_runs = []
self.preferred_optimizer = 'scg'
# self._set_params(self._get_params()) has been taken out as it should only be called on leaf nodes
def log_likelihood(self):
raise NotImplementedError, "this needs to be implemented to use the model class"
def _log_likelihood_gradients(self):
raise NotImplementedError, "this needs to be implemented to use the model class"
def getstate(self):
"""
Get the current state of the class.
Inherited from Parameterized, so add those parameters to the state
:return: list of states from the model.
"""
return Parameterized.getstate(self) + \
[self.priors, self.optimization_runs,
self.sampling_runs, self.preferred_optimizer]
def setstate(self, state):
"""
set state from previous call to getstate
call Parameterized with the rest of the state
:param state: the state of the model.
:type state: list as returned from getstate.
"""
self.preferred_optimizer = state.pop()
self.sampling_runs = state.pop()
self.optimization_runs = state.pop()
self.priors = state.pop()
Parameterized.setstate(self, state)
def set_prior(self, regexp, what):
"""
Sets priors on the model parameters.
Notes
-----
Asserts that the prior is suitable for the constraint. If the
wrong constraint is in place, an error is raised. If no
constraint is in place, one is added (warning printed).
For tied parameters, the prior will only be "counted" once, thus
a prior object is only inserted on the first tied index
:param regexp: regular expression of parameters on which priors need to be set.
:type regexp: string, regexp, or integer array
:param what: prior to set on parameter.
:type what: GPy.core.Prior type
"""
if self.priors is None:
self.priors = [None for i in range(self._get_params().size)]
which = self.grep_param_names(regexp)
# check tied situation
tie_partial_matches = [tie for tie in self.tied_indices if (not set(tie).isdisjoint(set(which))) & (not set(tie) == set(which))]
if len(tie_partial_matches):
raise ValueError, "cannot place prior across partial ties"
tie_matches = [tie for tie in self.tied_indices if set(which) == set(tie) ]
if len(tie_matches) > 1:
raise ValueError, "cannot place prior across multiple ties"
elif len(tie_matches) == 1:
which = which[:1] # just place a prior object on the first parameter
# check constraints are okay
if what.domain is POSITIVE:
constrained_positive_indices = [i for i, t in zip(self.constrained_indices, self.constraints) if t.domain is POSITIVE]
if len(constrained_positive_indices):
constrained_positive_indices = np.hstack(constrained_positive_indices)
else:
constrained_positive_indices = np.zeros(shape=(0,))
bad_constraints = np.setdiff1d(self.all_constrained_indices(), constrained_positive_indices)
assert not np.any(which[:, None] == bad_constraints), "constraint and prior incompatible"
unconst = np.setdiff1d(which, constrained_positive_indices)
if len(unconst):
print "Warning: constraining parameters to be positive:"
print '\n'.join([n for i, n in enumerate(self._get_param_names()) if i in unconst])
print '\n'
self.constrain_positive(unconst)
elif what.domain is REAL:
assert not np.any(which[:, None] == self.all_constrained_indices()), "constraint and prior incompatible"
else:
raise ValueError, "prior not recognised"
# store the prior in a local list
for w in which:
self.priors[w] = what
def get_gradient(self, name, return_names=False):
"""
Get model gradient(s) by name. The name is applied as a regular expression and all parameters that match that regular expression are returned.
:param name: the name of parameters required (as a regular expression).
:type name: regular expression
:param return_names: whether or not to return the names matched (default False)
:type return_names: bool
"""
matches = self.grep_param_names(name)
if len(matches):
if return_names:
return self._log_likelihood_gradients()[matches], np.asarray(self._get_param_names())[matches].tolist()
else:
return self._log_likelihood_gradients()[matches]
else:
raise AttributeError, "no parameter matches %s" % name
def log_prior(self):
"""evaluate the prior"""
if self.priors is not None:
return np.sum([p.lnpdf(x) for p, x in zip(self.priors, self._get_params()) if p is not None])
else:
return 0.
def _log_prior_gradients(self):
"""evaluate the gradients of the priors"""
if self.priors is None:
return 0.
x = self._get_params()
ret = np.zeros(x.size)
[np.put(ret, i, p.lnpdf_grad(xx)) for i, (p, xx) in enumerate(zip(self.priors, x)) if not p is None]
return ret
def _transform_gradients(self, g):
x = self._get_params()
for index, constraint in zip(self.constrained_indices, self.constraints):
g[index] = g[index] * constraint.gradfactor(x[index])
[np.put(g, i, v) for i, v in [(t[0], np.sum(g[t])) for t in self.tied_indices]]
if len(self.tied_indices) or len(self.fixed_indices):
to_remove = np.hstack((self.fixed_indices + [t[1:] for t in self.tied_indices]))
return np.delete(g, to_remove)
else:
return g
def randomize(self):
"""
Randomize the model.
Make this draw from the prior if one exists, else draw from N(0,1)
"""
# first take care of all parameters (from N(0,1))
x = self._get_params_transformed()
x = np.random.randn(x.size)
self._set_params_transformed(x)
# now draw from prior where possible
x = self._get_params()
if self.priors is not None:
[np.put(x, i, p.rvs(1)) for i, p in enumerate(self.priors) if not p is None]
self._set_params(x)
self._set_params_transformed(self._get_params_transformed()) # makes sure all of the tied parameters get the same init (since there's only one prior object...)
def optimize_restarts(self, num_restarts=10, robust=False, verbose=True, parallel=False, num_processes=None, **kwargs):
"""
Perform random restarts of the model, and set the model to the best
seen solution.
If the robust flag is set, exceptions raised during optimizations will
be handled silently. If _all_ runs fail, the model is reset to the
existing parameter values.
Notes
-----
:param num_restarts: number of restarts to use (default 10)
:type num_restarts: int
:param robust: whether to handle exceptions silently or not (default False)
:type robust: bool
:param parallel: whether to run each restart as a separate process. It relies on the multiprocessing module.
:type parallel: bool
:param num_processes: number of workers in the multiprocessing pool
:type num_processes: int
**kwargs are passed to the optimizer. They can be:
:param max_f_eval: maximum number of function evaluations
:type max_f_eval: int
:param max_iters: maximum number of iterations
:type max_iters: int
:param messages: whether to display during optimisation
:type messages: bool
.. Note:: If num_processes is None, the number of workers in the multiprocessing pool is automatically
set to the number of processors on the current machine.
"""
initial_parameters = self._get_params_transformed()
if parallel:
try:
jobs = []
pool = mp.Pool(processes=num_processes)
for i in range(num_restarts):
self.randomize()
job = pool.apply_async(opt_wrapper, args=(self,), kwds=kwargs)
jobs.append(job)
pool.close() # signal that no more data coming in
pool.join() # wait for all the tasks to complete
except KeyboardInterrupt:
print "Ctrl+c received, terminating and joining pool."
pool.terminate()
pool.join()
for i in range(num_restarts):
try:
if not parallel:
self.randomize()
self.optimize(**kwargs)
else:
self.optimization_runs.append(jobs[i].get())
if verbose:
print("Optimization restart {0}/{1}, f = {2}".format(i + 1, num_restarts, self.optimization_runs[-1].f_opt))
except Exception as e:
if robust:
print("Warning - optimization restart {0}/{1} failed".format(i + 1, num_restarts))
else:
raise e
if len(self.optimization_runs):
i = np.argmin([o.f_opt for o in self.optimization_runs])
self._set_params_transformed(self.optimization_runs[i].x_opt)
else:
self._set_params_transformed(initial_parameters)
def ensure_default_constraints(self):
"""
Ensure that any variables which should clearly be positive
have been constrained somehow. The method performs a regular
expression search on parameter names looking for the terms
'variance', 'lengthscale', 'precision' and 'kappa'. If any of
these terms are present in the name the parameter is
constrained positive.
"""
positive_strings = ['variance', 'lengthscale', 'precision', 'kappa']
# param_names = self._get_param_names()
currently_constrained = self.all_constrained_indices()
to_make_positive = []
for s in positive_strings:
for i in self.grep_param_names(".*" + s):
if not (i in currently_constrained):
to_make_positive.append(i)
if len(to_make_positive):
self.constrain_positive(np.asarray(to_make_positive))
def objective_function(self, x):
"""
The objective function passed to the optimizer. It combines
the likelihood and the priors.
Failures are handled robustly. The algorithm will try several times to
return the objective, and will raise the original exception if
the objective cannot be computed.
:param x: the parameters of the model.
:type x: np.array
"""
try:
self._set_params_transformed(x)
self._fail_count = 0
except (LinAlgError, ZeroDivisionError, ValueError) as e:
if self._fail_count >= self._allowed_failures:
raise e
self._fail_count += 1
return np.inf
return -self.log_likelihood() - self.log_prior()
def objective_function_gradients(self, x):
"""
Gets the gradients from the likelihood and the priors.
Failures are handled robustly. The algorithm will try several times to
return the gradients, and will raise the original exception if
the gradients cannot be computed.
:param x: the parameters of the model.
:type x: np.array
"""
try:
self._set_params_transformed(x)
obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
self._fail_count = 0
except (LinAlgError, ZeroDivisionError, ValueError) as e:
if self._fail_count >= self._allowed_failures:
raise e
self._fail_count += 1
obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100)
return obj_grads
def objective_and_gradients(self, x):
"""
Compute the objective function of the model and the gradient of the model at the point given by x.
:param x: the point at which gradients are to be computed.
:type x: np.array
"""
try:
self._set_params_transformed(x)
obj_f = -self.log_likelihood() - self.log_prior()
self._fail_count = 0
obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
except (LinAlgError, ZeroDivisionError, ValueError) as e:
if self._fail_count >= self._allowed_failures:
raise e
self._fail_count += 1
obj_f = np.inf
obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100)
return obj_f, obj_grads
def optimize(self, optimizer=None, start=None, **kwargs):
"""
Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
kwargs are passed to the optimizer. They can be:
:param max_f_eval: maximum number of function evaluations
:type max_f_eval: int
:param messages: whether to display during optimisation
:type messages: bool
:param optimizer: which optimizer to use (defaults to self.preferred_optimizer)
:type optimizer: string TODO: valid strings?
"""
if optimizer is None:
optimizer = self.preferred_optimizer
if start is None:
start = self._get_params_transformed()
optimizer = optimization.get_optimizer(optimizer)
opt = optimizer(start, model=self, **kwargs)
opt.run(f_fp=self.objective_and_gradients, f=self.objective_function, fp=self.objective_function_gradients)
self.optimization_runs.append(opt)
self._set_params_transformed(opt.x_opt)
def optimize_SGD(self, momentum=0.1, learning_rate=0.01, iterations=20, **kwargs):
# assert self.Y.shape[1] > 1, "SGD only works with D > 1"
sgd = SGD.StochasticGD(self, iterations, learning_rate, momentum, **kwargs) # @UndefinedVariable
sgd.run()
self.optimization_runs.append(sgd)
def Laplace_covariance(self):
"""return the covariance matrix of a Laplace approximation at the current (stationary) point."""
# TODO add in the prior contributions for MAP estimation
# TODO fix the hessian for tied, constrained and fixed components
if hasattr(self, 'log_likelihood_hessian'):
A = -self.log_likelihood_hessian()
else:
print "numerically calculating Hessian. please be patient!"
x = self._get_params()
def f(x):
self._set_params(x)
return self.log_likelihood()
import numdifftools as ndt # deferred import: only needed for the numerical Hessian
h = ndt.Hessian(f)
A = -h(x)
self._set_params(x)
# check for almost zero components on the diagonal which screw up the cholesky
aa = np.nonzero((np.diag(A) < 1e-6) & (np.diag(A) > 0.))[0]
A[aa, aa] = 0.
return A
def Laplace_evidence(self):
"""Returns an estiamte of the model evidence based on the Laplace approximation.
Uses a numerical estimate of the Hessian if none is available analytically."""
A = self.Laplace_covariance()
try:
hld = np.sum(np.log(np.diag(jitchol(A)[0])))
except:
return np.nan
return 0.5 * self._get_params().size * np.log(2 * np.pi) + self.log_likelihood() - hld
def __str__(self, names=None):
if names is None:
names = self._get_print_names()
s = Parameterized.__str__(self, names=names).split('\n')
# add priors to the string
if self.priors is not None:
strs = [str(p) if p is not None else '' for p in self.priors]
else:
strs = [''] * len(self._get_param_names())
name_indices = self.grep_param_names("|".join(names))
strs = np.array(strs)[name_indices]
width = np.array(max([len(p) for p in strs] + [5])) + 4
log_like = self.log_likelihood()
log_prior = self.log_prior()
obj_funct = '\nLog-likelihood: {0:.3e}'.format(log_like)
if len(''.join(strs)) != 0:
obj_funct += ', Log prior: {0:.3e}, LL+prior = {1:.3e}'.format(log_prior, log_like + log_prior)
obj_funct += '\n\n'
s[0] = obj_funct + s[0]
s[0] += "|{h:^{col}}".format(h='prior', col=width)
s[1] += '-' * (width + 1)
for p in range(2, len(strs) + 2):
s[p] += '|{prior:^{width}}'.format(prior=strs[p - 2], width=width)
return '\n'.join(s)
def checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3):
"""
Check the gradient of the model by comparing to a numerical
estimate. If the verbose flag is passed, individual
components are tested (and printed)
:param verbose: If True, print a "full" checking of each parameter
:type verbose: bool
:param step: The size of the step around which to linearise the objective
:type step: float (default 1e-6)
:param tolerance: the tolerance allowed (see note)
:type tolerance: float (default 1e-3)
Note:-
The gradient is considered correct if the ratio of the analytical
and numerical gradients is within <tolerance> of unity.
"""
x = self._get_params_transformed().copy()
if not verbose:
# just check the global ratio
dx = step * np.sign(np.random.uniform(-1, 1, x.size))
# evaluate around the point x
f1, g1 = self.objective_and_gradients(x + dx)
f2, g2 = self.objective_and_gradients(x - dx)
gradient = self.objective_function_gradients(x)
numerical_gradient = (f1 - f2) / (2 * dx)
global_ratio = (f1 - f2) / (2 * np.dot(dx, gradient))
return (np.abs(1. - global_ratio) < tolerance) or (np.abs(gradient - numerical_gradient).mean() - 1) < tolerance
else:
# check the gradient of each parameter individually, and do some pretty printing
try:
names = self._get_param_names_transformed()
except NotImplementedError:
names = ['Variable %i' % i for i in range(len(x))]
# Prepare for pretty-printing
header = ['Name', 'Ratio', 'Difference', 'Analytical', 'Numerical']
max_names = max([len(names[i]) for i in range(len(names))] + [len(header[0])])
float_len = 10
cols = [max_names]
cols.extend([max(float_len, len(header[i])) for i in range(1, len(header))])
cols = np.array(cols) + 5
header_string = ["{h:^{col}}".format(h=header[i], col=cols[i]) for i in range(len(cols))]
header_string = map(lambda x: '|'.join(x), [header_string])
separator = '-' * len(header_string[0])
print '\n'.join([header_string[0], separator])
if target_param is None:
param_list = range(len(x))
else:
param_list = self.grep_param_names(target_param, transformed=True, search=True)
if not np.any(param_list):
print "No free parameters to check"
return
for i in param_list:
xx = x.copy()
xx[i] += step
f1, g1 = self.objective_and_gradients(xx)
xx[i] -= 2.*step
f2, g2 = self.objective_and_gradients(xx)
gradient = self.objective_function_gradients(x)[i]
numerical_gradient = (f1 - f2) / (2 * step)
ratio = (f1 - f2) / (2 * step * gradient)
difference = np.abs((f1 - f2) / 2 / step - gradient)
if (np.abs(1. - ratio) < tolerance) or np.abs(difference) < tolerance:
formatted_name = "\033[92m {0} \033[0m".format(names[i])
else:
formatted_name = "\033[91m {0} \033[0m".format(names[i])
r = '%.6f' % float(ratio)
d = '%.6f' % float(difference)
g = '%.6f' % gradient
ng = '%.6f' % float(numerical_gradient)
grad_string = "{0:^{c0}}|{1:^{c1}}|{2:^{c2}}|{3:^{c3}}|{4:^{c4}}".format(formatted_name, r, d, g, ng, c0=cols[0] + 9, c1=cols[1], c2=cols[2], c3=cols[3], c4=cols[4])
print grad_string
def input_sensitivity(self):
"""
return an array describing the sensitivity of the model to each input
NB. Right now, we're basing this on the lengthscales (or
variances) of the kernel. TODO: proper sensitivity analysis
where we integrate across the model inputs and evaluate the
effect on the variance of the model output. """
if not hasattr(self, 'kern'):
raise ValueError, "this model has no kernel"
k = [p for p in self.kern.parts if p.name in ['rbf', 'linear', 'rbf_inv']]
if (not len(k) == 1) or (not k[0].ARD):
raise ValueError, "cannot determine sensitivity for this kernel"
k = k[0]
if k.name == 'rbf':
return 1. / k.lengthscale
elif k.name == 'rbf_inv':
return k.inv_lengthscale
elif k.name == 'linear':
return k.variances
def pseudo_EM(self, epsilon=.1, **kwargs):
"""
TODO: Should this not be in the GP class?
EM - like algorithm for Expectation Propagation and Laplace approximation
kwargs are passed to the optimize function. They can be:
:epsilon: convergence criterion
:max_f_eval: maximum number of function evaluations
:messages: whether to display during optimisation
:param optimizer: which optimizer to use (defaults to self.preferred_optimizer)
:type optimizer: string TODO: valid strings?
"""
assert isinstance(self.likelihood, likelihoods.EP), "pseudo_EM is only available for EP likelihoods"
ll_change = epsilon + 1.
iteration = 0
last_ll = -np.inf
convergence = False
alpha = 0
stop = False
while not stop:
last_approximation = self.likelihood.copy()
last_params = self._get_params()
self.update_likelihood_approximation()
new_ll = self.log_likelihood()
ll_change = new_ll - last_ll
if ll_change < 0:
self.likelihood = last_approximation # restore previous likelihood approximation
self._set_params(last_params) # restore model parameters
print "Log-likelihood decrement: %s \nLast likelihood update discarded." % ll_change
stop = True
else:
self.optimize(**kwargs)
last_ll = self.log_likelihood()
if ll_change < epsilon:
stop = True
iteration += 1
if stop:
print "%s iterations." % iteration
self.update_likelihood_approximation()
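A hedged walk through the Model utilities defined above, for some already-built model m; the Gamma prior constructor is an assumption about GPy.priors:

import GPy

m.ensure_default_constraints()                          # variance/lengthscale/precision/kappa -> positive
m.set_prior('.*lengthscale', GPy.priors.Gamma(1., 1.))  # assumed prior class; its domain must suit the constraint
m.randomize()                                           # start point from priors where set, else N(0,1)
m.optimize_restarts(num_restarts=5, robust=True)        # keep the best of 5 runs, swallow failures
assert m.checkgrad()                                    # analytic/numerical gradient ratio within tolerance of 1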

385
GPy/core/parameterized.py Normal file

@@ -0,0 +1,385 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import re
import copy
import cPickle
import warnings
import transformations
class Parameterized(object):
def __init__(self):
"""
This is the base class for model and kernel. Mostly just handles tying and constraining of parameters
"""
self.tied_indices = []
self.fixed_indices = []
self.fixed_values = []
self.constrained_indices = []
self.constraints = []
def _get_params(self):
raise NotImplementedError, "this needs to be implemented to use the Parameterized class"
def _set_params(self, x):
raise NotImplementedError, "this needs to be implemented to use the Parameterized class"
def _get_param_names(self):
raise NotImplementedError, "this needs to be implemented to use the Parameterized class"
def _get_print_names(self):
""" Override for which names to print out, when using print m """
return self._get_param_names()
def pickle(self, filename, protocol=None):
if protocol is None:
if self._has_get_set_state():
protocol = 0
else:
protocol = -1
with open(filename, 'w') as f:
cPickle.dump(self, f, protocol)
def copy(self):
"""Returns a (deep) copy of the current model """
return copy.deepcopy(self)
def __getstate__(self):
if self._has_get_set_state():
return self.getstate()
return self.__dict__
def __setstate__(self, state):
if self._has_get_set_state():
self.setstate(state) # set state
self._set_params(self._get_params()) # restore all values
return
self.__dict__ = state
def _has_get_set_state(self):
return 'getstate' in vars(self.__class__) and 'setstate' in vars(self.__class__)
def getstate(self):
"""
Get the current state of the class,
here just all the indices, rest can get recomputed
For inheriting from Parameterized:
Always append the state of the inherited object
and call down to the inherited object in setstate!!
"""
return [self.tied_indices,
self.fixed_indices,
self.fixed_values,
self.constrained_indices,
self.constraints]
def setstate(self, state):
self.constraints = state.pop()
self.constrained_indices = state.pop()
self.fixed_values = state.pop()
self.fixed_indices = state.pop()
self.tied_indices = state.pop()
def __getitem__(self, regexp, return_names=False):
"""
Get a model parameter by name. The name is applied as a regular
expression and all parameters that match that regular expression are
returned.
"""
matches = self.grep_param_names(regexp)
if len(matches):
if return_names:
return self._get_params()[matches], np.asarray(self._get_param_names())[matches].tolist()
else:
return self._get_params()[matches]
else:
raise AttributeError, "no parameter matches %s" % regexp
def __setitem__(self, name, val):
"""
Set model parameter(s) by name. The name is provided as a regular
expression. All parameters matching that regular expression are set to
the given value.
"""
matches = self.grep_param_names(name)
if len(matches):
val = np.array(val)
assert (val.size == 1) or val.size == len(matches), "Shape mismatch: {}:({},)".format(val.size, len(matches))
x = self._get_params()
x[matches] = val
self._set_params(x)
else:
raise AttributeError, "no parameter matches %s" % name
def tie_params(self, regexp):
"""
Tie (all!) parameters matching the regular expression `regexp`.
"""
matches = self.grep_param_names(regexp)
assert matches.size > 0, "need at least something to tie together"
if len(self.tied_indices):
assert not np.any(matches[:, None] == np.hstack(self.tied_indices)), "Some indices are already tied!"
self.tied_indices.append(matches)
# TODO only one of the priors will be evaluated. Give a warning message if the priors are not identical
if hasattr(self, 'prior'):
pass
self._set_params_transformed(self._get_params_transformed()) # sets tied parameters to single value
def untie_everything(self):
"""Unties all parameters by setting tied_indices to an empty list."""
self.tied_indices = []
def grep_param_names(self, regexp, transformed=False, search=False):
"""
:param regexp: regular expression to select parameter names
:type regexp: re | str | int
:rtype: the indices of self._get_param_names which match the regular expression.
.. Note:: Other objects are passed through, i.e. integers which were not meant for grepping.
"""
if transformed:
names = self._get_param_names_transformed()
else:
names = self._get_param_names()
if type(regexp) in [str, np.string_, np.str]:
regexp = re.compile(regexp)
elif type(regexp) is re._pattern_type:
pass
else:
return regexp
if search:
return np.nonzero([regexp.search(name) for name in names])[0]
else:
return np.nonzero([regexp.match(name) for name in names])[0]
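# For example, m.grep_param_names('rbf') on the hypothetical model above would
# return the indices of 'rbf_variance' and 'rbf_lengthscale' within
# _get_param_names(); a plain integer index is passed through unchanged.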
def num_params_transformed(self):
removed = 0
for tie in self.tied_indices:
removed += tie.size - 1
for fix in self.fixed_indices:
removed += fix.size
return len(self._get_params()) - removed
def unconstrain(self, regexp):
"""Unconstrain matching parameters. Does not untie parameters"""
matches = self.grep_param_names(regexp)
# transformed constraints:
for match in matches:
self.constrained_indices = [i[i != match] for i in self.constrained_indices]
# remove empty constraints
tmp = [(i, t) for i, t in zip(self.constrained_indices, self.constraints) if len(i)]
if tmp:
self.constrained_indices, self.constraints = zip(*tmp)
self.constrained_indices, self.constraints = list(self.constrained_indices), list(self.constraints)
else:
self.constrained_indices, self.constraints = [], []
# fixed:
self.fixed_values = [np.delete(values, np.nonzero(np.sum(indices[:, None] == matches[None, :], 1))[0]) for indices, values in zip(self.fixed_indices, self.fixed_values)]
self.fixed_indices = [np.delete(indices, np.nonzero(np.sum(indices[:, None] == matches[None, :], 1))[0]) for indices in self.fixed_indices]
# remove empty elements
tmp = [(i, v) for i, v in zip(self.fixed_indices, self.fixed_values) if len(i)]
if tmp:
self.fixed_indices, self.fixed_values = zip(*tmp)
self.fixed_indices, self.fixed_values = list(self.fixed_indices), list(self.fixed_values)
else:
self.fixed_indices, self.fixed_values = [], []
def constrain_negative(self, regexp):
""" Set negative constraints. """
self.constrain(regexp, transformations.negative_logexp())
def constrain_positive(self, regexp):
""" Set positive constraints. """
self.constrain(regexp, transformations.logexp())
def constrain_bounded(self, regexp, lower, upper):
""" Set bounded constraints. """
self.constrain(regexp, transformations.logistic(lower, upper))
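# A minimal sketch of the constraint helpers (hypothetical parameter names):
#   m.constrain_positive('.*variance')               # logexp transform, domain (0, inf)
#   m.constrain_bounded('.*lengthscale', 1e-2, 1e2)  # logistic transform
#   m.constrain_fixed('bias_variance', 1.)           # removed from the optimiser's view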
def all_constrained_indices(self):
if len(self.constrained_indices) or len(self.fixed_indices):
return np.hstack(self.constrained_indices + self.fixed_indices)
else:
return np.empty(shape=(0,))
def constrain(self, regexp, transform):
assert isinstance(transform, transformations.transformation)
matches = self.grep_param_names(regexp)
overlap = set(matches).intersection(set(self.all_constrained_indices()))
if overlap:
self.unconstrain(np.asarray(list(overlap)))
print 'Warning: re-constraining these parameters'
pn = self._get_param_names()
for i in overlap:
print pn[i]
self.constrained_indices.append(matches)
self.constraints.append(transform)
x = self._get_params()
x[matches] = transform.initialize(x[matches])
self._set_params(x)
def constrain_fixed(self, regexp, value=None):
"""
Arguments
---------
:param regexp: which parameters need to be fixed.
:type regexp: ndarray(dtype=int) or regular expression object or string
:param value: the value to fix the parameters to. If the value is not specified,
the parameter is fixed to its current value
:type value: float
Notes
-----
Fixing a parameter which is tied to another, or constrained in some way will result in an error.
To fix multiple parameters to the same value, simply pass a regular expression which matches both parameter names, or pass both of the indexes
"""
matches = self.grep_param_names(regexp)
overlap = set(matches).intersection(set(self.all_constrained_indices()))
if overlap:
self.unconstrain(np.asarray(list(overlap)))
print 'Warning: re-constraining these parameters'
pn = self._get_param_names()
for i in overlap:
print pn[i]
self.fixed_indices.append(matches)
if value is not None:
self.fixed_values.append(value)
else:
self.fixed_values.append(self._get_params()[self.fixed_indices[-1]])
# self.fixed_values.append(value)
self._set_params_transformed(self._get_params_transformed())
def _get_params_transformed(self):
"""use self._get_params to get the 'true' parameters of the model, which are then tied, constrained and fixed"""
x = self._get_params()
[np.put(x, i, t.finv(x[i])) for i, t in zip(self.constrained_indices, self.constraints)]
to_remove = self.fixed_indices + [t[1:] for t in self.tied_indices]
if len(to_remove):
return np.delete(x, np.hstack(to_remove))
else:
return x
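# Worked example: with parameters [a, b, c, d], b fixed and (c, d) tied, the
# optimiser sees only [finv(a), finv(c)] (finv being each parameter's constraint
# transform); _untransform_params below reinserts b and copies c onto d on the
# way back to the model's parameter vector.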
def _set_params_transformed(self, x):
""" takes the vector x, which is then modified (by untying, reparameterising or inserting fixed values), and then call self._set_params"""
self._set_params(self._untransform_params(x))
def _untransform_params(self, x):
"""
The transformation required for _set_params_transformed.
This moves the vector x seen by the optimiser (unconstrained) to the
valid parameter vector seen by the model
Note:
- This function is separate from _set_params_transformed for downstream flexibility
"""
# work out how many places are fixed, and where they are. tricky logic!
fix_places = self.fixed_indices + [t[1:] for t in self.tied_indices]
if len(fix_places):
fix_places = np.hstack(fix_places)
Nfix_places = fix_places.size
else:
Nfix_places = 0
free_places = np.setdiff1d(np.arange(Nfix_places + x.size, dtype=np.int), fix_places)
# put the models values in the vector xx
xx = np.zeros(Nfix_places + free_places.size, dtype=np.float64)
xx[free_places] = x
[np.put(xx, i, v) for i, v in zip(self.fixed_indices, self.fixed_values)]
[np.put(xx, i, v) for i, v in [(t[1:], xx[t[0]]) for t in self.tied_indices] ]
[np.put(xx, i, t.f(xx[i])) for i, t in zip(self.constrained_indices, self.constraints)]
if hasattr(self, 'debug'):
stop # deliberate NameError, used as a crude breakpoint when self.debug is set # @UndefinedVariable
return xx
def _get_param_names_transformed(self):
"""
Returns the parameter names as propagated after constraining,
tying or fixing, i.e. a list of the same length as _get_params_transformed()
"""
n = self._get_param_names()
# remove/concatenate the tied parameter names
if len(self.tied_indices):
for t in self.tied_indices:
n[t[0]] = "<tie>".join([n[tt] for tt in t])
remove = np.hstack([t[1:] for t in self.tied_indices])
else:
remove = np.empty(shape=(0,), dtype=np.int)
# also remove the fixed params
if len(self.fixed_indices):
remove = np.hstack((remove, np.hstack(self.fixed_indices)))
# add markers to show that some variables are constrained
for i, t in zip(self.constrained_indices, self.constraints):
for ii in i:
n[ii] = n[ii] + t.__str__()
n = [nn for i, nn in enumerate(n) if i not in remove]
return n
@property
def all(self):
return self.__str__(self._get_param_names())
def __str__(self, names=None, nw=30):
"""
Return a string describing the parameter names and their ties and constraints
"""
if names is None:
names = self._get_print_names()
name_indices = self.grep_param_names("|".join(names))
N = len(names)
if not N:
return "This object has no free parameters."
header = ['Name', 'Value', 'Constraints', 'Ties']
values = self._get_params()[name_indices] # map(str,self._get_params())
# sort out the constraints
constraints = [''] * len(self._get_param_names())
for i, t in zip(self.constrained_indices, self.constraints):
for ii in i:
constraints[ii] = t.__str__()
for i in self.fixed_indices:
for ii in i:
constraints[ii] = 'Fixed'
# sort out the ties
ties = [''] * len(self._get_param_names()) # size to the full parameter vector: tied_indices index into it
for i, tie in enumerate(self.tied_indices):
for j in tie:
ties[j] = '(' + str(i) + ')'
values = ['%.4f' % float(v) for v in values]
max_names = max([len(names[i]) for i in range(len(names))] + [len(header[0])])
max_values = max([len(values[i]) for i in range(len(values))] + [len(header[1])])
max_constraint = max([len(constraints[i]) for i in range(len(constraints))] + [len(header[2])])
max_ties = max([len(ties[i]) for i in range(len(ties))] + [len(header[3])])
cols = np.array([max_names, max_values, max_constraint, max_ties]) + 4
# columns = cols.sum()
header_string = ["{h:^{col}}".format(h=header[i], col=cols[i]) for i in range(len(cols))]
header_string = ['|'.join(header_string)]
separator = '-' * len(header_string[0])
param_string = ["{n:^{c0}}|{v:^{c1}}|{c:^{c2}}|{t:^{c3}}".format(n=names[i], v=values[i], c=constraints[i], t=ties[i], c0=cols[0], c1=cols[1], c2=cols[2], c3=cols[3]) for i in range(len(values))]
return ('\n'.join([header_string[0], separator] + param_string)) + '\n'

217
GPy/core/priors.py Normal file
View file

@@ -0,0 +1,217 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from scipy.special import gammaln, digamma
from ..util.linalg import pdinv
from GPy.core.domains import REAL, POSITIVE
import warnings
class Prior:
domain = None
def pdf(self, x):
return np.exp(self.lnpdf(x))
def plot(self):
rvs = self.rvs(1000)
pb.hist(rvs, 100, normed=True)
xmin, xmax = pb.xlim()
xx = np.linspace(xmin, xmax, 1000)
pb.plot(xx, self.pdf(xx), 'r', linewidth=2)
class Gaussian(Prior):
"""
Implementation of the univariate Gaussian probability function, coupled with random variables.
:param mu: mean
:param sigma: standard deviation
.. Note:: Bishop 2006 notation is used throughout the code
"""
domain = REAL
def __init__(self, mu, sigma):
self.mu = float(mu)
self.sigma = float(sigma)
self.sigma2 = np.square(self.sigma)
self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)
def __str__(self):
return "N(" + str(np.round(self.mu)) + ', ' + str(np.round(self.sigma2)) + ')'
def lnpdf(self, x):
return self.constant - 0.5 * np.square(x - self.mu) / self.sigma2
def lnpdf_grad(self, x):
return -(x - self.mu) / self.sigma2
def rvs(self, n):
return np.random.randn(n) * self.sigma + self.mu
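# A minimal usage sketch: with p = Gaussian(mu=0., sigma=1.), p.lnpdf(0.)
# evaluates to -0.5 * log(2 * pi) (about -0.919), and p.rvs(5) draws five
# samples from the prior.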
class LogGaussian(Prior):
"""
Implementation of the univariate *log*-Gaussian probability function, coupled with random variables.
:param mu: mean
:param sigma: standard deviation
.. Note:: Bishop 2006 notation is used throughout the code
"""
domain = POSITIVE
def __init__(self, mu, sigma):
self.mu = float(mu)
self.sigma = float(sigma)
self.sigma2 = np.square(self.sigma)
self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)
def __str__(self):
return "lnN(" + str(np.round(self.mu)) + ', ' + str(np.round(self.sigma2)) + ')'
def lnpdf(self, x):
return self.constant - 0.5 * np.square(np.log(x) - self.mu) / self.sigma2 - np.log(x)
def lnpdf_grad(self, x):
return -((np.log(x) - self.mu) / self.sigma2 + 1.) / x
def rvs(self, n):
return np.exp(np.random.randn(n) * self.sigma + self.mu)
class MultivariateGaussian:
"""
Implementation of the multivariate Gaussian probability function, coupled with random variables.
:param mu: mean (N-dimensional array)
:param var: covariance matrix (NxN)
.. Note:: Bishop 2006 notation is used throughout the code
"""
domain = REAL
def __init__(self, mu, var):
self.mu = np.array(mu).flatten()
self.var = np.array(var)
assert len(self.var.shape) == 2
assert self.var.shape[0] == self.var.shape[1]
assert self.var.shape[0] == self.mu.size
self.input_dim = self.mu.size
self.inv, self.hld = pdinv(self.var)
self.constant = -0.5 * self.input_dim * np.log(2 * np.pi) - self.hld
def summary(self):
raise NotImplementedError
def pdf(self, x):
return np.exp(self.lnpdf(x))
def lnpdf(self, x):
d = x - self.mu
return self.constant - 0.5 * np.sum(d * np.dot(d, self.inv), 1)
def lnpdf_grad(self, x):
d = x - self.mu
return -np.dot(self.inv, d)
def rvs(self, n):
return np.random.multivariate_normal(self.mu, self.var, n)
def plot(self):
if self.input_dim == 2:
rvs = self.rvs(200)
pb.plot(rvs[:, 0], rvs[:, 1], 'kx', mew=1.5)
xmin, xmax = pb.xlim()
ymin, ymax = pb.ylim()
xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
xflat = np.vstack((xx.flatten(), yy.flatten())).T
zz = self.pdf(xflat).reshape(100, 100)
pb.contour(xx, yy, zz, linewidths=2)
def gamma_from_EV(E, V):
warnings.warn("use Gamma.from_EV to create Gamma Prior", FutureWarning)
return Gamma.from_EV(E, V)
class Gamma(Prior):
"""
Implementation of the Gamma probability function, coupled with random variables.
:param a: shape parameter
:param b: rate parameter (warning: it's the *inverse* of the scale)
.. Note:: Bishop 2006 notation is used throughout the code
"""
domain = POSITIVE
def __init__(self, a, b):
self.a = float(a)
self.b = float(b)
self.constant = -gammaln(self.a) + a * np.log(b)
def __str__(self):
return "Ga(" + str(np.round(self.a)) + ', ' + str(np.round(self.b)) + ')'
def summary(self):
ret = {"E[x]": self.a / self.b, \
"E[ln x]": digamma(self.a) - np.log(self.b), \
"var[x]": self.a / self.b / self.b, \
"Entropy": gammaln(self.a) - (self.a - 1.) * digamma(self.a) - np.log(self.b) + self.a}
if self.a > 1:
ret['Mode'] = (self.a - 1.) / self.b
else:
ret['Mode'] = np.nan
return ret
def lnpdf(self, x):
return self.constant + (self.a - 1) * np.log(x) - self.b * x
def lnpdf_grad(self, x):
return (self.a - 1.) / x - self.b
def rvs(self, n):
return np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
@staticmethod
def from_EV(E, V):
"""
Creates an instance of a Gamma Prior by specifying the Expected value(s)
and Variance(s) of the distribution.
:param E: expected value
:param V: variance
"""
a = np.square(E) / V
b = E / V
return Gamma(a, b)
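# Worked example: Gamma.from_EV(1., 0.5) solves a = E^2/V = 2 and b = E/V = 2,
# giving a Gamma(2, 2) prior whose mean a/b = 1 and variance a/b^2 = 0.5 match
# the requested moments.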
class inverse_gamma(Prior):
"""
Implementation of the inverse-Gamma probability function, coupled with random variables.
:param a: shape parameter
:param b: rate parameter (warning: it's the *inverse* of the scale)
.. Note:: Bishop 2006 notation is used throughout the code
"""
domain = POSITIVE
def __init__(self, a, b):
self.a = float(a)
self.b = float(b)
self.constant = -gammaln(self.a) + a * np.log(b)
def __str__(self):
return "iGa(" + str(np.round(self.a)) + ', ' + str(np.round(self.b)) + ')'
def lnpdf(self, x):
return self.constant - (self.a + 1) * np.log(x) - self.b / x
def lnpdf_grad(self, x):
return -(self.a + 1.) / x + self.b / x ** 2
def rvs(self, n):
return 1. / np.random.gamma(scale=1. / self.b, shape=self.a, size=n)

347
GPy/core/sparse_gp.py Normal file
View file

@@ -0,0 +1,347 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from ..util.linalg import mdot, jitchol, tdot, symmetrify, backsub_both_sides, chol_inv, dtrtrs, dpotrs, dpotri
from scipy import linalg
from ..likelihoods import Gaussian
from gp_base import GPBase
class SparseGP(GPBase):
"""
Variational sparse GP model
:param X: inputs
:type X: np.ndarray (num_data x input_dim)
:param likelihood: a likelihood instance, containing the observed data
:type likelihood: GPy.likelihood.(Gaussian | EP | Laplace)
:param kernel: the kernel (covariance function); see the GPy.kern module
:type kernel: a GPy.kern.kern instance
:param X_variance: The uncertainty in the measurements of X (Gaussian variance)
:type X_variance: np.ndarray (num_data x input_dim) | None
:param Z: inducing inputs (optional, see note)
:type Z: np.ndarray (num_inducing x input_dim) | None
:param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
:type normalize_(X|Y): bool
"""
def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False):
GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
self.Z = Z
self.num_inducing = Z.shape[0]
# self.likelihood = likelihood
if X_variance is None:
self.has_uncertain_inputs = False
self.X_variance = None
else:
assert X_variance.shape == X.shape
self.has_uncertain_inputs = True
self.X_variance = X_variance
if normalize_X:
self.Z = (self.Z.copy() - self._Xoffset) / self._Xscale
# normalize X uncertainty also
if self.has_uncertain_inputs:
self.X_variance /= np.square(self._Xscale)
self._const_jitter = None
def getstate(self):
"""
Get the current state of the class,
here just all the indices, rest can get recomputed
"""
return GPBase.getstate(self) + [self.Z,
self.num_inducing,
self.has_uncertain_inputs,
self.X_variance]
def setstate(self, state):
self.X_variance = state.pop()
self.has_uncertain_inputs = state.pop()
self.num_inducing = state.pop()
self.Z = state.pop()
GPBase.setstate(self, state)
def _compute_kernel_matrices(self):
# kernel computations, using BGPLVM notation
self.Kmm = self.kern.K(self.Z)
if self.has_uncertain_inputs:
self.psi0 = self.kern.psi0(self.Z, self.X, self.X_variance)
self.psi1 = self.kern.psi1(self.Z, self.X, self.X_variance)
self.psi2 = self.kern.psi2(self.Z, self.X, self.X_variance)
else:
self.psi0 = self.kern.Kdiag(self.X)
self.psi1 = self.kern.K(self.X, self.Z)
self.psi2 = None
def _computations(self):
if self._const_jitter is None or not(self._const_jitter.shape[0] == self.num_inducing):
self._const_jitter = np.eye(self.num_inducing) * 1e-7
# factor Kmm
self._Lm = jitchol(self.Kmm + self._const_jitter)
# TODO: no white kernel needed anymore, all noise in likelihood --------
# The rather complex computations of self._A
if self.has_uncertain_inputs:
if self.likelihood.is_heteroscedastic:
psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.num_data, 1, 1))).sum(0)
else:
psi2_beta = self.psi2.sum(0) * self.likelihood.precision
evals, evecs = linalg.eigh(psi2_beta)
clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
if not np.array_equal(evals, clipped_evals):
pass # print evals
tmp = evecs * np.sqrt(clipped_evals)
tmp = tmp.T
else:
if self.likelihood.is_heteroscedastic:
tmp = self.psi1 * (np.sqrt(self.likelihood.precision.flatten().reshape(self.num_data, 1)))
else:
tmp = self.psi1 * (np.sqrt(self.likelihood.precision))
tmp, _ = dtrtrs(self._Lm, np.asfortranarray(tmp.T), lower=1)
self._A = tdot(tmp)
# factor B
self.B = np.eye(self.num_inducing) + self._A
self.LB = jitchol(self.B)
# VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
self.psi1Vf = np.dot(self.psi1.T, self.likelihood.VVT_factor)
# back substitute C into psi1Vf
tmp, info1 = dtrtrs(self._Lm, np.asfortranarray(self.psi1Vf), lower=1, trans=0)
self._LBi_Lmi_psi1Vf, _ = dtrtrs(self.LB, np.asfortranarray(tmp), lower=1, trans=0)
# tmp, info2 = dpotrs(self.LB, tmp, lower=1)
tmp, info2 = dtrtrs(self.LB, self._LBi_Lmi_psi1Vf, lower=1, trans=1)
self.Cpsi1Vf, info3 = dtrtrs(self._Lm, tmp, lower=1, trans=1)
# Compute dL_dKmm
tmp = tdot(self._LBi_Lmi_psi1Vf)
self.data_fit = np.trace(tmp)
self.DBi_plus_BiPBi = backsub_both_sides(self.LB, self.output_dim * np.eye(self.num_inducing) + tmp)
tmp = -0.5 * self.DBi_plus_BiPBi
tmp += -0.5 * self.B * self.output_dim
tmp += self.output_dim * np.eye(self.num_inducing)
self.dL_dKmm = backsub_both_sides(self._Lm, tmp)
# Compute dL_dpsi # FIXME: this is untested for the heteroscedastic + uncertain inputs case
self.dL_dpsi0 = -0.5 * self.output_dim * (self.likelihood.precision * np.ones([self.num_data, 1])).flatten()
self.dL_dpsi1 = np.dot(self.likelihood.VVT_factor, self.Cpsi1Vf.T)
dL_dpsi2_beta = 0.5 * backsub_both_sides(self._Lm, self.output_dim * np.eye(self.num_inducing) - self.DBi_plus_BiPBi)
if self.likelihood.is_heteroscedastic:
if self.has_uncertain_inputs:
self.dL_dpsi2 = self.likelihood.precision.flatten()[:, None, None] * dL_dpsi2_beta[None, :, :]
else:
self.dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (self.psi1 * self.likelihood.precision.reshape(self.num_data, 1)).T).T
self.dL_dpsi2 = None
else:
dL_dpsi2 = self.likelihood.precision * dL_dpsi2_beta
if self.has_uncertain_inputs:
# repeat for each of the N psi_2 matrices
self.dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], self.num_data, axis=0)
else:
# subsume back into psi1 (==Kmn)
self.dL_dpsi1 += 2.*np.dot(self.psi1, dL_dpsi2)
self.dL_dpsi2 = None
# the partial derivative vector for the likelihood
if self.likelihood.Nparams == 0:
# save computation here.
self.partial_for_likelihood = None
elif self.likelihood.is_heteroscedastic:
raise NotImplementedError, "heteroscedastic derivatives not implemented"
else:
# likelihood is not heteroscedastic
self.partial_for_likelihood = -0.5 * self.num_data * self.output_dim * self.likelihood.precision + 0.5 * self.likelihood.trYYT * self.likelihood.precision ** 2
self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0.sum() * self.likelihood.precision ** 2 - np.trace(self._A) * self.likelihood.precision)
self.partial_for_likelihood += self.likelihood.precision * (0.5 * np.sum(self._A * self.DBi_plus_BiPBi) - self.data_fit)
def log_likelihood(self):
""" Compute the (lower bound on the) log marginal likelihood """
if self.likelihood.is_heteroscedastic:
A = -0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.likelihood.V * self.likelihood.Y)
B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self._A))
else:
A = -0.5 * self.num_data * self.output_dim * (np.log(2.*np.pi) - np.log(self.likelihood.precision)) - 0.5 * self.likelihood.precision * self.likelihood.trYYT
B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self._A))
C = -self.output_dim * (np.sum(np.log(np.diag(self.LB)))) # + 0.5 * self.num_inducing * np.log(sf2))
D = 0.5 * self.data_fit
return A + B + C + D + self.likelihood.Z
def _set_params(self, p):
self.Z = p[:self.num_inducing * self.input_dim].reshape(self.num_inducing, self.input_dim)
self.kern._set_params(p[self.Z.size:self.Z.size + self.kern.num_params])
self.likelihood._set_params(p[self.Z.size + self.kern.num_params:])
self._compute_kernel_matrices()
self._computations()
self.Cpsi1V = None
def _get_params(self):
return np.hstack([self.Z.flatten(), self.kern._get_params_transformed(), self.likelihood._get_params()])
def _get_param_names(self):
return sum([['iip_%i_%i' % (i, j) for j in range(self.Z.shape[1])] for i in range(self.Z.shape[0])], [])\
+ self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
def _get_print_names(self):
return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
def update_likelihood_approximation(self):
"""
Approximates a non-Gaussian likelihood using Expectation Propagation.
For a Gaussian likelihood, no iteration is required:
this function does nothing.
"""
if not isinstance(self.likelihood, Gaussian): # Updates not needed for Gaussian likelihood
self.likelihood.restart()
if self.has_uncertain_inputs:
Lmi = chol_inv(self._Lm)
Kmmi = tdot(Lmi.T)
diag_tr_psi2Kmmi = np.array([np.trace(psi2_Kmmi) for psi2_Kmmi in np.dot(self.psi2, Kmmi)])
self.likelihood.fit_FITC(self.Kmm, self.psi1.T, diag_tr_psi2Kmmi) # This uses the fit_FITC code, but does not perform FITC-EP. TODO: resolve the potential confusion
# raise NotImplementedError, "EP approximation not implemented for uncertain inputs"
else:
self.likelihood.fit_DTC(self.Kmm, self.psi1.T)
# self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0)
self._set_params(self._get_params()) # update the GP
def _log_likelihood_gradients(self):
return np.hstack((self.dL_dZ().flatten(), self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))
def dL_dtheta(self):
"""
Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel
"""
dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm, self.Z)
if self.has_uncertain_inputs:
dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X, self.X_variance)
dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1, self.Z, self.X, self.X_variance)
dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X, self.X_variance)
else:
dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1, self.X, self.Z)
dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X)
return dL_dtheta
def dL_dZ(self):
"""
The derivative of the bound wrt the inducing inputs Z
"""
dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm, self.Z) # factor of two because of vertical and horizontal 'stripes' in dKmm_dZ
if self.has_uncertain_inputs:
dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1, self.Z, self.X, self.X_variance)
dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2, self.Z, self.X, self.X_variance)
else:
dL_dZ += self.kern.dK_dX(self.dL_dpsi1.T, self.Z, self.X)
return dL_dZ
def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
"""
Internal helper function for making predictions, does not account for
normalization or likelihood function
"""
Bi, _ = dpotri(self.LB, lower=0) # WTH? this lower switch should be 1, but that doesn't work!
symmetrify(Bi)
Kmmi_LmiBLmi = backsub_both_sides(self._Lm, np.eye(self.num_inducing) - Bi)
if self.Cpsi1V is None:
psi1V = np.dot(self.psi1.T, self.likelihood.V)
tmp, _ = dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0)
tmp, _ = dpotrs(self.LB, tmp, lower=1)
self.Cpsi1V, _ = dtrtrs(self._Lm, tmp, lower=1, trans=1)
if X_variance_new is None:
Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
mu = np.dot(Kx.T, self.Cpsi1V)
if full_cov:
Kxx = self.kern.K(Xnew, which_parts=which_parts)
var = Kxx - mdot(Kx.T, Kmmi_LmiBLmi, Kx) # NOTE this won't work for plotting
else:
Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts)
var = Kxx - np.sum(Kx * np.dot(Kmmi_LmiBLmi, Kx), 0)
else:
# assert which_parts == 'all', "switching out parts of variational kernels is not implemented"
Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts
mu = np.dot(Kx, self.Cpsi1V)
if full_cov:
raise NotImplementedError, "TODO"
else:
Kxx = self.kern.psi0(self.Z, Xnew, X_variance_new)
psi2 = self.kern.psi2(self.Z, Xnew, X_variance_new)
var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
return mu, var[:, None]
def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
"""
Predict the function(s) at the new point(s) Xnew.
Arguments
---------
:param Xnew: The points at which to make a prediction
:type Xnew: np.ndarray, Nnew x self.input_dim
:param X_variance_new: The uncertainty in the prediction points
:type X_variance_new: np.ndarray, Nnew x self.input_dim
:param which_parts: specifies which outputs kernel(s) to use in prediction
:type which_parts: ('all', list of bools)
:param full_cov: whether to return the full covariance matrix, or just the diagonal
:type full_cov: bool
:rtype: posterior mean, a Numpy array, Nnew x self.output_dim
:rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
:rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.output_dim
If full_cov and self.output_dim > 1, the return shape of var is Nnew x Nnew x self.output_dim. If self.output_dim == 1, the return shape is Nnew x Nnew.
This is to allow for different normalizations of the output dimensions.
"""
# normalize X values
Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
if X_variance_new is not None:
X_variance_new = X_variance_new / self._Xscale ** 2
# here's the actual prediction by the GP model
mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts)
# now push through likelihood
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
return mean, var, _025pm, _975pm
def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, fignum=None, ax=None):
if ax is None:
fig = pb.figure(num=fignum)
ax = fig.add_subplot(111)
if which_data == 'all':
which_data = slice(None)
GPBase.plot(self, samples=samples, plot_limits=plot_limits, which_data=which_data, which_parts=which_parts, resolution=resolution, levels=levels, ax=ax)
# add the inducing inputs and some errorbars
if self.X.shape[1] == 1:
if self.has_uncertain_inputs:
Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
Zu = self.Z * self._Xscale + self._Xoffset
ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
elif self.X.shape[1] == 2:
Zu = self.Z * self._Xscale + self._Xoffset
ax.plot(Zu[:, 0], Zu[:, 1], 'wo')

518
GPy/core/svigp.py Normal file
View file

@@ -0,0 +1,518 @@
# Copyright (c) 2012, James Hensman and Nicolo' Fusi
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from .. import kern
from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs, jitchol, backsub_both_sides
from ..likelihoods import EP
from gp_base import GPBase
from model import Model
import time
import sys
class SVIGP(GPBase):
"""
Stochastic Variational inference in a Gaussian Process
:param X: inputs
:type X: np.ndarray (N x Q)
:param Y: observed data
:type Y: np.ndarray of observations (N x D)
:param batchsize: the size of each mini-batch of data
Additional kwargs are used as for a sparse GP. They include
:param q_u: canonical parameters of the distribution squashed into a 1D array
:type q_u: np.ndarray
:param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
:type M: int
:param kernel: the kernel/covariance function; see the GPy.kern module
:type kernel: a GPy kernel
:param Z: inducing inputs (optional, see note)
:type Z: np.ndarray (M x Q) | None
:param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
:type X_uncertainty: np.ndarray (N x Q) | None
:param Zslices: slices for the inducing inputs (see slicing TODO: link)
:param beta: noise precision. TODO: ignore beta if doing EP
:type beta: float
:param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
:type normalize_(X|Y): bool
"""
def __init__(self, X, likelihood, kernel, Z, q_u=None, batchsize=10, X_variance=None):
GPBase.__init__(self, X, likelihood, kernel, normalize_X=False)
self.batchsize=batchsize
self.Y = self.likelihood.Y.copy()
self.Z = Z
self.num_inducing = Z.shape[0]
self.batchcounter = 0
self.epochs = 0
self.iterations = 0
self.vb_steplength = 0.05
self.param_steplength = 1e-5
self.momentum = 0.9
if X_variance is None:
self.has_uncertain_inputs = False
else:
self.has_uncertain_inputs = True
self.X_variance = X_variance
if q_u is None:
q_u = np.hstack((np.random.randn(self.num_inducing*self.output_dim),-.5*np.eye(self.num_inducing).flatten()))
self.set_vb_param(q_u)
self._permutation = np.random.permutation(self.num_data)
self.load_batch()
self._param_trace = []
self._ll_trace = []
self._grad_trace = []
#set the adaptive steplength parameters
self.hbar_t = 0.0
self.tau_t = 100.0
self.gbar_t = 0.0
self.gbar_t1 = 0.0
self.gbar_t2 = 0.0
self.hbar_tp = 0.0
self.tau_tp = 10000.0
self.gbar_tp = 0.0
self.adapt_param_steplength = True
self.adapt_vb_steplength = True
self._param_steplength_trace = []
self._vb_steplength_trace = []
def getstate(self):
steplength_params = [self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength]
return GPBase.getstate(self) + \
[self.get_vb_param(),
self.Z,
self.num_inducing,
self.has_uncertain_inputs,
self.X_variance,
self.X_batch,
self.X_variance_batch,
steplength_params,
self.batchcounter,
self.batchsize,
self.epochs,
self.momentum,
self.data_prop,
self._param_trace,
self._param_steplength_trace,
self._vb_steplength_trace,
self._ll_trace,
self._grad_trace,
self.Y,
self._permutation,
self.iterations
]
def setstate(self, state):
self.iterations = state.pop()
self._permutation = state.pop()
self.Y = state.pop()
self._grad_trace = state.pop()
self._ll_trace = state.pop()
self._vb_steplength_trace = state.pop()
self._param_steplength_trace = state.pop()
self._param_trace = state.pop()
self.data_prop = state.pop()
self.momentum = state.pop()
self.epochs = state.pop()
self.batchsize = state.pop()
self.batchcounter = state.pop()
steplength_params = state.pop()
(self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength) = steplength_params
self.X_variance_batch = state.pop()
self.X_batch = state.pop()
self.X_variance = state.pop()
self.has_uncertain_inputs = state.pop()
self.num_inducing = state.pop()
self.Z = state.pop()
vb_param = state.pop()
GPBase.setstate(self, state)
self.set_vb_param(vb_param)
def _compute_kernel_matrices(self):
# kernel computations, using BGPLVM notation
self.Kmm = self.kern.K(self.Z)
if self.has_uncertain_inputs:
self.psi0 = self.kern.psi0(self.Z, self.X_batch, self.X_variance_batch)
self.psi1 = self.kern.psi1(self.Z, self.X_batch, self.X_variance_batch)
self.psi2 = self.kern.psi2(self.Z, self.X_batch, self.X_variance_batch)
else:
self.psi0 = self.kern.Kdiag(self.X_batch)
self.psi1 = self.kern.K(self.X_batch, self.Z)
self.psi2 = None
def dL_dtheta(self):
dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm, self.Z)
if self.has_uncertain_inputs:
dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X_batch, self.X_variance_batch)
dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1, self.Z, self.X_batch, self.X_variance_batch)
dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X_batch, self.X_variance_batch)
else:
dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1, self.X_batch, self.Z)
dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X_batch)
return dL_dtheta
def _set_params(self, p, computations=True):
self.kern._set_params_transformed(p[:self.kern.num_params])
self.likelihood._set_params(p[self.kern.num_params:])
if computations:
self._compute_kernel_matrices()
self._computations()
def _get_params(self):
return np.hstack((self.kern._get_params_transformed() , self.likelihood._get_params()))
def _get_param_names(self):
return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
def load_batch(self):
"""
load a batch of data (set self.X_batch and self.likelihood.Y from self.X, self.Y)
"""
#if we've seen all the data, start again with them in a new random order
if self.batchcounter+self.batchsize > self.num_data:
self.batchcounter = 0
self.epochs += 1
self._permutation = np.random.permutation(self.num_data)
this_perm = self._permutation[self.batchcounter:self.batchcounter+self.batchsize]
self.X_batch = self.X[this_perm]
self.likelihood.set_data(self.Y[this_perm])
if self.has_uncertain_inputs:
self.X_variance_batch = self.X_variance[this_perm]
self.batchcounter += self.batchsize
self.data_prop = float(self.batchsize)/self.num_data
self._compute_kernel_matrices()
self._computations()
def _computations(self,do_Kmm=True, do_Kmm_grad=True):
"""
All of the computations needed. Some are optional, see kwargs.
"""
if do_Kmm:
self.Lm = jitchol(self.Kmm)
# The rather complex computations of self.A
if self.has_uncertain_inputs:
if self.likelihood.is_heteroscedastic:
psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.batchsize, 1, 1))).sum(0)
else:
psi2_beta = self.psi2.sum(0) * self.likelihood.precision
evals, evecs = np.linalg.eigh(psi2_beta)
clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
tmp = evecs * np.sqrt(clipped_evals)
else:
if self.likelihood.is_heteroscedastic:
tmp = self.psi1.T * (np.sqrt(self.likelihood.precision.flatten().reshape(1, self.batchsize)))
else:
tmp = self.psi1.T * (np.sqrt(self.likelihood.precision))
tmp, _ = dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1)
self.A = tdot(tmp)
self.V = self.likelihood.precision*self.likelihood.Y
self.VmT = np.dot(self.V,self.q_u_expectation[0].T)
self.psi1V = np.dot(self.psi1.T, self.V)
self.B = np.eye(self.num_inducing)*self.data_prop + self.A
self.Lambda = backsub_both_sides(self.Lm, self.B.T)
self.LQL = backsub_both_sides(self.Lm,self.q_u_expectation[1].T,transpose='right')
self.trace_K = self.psi0.sum() - np.trace(self.A)/self.likelihood.precision
self.Kmmi_m, _ = dpotrs(self.Lm, self.q_u_expectation[0], lower=1)
self.projected_mean = np.dot(self.psi1,self.Kmmi_m)
# Compute dL_dpsi
self.dL_dpsi0 = - 0.5 * self.output_dim * self.likelihood.precision * np.ones(self.batchsize)
self.dL_dpsi1, _ = dpotrs(self.Lm,np.asfortranarray(self.VmT.T),lower=1)
self.dL_dpsi1 = self.dL_dpsi1.T
dL_dpsi2 = -0.5 * self.likelihood.precision * backsub_both_sides(self.Lm, self.LQL - self.output_dim * np.eye(self.num_inducing))
if self.has_uncertain_inputs:
self.dL_dpsi2 = np.repeat(dL_dpsi2[None,:,:],self.batchsize,axis=0)
else:
self.dL_dpsi1 += 2.*np.dot(dL_dpsi2,self.psi1.T).T
self.dL_dpsi2 = None
# Compute dL_dKmm
if do_Kmm_grad:
tmp = np.dot(self.LQL,self.A) - backsub_both_sides(self.Lm,np.dot(self.q_u_expectation[0],self.psi1V.T),transpose='right')
tmp += tmp.T
tmp += -self.output_dim*self.B
tmp += self.data_prop*self.LQL
self.dL_dKmm = 0.5*backsub_both_sides(self.Lm,tmp)
#Compute the gradient of the log likelihood wrt noise variance
self.partial_for_likelihood = -0.5*(self.batchsize*self.output_dim - np.sum(self.A*self.LQL))*self.likelihood.precision
self.partial_for_likelihood += (0.5*self.output_dim*self.trace_K + 0.5 * self.likelihood.trYYT - np.sum(self.likelihood.Y*self.projected_mean))*self.likelihood.precision**2
def log_likelihood(self):
"""
As for uncollapsed sparse GP, but account for the proportion of data we're looking at right now.
NB. self.batchsize is the size of the batch, not the size of X_all
"""
assert not self.likelihood.is_heteroscedastic
A = -0.5*self.batchsize*self.output_dim*(np.log(2.*np.pi) - np.log(self.likelihood.precision))
B = -0.5*self.likelihood.precision*self.output_dim*self.trace_K
Kmm_logdet = 2.*np.sum(np.log(np.diag(self.Lm)))
C = -0.5*self.output_dim*self.data_prop*(Kmm_logdet-self.q_u_logdet - self.num_inducing)
C += -0.5*np.sum(self.LQL * self.B)
D = -0.5*self.likelihood.precision*self.likelihood.trYYT
E = np.sum(self.V*self.projected_mean)
return (A+B+C+D+E)/self.data_prop
def _log_likelihood_gradients(self):
return np.hstack((self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))/self.data_prop
def vb_grad_natgrad(self):
"""
Compute the gradients of the lower bound wrt the canonical and
Expectation parameters of u.
Note that the natural gradient in either parameterisation is given by the gradient in the other (see Hensman et al., 2012, "Fast Variational Inference in the Conjugate Exponential Family")
"""
# Gradient for eta
dL_dmmT_S = -0.5*self.Lambda/self.data_prop + 0.5*self.q_u_prec
Kmmipsi1V,_ = dpotrs(self.Lm,self.psi1V,lower=1)
dL_dm = (Kmmipsi1V - np.dot(self.Lambda,self.q_u_mean))/self.data_prop
# Gradients for theta
S = self.q_u_cov
Si = self.q_u_prec
m = self.q_u_mean
dL_dSi = -mdot(S,dL_dmmT_S, S)
dL_dmhSi = -2*dL_dSi
dL_dSim = np.dot(dL_dSi,m) + np.dot(Si, dL_dm)
return np.hstack((dL_dm.flatten(),dL_dmmT_S.flatten())) , np.hstack((dL_dSim.flatten(), dL_dmhSi.flatten()))
def optimize(self, iterations, print_interval=10, callback=lambda:None, callback_interval=5):
param_step = 0.
#Iterate!
for i in range(iterations):
#store the current configuration for plotting later
self._param_trace.append(self._get_params())
self._ll_trace.append(self.log_likelihood() + self.log_prior())
#load a batch
self.load_batch()
#compute the (stochastic) gradient
natgrads = self.vb_grad_natgrad()
grads = self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
self._grad_trace.append(grads)
#compute the steps in all parameters
vb_step = self.vb_steplength*natgrads[0]
if (self.epochs>=1):#only move the parameters after the first epoch
param_step = self.momentum*param_step + self.param_steplength*grads
else:
param_step = 0.
self.set_vb_param(self.get_vb_param() + vb_step)
#Note: don't recompute everything here, wait until the next iteration when we have a new batch
self._set_params(self._untransform_params(self._get_params_transformed() + param_step), computations=False)
#print messages if desired
if i and (not i%print_interval):
print i, np.mean(self._ll_trace[-print_interval:]) #, self.log_likelihood()
print np.round(np.mean(self._grad_trace[-print_interval:],0),3)
sys.stdout.flush()
#callback
if i and not i%callback_interval:
callback()
time.sleep(0.1)
if self.epochs > 10:
self._adapt_steplength()
self.iterations += 1
def _adapt_steplength(self):
if self.adapt_vb_steplength:
# self._adaptive_vb_steplength()
self._adaptive_vb_steplength_KL()
self._vb_steplength_trace.append(self.vb_steplength)
assert self.vb_steplength > 0
if self.adapt_param_steplength:
# self._adaptive_param_steplength()
# self._adaptive_param_steplength_log()
self._adaptive_param_steplength_from_vb()
self._param_steplength_trace.append(self.param_steplength)
def _adaptive_param_steplength(self):
decr_factor = 0.1
g_tp = self._transform_gradients(self._log_likelihood_gradients())
self.gbar_tp = (1-1/self.tau_tp)*self.gbar_tp + 1/self.tau_tp * g_tp
self.hbar_tp = (1-1/self.tau_tp)*self.hbar_tp + 1/self.tau_tp * np.dot(g_tp.T, g_tp)
new_param_steplength = np.dot(self.gbar_tp.T, self.gbar_tp) / self.hbar_tp
#- hack
new_param_steplength *= decr_factor
self.param_steplength = (self.param_steplength + new_param_steplength)/2
#-
self.tau_tp = self.tau_tp*(1-self.param_steplength) + 1
def _adaptive_param_steplength_log(self):
stp = np.logspace(np.log(0.0001), np.log(1e-6), base=np.e, num=18000)
self.param_steplength = stp[self.iterations]
def _adaptive_param_steplength_log2(self):
self.param_steplength = (self.iterations + 0.001)**-0.5
def _adaptive_param_steplength_from_vb(self):
self.param_steplength = self.vb_steplength * 0.01
def _adaptive_vb_steplength(self):
decr_factor = 0.1
g_t = self.vb_grad_natgrad()[0]
self.gbar_t = (1-1/self.tau_t)*self.gbar_t + 1/self.tau_t * g_t
self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t.T, g_t)
new_vb_steplength = np.dot(self.gbar_t.T, self.gbar_t) / self.hbar_t
#- hack
new_vb_steplength *= decr_factor
self.vb_steplength = (self.vb_steplength + new_vb_steplength)/2
#-
self.tau_t = self.tau_t*(1-self.vb_steplength) + 1
def _adaptive_vb_steplength_KL(self):
decr_factor = 1 #0.1
natgrad = self.vb_grad_natgrad()
g_t1 = natgrad[0]
g_t2 = natgrad[1]
self.gbar_t1 = (1-1/self.tau_t)*self.gbar_t1 + 1/self.tau_t * g_t1
self.gbar_t2 = (1-1/self.tau_t)*self.gbar_t2 + 1/self.tau_t * g_t2
self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t1.T, g_t2)
self.vb_steplength = np.dot(self.gbar_t1.T, self.gbar_t2) / self.hbar_t
self.vb_steplength *= decr_factor
self.tau_t = self.tau_t*(1-self.vb_steplength) + 1
def _raw_predict(self, X_new, X_variance_new=None, which_parts='all',full_cov=False):
"""Internal helper function for making predictions, does not account for normalization"""
#TODO: make this more efficient!
self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
tmp = self.Kmmi- mdot(self.Kmmi,self.q_u_cov,self.Kmmi)
if X_variance_new is None:
Kx = self.kern.K(X_new,self.Z)
mu = np.dot(Kx,self.Kmmi_m)
if full_cov:
Kxx = self.kern.K(X_new)
var = Kxx - mdot(Kx,tmp,Kx.T)
else:
Kxx = self.kern.Kdiag(X_new)
var = (Kxx - np.sum(Kx*np.dot(Kx,tmp),1))[:,None]
return mu, var
else:
assert X_variance_new.shape == X_new.shape
Kx = self.kern.psi1(self.Z,X_new, X_variance_new)
mu = np.dot(Kx,self.Kmmi_m)
Kxx = self.kern.psi0(self.Z,X_new,X_variance_new)
psi2 = self.kern.psi2(self.Z,X_new,X_variance_new)
diag_var = Kxx - np.sum(np.sum(psi2*tmp[None,:,:],1),1)
if full_cov:
raise NotImplementedError
else:
return mu, diag_var[:,None]
def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
# normalize X values
Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
if X_variance_new is not None:
X_variance_new = X_variance_new / self._Xscale ** 2
# here's the actual prediction by the GP model
mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts)
# now push through likelihood
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
return mean, var, _025pm, _975pm
def set_vb_param(self,vb_param):
"""set the distribution q(u) from the canonical parameters"""
self.q_u_canonical_flat = vb_param.copy()
self.q_u_canonical = self.q_u_canonical_flat[:self.num_inducing*self.output_dim].reshape(self.num_inducing,self.output_dim),self.q_u_canonical_flat[self.num_inducing*self.output_dim:].reshape(self.num_inducing,self.num_inducing)
self.q_u_prec = -2.*self.q_u_canonical[1]
self.q_u_cov, q_u_Li, q_u_L, tmp = pdinv(self.q_u_prec)
self.q_u_Li = q_u_Li
self.q_u_logdet = -tmp
self.q_u_mean, _ = dpotrs(q_u_Li, np.asfortranarray(self.q_u_canonical[0]),lower=1)
self.q_u_expectation = (self.q_u_mean, np.dot(self.q_u_mean,self.q_u_mean.T)+self.q_u_cov*self.output_dim)
def get_vb_param(self):
"""
Return the canonical parameters of the distribution q(u)
"""
return self.q_u_canonical_flat
def plot(self, ax=None, fignum=None, Z_height=None, **kwargs):
if ax is None:
fig = pb.figure(num=fignum)
ax = fig.add_subplot(111)
#horrible hack here:
data = self.likelihood.data.copy()
self.likelihood.data = self.Y
GPBase.plot(self, ax=ax, **kwargs)
self.likelihood.data = data
Zu = self.Z * self._Xscale + self._Xoffset
if self.input_dim==1:
ax.plot(self.X_batch, self.likelihood.data, 'gx',mew=2)
if Z_height is None:
Z_height = ax.get_ylim()[0]
ax.plot(Zu, np.zeros_like(Zu) + Z_height, 'r|', mew=1.5, markersize=12)
if self.input_dim==2:
ax.scatter(self.X[:,0], self.X[:,1], 20., self.Y[:,0], linewidth=0, cmap=pb.cm.jet)
ax.plot(Zu[:,0], Zu[:,1], 'w^')
def plot_traces(self):
pb.figure()
t = np.array(self._param_trace)
pb.subplot(2,1,1)
for l,ti in zip(self._get_param_names(),t.T):
if not l[:3]=='iip':
pb.plot(ti,label=l)
pb.legend(loc=0)
pb.subplot(2,1,2)
pb.plot(np.asarray(self._ll_trace),label='stochastic likelihood')
pb.legend(loc=0)

144
GPy/core/transformations.py Normal file
View file

@@ -0,0 +1,144 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from GPy.core.domains import POSITIVE, NEGATIVE, BOUNDED
import sys
lim_val = -np.log(sys.float_info.epsilon)
class transformation(object):
domain = None
def f(self, x):
raise NotImplementedError
def finv(self, x):
raise NotImplementedError
def gradfactor(self, f):
""" df_dx evaluated at self.f(x)=f"""
raise NotImplementedError
def initialize(self, f):
""" produce a sensible initial value for f(x)"""
raise NotImplementedError
def __str__(self):
raise NotImplementedError
class logexp(transformation):
domain = POSITIVE
def f(self, x):
return np.where(x>lim_val, x, np.log(1. + np.exp(x)))
def finv(self, f):
return np.where(f>lim_val, f, np.log(np.exp(f) - 1.))
def gradfactor(self, f):
return np.where(f>lim_val, 1., 1 - np.exp(-f))
def initialize(self, f):
if np.any(f < 0.):
print "Warning: changing parameters to satisfy constraints"
return np.abs(f)
def __str__(self):
return '(+ve)'
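# Round-trip sketch: logexp maps an unconstrained x to a positive value via
# f(x) = log(1 + exp(x)) and finv inverts it, so for instance
#   t = logexp()
#   t.finv(t.f(3.))  # recovers 3.0 (up to floating point)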
class negative_logexp(transformation):
domain = NEGATIVE
def f(self, x):
return -logexp().f(x) # -np.log(1. + np.exp(x)); instantiate logexp, since unbound Python 2 methods cannot take an ndarray as self
def finv(self, f):
return logexp().finv(-f) # np.log(np.exp(-f) - 1.)
def gradfactor(self, f):
return -logexp().gradfactor(-f)
#ef = np.exp(-f)
#return -(ef - 1.) / ef
def initialize(self, f):
return -logexp().initialize(f) # -np.abs(f)
def __str__(self):
return '(-ve)'
class logexp_clipped(logexp):
max_bound = 1e100
min_bound = 1e-10
log_max_bound = np.log(max_bound)
log_min_bound = np.log(min_bound)
domain = POSITIVE
def __init__(self, lower=1e-6):
self.lower = lower
def f(self, x):
exp = np.exp(np.clip(x, self.log_min_bound, self.log_max_bound))
f = np.log(1. + exp)
# if np.isnan(f).any():
# import ipdb;ipdb.set_trace()
return np.clip(f, self.min_bound, self.max_bound)
def finv(self, f):
return np.log(np.exp(f) - 1.) # invert f = log(1 + exp(x)); the parenthesis was misplaced
def gradfactor(self, f):
ef = np.exp(f) # np.clip(f, self.min_bound, self.max_bound))
gf = (ef - 1.) / ef
return gf # np.where(f < self.lower, 0, gf)
def initialize(self, f):
if np.any(f < 0.):
print "Warning: changing parameters to satisfy constraints"
return np.abs(f)
def __str__(self):
return '(+ve_c)'
class exponent(transformation):
# TODO: can't allow this to go to zero, need to set a lower bound. Similar with negative exponent below. See old MATLAB code.
domain = POSITIVE
def f(self, x):
return np.where(x<lim_val, np.where(x>-lim_val, np.exp(x), np.exp(-lim_val)), np.exp(lim_val))
def finv(self, x):
return np.log(x)
def gradfactor(self, f):
return f
def initialize(self, f):
if np.any(f < 0.):
print "Warning: changing parameters to satisfy constraints"
return np.abs(f)
def __str__(self):
return '(+ve)'
class negative_exponent(exponent):
domain = NEGATIVE
def f(self, x):
return -exponent.f(self, x) # pass self explicitly: Python 2 unbound methods need an instance
def finv(self, f):
return exponent.finv(self, -f)
def gradfactor(self, f):
return f
def initialize(self, f):
return -exponent.initialize(self, f) # -np.abs(f)
def __str__(self):
return '(-ve)'
class square(transformation):
domain = POSITIVE
def f(self, x):
return x ** 2
def finv(self, x):
return np.sqrt(x)
def gradfactor(self, f):
return 2 * np.sqrt(f)
def initialize(self, f):
return np.abs(f)
def __str__(self):
return '(+sq)'
class logistic(transformation):
domain = BOUNDED
def __init__(self, lower, upper):
assert lower < upper
self.lower, self.upper = float(lower), float(upper)
self.difference = self.upper - self.lower
def f(self, x):
return self.lower + self.difference / (1. + np.exp(-x))
def finv(self, f):
return np.log(np.clip(f - self.lower, 1e-10, np.inf) / np.clip(self.upper - f, 1e-10, np.inf))
def gradfactor(self, f):
return (f - self.lower) * (self.upper - f) / self.difference
def initialize(self, f):
if np.any(np.logical_or(f < self.lower, f > self.upper)):
print "Warning: changing parameters to satisfy constraints"
return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(f * 0.), f)
def __str__(self):
return '({},{})'.format(self.lower, self.upper)

8
GPy/examples/__init__.py Normal file
View file

@@ -0,0 +1,8 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import classification
import regression
import dimensionality_reduction
import tutorials
import stochastic

168
GPy/examples/classification.py Normal file
View file

@@ -0,0 +1,168 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
Gaussian Processes classification
"""
import pylab as pb
import numpy as np
import GPy
default_seed = 10000
def crescent_data(seed=default_seed, kernel=None): # FIXME
"""Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
:param seed: seed value for data generation.
:type seed: int
:param kernel: a GPy kernel to use, or None for the default.
"""
data = GPy.util.datasets.crescent_data(seed=seed)
Y = data['Y']
Y[Y.flatten()==-1] = 0
m = GPy.models.GPClassification(data['X'], Y)
#m.update_likelihood_approximation()
#m.optimize()
m.pseudo_EM()
print(m)
m.plot()
return m
def oil(num_inducing=50, max_iters=100, kernel=None):
"""
Run a Gaussian process classification on the three phase oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
"""
data = GPy.util.datasets.oil()
X = data['X']
Xtest = data['Xtest']
Y = data['Y'][:, 0:1]
Ytest = data['Ytest'][:, 0:1]
Y[Y.flatten()==-1] = 0
Ytest[Ytest.flatten()==-1] = 0
# Create GP model
m = GPy.models.SparseGPClassification(X, Y,kernel=kernel,num_inducing=num_inducing)
# Tie the lengthscale parameters together
m.tie_params('.*len')
m['.*len'] = 10.
m.update_likelihood_approximation()
# Optimize
m.optimize(max_iters=max_iters)
print(m)
#Test
probs = m.predict(Xtest)[0]
GPy.util.classification.conf_matrix(probs,Ytest)
return m
def toy_linear_1d_classification(seed=default_seed):
"""
Simple 1D classification example
:param seed : seed value for data generation (default is 10000).
:type seed: int
"""
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
Y = data['Y'][:, 0:1]
Y[Y.flatten() == -1] = 0
# Model definition
m = GPy.models.GPClassification(data['X'], Y)
# Optimize
#m.update_likelihood_approximation()
# Parameters optimization:
#m.optimize()
m.pseudo_EM()
# Plot
fig, axes = pb.subplots(2,1)
m.plot_f(ax=axes[0])
m.plot(ax=axes[1])
print(m)
return m
def sparse_toy_linear_1d_classification(num_inducing=10,seed=default_seed):
"""
Sparse 1D classification example
:param seed : seed value for data generation (default is 10000).
:type seed: int
"""
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
Y = data['Y'][:, 0:1]
Y[Y.flatten() == -1] = 0
# Model definition
m = GPy.models.SparseGPClassification(data['X'], Y,num_inducing=num_inducing)
m['.*len']= 4.
# Optimize
#m.update_likelihood_approximation()
# Parameters optimization:
#m.optimize()
m.pseudo_EM()
# Plot
fig, axes = pb.subplots(2,1)
m.plot_f(ax=axes[0])
m.plot(ax=axes[1])
print(m)
return m
def sparse_crescent_data(num_inducing=10, seed=default_seed, kernel=None):
"""
Run a Gaussian process classification with the DTC approximation on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
:param num_inducing: number of inducing variables.
:type num_inducing: int
:param seed: seed value for data generation.
:type seed: int
"""
data = GPy.util.datasets.crescent_data(seed=seed)
Y = data['Y']
Y[Y.flatten()==-1]=0
m = GPy.models.SparseGPClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing)
m['.*len'] = 10.
#m.update_likelihood_approximation()
#m.optimize()
m.pseudo_EM()
print(m)
m.plot()
return m
def FITC_crescent_data(num_inducing=10, seed=default_seed):
"""
Run a Gaussian process classification with FITC approximation on the crescent data. The demonstration uses EP to approximate the likelihood.
:param num_inducing: number of inducing variables.
:type num_inducing: int
:param seed: seed value for data generation.
:type seed: int
"""
data = GPy.util.datasets.crescent_data(seed=seed)
Y = data['Y']
Y[Y.flatten()==-1]=0
m = GPy.models.FITCClassification(data['X'], Y,num_inducing=num_inducing)
m.constrain_bounded('.*len',1.,1e3)
m['.*len'] = 3.
#m.update_likelihood_approximation()
#m.optimize()
m.pseudo_EM()
print(m)
m.plot()
return m

492
GPy/examples/dimensionality_reduction.py Normal file
View file

@@ -0,0 +1,492 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from matplotlib import pyplot as plt, cm
import GPy
from GPy.core.transformations import logexp
from GPy.models.bayesian_gplvm import BayesianGPLVM
from GPy.likelihoods.gaussian import Gaussian
np.random.seed(123344)
default_seed = 123344 # np.random.seed returns None, so store the seed value itself
def BGPLVM(seed=default_seed):
N = 5
num_inducing = 4
Q = 3
D = 2
# generate GPLVM-like data
X = np.random.rand(N, Q)
lengthscales = np.random.rand(Q)
k = (GPy.kern.rbf(Q, .5, lengthscales, ARD=True)
+ GPy.kern.white(Q, 0.01))
K = k.K(X)
Y = np.random.multivariate_normal(np.zeros(N), K, D).T
lik = Gaussian(Y, normalize=True)
k = GPy.kern.rbf_inv(Q, .5, np.ones(Q) * 2., ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q)
# k = GPy.kern.rbf(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
# k = GPy.kern.rbf(Q, ARD = False) + GPy.kern.white(Q, 0.00001)
m = GPy.models.BayesianGPLVM(lik, Q, kernel=k, num_inducing=num_inducing)
m.lengthscales = lengthscales
# m.constrain_positive('(rbf|bias|noise|white|S)')
# m.constrain_fixed('S', 1)
# pb.figure()
# m.plot()
# pb.title('PCA initialisation')
# pb.figure()
# m.optimize(messages = 1)
# m.plot()
# pb.title('After optimisation')
# m.randomize()
# m.checkgrad(verbose=1)
return m
def GPLVM_oil_100(optimize=True):
data = GPy.util.datasets.oil_100()
Y = data['X']
# create simple GP model
kernel = GPy.kern.rbf(6, ARD=True) + GPy.kern.bias(6)
m = GPy.models.GPLVM(Y, 6, kernel=kernel)
m.data_labels = data['Y'].argmax(axis=1)
# optimize
if optimize:
m.optimize('scg', messages=1)
# plot
print(m)
m.plot_latent(labels=m.data_labels)
return m
def sparseGPLVM_oil(optimize=True, N=100, Q=6, num_inducing=15, max_iters=50):
np.random.seed(0)
data = GPy.util.datasets.oil()
Y = data['X'][:N]
Y = Y - Y.mean(0)
Y /= Y.std(0)
# create simple GP model
kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q)
m = GPy.models.SparseGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing)
m.data_labels = data['Y'].argmax(axis=1)
# optimize
if optimize:
m.optimize('scg', messages=1, max_iters=max_iters)
# plot
print(m)
# m.plot_latent(labels=m.data_labels)
return m
def swiss_roll(optimize=True, N=1000, num_inducing=15, Q=4, sigma=.2, plot=False):
from GPy.util.datasets import swiss_roll_generated
from GPy.core.transformations import logexp_clipped
data = swiss_roll_generated(N=N, sigma=sigma)
Y = data['Y']
Y -= Y.mean()
Y /= Y.std()
t = data['t']
c = data['colors']
try:
from sklearn.manifold.isomap import Isomap
iso = Isomap().fit(Y)
X = iso.embedding_
if Q > 2:
X = np.hstack((X, np.random.randn(N, Q - 2)))
except ImportError:
X = np.random.randn(N, Q)
if plot:
from mpl_toolkits import mplot3d
import pylab
fig = pylab.figure("Swiss Roll Data")
ax = fig.add_subplot(121, projection='3d')
ax.scatter(*Y.T, c=c)
ax.set_title("Swiss Roll")
ax = fig.add_subplot(122)
ax.scatter(*X.T[:2], c=c)
ax.set_title("Initialization")
var = .5
S = (var * np.ones_like(X) + np.clip(np.random.randn(N, Q) * var ** 2,
- (1 - var),
(1 - var))) + .001
Z = np.random.permutation(X)[:num_inducing]
kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2)) + GPy.kern.white(Q, np.exp(-2))
m = BayesianGPLVM(Y, Q, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel)
m.data_colors = c
m.data_t = t
m['rbf_lengthscale'] = 1. # X.var(0).max() / X.var(0)
m['noise_variance'] = Y.var() / 100.
m['bias_variance'] = 0.05
if optimize:
m.optimize('scg', messages=1)
return m
def BGPLVM_oil(optimize=True, N=200, Q=7, num_inducing=40, max_iters=1000, plot=False, **k):
np.random.seed(0)
data = GPy.util.datasets.oil()
# create simple GP model
kernel = GPy.kern.rbf_inv(Q, 1., [.1] * Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2))
Y = data['X'][:N]
Yn = Gaussian(Y, normalize=True)
# Yn = Y - Y.mean(0)
# Yn /= Yn.std(0)
m = GPy.models.BayesianGPLVM(Yn, Q, kernel=kernel, num_inducing=num_inducing, **k)
m.data_labels = data['Y'][:N].argmax(axis=1)
# m.constrain('variance|leng', logexp_clipped())
# m['.*lengt'] = m.X.var(0).max() / m.X.var(0)
m['noise'] = Yn.Y.var() / 100.
# optimize
if optimize:
m.constrain_fixed('noise')
m.optimize('scg', messages=1, max_iters=200, gtol=.05)
m.constrain_positive('noise')
m.constrain_bounded('white', 1e-7, 1)
m.optimize('scg', messages=1, max_iters=max_iters, gtol=.05)
if plot:
y = m.likelihood.Y[0, :]
fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
plt.sca(latent_axes)
m.plot_latent()
data_show = GPy.util.visualize.vector_show(y)
lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :], m, data_show, latent_axes=latent_axes) # , sense_axes=sense_axes)
raw_input('Press enter to finish')
plt.close(fig)
return m
def oil_100():
data = GPy.util.datasets.oil_100()
m = GPy.models.GPLVM(data['X'], 2)
# optimize
m.optimize(messages=1, max_iters=2)
# plot
print(m)
# m.plot_latent(labels=data['Y'].argmax(axis=1))
return m
def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
x = np.linspace(0, 4 * np.pi, N)[:, None]
s1 = np.vectorize(lambda x: np.sin(x))
s2 = np.vectorize(lambda x: np.cos(x))
s3 = np.vectorize(lambda x:-np.exp(-np.cos(2 * x)))
sS = np.vectorize(lambda x: np.sin(2 * x))
s1 = s1(x)
s2 = s2(x)
s3 = s3(x)
sS = sS(x)
S1 = np.hstack([s1, sS])
S2 = np.hstack([s2, s3, sS])
S3 = np.hstack([s3, sS])
Y1 = S1.dot(np.random.randn(S1.shape[1], D1))
Y2 = S2.dot(np.random.randn(S2.shape[1], D2))
Y3 = S3.dot(np.random.randn(S3.shape[1], D3))
Y1 += .3 * np.random.randn(*Y1.shape)
Y2 += .2 * np.random.randn(*Y2.shape)
Y3 += .25 * np.random.randn(*Y3.shape)
Y1 -= Y1.mean(0)
Y2 -= Y2.mean(0)
Y3 -= Y3.mean(0)
Y1 /= Y1.std(0)
Y2 /= Y2.std(0)
Y3 /= Y3.std(0)
slist = [sS, s1, s2, s3]
slist_names = ["sS", "s1", "s2", "s3"]
Ylist = [Y1, Y2, Y3]
if plot_sim:
import pylab
import itertools
fig = pylab.figure("MRD Simulation Data", figsize=(8, 6))
fig.clf()
ax = fig.add_subplot(2, 1, 1)
labls = slist_names
for S, lab in itertools.izip(slist, labls):
ax.plot(S, label=lab)
ax.legend()
for i, Y in enumerate(Ylist):
ax = fig.add_subplot(2, len(Ylist), len(Ylist) + 1 + i)
ax.imshow(Y, aspect='auto', cmap=cm.gray) # @UndefinedVariable
ax.set_title("Y{}".format(i + 1))
pylab.draw()
pylab.tight_layout()
return slist, [S1, S2, S3], Ylist
def bgplvm_simulation_matlab_compare():
from GPy.util.datasets import simulation_BGPLVM
sim_data = simulation_BGPLVM()
Y = sim_data['Y']
S = sim_data['S']
mu = sim_data['mu']
num_inducing, [_, Q] = 3, mu.shape
from GPy.models import mrd
from GPy import kern
reload(mrd); reload(kern)
k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2))
m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k,
# X=mu,
# X_variance=S,
_debug=False)
m.auto_scale_factor = True
m['noise'] = Y.var() / 100.
m['linear_variance'] = .01
return m
def bgplvm_simulation(optimize='scg',
plot=True,
max_iters=2e4,
plot_sim=False):
# from GPy.core.transformations import logexp_clipped
D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 3, 10
slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
from GPy.models import mrd
from GPy import kern
reload(mrd); reload(kern)
Y = Ylist[0]
k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2)) # + kern.bias(Q)
m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k)
# m.constrain('variance|noise', logexp_clipped())
m['noise'] = Y.var() / 100.
if optimize:
print "Optimizing model:"
m.optimize(optimize, max_iters=max_iters,
messages=True, gtol=.05)
if plot:
m.plot_X_1d("BGPLVM Latent Space 1D")
m.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
return m
def mrd_simulation(optimize=True, plot=True, plot_sim=True, **kw):
D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
likelihood_list = [Gaussian(x, normalize=True) for x in Ylist]
from GPy.models import mrd
from GPy import kern
reload(mrd); reload(kern)
k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2))
m = mrd.MRD(likelihood_list, input_dim=Q, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw)
m.ensure_default_constraints()
for i, bgplvm in enumerate(m.bgplvms):
m['{}_noise'.format(i)] = bgplvm.likelihood.Y.var() / 500.
# DEBUG
# np.seterr("raise")
if optimize:
print "Optimizing Model:"
m.optimize(messages=1, max_iters=8e3, gtol=.1)
if plot:
m.plot_X_1d("MRD Latent Space 1D")
m.plot_scales("MRD Scales")
return m
def brendan_faces():
from GPy import kern
data = GPy.util.datasets.brendan_faces()
Q = 2
Y = data['Y'][0:-1:10, :]
# Y = data['Y']
Yn = Y - Y.mean()
Yn /= Yn.std()
m = GPy.models.GPLVM(Yn, Q)
# m = GPy.models.BayesianGPLVM(Yn, Q, num_inducing=100)
# optimize
m.constrain('rbf|noise|white', GPy.core.transformations.logexp_clipped())
m.optimize('scg', messages=1, max_f_eval=10000)
ax = m.plot_latent(which_indices=(0, 1))
y = m.likelihood.Y[0, :]
data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, invert=False, scale=False)
lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
raw_input('Press enter to finish')
return m
def stick_play(range=None, frame_rate=15):
data = GPy.util.datasets.osu_run1()
# optimize
if range is None:
Y = data['Y'].copy()
else:
Y = data['Y'][range[0]:range[1], :].copy()
y = Y[0, :]
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
GPy.util.visualize.data_play(Y, data_show, frame_rate)
return Y
def stick(kernel=None):
data = GPy.util.datasets.osu_run1()
# optimize
m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel)
m.optimize(messages=1, max_f_eval=10000)
if GPy.util.visualize.visual_available:
plt.clf()
ax = m.plot_latent()
y = m.likelihood.Y[0, :]
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
raw_input('Press enter to finish')
return m
def bcgplvm_linear_stick(kernel=None):
data = GPy.util.datasets.osu_run1()
# optimize
mapping = GPy.mappings.Linear(data['Y'].shape[1], 2)
m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
m.optimize(messages=1, max_f_eval=10000)
if GPy.util.visualize.visual_available:
plt.clf()
ax = m.plot_latent()
y = m.likelihood.Y[0, :]
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
raw_input('Press enter to finish')
return m
def bcgplvm_stick(kernel=None):
data = GPy.util.datasets.osu_run1()
# optimize
back_kernel=GPy.kern.rbf(data['Y'].shape[1], lengthscale=5.)
mapping = GPy.mappings.Kernel(X=data['Y'], output_dim=2, kernel=back_kernel)
m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
m.optimize(messages=1, max_f_eval=10000)
if GPy.util.visualize.visual_available:
plt.clf()
ax = m.plot_latent()
y = m.likelihood.Y[0, :]
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
raw_input('Press enter to finish')
return m
def robot_wireless():
data = GPy.util.datasets.robot_wireless()
# optimize
m = GPy.models.GPLVM(data['Y'], 2)
m.optimize(messages=1, max_f_eval=10000)
m._set_params(m._get_params())
plt.clf()
ax = m.plot_latent()
return m
def stick_bgplvm(model=None):
data = GPy.util.datasets.osu_run1()
Q = 6
kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2)) + GPy.kern.white(Q, np.exp(-2))
m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel)
# optimize
m.ensure_default_constraints()
m.optimize('scg', messages=1, max_iters=200, xtol=1e-300, ftol=1e-300)
m._set_params(m._get_params())
fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
plt.sca(latent_axes)
m.plot_latent()
y = m.likelihood.Y[0, :].copy()
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
raw_input('Press enter to finish')
return m
def cmu_mocap(subject='35', motion=['01'], in_place=True):
data = GPy.util.datasets.cmu_mocap(subject, motion)
Y = data['Y']
if in_place:
# Make figure move in place.
data['Y'][:, 0:3] = 0.0
m = GPy.models.GPLVM(data['Y'], 2, normalize_Y=True)
# optimize
m.optimize(messages=1, max_f_eval=10000)
ax = m.plot_latent()
y = m.likelihood.Y[0, :]
data_show = GPy.util.visualize.skeleton_show(y[None, :], data['skel'])
lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
raw_input('Press enter to finish')
lvm_visualizer.close()
return m
# def BGPLVM_oil():
# data = GPy.util.datasets.oil()
# Y, X = data['Y'], data['X']
# X -= X.mean(axis=0)
# X /= X.std(axis=0)
#
# Q = 10
# num_inducing = 30
#
# kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q)
# m = GPy.models.BayesianGPLVM(X, Q, kernel=kernel, num_inducing=num_inducing)
# # m.scale_factor = 100.0
# m.constrain_positive('(white|noise|bias|X_variance|rbf_variance|rbf_length)')
# from sklearn import cluster
# km = cluster.KMeans(num_inducing, verbose=10)
# Z = km.fit(m.X).cluster_centers_
# # Z = GPy.util.misc.kmm_init(m.X, num_inducing)
# m.set('iip', Z)
# m.set('bias', 1e-4)
# # optimize
#
# import pdb; pdb.set_trace()
# m.optimize('tnc', messages=1)
# print m
# m.plot_latent(labels=data['Y'].argmax(axis=1))
# return m
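The demos above all return the fitted model for further inspection. A minimal sketch of driving them interactively, assuming this module is importable as GPy.examples.dimensionality_reduction:
# Minimal usage sketch (assumes the module path GPy.examples.dimensionality_reduction).
import GPy.examples.dimensionality_reduction as dr
m = dr.GPLVM_oil_100(optimize=True)       # 6d GPLVM on 100 oil-flow points
m = dr.bgplvm_simulation(optimize='scg')  # Bayesian GPLVM on simulated data
print(m)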

473
GPy/examples/regression.py Normal file
@@ -0,0 +1,473 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
Gaussian Processes regression examples
"""
import pylab as pb
import numpy as np
import GPy
def coregionalisation_toy2(max_iters=100):
"""
A simple demonstration of coregionalisation on two sinusoidal functions.
"""
X1 = np.random.rand(50, 1) * 8
X2 = np.random.rand(30, 1) * 5
index = np.vstack((np.zeros_like(X1), np.ones_like(X2)))
X = np.hstack((np.vstack((X1, X2)), index))
Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
Y2 = np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + 2.
Y = np.vstack((Y1, Y2))
k1 = GPy.kern.rbf(1) + GPy.kern.bias(1)
k2 = GPy.kern.coregionalise(2, 1)
k = k1**k2
m = GPy.models.GPRegression(X, Y, kernel=k)
m.constrain_fixed('.*rbf_var', 1.)
# m.constrain_positive('.*kappa')
m.optimize('sim', messages=1, max_iters=max_iters)
pb.figure()
Xtest1 = np.hstack((np.linspace(0, 9, 100)[:, None], np.zeros((100, 1))))
Xtest2 = np.hstack((np.linspace(0, 9, 100)[:, None], np.ones((100, 1))))
mean, var, low, up = m.predict(Xtest1)
GPy.util.plot.gpplot(Xtest1[:, 0], mean, low, up)
mean, var, low, up = m.predict(Xtest2)
GPy.util.plot.gpplot(Xtest2[:, 0], mean, low, up)
pb.plot(X1[:, 0], Y1[:, 0], 'rx', mew=2)
pb.plot(X2[:, 0], Y2[:, 0], 'gx', mew=2)
return m
def coregionalisation_toy(max_iters=100):
"""
A simple demonstration of coregionalisation on two sinusoidal functions.
"""
X1 = np.random.rand(50, 1) * 8
X2 = np.random.rand(30, 1) * 5
index = np.vstack((np.zeros_like(X1), np.ones_like(X2)))
X = np.hstack((np.vstack((X1, X2)), index))
Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
Y = np.vstack((Y1, Y2))
k1 = GPy.kern.rbf(1)
k2 = GPy.kern.coregionalise(2, 2)
k = k1**k2 #k1.prod(k2, tensor=True)
m = GPy.models.GPRegression(X, Y, kernel=k)
m.constrain_fixed('.*rbf_var', 1.)
# m.constrain_positive('kappa')
m.optimize(max_iters=max_iters)
pb.figure()
Xtest1 = np.hstack((np.linspace(0, 9, 100)[:, None], np.zeros((100, 1))))
Xtest2 = np.hstack((np.linspace(0, 9, 100)[:, None], np.ones((100, 1))))
mean, var, low, up = m.predict(Xtest1)
GPy.util.plot.gpplot(Xtest1[:, 0], mean, low, up)
mean, var, low, up = m.predict(Xtest2)
GPy.util.plot.gpplot(Xtest2[:, 0], mean, low, up)
pb.plot(X1[:, 0], Y1[:, 0], 'rx', mew=2)
pb.plot(X2[:, 0], Y2[:, 0], 'gx', mew=2)
return m
def coregionalisation_sparse(max_iters=100):
"""
A simple demonstration of coregionalisation on two sinusoidal functions using sparse approximations.
"""
X1 = np.random.rand(500, 1) * 8
X2 = np.random.rand(300, 1) * 5
index = np.vstack((np.zeros_like(X1), np.ones_like(X2)))
X = np.hstack((np.vstack((X1, X2)), index))
Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
Y = np.vstack((Y1, Y2))
num_inducing = 40
Z = np.hstack((np.random.rand(num_inducing, 1) * 8, np.random.randint(0, 2, num_inducing)[:, None]))
k1 = GPy.kern.rbf(1)
k2 = GPy.kern.coregionalise(2, 2)
k = k1**k2 #.prod(k2, tensor=True) # + GPy.kern.white(2,0.001)
m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z)
m.constrain_fixed('.*rbf_var', 1.)
m.constrain_fixed('iip')
m.constrain_bounded('noise_variance', 1e-3, 1e-1)
# m.optimize_restarts(5, robust=True, messages=1, max_iters=max_iters, optimizer='bfgs')
m.optimize(max_iters=max_iters)
# plotting:
pb.figure()
Xtest1 = np.hstack((np.linspace(0, 9, 100)[:, None], np.zeros((100, 1))))
Xtest2 = np.hstack((np.linspace(0, 9, 100)[:, None], np.ones((100, 1))))
mean, var, low, up = m.predict(Xtest1)
GPy.util.plot.gpplot(Xtest1[:, 0], mean, low, up)
mean, var, low, up = m.predict(Xtest2)
GPy.util.plot.gpplot(Xtest2[:, 0], mean, low, up)
pb.plot(X1[:, 0], Y1[:, 0], 'rx', mew=2)
pb.plot(X2[:, 0], Y2[:, 0], 'gx', mew=2)
y = pb.ylim()[0]
pb.plot(Z[:, 0][Z[:, 1] == 0], np.zeros(np.sum(Z[:, 1] == 0)) + y, 'r|', mew=2)
pb.plot(Z[:, 0][Z[:, 1] == 1], np.zeros(np.sum(Z[:, 1] == 1)) + y, 'g|', mew=2)
return m
def epomeo_gpx(max_iters=100):
"""Perform Gaussian process regression on the latitude and longitude data from the Mount Epomeo runs. Requires gpxpy to be installed on your system to load in the data."""
data = GPy.util.datasets.epomeo_gpx()
num_data_list = []
for Xpart in data['X']:
num_data_list.append(Xpart.shape[0])
num_data_array = np.array(num_data_list)
num_data = num_data_array.sum()
Y = np.zeros((num_data, 2))
t = np.zeros((num_data, 2))
start = 0
for Xpart, index in zip(data['X'], range(len(data['X']))):
end = start+Xpart.shape[0]
t[start:end, :] = np.hstack((Xpart[:, 0:1],
index*np.ones((Xpart.shape[0], 1))))
Y[start:end, :] = Xpart[:, 1:3]
start = end
num_inducing = 200
Z = np.hstack((np.linspace(t[:,0].min(), t[:, 0].max(), num_inducing)[:, None],
np.random.randint(0, 4, num_inducing)[:, None]))
k1 = GPy.kern.rbf(1)
k2 = GPy.kern.coregionalise(output_dim=5, rank=5)
k = k1**k2
m = GPy.models.SparseGPRegression(t, Y, kernel=k, Z=Z, normalize_Y=True)
m.constrain_fixed('.*rbf_var', 1.)
m.constrain_fixed('iip')
m.constrain_bounded('noise_variance', 1e-3, 1e-1)
# m.optimize_restarts(5, robust=True, messages=1, max_iters=max_iters, optimizer='bfgs')
m.optimize(max_iters=max_iters,messages=True)
return m
def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=10000, max_iters=300):
"""Show an example of a multimodal error surface for Gaussian process regression. Gene 939 has bimodal behaviour where the noisy mode is higher."""
# Contour over a range of length scales and signal/noise ratios.
length_scales = np.linspace(0.1, 60., resolution)
log_SNRs = np.linspace(-3., 4., resolution)
data = GPy.util.datasets.della_gatta_TRP63_gene_expression(gene_number)
# data['Y'] = data['Y'][0::2, :]
# data['X'] = data['X'][0::2, :]
data['Y'] = data['Y'] - np.mean(data['Y'])
lls = GPy.examples.regression._contour_data(data, length_scales, log_SNRs, GPy.kern.rbf)
pb.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=pb.cm.jet)
ax = pb.gca()
pb.xlabel('length scale')
pb.ylabel('log_10 SNR')
xlim = ax.get_xlim()
ylim = ax.get_ylim()
# Now run a few optimizations
models = []
optim_point_x = np.empty(2)
optim_point_y = np.empty(2)
np.random.seed(seed=seed)
for i in range(0, model_restarts):
# kern = GPy.kern.rbf(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.))
kern = GPy.kern.rbf(1, variance=np.random.uniform(1e-3, 1), lengthscale=np.random.uniform(5, 50))
m = GPy.models.GPRegression(data['X'], data['Y'], kernel=kern)
m['noise_variance'] = np.random.uniform(1e-3, 1)
optim_point_x[0] = m['rbf_lengthscale']
optim_point_y[0] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);
# optimize
m.optimize('scg', xtol=1e-6, ftol=1e-6, max_iters=max_iters)
optim_point_x[1] = m['rbf_lengthscale']
optim_point_y[1] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);
pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1] - optim_point_x[0], optim_point_y[1] - optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k')
models.append(m)
ax.set_xlim(xlim)
ax.set_ylim(ylim)
return m # (models, lls)
def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf):
"""Evaluate the GP objective function for a given data set for a range of signal to noise ratios and a range of lengthscales.
:data_set: A data set from the utils.datasets director.
:length_scales: a list of length scales to explore for the contour plot.
:log_SNRs: a list of base 10 logarithm signal to noise ratios to explore for the contour plot.
:kernel: a kernel to use for the 'signal' portion of the data."""
lls = []
total_var = np.var(data['Y'])
kernel = kernel_call(1, variance=1., lengthscale=1.)
model = GPy.models.GPRegression(data['X'], data['Y'], kernel=kernel)
for log_SNR in log_SNRs:
SNR = 10.**log_SNR
noise_var = total_var / (1. + SNR)
signal_var = total_var - noise_var
model.kern['.*variance'] = signal_var
model['noise_variance'] = noise_var
length_scale_lls = []
for length_scale in length_scales:
model['.*lengthscale'] = length_scale
length_scale_lls.append(model.log_likelihood())
lls.append(length_scale_lls)
return np.array(lls)
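The decomposition in the SNR loop follows from SNR = signal_var / noise_var together with signal_var + noise_var = total_var, which gives noise_var = total_var / (1 + SNR). A quick numerical check with made-up values:
# Worked check of the SNR decomposition used in _contour_data
# (made-up numbers, not taken from any data set).
total_var = 2.0
SNR = 10.0 ** 1.0                    # log10 SNR of 1
noise_var = total_var / (1. + SNR)   # = 2/11, about 0.1818
signal_var = total_var - noise_var   # = 20/11, about 1.8182
assert abs(signal_var / noise_var - SNR) < 1e-12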
def olympic_100m_men(max_iters=100, kernel=None):
"""Run a standard Gaussian process regression on the Rogers and Girolami olympics data."""
data = GPy.util.datasets.olympic_100m_men()
# create simple GP Model
m = GPy.models.GPRegression(data['X'], data['Y'], kernel)
# set the lengthscale to be something sensible (defaults to 1)
if kernel is None:
m['rbf_lengthscale'] = 10
# optimize
m.optimize(max_iters=max_iters)
# plot
m.plot(plot_limits=(1850, 2050))
print(m)
return m
def olympic_marathon_men(max_iters=100, kernel=None):
"""Run a standard Gaussian process regression on the Olympic marathon data."""
data = GPy.util.datasets.olympic_marathon_men()
# create simple GP Model
m = GPy.models.GPRegression(data['X'], data['Y'], kernel)
# set the lengthscale to be something sensible (defaults to 1)
if kernel is None:
m['rbf_lengthscale'] = 10
# optimize
m.optimize(max_iters=max_iters)
# plot
m.plot(plot_limits=(1850, 2050))
print(m)
return m
def toy_rbf_1d(optimizer='tnc', max_nb_eval_optim=100):
"""Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
data = GPy.util.datasets.toy_rbf_1d()
# create simple GP Model
m = GPy.models.GPRegression(data['X'], data['Y'])
# optimize
m.optimize(optimizer, max_f_eval=max_nb_eval_optim)
# plot
m.plot()
print(m)
return m
def toy_rbf_1d_50(max_iters=100):
"""Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
data = GPy.util.datasets.toy_rbf_1d_50()
# create simple GP Model
m = GPy.models.GPRegression(data['X'], data['Y'])
# optimize
m.optimize(max_iters=max_iters)
# plot
m.plot()
print(m)
return m
def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4):
# Create an artificial dataset where the values in the targets (Y)
# only depend in dimensions 1 and 3 of the inputs (X). Run ARD to
# see if this dependency can be recovered
X1 = np.sin(np.sort(np.random.rand(num_samples, 1) * 10, 0))
X2 = np.cos(np.sort(np.random.rand(num_samples, 1) * 10, 0))
X3 = np.exp(np.sort(np.random.rand(num_samples, 1), 0))
X4 = np.log(np.sort(np.random.rand(num_samples, 1), 0))
X = np.hstack((X1, X2, X3, X4))
Y1 = np.asarray(2 * X[:, 0] + 3).reshape(-1, 1)
Y2 = np.asarray(4 * (X[:, 2] - 1.5 * X[:, 0])).reshape(-1, 1)
Y = np.hstack((Y1, Y2))
Y = np.dot(Y, np.random.rand(2, D));
Y = Y + 0.2 * np.random.randn(Y.shape[0], Y.shape[1])
Y -= Y.mean()
Y /= Y.std()
if kernel_type == 'linear':
kernel = GPy.kern.linear(X.shape[1], ARD=1)
elif kernel_type == 'rbf_inv':
kernel = GPy.kern.rbf_inv(X.shape[1], ARD=1)
else:
kernel = GPy.kern.rbf(X.shape[1], ARD=1)
kernel += GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
m = GPy.models.GPRegression(X, Y, kernel)
# len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25
# m.set_prior('.*lengthscale',len_prior)
m.optimize(optimizer='scg', max_iters=max_iters, messages=1)
m.kern.plot_ARD()
print(m)
return m
def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4):
# Create an artificial dataset where the values in the targets (Y)
# only depend in dimensions 1 and 3 of the inputs (X). Run ARD to
# see if this dependency can be recovered
X1 = np.sin(np.sort(np.random.rand(num_samples, 1) * 10, 0))
X2 = np.cos(np.sort(np.random.rand(num_samples, 1) * 10, 0))
X3 = np.exp(np.sort(np.random.rand(num_samples, 1), 0))
X4 = np.log(np.sort(np.random.rand(num_samples, 1), 0))
X = np.hstack((X1, X2, X3, X4))
Y1 = np.asarray(2 * X[:, 0] + 3)[:, None]
Y2 = np.asarray(4 * (X[:, 2] - 1.5 * X[:, 0]))[:, None]
Y = np.hstack((Y1, Y2))
Y = np.dot(Y, np.random.rand(2, D));
Y = Y + 0.2 * np.random.randn(Y.shape[0], Y.shape[1])
Y -= Y.mean()
Y /= Y.std()
if kernel_type == 'linear':
kernel = GPy.kern.linear(X.shape[1], ARD=1)
elif kernel_type == 'rbf_inv':
kernel = GPy.kern.rbf_inv(X.shape[1], ARD=1)
else:
kernel = GPy.kern.rbf(X.shape[1], ARD=1)
kernel += GPy.kern.bias(X.shape[1])
X_variance = np.ones(X.shape) * 0.5
m = GPy.models.SparseGPRegression(X, Y, kernel, X_variance=X_variance)
# len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25
# m.set_prior('.*lengthscale',len_prior)
m.optimize(optimizer='scg', max_iters=max_iters, messages=1)
m.kern.plot_ARD()
print(m)
return m
def robot_wireless(max_iters=100, kernel=None):
"""Predict the location of a robot given wirelss signal strength readings."""
data = GPy.util.datasets.robot_wireless()
# create simple GP Model
m = GPy.models.GPRegression(data['Y'], data['X'], kernel=kernel)
# optimize
m.optimize(messages=True, max_iters=max_iters)
Xpredict = m.predict(data['Ytest'])[0]
pb.plot(data['Xtest'][:, 0], data['Xtest'][:, 1], 'r-')
pb.plot(Xpredict[:, 0], Xpredict[:, 1], 'b-')
pb.axis('equal')
pb.title('WiFi Localization with Gaussian Processes')
pb.legend(('True Location', 'Predicted Location'))
sse = ((data['Xtest'] - Xpredict)**2).sum()
print(m)
print('Sum of squares error on test data: ' + str(sse))
return m
def silhouette(max_iters=100):
"""Predict the pose of a figure given a silhouette. This is a task from Agarwal and Triggs 2004 ICML paper."""
data = GPy.util.datasets.silhouette()
# create simple GP Model
m = GPy.models.GPRegression(data['X'], data['Y'])
# optimize
m.optimize(messages=True, max_iters=max_iters)
print(m)
return m
def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100):
"""Run a 1D example of a sparse GP regression."""
# sample inputs and outputs
X = np.random.uniform(-3., 3., (num_samples, 1))
Y = np.sin(X) + np.random.randn(num_samples, 1) * 0.05
# construct kernel
rbf = GPy.kern.rbf(1)
# create simple GP Model
m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing)
m.checkgrad(verbose=1)
m.optimize('tnc', messages=1, max_iters=max_iters)
m.plot()
return m
def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100):
"""Run a 2D example of a sparse GP regression."""
X = np.random.uniform(-3., 3., (num_samples, 2))
Y = np.sin(X[:, 0:1]) * np.sin(X[:, 1:2]) + np.random.randn(num_samples, 1) * 0.05
# construct kernel
rbf = GPy.kern.rbf(2)
# create simple GP Model
m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing)
# set the lengthscale to a sensible initial value (inducing inputs are left free)
m['.*len'] = 2.
m.checkgrad()
# optimize and plot
m.optimize('tnc', messages=1, max_iters=max_iters)
m.plot()
print(m)
return m
def uncertain_inputs_sparse_regression(max_iters=100):
"""Run a 1D example of a sparse GP regression with uncertain inputs."""
fig, axes = pb.subplots(1, 2, figsize=(12, 5))
# sample inputs and outputs
S = np.ones((20, 1))
X = np.random.uniform(-3., 3., (20, 1))
Y = np.sin(X) + np.random.randn(20, 1) * 0.05
# likelihood = GPy.likelihoods.Gaussian(Y)
Z = np.random.uniform(-3., 3., (7, 1))
k = GPy.kern.rbf(1)
# create simple GP Model - no input uncertainty on this one
m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z)
m.optimize('scg', messages=1, max_iters=max_iters)
m.plot(ax=axes[0])
axes[0].set_title('no input uncertainty')
# the same Model with uncertainty
m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z, X_variance=S)
m.optimize('scg', messages=1, max_iters=max_iters)
m.plot(ax=axes[1])
axes[1].set_title('with input uncertainty')
print(m)
fig.canvas.draw()
return m
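All of the plotting in these demos goes through the same predict interface, which returns the posterior mean, variance and lower/upper confidence bounds. A minimal self-contained sketch on toy data (not taken from any demo above):
# Minimal sketch of the four-output predict interface used above.
import numpy as np
import GPy
X = np.random.uniform(-3., 3., (20, 1))
Y = np.sin(X) + np.random.randn(20, 1) * 0.05
m = GPy.models.GPRegression(X, Y)
m.optimize()
Xnew = np.linspace(-3., 3., 50)[:, None]
mean, var, lower, upper = m.predict(Xnew)  # posterior mean, variance and bounds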

@@ -0,0 +1,41 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import pylab as pb
import numpy as np
import GPy
def toy_1d():
N = 2000
M = 20
#create data
X = np.linspace(0,32,N)[:,None]
Z = np.linspace(0,32,M)[:,None]
Y = np.sin(X) + np.cos(0.3*X) + np.random.randn(*X.shape)/np.sqrt(50.)
m = GPy.models.SVIGPRegression(X,Y, batchsize=10, Z=Z)
m.constrain_bounded('noise_variance',1e-3,1e-1)
m.constrain_bounded('white_variance',1e-3,1e-1)
m.param_steplength = 1e-4
fig = pb.figure()
ax = fig.add_subplot(111)
def cb():
ax.cla()
m.plot(ax=ax,Z_height=-3)
ax.set_ylim(-3,3)
fig.canvas.draw()
m.optimize(500, callback=cb, callback_interval=1)
m.plot_traces()
return m

146
GPy/examples/tutorials.py Normal file
@@ -0,0 +1,146 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
Code of Tutorials
"""
import pylab as pb
pb.ion()
import numpy as np
import GPy
def tuto_GP_regression():
"""The detailed explanations of the commands used in this file can be found in the tutorial section"""
X = np.random.uniform(-3.,3.,(20,1))
Y = np.sin(X) + np.random.randn(20,1)*0.05
kernel = GPy.kern.rbf(input_dim=1, variance=1., lengthscale=1.)
m = GPy.models.GPRegression(X, Y, kernel)
print m
m.plot()
m.constrain_positive('')
m.unconstrain('') # may be used to remove the previous constraints
m.constrain_positive('.*rbf_variance')
m.constrain_bounded('.*lengthscale',1.,10. )
m.constrain_fixed('.*noise',0.0025)
m.optimize()
m.optimize_restarts(num_restarts = 10)
#######################################################
#######################################################
# sample inputs and outputs
X = np.random.uniform(-3.,3.,(50,2))
Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(50,1)*0.05
# define kernel
ker = GPy.kern.Matern52(2,ARD=True) + GPy.kern.white(2)
# create simple GP model
m = GPy.models.GPRegression(X, Y, ker)
# constrain all parameters to be positive
m.constrain_positive('')
# optimize and plot
m.optimize('tnc', max_f_eval = 1000)
m.plot()
print(m)
return(m)
def tuto_kernel_overview():
"""The detailed explanations of the commands used in this file can be found in the tutorial section"""
ker1 = GPy.kern.rbf(1) # Equivalent to ker1 = GPy.kern.rbf(input_dim=1, variance=1., lengthscale=1.)
ker2 = GPy.kern.rbf(input_dim=1, variance = .75, lengthscale=2.)
ker3 = GPy.kern.rbf(1, .5, .5)
print ker2
ker1.plot()
ker2.plot()
ker3.plot()
k1 = GPy.kern.rbf(1,1.,2.)
k2 = GPy.kern.Matern32(1, 0.5, 0.2)
# Product of kernels
k_prod = k1.prod(k2) # By default, tensor=False
k_prodtens = k1.prod(k2,tensor=True)
# Sum of kernels
k_add = k1.add(k2) # By default, tensor=False
k_addtens = k1.add(k2,tensor=True)
k1 = GPy.kern.rbf(1,1.,2)
k2 = GPy.kern.periodic_Matern52(1,variance=1e3, lengthscale=1, period = 1.5, lower=-5., upper = 5)
k = k1 * k2 # equivalent to k = k1.prod(k2)
print k
# Simulate sample paths
X = np.linspace(-5,5,501)[:,None]
Y = np.random.multivariate_normal(np.zeros(501),k.K(X),1)
k1 = GPy.kern.rbf(1)
k2 = GPy.kern.Matern32(1)
k3 = GPy.kern.white(1)
k = k1 + k2 + k3
print k
k.constrain_positive('.*var')
k.constrain_fixed(np.array([1]),1.75)
k.tie_params('.*len')
k.unconstrain('white')
k.constrain_bounded('white',lower=1e-5,upper=.5)
print k
k_cst = GPy.kern.bias(1,variance=1.)
k_mat = GPy.kern.Matern52(1,variance=1., lengthscale=3)
Kanova = (k_cst + k_mat).prod(k_cst + k_mat,tensor=True)
print Kanova
# sample inputs and outputs
X = np.random.uniform(-3.,3.,(40,2))
Y = 0.5*X[:,:1] + 0.5*X[:,1:] + 2*np.sin(X[:,:1]) * np.sin(X[:,1:])
# Create GP regression model
m = GPy.models.GPRegression(X, Y, Kanova)
fig = pb.figure(figsize=(5,5))
ax = fig.add_subplot(111)
m.plot(ax=ax)
pb.figure(figsize=(20,3))
pb.subplots_adjust(wspace=0.5)
axs = pb.subplot(1,5,1)
m.plot(ax=axs)
pb.subplot(1,5,2)
pb.ylabel("= ",rotation='horizontal',fontsize='30')
axs = pb.subplot(1,5,3)
m.plot(ax=axs, which_parts=[False,True,False,False])
pb.ylabel("cst +",rotation='horizontal',fontsize='30')
axs = pb.subplot(1,5,4)
m.plot(ax=axs, which_parts=[False,False,True,False])
pb.ylabel("+ ",rotation='horizontal',fontsize='30')
axs = pb.subplot(1,5,5)
pb.ylabel("+ ",rotation='horizontal',fontsize='30')
m.plot(ax=axs, which_parts=[False,False,False,True])
return(m)
def model_interaction():
X = np.random.randn(20,1)
Y = np.sin(X) + np.random.randn(*X.shape)*0.01 + 5.
k = GPy.kern.rbf(1) + GPy.kern.bias(1)
m = GPy.models.GPRegression(X, Y, kernel=k)
return m
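The regular-expression parameter indexing used throughout these tutorials (m['.*rbf_variance'], constrain_bounded('.*lengthscale', ...)) applies to any model. A short sketch with toy data:
# Sketch of regex-based parameter access, as used in the tutorials above.
import numpy as np
import GPy
X = np.random.randn(20, 1)
Y = np.sin(X) + np.random.randn(20, 1) * 0.05
m = GPy.models.GPRegression(X, Y, GPy.kern.rbf(1) + GPy.kern.bias(1))
m['.*lengthscale'] = 2.                       # set every matching parameter
m.constrain_bounded('.*variance', 1e-3, 10.)  # constrain by regex
print(m)                                      # the printout shows the active constraints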

290
GPy/inference/conjugate_gradient_descent.py Normal file
@@ -0,0 +1,290 @@
'''
Created on 24 Apr 2013
@author: maxz
'''
from GPy.inference.gradient_descent_update_rules import FletcherReeves, \
PolakRibiere
from Queue import Empty
from multiprocessing import Value
from multiprocessing.queues import Queue
from multiprocessing.synchronize import Event
from scipy.optimize.linesearch import line_search_wolfe1, line_search_wolfe2
from threading import Thread
import numpy
import sys
import time
RUNNING = "running"
CONVERGED = "converged"
MAXITER = "maximum number of iterations reached"
MAX_F_EVAL = "maximum number of function calls reached"
LINE_SEARCH = "line search failed"
KBINTERRUPT = "interrupted"
class _Async_Optimization(Thread):
def __init__(self, f, df, x0, update_rule, runsignal, SENTINEL,
report_every=10, messages=0, maxiter=5e3, max_f_eval=15e3,
gtol=1e-6, outqueue=None, *args, **kw):
"""
Helper Process class for async optimization
f_call and df_call are Multiprocessing Values, for synchronized assignment
"""
self.f_call = Value('i', 0)
self.df_call = Value('i', 0)
self.f = self.f_wrapper(f, self.f_call)
self.df = self.f_wrapper(df, self.df_call)
self.x0 = x0
self.update_rule = update_rule
self.report_every = report_every
self.messages = messages
self.maxiter = maxiter
self.max_f_eval = max_f_eval
self.gtol = gtol
self.SENTINEL = SENTINEL
self.runsignal = runsignal
# self.parent = parent
# self.result = None
self.outq = outqueue
super(_Async_Optimization, self).__init__(target=self.run,
name="CG Optimization",
*args, **kw)
# def __enter__(self):
# return self
#
# def __exit__(self, type, value, traceback):
# return isinstance(value, TypeError)
def f_wrapper(self, f, counter):
def f_w(*a, **kw):
counter.value += 1
return f(*a, **kw)
return f_w
def callback(self, *a):
if self.outq is not None:
self.outq.put(a)
# self.parent and self.parent.callback(*a, **kw)
def callback_return(self, *a):
self.callback(*a)
if self.outq is not None:
self.outq.put(self.SENTINEL)
if self.messages:
print ""
self.runsignal.clear()
def run(self, *args, **kwargs):
raise NotImplementedError("Overwrite this with optimization (for async use)")
pass
class _CGDAsync(_Async_Optimization):
def reset(self, xi, *a, **kw):
gi = -self.df(xi, *a, **kw)
si = gi
ur = self.update_rule(gi)
return gi, ur, si
def run(self, *a, **kw):
status = RUNNING
fi = self.f(self.x0)
fi_old = fi + 5000
gi, ur, si = self.reset(self.x0, *a, **kw)
xi = self.x0
xi_old = numpy.nan
it = 0
while it < self.maxiter:
if not self.runsignal.is_set():
break
if self.f_call.value > self.max_f_eval:
status = MAX_F_EVAL
break
gi = -self.df(xi, *a, **kw)
if numpy.dot(gi.T, gi) <= self.gtol:
status = CONVERGED
break
if numpy.isnan(numpy.dot(gi.T, gi)):
if numpy.any(numpy.isnan(xi_old)):
status = CONVERGED
break
gi, ur, si = self.reset(xi_old, *a, **kw)
gammai = ur(gi)
if gammai < 1e-6 or it % xi.shape[0] == 0:
gi, ur, si = self.reset(xi, *a, **kw)
si = gi + gammai * si
alphai, _, _, fi2, fi_old2, gfi = line_search_wolfe1(self.f,
self.df,
xi,
si, gi,
fi, fi_old)
if alphai is None:
alphai, _, _, fi2, fi_old2, gfi = \
line_search_wolfe2(self.f, self.df,
xi, si, gi,
fi, fi_old)
if alphai is None:
# This line search also failed to find a better solution.
status = LINE_SEARCH
break
if fi2 < fi:
fi, fi_old = fi2, fi_old2
if gfi is not None:
gi = gfi
if numpy.isnan(fi) or fi_old < fi:
gi, ur, si = self.reset(xi, *a, **kw)
else:
xi += numpy.dot(alphai, si)
if self.messages:
sys.stdout.write("\r")
sys.stdout.flush()
sys.stdout.write("iteration: {0:> 6g} f:{1:> 12e} |g|:{2:> 12e}".format(it, fi, numpy.dot(gi.T, gi)))
if it % self.report_every == 0:
self.callback(xi, fi, gi, it, self.f_call.value, self.df_call.value, status)
it += 1
else:
status = MAXITER
self.callback_return(xi, fi, gi, it, self.f_call.value, self.df_call.value, status)
self.result = [xi, fi, gi, it, self.f_call.value, self.df_call.value, status]
class Async_Optimize(object):
callback = lambda *x: None
runsignal = Event()
SENTINEL = "SENTINEL"
def async_callback_collect(self, q):
while self.runsignal.is_set():
try:
for ret in iter(lambda: q.get(timeout=1), self.SENTINEL):
self.callback(*ret)
self.runsignal.clear()
except Empty:
pass
def opt_async(self, f, df, x0, callback, update_rule=PolakRibiere,
messages=0, maxiter=5e3, max_f_eval=15e3, gtol=1e-6,
report_every=10, *args, **kwargs):
self.runsignal.set()
c = None
outqueue = None
if callback:
outqueue = Queue()
self.callback = callback
c = Thread(target=self.async_callback_collect, args=(outqueue,))
c.start()
p = _CGDAsync(f, df, x0, update_rule, self.runsignal, self.SENTINEL,
report_every=report_every, messages=messages, maxiter=maxiter,
max_f_eval=max_f_eval, gtol=gtol, outqueue=outqueue, *args, **kwargs)
p.start()
return p, c
def opt(self, f, df, x0, callback=None, update_rule=FletcherReeves,
messages=0, maxiter=5e3, max_f_eval=15e3, gtol=1e-6,
report_every=10, *args, **kwargs):
p, c = self.opt_async(f, df, x0, callback, update_rule, messages,
maxiter, max_f_eval, gtol,
report_every, *args, **kwargs)
while self.runsignal.is_set():
try:
p.join(1)
if c: c.join(1)
except KeyboardInterrupt:
# print "^C"
self.runsignal.clear()
p.join()
if c: c.join()
if c and c.is_alive():
# self.runsignal.set()
# while self.runsignal.is_set():
# try:
# c.join(.1)
# except KeyboardInterrupt:
# # print "^C"
# self.runsignal.clear()
# c.join()
print "WARNING: callback still running, optimisation done!"
return p.result
class CGD(Async_Optimize):
'''
Conjugate gradient descent algorithm to minimize
function f with gradients df, starting at x0
with update rule update_rule
if df returns tuple (grad, natgrad) it will optimize according
to natural gradient rules
'''
opt_name = "Conjugate Gradient Descent"
def opt_async(self, *a, **kw):
"""
opt_async(self, f, df, x0, callback, update_rule=FletcherReeves,
messages=0, maxiter=5e3, max_f_eval=15e3, gtol=1e-6,
report_every=10, *args, **kwargs)
callback gets called every `report_every` iterations
callback(xi, fi, gi, iteration, function_calls, gradient_calls, status_message)
if df returns tuple (grad, natgrad) it will optimize according
to natural gradient rules
f, and df will be called with
f(xi, *args, **kwargs)
df(xi, *args, **kwargs)
**returns**
-----------
Started `Process` object, optimizing asynchronously
**calls**
---------
callback(x_opt, f_opt, g_opt, iteration, function_calls, gradient_calls, status_message)
at end of optimization!
"""
return super(CGD, self).opt_async(*a, **kw)
def opt(self, *a, **kw):
"""
opt(self, f, df, x0, callback=None, update_rule=FletcherReeves,
messages=0, maxiter=5e3, max_f_eval=15e3, gtol=1e-6,
report_every=10, *args, **kwargs)
Minimize f, calling callback every `report_every` iterations with following syntax:
callback(xi, fi, gi, iteration, function_calls, gradient_calls, status_message)
if df returns tuple (grad, natgrad) it will optimize according
to natural gradient rules
f, and df will be called with
f(xi, *args, **kwargs)
df(xi, *args, **kwargs)
**returns**
---------
x_opt, f_opt, g_opt, iteration, function_calls, gradient_calls, status_message
at end of optimization
"""
return super(CGD, self).opt(*a, **kw)
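A synchronous usage sketch for CGD on a toy quadratic (made-up objective; the return values follow the opt docstring above):
# Sketch: minimize a quadratic with the CGD class above.
import numpy
f = lambda x: numpy.dot(x, x)  # objective
df = lambda x: 2 * x           # gradient, returned as a 1D array
result = CGD().opt(f, df, numpy.ones(5), maxiter=100)
x_opt, f_opt, g_opt, its, f_calls, df_calls, status = result
print(status)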

53
GPy/inference/gradient_descent_update_rules.py Normal file
@@ -0,0 +1,53 @@
'''
Created on 24 Apr 2013
@author: maxz
'''
import numpy
class GDUpdateRule():
_gradnat = None
_gradnatold = None
def __init__(self, initgrad, initgradnat=None):
self.grad = initgrad
if initgradnat is not None:
self.gradnat = initgradnat
else:
self.gradnat = initgrad
# self.grad, self.gradnat
def _gamma(self):
raise NotImplementedError("""Implement gamma update rule here,
you can use self.grad and self.gradold for parameters, as well as
self.gradnat and self.gradnatold for natural gradients.""")
def __call__(self, grad, gradnat=None, si=None, *args, **kw):
"""
Return gamma for given gradients and optional natural gradients
"""
if gradnat is None:
gradnat = grad
self.gradold = self.grad
self.gradnatold = self.gradnat
self.grad = grad
self.gradnat = gradnat
self.si = si
return self._gamma(*args, **kw)
class FletcherReeves(GDUpdateRule):
'''
Fletcher Reeves update rule for gamma
'''
def _gamma(self, *a, **kw):
tmp = numpy.dot(self.grad.T, self.gradnat)
if tmp:
return tmp / numpy.dot(self.gradold.T, self.gradnatold)
return tmp
class PolakRibiere(GDUpdateRule):
'''
Polak-Ribiere update rule for gamma
'''
def _gamma(self, *a, **kw):
tmp = numpy.dot((self.grad - self.gradold).T, self.gradnat)
if tmp:
return tmp / numpy.dot(self.gradold.T, self.gradnatold)
return tmp
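A quick numerical sketch of the update rules above, using two made-up successive gradients:
# Sketch: gamma from two successive gradients (made-up numbers).
import numpy
g0 = numpy.array([1., -2.])   # previous gradient
g1 = numpy.array([0.5, 0.1])  # current gradient
fr = FletcherReeves(g0)
print(fr(g1))  # g1.g1 / g0.g0 = 0.26 / 5.0 = 0.052
pr = PolakRibiere(g0)
print(pr(g1))  # (g1 - g0).g1 / g0.g0 = -0.04 / 5.0 = -0.008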

240
GPy/inference/optimization.py Normal file
@@ -0,0 +1,240 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import pylab as pb
import datetime as dt
from scipy import optimize
from warnings import warn
try:
import rasmussens_minimize as rasm
rasm_available = True
except ImportError:
rasm_available = False
from scg import SCG
class Optimizer():
"""
Superclass for all the optimizers.
:param x_init: initial set of parameters
:param f_fp: function that returns the function AND the gradients at the same time
:param f: function to optimize
:param fp: gradients
:param messages: print messages from the optimizer?
:type messages: (True | False)
:param max_f_eval: maximum number of function evaluations
:rtype: optimizer object.
"""
def __init__(self, x_init, messages=False, model=None, max_f_eval=1e4, max_iters=1e3,
ftol=None, gtol=None, xtol=None):
self.opt_name = None
self.x_init = x_init
self.messages = messages
self.f_opt = None
self.x_opt = None
self.funct_eval = None
self.status = None
self.max_f_eval = int(max_f_eval)
self.max_iters = int(max_iters)
self.trace = None
self.time = "Not available"
self.xtol = xtol
self.gtol = gtol
self.ftol = ftol
self.model = model
def run(self, **kwargs):
start = dt.datetime.now()
self.opt(**kwargs)
end = dt.datetime.now()
self.time = str(end - start)
def opt(self, f_fp=None, f=None, fp=None):
raise NotImplementedError, "this needs to be implemented to use the optimizer class"
def plot(self):
if self.trace is None:
print "No trace present so I can't plot it. Please check that the optimizer actually supplies a trace."
else:
pb.figure()
pb.plot(self.trace)
pb.xlabel('Iteration')
pb.ylabel('f(x)')
def __str__(self):
diagnostics = "Optimizer: \t\t\t\t %s\n" % self.opt_name
diagnostics += "f(x_opt): \t\t\t\t %.3f\n" % self.f_opt
diagnostics += "Number of function evaluations: \t %d\n" % self.funct_eval
diagnostics += "Optimization status: \t\t\t %s\n" % self.status
diagnostics += "Time elapsed: \t\t\t\t %s\n" % self.time
return diagnostics
class opt_tnc(Optimizer):
def __init__(self, *args, **kwargs):
Optimizer.__init__(self, *args, **kwargs)
self.opt_name = "TNC (Scipy implementation)"
def opt(self, f_fp=None, f=None, fp=None):
"""
Run the TNC optimizer
"""
tnc_rcstrings = ['Local minimum', 'Converged', 'XConverged', 'Maximum number of f evaluations reached',
'Line search failed', 'Function is constant']
assert f_fp != None, "TNC requires f_fp"
opt_dict = {}
if self.xtol is not None:
opt_dict['xtol'] = self.xtol
if self.ftol is not None:
opt_dict['ftol'] = self.ftol
if self.gtol is not None:
opt_dict['pgtol'] = self.gtol
opt_result = optimize.fmin_tnc(f_fp, self.x_init, messages=self.messages,
maxfun=self.max_f_eval, **opt_dict)
self.x_opt = opt_result[0]
self.f_opt = f_fp(self.x_opt)[0]
self.funct_eval = opt_result[1]
self.status = tnc_rcstrings[opt_result[2]]
class opt_lbfgsb(Optimizer):
def __init__(self, *args, **kwargs):
Optimizer.__init__(self, *args, **kwargs)
self.opt_name = "L-BFGS-B (Scipy implementation)"
def opt(self, f_fp=None, f=None, fp=None):
"""
Run the optimizer
"""
rcstrings = ['Converged', 'Maximum number of f evaluations reached', 'Error']
assert f_fp != None, "BFGS requires f_fp"
if self.messages:
iprint = 1
else:
iprint = -1
opt_dict = {}
if self.xtol is not None:
print "WARNING: l-bfgs-b doesn't have an xtol arg, so I'm going to ignore it"
if self.ftol is not None:
print "WARNING: l-bfgs-b doesn't have an ftol arg, so I'm going to ignore it"
if self.gtol is not None:
opt_dict['pgtol'] = self.gtol
opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint=iprint,
maxfun=self.max_f_eval, **opt_dict)
self.x_opt = opt_result[0]
self.f_opt = f_fp(self.x_opt)[0]
self.funct_eval = opt_result[2]['funcalls']
self.status = rcstrings[opt_result[2]['warnflag']]
class opt_simplex(Optimizer):
def __init__(self, *args, **kwargs):
Optimizer.__init__(self, *args, **kwargs)
self.opt_name = "Nelder-Mead simplex routine (via Scipy)"
def opt(self, f_fp=None, f=None, fp=None):
"""
The simplex optimizer does not require gradients.
"""
statuses = ['Converged', 'Maximum number of function evaluations made', 'Maximum number of iterations reached']
opt_dict = {}
if self.xtol is not None:
opt_dict['xtol'] = self.xtol
if self.ftol is not None:
opt_dict['ftol'] = self.ftol
if self.gtol is not None:
print "WARNING: simplex doesn't have an gtol arg, so I'm going to ignore it"
opt_result = optimize.fmin(f, self.x_init, (), disp=self.messages,
maxfun=self.max_f_eval, full_output=True, **opt_dict)
self.x_opt = opt_result[0]
self.f_opt = opt_result[1]
self.funct_eval = opt_result[3]
self.status = statuses[opt_result[4]]
self.trace = None
class opt_rasm(Optimizer):
def __init__(self, *args, **kwargs):
Optimizer.__init__(self, *args, **kwargs)
self.opt_name = "Rasmussen's Conjugate Gradient"
def opt(self, f_fp=None, f=None, fp=None):
"""
Run Rasmussen's Conjugate Gradient optimizer
"""
assert f_fp is not None, "Rasmussen's minimizer requires f_fp"
statuses = ['Converged', 'Line search failed', 'Maximum number of f evaluations reached',
'NaNs in optimization']
opt_dict = {}
if self.xtol is not None:
print "WARNING: minimize doesn't have an xtol arg, so I'm going to ignore it"
if self.ftol is not None:
print "WARNING: minimize doesn't have an ftol arg, so I'm going to ignore it"
if self.gtol is not None:
print "WARNING: minimize doesn't have an gtol arg, so I'm going to ignore it"
opt_result = rasm.minimize(self.x_init, f_fp, (), messages=self.messages,
maxnumfuneval=self.max_f_eval)
self.x_opt = opt_result[0]
self.f_opt = opt_result[1][-1]
self.funct_eval = opt_result[2]
self.status = statuses[opt_result[3]]
self.trace = opt_result[1]
class opt_SCG(Optimizer):
def __init__(self, *args, **kwargs):
if 'max_f_eval' in kwargs:
warn("max_f_eval deprecated for SCG optimizer: use max_iters instead!\nIgnoring max_f_eval!", FutureWarning)
Optimizer.__init__(self, *args, **kwargs)
self.opt_name = "Scaled Conjugate Gradients"
def opt(self, f_fp=None, f=None, fp=None):
assert f is not None
assert fp is not None
opt_result = SCG(f, fp, self.x_init, display=self.messages,
maxiters=self.max_iters,
max_f_eval=self.max_f_eval,
xtol=self.xtol, ftol=self.ftol,
gtol=self.gtol)
self.x_opt = opt_result[0]
self.trace = opt_result[1]
self.f_opt = self.trace[-1]
self.funct_eval = opt_result[2]
self.status = opt_result[3]
def get_optimizer(f_min):
from sgd import opt_SGD
optimizers = {'fmin_tnc': opt_tnc,
'simplex': opt_simplex,
'lbfgsb': opt_lbfgsb,
'scg': opt_SCG,
'sgd': opt_SGD}
if rasm_available:
optimizers['rasmussen'] = opt_rasm
for opt_name in optimizers.keys():
if opt_name.lower().find(f_min.lower()) != -1:
return optimizers[opt_name]
raise KeyError('No optimizer was found matching the name: %s' % f_min)
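A short sketch of resolving an optimizer by name and running it on a toy objective (name matching is by substring, so 'scg' selects opt_SCG):
# Sketch: look up an optimizer by name and run it (toy objective).
import numpy as np
f = lambda x: float(np.dot(x, x))  # objective
fp = lambda x: 2 * x               # gradient
opt_class = get_optimizer('scg')
o = opt_class(x_init=np.ones(3), max_iters=50)
o.run(f=f, fp=fp)                  # SCG takes f and fp separately
print(o)                           # summary from Optimizer.__str__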

85
GPy/inference/samplers.py Normal file
@@ -0,0 +1,85 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from scipy import linalg, optimize
import pylab as pb
import Tango
import sys
import re
import numdifftools as ndt
import pdb
import cPickle
class Metropolis_Hastings:
def __init__(self,model,cov=None):
"""Metropolis Hastings, with tunings according to Gelman et al. """
self.model = model
current = self.model._get_params_transformed()
self.D = current.size
self.chains = []
if cov is None:
self.cov = model.Laplace_covariance()
else:
self.cov = cov
self.scale = 2.4/np.sqrt(self.D)
self.new_chain(current)
def new_chain(self, start=None):
self.chains.append([])
if start is None:
self.model.randomize()
else:
self.model._set_params_transformed(start)
def sample(self, Ntotal, Nburn, Nthin, tune=True, tune_throughout=False, tune_interval=400):
current = self.model._get_params_transformed()
fcurrent = self.model.log_likelihood() + self.model.log_prior()
accepted = np.zeros(Ntotal,dtype=np.bool)
for it in range(Ntotal):
print "sample %d of %d\r"%(it,Ntotal),
sys.stdout.flush()
prop = np.random.multivariate_normal(current, self.cov*self.scale*self.scale)
self.model._set_params_transformed(prop)
fprop = self.model.log_likelihood() + self.model.log_prior()
if fprop>fcurrent:#sample accepted, going 'uphill'
accepted[it] = True
current = prop
fcurrent = fprop
else:
u = np.random.rand()
if np.exp(fprop-fcurrent)>u:#sample accepted downhill
accepted[it] = True
current = prop
fcurrent = fprop
#store current value
if (it > Nburn) & ((it%Nthin)==0):
self.chains[-1].append(current)
#tuning!
if it & ((it%tune_interval)==0) & tune & ((it<Nburn) | (tune_throughout)):
pc = np.mean(accepted[it-tune_interval:it])
self.cov = np.cov(np.vstack(self.chains[-1][-tune_interval:]).T)
if pc > .25:
self.scale *= 1.1
if pc < .15:
self.scale /= 1.1
def predict(self,function,args):
"""Make a prediction for the function, to which we will pass the additional arguments"""
param = self.model._get_params()
fs = []
for p in self.chains[-1]:
self.model._set_params(p)
fs.append(function(*args))
self.model._set_params(param)# reset model to starting state
return fs
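A usage sketch for the sampler, assuming m is an already-built GPy model (the constructor calls m.Laplace_covariance() when no proposal covariance is given):
# Sketch: sample a model's hyperparameter posterior (assumes a built model `m`).
import numpy as np
mh = Metropolis_Hastings(m)
mh.sample(Ntotal=2000, Nburn=500, Nthin=5)
samples = np.vstack(mh.chains[-1])  # thinned, post-burn-in samples
print(samples.mean(axis=0))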

191
GPy/inference/scg.py Normal file
@@ -0,0 +1,191 @@
# Copyright I. Nabney, N.Lawrence and James Hensman (1996 - 2012)
# Scaled Conjugate Gradients, originally in Matlab as part of the Netlab toolbox by I. Nabney, converted to python by N. Lawrence and given a pythonic interface by James Hensman
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
# HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
# NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
import numpy as np
import sys
def print_out(len_maxiters, fnow, current_grad, beta, iteration):
print '\r',
print '{0:>0{mi}g} {1:> 12e} {2:> 12e} {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r',
sys.stdout.flush()
def exponents(fnow, current_grad):
exps = [np.abs(fnow), current_grad]
return np.sign(exps) * np.log10(exps).astype(int)
def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True, xtol=None, ftol=None, gtol=None):
"""
Optimisation through Scaled Conjugate Gradients (SCG)
f: the objective function
gradf : the gradient function (should return a 1D np.ndarray)
x : the initial condition
Returns
x the optimal value for x
flog : a list of all the objective values
function_eval number of fn evaluations
status: string describing convergence status
"""
if xtol is None:
xtol = 1e-6
if ftol is None:
ftol = 1e-6
if gtol is None:
gtol = 1e-5
sigma0 = 1.0e-8
fold = f(x, *optargs) # Initial function value.
function_eval = 1
fnow = fold
gradnew = gradf(x, *optargs) # Initial gradient.
if any(np.isnan(gradnew)):
raise ValueError("Initial gradient contains NaN values")
current_grad = np.dot(gradnew, gradnew)
gradold = gradnew.copy()
d = -gradnew # Initial search direction.
success = True # Force calculation of directional derivs.
nsuccess = 0 # nsuccess counts number of successes.
beta = 1.0 # Initial scale parameter.
betamin = 1.0e-60 # Lower bound on scale.
betamax = 1.0e50 # Upper bound on scale.
status = "Not converged"
flog = [fold]
iteration = 0
len_maxiters = len(str(maxiters))
if display:
print ' {0:{mi}s} {1:11s} {2:11s} {3:11s}'.format("I", "F", "Scale", "|g|", mi=len_maxiters)
exps = exponents(fnow, current_grad)
p_iter = iteration
# Main optimization loop.
while iteration < maxiters:
# Calculate first and second directional derivatives.
if success:
mu = np.dot(d, gradnew)
if mu >= 0:
d = -gradnew
mu = np.dot(d, gradnew)
kappa = np.dot(d, d)
sigma = sigma0 / np.sqrt(kappa)
xplus = x + sigma * d
gplus = gradf(xplus, *optargs)
theta = np.dot(d, (gplus - gradnew)) / sigma
# Increase effective curvature and evaluate step size alpha.
delta = theta + beta * kappa
if delta <= 0:
delta = beta * kappa
beta = beta - theta / kappa
alpha = -mu / delta
# Calculate the comparison ratio.
xnew = x + alpha * d
fnew = f(xnew, *optargs)
function_eval += 1
# if function_eval >= max_f_eval:
# status = "maximum number of function evaluations exceeded"
# break
# return x, flog, function_eval, status
Delta = 2.*(fnew - fold) / (alpha * mu)
if Delta >= 0.:
success = True
nsuccess += 1
x = xnew
fnow = fnew
else:
success = False
fnow = fold
# Store relevant variables
flog.append(fnow) # Current function value
iteration += 1
if display:
print_out(len_maxiters, fnow, current_grad, beta, iteration)
n_exps = exponents(fnow, current_grad)
if iteration - p_iter >= 20 * np.random.rand():
a = iteration >= p_iter * 2.78
b = np.any(n_exps < exps)
if a or b:
p_iter = iteration
print ''
if b:
exps = n_exps
if success:
# Test for termination
if (np.abs(fnew - fold) < ftol):
status = 'converged - relative reduction in objective'
break
# return x, flog, function_eval, status
elif (np.max(np.abs(alpha * d)) < xtol):
status = 'converged - relative stepsize'
break
else:
# Update variables for new position
gradnew = gradf(x, *optargs)
current_grad = np.dot(gradnew, gradnew)
gradold = gradnew
fold = fnew
# If the gradient is zero then we are done.
if current_grad <= gtol:
status = 'converged - relative reduction in gradient'
break
# return x, flog, function_eval, status
# Adjust beta according to comparison ratio.
if Delta < 0.25:
beta = min(4.0 * beta, betamax)
if Delta > 0.75:
beta = max(0.5 * beta, betamin)
# Update search direction using Polak-Ribiere formula, or re-start
# in direction of negative gradient after nparams steps.
if nsuccess == x.size:
d = -gradnew
# beta = 1. # TODO: betareset!!
nsuccess = 0
elif success:
Gamma = np.dot(gradold - gradnew, gradnew) / (mu)
d = Gamma * d - gradnew
else:
# If we get here, then we haven't terminated in the given number of
# iterations.
status = "maxiter exceeded"
if display:
print_out(len_maxiters, fnow, current_grad, beta, iteration)
print ""
print status
return x, flog, function_eval, status
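SCG can also be called directly on any objective/gradient pair; a standalone sketch on a toy quadratic (the four return values are as documented above):
# Sketch: run SCG directly on a simple quadratic.
import numpy as np
f = lambda x: float(np.dot(x, x))
gradf = lambda x: 2 * x
x_opt, flog, n_evals, status = SCG(f, gradf, np.array([3., -1., 2.]), display=False)
print(status)  # e.g. 'converged - relative reduction in objective'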

356
GPy/inference/sgd.py Normal file
@@ -0,0 +1,356 @@
import numpy as np
import scipy as sp
import scipy.sparse
from optimization import Optimizer
from scipy import linalg, optimize
import pylab as plt
import copy, sys, pickle
class opt_SGD(Optimizer):
"""
Optimize using stochastic gradient descent.
*** Parameters ***
Model: reference to the Model object
iterations: number of iterations
learning_rate: learning rate
momentum: momentum
"""
def __init__(self, start, iterations = 10, learning_rate = 1e-4, momentum = 0.9, model = None, messages = False, batch_size = 1, self_paced = False, center = True, iteration_file = None, learning_rate_adaptation=None, actual_iter=None, schedule=None, **kwargs):
self.opt_name = "Stochastic Gradient Descent"
self.Model = model
self.iterations = iterations
self.momentum = momentum
self.learning_rate = learning_rate
self.x_opt = None
self.f_opt = None
self.messages = messages
self.batch_size = batch_size
self.self_paced = self_paced
self.center = center
self.param_traces = [('noise',[])]
self.iteration_file = iteration_file
self.learning_rate_adaptation = learning_rate_adaptation
self.actual_iter = actual_iter
if self.learning_rate_adaptation is not None:
if self.learning_rate_adaptation == 'annealing':
self.learning_rate_0 = self.learning_rate
else:
self.learning_rate_0 = self.learning_rate.mean()
self.schedule = schedule
# if len([p for p in self.model.kern.parts if p.name == 'bias']) == 1:
# self.param_traces.append(('bias',[]))
# if len([p for p in self.model.kern.parts if p.name == 'linear']) == 1:
# self.param_traces.append(('linear',[]))
# if len([p for p in self.model.kern.parts if p.name == 'rbf']) == 1:
# self.param_traces.append(('rbf_var',[]))
self.param_traces = dict(self.param_traces)
self.fopt_trace = []
num_params = len(self.Model._get_params())
if isinstance(self.learning_rate, float):
self.learning_rate = np.ones((num_params,)) * self.learning_rate
assert (len(self.learning_rate) == num_params), "there must be one learning rate per parameter"
def __str__(self):
status = "\nOptimizer: \t\t\t %s\n" % self.opt_name
status += "f(x_opt): \t\t\t %.4f\n" % self.f_opt
status += "Number of iterations: \t\t %d\n" % self.iterations
status += "Learning rate: \t\t\t max %.3f, min %.3f\n" % (self.learning_rate.max(), self.learning_rate.min())
status += "Momentum: \t\t\t %.3f\n" % self.momentum
status += "Batch size: \t\t\t %d\n" % self.batch_size
status += "Time elapsed: \t\t\t %s\n" % self.time
return status
def plot_traces(self):
plt.figure()
plt.subplot(211)
plt.title('Parameters')
for k in self.param_traces.keys():
plt.plot(self.param_traces[k], label=k)
plt.legend(loc=0)
plt.subplot(212)
plt.title('Objective function')
plt.plot(self.fopt_trace)
def non_null_samples(self, data):
return (np.isnan(data).sum(axis=1) == 0)
def check_for_missing(self, data):
if sp.sparse.issparse(self.Model.likelihood.Y):
return True
else:
return np.isnan(data).sum() > 0
def subset_parameter_vector(self, x, samples, param_shapes):
subset = np.array([], dtype = int)
x = np.arange(0, len(x))
i = 0
for s in param_shapes:
N, input_dim = s
X = x[i:i+N*input_dim].reshape(N, input_dim)
X = X[samples]
subset = np.append(subset, X.flatten())
i += N*input_dim
subset = np.append(subset, x[i:])
return subset
def shift_constraints(self, j):
constrained_indices = copy.deepcopy(self.Model.constrained_indices)
for c, constraint in enumerate(constrained_indices):
mask = (np.ones_like(constrained_indices[c]) == 1)
for i in range(len(constrained_indices[c])):
pos = np.where(j == constrained_indices[c][i])[0]
if len(pos) == 1:
self.Model.constrained_indices[c][i] = pos
else:
mask[i] = False
self.Model.constrained_indices[c] = self.Model.constrained_indices[c][mask]
return constrained_indices
# back them up
# bounded_i = copy.deepcopy(self.Model.constrained_bounded_indices)
# bounded_l = copy.deepcopy(self.Model.constrained_bounded_lowers)
# bounded_u = copy.deepcopy(self.Model.constrained_bounded_uppers)
# for b in range(len(bounded_i)): # for each group of constraints
# for bc in range(len(bounded_i[b])):
# pos = np.where(j == bounded_i[b][bc])[0]
# if len(pos) == 1:
# pos2 = np.where(self.Model.constrained_bounded_indices[b] == bounded_i[b][bc])[0][0]
# self.Model.constrained_bounded_indices[b][pos2] = pos[0]
# else:
# if len(self.Model.constrained_bounded_indices[b]) == 1:
# # if it's the last index to be removed
# # the logic here is just a mess. If we remove the last one, then all the
# # b-indices change and we have to iterate through everything to find our
# # current index. Can't deal with this right now.
# raise NotImplementedError
# else: # just remove it from the indices
# mask = self.Model.constrained_bounded_indices[b] != bc
# self.Model.constrained_bounded_indices[b] = self.Model.constrained_bounded_indices[b][mask]
# here we shift the positive constraints. We cycle through each positive
# # constraint
# positive = self.Model.constrained_positive_indices.copy()
# mask = (np.ones_like(positive) == 1)
# for p in range(len(positive)):
# # we now check whether the constrained index appears in the j vector
# # (the vector of the "active" indices)
# pos = np.where(j == self.Model.constrained_positive_indices[p])[0]
# if len(pos) == 1:
# self.Model.constrained_positive_indices[p] = pos
# else:
# mask[p] = False
# self.Model.constrained_positive_indices = self.Model.constrained_positive_indices[mask]
# return (bounded_i, bounded_l, bounded_u), positive
def restore_constraints(self, c):#b, p):
# self.Model.constrained_bounded_indices = b[0]
# self.Model.constrained_bounded_lowers = b[1]
# self.Model.constrained_bounded_uppers = b[2]
# self.Model.constrained_positive_indices = p
self.Model.constrained_indices = c
def get_param_shapes(self, N = None, input_dim = None):
model_name = self.Model.__class__.__name__
if model_name == 'GPLVM':
return [(N, input_dim)]
if model_name == 'Bayesian_GPLVM':
return [(N, input_dim), (N, input_dim)]
else:
raise NotImplementedError
def step_with_missing_data(self, f_fp, X, step, shapes):
N, input_dim = X.shape
if not sp.sparse.issparse(self.Model.likelihood.Y):
Y = self.Model.likelihood.Y
samples = self.non_null_samples(self.Model.likelihood.Y)
self.Model.N = samples.sum()
Y = Y[samples]
else:
samples = self.Model.likelihood.Y.nonzero()[0]
self.Model.N = len(samples)
Y = np.asarray(self.Model.likelihood.Y[samples].todense(), dtype = np.float64)
if self.Model.N == 0 or Y.std() == 0.0:
return 0, step, self.Model.N
self.Model.likelihood._offset = Y.mean()
self.Model.likelihood._scale = Y.std()
self.Model.likelihood.set_data(Y)
# self.Model.likelihood.V = self.Model.likelihood.Y*self.Model.likelihood.precision
sigma = self.Model.likelihood._variance
self.Model.likelihood._variance = None # invalidate cache
self.Model.likelihood._set_params(sigma)
j = self.subset_parameter_vector(self.x_opt, samples, shapes)
self.Model.X = X[samples]
model_name = self.Model.__class__.__name__
if model_name == 'Bayesian_GPLVM':
self.Model.likelihood.YYT = np.dot(self.Model.likelihood.Y, self.Model.likelihood.Y.T)
self.Model.likelihood.trYYT = np.trace(self.Model.likelihood.YYT)
ci = self.shift_constraints(j)
f, fp = f_fp(self.x_opt[j])
step[j] = self.momentum * step[j] + self.learning_rate[j] * fp
self.x_opt[j] -= step[j]
self.restore_constraints(ci)
self.Model.grads[j] = fp
# restore likelihood _offset and _scale, otherwise when we call set_data(y) on
# the next feature, it will get normalized with the mean and std of this one.
self.Model.likelihood._offset = 0
self.Model.likelihood._scale = 1
return f, step, self.Model.N
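# A note on the masking above (illustrative sketch; the array values are
# made up): dense targets keep only the rows without NaNs, sparse targets
# keep the rows that are explicitly stored.
#   Y = np.array([[1., 2.], [np.nan, 3.], [4., 5.]])
#   rows = np.isnan(Y).sum(axis=1) == 0   # -> [True, False, True]
#   Y_obs = Y[rows]                       # the 'samples' selected above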
def adapt_learning_rate(self, t, D):
if self.learning_rate_adaptation == 'adagrad':
if t > 0:
g_k = self.Model.grads
self.s_k += np.square(g_k)
t0 = 100.0
self.learning_rate = 0.1/(t0 + np.sqrt(self.s_k))
else:
self.learning_rate = np.zeros_like(self.learning_rate)
self.s_k = np.zeros_like(self.x_opt)
elif self.learning_rate_adaptation == 'annealing':
#self.learning_rate = self.learning_rate_0/(1+float(t+1)/10)
self.learning_rate = np.ones_like(self.learning_rate) * self.schedule[t]
elif self.learning_rate_adaptation == 'semi_pesky':
if self.Model.__class__.__name__ == 'Bayesian_GPLVM':
g_t = self.Model.grads
if t == 0:
self.hbar_t = 0.0
self.tau_t = 100.0
self.gbar_t = 0.0
self.gbar_t = (1-1/self.tau_t)*self.gbar_t + 1/self.tau_t * g_t
self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t.T, g_t)
self.learning_rate = np.ones_like(self.learning_rate)*(np.dot(self.gbar_t.T, self.gbar_t) / self.hbar_t)
self.tau_t = self.tau_t*(1-self.learning_rate) + 1
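# The three schemes above in brief: 'adagrad' accumulates squared gradients
# and scales a base rate by their root, roughly
#   s += g**2;  eta = eta0 / (t0 + np.sqrt(s));  x -= eta * g
# (eta0 and t0 are illustrative constants); 'annealing' reads the rate from
# a user-supplied schedule[t]; 'semi_pesky' appears to follow the vSGD rule
# of Schaul et al. (2013), "No more pesky learning rates", with gbar_t and
# hbar_t as running averages of the gradient and its squared norm.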
def opt(self, f_fp=None, f=None, fp=None):
self.x_opt = self.Model._get_params_transformed()
self.grads = []
X, Y = self.Model.X.copy(), self.Model.likelihood.Y.copy()
self.Model.likelihood.YYT = 0
self.Model.likelihood.trYYT = 0
self.Model.likelihood._offset = 0.0
self.Model.likelihood._scale = 1.0
N, input_dim = self.Model.X.shape
D = self.Model.likelihood.Y.shape[1]
params = self.Model._get_params()
self.trace = []
status = ""
missing_data = self.check_for_missing(self.Model.likelihood.Y)
step = np.zeros_like(params)
for it in range(self.iterations):
if self.actual_iter is not None:
it = self.actual_iter
self.Model.grads = np.zeros_like(self.x_opt) # TODO this is ugly
if it == 0 or self.self_paced is False:
features = np.random.permutation(Y.shape[1])
else:
features = np.argsort(NLL)
b = len(features)/self.batch_size
features = [features[i::b] for i in range(b)]
NLL = []
for count, j in enumerate(features):
self.Model.input_dim = len(j)
self.Model.likelihood.input_dim = len(j)
self.Model.likelihood.set_data(Y[:, j])
# self.Model.likelihood.V = self.Model.likelihood.Y*self.Model.likelihood.precision
sigma = self.Model.likelihood._variance
self.Model.likelihood._variance = None # invalidate cache
self.Model.likelihood._set_params(sigma)
if missing_data:
shapes = self.get_param_shapes(N, input_dim)
f, step, Nj = self.step_with_missing_data(f_fp, X, step, shapes)
else:
self.Model.likelihood.YYT = np.dot(self.Model.likelihood.Y, self.Model.likelihood.Y.T)
self.Model.likelihood.trYYT = np.trace(self.Model.likelihood.YYT)
Nj = N
f, fp = f_fp(self.x_opt)
self.Model.grads = fp.copy()
step = self.momentum * step + self.learning_rate * fp
self.x_opt -= step
if self.messages == 2:
noise = self.Model.likelihood._variance
status = "evaluating {feature: 5d}/{tot: 5d} \t f: {f: 2.3f} \t non-missing: {nm: 4d}\t noise: {noise: 2.4f}\r".format(feature = count, tot = len(features), f = f, nm = Nj, noise = noise)
sys.stdout.write(status)
sys.stdout.flush()
self.param_traces['noise'].append(noise)
self.adapt_learning_rate(it+count, D)
NLL.append(f)
self.fopt_trace.append(NLL[-1])
# fig = plt.figure('traces')
# plt.clf()
# plt.plot(self.param_traces['noise'])
# for k in self.param_traces.keys():
# self.param_traces[k].append(self.Model.get(k)[0])
self.grads.append(self.Model.grads.tolist())
# should really be a sum(), but earlier samples in the iteration will have a very crappy ll
self.f_opt = np.mean(NLL)
self.Model.N = N
self.Model.X = X
self.Model.input_dim = D
self.Model.likelihood.N = N
self.Model.likelihood.input_dim = D
self.Model.likelihood.Y = Y
sigma = self.Model.likelihood._variance
self.Model.likelihood._variance = None # invalidate cache
self.Model.likelihood._set_params(sigma)
self.trace.append(self.f_opt)
if self.iteration_file is not None:
f = open(self.iteration_file + "iteration%d.pickle" % it, 'w')
data = [self.x_opt, self.fopt_trace, self.param_traces]
pickle.dump(data, f)
f.close()
if self.messages != 0:
sys.stdout.write('\r' + ' '*len(status)*2 + ' \r')
status = "SGD Iteration: {0: 3d}/{1: 3d} f: {2: 2.3f} max eta: {3: 1.5f}\n".format(it+1, self.iterations, self.f_opt, self.learning_rate.max())
sys.stdout.write(status)
sys.stdout.flush()
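The core of opt() above is a classical momentum update on the transformed parameter vector. A self-contained sketch of just that update on a toy quadratic objective (all names local to this sketch):

import numpy as np

def f_fp(x):
    # toy objective 0.5*||x||^2 and its gradient
    return 0.5 * np.dot(x, x), x

x = np.ones(5)
step = np.zeros_like(x)
learning_rate, momentum = 0.1, 0.9
for it in range(100):
    f, fp = f_fp(x)
    step = momentum * step + learning_rate * fp  # the same update used in opt()
    x -= step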

9
GPy/kern/__init__.py Normal file
View file

@ -0,0 +1,9 @@
# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from constructors import *
try:
from constructors import rbf_sympy, sympykern # these depend on sympy
except ImportError:
pass
from kern import *

423
GPy/kern/constructors.py Normal file
View file

@ -0,0 +1,423 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from kern import kern
import parts
def rbf_inv(input_dim,variance=1., inv_lengthscale=None,ARD=False):
"""
Construct an RBF kernel
:param input_dim: dimensionality of the kernel, obligatory
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
:param inv_lengthscale: the inverse lengthscale of the kernel
:type inv_lengthscale: float
:param ARD: Auto Relevance Determination (one lengthscale per dimension)
:type ARD: Boolean
"""
part = parts.rbf_inv.RBFInv(input_dim,variance,inv_lengthscale,ARD)
return kern(input_dim, [part])
def rbf(input_dim,variance=1., lengthscale=None,ARD=False):
"""
Construct an RBF kernel
:param input_dim: dimensionality of the kernel, obligatory
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
:param lengthscale: the lengthscale of the kernel
:type lengthscale: float
:param ARD: Auto Relevance Determination (one lengthscale per dimension)
:type ARD: Boolean
"""
part = parts.rbf.RBF(input_dim,variance,lengthscale,ARD)
return kern(input_dim, [part])
def linear(input_dim,variances=None,ARD=False):
"""
Construct a linear kernel.
Arguments
---------
input_dim (int), obligatory
variances (np.ndarray)
ARD (boolean)
"""
part = parts.linear.Linear(input_dim,variances,ARD)
return kern(input_dim, [part])
def mlp(input_dim,variance=1., weight_variance=None,bias_variance=100.,ARD=False):
"""
Construct an MLP kernel
:param input_dim: dimensionality of the kernel, obligatory
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
:param weight_variance: the variance of the weights in the neural network (one value per input dimension if ARD)
:type weight_variance: float or np.ndarray
:param bias_variance: the variance of the biases in the neural network.
:type bias_variance: float
:param ARD: Auto Relevance Determination (allows for ARD version of covariance)
:type ARD: Boolean
"""
part = parts.mlp.MLP(input_dim,variance,weight_variance,bias_variance,ARD)
return kern(input_dim, [part])
def gibbs(input_dim,variance=1., mapping=None):
"""
Gibbs and MacKay non-stationary covariance function.
.. math::
r = \sqrt{(x_i - x_j)^\top (x_i - x_j)}
k(x_i, x_j) = \sigma^2 Z \exp\left(-\frac{r^2}{l(x_i)^2 + l(x_j)^2}\right)
Z = \sqrt{\frac{2 l(x_i) l(x_j)}{l(x_i)^2 + l(x_j)^2}}
where :math:`l(x)` is a function giving the length scale as a function of space.
This is the non stationary kernel proposed by Mark Gibbs in his 1997
thesis. It is similar to an RBF but has a length scale that varies
with input location. This leads to an additional term in front of
the kernel.
The parameters are :math:`\sigma^2`, the process variance, and the parameters of :math:`l(x)`, a function that can be specified by the user; by default a multi-layer perceptron is used.
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance :math:`\sigma^2`
:type variance: float
:param mapping: the mapping that gives the lengthscale across the input space.
:type mapping: GPy.core.Mapping
:rtype: kern object
"""
part = parts.gibbs.Gibbs(input_dim,variance,mapping)
return kern(input_dim, [part])
def poly(input_dim,variance=1., weight_variance=None,bias_variance=1.,degree=2, ARD=False):
"""
Construct a polynomial kernel
:param input_dim: dimensionality of the kernel, obligatory
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
:param weight_variance: the variance of the weights
:type weight_variance: float or np.ndarray
:param bias_variance: the variance of the biases.
:type bias_variance: float
:param degree: the degree of the polynomial
:type degree: int
:param ARD: Auto Relevance Determination (allows for ARD version of covariance)
:type ARD: Boolean
"""
part = parts.poly.POLY(input_dim,variance,weight_variance,bias_variance,degree,ARD)
return kern(input_dim, [part])
def white(input_dim,variance=1.):
"""
Construct a white kernel.
Arguments
---------
input_dim (int), obligatory
variance (float)
"""
part = parts.white.White(input_dim,variance)
return kern(input_dim, [part])
def exponential(input_dim,variance=1., lengthscale=None, ARD=False):
"""
Construct an exponential kernel
:param input_dim: dimensionality of the kernel, obligatory
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
:param lengthscale: the lengthscale of the kernel
:type lengthscale: float
:param ARD: Auto Relevance Determination (one lengthscale per dimension)
:type ARD: Boolean
"""
part = parts.exponential.Exponential(input_dim,variance, lengthscale, ARD)
return kern(input_dim, [part])
def Matern32(input_dim,variance=1., lengthscale=None, ARD=False):
"""
Construct a Matern 3/2 kernel.
:param input_dim: dimensionality of the kernel, obligatory
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
:param lengthscale: the lengthscale of the kernel
:type lengthscale: float
:param ARD: Auto Relevance Determination (one lengthscale per dimension)
:type ARD: Boolean
"""
part = parts.Matern32.Matern32(input_dim,variance, lengthscale, ARD)
return kern(input_dim, [part])
def Matern52(input_dim, variance=1., lengthscale=None, ARD=False):
"""
Construct a Matern 5/2 kernel.
:param input_dim: dimensionality of the kernel, obligatory
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
:param lengthscale: the lengthscale of the kernel
:type lengthscale: float
:param ARD: Auto Relevance Determination (one lengthscale per dimension)
:type ARD: Boolean
"""
part = parts.Matern52.Matern52(input_dim, variance, lengthscale, ARD)
return kern(input_dim, [part])
def bias(input_dim, variance=1.):
"""
Construct a bias kernel.
Arguments
---------
input_dim (int), obligatory
variance (float)
"""
part = parts.bias.Bias(input_dim, variance)
return kern(input_dim, [part])
def finite_dimensional(input_dim, F, G, variances=1., weights=None):
"""
Construct a finite dimensional kernel.
input_dim: int - the number of input dimensions
F: np.array of functions with shape (n,) - the n basis functions
G: np.array with shape (n,n) - the Gram matrix associated to F
variances : np.ndarray with shape (n,)
"""
part = parts.finite_dimensional.FiniteDimensional(input_dim, F, G, variances, weights)
return kern(input_dim, [part])
def spline(input_dim, variance=1.):
"""
Construct a spline kernel.
:param input_dim: Dimensionality of the kernel
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
"""
part = parts.spline.Spline(input_dim, variance)
return kern(input_dim, [part])
def Brownian(input_dim, variance=1.):
"""
Construct a Brownian motion kernel.
:param input_dim: Dimensionality of the kernel
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
"""
part = parts.Brownian.Brownian(input_dim, variance)
return kern(input_dim, [part])
try:
import sympy as sp
from sympykern import spkern
from sympy.parsing.sympy_parser import parse_expr
sympy_available = True
except ImportError:
sympy_available = False
if sympy_available:
def rbf_sympy(input_dim, ARD=False, variance=1., lengthscale=1.):
"""
Radial Basis Function covariance.
"""
X = [sp.var('x%i' % i) for i in range(input_dim)]
Z = [sp.var('z%i' % i) for i in range(input_dim)]
rbf_variance = sp.var('rbf_variance',positive=True)
if ARD:
rbf_lengthscales = [sp.var('rbf_lengthscale_%i' % i, positive=True) for i in range(input_dim)]
dist_string = ' + '.join(['(x%i-z%i)**2/rbf_lengthscale_%i**2' % (i, i, i) for i in range(input_dim)])
dist = parse_expr(dist_string)
f = rbf_variance*sp.exp(-dist/2.)
else:
rbf_lengthscale = sp.var('rbf_lengthscale',positive=True)
dist_string = ' + '.join(['(x%i-z%i)**2' % (i, i) for i in range(input_dim)])
dist = parse_expr(dist_string)
f = rbf_variance*sp.exp(-dist/(2*rbf_lengthscale**2))
return kern(input_dim, [spkern(input_dim, f)])
def sympykern(input_dim, k):
"""
A kernel from a symbolic sympy representation
"""
return kern(input_dim, [spkern(input_dim, k)])
del sympy_available
def periodic_exponential(input_dim=1, variance=1., lengthscale=None, period=2 * np.pi, n_freq=10, lower=0., upper=4 * np.pi):
"""
Construct a periodic exponential kernel
:param input_dim: dimensionality, only defined for input_dim=1
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
:param lengthscale: the lengthscale of the kernel
:type lengthscale: float
:param period: the period
:type period: float
:param n_freq: the number of frequencies considered for the periodic subspace
:type n_freq: int
"""
part = parts.periodic_exponential.PeriodicExponential(input_dim, variance, lengthscale, period, n_freq, lower, upper)
return kern(input_dim, [part])
def periodic_Matern32(input_dim, variance=1., lengthscale=None, period=2 * np.pi, n_freq=10, lower=0., upper=4 * np.pi):
"""
Construct a periodic Matern 3/2 kernel.
:param input_dim: dimensionality, only defined for input_dim=1
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
:param lengthscale: the lengthscale of the kernel
:type lengthscale: float
:param period: the period
:type period: float
:param n_freq: the number of frequencies considered for the periodic subspace
:type n_freq: int
"""
part = parts.periodic_Matern32.PeriodicMatern32(input_dim, variance, lengthscale, period, n_freq, lower, upper)
return kern(input_dim, [part])
def periodic_Matern52(input_dim, variance=1., lengthscale=None, period=2 * np.pi, n_freq=10, lower=0., upper=4 * np.pi):
"""
Construct a periodic Matern 5/2 kernel.
:param input_dim: dimensionality, only defined for input_dim=1
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
:param lengthscale: the lengthscale of the kernel
:type lengthscale: float
:param period: the period
:type period: float
:param n_freq: the number of frequencies considered for the periodic subspace
:type n_freq: int
"""
part = parts.periodic_Matern52.PeriodicMatern52(input_dim, variance, lengthscale, period, n_freq, lower, upper)
return kern(input_dim, [part])
def prod(k1,k2,tensor=False):
"""
Construct a product kernel from two existing kernels.
:param k1, k2: the kernels to multiply
:type k1, k2: kernpart
:param tensor: if False (default) the kernels are multiplied as functions defined on the same input space; if True, on the tensor product of their input spaces
:type tensor: Boolean
:rtype: kernel object
"""
part = parts.prod.Prod(k1, k2, tensor)
return kern(part.input_dim, [part])
def symmetric(k):
"""
Construct a symmetric kernel from an existing kernel
"""
k_ = k.copy()
k_.parts = [parts.symmetric.Symmetric(p) for p in k.parts]
return k_
def coregionalise(output_dim, rank=1, W=None, kappa=None):
"""
Coregionalisation kernel.
Used for computing covariance functions of the form
.. math::
k_2(x, y)=\mathbf{B} k(x, y)
where
.. math::
\mathbf{B} = \mathbf{W}\mathbf{W}^\top + \kappa \mathbf{I}
:param output_dim: the number of output dimensions
:type output_dim: int
:param rank: the rank of the coregionalisation matrix.
:type rank: int
:param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B.
:type W: ndarray
:param kappa: a diagonal term which allows the outputs to behave independently.
:rtype: kernel object
.. Note: see coregionalisation examples in GPy.examples.regression for some usage.
"""
p = parts.coregionalise.Coregionalise(output_dim,rank,W,kappa)
return kern(1,[p])
def rational_quadratic(input_dim, variance=1., lengthscale=1., power=1.):
"""
Construct rational quadratic kernel.
:param input_dim: the number of input dimensions
:type input_dim: int (input_dim=1 is the only value currently supported)
:param variance: the variance :math:`\sigma^2`
:type variance: float
:param lengthscale: the lengthscale :math:`\ell`
:type lengthscale: float
:param power: the power of the kernel
:type power: float
:rtype: kern object
"""
part = parts.rational_quadratic.RationalQuadratic(input_dim, variance, lengthscale, power)
return kern(input_dim, [part])
def fixed(input_dim, K, variance=1.):
"""
Construct a Fixed effect kernel.
:param input_dim: the number of input dimensions
:type input_dim: int (input_dim=1 is the only value currently supported)
:param K: the fixed covariance matrix
:type K: np.array
:param variance: kernel variance
:type variance: float
:rtype: kern object
"""
part = parts.fixed.Fixed(input_dim, K, variance)
return kern(input_dim, [part])
def rbfcos(input_dim, variance=1., frequencies=None, bandwidths=None, ARD=False):
"""
Construct an rbfcos kernel.
"""
part = parts.rbfcos.RBFCos(input_dim, variance, frequencies, bandwidths, ARD)
return kern(input_dim, [part])
def independent_outputs(k):
"""
Construct a kernel with independent outputs from an existing kernel
"""
for sl in k.input_slices:
assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)"
_parts = [parts.independent_outputs.IndependentOutputs(p) for p in k.parts]
return kern(k.input_dim+1,_parts)
def hierarchical(k):
"""
TODO: this can't be right! Construct a hierarchical kernel from an existing kernel.
"""
# for sl in k.input_slices:
# assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)"
_parts = [parts.hierarchical.Hierarchical(k.parts)]
return kern(k.input_dim+len(k.parts),_parts)
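Each constructor above wraps one Kernpart in a kern object, and kern overloads + and * (see kern.py below), so the results compose directly. A short usage sketch (dimensions and hyperparameter values are illustrative):

import numpy as np
import GPy

k = GPy.kern.rbf(2, variance=1., lengthscale=0.5) + GPy.kern.white(2, variance=0.01)
X = np.random.randn(10, 2)
K = k.K(X)                                   # 10 x 10; rbf and white parts are summed
k2 = GPy.kern.rbf(1) * GPy.kern.Matern32(1)  # product on the same 1-d input space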

669
GPy/kern/kern.py Normal file
View file

@ -0,0 +1,669 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from ..core.parameterized import Parameterized
from parts.kernpart import Kernpart
import itertools
from parts.prod import Prod as prod
from matplotlib.transforms import offset_copy
class kern(Parameterized):
def __init__(self, input_dim, parts=[], input_slices=None):
"""
This is the main kernel class for GPy. It handles multiple (additive) kernel functions, and keeps track of various things like which parameters live where.
The technical code for kernels is divided into _parts_ (see
e.g. rbf.py). This object contains a list of parts, which are
computed additively. For multiplication, special _prod_ parts
are used.
:param input_dim: The dimensionality of the kernel's input space
:type input_dim: int
:param parts: the 'parts' (PD functions) of the kernel
:type parts: list of Kernpart objects
:param input_slices: the slices on the inputs which apply to each kernel
:type input_slices: list of slice objects, or list of bools
"""
self.parts = parts
self.Nparts = len(parts)
self.num_params = sum([p.num_params for p in self.parts])
self.input_dim = input_dim
# deal with input_slices
if input_slices is None:
self.input_slices = [slice(None) for p in self.parts]
else:
assert len(input_slices) == len(self.parts)
self.input_slices = [sl if type(sl) is slice else slice(None) for sl in input_slices]
for p in self.parts:
assert isinstance(p, Kernpart), "bad kernel part"
self.compute_param_slices()
Parameterized.__init__(self)
def getstate(self):
"""
Get the current state of the class,
here just all the indices, rest can get recomputed
"""
return Parameterized.getstate(self) + [self.parts,
self.Nparts,
self.num_params,
self.input_dim,
self.input_slices,
self.param_slices
]
def setstate(self, state):
self.param_slices = state.pop()
self.input_slices = state.pop()
self.input_dim = state.pop()
self.num_params = state.pop()
self.Nparts = state.pop()
self.parts = state.pop()
Parameterized.setstate(self, state)
def plot_ARD(self, fignum=None, ax=None, title='', legend=False):
"""If an ARD kernel is present, it bar-plots the ARD parameters,
:param fignum: figure number of the plot
:param ax: matplotlib axis to plot on
:param title:
title of the plot,
pass '' to not print a title
pass None for a generic title
"""
if ax is None:
fig = pb.figure(fignum)
ax = fig.add_subplot(111)
else:
fig = ax.figure
from GPy.util import Tango
from matplotlib.textpath import TextPath
Tango.reset()
xticklabels = []
bars = []
x0 = 0
for p in self.parts:
c = Tango.nextMedium()
if hasattr(p, 'ARD') and p.ARD:
if title is None:
ax.set_title('ARD parameters, %s kernel' % p.name)
else:
ax.set_title(title)
if p.name == 'linear':
ard_params = p.variances
else:
ard_params = 1. / p.lengthscale
x = np.arange(x0, x0 + len(ard_params))
bars.append(ax.bar(x, ard_params, align='center', color=c, edgecolor='k', linewidth=1.2, label=p.name))
xticklabels.extend([r"$\mathrm{{{name}}}\ {x}$".format(name=p.name, x=i) for i in np.arange(len(ard_params))])
x0 += len(ard_params)
x = np.arange(x0)
transOffset = offset_copy(ax.transData, fig=fig,
x=0., y= -2., units='points')
transOffsetUp = offset_copy(ax.transData, fig=fig,
x=0., y=1., units='points')
for bar in bars:
for patch, num in zip(bar.patches, np.arange(len(bar.patches))):
height = patch.get_height()
xi = patch.get_x() + patch.get_width() / 2.
va = 'top'
c = 'w'
t = TextPath((0, 0), "${xi}$".format(xi=xi), rotation=0, usetex=True, ha='center')
transform = transOffset
if patch.get_extents().height <= t.get_extents().height + 3:
va = 'bottom'
c = 'k'
transform = transOffsetUp
ax.text(xi, height, "${xi}$".format(xi=int(num)), color=c, rotation=0, ha='center', va=va, transform=transform)
# for xi, t in zip(x, xticklabels):
# ax.text(xi, maxi / 2, t, rotation=90, ha='center', va='center')
# ax.set_xticklabels(xticklabels, rotation=17)
ax.set_xticks([])
ax.set_xlim(-.5, x0 - .5)
if legend:
if title == '':
mode = 'expand'
ax.legend(bbox_to_anchor=(0., 1.02, 1., 1.02), loc=3,
ncol=len(bars), mode=mode, borderaxespad=0.)
fig.tight_layout(rect=(0, 0, 1, .9))
else:
ax.legend()
return ax
def _transform_gradients(self, g):
x = self._get_params()
[np.put(g, i, g[i] * t.gradfactor(x[i])) for i, t in zip(self.constrained_indices, self.constraints)]
[np.put(g, i, v) for i, v in [(t[0], np.sum(g[t])) for t in self.tied_indices]]
if len(self.tied_indices) or len(self.fixed_indices):
to_remove = np.hstack((self.fixed_indices + [t[1:] for t in self.tied_indices]))
return np.delete(g, to_remove)
else:
return g
def compute_param_slices(self):
"""create a set of slices that can index the parameters of each part."""
self.param_slices = []
count = 0
for p in self.parts:
self.param_slices.append(slice(count, count + p.num_params))
count += p.num_params
def __add__(self, other):
"""
Shortcut for `add`.
"""
return self.add(other)
def add(self, other, tensor=False):
"""
Add another kernel to this one. Both kernels are defined on the same _space_
:param other: the other kernel to be added
:type other: GPy.kern
"""
if tensor:
D = self.input_dim + other.input_dim
self_input_slices = [slice(*sl.indices(self.input_dim)) for sl in self.input_slices]
other_input_indices = [sl.indices(other.input_dim) for sl in other.input_slices]
other_input_slices = [slice(i[0] + self.input_dim, i[1] + self.input_dim, i[2]) for i in other_input_indices]
newkern = kern(D, self.parts + other.parts, self_input_slices + other_input_slices)
# transfer constraints:
newkern.constrained_indices = self.constrained_indices + [x + self.num_params for x in other.constrained_indices]
newkern.constraints = self.constraints + other.constraints
newkern.fixed_indices = self.fixed_indices + [self.num_params + x for x in other.fixed_indices]
newkern.fixed_values = self.fixed_values + other.fixed_values
newkern.constraints = self.constraints + other.constraints
newkern.tied_indices = self.tied_indices + [self.num_params + x for x in other.tied_indices]
else:
assert self.input_dim == other.input_dim
newkern = kern(self.input_dim, self.parts + other.parts, self.input_slices + other.input_slices)
# transfer constraints:
newkern.constrained_indices = self.constrained_indices + [i + self.num_params for i in other.constrained_indices]
newkern.constraints = self.constraints + other.constraints
newkern.fixed_indices = self.fixed_indices + [self.num_params + x for x in other.fixed_indices]
newkern.fixed_values = self.fixed_values + other.fixed_values
newkern.tied_indices = self.tied_indices + [self.num_params + x for x in other.tied_indices]
return newkern
def __mul__(self, other):
"""
Shortcut for `prod`.
"""
return self.prod(other)
def __pow__(self, other, tensor=False):
"""
Shortcut for tensor `prod`.
"""
return self.prod(other, tensor=True)
def prod(self, other, tensor=False):
"""
Multiply two kernels (either on the same input space, or on the tensor product of the input spaces).
:param other: the other kernel to be multiplied
:type other: GPy.kern
:param tensor: whether or not to use the tensor space (default is false).
:type tensor: bool
"""
K1 = self.copy()
K2 = other.copy()
slices = []
for sl1, sl2 in itertools.product(K1.input_slices, K2.input_slices):
s1, s2 = [False] * K1.input_dim, [False] * K2.input_dim
s1[sl1], s2[sl2] = [True], [True]
slices += [s1 + s2]
newkernparts = [prod(k1, k2, tensor) for k1, k2 in itertools.product(K1.parts, K2.parts)]
if tensor:
newkern = kern(K1.input_dim + K2.input_dim, newkernparts, slices)
else:
newkern = kern(K1.input_dim, newkernparts, slices)
newkern._follow_constrains(K1, K2)
return newkern
def _follow_constrains(self, K1, K2):
# Build the array that allows to go from the initial indices of the param to the new ones
K1_param = []
n = 0
for k1 in K1.parts:
K1_param += [range(n, n + k1.num_params)]
n += k1.num_params
n = 0
K2_param = []
for k2 in K2.parts:
K2_param += [range(K1.num_params + n, K1.num_params + n + k2.num_params)]
n += k2.num_params
index_param = []
for p1 in K1_param:
for p2 in K2_param:
index_param += p1 + p2
index_param = np.array(index_param)
# Get the ties and constrains of the kernels before the multiplication
prev_ties = K1.tied_indices + [arr + K1.num_params for arr in K2.tied_indices]
prev_constr_ind = [K1.constrained_indices] + [K1.num_params + i for i in K2.constrained_indices]
prev_constr = K1.constraints + K2.constraints
# prev_constr_fix = K1.fixed_indices + [arr + K1.num_params for arr in K2.fixed_indices]
# prev_constr_fix_values = K1.fixed_values + K2.fixed_values
# follow the previous ties
for arr in prev_ties:
for j in arr:
index_param[np.where(index_param == j)[0]] = arr[0]
# ties and constrains
for i in range(K1.num_params + K2.num_params):
index = np.where(index_param == i)[0]
if index.size > 1:
self.tie_params(index)
for i, t in zip(prev_constr_ind, prev_constr):
self.constrain(np.where(index_param == i)[0], t)
def _get_params(self):
return np.hstack([p._get_params() for p in self.parts])
def _set_params(self, x):
[p._set_params(x[s]) for p, s in zip(self.parts, self.param_slices)]
def _get_param_names(self):
# this is a bit nasty: we want to distinguish between parts with the same name by appending a count
part_names = np.array([k.name for k in self.parts], dtype=np.str)
counts = [np.sum(part_names == ni) for i, ni in enumerate(part_names)]
cum_counts = [np.sum(part_names[i:] == ni) for i, ni in enumerate(part_names)]
names = [name + '_' + str(cum_count) if count > 1 else name for name, count, cum_count in zip(part_names, counts, cum_counts)]
return sum([[name + '_' + n for n in k._get_param_names()] for name, k in zip(names, self.parts)], [])
def K(self, X, X2=None, which_parts='all'):
if which_parts == 'all':
which_parts = [True] * self.Nparts
assert X.shape[1] == self.input_dim
if X2 is None:
target = np.zeros((X.shape[0], X.shape[0]))
[p.K(X[:, i_s], None, target=target) for p, i_s, part_i_used in zip(self.parts, self.input_slices, which_parts) if part_i_used]
else:
target = np.zeros((X.shape[0], X2.shape[0]))
[p.K(X[:, i_s], X2[:, i_s], target=target) for p, i_s, part_i_used in zip(self.parts, self.input_slices, which_parts) if part_i_used]
return target
def dK_dtheta(self, dL_dK, X, X2=None):
"""
Compute the gradient of the covariance function with respect to the parameters.
:param dL_dK: An array of gradients of the objective function with respect to the covariance function.
:type dL_dK: Np.ndarray (num_samples x num_inducing)
:param X: Observed data inputs
:type X: np.ndarray (num_samples x input_dim)
:param X2: Observed data inputs (optional, defaults to X)
:type X2: np.ndarray (num_inducing x input_dim)
"""
assert X.shape[1] == self.input_dim
target = np.zeros(self.num_params)
if X2 is None:
[p.dK_dtheta(dL_dK, X[:, i_s], None, target[ps]) for p, i_s, ps, in zip(self.parts, self.input_slices, self.param_slices)]
else:
[p.dK_dtheta(dL_dK, X[:, i_s], X2[:, i_s], target[ps]) for p, i_s, ps, in zip(self.parts, self.input_slices, self.param_slices)]
return self._transform_gradients(target)
def dK_dX(self, dL_dK, X, X2=None):
"""Compute the gradient of the covariance function with respect to X.
:param dL_dK: An array of gradients of the objective function with respect to the covariance function.
:type dL_dK: np.ndarray (num_samples x num_inducing)
:param X: Observed data inputs
:type X: np.ndarray (num_samples x input_dim)
:param X2: Observed data inputs (optional, defaults to X)
:type X2: np.ndarray (num_inducing x input_dim)"""
target = np.zeros_like(X)
if X2 is None:
[p.dK_dX(dL_dK, X[:, i_s], None, target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
else:
[p.dK_dX(dL_dK, X[:, i_s], X2[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
return target
def Kdiag(self, X, which_parts='all'):
"""Compute the diagonal of the covariance function for inputs X."""
if which_parts == 'all':
which_parts = [True] * self.Nparts
assert X.shape[1] == self.input_dim
target = np.zeros(X.shape[0])
[p.Kdiag(X[:, i_s], target=target) for p, i_s, part_on in zip(self.parts, self.input_slices, which_parts) if part_on]
return target
def dKdiag_dtheta(self, dL_dKdiag, X):
"""Compute the gradient of the diagonal of the covariance function with respect to the parameters."""
assert X.shape[1] == self.input_dim
assert dL_dKdiag.size == X.shape[0]
target = np.zeros(self.num_params)
[p.dKdiag_dtheta(dL_dKdiag, X[:, i_s], target[ps]) for p, i_s, ps in zip(self.parts, self.input_slices, self.param_slices)]
return self._transform_gradients(target)
def dKdiag_dX(self, dL_dKdiag, X):
assert X.shape[1] == self.input_dim
target = np.zeros_like(X)
[p.dKdiag_dX(dL_dKdiag, X[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
return target
def psi0(self, Z, mu, S):
target = np.zeros(mu.shape[0])
[p.psi0(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)]
return target
def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S):
target = np.zeros(self.num_params)
[p.dpsi0_dtheta(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self.parts, self.param_slices, self.input_slices)]
return self._transform_gradients(target)
def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S):
target_mu, target_S = np.zeros_like(mu), np.zeros_like(S)
[p.dpsi0_dmuS(dL_dpsi0, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
return target_mu, target_S
def psi1(self, Z, mu, S):
target = np.zeros((mu.shape[0], Z.shape[0]))
[p.psi1(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)]
return target
def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S):
target = np.zeros((self.num_params))
[p.dpsi1_dtheta(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, ps, i_s in zip(self.parts, self.param_slices, self.input_slices)]
return self._transform_gradients(target)
def dpsi1_dZ(self, dL_dpsi1, Z, mu, S):
target = np.zeros_like(Z)
[p.dpsi1_dZ(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
return target
def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S):
"""return shapes are num_samples,num_inducing,input_dim"""
target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1]))
[p.dpsi1_dmuS(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
return target_mu, target_S
def psi2(self, Z, mu, S):
"""
Compute the psi2 statistics for the covariance function.
:param Z: np.ndarray of inducing inputs (num_inducing x input_dim)
:param mu, S: np.ndarrays of means and variances (each num_samples x input_dim)
:returns psi2: np.ndarray (num_samples,num_inducing,num_inducing)
"""
target = np.zeros((mu.shape[0], Z.shape[0], Z.shape[0]))
[p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)]
# compute the "cross" terms
# TODO: input_slices needed
crossterms = 0
for [p1, i_s1], [p2, i_s2] in itertools.combinations(zip(self.parts, self.input_slices), 2):
if i_s1 == i_s2:
# TODO psi1 this must be faster/better/precached/more nice
tmp1 = np.zeros((mu.shape[0], Z.shape[0]))
p1.psi1(Z[:, i_s1], mu[:, i_s1], S[:, i_s1], tmp1)
tmp2 = np.zeros((mu.shape[0], Z.shape[0]))
p2.psi1(Z[:, i_s2], mu[:, i_s2], S[:, i_s2], tmp2)
prod = np.multiply(tmp1, tmp2)
crossterms += prod[:, :, None] + prod[:, None, :]
# target += crossterms
return target + crossterms
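# Note on the loop above: for an additive kernel, psi2 picks up a cross term
# for every pair of parts that share an input slice; each cross term is
# assembled from the two parts' psi1 statistics as
# prod[:, :, None] + prod[:, None, :].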
def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S):
"""Gradient of the psi2 statistics with respect to the parameters."""
target = np.zeros(self.num_params)
[p.dpsi2_dtheta(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, i_s, ps in zip(self.parts, self.input_slices, self.param_slices)]
# compute the "cross" terms
# TODO: better looping, input_slices
for i1, i2 in itertools.permutations(range(len(self.parts)), 2):
p1, p2 = self.parts[i1], self.parts[i2]
# ipsl1, ipsl2 = self.input_slices[i1], self.input_slices[i2]
ps1, ps2 = self.param_slices[i1], self.param_slices[i2]
tmp = np.zeros((mu.shape[0], Z.shape[0]))
p1.psi1(Z, mu, S, tmp)
p2.dpsi1_dtheta((tmp[:, None, :] * dL_dpsi2).sum(1) * 2., Z, mu, S, target[ps2])
return self._transform_gradients(target)
def dpsi2_dZ(self, dL_dpsi2, Z, mu, S):
target = np.zeros_like(Z)
[p.dpsi2_dZ(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
# target *= 2
# compute the "cross" terms
# TODO: we need input_slices here.
for p1, p2 in itertools.permutations(self.parts, 2):
if p1.name == 'linear' and p2.name == 'linear':
raise NotImplementedError("We don't handle linear/linear cross-terms")
tmp = np.zeros((mu.shape[0], Z.shape[0]))
p1.psi1(Z, mu, S, tmp)
p2.dpsi1_dZ((tmp[:, None, :] * dL_dpsi2).sum(1), Z, mu, S, target)
return target * 2
def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S):
target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1]))
[p.dpsi2_dmuS(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
# compute the "cross" terms
# TODO: we need input_slices here.
for p1, p2 in itertools.permutations(self.parts, 2):
if p1.name == 'linear' and p2.name == 'linear':
raise NotImplementedError("We don't handle linear/linear cross-terms")
tmp = np.zeros((mu.shape[0], Z.shape[0]))
p1.psi1(Z, mu, S, tmp)
p2.dpsi1_dmuS((tmp[:, None, :] * dL_dpsi2).sum(1) * 2., Z, mu, S, target_mu, target_S)
return target_mu, target_S
def plot(self, x=None, plot_limits=None, which_parts='all', resolution=None, *args, **kwargs):
if which_parts == 'all':
which_parts = [True] * self.Nparts
if self.input_dim == 1:
if x is None:
x = np.zeros((1, 1))
else:
x = np.asarray(x)
assert x.size == 1, "The size of the fixed variable x is not 1"
x = x.reshape((1, 1))
if plot_limits is None:
xmin, xmax = (x - 5).flatten(), (x + 5).flatten()
elif len(plot_limits) == 2:
xmin, xmax = plot_limits
else:
raise ValueError, "Bad limits for plotting"
Xnew = np.linspace(xmin, xmax, resolution or 201)[:, None]
Kx = self.K(Xnew, x, which_parts)
pb.plot(Xnew, Kx, *args, **kwargs)
pb.xlim(xmin, xmax)
pb.xlabel("x")
pb.ylabel("k(x,%0.1f)" % x)
elif self.input_dim == 2:
if x is None:
x = np.zeros((1, 2))
else:
x = np.asarray(x)
assert x.size == 2, "The size of the fixed variable x is not 2"
x = x.reshape((1, 2))
if plot_limits is None:
xmin, xmax = (x - 5).flatten(), (x + 5).flatten()
elif len(plot_limits) == 2:
xmin, xmax = plot_limits
else:
raise ValueError, "Bad limits for plotting"
resolution = resolution or 51
xx, yy = np.mgrid[xmin[0]:xmax[0]:1j * resolution, xmin[1]:xmax[1]:1j * resolution]
xg = np.linspace(xmin[0], xmax[0], resolution)
yg = np.linspace(xmin[1], xmax[1], resolution)
Xnew = np.vstack((xx.flatten(), yy.flatten())).T
Kx = self.K(Xnew, x, which_parts)
Kx = Kx.reshape(resolution, resolution).T
pb.contour(xg, yg, Kx, vmin=Kx.min(), vmax=Kx.max(), cmap=pb.cm.jet, *args, **kwargs) # @UndefinedVariable
pb.xlim(xmin[0], xmax[0])
pb.ylim(xmin[1], xmax[1])
pb.xlabel("x1")
pb.ylabel("x2")
pb.title("k(x1,x2 ; %0.1f,%0.1f)" % (x[0, 0], x[0, 1]))
else:
raise NotImplementedError, "Cannot plot a kernel with more than two input dimensions"
from GPy.core.model import Model
class Kern_check_model(Model):
"""This is a dummy model class used as a base class for checking that the gradients of a given kernel are implemented correctly. It enables checkgradient() to be called independently on a kernel."""
def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
num_samples = 20
num_samples2 = 10
if kernel is None:
import GPy
kernel = GPy.kern.rbf(1)
if X is None:
X = np.random.randn(num_samples, kernel.input_dim)
if dL_dK is None:
if X2 is None:
dL_dK = np.ones((X.shape[0], X.shape[0]))
else:
dL_dK = np.ones((X.shape[0], X2.shape[0]))
self.kernel=kernel
self.X = X
self.X2 = X2
self.dL_dK = dL_dK
#self.constrained_indices=[]
#self.constraints=[]
Model.__init__(self)
def is_positive_definite(self):
v = np.linalg.eig(self.kernel.K(self.X))[0]
if any(v<0):
return False
else:
return True
def _get_params(self):
return self.kernel._get_params()
def _get_param_names(self):
return self.kernel._get_param_names()
def _set_params(self, x):
self.kernel._set_params(x)
def log_likelihood(self):
return (self.dL_dK*self.kernel.K(self.X, self.X2)).sum()
def _log_likelihood_gradients(self):
raise NotImplementedError, "This needs to be implemented to use the kern_check_model class."
class Kern_check_dK_dtheta(Kern_check_model):
"""This class allows gradient checks for the gradient of a kernel with respect to parameters. """
def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2)
def _log_likelihood_gradients(self):
return self.kernel.dK_dtheta(self.dL_dK, self.X, self.X2)
class Kern_check_dKdiag_dtheta(Kern_check_model):
"""This class allows gradient checks of the gradient of the diagonal of a kernel with respect to the parameters."""
def __init__(self, kernel=None, dL_dK=None, X=None):
Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None)
if dL_dK is None:
self.dL_dK = np.ones((self.X.shape[0]))
def log_likelihood(self):
return (self.dL_dK*self.kernel.Kdiag(self.X)).sum()
def _log_likelihood_gradients(self):
return self.kernel.dKdiag_dtheta(self.dL_dK, self.X)
class Kern_check_dK_dX(Kern_check_model):
"""This class allows gradient checks for the gradient of a kernel with respect to X. """
def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=X2)
def _log_likelihood_gradients(self):
return self.kernel.dK_dX(self.dL_dK, self.X, self.X2).flatten()
def _get_param_names(self):
return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
def _get_params(self):
return self.X.flatten()
def _set_params(self, x):
self.X=x.reshape(self.X.shape)
class Kern_check_dKdiag_dX(Kern_check_model):
"""This class allows gradient checks for the gradient of a kernel diagonal with respect to X. """
def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None)
if dL_dK is None:
self.dL_dK = np.ones((self.X.shape[0]))
def log_likelihood(self):
return (self.dL_dK*self.kernel.Kdiag(self.X)).sum()
def _log_likelihood_gradients(self):
return self.kernel.dKdiag_dX(self.dL_dK, self.X).flatten()
def _get_param_names(self):
return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
def _get_params(self):
return self.X.flatten()
def _set_params(self, x):
self.X=x.reshape(self.X.shape)
def kern_test(kern, X=None, X2=None, verbose=False):
"""This function runs on kernels to check the correctness of their implementation. It checks that the covariance function is positive definite for a randomly generated data set.
:param kern: the kernel to be tested.
:type kern: GPy.kern.Kernpart
:param X: X input values to test the covariance function.
:type X: ndarray
:param X2: X2 input values to test the covariance function.
:type X2: ndarray
"""
if X is None:
X = np.random.randn(10, kern.input_dim)
if X2 is None:
X2 = np.random.randn(20, kern.input_dim)
result = [Kern_check_model(kern, X=X).is_positive_definite(),
Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=verbose),
Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=verbose),
Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=verbose),
Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=verbose),
Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=verbose)]
# Need to check
#Kern_check_dK_dX(kern, X, X2=None).checkgrad(verbose=verbose)]
# but currently I think these aren't implemented.
return np.all(result)
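A usage sketch for the checking utilities above (the kernel and the input shapes are illustrative):

import numpy as np
import GPy
from GPy.kern.kern import kern_test

k = GPy.kern.rbf(3, ARD=True) + GPy.kern.bias(3)
X = np.random.randn(10, 3)
ok = kern_test(k, X=X, verbose=True)  # True when the PD and gradient checks all pass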

65
GPy/kern/parts/Brownian.py Normal file
View file

@ -0,0 +1,65 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
def theta(x):
"""Heavisdie step function"""
return np.where(x>=0.,1.,0.)
class Brownian(Kernpart):
"""
Brownian Motion kernel.
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
"""
def __init__(self,input_dim,variance=1.):
self.input_dim = input_dim
assert self.input_dim==1, "Brownian motion in 1D only"
self.num_params = 1
self.name = 'Brownian'
self._set_params(np.array([variance]).flatten())
def _get_params(self):
return self.variance
def _set_params(self,x):
assert x.shape==(1,)
self.variance = x
def _get_param_names(self):
return ['variance']
def K(self,X,X2,target):
if X2 is None:
X2 = X
target += self.variance*np.fmin(X,X2.T)
def Kdiag(self,X,target):
target += self.variance*X.flatten()
def dK_dtheta(self,dL_dK,X,X2,target):
if X2 is None:
X2 = X
target += np.sum(np.fmin(X,X2.T)*dL_dK)
def dKdiag_dtheta(self,dL_dKdiag,X,target):
target += np.dot(X.flatten(), dL_dKdiag)
def dK_dX(self,dL_dK,X,X2,target):
raise NotImplementedError, "TODO"
#target += self.variance
#target -= self.variance*theta(X-X2.T)
#if X.shape==X2.shape:
#if np.all(X==X2):
#np.add(target[:,:,0],self.variance*np.diag(X2.flatten()-X.flatten()),target[:,:,0])
def dKdiag_dX(self,dL_dKdiag,X,target):
target += self.variance*dL_dKdiag[:,None]
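The part above implements the Brownian-motion covariance cov(W_s, W_t) = sigma^2 min(s, t). A direct check of the same computation (values made up):

import numpy as np

t = np.array([[0.5], [1.0], [2.0]])
K = 1.0 * np.fmin(t, t.T)  # as in K() above, with variance 1
# K[0, 1] == 0.5 and K[2, 2] == 2.0: the familiar min(s, t) structure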

135
GPy/kern/parts/Matern32.py Normal file
View file

@ -0,0 +1,135 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
from scipy import integrate
class Matern32(Kernpart):
"""
Matern 3/2 kernel:
.. math::
k(r) = \sigma^2 (1 + \sqrt{3} r) \exp(-\sqrt{3} r) \quad \text{where} \quad r = \sqrt{\sum_{i=1}^{\text{input\_dim}} \frac{(x_i - y_i)^2}{\ell_i^2}}
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance :math:`\sigma^2`
:type variance: float
:param lengthscale: the vector of lengthscale :math:`\ell_i`
:type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
:param ARD: Auto Relevance Determination. If False, the kernel is isotropic (i.e. a single lengthscale parameter :math:`\ell`); otherwise there is one lengthscale parameter per dimension.
:type ARD: Boolean
:rtype: kernel object
"""
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False):
self.input_dim = input_dim
self.ARD = ARD
if not ARD:
self.num_params = 2
self.name = 'Mat32'
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
else:
lengthscale = np.ones(1)
else:
self.num_params = self.input_dim + 1
self.name = 'Mat32'
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == self.input_dim, "bad number of lengthscales"
else:
lengthscale = np.ones(self.input_dim)
self._set_params(np.hstack((variance, lengthscale.flatten())))
def _get_params(self):
"""return the value of the parameters."""
return np.hstack((self.variance, self.lengthscale))
def _set_params(self, x):
"""set the value of the parameters."""
assert x.size == self.num_params
self.variance = x[0]
self.lengthscale = x[1:]
def _get_param_names(self):
"""return parameter names."""
if self.num_params == 2:
return ['variance', 'lengthscale']
else:
return ['variance'] + ['lengthscale_%i' % i for i in range(self.lengthscale.size)]
def K(self, X, X2, target):
"""Compute the covariance matrix between X and X2."""
if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))
np.add(self.variance * (1 + np.sqrt(3.) * dist) * np.exp(-np.sqrt(3.) * dist), target, target)
def Kdiag(self, X, target):
"""Compute the diagonal of the covariance matrix associated to X."""
np.add(target, self.variance, target)
def dK_dtheta(self, dL_dK, X, X2, target):
"""derivative of the covariance matrix with respect to the parameters."""
if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))
dvar = (1 + np.sqrt(3.) * dist) * np.exp(-np.sqrt(3.) * dist)
invdist = 1. / np.where(dist != 0., dist, np.inf)
dist2M = np.square(X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 3
target[0] += np.sum(dvar * dL_dK)
if self.ARD:
dl = (self.variance * 3 * dist * np.exp(-np.sqrt(3.) * dist))[:, :, np.newaxis] * dist2M * invdist[:, :, np.newaxis]
target[1:] += (dl * dL_dK[:, :, None]).sum(0).sum(0)
else:
dl = (self.variance * 3 * dist * np.exp(-np.sqrt(3.) * dist)) * dist2M.sum(-1) * invdist
target[1] += np.sum(dl * dL_dK)
def dKdiag_dtheta(self, dL_dKdiag, X, target):
"""derivative of the diagonal of the covariance matrix with respect to the parameters."""
target[0] += np.sum(dL_dKdiag)
def dK_dX(self, dL_dK, X, X2, target):
"""derivative of the covariance matrix with respect to X."""
if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None]
ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
dK_dX = -np.transpose(3 * self.variance * dist * np.exp(-np.sqrt(3) * dist) * ddist_dX, (1, 0, 2))
target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
def dKdiag_dX(self, dL_dKdiag, X, target):
pass
def Gram_matrix(self, F, F1, F2, lower, upper):
"""
Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to input_dim=1.
:param F: vector of functions
:type F: np.array
:param F1: vector of derivatives of F
:type F1: np.array
:param F2: vector of second derivatives of F
:type F2: np.array
:param lower,upper: boundaries of the input domain
:type lower,upper: floats
"""
assert self.input_dim == 1
def L(x, i):
return(3. / self.lengthscale ** 2 * F[i](x) + 2 * np.sqrt(3) / self.lengthscale * F1[i](x) + F2[i](x))
n = F.shape[0]
G = np.zeros((n, n))
for i in range(n):
for j in range(i, n):
G[i, j] = G[j, i] = integrate.quad(lambda x : L(x, i) * L(x, j), lower, upper)[0]
Flower = np.array([f(lower) for f in F])[:, None]
F1lower = np.array([f(lower) for f in F1])[:, None]
# print "OLD \n", np.dot(F1lower,F1lower.T), "\n \n"
# return(G)
return(self.lengthscale ** 3 / (12.*np.sqrt(3) * self.variance) * G + 1. / self.variance * np.dot(Flower, Flower.T) + self.lengthscale ** 2 / (3.*self.variance) * np.dot(F1lower, F1lower.T))
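For reference, the closed form evaluated by K() above, written out for a single pair of points (a sketch; all values made up):

import numpy as np

variance, lengthscale = 2.0, 0.7
x, y = np.array([0.3, 1.0]), np.array([1.1, 0.2])
r = np.sqrt(np.sum(np.square((x - y) / lengthscale)))
k_xy = variance * (1 + np.sqrt(3.) * r) * np.exp(-np.sqrt(3.) * r)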

142
GPy/kern/parts/Matern52.py Normal file
View file

@ -0,0 +1,142 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
import hashlib
from scipy import integrate
class Matern52(Kernpart):
"""
Matern 5/2 kernel:
.. math::
k(r) = \sigma^2 \left(1 + \sqrt{5} r + \frac{5}{3} r^2\right) \exp(-\sqrt{5} r) \quad \text{where} \quad r = \sqrt{\sum_{i=1}^{\text{input\_dim}} \frac{(x_i - y_i)^2}{\ell_i^2}}
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance :math:`\sigma^2`
:type variance: float
:param lengthscale: the vector of lengthscale :math:`\ell_i`
:type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
:param ARD: Auto Relevance Determination. If False, the kernel is isotropic (i.e. a single lengthscale parameter :math:`\ell`); otherwise there is one lengthscale parameter per dimension.
:type ARD: Boolean
:rtype: kernel object
"""
def __init__(self,input_dim,variance=1.,lengthscale=None,ARD=False):
self.input_dim = input_dim
self.ARD = ARD
if not ARD:
self.num_params = 2
self.name = 'Mat52'
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
else:
lengthscale = np.ones(1)
else:
self.num_params = self.input_dim + 1
self.name = 'Mat52'
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == self.input_dim, "bad number of lengthscales"
else:
lengthscale = np.ones(self.input_dim)
self._set_params(np.hstack((variance,lengthscale.flatten())))
def _get_params(self):
"""return the value of the parameters."""
return np.hstack((self.variance,self.lengthscale))
def _set_params(self,x):
"""set the value of the parameters."""
assert x.size == self.num_params
self.variance = x[0]
self.lengthscale = x[1:]
def _get_param_names(self):
"""return parameter names."""
if self.num_params == 2:
return ['variance','lengthscale']
else:
return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscale.size)]
def K(self,X,X2,target):
"""Compute the covariance matrix between X and X2."""
if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
np.add(self.variance*(1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist), target,target)
def Kdiag(self,X,target):
"""Compute the diagonal of the covariance matrix associated to X."""
np.add(target,self.variance,target)
def dK_dtheta(self,dL_dK,X,X2,target):
"""derivative of the covariance matrix with respect to the parameters."""
if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))
invdist = 1./np.where(dist!=0.,dist,np.inf)
dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscale**3
dvar = (1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist)
target[0] += np.sum(dvar*dL_dK)
if self.ARD:
dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M * invdist[:,:,np.newaxis]
target[1:] += (dl*dL_dK[:,:,None]).sum(0).sum(0)
else:
dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist) * np.exp(-np.sqrt(5.)*dist)) * dist2M.sum(-1) * invdist
target[1] += np.sum(dl*dL_dK)
def dKdiag_dtheta(self,dL_dKdiag,X,target):
"""derivative of the diagonal of the covariance matrix with respect to the parameters."""
target[0] += np.sum(dL_dKdiag)
def dK_dX(self,dL_dK,X,X2,target):
"""derivative of the covariance matrix with respect to X."""
if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None]
ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
dK_dX = - np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2))
target += np.sum(dK_dX*dL_dK.T[:,:,None],0)
def dKdiag_dX(self,dL_dKdiag,X,target):
pass
def Gram_matrix(self,F,F1,F2,F3,lower,upper):
"""
Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to input_dim=1.
:param F: vector of functions
:type F: np.array
:param F1: vector of derivatives of F
:type F1: np.array
:param F2: vector of second derivatives of F
:type F2: np.array
:param F3: vector of third derivatives of F
:type F3: np.array
:param lower,upper: boundaries of the input domain
:type lower,upper: floats
"""
assert self.input_dim == 1
def L(x,i):
return(5*np.sqrt(5)/self.lengthscale**3*F[i](x) + 15./self.lengthscale**2*F1[i](x)+ 3*np.sqrt(5)/self.lengthscale*F2[i](x) + F3[i](x))
n = F.shape[0]
G = np.zeros((n,n))
for i in range(n):
for j in range(i,n):
G[i,j] = G[j,i] = integrate.quad(lambda x : L(x,i)*L(x,j),lower,upper)[0]
G_coef = 3.*self.lengthscale**5/(400*np.sqrt(5))
Flower = np.array([f(lower) for f in F])[:,None]
F1lower = np.array([f(lower) for f in F1])[:,None]
F2lower = np.array([f(lower) for f in F2])[:,None]
orig = 9./8*np.dot(Flower,Flower.T) + 9.*self.lengthscale**4/200*np.dot(F2lower,F2lower.T)
orig2 = 3./5*self.lengthscale**2 * ( np.dot(F1lower,F1lower.T) + 1./8*np.dot(Flower,F2lower.T) + 1./8*np.dot(F2lower,Flower.T))
return(1./self.variance* (G_coef*G + orig + orig2))
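
A minimal sketch (not part of the commit; it assumes GPy is importable and input_dim=1) checking that Matern52.K fills a caller-supplied target with the closed-form Matern 5/2 expression:

import numpy as np
from GPy.kern.parts.Matern52 import Matern52

X = np.linspace(0., 1., 4)[:, None]
k = Matern52(input_dim=1, variance=2.0, lengthscale=0.5)
K = np.zeros((4, 4))
k.K(X, None, K)                      # parts accumulate into target in place

r = np.abs(X - X.T) / 0.5            # scaled distances
K_direct = 2.0 * (1. + np.sqrt(5.) * r + 5. / 3. * r ** 2) * np.exp(-np.sqrt(5.) * r)
assert np.allclose(K, K_direct)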

26
GPy/kern/parts/__init__.py Normal file
View file

@ -0,0 +1,26 @@
import bias
import Brownian
import coregionalise
import exponential
import finite_dimensional
import fixed
import gibbs
import independent_outputs
import linear
import Matern32
import Matern52
import mlp
import periodic_exponential
import periodic_Matern32
import periodic_Matern52
import poly
import prod_orthogonal
import prod
import rational_quadratic
import rbfcos
import rbf
import spline
import symmetric
import white
import hierarchical
import rbf_inv

89
GPy/kern/parts/bias.py Normal file
View file

@ -0,0 +1,89 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
import hashlib
class Bias(Kernpart):
def __init__(self,input_dim,variance=1.):
"""
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
"""
self.input_dim = input_dim
self.num_params = 1
self.name = 'bias'
self._set_params(np.array([variance]).flatten())
def _get_params(self):
return self.variance
def _set_params(self,x):
assert x.shape==(1,)
self.variance = x
def _get_param_names(self):
return ['variance']
def K(self,X,X2,target):
target += self.variance
def Kdiag(self,X,target):
target += self.variance
    def dK_dtheta(self,dL_dK,X,X2,target):
        target += dL_dK.sum()
def dKdiag_dtheta(self,dL_dKdiag,X,target):
target += dL_dKdiag.sum()
def dK_dX(self, dL_dK,X, X2, target):
pass
def dKdiag_dX(self,dL_dKdiag,X,target):
pass
#---------------------------------------#
# PSI statistics #
#---------------------------------------#
def psi0(self, Z, mu, S, target):
target += self.variance
def psi1(self, Z, mu, S, target):
self._psi1 = self.variance
target += self._psi1
def psi2(self, Z, mu, S, target):
target += self.variance**2
def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S, target):
target += dL_dpsi0.sum()
def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target):
target += dL_dpsi1.sum()
def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target):
target += 2.*self.variance*dL_dpsi2.sum()
def dpsi0_dZ(self, dL_dpsi0, Z, mu, S, target):
pass
def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S, target_mu, target_S):
pass
def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target):
pass
def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S):
pass
def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target):
pass
def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
pass
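
Because every part adds into a shared target buffer, a sum of kernels is just repeated in-place calls. A small sketch (not part of the commit; assumes GPy is importable) combining Bias with the Matern 5/2 part above:

import numpy as np
from GPy.kern.parts.bias import Bias
from GPy.kern.parts.Matern52 import Matern52

X = np.random.randn(5, 1)
target = np.zeros((5, 5))
Bias(1, variance=0.3).K(X, None, target)   # adds a constant 0.3 everywhere
Matern52(1).K(X, None, target)             # adds the Matern term on top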

160
GPy/kern/parts/coregionalise.py Normal file
View file

@ -0,0 +1,160 @@
# Copyright (c) 2012, James Hensman and Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
from GPy.util.linalg import mdot, pdinv
import pdb
from scipy import weave
class Coregionalise(Kernpart):
"""
Coregionalisation kernel.
Used for computing covariance functions of the form
.. math::
k_2(x, y)=B k(x, y)
where
.. math::
B = WW^\top + diag(kappa)
:param output_dim: the number of output dimensions
:type output_dim: int
:param rank: the rank of the coregionalisation matrix.
:type rank: int
:param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B.
:type W: ndarray
:param kappa: a diagonal term which allows the outputs to behave independently.
:rtype: kernel object
.. Note: see coregionalisation examples in GPy.examples.regression for some usage.
"""
def __init__(self,output_dim,rank=1, W=None, kappa=None):
self.input_dim = 1
self.name = 'coregion'
self.output_dim = output_dim
self.rank = rank
if W is None:
self.W = 0.5*np.random.randn(self.output_dim,self.rank)/np.sqrt(self.rank)
else:
assert W.shape==(self.output_dim,self.rank)
self.W = W
if kappa is None:
kappa = 0.5*np.ones(self.output_dim)
else:
assert kappa.shape==(self.output_dim,)
self.kappa = kappa
self.num_params = self.output_dim*(self.rank + 1)
self._set_params(np.hstack([self.W.flatten(),self.kappa]))
def _get_params(self):
return np.hstack([self.W.flatten(),self.kappa])
def _set_params(self,x):
assert x.size == self.num_params
self.kappa = x[-self.output_dim:]
self.W = x[:-self.output_dim].reshape(self.output_dim,self.rank)
self.B = np.dot(self.W,self.W.T) + np.diag(self.kappa)
def _get_param_names(self):
return sum([['W%i_%i'%(i,j) for j in range(self.rank)] for i in range(self.output_dim)],[]) + ['kappa_%i'%i for i in range(self.output_dim)]
def K(self,index,index2,target):
index = np.asarray(index,dtype=np.int)
#here's the old code (numpy)
#if index2 is None:
#index2 = index
#else:
#index2 = np.asarray(index2,dtype=np.int)
#false_target = target.copy()
#ii,jj = np.meshgrid(index,index2)
#ii,jj = ii.T, jj.T
#false_target += self.B[ii,jj]
if index2 is None:
code="""
for(int i=0;i<N; i++){
target[i+i*N] += B[index[i]+output_dim*index[i]];
for(int j=0; j<i; j++){
target[j+i*N] += B[index[i]+output_dim*index[j]];
target[i+j*N] += target[j+i*N];
}
}
"""
N,B,output_dim = index.size, self.B, self.output_dim
weave.inline(code,['target','index','N','B','output_dim'])
else:
index2 = np.asarray(index2,dtype=np.int)
code="""
for(int i=0;i<num_inducing; i++){
for(int j=0; j<N; j++){
target[i+j*num_inducing] += B[output_dim*index[j]+index2[i]];
}
}
"""
N,num_inducing,B,output_dim = index.size,index2.size, self.B, self.output_dim
weave.inline(code,['target','index','index2','N','num_inducing','B','output_dim'])
def Kdiag(self,index,target):
target += np.diag(self.B)[np.asarray(index,dtype=np.int).flatten()]
def dK_dtheta(self,dL_dK,index,index2,target):
index = np.asarray(index,dtype=np.int)
dL_dK_small = np.zeros_like(self.B)
if index2 is None:
index2 = index
else:
index2 = np.asarray(index2,dtype=np.int)
code="""
for(int i=0; i<num_inducing; i++){
for(int j=0; j<N; j++){
dL_dK_small[index[j] + output_dim*index2[i]] += dL_dK[i+j*num_inducing];
}
}
"""
N, num_inducing, output_dim = index.size, index2.size, self.output_dim
weave.inline(code, ['N','num_inducing','output_dim','dL_dK','dL_dK_small','index','index2'])
dkappa = np.diag(dL_dK_small)
dL_dK_small += dL_dK_small.T
dW = (self.W[:,None,:]*dL_dK_small[:,:,None]).sum(0)
target += np.hstack([dW.flatten(),dkappa])
def dK_dtheta_old(self,dL_dK,index,index2,target):
if index2 is None:
index2 = index
else:
index2 = np.asarray(index2,dtype=np.int)
ii,jj = np.meshgrid(index,index2)
ii,jj = ii.T, jj.T
dL_dK_small = np.zeros_like(self.B)
for i in range(self.output_dim):
for j in range(self.output_dim):
tmp = np.sum(dL_dK[(ii==i)*(jj==j)])
dL_dK_small[i,j] = tmp
dkappa = np.diag(dL_dK_small)
dL_dK_small += dL_dK_small.T
dW = (self.W[:,None,:]*dL_dK_small[:,:,None]).sum(0)
target += np.hstack([dW.flatten(),dkappa])
def dKdiag_dtheta(self,dL_dKdiag,index,target):
index = np.asarray(index,dtype=np.int).flatten()
dL_dKdiag_small = np.zeros(self.output_dim)
for i in range(self.output_dim):
dL_dKdiag_small[i] += np.sum(dL_dKdiag[index==i])
dW = 2.*self.W*dL_dKdiag_small[:,None]
dkappa = dL_dKdiag_small
target += np.hstack([dW.flatten(),dkappa])
def dK_dX(self,dL_dK,X,X2,target):
pass
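
The weave loops above are just a fast lookup into B. A pure-numpy equivalent (a sketch for illustration, not the code path the class uses) makes the coregionalisation structure explicit:

import numpy as np

output_dim, rank = 3, 1
W = np.random.randn(output_dim, rank)
kappa = 0.5 * np.ones(output_dim)
B = np.dot(W, W.T) + np.diag(kappa)        # B = WW^T + diag(kappa)

index = np.array([0, 0, 1, 2])             # output index of each datum
K = B[index[:, None], index[None, :]]      # equivalent of Coregionalise.K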

127
GPy/kern/parts/exponential.py Normal file
View file

@ -0,0 +1,127 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
from scipy import integrate
class Exponential(Kernpart):
"""
Exponential kernel (aka Ornstein-Uhlenbeck or Matern 1/2)
.. math::
k(r) = \sigma^2 \exp(- r) \ \ \ \ \ \\text{ where } r = \sqrt{\sum_{i=1}^{\\text{input\_dim}} \\frac{(x_i-y_i)^2}{\ell_i^2} }
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance :math:`\sigma^2`
:type variance: float
:param lengthscale: the vector of lengthscale :math:`\ell_i`
:type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (i.e. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
:type ARD: Boolean
:rtype: kernel object
"""
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False):
self.input_dim = input_dim
self.ARD = ARD
if ARD == False:
self.num_params = 2
self.name = 'exp'
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
else:
lengthscale = np.ones(1)
else:
self.num_params = self.input_dim + 1
self.name = 'exp'
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == self.input_dim, "bad number of lengthscales"
else:
lengthscale = np.ones(self.input_dim)
self._set_params(np.hstack((variance, lengthscale.flatten())))
def _get_params(self):
"""return the value of the parameters."""
return np.hstack((self.variance, self.lengthscale))
def _set_params(self, x):
"""set the value of the parameters."""
assert x.size == self.num_params
self.variance = x[0]
self.lengthscale = x[1:]
def _get_param_names(self):
"""return parameter names."""
if self.num_params == 2:
return ['variance', 'lengthscale']
else:
return ['variance'] + ['lengthscale_%i' % i for i in range(self.lengthscale.size)]
def K(self, X, X2, target):
"""Compute the covariance matrix between X and X2."""
if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))
np.add(self.variance * np.exp(-dist), target, target)
def Kdiag(self, X, target):
"""Compute the diagonal of the covariance matrix associated to X."""
np.add(target, self.variance, target)
def dK_dtheta(self, dL_dK, X, X2, target):
"""derivative of the covariance matrix with respect to the parameters."""
if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))
invdist = 1. / np.where(dist != 0., dist, np.inf)
dist2M = np.square(X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 3
dvar = np.exp(-dist)
target[0] += np.sum(dvar * dL_dK)
if self.ARD == True:
dl = self.variance * dvar[:, :, None] * dist2M * invdist[:, :, None]
target[1:] += (dl * dL_dK[:, :, None]).sum(0).sum(0)
else:
dl = self.variance * dvar * dist2M.sum(-1) * invdist
target[1] += np.sum(dl * dL_dK)
def dKdiag_dtheta(self, dL_dKdiag, X, target):
"""derivative of the diagonal of the covariance matrix with respect to the parameters."""
# NB: derivative of diagonal elements wrt lengthscale is 0
target[0] += np.sum(dL_dKdiag)
def dK_dX(self, dL_dK, X, X2, target):
"""derivative of the covariance matrix with respect to X."""
if X2 is None: X2 = X
dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None]
ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
dK_dX = -np.transpose(self.variance * np.exp(-dist) * ddist_dX, (1, 0, 2))
target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
def dKdiag_dX(self, dL_dKdiag, X, target):
pass
def Gram_matrix(self, F, F1, lower, upper):
"""
Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to input_dim=1.
:param F: vector of functions
:type F: np.array
:param F1: vector of derivatives of F
:type F1: np.array
:param lower,upper: boundaries of the input domain
:type lower,upper: floats
"""
assert self.input_dim == 1
def L(x, i):
return(1. / self.lengthscale * F[i](x) + F1[i](x))
n = F.shape[0]
G = np.zeros((n, n))
for i in range(n):
for j in range(i, n):
G[i, j] = G[j, i] = integrate.quad(lambda x : L(x, i) * L(x, j), lower, upper)[0]
Flower = np.array([f(lower) for f in F])[:, None]
return(self.lengthscale / 2. / self.variance * G + 1. / self.variance * np.dot(Flower, Flower.T))
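
A short sketch of Gram_matrix with a hand-picked two-function basis on [0, 1] (not part of the commit; assumes GPy is importable with a numpy/scipy of this era):

import numpy as np
from GPy.kern.parts.exponential import Exponential

F = np.array([np.sin, np.cos])                      # basis functions
F1 = np.array([np.cos, lambda x: -np.sin(x)])       # their derivatives
k = Exponential(input_dim=1, variance=1., lengthscale=0.5)
G = k.Gram_matrix(F, F1, lower=0., upper=1.)        # 2x2 RKHS Gram matrix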

74
GPy/kern/parts/finite_dimensional.py Normal file
View file

@ -0,0 +1,74 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
from ...util.linalg import pdinv,mdot
class FiniteDimensional(Kernpart):
def __init__(self, input_dim, F, G, variance=1., weights=None):
"""
        Arguments
        ---------
input_dim: int - the number of input dimensions
F: np.array of functions with shape (n,) - the n basis functions
G: np.array with shape (n,n) - the Gram matrix associated to F
weights : np.ndarray with shape (n,)
"""
self.input_dim = input_dim
self.F = F
self.G = G
self.G_1 ,L,Li,logdet = pdinv(G)
self.n = F.shape[0]
if weights is not None:
assert weights.shape==(self.n,)
else:
weights = np.ones(self.n)
self.num_params = self.n + 1
self.name = 'finite_dim'
self._set_params(np.hstack((variance,weights)))
def _get_params(self):
return np.hstack((self.variance,self.weights))
def _set_params(self,x):
assert x.size == (self.num_params)
self.variance = x[0]
self.weights = x[1:]
def _get_param_names(self):
if self.n==1:
return ['variance','weight']
else:
return ['variance']+['weight_%i'%i for i in range(self.weights.size)]
def K(self,X,X2,target):
if X2 is None: X2 = X
FX = np.column_stack([f(X) for f in self.F])
FX2 = np.column_stack([f(X2) for f in self.F])
product = self.variance * mdot(FX,np.diag(np.sqrt(self.weights)),self.G_1,np.diag(np.sqrt(self.weights)),FX2.T)
np.add(product,target,target)
    def Kdiag(self,X,target):
        tmp = np.zeros((X.shape[0],X.shape[0]))
        self.K(X,None,tmp)
        np.add(target,np.diag(tmp),target)
def dK_dtheta(self,X,X2,target):
"""Return shape is NxMx(Ntheta)"""
if X2 is None: X2 = X
FX = np.column_stack([f(X) for f in self.F])
FX2 = np.column_stack([f(X2) for f in self.F])
DER = np.zeros((self.n,self.n,self.n))
for i in range(self.n):
DER[i,i,i] = np.sqrt(self.weights[i])
dw = self.variance * mdot(FX,DER,self.G_1,np.diag(np.sqrt(self.weights)),FX2.T)
dv = mdot(FX,np.diag(np.sqrt(self.weights)),self.G_1,np.diag(np.sqrt(self.weights)),FX2.T)
        np.add(target[:,:,0],dv,target[:,:,0])
np.add(target[:,:,1:],np.transpose(dw,(0,2,1)), target[:,:,1:])
def dKdiag_dtheta(self,X,target):
np.add(target[:,0],1.,target[:,0])
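
A sketch instantiating the part with a constant-plus-linear basis and the identity as its Gram matrix (illustrative values only; assumes GPy is importable):

import numpy as np
from GPy.kern.parts.finite_dimensional import FiniteDimensional

F = np.array([lambda x: np.ones_like(x[:, 0]), lambda x: x[:, 0]])
G = np.eye(2)                        # Gram matrix of the two basis functions
k = FiniteDimensional(input_dim=1, F=F, G=G)

X = np.linspace(0., 1., 3)[:, None]
target = np.zeros((3, 3))
k.K(X, None, target)                 # variance * FX W^{1/2} G^{-1} W^{1/2} FX^T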

41
GPy/kern/parts/fixed.py Normal file
View file

@ -0,0 +1,41 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
class Fixed(Kernpart):
def __init__(self, input_dim, K, variance=1.):
"""
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
"""
self.input_dim = input_dim
self.fixed_K = K
self.num_params = 1
self.name = 'fixed'
self._set_params(np.array([variance]).flatten())
def _get_params(self):
return self.variance
def _set_params(self, x):
assert x.shape == (1,)
self.variance = x
def _get_param_names(self):
return ['variance']
def K(self, X, X2, target):
target += self.variance * self.fixed_K
def dK_dtheta(self, partial, X, X2, target):
target += (partial * self.fixed_K).sum()
def dK_dX(self, partial, X, X2, target):
pass
def dKdiag_dX(self, partial, X, target):
pass

135
GPy/kern/parts/gibbs.py Normal file
View file

@ -0,0 +1,135 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
from ...util.linalg import tdot
from ...core.mapping import Mapping
import GPy
class Gibbs(Kernpart):
"""
Gibbs and MacKay non-stationary covariance function.
.. math::
        r^2 = (x_i - x_j)^\top (x_i - x_j)
        k(x_i, x_j) = \sigma^2 Z \exp\left(-\frac{r^2}{l(x_i)^2 + l(x_j)^2}\right)
        Z = \left(\frac{2\, l(x_i)\, l(x_j)}{l(x_i)^2 + l(x_j)^2}\right)^{q/2}
where :math:`l(x)` is a function giving the length scale as a function of space and :math:`q` is the dimensionality of the input space.
This is the non stationary kernel proposed by Mark Gibbs in his 1997
thesis. It is similar to an RBF but has a length scale that varies
with input location. This leads to an additional term in front of
the kernel.
    The parameters are :math:`\sigma^2`, the process variance, and the parameters of l(x), which is a function that can be specified by the user; by default a multi-layer perceptron is used.
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance :math:`\sigma^2`
:type variance: float
:param mapping: the mapping that gives the lengthscale across the input space (by default GPy.mappings.MLP is used with 20 hidden nodes).
:type mapping: GPy.core.Mapping
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (i.e. one weight variance parameter \sigma^2_w), otherwise there is one weight variance parameter per dimension.
:type ARD: Boolean
:rtype: Kernpart object
"""
def __init__(self, input_dim, variance=1., mapping=None, ARD=False):
self.input_dim = input_dim
self.ARD = ARD
if not mapping:
mapping = GPy.mappings.MLP(output_dim=1, hidden_dim=20, input_dim=input_dim)
if not ARD:
self.num_params=1+mapping.num_params
else:
raise NotImplementedError
self.mapping = mapping
self.name='gibbs'
self._set_params(np.hstack((variance, self.mapping._get_params())))
def _get_params(self):
return np.hstack((self.variance, self.mapping._get_params()))
def _set_params(self, x):
assert x.size == (self.num_params)
self.variance = x[0]
self.mapping._set_params(x[1:])
def _get_param_names(self):
return ['variance'] + self.mapping._get_param_names()
def K(self, X, X2, target):
"""Return covariance between X and X2."""
self._K_computations(X, X2)
target += self.variance*self._K_dvar
def Kdiag(self, X, target):
"""Compute the diagonal of the covariance matrix for X."""
np.add(target, self.variance, target)
def dK_dtheta(self, dL_dK, X, X2, target):
"""Derivative of the covariance with respect to the parameters."""
self._K_computations(X, X2)
self._dK_computations(dL_dK)
        if X2 is None:
gmapping = self.mapping.df_dtheta(2*self._dL_dl[:, None], X)
else:
gmapping = self.mapping.df_dtheta(self._dL_dl[:, None], X)
gmapping += self.mapping.df_dtheta(self._dL_dl_two[:, None], X2)
target+= np.hstack([(dL_dK*self._K_dvar).sum(), gmapping])
def dK_dX(self, dL_dK, X, X2, target):
"""Derivative of the covariance matrix with respect to X."""
# First account for gradients arising from presence of X in exponent.
self._K_computations(X, X2)
_K_dist = X[:, None, :] - X2[None, :, :]
dK_dX = (-2.*self.variance)*np.transpose((self._K_dvar/self._w2)[:, :, None]*_K_dist, (1, 0, 2))
target += np.sum(dK_dX*dL_dK.T[:, :, None], 0)
# Now account for gradients arising from presence of X in lengthscale.
self._dK_computations(dL_dK)
target += self.mapping.df_dX(self._dL_dl[:, None], X)
def dKdiag_dX(self, dL_dKdiag, X, target):
"""Gradient of diagonal of covariance with respect to X."""
pass
def dKdiag_dtheta(self, dL_dKdiag, X, target):
"""Gradient of diagonal of covariance with respect to parameters."""
pass
def _K_computations(self, X, X2=None):
"""Pre-computations for the covariance function (used both when computing the covariance and its gradients). Here self._dK_dvar and self._K_dist2 are updated."""
self._lengthscales=self.mapping.f(X)
self._lengthscales2=np.square(self._lengthscales)
        if X2 is None:
self._lengthscales_two = self._lengthscales
self._lengthscales_two2 = self._lengthscales2
Xsquare = np.square(X).sum(1)
self._K_dist2 = -2.*tdot(X) + Xsquare[:, None] + Xsquare[None, :]
else:
self._lengthscales_two = self.mapping.f(X2)
self._lengthscales_two2 = np.square(self._lengthscales_two)
self._K_dist2 = -2.*np.dot(X, X2.T) + np.square(X).sum(1)[:, None] + np.square(X2).sum(1)[None, :]
self._w2 = self._lengthscales2 + self._lengthscales_two2.T
prod_length = self._lengthscales*self._lengthscales_two.T
self._K_exponential = np.exp(-self._K_dist2/self._w2)
self._K_dvar = np.sign(prod_length)*(2*np.abs(prod_length)/self._w2)**(self.input_dim/2.)*np.exp(-self._K_dist2/self._w2)
    def _dK_computations(self, dL_dK):
        """Pre-computations for the gradients of the covariance function. Here the gradient of the covariance with respect to all the individual lengthscales is computed.
:param dL_dK: the gradient of the objective with respect to the covariance function.
:type dL_dK: ndarray"""
self._dL_dl = (dL_dK*self.variance*self._K_dvar*(self.input_dim/2.*(self._lengthscales_two.T**4 - self._lengthscales**4) + 2*self._lengthscales2*self._K_dist2)/(self._w2*self._w2*self._lengthscales)).sum(1)
if self._lengthscales_two is self._lengthscales:
self._dL_dl_two = None
else:
self._dL_dl_two = (dL_dK*self.variance*self._K_dvar*(self.input_dim/2.*(self._lengthscales**4 - self._lengthscales_two.T**4 ) + 2*self._lengthscales_two2.T*self._K_dist2)/(self._w2*self._w2*self._lengthscales_two.T)).sum(0)
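
For input_dim=1 the covariance reduces to sqrt(2 l(x) l(x') / (l(x)^2 + l(x')^2)) exp(-r^2 / (l(x)^2 + l(x')^2)). A pure-numpy sketch with a hand-specified (hypothetical) lengthscale function, rather than the MLP mapping the class trains:

import numpy as np

def gibbs_k(x, x2, variance=1., l=lambda x: 0.5 + 0.4 * np.sin(x)):
    lx, lx2 = l(x)[:, None], l(x2)[None, :]
    w2 = lx ** 2 + lx2 ** 2
    r2 = (x[:, None] - x2[None, :]) ** 2
    return variance * np.sqrt(2. * lx * lx2 / w2) * np.exp(-r2 / w2)

x = np.linspace(0., 4., 5)
K = gibbs_k(x, x)                    # lengthscale now varies with input location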

76
GPy/kern/parts/hierarchical.py Normal file
View file

@ -0,0 +1,76 @@
# Copyright (c) 2012, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
from independent_outputs import index_to_slices
class Hierarchical(Kernpart):
"""
    A kernel part which can represent a hierarchy of independence: a generalisation of independent_outputs
"""
def __init__(self,parts):
self.levels = len(parts)
self.input_dim = parts[0].input_dim + 1
self.num_params = np.sum([k.num_params for k in parts])
self.name = 'hierarchy'
self.parts = parts
self.param_starts = np.hstack((0,np.cumsum([k.num_params for k in self.parts[:-1]])))
self.param_stops = np.cumsum([k.num_params for k in self.parts])
def _get_params(self):
return np.hstack([k._get_params() for k in self.parts])
def _set_params(self,x):
[k._set_params(x[start:stop]) for k, start, stop in zip(self.parts, self.param_starts, self.param_stops)]
def _get_param_names(self):
return sum([[str(i)+'_'+k.name+'_'+n for n in k._get_param_names()] for i,k in enumerate(self.parts)],[])
def _sort_slices(self,X,X2):
slices = [index_to_slices(x) for x in X[:,-self.levels:].T]
X = X[:,:-self.levels]
if X2 is None:
slices2 = slices
X2 = X
else:
slices2 = [index_to_slices(x) for x in X2[:,-self.levels:].T]
X2 = X2[:,:-self.levels]
return X, X2, slices, slices2
def K(self,X,X2,target):
X, X2, slices, slices2 = self._sort_slices(X,X2)
[[[[k.K(X[s],X2[s2],target[s,s2]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices_,slices2_)] for k, slices_, slices2_ in zip(self.parts,slices,slices2)]
def Kdiag(self,X,target):
raise NotImplementedError
#X,slices = X[:,:-1],index_to_slices(X[:,-1])
#[[self.k.Kdiag(X[s],target[s]) for s in slices_i] for slices_i in slices]
def dK_dtheta(self,dL_dK,X,X2,target):
X, X2, slices, slices2 = self._sort_slices(X,X2)
[[[[k.dK_dtheta(dL_dK[s,s2],X[s],X2[s2],target[p_start:p_stop]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices_, slices2_)] for k, p_start, p_stop, slices_, slices2_ in zip(self.parts, self.param_starts, self.param_stops, slices, slices2)]
def dK_dX(self,dL_dK,X,X2,target):
raise NotImplementedError
#X,slices = X[:,:-1],index_to_slices(X[:,-1])
#if X2 is None:
#X2,slices2 = X,slices
#else:
#X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
#[[[self.k.dK_dX(dL_dK[s,s2],X[s],X2[s2],target[s,:-1]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)]
#
def dKdiag_dX(self,dL_dKdiag,X,target):
raise NotImplementedError
#X,slices = X[:,:-1],index_to_slices(X[:,-1])
#[[self.k.dKdiag_dX(dL_dKdiag[s],X[s],target[s,:-1]) for s in slices_i] for slices_i in slices]
def dKdiag_dtheta(self,dL_dKdiag,X,target):
raise NotImplementedError
#X,slices = X[:,:-1],index_to_slices(X[:,-1])
#[[self.k.dKdiag_dX(dL_dKdiag[s],X[s],target) for s in slices_i] for slices_i in slices]

97
GPy/kern/parts/independent_outputs.py Normal file
View file

@ -0,0 +1,97 @@
# Copyright (c) 2012, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
def index_to_slices(index):
"""
    Take a numpy array of integers (index) and return a nested list of slices such that the slices describe the start and stop points for each integer in the index.
e.g.
>>> index = np.asarray([0,0,0,1,1,1,2,2,2])
returns
>>> [[slice(0,3,None)],[slice(3,6,None)],[slice(6,9,None)]]
or, a more complicated example
>>> index = np.asarray([0,0,1,1,0,2,2,2,1,1])
returns
>>> [[slice(0,2,None),slice(4,5,None)],[slice(2,4,None),slice(8,10,None)],[slice(5,8,None)]]
"""
    #construct the return structure
ind = np.asarray(index,dtype=np.int64)
ret = [[] for i in range(ind.max()+1)]
#find the switchpoints
ind_ = np.hstack((ind,ind[0]+ind[-1]+1))
switchpoints = np.nonzero(ind_ - np.roll(ind_,+1))[0]
[ret[ind_i].append(slice(*indexes_i)) for ind_i,indexes_i in zip(ind[switchpoints[:-1]],zip(switchpoints,switchpoints[1:]))]
return ret
class IndependentOutputs(Kernpart):
"""
    A kernel part which can represent several independent functions.
    This kernel 'switches off' parts of the matrix where the output indexes are different.
    The index of the functions is given by the last column of the input X;
    the rest of the columns of X are passed to the kernel for computation (in blocks).
"""
def __init__(self,k):
self.input_dim = k.input_dim + 1
self.num_params = k.num_params
self.name = 'iops('+ k.name + ')'
self.k = k
def _get_params(self):
return self.k._get_params()
def _set_params(self,x):
self.k._set_params(x)
self.params = x
def _get_param_names(self):
return self.k._get_param_names()
def K(self,X,X2,target):
#Sort out the slices from the input data
X,slices = X[:,:-1],index_to_slices(X[:,-1])
if X2 is None:
X2,slices2 = X,slices
else:
X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
[[[self.k.K(X[s],X2[s2],target[s,s2]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)]
def Kdiag(self,X,target):
X,slices = X[:,:-1],index_to_slices(X[:,-1])
[[self.k.Kdiag(X[s],target[s]) for s in slices_i] for slices_i in slices]
def dK_dtheta(self,dL_dK,X,X2,target):
X,slices = X[:,:-1],index_to_slices(X[:,-1])
if X2 is None:
X2,slices2 = X,slices
else:
X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
[[[self.k.dK_dtheta(dL_dK[s,s2],X[s],X2[s2],target) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)]
def dK_dX(self,dL_dK,X,X2,target):
X,slices = X[:,:-1],index_to_slices(X[:,-1])
if X2 is None:
X2,slices2 = X,slices
else:
X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
[[[self.k.dK_dX(dL_dK[s,s2],X[s],X2[s2],target[s,:-1]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)]
def dKdiag_dX(self,dL_dKdiag,X,target):
X,slices = X[:,:-1],index_to_slices(X[:,-1])
[[self.k.dKdiag_dX(dL_dKdiag[s],X[s],target[s,:-1]) for s in slices_i] for slices_i in slices]
def dKdiag_dtheta(self,dL_dKdiag,X,target):
X,slices = X[:,:-1],index_to_slices(X[:,-1])
        [[self.k.dKdiag_dtheta(dL_dKdiag[s],X[s],target) for s in slices_i] for slices_i in slices]
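
A quick demonstration of the slicing machinery (not part of the commit; assumes the package import path added here):

import numpy as np
from GPy.kern.parts.independent_outputs import index_to_slices

index = np.array([0, 0, 1, 1, 0, 2, 2, 2, 1, 1])
print(index_to_slices(index))
# -> [[slice(0, 2, None), slice(4, 5, None)],
#     [slice(2, 4, None), slice(8, 10, None)],
#     [slice(5, 8, None)]]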

76
GPy/kern/parts/kernpart.py Normal file
View file

@ -0,0 +1,76 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
class Kernpart(object):
def __init__(self,input_dim):
"""
The base class for a kernpart: a positive definite function which forms part of a kernel
:param input_dim: the number of input dimensions to the function
:type input_dim: int
Do not instantiate.
"""
self.input_dim = input_dim
self.num_params = 1
self.name = 'unnamed'
def _get_params(self):
raise NotImplementedError
def _set_params(self,x):
raise NotImplementedError
def _get_param_names(self):
raise NotImplementedError
def K(self,X,X2,target):
raise NotImplementedError
def Kdiag(self,X,target):
raise NotImplementedError
def dK_dtheta(self,dL_dK,X,X2,target):
raise NotImplementedError
def dKdiag_dtheta(self,dL_dKdiag,X,target):
# In the base case compute this by calling dK_dtheta. Need to
# override for stationary covariances (for example) to save
# time.
for i in range(X.shape[0]):
self.dK_dtheta(dL_dKdiag[i], X[i, :][None, :], X2=None, target=target)
def psi0(self,Z,mu,S,target):
raise NotImplementedError
def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target):
raise NotImplementedError
def dpsi0_dmuS(self,dL_dpsi0,Z,mu,S,target_mu,target_S):
raise NotImplementedError
def psi1(self,Z,mu,S,target):
raise NotImplementedError
    def dpsi1_dtheta(self,dL_dpsi1,Z,mu,S,target):
raise NotImplementedError
def dpsi1_dZ(self,dL_dpsi1,Z,mu,S,target):
raise NotImplementedError
def dpsi1_dmuS(self,dL_dpsi1,Z,mu,S,target_mu,target_S):
raise NotImplementedError
def psi2(self,Z,mu,S,target):
raise NotImplementedError
def dpsi2_dZ(self,dL_dpsi2,Z,mu,S,target):
raise NotImplementedError
def dpsi2_dtheta(self,dL_dpsi2,Z,mu,S,target):
raise NotImplementedError
def dpsi2_dmuS(self,dL_dpsi2,Z,mu,S,target_mu,target_S):
raise NotImplementedError
def dK_dX(self, dL_dK, X, X2, target):
raise NotImplementedError
class Kernpart_inner(Kernpart):
def __init__(self,input_dim):
"""
The base class for a kernpart_inner: a positive definite function which forms part of a kernel that is based on the inner product between inputs.
:param input_dim: the number of input dimensions to the function
:type input_dim: int
Do not instantiate.
"""
Kernpart.__init__(self, input_dim)
# initialize cache
self._Z, self._mu, self._S = np.empty(shape=(3, 1))
self._X, self._X2, self._params = np.empty(shape=(3, 1))
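
A minimal concrete subclass (a hypothetical 'constant' part, essentially Bias, shown only to illustrate the in-place target contract; assumes this module is on the path):

import numpy as np
from kernpart import Kernpart

class Constant(Kernpart):
    def __init__(self, input_dim, value=1.):
        Kernpart.__init__(self, input_dim)
        self.name = 'constant'
        self._set_params(np.array([value]).flatten())
    def _get_params(self):
        return self.value
    def _set_params(self, x):
        self.value = x
    def _get_param_names(self):
        return ['value']
    def K(self, X, X2, target):
        target += self.value              # accumulate, never overwrite
    def Kdiag(self, X, target):
        target += self.value
    def dK_dtheta(self, dL_dK, X, X2, target):
        target += dL_dK.sum()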

304
GPy/kern/parts/linear.py Normal file
View file

@ -0,0 +1,304 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
from ...util.linalg import tdot
from ...util.misc import fast_array_equal
from scipy import weave
class Linear(Kernpart):
"""
Linear kernel
.. math::
k(x,y) = \sum_{i=1}^{\\text{input\_dim}} \sigma^2_i x_i y_i
:param input_dim: the number of input dimensions
:type input_dim: int
:param variances: the vector of variances :math:`\sigma^2_i`
:type variances: array or list of the appropriate size (or float if there is only one variance parameter)
:param ARD: Auto Relevance Determination. If equal to "False", the kernel has only one variance parameter \sigma^2, otherwise there is one variance parameter per dimension.
:type ARD: Boolean
:rtype: kernel object
"""
def __init__(self, input_dim, variances=None, ARD=False):
self.input_dim = input_dim
self.ARD = ARD
if ARD == False:
self.num_params = 1
self.name = 'linear'
if variances is not None:
variances = np.asarray(variances)
assert variances.size == 1, "Only one variance needed for non-ARD kernel"
else:
variances = np.ones(1)
self._Xcache, self._X2cache = np.empty(shape=(2,))
else:
self.num_params = self.input_dim
self.name = 'linear'
if variances is not None:
variances = np.asarray(variances)
                assert variances.size == self.input_dim, "bad number of variances"
else:
variances = np.ones(self.input_dim)
self._set_params(variances.flatten())
# initialize cache
self._Z, self._mu, self._S = np.empty(shape=(3, 1))
self._X, self._X2, self._params = np.empty(shape=(3, 1))
def _get_params(self):
return self.variances
def _set_params(self, x):
assert x.size == (self.num_params)
self.variances = x
self.variances2 = np.square(self.variances)
def _get_param_names(self):
if self.num_params == 1:
return ['variance']
else:
return ['variance_%i' % i for i in range(self.variances.size)]
def K(self, X, X2, target):
if self.ARD:
XX = X * np.sqrt(self.variances)
if X2 is None:
target += tdot(XX)
else:
XX2 = X2 * np.sqrt(self.variances)
target += np.dot(XX, XX2.T)
else:
self._K_computations(X, X2)
target += self.variances * self._dot_product
def Kdiag(self, X, target):
np.add(target, np.sum(self.variances * np.square(X), -1), target)
def dK_dtheta(self, dL_dK, X, X2, target):
if self.ARD:
if X2 is None:
[np.add(target[i:i + 1], np.sum(dL_dK * tdot(X[:, i:i + 1])), target[i:i + 1]) for i in range(self.input_dim)]
else:
product = X[:, None, :] * X2[None, :, :]
target += (dL_dK[:, :, None] * product).sum(0).sum(0)
else:
self._K_computations(X, X2)
target += np.sum(self._dot_product * dL_dK)
def dKdiag_dtheta(self, dL_dKdiag, X, target):
tmp = dL_dKdiag[:, None] * X ** 2
if self.ARD:
target += tmp.sum(0)
else:
target += tmp.sum()
def dK_dX(self, dL_dK, X, X2, target):
target += (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
def dKdiag_dX(self,dL_dKdiag,X,target):
target += 2.*self.variances*dL_dKdiag[:,None]*X
#---------------------------------------#
# PSI statistics #
#---------------------------------------#
def psi0(self, Z, mu, S, target):
self._psi_computations(Z, mu, S)
target += np.sum(self.variances * self.mu2_S, 1)
def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S, target):
self._psi_computations(Z, mu, S)
tmp = dL_dpsi0[:, None] * self.mu2_S
if self.ARD:
target += tmp.sum(0)
else:
target += tmp.sum()
def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S, target_mu, target_S):
target_mu += dL_dpsi0[:, None] * (2.0 * mu * self.variances)
target_S += dL_dpsi0[:, None] * self.variances
    def psi1(self, Z, mu, S, target):
        """For the linear kernel psi1 is just K(mu, Z); S plays no role."""
        self.K(mu, Z, target)
    def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target):
        """S plays no role in psi1, so this is just dK_dtheta evaluated at (mu, Z)."""
self.dK_dtheta(dL_dpsi1, mu, Z, target)
def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S):
"""Do nothing for S, it does not affect psi1"""
self._psi_computations(Z, mu, S)
target_mu += (dL_dpsi1[:, :, None] * (Z * self.variances)).sum(1)
def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target):
self.dK_dX(dL_dpsi1.T, Z, mu, target)
def psi2(self, Z, mu, S, target):
self._psi_computations(Z, mu, S)
target += self._psi2
def psi2_new(self,Z,mu,S,target):
tmp = np.zeros((mu.shape[0], Z.shape[0]))
self.K(mu,Z,tmp)
target += tmp[:,:,None]*tmp[:,None,:] + np.sum(S[:,None,None,:]*self.variances**2*Z[None,:,None,:]*Z[None,None,:,:],-1)
def dpsi2_dtheta_new(self, dL_dpsi2, Z, mu, S, target):
tmp = np.zeros((mu.shape[0], Z.shape[0]))
self.K(mu,Z,tmp)
self.dK_dtheta(2.*np.sum(dL_dpsi2*tmp[:,None,:],2),mu,Z,target)
result= 2.*(dL_dpsi2[:,:,:,None]*S[:,None,None,:]*self.variances*Z[None,:,None,:]*Z[None,None,:,:]).sum(0).sum(0).sum(0)
        if self.ARD:
            target += result
else:
target += result.sum()
def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target):
self._psi_computations(Z, mu, S)
tmp = dL_dpsi2[:, :, :, None] * (self.ZAinner[:, :, None, :] * (2 * Z)[None, None, :, :])
if self.ARD:
target += tmp.sum(0).sum(0).sum(0)
else:
target += tmp.sum()
def dpsi2_dmuS_new(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
tmp = np.zeros((mu.shape[0], Z.shape[0]))
self.K(mu,Z,tmp)
self.dK_dX(2.*np.sum(dL_dpsi2*tmp[:,None,:],2),mu,Z,target_mu)
Zs = Z*self.variances
Zs_sq = Zs[:,None,:]*Zs[None,:,:]
target_S += (dL_dpsi2[:,:,:,None]*Zs_sq[None,:,:,:]).sum(1).sum(1)
    def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
        """Work with arrays of shape (N, num_inducing, num_inducing, input_dim)."""
self._psi_computations(Z, mu, S)
AZZA = self.ZA.T[:, None, :, None] * self.ZA[None, :, None, :]
AZZA = AZZA + AZZA.swapaxes(1, 2)
AZZA_2 = AZZA/2.
#muAZZA = np.tensordot(mu,AZZA,(-1,0))
#target_mu_dummy, target_S_dummy = np.zeros_like(target_mu), np.zeros_like(target_S)
#target_mu_dummy += (dL_dpsi2[:, :, :, None] * muAZZA).sum(1).sum(1)
#target_S_dummy += (dL_dpsi2[:, :, :, None] * self.ZA[None, :, None, :] * self.ZA[None, None, :, :]).sum(1).sum(1)
        #Using weave, we can exploit the symmetry of this problem:
code = """
int n, m, mm,q,qq;
double factor,tmp;
#pragma omp parallel for private(m,mm,q,qq,factor,tmp)
for(n=0;n<N;n++){
for(m=0;m<num_inducing;m++){
for(mm=0;mm<=m;mm++){
//add in a factor of 2 for the off-diagonal terms (and then count them only once)
if(m==mm)
factor = dL_dpsi2(n,m,mm);
else
factor = 2.0*dL_dpsi2(n,m,mm);
for(q=0;q<input_dim;q++){
//take the dot product of mu[n,:] and AZZA[:,m,mm,q] TODO: blas!
tmp = 0.0;
for(qq=0;qq<input_dim;qq++){
tmp += mu(n,qq)*AZZA(qq,m,mm,q);
}
target_mu(n,q) += factor*tmp;
target_S(n,q) += factor*AZZA_2(q,m,mm,q);
}
}
}
}
"""
support_code = """
#include <omp.h>
#include <math.h>
"""
weave_options = {'headers' : ['<omp.h>'],
'extra_compile_args': ['-fopenmp -O3'], #-march=native'],
'extra_link_args' : ['-lgomp']}
N,num_inducing,input_dim = mu.shape[0],Z.shape[0],mu.shape[1]
weave.inline(code, support_code=support_code, libraries=['gomp'],
arg_names=['N','num_inducing','input_dim','mu','AZZA','AZZA_2','target_mu','target_S','dL_dpsi2'],
type_converters=weave.converters.blitz,**weave_options)
def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target):
self._psi_computations(Z, mu, S)
#psi2_dZ = dL_dpsi2[:, :, :, None] * self.variances * self.ZAinner[:, :, None, :]
#dummy_target = np.zeros_like(target)
#dummy_target += psi2_dZ.sum(0).sum(0)
AZA = self.variances*self.ZAinner
code="""
int n,m,mm,q;
#pragma omp parallel for private(n,mm,q)
for(m=0;m<num_inducing;m++){
for(q=0;q<input_dim;q++){
for(mm=0;mm<num_inducing;mm++){
for(n=0;n<N;n++){
target(m,q) += dL_dpsi2(n,m,mm)*AZA(n,mm,q);
}
}
}
}
"""
support_code = """
#include <omp.h>
#include <math.h>
"""
weave_options = {'headers' : ['<omp.h>'],
'extra_compile_args': ['-fopenmp -O3'], #-march=native'],
'extra_link_args' : ['-lgomp']}
N,num_inducing,input_dim = mu.shape[0],Z.shape[0],mu.shape[1]
weave.inline(code, support_code=support_code, libraries=['gomp'],
arg_names=['N','num_inducing','input_dim','AZA','target','dL_dpsi2'],
type_converters=weave.converters.blitz,**weave_options)
#---------------------------------------#
# Precomputations #
#---------------------------------------#
def _K_computations(self, X, X2):
if not (fast_array_equal(X, self._Xcache) and fast_array_equal(X2, self._X2cache)):
self._Xcache = X.copy()
if X2 is None:
self._dot_product = tdot(X)
self._X2cache = None
else:
self._X2cache = X2.copy()
self._dot_product = np.dot(X, X2.T)
def _psi_computations(self, Z, mu, S):
# here are the "statistics" for psi1 and psi2
Zv_changed = not (fast_array_equal(Z, self._Z) and fast_array_equal(self.variances, self._variances))
muS_changed = not (fast_array_equal(mu, self._mu) and fast_array_equal(S, self._S))
if Zv_changed:
# Z has changed, compute Z specific stuff
# self.ZZ = Z[:,None,:]*Z[None,:,:] # num_inducing,num_inducing,input_dim
# self.ZZ = np.empty((Z.shape[0], Z.shape[0], Z.shape[1]), order='F')
# [tdot(Z[:, i:i + 1], self.ZZ[:, :, i].T) for i in xrange(Z.shape[1])]
self.ZA = Z * self.variances
self._Z = Z.copy()
self._variances = self.variances.copy()
if muS_changed:
self.mu2_S = np.square(mu) + S
self.inner = (mu[:, None, :] * mu[:, :, None])
diag_indices = np.diag_indices(mu.shape[1], 2)
self.inner[:, diag_indices[0], diag_indices[1]] += S
self._mu, self._S = mu.copy(), S.copy()
if Zv_changed or muS_changed:
self.ZAinner = np.dot(self.ZA, self.inner).swapaxes(0, 1) # NOTE: self.ZAinner \in [num_inducing x N x input_dim]!
self._psi2 = np.dot(self.ZAinner, self.ZA.T)
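
A sketch checking the ARD branch of K against X diag(v) X2^T (not part of the commit; importing this module needs a scipy that still ships weave):

import numpy as np
from GPy.kern.parts.linear import Linear

X, X2 = np.random.randn(4, 3), np.random.randn(5, 3)
v = np.array([0.5, 1.0, 2.0])
k = Linear(input_dim=3, variances=v, ARD=True)
K = np.zeros((4, 5))
k.K(X, X2, K)
assert np.allclose(K, np.dot(X * v, X2.T))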

155
GPy/kern/parts/mlp.py Normal file
View file

@ -0,0 +1,155 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
four_over_tau = 2./np.pi
class MLP(Kernpart):
"""
multi layer perceptron kernel (also known as arc sine kernel or neural network kernel)
.. math::
k(x,y) = \sigma^2 \frac{2}{\pi} \text{asin} \left(\frac{\sigma_w^2 x^\top y+\sigma_b^2}{\sqrt{\sigma_w^2x^\top x + \sigma_b^2 + 1}\sqrt{\sigma_w^2 y^\top y + \sigma_b^2 +1}} \right)
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance :math:`\sigma^2`
:type variance: float
:param weight_variance: the vector of the variances of the prior over input weights in the neural network :math:`\sigma^2_w`
:type weight_variance: array or list of the appropriate size (or float if there is only one weight variance parameter)
:param bias_variance: the variance of the prior over bias parameters :math:`\sigma^2_b`
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (i.e. one weight variance parameter \sigma^2_w), otherwise there is one weight variance parameter per dimension.
:type ARD: Boolean
:rtype: Kernpart object
"""
def __init__(self, input_dim, variance=1., weight_variance=None, bias_variance=100., ARD=False):
self.input_dim = input_dim
self.ARD = ARD
if not ARD:
self.num_params=3
if weight_variance is not None:
weight_variance = np.asarray(weight_variance)
assert weight_variance.size == 1, "Only one weight variance needed for non-ARD kernel"
else:
weight_variance = 100.*np.ones(1)
else:
self.num_params = self.input_dim + 2
if weight_variance is not None:
weight_variance = np.asarray(weight_variance)
assert weight_variance.size == self.input_dim, "bad number of weight variances"
else:
weight_variance = np.ones(self.input_dim)
raise NotImplementedError
self.name='mlp'
self._set_params(np.hstack((variance, weight_variance.flatten(), bias_variance)))
def _get_params(self):
return np.hstack((self.variance, self.weight_variance.flatten(), self.bias_variance))
def _set_params(self, x):
assert x.size == (self.num_params)
self.variance = x[0]
self.weight_variance = x[1:-1]
self.weight_std = np.sqrt(self.weight_variance)
self.bias_variance = x[-1]
def _get_param_names(self):
if self.num_params == 3:
return ['variance', 'weight_variance', 'bias_variance']
else:
            return ['variance'] + ['weight_variance_%i' % i for i in range(self.weight_variance.size)] + ['bias_variance']
def K(self, X, X2, target):
"""Return covariance between X and X2."""
self._K_computations(X, X2)
target += self.variance*self._K_dvar
def Kdiag(self, X, target):
"""Compute the diagonal of the covariance matrix for X."""
self._K_diag_computations(X)
target+= self.variance*self._K_diag_dvar
def dK_dtheta(self, dL_dK, X, X2, target):
"""Derivative of the covariance with respect to the parameters."""
self._K_computations(X, X2)
denom3 = self._K_denom*self._K_denom*self._K_denom
base = four_over_tau*self.variance/np.sqrt(1-self._K_asin_arg*self._K_asin_arg)
base_cov_grad = base*dL_dK
if X2 is None:
vec = np.diag(self._K_inner_prod)
target[1] += ((self._K_inner_prod/self._K_denom
-.5*self._K_numer/denom3
*(np.outer((self.weight_variance*vec+self.bias_variance+1.), vec)
+np.outer(vec,(self.weight_variance*vec+self.bias_variance+1.))))*base_cov_grad).sum()
target[2] += ((1./self._K_denom
-.5*self._K_numer/denom3
*((vec[None, :]+vec[:, None])*self.weight_variance
+2.*self.bias_variance + 2.))*base_cov_grad).sum()
else:
vec1 = (X*X).sum(1)
vec2 = (X2*X2).sum(1)
target[1] += ((self._K_inner_prod/self._K_denom
-.5*self._K_numer/denom3
*(np.outer((self.weight_variance*vec1+self.bias_variance+1.), vec2) + np.outer(vec1, self.weight_variance*vec2 + self.bias_variance+1.)))*base_cov_grad).sum()
target[2] += ((1./self._K_denom
-.5*self._K_numer/denom3
*((vec1[:, None]+vec2[None, :])*self.weight_variance
+ 2*self.bias_variance + 2.))*base_cov_grad).sum()
target[0] += np.sum(self._K_dvar*dL_dK)
def dK_dX(self, dL_dK, X, X2, target):
"""Derivative of the covariance matrix with respect to X"""
self._K_computations(X, X2)
arg = self._K_asin_arg
numer = self._K_numer
denom = self._K_denom
vec2 = (X2*X2).sum(1)*self.weight_variance + self.bias_variance + 1.
denom3 = denom*denom*denom
target += four_over_tau*self.weight_variance*self.variance*((X2[None, :, :]/denom[:, :, None] - vec2[None, :, None]*X[:, None, :]*(numer/denom3)[:, :, None])*(dL_dK/np.sqrt(1-arg*arg))[:, :, None]).sum(1)
def dKdiag_dX(self, dL_dKdiag, X, target):
"""Gradient of diagonal of covariance with respect to X"""
self._K_diag_computations(X)
arg = self._K_diag_asin_arg
denom = self._K_diag_denom
numer = self._K_diag_numer
target += four_over_tau*2.*self.weight_variance*self.variance*X*(1/denom*(1 - arg)*dL_dKdiag/(np.sqrt(1-arg*arg)))[:, None]
def _K_computations(self, X, X2):
"""Pre-computations for the covariance matrix (used for computing the covariance and its gradients."""
if self.ARD:
pass
else:
if X2 is None:
self._K_inner_prod = np.dot(X,X.T)
self._K_numer = self._K_inner_prod*self.weight_variance+self.bias_variance
vec = np.diag(self._K_numer) + 1.
self._K_denom = np.sqrt(np.outer(vec,vec))
self._K_asin_arg = self._K_numer/self._K_denom
self._K_dvar = four_over_tau*np.arcsin(self._K_asin_arg)
else:
self._K_inner_prod = np.dot(X,X2.T)
self._K_numer = self._K_inner_prod*self.weight_variance + self.bias_variance
vec1 = (X*X).sum(1)*self.weight_variance + self.bias_variance + 1.
vec2 = (X2*X2).sum(1)*self.weight_variance + self.bias_variance + 1.
self._K_denom = np.sqrt(np.outer(vec1,vec2))
self._K_asin_arg = self._K_numer/self._K_denom
self._K_dvar = four_over_tau*np.arcsin(self._K_asin_arg)
def _K_diag_computations(self, X):
"""Pre-computations concerning the diagonal terms (used for computation of diagonal and its gradients)."""
if self.ARD:
pass
else:
self._K_diag_numer = (X*X).sum(1)*self.weight_variance + self.bias_variance
self._K_diag_denom = self._K_diag_numer+1.
self._K_diag_asin_arg = self._K_diag_numer/self._K_diag_denom
self._K_diag_dvar = four_over_tau*np.arcsin(self._K_diag_asin_arg)
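
A sketch evaluating the arc-sine formula directly and comparing with K (not part of the commit; non-ARD case only, assumes GPy is importable):

import numpy as np
from GPy.kern.parts.mlp import MLP

X = np.random.randn(4, 2)
k = MLP(input_dim=2, variance=1.5, weight_variance=2.0, bias_variance=0.5)
K = np.zeros((4, 4))
k.K(X, None, K)

num = 2.0 * np.dot(X, X.T) + 0.5                 # sigma_w^2 x'y + sigma_b^2
vec = np.diag(num) + 1.
K_direct = 1.5 * (2. / np.pi) * np.arcsin(num / np.sqrt(np.outer(vec, vec)))
assert np.allclose(K, K_direct)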

248
GPy/kern/parts/periodic_Matern32.py Normal file
View file

@ -0,0 +1,248 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
from GPy.util.linalg import mdot
from GPy.util.decorators import silence_errors
class PeriodicMatern32(Kernpart):
"""
Kernel of the periodic subspace (up to a given frequency) of a Matern 3/2 RKHS. Only defined for input_dim=1.
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance of the Matern kernel
:type variance: float
:param lengthscale: the lengthscale of the Matern kernel
:type lengthscale: np.ndarray of size (input_dim,)
:param period: the period
:type period: float
:param n_freq: the number of frequencies considered for the periodic subspace
:type n_freq: int
:rtype: kernel object
"""
def __init__(self, input_dim=1, variance=1., lengthscale=None, period=2 * np.pi, n_freq=10, lower=0., upper=4 * np.pi):
assert input_dim==1, "Periodic kernels are only defined for input_dim=1"
self.name = 'periodic_Mat32'
self.input_dim = input_dim
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == 1, "Wrong size: only one lengthscale needed"
else:
lengthscale = np.ones(1)
self.lower,self.upper = lower, upper
self.num_params = 3
self.n_freq = n_freq
self.n_basis = 2*n_freq
self._set_params(np.hstack((variance,lengthscale,period)))
def _cos(self,alpha,omega,phase):
def f(x):
return alpha*np.cos(omega*x+phase)
return f
@silence_errors
def _cos_factorization(self,alpha,omega,phase):
r1 = np.sum(alpha*np.cos(phase),axis=1)[:,None]
r2 = np.sum(alpha*np.sin(phase),axis=1)[:,None]
r = np.sqrt(r1**2 + r2**2)
psi = np.where(r1 != 0, (np.arctan(r2/r1) + (r1<0.)*np.pi),np.arcsin(r2))
return r,omega[:,0:1], psi
@silence_errors
def _int_computation(self,r1,omega1,phi1,r2,omega2,phi2):
Gint1 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + 1./(omega1-omega2.T)*( np.sin((omega1-omega2.T)*self.upper+phi1-phi2.T) - np.sin((omega1-omega2.T)*self.lower+phi1-phi2.T) )
Gint2 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + np.cos(phi1-phi2.T)*(self.upper-self.lower)
#Gint2[0,0] = 2.*(self.upper-self.lower)*np.cos(phi1[0,0])*np.cos(phi2[0,0])
Gint = np.dot(r1,r2.T)/2 * np.where(np.isnan(Gint1),Gint2,Gint1)
return Gint
def _get_params(self):
"""return the value of the parameters."""
return np.hstack((self.variance,self.lengthscale,self.period))
def _set_params(self,x):
"""set the value of the parameters."""
assert x.size==3
self.variance = x[0]
self.lengthscale = x[1]
self.period = x[2]
self.a = [3./self.lengthscale**2, 2*np.sqrt(3)/self.lengthscale, 1.]
self.b = [1,self.lengthscale**2/3]
self.basis_alpha = np.ones((self.n_basis,))
self.basis_omega = np.array(sum([[i*2*np.pi/self.period]*2 for i in range(1,self.n_freq+1)],[]))
self.basis_phi = np.array(sum([[-np.pi/2, 0.] for i in range(1,self.n_freq+1)],[]))
self.G = self.Gram_matrix()
self.Gi = np.linalg.inv(self.G)
def _get_param_names(self):
"""return parameter names."""
return ['variance','lengthscale','period']
def Gram_matrix(self):
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega,self.a[2]*self.basis_omega**2))
Lo = np.column_stack((self.basis_omega,self.basis_omega,self.basis_omega))
Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2,self.basis_phi+np.pi))
r,omega,phi = self._cos_factorization(La,Lo,Lp)
Gint = self._int_computation( r,omega,phi, r,omega,phi)
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
return(self.lengthscale**3/(12*np.sqrt(3)*self.variance) * Gint + 1./self.variance*np.dot(Flower,Flower.T) + self.lengthscale**2/(3.*self.variance)*np.dot(F1lower,F1lower.T))
def K(self,X,X2,target):
"""Compute the covariance matrix between X and X2."""
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
if X2 is None:
FX2 = FX
else:
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
np.add(mdot(FX,self.Gi,FX2.T), target,target)
def Kdiag(self,X,target):
"""Compute the diagonal of the covariance matrix associated to X."""
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
np.add(target,np.diag(mdot(FX,self.Gi,FX.T)),target)
@silence_errors
def dK_dtheta(self,dL_dK,X,X2,target):
"""derivative of the covariance matrix with respect to the parameters (shape is Nxnum_inducingxNparam)"""
if X2 is None: X2 = X
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega,self.a[2]*self.basis_omega**2))
Lo = np.column_stack((self.basis_omega,self.basis_omega,self.basis_omega))
Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2,self.basis_phi+np.pi))
r,omega,phi = self._cos_factorization(La,Lo,Lp)
Gint = self._int_computation( r,omega,phi, r,omega,phi)
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
#dK_dvar
dK_dvar = 1./self.variance*mdot(FX,self.Gi,FX2.T)
#dK_dlen
da_dlen = [-6/self.lengthscale**3,-2*np.sqrt(3)/self.lengthscale**2,0.]
db_dlen = [0.,2*self.lengthscale/3.]
dLa_dlen = np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)),da_dlen[1]*self.basis_omega,da_dlen[2]*self.basis_omega**2))
r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
dGint_dlen = dGint_dlen + dGint_dlen.T
dG_dlen = self.lengthscale**2/(4*np.sqrt(3))*Gint + self.lengthscale**3/(12*np.sqrt(3))*dGint_dlen + db_dlen[0]*np.dot(Flower,Flower.T) + db_dlen[1]*np.dot(F1lower,F1lower.T)
dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX2.T)
#dK_dper
dFX_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
dFX2_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X2,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X2)
dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period, -self.a[2]*self.basis_omega**3/self.period))
dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi,self.basis_phi+np.pi*3/2))
r1,omega1,phi1 = self._cos_factorization(dLa_dper,Lo,dLp_dper)
IPPprim1 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
IPPprim2 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + self.upper*np.cos(phi-phi1.T))
IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + self.lower*np.cos(phi-phi1.T))
#IPPprim2[0,0] = 2*(self.upper**2 - self.lower**2)*np.cos(phi[0,0])*np.cos(phi1[0,0])
IPPprim = np.where(np.isnan(IPPprim1),IPPprim2,IPPprim1)
IPPint1 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
IPPint2 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./2*self.upper**2*np.cos(phi-phi1.T)
IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./2*self.lower**2*np.cos(phi-phi1.T)
#IPPint2[0,0] = (self.upper**2 - self.lower**2)*np.cos(phi[0,0])*np.cos(phi1[0,0])
IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period, -2*self.a[2]*self.basis_omega**2/self.period))
dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi))
r2,omega2,phi2 = self._cos_factorization(dLa_dper2,Lo[:,0:2],dLp_dper2)
dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) + self._int_computation(r2,omega2,phi2, r,omega,phi)
dGint_dper = dGint_dper + dGint_dper.T
dFlower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
dF1lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower)+self._cos(-self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
dG_dper = 1./self.variance*(self.lengthscale**3/(12*np.sqrt(3))*dGint_dper + self.b[0]*(np.dot(dFlower_dper,Flower.T)+np.dot(Flower,dFlower_dper.T)) + self.b[1]*(np.dot(dF1lower_dper,F1lower.T)+np.dot(F1lower,dF1lower_dper.T)))
dK_dper = mdot(dFX_dper,self.Gi,FX2.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX2.T) + mdot(FX,self.Gi,dFX2_dper.T)
# np.add(target[:,:,0],dK_dvar, target[:,:,0])
target[0] += np.sum(dK_dvar*dL_dK)
#np.add(target[:,:,1],dK_dlen, target[:,:,1])
target[1] += np.sum(dK_dlen*dL_dK)
#np.add(target[:,:,2],dK_dper, target[:,:,2])
target[2] += np.sum(dK_dper*dL_dK)
@silence_errors
def dKdiag_dtheta(self,dL_dKdiag,X,target):
"""derivative of the diagonal covariance matrix with respect to the parameters"""
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega, self.a[2]*self.basis_omega**2))
Lo = np.column_stack((self.basis_omega,self.basis_omega,self.basis_omega))
Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2,self.basis_phi+np.pi))
r,omega,phi = self._cos_factorization(La,Lo,Lp)
Gint = self._int_computation( r,omega,phi, r,omega,phi)
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
#dK_dvar
dK_dvar = 1./self.variance*mdot(FX,self.Gi,FX.T)
#dK_dlen
da_dlen = [-6/self.lengthscale**3,-2*np.sqrt(3)/self.lengthscale**2,0.]
db_dlen = [0.,2*self.lengthscale/3.]
dLa_dlen = np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)),da_dlen[1]*self.basis_omega,da_dlen[2]*self.basis_omega**2))
r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
dGint_dlen = dGint_dlen + dGint_dlen.T
dG_dlen = self.lengthscale**2/(4*np.sqrt(3))*Gint + self.lengthscale**3/(12*np.sqrt(3))*dGint_dlen + db_dlen[0]*np.dot(Flower,Flower.T) + db_dlen[1]*np.dot(F1lower,F1lower.T)
dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX.T)
#dK_dper
dFX_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period, -self.a[2]*self.basis_omega**3/self.period))
dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi,self.basis_phi+np.pi*3/2))
r1,omega1,phi1 = self._cos_factorization(dLa_dper,Lo,dLp_dper)
IPPprim1 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
IPPprim2 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + self.upper*np.cos(phi-phi1.T))
IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + self.lower*np.cos(phi-phi1.T))
IPPprim = np.where(np.isnan(IPPprim1),IPPprim2,IPPprim1)
IPPint1 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
IPPint2 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./2*self.upper**2*np.cos(phi-phi1.T)
IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./2*self.lower**2*np.cos(phi-phi1.T)
IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period, -2*self.a[2]*self.basis_omega**2/self.period))
dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi))
r2,omega2,phi2 = self._cos_factorization(dLa_dper2,Lo[:,0:2],dLp_dper2)
dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) + self._int_computation(r2,omega2,phi2, r,omega,phi)
dGint_dper = dGint_dper + dGint_dper.T
dFlower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
dF1lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower)+self._cos(-self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
dG_dper = 1./self.variance*(self.lengthscale**3/(12*np.sqrt(3))*dGint_dper + self.b[0]*(np.dot(dFlower_dper,Flower.T)+np.dot(Flower,dFlower_dper.T)) + self.b[1]*(np.dot(dF1lower_dper,F1lower.T)+np.dot(F1lower,dF1lower_dper.T)))
dK_dper = 2* mdot(dFX_dper,self.Gi,FX.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX.T)
target[0] += np.sum(np.diag(dK_dvar)*dL_dKdiag)
target[1] += np.sum(np.diag(dK_dlen)*dL_dKdiag)
target[2] += np.sum(np.diag(dK_dper)*dL_dKdiag)

View file

@@ -0,0 +1,266 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
from GPy.util.linalg import mdot
from GPy.util.decorators import silence_errors
class PeriodicMatern52(Kernpart):
"""
Kernel of the periodic subspace (up to a given frequency) of a Matern 5/2 RKHS. Only defined for input_dim=1.
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance of the Matern kernel
:type variance: float
:param lengthscale: the lengthscale of the Matern kernel
:type lengthscale: np.ndarray of size (input_dim,)
:param period: the period
:type period: float
:param n_freq: the number of frequencies considered for the periodic subspace
:type n_freq: int
:rtype: kernel object
"""
def __init__(self,input_dim=1,variance=1.,lengthscale=None,period=2*np.pi,n_freq=10,lower=0.,upper=4*np.pi):
assert input_dim==1, "Periodic kernels are only defined for input_dim=1"
self.name = 'periodic_Mat52'
self.input_dim = input_dim
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == 1, "Wrong size: only one lengthscale needed"
else:
lengthscale = np.ones(1)
self.lower,self.upper = lower, upper
self.num_params = 3
self.n_freq = n_freq
self.n_basis = 2*n_freq
self._set_params(np.hstack((variance,lengthscale,period)))
def _cos(self,alpha,omega,phase):
def f(x):
return alpha*np.cos(omega*x+phase)
return f
@silence_errors
def _cos_factorization(self,alpha,omega,phase):
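# Phasor addition: each row of alpha/phase sums alpha_i*cos(omega*x + phase_i)
# into a single r*cos(omega*x + psi), recovering r and psi from the summed
# cosine (r1) and sine (r2) components.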
r1 = np.sum(alpha*np.cos(phase),axis=1)[:,None]
r2 = np.sum(alpha*np.sin(phase),axis=1)[:,None]
r = np.sqrt(r1**2 + r2**2)
# r1 == 0 means the phase is +-pi/2, fixed by the sign of r2 (np.arcsin(r2) is only valid for |r2| <= 1)
psi = np.where(r1 != 0, np.arctan(r2/r1) + (r1<0.)*np.pi, np.sign(r2)*np.pi/2)
return r,omega[:,0:1], psi
@silence_errors
def _int_computation(self,r1,omega1,phi1,r2,omega2,phi2):
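# Integral over [lower, upper] of products of two cosine rows,
# r1*cos(omega1*x + phi1) * r2*cos(omega2*x + phi2), via the product-to-sum
# identity; Gint2 supplies the omega1 == omega2 limit where Gint1 degenerates
# to NaN.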
Gint1 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + 1./(omega1-omega2.T)*( np.sin((omega1-omega2.T)*self.upper+phi1-phi2.T) - np.sin((omega1-omega2.T)*self.lower+phi1-phi2.T) )
Gint2 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + np.cos(phi1-phi2.T)*(self.upper-self.lower)
#Gint2[0,0] = 2.*(self.upper-self.lower)*np.cos(phi1[0,0])*np.cos(phi2[0,0])
Gint = np.dot(r1,r2.T)/2 * np.where(np.isnan(Gint1),Gint2,Gint1)
return Gint
def _get_params(self):
"""return the value of the parameters."""
return np.hstack((self.variance,self.lengthscale,self.period))
def _set_params(self,x):
"""set the value of the parameters."""
assert x.size==3
self.variance = x[0]
self.lengthscale = x[1]
self.period = x[2]
self.a = [5*np.sqrt(5)/self.lengthscale**3, 15./self.lengthscale**2,3*np.sqrt(5)/self.lengthscale, 1.]
self.b = [9./8, 9*self.lengthscale**4/200., 3*self.lengthscale**2/5., 3*self.lengthscale**2/(5*8.), 3*self.lengthscale**2/(5*8.)]
self.basis_alpha = np.ones((2*self.n_freq,))
self.basis_omega = np.array(sum([[i*2*np.pi/self.period]*2 for i in range(1,self.n_freq+1)],[]))
self.basis_phi = np.array(sum([[-np.pi/2, 0.] for i in range(1,self.n_freq+1)],[]))
self.G = self.Gram_matrix()
self.Gi = np.linalg.inv(self.G)
def _get_param_names(self):
"""return parameter names."""
return ['variance','lengthscale','period']
def Gram_matrix(self):
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)), self.a[1]*self.basis_omega, self.a[2]*self.basis_omega**2, self.a[3]*self.basis_omega**3))
Lo = np.column_stack((self.basis_omega, self.basis_omega, self.basis_omega, self.basis_omega))
Lp = np.column_stack((self.basis_phi, self.basis_phi+np.pi/2, self.basis_phi+np.pi, self.basis_phi+np.pi*3/2))
r,omega,phi = self._cos_factorization(La,Lo,Lp)
Gint = self._int_computation( r,omega,phi, r,omega,phi)
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
F2lower = np.array(self._cos(self.basis_alpha*self.basis_omega**2,self.basis_omega,self.basis_phi+np.pi)(self.lower))[:,None]
lower_terms = self.b[0]*np.dot(Flower,Flower.T) + self.b[1]*np.dot(F2lower,F2lower.T) + self.b[2]*np.dot(F1lower,F1lower.T) + self.b[3]*np.dot(F2lower,Flower.T) + self.b[4]*np.dot(Flower,F2lower.T)
return(3*self.lengthscale**5/(400*np.sqrt(5)*self.variance) * Gint + 1./self.variance*lower_terms)
def K(self,X,X2,target):
"""Compute the covariance matrix between X and X2."""
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
if X2 is None:
FX2 = FX
else:
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
np.add(mdot(FX,self.Gi,FX2.T), target,target)
def Kdiag(self,X,target):
"""Compute the diagonal of the covariance matrix associated to X."""
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
np.add(target,np.diag(mdot(FX,self.Gi,FX.T)),target)
@silence_errors
def dK_dtheta(self,dL_dK,X,X2,target):
"""derivative of the covariance matrix with respect to the parameters (shape is Nxnum_inducingxNparam)"""
if X2 is None: X2 = X
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)), self.a[1]*self.basis_omega, self.a[2]*self.basis_omega**2, self.a[3]*self.basis_omega**3))
Lo = np.column_stack((self.basis_omega, self.basis_omega, self.basis_omega, self.basis_omega))
Lp = np.column_stack((self.basis_phi, self.basis_phi+np.pi/2, self.basis_phi+np.pi, self.basis_phi+np.pi*3/2))
r,omega,phi = self._cos_factorization(La,Lo,Lp)
Gint = self._int_computation( r,omega,phi, r,omega,phi)
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
F2lower = np.array(self._cos(self.basis_alpha*self.basis_omega**2,self.basis_omega,self.basis_phi+np.pi)(self.lower))[:,None]
#dK_dvar
dK_dvar = 1./self.variance*mdot(FX,self.Gi,FX2.T)
#dK_dlen
da_dlen = [-3*self.a[0]/self.lengthscale, -2*self.a[1]/self.lengthscale, -self.a[2]/self.lengthscale, 0.]
db_dlen = [0., 4*self.b[1]/self.lengthscale, 2*self.b[2]/self.lengthscale, 2*self.b[3]/self.lengthscale, 2*self.b[4]/self.lengthscale]
dLa_dlen = np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)), da_dlen[1]*self.basis_omega, da_dlen[2]*self.basis_omega**2, da_dlen[3]*self.basis_omega**3))
r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
dGint_dlen = dGint_dlen + dGint_dlen.T
dlower_terms_dlen = db_dlen[0]*np.dot(Flower,Flower.T) + db_dlen[1]*np.dot(F2lower,F2lower.T) + db_dlen[2]*np.dot(F1lower,F1lower.T) + db_dlen[3]*np.dot(F2lower,Flower.T) + db_dlen[4]*np.dot(Flower,F2lower.T)
dG_dlen = 15*self.lengthscale**4/(400*np.sqrt(5))*Gint + 3*self.lengthscale**5/(400*np.sqrt(5))*dGint_dlen + dlower_terms_dlen
dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX2.T)
#dK_dper
dFX_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
dFX2_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X2,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X2)
dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period, -self.a[2]*self.basis_omega**3/self.period, -self.a[3]*self.basis_omega**4/self.period))
dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi,self.basis_phi+np.pi*3/2,self.basis_phi))
r1,omega1,phi1 = self._cos_factorization(dLa_dper,Lo,dLp_dper)
IPPprim1 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
IPPprim2 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + self.upper*np.cos(phi-phi1.T))
IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + self.lower*np.cos(phi-phi1.T))
#IPPprim2[0,0] = 2*(self.upper**2 - self.lower**2)*np.cos(phi[0,0])*np.cos(phi1[0,0])
IPPprim = np.where(np.isnan(IPPprim1),IPPprim2,IPPprim1)
IPPint1 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
IPPint2 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./2*self.upper**2*np.cos(phi-phi1.T)
IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./2*self.lower**2*np.cos(phi-phi1.T)
#IPPint2[0,0] = (self.upper**2 - self.lower**2)*np.cos(phi[0,0])*np.cos(phi1[0,0])
IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period, -2*self.a[2]*self.basis_omega**2/self.period, -3*self.a[3]*self.basis_omega**3/self.period))
dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2, self.basis_phi+np.pi, self.basis_phi+np.pi*3/2))
r2,omega2,phi2 = self._cos_factorization(dLa_dper2,Lo[:,0:3],dLp_dper2)
dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) + self._int_computation(r2,omega2,phi2, r,omega,phi)
dGint_dper = dGint_dper + dGint_dper.T
dFlower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
dF1lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower)+self._cos(-self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
dF2lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**3/self.period,self.basis_omega,self.basis_phi+np.pi*3/2)(self.lower) + self._cos(-2*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower))[:,None]
dlower_terms_dper = self.b[0] * (np.dot(dFlower_dper,Flower.T) + np.dot(Flower,dFlower_dper.T))
dlower_terms_dper += self.b[1] * (np.dot(dF2lower_dper,F2lower.T) + np.dot(F2lower,dF2lower_dper.T)) - 4*self.b[1]/self.period*np.dot(F2lower,F2lower.T)
dlower_terms_dper += self.b[2] * (np.dot(dF1lower_dper,F1lower.T) + np.dot(F1lower,dF1lower_dper.T)) - 2*self.b[2]/self.period*np.dot(F1lower,F1lower.T)
dlower_terms_dper += self.b[3] * (np.dot(dF2lower_dper,Flower.T) + np.dot(F2lower,dFlower_dper.T)) - 2*self.b[3]/self.period*np.dot(F2lower,Flower.T)
dlower_terms_dper += self.b[4] * (np.dot(dFlower_dper,F2lower.T) + np.dot(Flower,dF2lower_dper.T)) - 2*self.b[4]/self.period*np.dot(Flower,F2lower.T)
dG_dper = 1./self.variance*(3*self.lengthscale**5/(400*np.sqrt(5))*dGint_dper + 0.5*dlower_terms_dper)
dK_dper = mdot(dFX_dper,self.Gi,FX2.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX2.T) + mdot(FX,self.Gi,dFX2_dper.T)
# np.add(target[:,:,0],dK_dvar, target[:,:,0])
target[0] += np.sum(dK_dvar*dL_dK)
#np.add(target[:,:,1],dK_dlen, target[:,:,1])
target[1] += np.sum(dK_dlen*dL_dK)
#np.add(target[:,:,2],dK_dper, target[:,:,2])
target[2] += np.sum(dK_dper*dL_dK)
@silence_errors
def dKdiag_dtheta(self,dL_dKdiag,X,target):
"""derivative of the diagonal of the covariance matrix with respect to the parameters"""
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)), self.a[1]*self.basis_omega, self.a[2]*self.basis_omega**2, self.a[3]*self.basis_omega**3))
Lo = np.column_stack((self.basis_omega, self.basis_omega, self.basis_omega, self.basis_omega))
Lp = np.column_stack((self.basis_phi, self.basis_phi+np.pi/2, self.basis_phi+np.pi, self.basis_phi+np.pi*3/2))
r,omega,phi = self._cos_factorization(La,Lo,Lp)
Gint = self._int_computation( r,omega,phi, r,omega,phi)
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
F2lower = np.array(self._cos(self.basis_alpha*self.basis_omega**2,self.basis_omega,self.basis_phi+np.pi)(self.lower))[:,None]
#dK_dvar
dK_dvar = 1. / self.variance * mdot(FX, self.Gi, FX.T)
#dK_dlen
da_dlen = [-3*self.a[0]/self.lengthscale, -2*self.a[1]/self.lengthscale, -self.a[2]/self.lengthscale, 0.]
db_dlen = [0., 4*self.b[1]/self.lengthscale, 2*self.b[2]/self.lengthscale, 2*self.b[3]/self.lengthscale, 2*self.b[4]/self.lengthscale]
dLa_dlen = np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)), da_dlen[1]*self.basis_omega, da_dlen[2]*self.basis_omega**2, da_dlen[3]*self.basis_omega**3))
r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
dGint_dlen = dGint_dlen + dGint_dlen.T
dlower_terms_dlen = db_dlen[0]*np.dot(Flower,Flower.T) + db_dlen[1]*np.dot(F2lower,F2lower.T) + db_dlen[2]*np.dot(F1lower,F1lower.T) + db_dlen[3]*np.dot(F2lower,Flower.T) + db_dlen[4]*np.dot(Flower,F2lower.T)
dG_dlen = 15*self.lengthscale**4/(400*np.sqrt(5))*Gint + 3*self.lengthscale**5/(400*np.sqrt(5))*dGint_dlen + dlower_terms_dlen
dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX.T)
#dK_dper
dFX_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period, -self.a[2]*self.basis_omega**3/self.period, -self.a[3]*self.basis_omega**4/self.period))
dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi,self.basis_phi+np.pi*3/2,self.basis_phi))
r1,omega1,phi1 = self._cos_factorization(dLa_dper,Lo,dLp_dper)
IPPprim1 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
IPPprim2 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + self.upper*np.cos(phi-phi1.T))
IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + self.lower*np.cos(phi-phi1.T))
IPPprim = np.where(np.isnan(IPPprim1),IPPprim2,IPPprim1)
IPPint1 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
IPPint2 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + .5*self.upper**2*np.cos(phi-phi1.T)
IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + .5*self.lower**2*np.cos(phi-phi1.T)
IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period, -2*self.a[2]*self.basis_omega**2/self.period, -3*self.a[3]*self.basis_omega**3/self.period))
dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2, self.basis_phi+np.pi, self.basis_phi+np.pi*3/2))
r2,omega2,phi2 = self._cos_factorization(dLa_dper2,Lo[:,0:3],dLp_dper2)
dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) + self._int_computation(r2,omega2,phi2, r,omega,phi)
dGint_dper = dGint_dper + dGint_dper.T
dFlower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
dF1lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower)+self._cos(-self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
dF2lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**3/self.period,self.basis_omega,self.basis_phi+np.pi*3/2)(self.lower) + self._cos(-2*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower))[:,None]
dlower_terms_dper = self.b[0] * (np.dot(dFlower_dper,Flower.T) + np.dot(Flower,dFlower_dper.T))
dlower_terms_dper += self.b[1] * (np.dot(dF2lower_dper,F2lower.T) + np.dot(F2lower,dF2lower_dper.T)) - 4*self.b[1]/self.period*np.dot(F2lower,F2lower.T)
dlower_terms_dper += self.b[2] * (np.dot(dF1lower_dper,F1lower.T) + np.dot(F1lower,dF1lower_dper.T)) - 2*self.b[2]/self.period*np.dot(F1lower,F1lower.T)
dlower_terms_dper += self.b[3] * (np.dot(dF2lower_dper,Flower.T) + np.dot(F2lower,dFlower_dper.T)) - 2*self.b[3]/self.period*np.dot(F2lower,Flower.T)
dlower_terms_dper += self.b[4] * (np.dot(dFlower_dper,F2lower.T) + np.dot(Flower,dF2lower_dper.T)) - 2*self.b[4]/self.period*np.dot(Flower,F2lower.T)
dG_dper = 1./self.variance*(3*self.lengthscale**5/(400*np.sqrt(5))*dGint_dper + 0.5*dlower_terms_dper)
dK_dper = 2*mdot(dFX_dper,self.Gi,FX.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX.T)
target[0] += np.sum(np.diag(dK_dvar)*dL_dKdiag)
target[1] += np.sum(np.diag(dK_dlen)*dL_dKdiag)
target[2] += np.sum(np.diag(dK_dper)*dL_dKdiag)
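# A minimal usage sketch for the part above (the argument names and the
# preallocated-target convention are taken from the definitions in this file):
#
#   import numpy as np
#   k = PeriodicMatern52(input_dim=1, variance=1., lengthscale=1.,
#                        period=2*np.pi, n_freq=10)
#   X = np.linspace(0., 4*np.pi, 20)[:, None]
#   K = np.zeros((20, 20))
#   k.K(X, None, K)        # covariance is accumulated into K
#   Kdiag = np.zeros(20)
#   k.Kdiag(X, Kdiag)      # diagonal is accumulated into Kdiag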

View file

@@ -0,0 +1,237 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
from GPy.util.linalg import mdot
from GPy.util.decorators import silence_errors
class PeriodicExponential(Kernpart):
"""
Kernel of the periodic subspace (up to a given frequency) of an exponential (Matern 1/2) RKHS. Only defined for input_dim=1.
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance of the Matern kernel
:type variance: float
:param lengthscale: the lengthscale of the Matern kernel
:type lengthscale: np.ndarray of size (input_dim,)
:param period: the period
:type period: float
:param n_freq: the number of frequencies considered for the periodic subspace
:type n_freq: int
:rtype: kernel object
"""
def __init__(self, input_dim=1, variance=1., lengthscale=None, period=2 * np.pi, n_freq=10, lower=0., upper=4 * np.pi):
assert input_dim==1, "Periodic kernels are only defined for input_dim=1"
self.name = 'periodic_exp'
self.input_dim = input_dim
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == 1, "Wrong size: only one lengthscale needed"
else:
lengthscale = np.ones(1)
self.lower,self.upper = lower, upper
self.num_params = 3
self.n_freq = n_freq
self.n_basis = 2*n_freq
self._set_params(np.hstack((variance,lengthscale,period)))
def _cos(self,alpha,omega,phase):
def f(x):
return alpha*np.cos(omega*x+phase)
return f
@silence_errors
def _cos_factorization(self,alpha,omega,phase):
r1 = np.sum(alpha*np.cos(phase),axis=1)[:,None]
r2 = np.sum(alpha*np.sin(phase),axis=1)[:,None]
r = np.sqrt(r1**2 + r2**2)
# r1 == 0 means the phase is +-pi/2, fixed by the sign of r2 (np.arcsin(r2) is only valid for |r2| <= 1)
psi = np.where(r1 != 0, np.arctan(r2/r1) + (r1<0.)*np.pi, np.sign(r2)*np.pi/2)
return r,omega[:,0:1], psi
@silence_errors
def _int_computation(self,r1,omega1,phi1,r2,omega2,phi2):
Gint1 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + 1./(omega1-omega2.T)*( np.sin((omega1-omega2.T)*self.upper+phi1-phi2.T) - np.sin((omega1-omega2.T)*self.lower+phi1-phi2.T) )
Gint2 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + np.cos(phi1-phi2.T)*(self.upper-self.lower)
#Gint2[0,0] = 2.*(self.upper-self.lower)*np.cos(phi1[0,0])*np.cos(phi2[0,0])
Gint = np.dot(r1,r2.T)/2 * np.where(np.isnan(Gint1),Gint2,Gint1)
return Gint
def _get_params(self):
"""return the value of the parameters."""
return np.hstack((self.variance,self.lengthscale,self.period))
def _set_params(self,x):
"""set the value of the parameters."""
assert x.size==3
self.variance = x[0]
self.lengthscale = x[1]
self.period = x[2]
self.a = [1./self.lengthscale, 1.]
self.b = [1]
self.basis_alpha = np.ones((self.n_basis,))
self.basis_omega = np.array(sum([[i*2*np.pi/self.period]*2 for i in range(1,self.n_freq+1)],[]))
self.basis_phi = np.array(sum([[-np.pi/2, 0.] for i in range(1,self.n_freq+1)],[]))
self.G = self.Gram_matrix()
self.Gi = np.linalg.inv(self.G)
def _get_param_names(self):
"""return parameter names."""
return ['variance','lengthscale','period']
def Gram_matrix(self):
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega))
Lo = np.column_stack((self.basis_omega,self.basis_omega))
Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2))
r,omega,phi = self._cos_factorization(La,Lo,Lp)
Gint = self._int_computation( r,omega,phi, r,omega,phi)
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
return(self.lengthscale/(2*self.variance) * Gint + 1./self.variance*np.dot(Flower,Flower.T))
def K(self,X,X2,target):
"""Compute the covariance matrix between X and X2."""
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
if X2 is None:
FX2 = FX
else:
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
np.add(mdot(FX,self.Gi,FX2.T), target,target)
def Kdiag(self,X,target):
"""Compute the diagonal of the covariance matrix associated to X."""
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
np.add(target,np.diag(mdot(FX,self.Gi,FX.T)),target)
@silence_errors
def dK_dtheta(self,dL_dK,X,X2,target):
"""derivative of the covariance matrix with respect to the parameters (shape is Nxnum_inducingxNparam)"""
if X2 is None: X2 = X
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega))
Lo = np.column_stack((self.basis_omega,self.basis_omega))
Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2))
r,omega,phi = self._cos_factorization(La,Lo,Lp)
Gint = self._int_computation( r,omega,phi, r,omega,phi)
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
#dK_dvar
dK_dvar = 1./self.variance*mdot(FX,self.Gi,FX2.T)
#dK_dlen
da_dlen = [-1./self.lengthscale**2,0.]
dLa_dlen = np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)),da_dlen[1]*self.basis_omega))
r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
dGint_dlen = dGint_dlen + dGint_dlen.T
dG_dlen = 1./2*Gint + self.lengthscale/2*dGint_dlen
dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX2.T)
#dK_dper
dFX_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
dFX2_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X2,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X2)
dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period))
dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi))
r1,omega1,phi1 = self._cos_factorization(dLa_dper,Lo,dLp_dper)
IPPprim1 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
IPPprim2 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + self.upper*np.cos(phi-phi1.T))
IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + self.lower*np.cos(phi-phi1.T))
#IPPprim2[0,0] = 2*(self.upper**2 - self.lower**2)*np.cos(phi[0,0])*np.cos(phi1[0,0])
IPPprim = np.where(np.isnan(IPPprim1),IPPprim2,IPPprim1)
IPPint1 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
IPPint2 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./2*self.upper**2*np.cos(phi-phi1.T)
IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./2*self.lower**2*np.cos(phi-phi1.T)
#IPPint2[0,0] = (self.upper**2 - self.lower**2)*np.cos(phi[0,0])*np.cos(phi1[0,0])
IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period))
dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2))
r2,omega2,phi2 = dLa_dper2.T,Lo[:,0:1],dLp_dper2.T
dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) + self._int_computation(r2,omega2,phi2, r,omega,phi)
dGint_dper = dGint_dper + dGint_dper.T
dFlower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
dG_dper = 1./self.variance*(self.lengthscale/2*dGint_dper + self.b[0]*(np.dot(dFlower_dper,Flower.T)+np.dot(Flower,dFlower_dper.T)))
dK_dper = mdot(dFX_dper,self.Gi,FX2.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX2.T) + mdot(FX,self.Gi,dFX2_dper.T)
target[0] += np.sum(dK_dvar*dL_dK)
target[1] += np.sum(dK_dlen*dL_dK)
target[2] += np.sum(dK_dper*dL_dK)
@silence_errors
def dKdiag_dtheta(self,dL_dKdiag,X,target):
"""derivative of the diagonal of the covariance matrix with respect to the parameters"""
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega))
Lo = np.column_stack((self.basis_omega,self.basis_omega))
Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2))
r,omega,phi = self._cos_factorization(La,Lo,Lp)
Gint = self._int_computation( r,omega,phi, r,omega,phi)
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
#dK_dvar
dK_dvar = 1./self.variance*mdot(FX,self.Gi,FX.T)
#dK_dlen
da_dlen = [-1./self.lengthscale**2,0.]
dLa_dlen = np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)),da_dlen[1]*self.basis_omega))
r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
dGint_dlen = dGint_dlen + dGint_dlen.T
dG_dlen = 1./2*Gint + self.lengthscale/2*dGint_dlen
dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX.T)
#dK_dper
dFX_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period))
dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi))
r1,omega1,phi1 = self._cos_factorization(dLa_dper,Lo,dLp_dper)
IPPprim1 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
IPPprim2 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + self.upper*np.cos(phi-phi1.T))
IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + self.lower*np.cos(phi-phi1.T))
IPPprim = np.where(np.isnan(IPPprim1),IPPprim2,IPPprim1)
IPPint1 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
IPPint2 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./2*self.upper**2*np.cos(phi-phi1.T)
IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./2*self.lower**2*np.cos(phi-phi1.T)
IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period))
dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2))
r2,omega2,phi2 = dLa_dper2.T,Lo[:,0:1],dLp_dper2.T
dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) + self._int_computation(r2,omega2,phi2, r,omega,phi)
dGint_dper = dGint_dper + dGint_dper.T
dFlower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
dG_dper = 1./self.variance*(self.lengthscale/2*dGint_dper + self.b[0]*(np.dot(dFlower_dper,Flower.T)+np.dot(Flower,dFlower_dper.T)))
dK_dper = 2*mdot(dFX_dper,self.Gi,FX.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX.T)
target[0] += np.sum(np.diag(dK_dvar)*dL_dKdiag)
target[1] += np.sum(np.diag(dK_dlen)*dL_dKdiag)
target[2] += np.sum(np.diag(dK_dper)*dL_dKdiag)
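# A sketch of the gradient interface shared by these parts: dK_dtheta
# accumulates the parameter gradients into a length-num_params target, ordered
# [variance, lengthscale, period] as in _get_param_names above.
#
#   import numpy as np
#   k = PeriodicExponential(input_dim=1, variance=1., lengthscale=1.)
#   X = np.random.randn(10, 1)
#   dL_dK = np.ones((10, 10))
#   grad = np.zeros(k.num_params)
#   k.dK_dtheta(dL_dK, X, None, grad)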

135
GPy/kern/parts/poly.py Normal file
View file

@@ -0,0 +1,135 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
four_over_tau = 2./np.pi
class POLY(Kernpart):
"""
Polynomial kernel. Included for completeness, but generally not recommended:
.. math::
k(x, y) = \sigma^2 (\sigma_w^2 x'y + \sigma_b^2)^d
The kernel parameters are \sigma^2 (variance), \sigma^2_w
(weight_variance), \sigma^2_b (bias_variance) and d (degree). Only
gradients of the first three are provided for kernel optimisation;
it is assumed that the polynomial degree is set by hand.
The kernel is badly behaved whenever \sigma^2_w x'y + \sigma^2_b has a
magnitude greater than one. For completeness an automatic relevance
determination version of this kernel is also provided.
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance :math:`\sigma^2`
:type variance: float
:param weight_variance: the vector of the variances of the prior over input weights in the neural network :math:`\sigma^2_w`
:type weight_variance: array or list of the appropriate size (or float if there is only one weight variance parameter)
:param bias_variance: the variance of the prior over bias parameters :math:`\sigma^2_b`
:param degree: the degree of the polynomial.
:type degree: int
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter \sigma^2_w), otherwise there is one weight variance parameter per dimension.
:type ARD: Boolean
:rtype: Kernpart object
"""
def __init__(self, input_dim, variance=1., weight_variance=None, bias_variance=1., degree=2, ARD=False):
self.input_dim = input_dim
self.ARD = ARD
if not ARD:
self.num_params=3
if weight_variance is not None:
weight_variance = np.asarray(weight_variance)
assert weight_variance.size == 1, "Only one weight variance needed for non-ARD kernel"
else:
weight_variance = 1.*np.ones(1)
else:
self.num_params = self.input_dim + 2
if weight_variance is not None:
weight_variance = np.asarray(weight_variance)
assert weight_variance.size == self.input_dim, "bad number of weight variances"
else:
weight_variance = np.ones(self.input_dim)
raise NotImplementedError
self.degree=degree
self.name='poly_deg' + str(self.degree)
self._set_params(np.hstack((variance, weight_variance.flatten(), bias_variance)))
def _get_params(self):
return np.hstack((self.variance, self.weight_variance.flatten(), self.bias_variance))
def _set_params(self, x):
assert x.size == (self.num_params)
self.variance = x[0]
self.weight_variance = x[1:-1]
self.weight_std = np.sqrt(self.weight_variance)
self.bias_variance = x[-1]
def _get_param_names(self):
if self.num_params == 3:
return ['variance', 'weight_variance', 'bias_variance']
else:
return ['variance'] + ['weight_variance_%i' % i for i in range(self.weight_variance.size)] + ['bias_variance']
def K(self, X, X2, target):
"""Return covariance between X and X2."""
self._K_computations(X, X2)
target += self.variance*self._K_dvar
def Kdiag(self, X, target):
"""Compute the diagonal of the covariance matrix for X."""
self._K_diag_computations(X)
target+= self.variance*self._K_diag_dvar
def dK_dtheta(self, dL_dK, X, X2, target):
"""Derivative of the covariance with respect to the parameters."""
self._K_computations(X, X2)
base = self.variance*self.degree*self._K_poly_arg**(self.degree-1)
base_cov_grad = base*dL_dK
target[0] += np.sum(self._K_dvar*dL_dK)
target[1] += (self._K_inner_prod*base_cov_grad).sum()
target[2] += base_cov_grad.sum()
def dK_dX(self, dL_dK, X, X2, target):
"""Derivative of the covariance matrix with respect to X"""
self._K_computations(X, X2)
arg = self._K_poly_arg
target += self.weight_variance*self.degree*self.variance*(X2[None, :, :]*(arg**(self.degree-1))[:, :, None]*dL_dK[:, :, None]).sum(1)
def dKdiag_dX(self, dL_dKdiag, X, target):
"""Gradient of diagonal of covariance with respect to X"""
self._K_diag_computations(X)
arg = self._K_diag_poly_arg
target += 2.*self.weight_variance*self.degree*self.variance*X*dL_dKdiag[:, None]*(arg**(self.degree-1))[:, None]
def _K_computations(self, X, X2):
if self.ARD:
pass
else:
if X2 is None:
self._K_inner_prod = np.dot(X,X.T)
else:
self._K_inner_prod = np.dot(X,X2.T)
self._K_poly_arg = self._K_inner_prod*self.weight_variance + self.bias_variance
self._K_dvar = self._K_poly_arg**self.degree
def _K_diag_computations(self, X):
if self.ARD:
pass
else:
self._K_diag_poly_arg = (X*X).sum(1)*self.weight_variance + self.bias_variance
self._K_diag_dvar = self._K_diag_poly_arg**self.degree
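# A worked check of the covariance above (sketch): with variance=1,
# weight_variance=2, bias_variance=1, degree=2 and scalar inputs x=1, y=3,
# k(x, y) = 1*(2*1*3 + 1)**2 = 49.
#
#   import numpy as np
#   k = POLY(input_dim=1, variance=1., weight_variance=2., bias_variance=1., degree=2)
#   K = np.zeros((1, 1))
#   k.K(np.array([[1.]]), np.array([[3.]]), K)   # K[0, 0] == 49.0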

121
GPy/kern/parts/prod.py Normal file
View file

@@ -0,0 +1,121 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
import hashlib
class Prod(Kernpart):
"""
Computes the product of 2 kernels
:param k1, k2: the kernels to multiply
:type k1, k2: Kernpart
:param tensor: the kernels are either multiplied as functions defined on the same input space (default) or on the product of the input spaces
:type tensor: Boolean
:rtype: kernel object
"""
def __init__(self,k1,k2,tensor=False):
self.num_params = k1.num_params + k2.num_params
self.name = k1.name + '<times>' + k2.name
self.k1 = k1
self.k2 = k2
if tensor:
self.input_dim = k1.input_dim + k2.input_dim
self.slice1 = slice(0,self.k1.input_dim)
self.slice2 = slice(self.k1.input_dim,self.k1.input_dim+self.k2.input_dim)
else:
assert k1.input_dim == k2.input_dim, "Error: The input spaces of the kernels to multiply don't have the same dimension."
self.input_dim = k1.input_dim
self.slice1 = slice(0,self.input_dim)
self.slice2 = slice(0,self.input_dim)
self._X, self._X2, self._params = np.empty(shape=(3,1))
self._set_params(np.hstack((k1._get_params(),k2._get_params())))
def _get_params(self):
"""return the value of the parameters."""
return np.hstack((self.k1._get_params(), self.k2._get_params()))
def _set_params(self,x):
"""set the value of the parameters."""
self.k1._set_params(x[:self.k1.num_params])
self.k2._set_params(x[self.k1.num_params:])
def _get_param_names(self):
"""return parameter names."""
return [self.k1.name + '_' + param_name for param_name in self.k1._get_param_names()] + [self.k2.name + '_' + param_name for param_name in self.k2._get_param_names()]
def K(self,X,X2,target):
self._K_computations(X,X2)
target += self._K1 * self._K2
def K1(self,X, X2):
"""Compute the part of the kernel associated with k1."""
self._K_computations(X, X2)
return self._K1
def K2(self, X, X2):
"""Compute the part of the kernel associated with k2."""
self._K_computations(X, X2)
return self._K2
def dK_dtheta(self,dL_dK,X,X2,target):
"""Derivative of the covariance matrix with respect to the parameters."""
self._K_computations(X,X2)
if X2 is None:
self.k1.dK_dtheta(dL_dK*self._K2, X[:,self.slice1], None, target[:self.k1.num_params])
self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.slice2], None, target[self.k1.num_params:])
else:
self.k1.dK_dtheta(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:self.k1.num_params])
self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[self.k1.num_params:])
def Kdiag(self,X,target):
"""Compute the diagonal of the covariance matrix associated to X."""
target1 = np.zeros(X.shape[0])
target2 = np.zeros(X.shape[0])
self.k1.Kdiag(X[:,self.slice1],target1)
self.k2.Kdiag(X[:,self.slice2],target2)
target += target1 * target2
def dKdiag_dtheta(self,dL_dKdiag,X,target):
K1 = np.zeros(X.shape[0])
K2 = np.zeros(X.shape[0])
self.k1.Kdiag(X[:,self.slice1],K1)
self.k2.Kdiag(X[:,self.slice2],K2)
self.k1.dKdiag_dtheta(dL_dKdiag*K2,X[:,self.slice1],target[:self.k1.num_params])
self.k2.dKdiag_dtheta(dL_dKdiag*K1,X[:,self.slice2],target[self.k1.num_params:])
def dK_dX(self,dL_dK,X,X2,target):
"""derivative of the covariance matrix with respect to X."""
self._K_computations(X,X2)
self.k1.dK_dX(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:,self.slice1])
self.k2.dK_dX(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[:,self.slice2])
def dKdiag_dX(self, dL_dKdiag, X, target):
K1 = np.zeros(X.shape[0])
K2 = np.zeros(X.shape[0])
self.k1.Kdiag(X[:,self.slice1],K1)
self.k2.Kdiag(X[:,self.slice2],K2)
self.k1.dKdiag_dX(dL_dKdiag*K2, X[:,self.slice1], target[:,self.slice1])
self.k2.dKdiag_dX(dL_dKdiag*K1, X[:,self.slice2], target[:,self.slice2])
def _K_computations(self,X,X2):
if not (np.array_equal(X,self._X) and np.array_equal(X2,self._X2) and np.array_equal(self._params , self._get_params())):
self._X = X.copy()
self._params = self._get_params().copy()
if X2 is None:
self._X2 = None
self._K1 = np.zeros((X.shape[0],X.shape[0]))
self._K2 = np.zeros((X.shape[0],X.shape[0]))
self.k1.K(X[:,self.slice1],None,self._K1)
self.k2.K(X[:,self.slice2],None,self._K2)
else:
self._X2 = X2.copy()
self._K1 = np.zeros((X.shape[0],X2.shape[0]))
self._K2 = np.zeros((X.shape[0],X2.shape[0]))
self.k1.K(X[:,self.slice1],X2[:,self.slice1],self._K1)
self.k2.K(X[:,self.slice2],X2[:,self.slice2],self._K2)
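# Sketch: with tensor=False (the default) both parts see the same columns;
# with tensor=True the input is split into the first k1.input_dim and the
# remaining k2.input_dim columns, e.g. (assuming the parts are imported
# alongside each other):
#
#   import numpy as np
#   k = Prod(RBF(1), RationalQuadratic(1), tensor=True)   # expects 2-column X
#   X = np.random.randn(5, 2)
#   K = np.zeros((5, 5))
#   k.K(X, None, K)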

View file

@@ -0,0 +1,101 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
import hashlib
#from scipy import integrate # This may not be necessary (Nicolas, 20th Feb)
class prod_orthogonal(Kernpart):
"""
Computes the product of 2 kernels defined on disjoint column blocks of the input (the product of the two input spaces)
:param k1, k2: the kernels to multiply
:type k1, k2: Kernpart
:rtype: kernel object
"""
def __init__(self,k1,k2):
self.input_dim = k1.input_dim + k2.input_dim
self.num_params = k1.num_params + k2.num_params
self.name = k1.name + '<times>' + k2.name
self.k1 = k1
self.k2 = k2
self._X, self._X2, self._params = np.empty(shape=(3,1))
self._set_params(np.hstack((k1._get_params(),k2._get_params())))
def _get_params(self):
"""return the value of the parameters."""
return np.hstack((self.k1._get_params(), self.k2._get_params()))
def _set_params(self,x):
"""set the value of the parameters."""
self.k1._set_params(x[:self.k1.num_params])
self.k2._set_params(x[self.k1.num_params:])
def _get_param_names(self):
"""return parameter names."""
return [self.k1.name + '_' + param_name for param_name in self.k1._get_param_names()] + [self.k2.name + '_' + param_name for param_name in self.k2._get_param_names()]
def K(self,X,X2,target):
self._K_computations(X,X2)
target += self._K1 * self._K2
def dK_dtheta(self,dL_dK,X,X2,target):
"""derivative of the covariance matrix with respect to the parameters."""
self._K_computations(X,X2)
if X2 is None:
self.k1.dK_dtheta(dL_dK*self._K2, X[:,:self.k1.input_dim], None, target[:self.k1.num_params])
self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.k1.input_dim:], None, target[self.k1.num_params:])
else:
self.k1.dK_dtheta(dL_dK*self._K2, X[:,:self.k1.input_dim], X2[:,:self.k1.input_dim], target[:self.k1.num_params])
self.k2.dK_dtheta(dL_dK*self._K1, X[:,self.k1.input_dim:], X2[:,self.k1.input_dim:], target[self.k1.num_params:])
def Kdiag(self,X,target):
"""Compute the diagonal of the covariance matrix associated to X."""
target1 = np.zeros(X.shape[0])
target2 = np.zeros(X.shape[0])
self.k1.Kdiag(X[:,:self.k1.input_dim],target1)
self.k2.Kdiag(X[:,self.k1.input_dim:],target2)
target += target1 * target2
def dKdiag_dtheta(self,dL_dKdiag,X,target):
K1 = np.zeros(X.shape[0])
K2 = np.zeros(X.shape[0])
self.k1.Kdiag(X[:,:self.k1.input_dim],K1)
self.k2.Kdiag(X[:,self.k1.input_dim:],K2)
self.k1.dKdiag_dtheta(dL_dKdiag*K2,X[:,:self.k1.input_dim],target[:self.k1.num_params])
self.k2.dKdiag_dtheta(dL_dKdiag*K1,X[:,self.k1.input_dim:],target[self.k1.num_params:])
def dK_dX(self,dL_dK,X,X2,target):
"""derivative of the covariance matrix with respect to X."""
self._K_computations(X,X2)
self.k1.dK_dX(dL_dK*self._K2, X[:,:self.k1.input_dim], X2[:,:self.k1.input_dim], target[:,:self.k1.input_dim])
self.k2.dK_dX(dL_dK*self._K1, X[:,self.k1.input_dim:], X2[:,self.k1.input_dim:], target[:,self.k1.input_dim:])
def dKdiag_dX(self, dL_dKdiag, X, target):
K1 = np.zeros(X.shape[0])
K2 = np.zeros(X.shape[0])
self.k1.Kdiag(X[:,0:self.k1.input_dim],K1)
self.k2.Kdiag(X[:,self.k1.input_dim:],K2)
self.k1.dKdiag_dX(dL_dKdiag*K2, X[:,:self.k1.input_dim], target[:,:self.k1.input_dim])
self.k2.dKdiag_dX(dL_dKdiag*K1, X[:,self.k1.input_dim:], target[:,self.k1.input_dim:])
def _K_computations(self,X,X2):
if not (np.array_equal(X,self._X) and np.array_equal(X2,self._X2) and np.array_equal(self._params , self._get_params())):
self._X = X.copy()
self._params = self._get_params().copy()
if X2 is None:
self._X2 = None
self._K1 = np.zeros((X.shape[0],X.shape[0]))
self._K2 = np.zeros((X.shape[0],X.shape[0]))
self.k1.K(X[:,:self.k1.input_dim],None,self._K1)
self.k2.K(X[:,self.k1.input_dim:],None,self._K2)
else:
self._X2 = X2.copy()
self._K1 = np.zeros((X.shape[0],X2.shape[0]))
self._K2 = np.zeros((X.shape[0],X2.shape[0]))
self.k1.K(X[:,:self.k1.input_dim],X2[:,:self.k1.input_dim],self._K1)
self.k2.K(X[:,self.k1.input_dim:],X2[:,self.k1.input_dim:],self._K2)

View file

@@ -0,0 +1,80 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
class RationalQuadratic(Kernpart):
"""
rational quadratic kernel
.. math::
k(r) = \sigma^2 \\bigg( 1 + \\frac{r^2}{2 \ell^2} \\bigg)^{- \\alpha} \ \ \ \ \ \\text{ where } r^2 = (x-y)^2
:param input_dim: the number of input dimensions
:type input_dim: int (input_dim=1 is the only value currently supported)
:param variance: the variance :math:`\sigma^2`
:type variance: float
:param lengthscale: the lengthscale :math:`\ell`
:type lengthscale: float
:param power: the power :math:`\\alpha`
:type power: float
:rtype: Kernpart object
"""
def __init__(self,input_dim,variance=1.,lengthscale=1.,power=1.):
assert input_dim == 1, "For this kernel we assume input_dim=1"
self.input_dim = input_dim
self.num_params = 3
self.name = 'rat_quad'
self.variance = variance
self.lengthscale = lengthscale
self.power = power
def _get_params(self):
return np.hstack((self.variance,self.lengthscale,self.power))
def _set_params(self,x):
self.variance = x[0]
self.lengthscale = x[1]
self.power = x[2]
def _get_param_names(self):
return ['variance','lengthscale','power']
def K(self,X,X2,target):
if X2 is None: X2 = X
dist2 = np.square((X-X2.T)/self.lengthscale)
target += self.variance*(1 + dist2/2.)**(-self.power)
def Kdiag(self,X,target):
target += self.variance
def dK_dtheta(self,dL_dK,X,X2,target):
if X2 is None: X2 = X
dist2 = np.square((X-X2.T)/self.lengthscale)
dvar = (1 + dist2/2.)**(-self.power)
# d/dl of variance*(1 + dist2/2.)**(-power), with dist2 = ((x-y)/l)**2
dl = self.power * self.variance * dist2 / self.lengthscale * (1 + dist2/2.)**(-self.power-1)
dp = - self.variance * np.log(1 + dist2/2.) * (1 + dist2/2.)**(-self.power)
target[0] += np.sum(dvar*dL_dK)
target[1] += np.sum(dl*dL_dK)
target[2] += np.sum(dp*dL_dK)
def dKdiag_dtheta(self,dL_dKdiag,X,target):
target[0] += np.sum(dL_dKdiag)
# here self.lengthscale and self.power have no influence on Kdiag so target[1:] are unchanged
def dK_dX(self,dL_dK,X,X2,target):
"""derivative of the covariance matrix with respect to X."""
if X2 is None: X2 = X
dist2 = np.square((X-X2.T)/self.lengthscale)
dX = -self.variance*self.power * (X-X2.T)/self.lengthscale**2 * (1 + dist2/2.)**(-self.power-1)
target += np.sum(dL_dK*dX,1)[:,np.newaxis]
def dKdiag_dX(self,dL_dKdiag,X,target):
pass
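# Worked example of K above (sketch): for x=0, y=1 with variance=1,
# lengthscale=1 and power=1, dist2 = 1 and k = (1 + 1/2.)**(-1) = 2/3.
#
#   import numpy as np
#   k = RationalQuadratic(1, variance=1., lengthscale=1., power=1.)
#   K = np.zeros((1, 1))
#   k.K(np.array([[0.]]), np.array([[1.]]), K)   # K[0, 0] == 0.666...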

330
GPy/kern/parts/rbf.py Normal file
View file

@@ -0,0 +1,330 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
from scipy import weave
from ...util.linalg import tdot
from ...util.misc import fast_array_equal
class RBF(Kernpart):
"""
Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel:
.. math::
k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \ \ \ \ \ \\text{ where } r^2 = \sum_{i=1}^d \\frac{ (x_i-x^\prime_i)^2}{\ell_i^2}
where \ell_i is the lengthscale, \sigma^2 the variance and d the dimensionality of the input.
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
:param lengthscale: the vector of lengthscale of the kernel
:type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
:type ARD: Boolean
:rtype: kernel object
.. Note: this object implements both the ARD and 'spherical' version of the function
"""
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False):
self.input_dim = input_dim
self.name = 'rbf'
self.ARD = ARD
if not ARD:
self.num_params = 2
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
else:
lengthscale = np.ones(1)
else:
self.num_params = self.input_dim + 1
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == self.input_dim, "bad number of lengthscales"
else:
lengthscale = np.ones(self.input_dim)
self._set_params(np.hstack((variance, lengthscale.flatten())))
# initialize cache
self._Z, self._mu, self._S = np.empty(shape=(3, 1))
self._X, self._X2, self._params = np.empty(shape=(3, 1))
# a set of optional args to pass to weave
self.weave_options = {'headers' : ['<omp.h>'],
'extra_compile_args': ['-fopenmp -O3'], # -march=native
'extra_link_args' : ['-lgomp']}
def _get_params(self):
return np.hstack((self.variance, self.lengthscale))
def _set_params(self, x):
assert x.size == (self.num_params)
self.variance = x[0]
self.lengthscale = x[1:]
self.lengthscale2 = np.square(self.lengthscale)
# reset cached results
self._X, self._X2, self._params = np.empty(shape=(3, 1))
self._Z, self._mu, self._S = np.empty(shape=(3, 1)) # cached versions of Z,mu,S
def _get_param_names(self):
if self.num_params == 2:
return ['variance', 'lengthscale']
else:
return ['variance'] + ['lengthscale_%i' % i for i in range(self.lengthscale.size)]
def K(self, X, X2, target):
self._K_computations(X, X2)
target += self.variance * self._K_dvar
def Kdiag(self, X, target):
np.add(target, self.variance, target)
def dK_dtheta(self, dL_dK, X, X2, target):
self._K_computations(X, X2)
target[0] += np.sum(self._K_dvar * dL_dK)
if self.ARD:
dvardLdK = self._K_dvar * dL_dK
var_len3 = self.variance / np.power(self.lengthscale, 3)
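# The weave loops below accumulate, for each dimension q,
# dL/dl_q = (variance/l_q**3) * sum_ij (X_iq - X2_jq)**2 * exp(-r2_ij/2) * dL_dK_ij,
# i.e. the lengthscale gradient of the RBF; the symmetric (X2 is None) case
# visits only j < i after symmetrising dvardLdK.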
if X2 is None:
# save computation for the symmetrical case
dvardLdK = dvardLdK + dvardLdK.T
code = """
int q,i,j;
double tmp;
for(q=0; q<input_dim; q++){
tmp = 0;
for(i=0; i<num_data; i++){
for(j=0; j<i; j++){
tmp += (X(i,q)-X(j,q))*(X(i,q)-X(j,q))*dvardLdK(i,j);
}
}
target(q+1) += var_len3(q)*tmp;
}
"""
num_data, num_inducing, input_dim = X.shape[0], X.shape[0], self.input_dim
weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'X2', 'target', 'dvardLdK', 'var_len3'], type_converters=weave.converters.blitz, **self.weave_options)
else:
code = """
int q,i,j;
double tmp;
for(q=0; q<input_dim; q++){
tmp = 0;
for(i=0; i<num_data; i++){
for(j=0; j<num_inducing; j++){
tmp += (X(i,q)-X2(j,q))*(X(i,q)-X2(j,q))*dvardLdK(i,j);
}
}
target(q+1) += var_len3(q)*tmp;
}
"""
num_data, num_inducing, input_dim = X.shape[0], X2.shape[0], self.input_dim
# [np.add(target[1+q:2+q],var_len3[q]*np.sum(dvardLdK*np.square(X[:,q][:,None]-X2[:,q][None,:])),target[1+q:2+q]) for q in range(self.input_dim)]
weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'X2', 'target', 'dvardLdK', 'var_len3'], type_converters=weave.converters.blitz, **self.weave_options)
else:
target[1] += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dK)
def dKdiag_dtheta(self, dL_dKdiag, X, target):
# NB: derivative of diagonal elements wrt lengthscale is 0
target[0] += np.sum(dL_dKdiag)
def dK_dX(self, dL_dK, X, X2, target):
self._K_computations(X, X2)
_K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
dK_dX = (-self.variance / self.lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2))
target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
def dKdiag_dX(self, dL_dKdiag, X, target):
pass
#---------------------------------------#
# PSI statistics #
#---------------------------------------#
def psi0(self, Z, mu, S, target):
target += self.variance
def dpsi0_dtheta(self, dL_dpsi0, Z, mu, S, target):
target[0] += np.sum(dL_dpsi0)
def dpsi0_dmuS(self, dL_dpsi0, Z, mu, S, target_mu, target_S):
pass
def psi1(self, Z, mu, S, target):
self._psi_computations(Z, mu, S)
target += self._psi1
def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target):
self._psi_computations(Z, mu, S)
target[0] += np.sum(dL_dpsi1 * self._psi1 / self.variance)
d_length = self._psi1[:,:,None] * ((self._psi1_dist_sq - 1.)/(self.lengthscale*self._psi1_denom) +1./self.lengthscale)
dpsi1_dlength = d_length * dL_dpsi1[:, :, None]
if not self.ARD:
target[1] += dpsi1_dlength.sum()
else:
target[1:] += dpsi1_dlength.sum(0).sum(0)
def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target):
self._psi_computations(Z, mu, S)
denominator = (self.lengthscale2 * (self._psi1_denom))
dpsi1_dZ = -self._psi1[:, :, None] * ((self._psi1_dist / denominator))
target += np.sum(dL_dpsi1[:, :, None] * dpsi1_dZ, 0)
def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S):
self._psi_computations(Z, mu, S)
tmp = self._psi1[:, :, None] / self.lengthscale2 / self._psi1_denom
target_mu += np.sum(dL_dpsi1[:, :, None] * tmp * self._psi1_dist, 1)
target_S += np.sum(dL_dpsi1[:, :, None] * 0.5 * tmp * (self._psi1_dist_sq - 1), 1)
def psi2(self, Z, mu, S, target):
self._psi_computations(Z, mu, S)
target += self._psi2
def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target):
"""Shape N,num_inducing,num_inducing,Ntheta"""
self._psi_computations(Z, mu, S)
d_var = 2.*self._psi2 / self.variance
d_length = 2.*self._psi2[:, :, :, None] * (self._psi2_Zdist_sq * self._psi2_denom + self._psi2_mudist_sq + S[:, None, None, :] / self.lengthscale2) / (self.lengthscale * self._psi2_denom)
target[0] += np.sum(dL_dpsi2 * d_var)
dpsi2_dlength = d_length * dL_dpsi2[:, :, :, None]
if not self.ARD:
target[1] += dpsi2_dlength.sum()
else:
target[1:] += dpsi2_dlength.sum(0).sum(0).sum(0)
def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target):
self._psi_computations(Z, mu, S)
term1 = self._psi2_Zdist / self.lengthscale2 # num_inducing, num_inducing, input_dim
term2 = self._psi2_mudist / self._psi2_denom / self.lengthscale2 # N, num_inducing, num_inducing, input_dim
dZ = self._psi2[:, :, :, None] * (term1[None] + term2)
target += (dL_dpsi2[:, :, :, None] * dZ).sum(0).sum(0)
def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
"""Think N,num_inducing,num_inducing,input_dim """
self._psi_computations(Z, mu, S)
tmp = self._psi2[:, :, :, None] / self.lengthscale2 / self._psi2_denom
target_mu += -2.*(dL_dpsi2[:, :, :, None] * tmp * self._psi2_mudist).sum(1).sum(1)
target_S += (dL_dpsi2[:, :, :, None] * tmp * (2.*self._psi2_mudist_sq - 1)).sum(1).sum(1)
#---------------------------------------#
# Precomputations #
#---------------------------------------#
def _K_computations(self, X, X2):
params = self._get_params()
if not (fast_array_equal(X, self._X) and fast_array_equal(X2, self._X2) and fast_array_equal(self._params , params)):
self._X = X.copy()
self._params = params.copy()
if X2 is None:
self._X2 = None
X = X / self.lengthscale
Xsquare = np.sum(np.square(X), 1)
self._K_dist2 = -2.*tdot(X) + (Xsquare[:, None] + Xsquare[None, :])
else:
self._X2 = X2.copy()
X = X / self.lengthscale
X2 = X2 / self.lengthscale
self._K_dist2 = -2.*np.dot(X, X2.T) + (np.sum(np.square(X), 1)[:, None] + np.sum(np.square(X2), 1)[None, :])
self._K_dvar = np.exp(-0.5 * self._K_dist2)
def _psi_computations(self, Z, mu, S):
# here are the "statistics" for psi1 and psi2
Z_changed = not fast_array_equal(Z, self._Z)
if Z_changed:
# Z has changed, compute Z specific stuff
self._psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
self._psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
self._psi2_Zdist_sq = np.square(self._psi2_Zdist / self.lengthscale) # M,M,Q
if Z_changed or not fast_array_equal(mu, self._mu) or not fast_array_equal(S, self._S):
# something's changed. recompute EVERYTHING
# psi1
self._psi1_denom = S[:, None, :] / self.lengthscale2 + 1.
self._psi1_dist = Z[None, :, :] - mu[:, None, :]
self._psi1_dist_sq = np.square(self._psi1_dist) / self.lengthscale2 / self._psi1_denom
self._psi1_exponent = -0.5 * np.sum(self._psi1_dist_sq + np.log(self._psi1_denom), -1)
self._psi1 = self.variance * np.exp(self._psi1_exponent)
# psi2
self._psi2_denom = 2.*S[:, None, None, :] / self.lengthscale2 + 1. # N,M,M,Q
self._psi2_mudist, self._psi2_mudist_sq, self._psi2_exponent, _ = self.weave_psi2(mu, self._psi2_Zhat)
# self._psi2_mudist = mu[:,None,None,:]-self._psi2_Zhat #N,M,M,Q
# self._psi2_mudist_sq = np.square(self._psi2_mudist)/(self.lengthscale2*self._psi2_denom)
# self._psi2_exponent = np.sum(-self._psi2_Zdist_sq -self._psi2_mudist_sq -0.5*np.log(self._psi2_denom),-1) #N,M,M,Q
self._psi2 = np.square(self.variance) * np.exp(self._psi2_exponent) # N,M,M,Q
# store matrices for caching
self._Z, self._mu, self._S = Z, mu, S
def weave_psi2(self, mu, Zhat):
N, input_dim = mu.shape
num_inducing = Zhat.shape[0]
mudist = np.empty((N, num_inducing, num_inducing, input_dim))
mudist_sq = np.empty((N, num_inducing, num_inducing, input_dim))
psi2_exponent = np.zeros((N, num_inducing, num_inducing))
psi2 = np.empty((N, num_inducing, num_inducing))
psi2_Zdist_sq = self._psi2_Zdist_sq
_psi2_denom = self._psi2_denom.squeeze().reshape(N, self.input_dim)
half_log_psi2_denom = 0.5 * np.log(self._psi2_denom).squeeze().reshape(N, self.input_dim)
variance_sq = float(np.square(self.variance))
if self.ARD:
lengthscale2 = self.lengthscale2
else:
lengthscale2 = np.ones(input_dim) * self.lengthscale2
code = """
double tmp;
#pragma omp parallel for private(tmp)
for (int n=0; n<N; n++){
for (int m=0; m<num_inducing; m++){
for (int mm=0; mm<(m+1); mm++){
for (int q=0; q<input_dim; q++){
//compute mudist
tmp = mu(n,q) - Zhat(m,mm,q);
mudist(n,m,mm,q) = tmp;
mudist(n,mm,m,q) = tmp;
//now mudist_sq
tmp = tmp*tmp/lengthscale2(q)/_psi2_denom(n,q);
mudist_sq(n,m,mm,q) = tmp;
mudist_sq(n,mm,m,q) = tmp;
//now psi2_exponent
tmp = -psi2_Zdist_sq(m,mm,q) - tmp - half_log_psi2_denom(n,q);
psi2_exponent(n,mm,m) += tmp;
if (m !=mm){
psi2_exponent(n,m,mm) += tmp;
}
//psi2 would be computed like this, but np is faster
//tmp = variance_sq*exp(psi2_exponent(n,m,mm));
//psi2(n,m,mm) = tmp;
//psi2(n,mm,m) = tmp;
}
}
}
}
"""
support_code = """
#include <omp.h>
#include <math.h>
"""
weave.inline(code, support_code=support_code, libraries=['gomp'],
arg_names=['N', 'num_inducing', 'input_dim', 'mu', 'Zhat', 'mudist_sq', 'mudist', 'lengthscale2', '_psi2_denom', 'psi2_Zdist_sq', 'psi2_exponent', 'half_log_psi2_denom', 'psi2', 'variance_sq'],
type_converters=weave.converters.blitz, **self.weave_options)
return mudist, mudist_sq, psi2_exponent, psi2
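The cached psi statistics above have simple closed forms. As a reference point for the weave version, here is a minimal numpy sketch of the psi1 computation, using made-up shapes and unit parameters (mirroring _psi_computations):
import numpy as np

N, M, Q = 5, 3, 2                      # data points, inducing points, input dims (illustrative)
variance, lengthscale2 = 1.0, np.ones(Q)
Z = np.random.randn(M, Q)              # inducing inputs
mu = np.random.randn(N, Q)             # variational means
S = np.random.rand(N, Q)               # variational variances

denom = S[:, None, :] / lengthscale2 + 1.                                # N,M,Q
dist_sq = np.square(Z[None, :, :] - mu[:, None, :]) / lengthscale2 / denom
psi1 = variance * np.exp(-0.5 * np.sum(dist_sq + np.log(denom), -1))     # N,M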

322
GPy/kern/parts/rbf_inv.py Normal file
View file

@ -0,0 +1,322 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from rbf import RBF
import numpy as np
import hashlib
from scipy import weave
from ...util.linalg import tdot
class RBFInv(RBF):
"""
Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel. It only
differs from RBF in that here the parametrization is wrt the inverse lengthscale:
.. math::
k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \ \ \ \ \ \\text{ where } r^2 = \sum_{i=1}^d \\frac{ (x_i-x^\prime_i)^2}{\ell_i^2}
where \ell_i is the lengthscale, \sigma^2 the variance and d the dimensionality of the input.
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
:param lengthscale: the vector of lengthscale of the kernel
:type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (i.e. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
:type ARD: Boolean
:rtype: kernel object
.. Note: this object implements both the ARD and 'spherical' version of the function
"""
def __init__(self, input_dim, variance=1., inv_lengthscale=None, ARD=False):
self.input_dim = input_dim
self.name = 'rbf_inv'
self.ARD = ARD
if not ARD:
self.num_params = 2
if inv_lengthscale is not None:
inv_lengthscale = np.asarray(inv_lengthscale)
assert inv_lengthscale.size == 1, "Only one inverse lengthscale needed for a non-ARD kernel"
else:
inv_lengthscale = np.ones(1)
else:
self.num_params = self.input_dim + 1
if inv_lengthscale is not None:
inv_lengthscale = np.asarray(inv_lengthscale)
assert inv_lengthscale.size == self.input_dim, "bad number of inverse lengthscales"
else:
inv_lengthscale = np.ones(self.input_dim)
self._set_params(np.hstack((variance, inv_lengthscale.flatten())))
# initialize cache
self._Z, self._mu, self._S = np.empty(shape=(3, 1))
self._X, self._X2, self._params = np.empty(shape=(3, 1))
# a set of optional args to pass to weave
self.weave_options = {'headers' : ['<omp.h>'],
'extra_compile_args': ['-fopenmp -O3'], # -march=native'],
'extra_link_args' : ['-lgomp']}
def _get_params(self):
return np.hstack((self.variance, self.inv_lengthscale))
def _set_params(self, x):
assert x.size == (self.num_params)
self.variance = x[0]
self.inv_lengthscale = x[1:]
self.inv_lengthscale2 = np.square(self.inv_lengthscale)
# TODO: We can rewrite everything with inv_lengthscale and never need to do the below
self.lengthscale = 1. / self.inv_lengthscale
self.lengthscale2 = np.square(self.lengthscale)
# reset cached results
self._X, self._X2, self._params = np.empty(shape=(3, 1))
self._Z, self._mu, self._S = np.empty(shape=(3, 1)) # cached versions of Z,mu,S
def _get_param_names(self):
if self.num_params == 2:
return ['variance', 'inv_lengthscale']
else:
return ['variance'] + ['inv_lengthscale%i' % i for i in range(self.inv_lengthscale.size)]
# TODO: Rewrite computations so that lengthscale is not needed (but only inv. lengthscale)
def dK_dtheta(self, dL_dK, X, X2, target):
self._K_computations(X, X2)
target[0] += np.sum(self._K_dvar * dL_dK)
if self.ARD:
dvardLdK = self._K_dvar * dL_dK
var_len3 = self.variance / np.power(self.lengthscale, 3)
len2 = self.lengthscale2
if X2 is None:
# save computation for the symmetrical case
dvardLdK = dvardLdK + dvardLdK.T
code = """
int q,i,j;
double tmp;
for(q=0; q<input_dim; q++){
tmp = 0;
for(i=0; i<num_data; i++){
for(j=0; j<i; j++){
tmp += (X(i,q)-X(j,q))*(X(i,q)-X(j,q))*dvardLdK(i,j);
}
}
target(q+1) += var_len3(q)*tmp*(-len2(q));
}
"""
num_data, num_inducing, input_dim = X.shape[0], X.shape[0], self.input_dim
weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'X2', 'target', 'dvardLdK', 'var_len3', 'len2'], type_converters=weave.converters.blitz, **self.weave_options)
else:
code = """
int q,i,j;
double tmp;
for(q=0; q<input_dim; q++){
tmp = 0;
for(i=0; i<num_data; i++){
for(j=0; j<num_inducing; j++){
tmp += (X(i,q)-X2(j,q))*(X(i,q)-X2(j,q))*dvardLdK(i,j);
}
}
target(q+1) += var_len3(q)*tmp*(-len2(q));
}
"""
num_data, num_inducing, input_dim = X.shape[0], X2.shape[0], self.input_dim
# [np.add(target[1+q:2+q],var_len3[q]*np.sum(dvardLdK*np.square(X[:,q][:,None]-X2[:,q][None,:])),target[1+q:2+q]) for q in range(self.input_dim)]
weave.inline(code, arg_names=['num_data', 'num_inducing', 'input_dim', 'X', 'X2', 'target', 'dvardLdK', 'var_len3', 'len2'], type_converters=weave.converters.blitz, **self.weave_options)
else:
target[1] += (self.variance / self.lengthscale) * np.sum(self._K_dvar * self._K_dist2 * dL_dK) * (-self.lengthscale2)
def dK_dX(self, dL_dK, X, X2, target):
self._K_computations(X, X2)
_K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
dK_dX = (-self.variance * self.inv_lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2))
target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
def dKdiag_dX(self, dL_dKdiag, X, target):
pass
#---------------------------------------#
# PSI statistics #
#---------------------------------------#
# def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target):
# self._psi_computations(Z, mu, S)
# denom_deriv = S[:, None, :] / (self.lengthscale ** 3 + self.lengthscale * S[:, None, :])
# d_length = self._psi1[:, :, None] * (self.lengthscale * np.square(self._psi1_dist / (self.lengthscale2 + S[:, None, :])) + denom_deriv)
# target[0] += np.sum(dL_dpsi1 * self._psi1 / self.variance)
# dpsi1_dlength = d_length * dL_dpsi1[:, :, None]
# if not self.ARD:
# target[1] += dpsi1_dlength.sum()*(-self.lengthscale2)
# else:
# target[1:] += dpsi1_dlength.sum(0).sum(0)*(-self.lengthscale2)
# #target[1:] = target[1:]*(-self.lengthscale2)
def dpsi1_dtheta(self, dL_dpsi1, Z, mu, S, target):
self._psi_computations(Z, mu, S)
tmp = 1 + S[:, None, :] * self.inv_lengthscale2
# d_inv_length_old = -self._psi1[:, :, None] * ((self._psi1_dist_sq - 1.) / (self.lengthscale * self._psi1_denom) + self.inv_lengthscale) / self.inv_lengthscale2
d_length = -(self._psi1[:, :, None] * ((np.square(self._psi1_dist) * self.inv_lengthscale) / (tmp ** 2) + (S[:, None, :] * self.inv_lengthscale) / (tmp)))
# d_inv_length = -self._psi1[:, :, None] * ((self._psi1_dist_sq - 1.) / self._psi1_denom + self.lengthscale)
target[0] += np.sum(dL_dpsi1 * self._psi1 / self.variance)
dpsi1_dlength = d_length * dL_dpsi1[:, :, None]
if not self.ARD:
target[1] += dpsi1_dlength.sum() # *(-self.lengthscale2)
else:
target[1:] += dpsi1_dlength.sum(0).sum(0) # *(-self.lengthscale2)
# target[1:] = target[1:]*(-self.lengthscale2)
def dpsi1_dZ(self, dL_dpsi1, Z, mu, S, target):
self._psi_computations(Z, mu, S)
dpsi1_dZ = -self._psi1[:, :, None] * ((self.inv_lengthscale2 * self._psi1_dist) / self._psi1_denom)
target += np.sum(dL_dpsi1[:, :, None] * dpsi1_dZ, 0)
def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S, target_mu, target_S):
self._psi_computations(Z, mu, S)
tmp = (self._psi1[:, :, None] * self.inv_lengthscale2) / self._psi1_denom
target_mu += np.sum(dL_dpsi1[:, :, None] * tmp * self._psi1_dist, 1)
target_S += np.sum(dL_dpsi1[:, :, None] * 0.5 * tmp * (self._psi1_dist_sq - 1), 1)
def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S, target):
"""Shape N,num_inducing,num_inducing,Ntheta"""
self._psi_computations(Z, mu, S)
d_var = 2.*self._psi2 / self.variance
# d_length = 2.*self._psi2[:, :, :, None] * (self._psi2_Zdist_sq * self._psi2_denom + self._psi2_mudist_sq + S[:, None, None, :] / self.lengthscale2) / (self.lengthscale * self._psi2_denom)
d_length = -2.*self._psi2[:, :, :, None] * (self._psi2_Zdist_sq * self._psi2_denom + self._psi2_mudist_sq + S[:, None, None, :] * self.inv_lengthscale2) / (self.inv_lengthscale * self._psi2_denom)
target[0] += np.sum(dL_dpsi2 * d_var)
dpsi2_dlength = d_length * dL_dpsi2[:, :, :, None]
if not self.ARD:
target[1] += dpsi2_dlength.sum() # *(-self.lengthscale2)
else:
target[1:] += dpsi2_dlength.sum(0).sum(0).sum(0) # *(-self.lengthscale2)
# target[1:] = target[1:]*(-self.lengthscale2)
def dpsi2_dZ(self, dL_dpsi2, Z, mu, S, target):
self._psi_computations(Z, mu, S)
term1 = self._psi2_Zdist * self.inv_lengthscale2 # num_inducing, num_inducing, input_dim
term2 = (self._psi2_mudist * self.inv_lengthscale2) / self._psi2_denom # N, num_inducing, num_inducing, input_dim
dZ = self._psi2[:, :, :, None] * (term1[None] + term2)
target += (dL_dpsi2[:, :, :, None] * dZ).sum(0).sum(0)
def dpsi2_dmuS(self, dL_dpsi2, Z, mu, S, target_mu, target_S):
"""Think N,num_inducing,num_inducing,input_dim """
self._psi_computations(Z, mu, S)
tmp = (self.inv_lengthscale2 * self._psi2[:, :, :, None]) / self._psi2_denom
target_mu += -2.*(dL_dpsi2[:, :, :, None] * tmp * self._psi2_mudist).sum(1).sum(1)
target_S += (dL_dpsi2[:, :, :, None] * tmp * (2.*self._psi2_mudist_sq - 1)).sum(1).sum(1)
#---------------------------------------#
# Precomputations #
#---------------------------------------#
def _K_computations(self, X, X2):
if not (np.array_equal(X, self._X) and np.array_equal(X2, self._X2) and np.array_equal(self._params , self._get_params())):
self._X = X.copy()
self._params = self._get_params().copy()
if X2 is None:
self._X2 = None
X = X * self.inv_lengthscale
Xsquare = np.sum(np.square(X), 1)
self._K_dist2 = -2.*tdot(X) + (Xsquare[:, None] + Xsquare[None, :])
else:
self._X2 = X2.copy()
X = X * self.inv_lengthscale
X2 = X2 * self.inv_lengthscale
self._K_dist2 = -2.*np.dot(X, X2.T) + (np.sum(np.square(X), 1)[:, None] + np.sum(np.square(X2), 1)[None, :])
self._K_dvar = np.exp(-0.5 * self._K_dist2)
def _psi_computations(self, Z, mu, S):
# here are the "statistics" for psi1 and psi2
if not np.array_equal(Z, self._Z):
# Z has changed, compute Z specific stuff
self._psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
self._psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
self._psi2_Zdist_sq = np.square(self._psi2_Zdist * self.inv_lengthscale) # M,M,Q
if not (np.array_equal(Z, self._Z) and np.array_equal(mu, self._mu) and np.array_equal(S, self._S)):
# something's changed. recompute EVERYTHING
# psi1
self._psi1_denom = S[:, None, :] * self.inv_lengthscale2 + 1.
self._psi1_dist = Z[None, :, :] - mu[:, None, :]
self._psi1_dist_sq = (np.square(self._psi1_dist) * self.inv_lengthscale2) / self._psi1_denom
self._psi1_exponent = -0.5 * np.sum(self._psi1_dist_sq + np.log(self._psi1_denom), -1)
self._psi1 = self.variance * np.exp(self._psi1_exponent)
# psi2
self._psi2_denom = 2.*S[:, None, None, :] * self.inv_lengthscale2 + 1. # N,M,M,Q
self._psi2_mudist, self._psi2_mudist_sq, self._psi2_exponent, _ = self.weave_psi2(mu, self._psi2_Zhat)
# self._psi2_mudist = mu[:,None,None,:]-self._psi2_Zhat #N,M,M,Q
# self._psi2_mudist_sq = np.square(self._psi2_mudist)/(self.lengthscale2*self._psi2_denom)
# self._psi2_exponent = np.sum(-self._psi2_Zdist_sq -self._psi2_mudist_sq -0.5*np.log(self._psi2_denom),-1) #N,M,M,Q
self._psi2 = np.square(self.variance) * np.exp(self._psi2_exponent) # N,M,M,Q
# store matrices for caching
self._Z, self._mu, self._S = Z, mu, S
def weave_psi2(self, mu, Zhat):
N, input_dim = mu.shape
num_inducing = Zhat.shape[0]
mudist = np.empty((N, num_inducing, num_inducing, input_dim))
mudist_sq = np.empty((N, num_inducing, num_inducing, input_dim))
psi2_exponent = np.zeros((N, num_inducing, num_inducing))
psi2 = np.empty((N, num_inducing, num_inducing))
psi2_Zdist_sq = self._psi2_Zdist_sq
_psi2_denom = self._psi2_denom.squeeze().reshape(N, self.input_dim)
half_log_psi2_denom = 0.5 * np.log(self._psi2_denom).squeeze().reshape(N, self.input_dim)
variance_sq = float(np.square(self.variance))
if self.ARD:
inv_lengthscale2 = self.inv_lengthscale2
else:
inv_lengthscale2 = np.ones(input_dim) * self.inv_lengthscale2
code = """
double tmp;
#pragma omp parallel for private(tmp)
for (int n=0; n<N; n++){
for (int m=0; m<num_inducing; m++){
for (int mm=0; mm<(m+1); mm++){
for (int q=0; q<input_dim; q++){
//compute mudist
tmp = mu(n,q) - Zhat(m,mm,q);
mudist(n,m,mm,q) = tmp;
mudist(n,mm,m,q) = tmp;
//now mudist_sq
tmp = tmp*tmp*inv_lengthscale2(q)/_psi2_denom(n,q);
mudist_sq(n,m,mm,q) = tmp;
mudist_sq(n,mm,m,q) = tmp;
//now psi2_exponent
tmp = -psi2_Zdist_sq(m,mm,q) - tmp - half_log_psi2_denom(n,q);
psi2_exponent(n,mm,m) += tmp;
if (m !=mm){
psi2_exponent(n,m,mm) += tmp;
}
//psi2 would be computed like this, but np is faster
//tmp = variance_sq*exp(psi2_exponent(n,m,mm));
//psi2(n,m,mm) = tmp;
//psi2(n,mm,m) = tmp;
}
}
}
}
"""
support_code = """
#include <omp.h>
#include <math.h>
"""
weave.inline(code, support_code=support_code, libraries=['gomp'],
arg_names=['N', 'num_inducing', 'input_dim', 'mu', 'Zhat', 'mudist_sq', 'mudist', 'inv_lengthscale2', '_psi2_denom', 'psi2_Zdist_sq', 'psi2_exponent', 'half_log_psi2_denom', 'psi2', 'variance_sq'],
type_converters=weave.converters.blitz, **self.weave_options)
return mudist, mudist_sq, psi2_exponent, psi2
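Since RBFInv is only a reparametrization of RBF, the two kernels should agree whenever inv_lengthscale == 1/lengthscale. A quick sanity check, sketched under the assumption that both classes expose the kernpart K(X, X2, target) interface used throughout this commit:
import numpy as np
from GPy.kern.parts.rbf import RBF
from GPy.kern.parts.rbf_inv import RBFInv

X = np.random.randn(10, 2)
K1 = np.zeros((10, 10))
K2 = np.zeros((10, 10))
RBF(2, variance=2., lengthscale=0.5).K(X, None, K1)         # accumulates into the target buffer
RBFInv(2, variance=2., inv_lengthscale=2.).K(X, None, K2)   # 1/0.5 == 2.
print(np.allclose(K1, K2))                                   # expected: True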

117
GPy/kern/parts/rbfcos.py Normal file
View file

@ -0,0 +1,117 @@
# Copyright (c) 2012, James Hensman and Andrew Gordon Wilson
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
class RBFCos(Kernpart):
def __init__(self,input_dim,variance=1.,frequencies=None,bandwidths=None,ARD=False):
self.input_dim = input_dim
self.name = 'rbfcos'
if self.input_dim>10:
print "Warning: the rbfcos kernel requires a lot of memory for high dimensional inputs"
self.ARD = ARD
#set the default frequencies and bandwidths, appropriate num_params
if ARD:
self.num_params = 2*self.input_dim + 1
if frequencies is not None:
frequencies = np.asarray(frequencies)
assert frequencies.size == self.input_dim, "bad number of frequencies"
else:
frequencies = np.ones(self.input_dim)
if bandwidths is not None:
bandwidths = np.asarray(bandwidths)
assert bandwidths.size == self.input_dim, "bad number of bandwidths"
else:
bandwidths = np.ones(self.input_dim)
else:
self.num_params = 3
if frequencies is not None:
frequencies = np.asarray(frequencies)
assert frequencies.size == 1, "Exactly one frequency needed for non-ARD kernel"
else:
frequencies = np.ones(1)
if bandwidths is not None:
bandwidths = np.asarray(bandwidths)
assert bandwidths.size == 1, "Exactly one bandwidth needed for non-ARD kernel"
else:
bandwidths = np.ones(1)
#initialise cache
self._X, self._X2, self._params = np.empty(shape=(3,1))
self._set_params(np.hstack((variance,frequencies.flatten(),bandwidths.flatten())))
def _get_params(self):
return np.hstack((self.variance,self.frequencies, self.bandwidths))
def _set_params(self,x):
assert x.size==(self.num_params)
if self.ARD:
self.variance = x[0]
self.frequencies = x[1:1+self.input_dim]
self.bandwidths = x[1+self.input_dim:]
else:
self.variance, self.frequencies, self.bandwidths = x
def _get_param_names(self):
if self.num_params == 3:
return ['variance','frequency','bandwidth']
else:
return ['variance']+['frequency_%i'%i for i in range(self.input_dim)]+['bandwidth_%i'%i for i in range(self.input_dim)]
def K(self,X,X2,target):
self._K_computations(X,X2)
target += self.variance*self._dvar
def Kdiag(self,X,target):
np.add(target,self.variance,target)
def dK_dtheta(self,dL_dK,X,X2,target):
self._K_computations(X,X2)
target[0] += np.sum(dL_dK*self._dvar)
if self.ARD:
for q in xrange(self.input_dim):
target[q+1] += -2.*np.pi*self.variance*np.sum(dL_dK*self._dvar*np.tan(2.*np.pi*self._dist[:,:,q]*self.frequencies[q])*self._dist[:,:,q])
target[q+1+self.input_dim] += -2.*np.pi**2*self.variance*np.sum(dL_dK*self._dvar*self._dist2[:,:,q])
else:
target[1] += -2.*np.pi*self.variance*np.sum(dL_dK*self._dvar*np.sum(np.tan(2.*np.pi*self._dist*self.frequencies)*self._dist,-1))
target[2] += -2.*np.pi**2*self.variance*np.sum(dL_dK*self._dvar*self._dist2.sum(-1))
def dKdiag_dtheta(self,dL_dKdiag,X,target):
target[0] += np.sum(dL_dKdiag)
def dK_dX(self,dL_dK,X,X2,target):
#TODO!!!
raise NotImplementedError
def dKdiag_dX(self,dL_dKdiag,X,target):
pass
def _K_computations(self,X,X2):
if not (np.all(X==self._X) and np.all(X2==self._X2)):
if X2 is None: X2 = X
self._X = X.copy()
self._X2 = X2.copy()
#do the distances: this will be high memory for large input_dim
#NB: we don't take the abs of the dist because cos is symmetric
self._dist = X[:,None,:] - X2[None,:,:]
self._dist2 = np.square(self._dist)
#ensure the next section is computed:
self._params = np.empty(self.num_params)
if not np.all(self._params == self._get_params()):
self._params == self._get_params().copy()
self._rbf_part = np.exp(-2.*np.pi**2*np.sum(self._dist2*self.bandwidths,-1))
self._cos_part = np.prod(np.cos(2.*np.pi*self._dist*self.frequencies),-1)
self._dvar = self._rbf_part*self._cos_part
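The covariance assembled in _K_computations is the product of an RBF envelope and a cosine term; transcribed directly into a standalone numpy function:
import numpy as np

def rbfcos_cov(X, X2, variance=1., frequencies=1., bandwidths=1.):
    # k(x,x') = variance * exp(-2 pi^2 sum_q b_q d_q^2) * prod_q cos(2 pi f_q d_q), d = x - x'
    dist = X[:, None, :] - X2[None, :, :]
    rbf_part = np.exp(-2. * np.pi ** 2 * np.sum(np.square(dist) * bandwidths, -1))
    cos_part = np.prod(np.cos(2. * np.pi * dist * frequencies), -1)
    return variance * rbf_part * cos_part

K = rbfcos_cov(np.random.randn(4, 1), np.random.randn(3, 1))   # shape (4, 3)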

58
GPy/kern/parts/spline.py Normal file
View file

@ -0,0 +1,58 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
import hashlib
def theta(x):
"""Heaviside step function"""
return np.where(x>=0.,1.,0.)
class Spline(Kernpart):
"""
Spline kernel
:param input_dim: the number of input dimensions (fixed to 1 right now TODO)
:type input_dim: int
:param variance: the variance of the kernel
:type variance: float
"""
def __init__(self,input_dim,variance=1.,lengthscale=1.):
self.input_dim = input_dim
assert self.input_dim==1
self.num_params = 1
self.name = 'spline'
self._set_params(np.squeeze(variance))
def _get_params(self):
return self.variance
def _set_params(self,x):
self.variance = x
def _get_param_names(self):
return ['variance']
def K(self,X,X2,target):
assert np.all(X>0), "Spline covariance is for +ve domain only. TODO: symmetrise"
assert np.all(X2>0), "Spline covariance is for +ve domain only. TODO: symmetrise"
t = X
s = X2.T
s_t = s-t # broadcasted subtraction
target += self.variance*(0.5*(t*s**2) - s**3/6. + (s_t)**3*theta(s_t)/6.)
def Kdiag(self,X,target):
target += self.variance*X.flatten()**3/3.
def dK_dtheta(self,dL_dK,X,X2,target):
    # derivative wrt the variance is K/variance, weighted by dL_dK
    t = X
    s = X2.T
    s_t = s-t # broadcasted subtraction
    target += np.sum(dL_dK*(0.5*(t*s**2) - s**3/6. + (s_t)**3*theta(s_t)/6.))
def dKdiag_dtheta(self,dL_dKdiag,X,target):
    target += np.sum(dL_dKdiag*X.flatten()**3/3.)
def dKdiag_dX(self,dL_dKdiag,X,target):
    target += dL_dKdiag[:,None]*self.variance*X**2
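For positive inputs the piecewise expression in K reduces to the closed form variance*(s*t*min(s,t)/2 - min(s,t)**3/6), which makes the symmetry of the kernel explicit. A quick numpy check:
import numpy as np

def theta(x):
    """Heaviside step function"""
    return np.where(x >= 0., 1., 0.)

X = np.random.rand(6, 1)                # +ve domain only
t, s = X, X.T
s_t = s - t
K = 0.5 * (t * s ** 2) - s ** 3 / 6. + s_t ** 3 * theta(s_t) / 6.
m = np.minimum(s, t)
print(np.allclose(K, 0.5 * s * t * m - m ** 3 / 6.))   # True
print(np.allclose(K, K.T))                              # symmetric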

92
GPy/kern/parts/symmetric.py Normal file
View file

@ -0,0 +1,92 @@
# Copyright (c) 2012 James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
class Symmetric(Kernpart):
"""
Symmetrical kernels
:param k: the kernel to symmetrify
:type k: Kernpart
:param transform: the transform to use in symmetrification (allows symmetry on specified axes)
:type transform: A numpy array (input_dim x input_dim) specifying the transform
:rtype: Kernpart
"""
def __init__(self,k,transform=None):
if transform is None:
transform = np.eye(k.input_dim)*-1.
assert transform.shape == (k.input_dim, k.input_dim)
self.transform = transform
self.input_dim = k.input_dim
self.num_params = k.num_params
self.name = k.name + '_symm'
self.k = k
self._set_params(k._get_params())
def _get_params(self):
"""return the value of the parameters."""
return self.k._get_params()
def _set_params(self,x):
"""set the value of the parameters."""
self.k._set_params(x)
def _get_param_names(self):
"""return parameter names."""
return self.k._get_param_names()
def K(self,X,X2,target):
"""Compute the covariance matrix between X and X2."""
AX = np.dot(X,self.transform)
if X2 is None:
X2 = X
AX2 = AX
else:
AX2 = np.dot(X2, self.transform)
self.k.K(X,X2,target)
self.k.K(AX,X2,target)
self.k.K(X,AX2,target)
self.k.K(AX,AX2,target)
def dK_dtheta(self,dL_dK,X,X2,target):
"""derivative of the covariance matrix with respect to the parameters."""
AX = np.dot(X,self.transform)
if X2 is None:
X2 = X
AX2 = AX
else:
AX2 = np.dot(X2, self.transform)
self.k.dK_dtheta(dL_dK,X,X2,target)
self.k.dK_dtheta(dL_dK,AX,X2,target)
self.k.dK_dtheta(dL_dK,X,AX2,target)
self.k.dK_dtheta(dL_dK,AX,AX2,target)
def dK_dX(self,dL_dK,X,X2,target):
"""derivative of the covariance matrix with respect to X."""
AX = np.dot(X,self.transform)
if X2 is None:
X2 = X
AX2 = AX
else:
AX2 = np.dot(X2, self.transform)
self.k.dK_dX(dL_dK, X, X2, target)
self.k.dK_dX(dL_dK, AX, X2, target)
self.k.dK_dX(dL_dK, X, AX2, target)
self.k.dK_dX(dL_dK, AX ,AX2, target)
def Kdiag(self,X,target):
"""Compute the diagonal of the covariance matrix associated to X."""
foo = np.zeros((X.shape[0],X.shape[0]))
self.K(X,X,foo)
target += np.diag(foo)
def dKdiag_dX(self,dL_dKdiag,X,target):
raise NotImplementedError
def dKdiag_dtheta(self,dL_dKdiag,X,target):
"""Compute the diagonal of the covariance matrix associated to X."""
raise NotImplementedError
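The four-term sum in K makes the resulting covariance invariant under x -> Ax whenever the transform is an involution (A.dot(A) == I, as the default A = -I is). A self-contained numpy check, with a plain RBF standing in for self.k:
import numpy as np

def rbf(X, X2):
    return np.exp(-0.5 * np.square(X[:, None, :] - X2[None, :, :]).sum(-1))

def k_symm(X, X2, A):
    AX, AX2 = X.dot(A), X2.dot(A)
    return rbf(X, X2) + rbf(AX, X2) + rbf(X, AX2) + rbf(AX, AX2)

A = -np.eye(1)                                        # the default reflection transform
X, X2 = np.random.randn(5, 1), np.random.randn(4, 1)
print(np.allclose(k_symm(X.dot(A), X2, A), k_symm(X, X2, A)))   # True: invariant under x -> Ax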

View file

@ -0,0 +1,10 @@
#include <math.h>
double DiracDelta(double x){
if((x<0.000001) & (x>-0.000001))//go on, laugh at my c++ skills
return 1.0;
else
return 0.0;
};
double DiracDelta(double x,int foo){
return 0.0;
};

View file

@ -0,0 +1,3 @@
#include <math.h>
double DiracDelta(double x);
double DiracDelta(double x, int foo);

258
GPy/kern/parts/sympykern.py Normal file
View file

@ -0,0 +1,258 @@
import numpy as np
import sympy as sp
from sympy.utilities.codegen import codegen
from sympy.core.cache import clear_cache
from scipy import weave
import re
import os
import sys
current_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
import tempfile
import pdb
from kernpart import Kernpart
class spkern(Kernpart):
"""
A kernel object, where all the hard work is done by sympy.
:param k: the covariance function
:type k: a positive definite sympy function of x0, z0, x1, z1, ...
To construct a new sympy kernel, you'll need to define:
- a kernel function using a sympy object. Ensure that the kernel is of the form k(x,z).
- that's it! we'll extract the variables from the function k.
Note:
- to handle multiple inputs, call them x0, z0, x1, z1, etc (the asserts below expect zero-based names)
- to handle multiple correlated outputs, you'll need to define each covariance function and 'cross' covariance function. TODO
"""
def __init__(self,input_dim,k,param=None):
self.name='sympykern'
self._sp_k = k
sp_vars = [e for e in k.atoms() if e.is_Symbol]
self._sp_x= sorted([e for e in sp_vars if e.name[0]=='x'],key=lambda x:int(x.name[1:]))
self._sp_z= sorted([e for e in sp_vars if e.name[0]=='z'],key=lambda z:int(z.name[1:]))
assert all([x.name=='x%i'%i for i,x in enumerate(self._sp_x)])
assert all([z.name=='z%i'%i for i,z in enumerate(self._sp_z)])
assert len(self._sp_x)==len(self._sp_z)
self.input_dim = len(self._sp_x)
assert self.input_dim == input_dim
self._sp_theta = sorted([e for e in sp_vars if not (e.name[0]=='x' or e.name[0]=='z')],key=lambda e:e.name)
self.num_params = len(self._sp_theta)
#deal with param
if param is None:
param = np.ones(self.num_params)
assert param.size==self.num_params
self._set_params(param)
#Differentiate!
self._sp_dk_dtheta = [sp.diff(k,theta).simplify() for theta in self._sp_theta]
self._sp_dk_dx = [sp.diff(k,xi).simplify() for xi in self._sp_x]
#self._sp_dk_dz = [sp.diff(k,zi) for zi in self._sp_z]
#self.compute_psi_stats()
self._gen_code()
self.weave_kwargs = {\
'support_code':self._function_code,\
'include_dirs':[tempfile.gettempdir(), os.path.join(current_dir,'kern/')],\
'headers':['"sympy_helpers.h"'],\
'sources':[os.path.join(current_dir,"kern/sympy_helpers.cpp")],\
#'extra_compile_args':['-ftree-vectorize', '-mssse3', '-ftree-vectorizer-verbose=5'],\
'extra_compile_args':[],\
'extra_link_args':['-lgomp'],\
'verbose':True}
def __add__(self,other):
return spkern(self._sp_k+other._sp_k)
def compute_psi_stats(self):
#define some normal distributions
mus = [sp.var('mu%i'%i,real=True) for i in range(self.input_dim)]
Ss = [sp.var('S%i'%i,positive=True) for i in range(self.input_dim)]
normals = [(2*sp.pi*Si)**(-0.5)*sp.exp(-0.5*(xi-mui)**2/Si) for xi, mui, Si in zip(self._sp_x, mus, Ss)]
#do some integration!
#self._sp_psi0 = ??
self._sp_psi1 = self._sp_k
for i in range(self.input_dim):
print 'performing integrals %i of %i'%(i+1,2*self.input_dim)
sys.stdout.flush()
self._sp_psi1 *= normals[i]
self._sp_psi1 = sp.integrate(self._sp_psi1,(self._sp_x[i],-sp.oo,sp.oo))
clear_cache()
self._sp_psi1 = self._sp_psi1.simplify()
#and here's psi2 (eek!)
zprime = [sp.Symbol('zp%i'%i) for i in range(self.input_dim)]
self._sp_psi2 = self._sp_k.copy()*self._sp_k.copy().subs(zip(self._sp_z,zprime))
for i in range(self.input_dim):
print 'performing integrals %i of %i'%(self.input_dim+i+1,2*self.input_dim)
sys.stdout.flush()
self._sp_psi2 *= normals[i]
self._sp_psi2 = sp.integrate(self._sp_psi2,(self._sp_x[i],-sp.oo,sp.oo))
clear_cache()
self._sp_psi2 = self._sp_psi2.simplify()
def _gen_code(self):
#generate c functions from sympy objects
(foo_c,self._function_code),(foo_h,self._function_header) = \
codegen([('k',self._sp_k)] \
+ [('dk_d%s'%x.name,dx) for x,dx in zip(self._sp_x,self._sp_dk_dx)]\
#+ [('dk_d%s'%z.name,dz) for z,dz in zip(self._sp_z,self._sp_dk_dz)]\
+ [('dk_d%s'%theta.name,dtheta) for theta,dtheta in zip(self._sp_theta,self._sp_dk_dtheta)]\
,"C",'foobar',argument_sequence=self._sp_x+self._sp_z+self._sp_theta)
#put the header file where we can find it
f = file(os.path.join(tempfile.gettempdir(),'foobar.h'),'w')
f.write(self._function_header)
f.close()
#get rid of derivatives of DiracDelta
self._function_code = re.sub('DiracDelta\(.+?,.+?\)','0.0',self._function_code)
#Here's some code to do the looping for K
arglist = ", ".join(["X[i*input_dim+%s]"%x.name[1:] for x in self._sp_x]\
+ ["Z[j*input_dim+%s]"%z.name[1:] for z in self._sp_z]\
+ ["param[%i]"%i for i in range(self.num_params)])
self._K_code =\
"""
int i;
int j;
int N = target_array->dimensions[0];
int num_inducing = target_array->dimensions[1];
int input_dim = X_array->dimensions[1];
//#pragma omp parallel for private(j)
for (i=0;i<N;i++){
for (j=0;j<num_inducing;j++){
target[i*num_inducing+j] = k(%s);
}
}
%s
"""%(arglist,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
diag_arglist = re.sub('Z','X',arglist)
diag_arglist = re.sub('j','i',diag_arglist)
#Here's some code to do the looping for Kdiag
self._Kdiag_code =\
"""
int i;
int N = target_array->dimensions[0];
int input_dim = X_array->dimensions[1];
//#pragma omp parallel for
for (i=0;i<N;i++){
target[i] = k(%s);
}
%s
"""%(diag_arglist,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
#here's some code to compute gradients
funclist = '\n'.join([' '*16 + 'target[%i] += partial[i*num_inducing+j]*dk_d%s(%s);'%(i,theta.name,arglist) for i,theta in enumerate(self._sp_theta)])
self._dK_dtheta_code =\
"""
int i;
int j;
int N = partial_array->dimensions[0];
int num_inducing = partial_array->dimensions[1];
int input_dim = X_array->dimensions[1];
//#pragma omp parallel for private(j)
for (i=0;i<N;i++){
for (j=0;j<num_inducing;j++){
%s
}
}
%s
"""%(funclist,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
#here's some code to compute gradients for Kdiag TODO: this is yucky.
diag_funclist = re.sub('Z','X',funclist,count=0)
diag_funclist = re.sub('j','i',diag_funclist)
diag_funclist = re.sub('partial\[i\*num_inducing\+i\]','partial[i]',diag_funclist)
self._dKdiag_dtheta_code =\
"""
int i;
int N = partial_array->dimensions[0];
int input_dim = X_array->dimensions[1];
for (i=0;i<N;i++){
%s
}
%s
"""%(diag_funclist,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
#Here's some code to do gradients wrt x
gradient_funcs = "\n".join(["target[i*input_dim+%i] += partial[i*num_inducing+j]*dk_dx%i(%s);"%(q,q,arglist) for q in range(self.input_dim)])
self._dK_dX_code = \
"""
int i;
int j;
int N = partial_array->dimensions[0];
int num_inducing = partial_array->dimensions[1];
int input_dim = X_array->dimensions[1];
//#pragma omp parallel for private(j)
for (i=0;i<N; i++){
for (j=0; j<num_inducing; j++){
%s
//if(isnan(target[i*input_dim+2])){printf("%%f\\n",dk_dx2(X[i*input_dim+0], X[i*input_dim+1], X[i*input_dim+2], Z[j*input_dim+0], Z[j*input_dim+1], Z[j*input_dim+2], param[0], param[1], param[2], param[3], param[4], param[5]));}
//if(isnan(target[i*input_dim+2])){printf("%%f,%%f,%%i,%%i\\n", X[i*input_dim+2], Z[j*input_dim+2],i,j);}
}
}
%s
"""%(gradient_funcs,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
#now for gradients of Kdiag wrt X
self._dKdiag_dX_code= \
"""
int i;
int j;
int N = partial_array->dimensions[0];
int num_inducing = 0;
int input_dim = X_array->dimensions[1];
for (i=0;i<N; i++){
j = i;
%s
}
%s
"""%(gradient_funcs,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
#TODO: insert multiple functions here via string manipulation
#TODO: similar functions for psi_stats
def K(self,X,Z,target):
param = self._param
weave.inline(self._K_code,arg_names=['target','X','Z','param'],**self.weave_kwargs)
def Kdiag(self,X,target):
param = self._param
weave.inline(self._Kdiag_code,arg_names=['target','X','param'],**self.weave_kwargs)
def dK_dtheta(self,partial,X,Z,target):
param = self._param
weave.inline(self._dK_dtheta_code,arg_names=['target','X','Z','param','partial'],**self.weave_kwargs)
def dKdiag_dtheta(self,partial,X,target):
param = self._param
Z = X
weave.inline(self._dKdiag_dtheta_code,arg_names=['target','X','Z','param','partial'],**self.weave_kwargs)
def dK_dX(self,partial,X,Z,target):
param = self._param
weave.inline(self._dK_dX_code,arg_names=['target','X','Z','param','partial'],**self.weave_kwargs)
def dKdiag_dX(self,partial,X,target):
param = self._param
Z = X
weave.inline(self._dKdiag_dX_code,arg_names=['target','X','Z','param','partial'],**self.weave_kwargs)
def _set_params(self,param):
#print param.flags['C_CONTIGUOUS']
self._param = param.copy()
def _get_params(self):
return self._param
def _get_param_names(self):
return [x.name for x in self._sp_theta]
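A sketch of constructing a covariance for spkern, following the x0/z0 naming convention the asserts above enforce. The parameter names here are arbitrary; instantiating spkern triggers sympy code generation and a weave compile:
import sympy as sp

x0, z0 = sp.symbols('x0 z0')
variance, lengthscale = sp.symbols('variance lengthscale', positive=True)
k = variance * sp.exp(-(x0 - z0) ** 2 / (2 * lengthscale ** 2))
# kern = spkern(1, k)   # input_dim must match the number of x*/z* symbol pairs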

84
GPy/kern/parts/white.py Normal file
View file

@ -0,0 +1,84 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
class White(Kernpart):
"""
White noise kernel.
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance:
:type variance: float
"""
def __init__(self,input_dim,variance=1.):
self.input_dim = input_dim
self.num_params = 1
self.name = 'white'
self._set_params(np.array([variance]).flatten())
self._psi1 = 0 # TODO: more elegance here
def _get_params(self):
return self.variance
def _set_params(self,x):
assert x.shape==(1,)
self.variance = x
def _get_param_names(self):
return ['variance']
def K(self,X,X2,target):
if X2 is None:
target += np.eye(X.shape[0])*self.variance
def Kdiag(self,X,target):
target += self.variance
def dK_dtheta(self,dL_dK,X,X2,target):
if X2 is None:
target += np.trace(dL_dK)
def dKdiag_dtheta(self,dL_dKdiag,X,target):
target += np.sum(dL_dKdiag)
def dK_dX(self,dL_dK,X,X2,target):
pass
def dKdiag_dX(self,dL_dKdiag,X,target):
pass
def psi0(self,Z,mu,S,target):
pass # target += self.variance
def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target):
pass # target += dL_dpsi0.sum()
def dpsi0_dmuS(self,dL_dpsi0,Z,mu,S,target_mu,target_S):
pass
def psi1(self,Z,mu,S,target):
pass
def dpsi1_dtheta(self,dL_dpsi1,Z,mu,S,target):
pass
def dpsi1_dZ(self,dL_dpsi1,Z,mu,S,target):
pass
def dpsi1_dmuS(self,dL_dpsi1,Z,mu,S,target_mu,target_S):
pass
def psi2(self,Z,mu,S,target):
pass
def dpsi2_dZ(self,dL_dpsi2,Z,mu,S,target):
pass
def dpsi2_dtheta(self,dL_dpsi2,Z,mu,S,target):
pass
def dpsi2_dmuS(self,dL_dpsi2,Z,mu,S,target_mu,target_S):
pass

4
GPy/likelihoods/__init__.py Normal file
View file

@ -0,0 +1,4 @@
from ep import EP
from gaussian import Gaussian
# TODO: from Laplace import Laplace
import likelihood_functions as functions

342
GPy/likelihoods/ep.py Normal file
View file

@ -0,0 +1,342 @@
import numpy as np
from scipy import stats
from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs
from likelihood import likelihood
class EP(likelihood):
def __init__(self,data,LikelihoodFunction,epsilon=1e-3,power_ep=[1.,1.]):
"""
Expectation Propagation
Arguments
---------
epsilon : Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
LikelihoodFunction : a likelihood function (see likelihood_functions.py)
"""
self.LikelihoodFunction = LikelihoodFunction
self.epsilon = epsilon
self.eta, self.delta = power_ep
self.data = data
self.N, self.output_dim = self.data.shape
self.is_heteroscedastic = True
self.Nparams = 0
self._transf_data = self.LikelihoodFunction._preprocess_values(data)
#Initial values - Likelihood approximation parameters:
#p(y|f) = t(f|tau_tilde,v_tilde)
self.tau_tilde = np.zeros(self.N)
self.v_tilde = np.zeros(self.N)
#initial values for the GP variables
self.Y = np.zeros((self.N,1))
self.covariance_matrix = np.eye(self.N)
self.precision = np.ones(self.N)[:,None]
self.Z = 0
self.YYT = None
self.V = self.precision * self.Y
self.VVT_factor = self.V
self.trYYT = 0.
def restart(self):
self.tau_tilde = np.zeros(self.N)
self.v_tilde = np.zeros(self.N)
self.Y = np.zeros((self.N,1))
self.covariance_matrix = np.eye(self.N)
self.precision = np.ones(self.N)[:,None]
self.Z = 0
self.YYT = None
self.V = self.precision * self.Y
self.VVT_factor = self.V
self.trYYT = 0.
def predictive_values(self,mu,var,full_cov):
if full_cov:
raise NotImplementedError, "Cannot make correlated predictions with an EP likelihood"
return self.LikelihoodFunction.predictive_values(mu,var)
def _get_params(self):
return np.zeros(0)
def _get_param_names(self):
return []
def _set_params(self,p):
pass # TODO: the EP likelihood might want to take some parameters...
def _gradients(self,partial):
return np.zeros(0) # TODO: the EP likelihood might want to take some parameters...
def _compute_GP_variables(self):
#Variables to be called from GP
mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model
sigma_sum = 1./self.tau_ + 1./self.tau_tilde
mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2
self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep
self.Y = mu_tilde[:,None]
self.YYT = np.dot(self.Y,self.Y.T)
self.covariance_matrix = np.diag(1./self.tau_tilde)
self.precision = self.tau_tilde[:,None]
self.V = self.precision * self.Y
self.VVT_factor = self.V
self.trYYT = np.trace(self.YYT)
def fit_full(self,K):
"""
The expectation-propagation algorithm.
For nomenclature see Rasmussen & Williams 2006.
"""
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
mu = np.zeros(self.N)
Sigma = K.copy()
"""
Initial values - Cavity distribution parameters:
q_(f|mu_,sigma2_) = Product{q_i(f|mu_i,sigma2_i)}
sigma_ = 1./tau_
mu_ = v_/tau_
"""
self.tau_ = np.empty(self.N,dtype=float)
self.v_ = np.empty(self.N,dtype=float)
#Initial values - Marginal moments
z = np.empty(self.N,dtype=float)
self.Z_hat = np.empty(self.N,dtype=float)
phi = np.empty(self.N,dtype=float)
mu_hat = np.empty(self.N,dtype=float)
sigma2_hat = np.empty(self.N,dtype=float)
#Approximation
epsilon_np1 = self.epsilon + 1.
epsilon_np2 = self.epsilon + 1.
self.iterations = 0
self.np1 = [self.tau_tilde.copy()]
self.np2 = [self.v_tilde.copy()]
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
update_order = np.random.permutation(self.N)
for i in update_order:
#Cavity distribution parameters
self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i]
self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i]
#Marginal moments
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
#Site parameters update
Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
self.tau_tilde[i] += Delta_tau
self.v_tilde[i] += Delta_v
#Posterior distribution parameters update
DSYR(Sigma,Sigma[:,i].copy(), -float(Delta_tau/(1.+ Delta_tau*Sigma[i,i])))
mu = np.dot(Sigma,self.v_tilde)
self.iterations += 1
#Sigma recomputation with Cholesky decomposition
Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*K
B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K
L = jitchol(B)
V,info = dtrtrs(L,Sroot_tilde_K,lower=1)
Sigma = K - np.dot(V.T,V)
mu = np.dot(Sigma,self.v_tilde)
epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
self.np1.append(self.tau_tilde.copy())
self.np2.append(self.v_tilde.copy())
return self._compute_GP_variables()
def fit_DTC(self, Kmm, Kmn):
"""
The expectation-propagation algorithm with sparse pseudo-input.
For nomenclature see ... 2013.
"""
num_inducing = Kmm.shape[0]
#TODO: this doesn't work with uncertain inputs!
"""
Prior approximation parameters:
q(f|X) = \int N(f|Kfu*Kuu_inv*u, 0) N(u|0,Kuu) du = N(f|0,Sigma0)
Sigma0 = Qnn = Knm*Kmmi*Kmn
"""
KmnKnm = np.dot(Kmn,Kmn.T)
Lm = jitchol(Kmm)
Lmi = chol_inv(Lm)
Kmmi = np.dot(Lmi.T,Lmi)
KmmiKmn = np.dot(Kmmi,Kmn)
Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
LLT0 = Kmm.copy()
#Kmmi, Lm, Lmi, Kmm_logdet = pdinv(Kmm)
#KmnKnm = np.dot(Kmn, Kmn.T)
#KmmiKmn = np.dot(Kmmi,Kmn)
#Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
#LLT0 = Kmm.copy()
"""
Posterior approximation: q(f|y) = N(f| mu, Sigma)
Sigma = Diag + P*R.T*R*P.T
mu = w + P*Gamma
"""
mu = np.zeros(self.N)
LLT = Kmm.copy()
Sigma_diag = Qnn_diag.copy()
"""
Initial values - Cavity distribution parameters:
q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)}
sigma_ = 1./tau_
mu_ = v_/tau_
"""
self.tau_ = np.empty(self.N,dtype=float)
self.v_ = np.empty(self.N,dtype=float)
#Initial values - Marginal moments
z = np.empty(self.N,dtype=float)
self.Z_hat = np.empty(self.N,dtype=float)
phi = np.empty(self.N,dtype=float)
mu_hat = np.empty(self.N,dtype=float)
sigma2_hat = np.empty(self.N,dtype=float)
#Approximation
epsilon_np1 = 1
epsilon_np2 = 1
self.iterations = 0
np1 = [self.tau_tilde.copy()]
np2 = [self.v_tilde.copy()]
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
update_order = np.random.permutation(self.N)
for i in update_order:
#Cavity distribution parameters
self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
#Marginal moments
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
#Site parameters update
Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
self.tau_tilde[i] += Delta_tau
self.v_tilde[i] += Delta_v
#Posterior distribution parameters update
DSYR(LLT,Kmn[:,i].copy(),Delta_tau) #LLT = LLT + np.outer(Kmn[:,i],Kmn[:,i])*Delta_tau
L = jitchol(LLT)
#cholUpdate(L,Kmn[:,i]*np.sqrt(Delta_tau))
V,info = dtrtrs(L,Kmn,lower=1)
Sigma_diag = np.sum(V*V,-2)
si = np.sum(V.T*V[:,i],-1)
mu += (Delta_v-Delta_tau*mu[i])*si
self.iterations += 1
#Sigma recomputation with Cholesky decomposition
LLT = LLT0 + np.dot(Kmn*self.tau_tilde[None,:],Kmn.T)
L = jitchol(LLT)
V,info = dtrtrs(L,Kmn,lower=1)
V2,info = dtrtrs(L.T,V,lower=0)
Sigma_diag = np.sum(V*V,-2)
Knmv_tilde = np.dot(Kmn,self.v_tilde)
mu = np.dot(V2.T,Knmv_tilde)
epsilon_np1 = sum((self.tau_tilde-np1[-1])**2)/self.N
epsilon_np2 = sum((self.v_tilde-np2[-1])**2)/self.N
np1.append(self.tau_tilde.copy())
np2.append(self.v_tilde.copy())
self._compute_GP_variables()
def fit_FITC(self, Kmm, Kmn, Knn_diag):
"""
The expectation-propagation algorithm with sparse pseudo-input.
For nomenclature see Naish-Guzman and Holden, 2008.
"""
num_inducing = Kmm.shape[0]
"""
Prior approximation parameters:
q(f|X) = \int N(f|Kfu*Kuu_inv*u, diag(Kff-Qff)) N(u|0,Kuu) du = N(f|0,Sigma0)
Sigma0 = diag(Knn-Qnn) + Qnn, Qnn = Knm*Kmmi*Kmn
"""
Lm = jitchol(Kmm)
Lmi = chol_inv(Lm)
Kmmi = np.dot(Lmi.T,Lmi)
P0 = Kmn.T
KmnKnm = np.dot(P0.T, P0)
KmmiKmn = np.dot(Kmmi,P0.T)
Qnn_diag = np.sum(P0.T*KmmiKmn,-2)
Diag0 = Knn_diag - Qnn_diag
R0 = jitchol(Kmmi).T
"""
Posterior approximation: q(f|y) = N(f| mu, Sigma)
Sigma = Diag + P*R.T*R*P.T
mu = w + P*Gamma
"""
self.w = np.zeros(self.N)
self.Gamma = np.zeros(num_inducing)
mu = np.zeros(self.N)
P = P0.copy()
R = R0.copy()
Diag = Diag0.copy()
Sigma_diag = Knn_diag
RPT0 = np.dot(R0,P0.T)
"""
Initial values - Cavity distribution parameters:
q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)}
sigma_ = 1./tau_
mu_ = v_/tau_
"""
self.tau_ = np.empty(self.N,dtype=float)
self.v_ = np.empty(self.N,dtype=float)
#Initial values - Marginal moments
z = np.empty(self.N,dtype=float)
self.Z_hat = np.empty(self.N,dtype=float)
phi = np.empty(self.N,dtype=float)
mu_hat = np.empty(self.N,dtype=float)
sigma2_hat = np.empty(self.N,dtype=float)
#Approximation
epsilon_np1 = 1
epsilon_np2 = 1
self.iterations = 0
self.np1 = [self.tau_tilde.copy()]
self.np2 = [self.v_tilde.copy()]
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
update_order = np.random.permutation(self.N)
for i in update_order:
#Cavity distribution parameters
self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i]
#Marginal moments
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.LikelihoodFunction.moments_match(self._transf_data[i],self.tau_[i],self.v_[i])
#Site parameters update
Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
self.tau_tilde[i] += Delta_tau
self.v_tilde[i] += Delta_v
#Posterior distribution parameters update
dtd1 = Delta_tau*Diag[i] + 1.
dii = Diag[i]
Diag[i] = dii - (Delta_tau * dii**2.)/dtd1
pi_ = P[i,:].reshape(1,num_inducing)
P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_
Rp_i = np.dot(R,pi_.T)
RTR = np.dot(R.T,np.dot(np.eye(num_inducing) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R))
R = jitchol(RTR).T
self.w[i] += (Delta_v - Delta_tau*self.w[i])*dii/dtd1
self.Gamma += (Delta_v - Delta_tau*mu[i])*np.dot(RTR,P[i,:].T)
RPT = np.dot(R,P.T)
Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
mu = self.w + np.dot(P,self.Gamma)
self.iterations += 1
#Sigma recomputation with Cholesky decomposition
Iplus_Dprod_i = 1./(1.+ Diag0 * self.tau_tilde)
Diag = Diag0 * Iplus_Dprod_i
P = Iplus_Dprod_i[:,None] * P0
safe_diag = np.where(Diag0 < self.tau_tilde, self.tau_tilde/(1.+Diag0*self.tau_tilde), (1. - Iplus_Dprod_i)/Diag0)
L = jitchol(np.eye(num_inducing) + np.dot(RPT0,safe_diag[:,None]*RPT0.T))
R,info = dtrtrs(L,R0,lower=1)
RPT = np.dot(R,P.T)
Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1)
self.w = Diag * self.v_tilde
self.Gamma = np.dot(R.T, np.dot(RPT,self.v_tilde))
mu = self.w + np.dot(P,self.Gamma)
epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
self.np1.append(self.tau_tilde.copy())
self.np2.append(self.v_tilde.copy())
return self._compute_GP_variables()
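The DSYR call in the site loop of fit_full is the Sherman-Morrison rank-one correction for adding Delta_tau to site i's precision. A small numpy check of that identity against direct inversion:
import numpy as np

N, i, Delta_tau = 5, 2, 0.7
A = np.random.randn(N, N)
Sigma = A.dot(A.T) + np.eye(N)                     # a positive-definite "posterior"
s_i = Sigma[:, i].copy()
Sigma_new = Sigma - (Delta_tau / (1. + Delta_tau * Sigma[i, i])) * np.outer(s_i, s_i)
P = np.linalg.inv(Sigma)
P[i, i] += Delta_tau                               # the precision gains Delta_tau at site i
print(np.allclose(Sigma_new, np.linalg.inv(P)))    # True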

98
GPy/likelihoods/gaussian.py Normal file
View file

@ -0,0 +1,98 @@
import numpy as np
from likelihood import likelihood
from ..util.linalg import jitchol
class Gaussian(likelihood):
"""
Gaussian likelihood: the conjugate case, where inference is exact and no approximation is needed
:param data: observed output (Nx1 numpy.ndarray)
:param variance: noise variance
:param normalize: whether to normalize the data before computing (predictions will be in original scales)
:type normalize: False|True
"""
def __init__(self, data, variance=1., normalize=False):
self.is_heteroscedastic = False
self.Nparams = 1
self.Z = 0. # a correction factor which accounts for the approximation made
N, self.output_dim = data.shape
# normalization
if normalize:
self._offset = data.mean(0)[None, :]
self._scale = data.std(0)[None, :]
# Don't scale outputs which have zero variance to zero.
self._scale[np.nonzero(self._scale == 0.)] = 1.0e-3
else:
self._offset = np.zeros((1, self.output_dim))
self._scale = np.ones((1, self.output_dim))
self.set_data(data)
self._variance = np.asarray(variance) + 1. # offset ensures the _set_params call below sees a changed value and initializes everything
self._set_params(np.asarray(variance))
def set_data(self, data):
self.data = data
self.N, D = data.shape
assert D == self.output_dim
self.Y = (self.data - self._offset) / self._scale
if D > self.N:
self.YYT = np.dot(self.Y, self.Y.T)
self.trYYT = np.trace(self.YYT)
self.YYT_factor = jitchol(self.YYT)
else:
self.YYT = None
self.trYYT = np.sum(np.square(self.Y))
self.YYT_factor = self.Y
def _get_params(self):
return np.asarray(self._variance)
def _get_param_names(self):
return ["noise_variance"]
def _set_params(self, x):
x = np.float64(x)
if np.all(self._variance != x):
if x == 0.:#special case of zero noise
self.precision = np.inf
self.V = None
else:
self.precision = 1. / x
self.V = (self.precision) * self.Y
self.VVT_factor = self.precision * self.YYT_factor
self.covariance_matrix = np.eye(self.N) * x
self._variance = x
def predictive_values(self, mu, var, full_cov):
"""
Un-normalize the prediction and add the likelihood variance, then return an approximate 95% interval (mean +/- 2 standard deviations)
"""
mean = mu * self._scale + self._offset
if full_cov:
if self.output_dim > 1:
raise NotImplementedError, "TODO"
# Note. for output_dim>1, we need to re-normalise all the outputs independently.
# This will mess up computations of diag(true_var), below.
# note that the upper, lower quantiles should be the same shape as mean
# Augment the output variance with the likelihood variance and rescale.
true_var = (var + np.eye(var.shape[0]) * self._variance) * self._scale ** 2
_5pc = mean - 2.*np.sqrt(np.diag(true_var))
_95pc = mean + 2.*np.sqrt(np.diag(true_var))
else:
true_var = (var + self._variance) * self._scale ** 2
_5pc = mean - 2.*np.sqrt(true_var)
_95pc = mean + 2.*np.sqrt(true_var)
return mean, true_var, _5pc, _95pc
def fit_full(self):
"""
No approximations needed
"""
pass
def _gradients(self, partial):
return np.sum(partial)
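The normalization set up in __init__ is undone in predictive_values; the round trip in isolation, including the variance rescaling:
import numpy as np

data = 3. + 5. * np.random.randn(100, 2)
offset, scale = data.mean(0)[None, :], data.std(0)[None, :]
Y = (data - offset) / scale                 # what set_data hands to the GP
mu, var = Y[:5], 0.1 * np.ones((5, 2))      # pretend latent predictions
mean = mu * scale + offset                  # un-normalize, as in predictive_values
true_var = (var + 1.0) * scale ** 2         # add a noise variance of 1., rescale
print(np.allclose(mean, data[:5]))          # True: a perfect latent prediction recovers the data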

44
GPy/likelihoods/likelihood.py Normal file
View file

@ -0,0 +1,44 @@
import numpy as np
import copy
class likelihood:
"""
The atom for a likelihood class
This object interfaces the GP and the data. The most basic likelihood
(Gaussian) inherits directly from this, as does the EP algorithm
Some things must be defined for this to work properly:
self.Y : the effective Gaussian target of the GP
self.N, self.D : Y.shape
self.covariance_matrix : the effective (noise) covariance of the GP targets
self.Z : a factor which gets added to the likelihood (0 for a Gaussian, Z_EP for EP)
self.is_heteroscedastic : enables significant computational savings in GP
self.precision : a scalar or vector representation of the effective target precision
self.YYT : (optional) = np.dot(self.Y, self.Y.T) enables computational savings for D>N
self.V : self.precision * self.Y
"""
def __init__(self,data):
raise ValueError, "this class is not to be instantiated"
def _get_params(self):
raise NotImplementedError
def _get_param_names(self):
raise NotImplementedError
def _set_params(self, x):
raise NotImplementedError
def fit(self):
raise NotImplementedError
def _gradients(self, partial):
raise NotImplementedError
def predictive_values(self, mu, var):
raise NotImplementedError
def copy(self):
""" Returns a (deep) copy of the current likelihood """
return copy.deepcopy(self)

166
GPy/likelihoods/likelihood_functions.py Normal file
View file

@ -0,0 +1,166 @@
# Copyright (c) 2012, 2013 Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from scipy import stats
import scipy as sp
import pylab as pb
from ..util.plot import gpplot
from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
import link_functions
class LikelihoodFunction(object):
"""
Base class for likelihood functions used in the EP moment-matching routines
:param link: a link function (see link_functions.py); when it equals the class's analytical link, exact moment matching is used
..Note:: Y values allowed depend on the LikelihoodFunction used
"""
def __init__(self,link):
if link == self._analytical:
self.moments_match = self._moments_match_analytical
else:
assert isinstance(link,link_functions.LinkFunction)
self.link = link
self.moments_match = self._moments_match_numerical
def _preprocess_values(self,Y):
return Y
def _product(self,gp,obs,mu,sigma):
return stats.norm.pdf(gp,loc=mu,scale=sigma) * self._distribution(gp,obs)
def _nlog_product(self,gp,obs,mu,sigma):
return -(-.5*(gp-mu)**2/sigma**2 + self._log_distribution(gp,obs))
def _locate(self,obs,mu,sigma):
"""
Golden Search to find the mode in the _product function (cavity x exact likelihood) and define a grid around it for numerical integration
"""
golden_A = -1 if obs == 0 else np.array([np.log(obs),mu]).min() #Lower limit
golden_B = np.array([np.log(obs),mu]).max() #Upper limit
return sp.optimize.golden(self._nlog_product, args=(obs,mu,sigma), brack=(golden_A,golden_B)) #Better to work with _nlog_product than with _product
def _moments_match_numerical(self,obs,tau,v):
"""
Simpson's Rule is used to calculate the moments numerically; it needs a grid of points as input.
"""
mu = v/tau
sigma = np.sqrt(1./tau)
opt = self._locate(obs,mu,sigma)
width = 3./np.log(max(obs,2))
A = opt - width #Grid's lower limit
B = opt + width #Grid's Upper limit
K = 10*int(np.log(max(obs,150))) #Number of points in the grid
h = (B-A)/K # length of the intervals
grid_x = np.hstack([np.linspace(opt-width,opt,K/2+1)[1:-1], np.linspace(opt,opt+width,K/2+1)]) # grid of points (X axis)
x = np.hstack([A,B,grid_x[range(1,K,2)],grid_x[range(2,K-1,2)]]) # grid_x rearranged, just to make Simpson's algorithm easier
_aux1 = self._product(A,obs,mu,sigma)
_aux2 = self._product(B,obs,mu,sigma)
_aux3 = 4*self._product(grid_x[range(1,K,2)],obs,mu,sigma)
_aux4 = 2*self._product(grid_x[range(2,K-1,2)],obs,mu,sigma)
zeroth = np.hstack((_aux1,_aux2,_aux3,_aux4)) # grid of points (Y axis) rearranged
first = zeroth*x
second = first*x
Z_hat = sum(zeroth)*h/3 # Zero-th moment
mu_hat = sum(first)*h/(3*Z_hat) # First moment
m2 = sum(second)*h/(3*Z_hat) # Second moment
sigma2_hat = m2 - mu_hat**2 # Second central moment
return float(Z_hat), float(mu_hat), float(sigma2_hat)
class Binomial(LikelihoodFunction):
"""
Binomial (Bernoulli) likelihood with a probit link
Y is expected to take values in {0,1}; internally they are mapped to {-1,1}
-----
$$
L(x) = \\Phi (Y_i*f_i)
$$
"""
def __init__(self,link=None):
self._analytical = link_functions.Probit
if not link:
link = self._analytical
super(Binomial, self).__init__(link)
def _distribution(self,gp,obs):
pass
def _log_distribution(self,gp,obs):
pass
def _preprocess_values(self,Y):
"""
Check if the values of the observations correspond to the values
assumed by the likelihood function.
..Note:: Binary classification algorithm works better with classes {-1,1}
"""
Y_prep = Y.copy()
Y1 = Y[Y.flatten()==1].size
Y2 = Y[Y.flatten()==0].size
assert Y1 + Y2 == Y.size, 'Binomial likelihood is meant to be used only with outputs in {0,1}.'
Y_prep[Y.flatten() == 0] = -1
return Y_prep
def _moments_match_analytical(self,data_i,tau_i,v_i):
"""
Moments match of the marginal approximation in EP algorithm
:param data_i: observed datum, in {-1,1} (float)
:param tau_i: precision of the cavity distribution (float)
:param v_i: precision-times-mean (natural parameter) of the cavity distribution (float)
"""
z = data_i*v_i/np.sqrt(tau_i**2 + tau_i)
Z_hat = std_norm_cdf(z)
phi = std_norm_pdf(z)
mu_hat = v_i/tau_i + data_i*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
return Z_hat, mu_hat, sigma2_hat
def predictive_values(self,mu,var):
"""
Compute mean, variance and confidence interval (percentiles 5 and 95) of the prediction
:param mu: mean of the latent variable
:param var: variance of the latent variable
"""
mu = mu.flatten()
var = var.flatten()
mean = stats.norm.cdf(mu/np.sqrt(1+var))
norm_025 = [stats.norm.ppf(.025,m,np.sqrt(v)) for m,v in zip(mu,var)] # ppf takes the standard deviation, not the variance
norm_975 = [stats.norm.ppf(.975,m,np.sqrt(v)) for m,v in zip(mu,var)]
p_025 = stats.norm.cdf(norm_025/np.sqrt(1+var))
p_975 = stats.norm.cdf(norm_975/np.sqrt(1+var))
return mean[:,None], np.nan*var, p_025[:,None], p_975[:,None] # TODO: var
class Poisson(LikelihoodFunction):
"""
Poisson likelihood
Y is expected to take values in {0,1,2,...}
-----
$$
L(x) = \exp(-\lambda) \lambda^{Y_i} / Y_i!
$$
"""
def __init__(self,link=None):
self._analytical = None
if not link:
link = link_functions.Log()
super(Poisson, self).__init__(link)
def _distribution(self,gp,obs):
return stats.poisson.pmf(obs,self.link.inv_transf(gp))
def _log_distribution(self,gp,obs):
return - self.link.inv_transf(gp) + obs * self.link.log_inv_transf(gp)
def predictive_values(self,mu,var):
"""
Compute mean, and confidence interval (percentiles 5 and 95) of the prediction
"""
mean = self.link.transf(mu)#np.exp(mu*self.scale + self.location)
tmp = stats.poisson.ppf(np.array([.025,.975])[:,None],mean.flatten()) # broadcast quantiles against the predicted rates
p_025 = tmp[0]
p_975 = tmp[1]
return mean,np.nan*mean,p_025,p_975 # better variance here TODO
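As a sanity check on Binomial._moments_match_analytical above, the analytic zero-th moment can be compared against brute-force quadrature over the cavity distribution N(v_i/tau_i, 1/tau_i); a minimal sketch with toy numbers (nothing here is part of the GPy API):
import numpy as np
from scipy import stats
y, tau, v = 1., 2., .6                                 # toy label, cavity precision and precision*mean
f = np.linspace(-10., 10., 20001)
cavity = stats.norm.pdf(f, v/tau, np.sqrt(1./tau))
Z_num = np.trapz(stats.norm.cdf(y*f) * cavity, f)
Z_ana = stats.norm.cdf(y*v/np.sqrt(tau**2 + tau))
print Z_num, Z_ana                                     # agree to quadrature precision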

33 GPy/likelihoods/link_functions.py Normal file

@@ -0,0 +1,33 @@
# Copyright (c) 2012, 2013 Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from scipy import stats
import scipy as sp
import pylab as pb
from ..util.plot import gpplot
from ..util.univariate_Gaussian import std_norm_pdf,std_norm_cdf
class LinkFunction(object):
"""
Link function class for doing non-Gaussian likelihoods approximation
:param Y: observed output (Nx1 numpy.darray)
..Note:: Y values allowed depend on the likelihood_function used
"""
def __init__(self):
pass
class Probit(LinkFunction):
"""
Probit link function: squashes latent function values into (0,1)
"""
def transf(self,mu):
pass
def inv_transf(self,f):
pass
def log_inv_transf(self,f):
pass

7 GPy/mappings/__init__.py Normal file

@@ -0,0 +1,7 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernel import Kernel
from linear import Linear
from mlp import MLP
#from rbf import RBF

60 GPy/mappings/kernel.py Normal file

@@ -0,0 +1,60 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core.mapping import Mapping
import GPy
class Kernel(Mapping):
"""
Mapping based on a kernel/covariance function.
.. math::
f(\mathbf{x}^*) = \mathbf{A}^\top\mathbf{k}(\mathbf{X}, \mathbf{x}^*) + \mathbf{b}
:param X: input observations containing :math:`\mathbf{X}`
:type X: ndarray
:param output_dim: dimension of output.
:type output_dim: int
:param kernel: a GPy kernel, defaults to GPy.kern.rbf
:type kernel: GPy.kern.kern
"""
def __init__(self, X, output_dim=1, kernel=None):
Mapping.__init__(self, input_dim=X.shape[1], output_dim=output_dim)
if kernel is None:
kernel = GPy.kern.rbf(self.input_dim)
self.kern = kernel
self.X = X
self.num_data = X.shape[0]
self.num_params = self.output_dim*(self.num_data + 1)
self.A = np.empty((self.num_data, self.output_dim)) # placeholder; filled by randomize() below
self.bias = np.empty(self.output_dim)
self.randomize()
self.name = 'kernel'
def _get_param_names(self):
return sum([['A_%i_%i' % (n, d) for d in range(self.output_dim)] for n in range(self.num_data)], []) + ['bias_%i' % d for d in range(self.output_dim)]
def _get_params(self):
return np.hstack((self.A.flatten(), self.bias))
def _set_params(self, x):
self.A = x[:self.num_data * self.output_dim].reshape(self.num_data, self.output_dim).copy()
self.bias = x[self.num_data*self.output_dim:].copy()
def randomize(self):
self.A = np.random.randn(self.num_data, self.output_dim)/np.sqrt(self.num_data+1)
self.bias = np.random.randn(self.output_dim)/np.sqrt(self.num_data+1)
def f(self, X):
return np.dot(self.kern.K(X, self.X),self.A) + self.bias
def df_dtheta(self, dL_df, X):
self._df_dA = (dL_df[:, :, None]*self.kern.K(X, self.X)[:, None, :]).sum(0).T
self._df_dbias = (dL_df.sum(0))
return np.hstack((self._df_dA.flatten(), self._df_dbias))
def df_dX(self, dL_df, X):
return self.kern.dK_dX((dL_df[:, None, :]*self.A[None, :, :]).sum(2), X, self.X)
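A minimal usage sketch for the mapping above (toy data; assumes GPy is importable and that the default rbf kernel is acceptable):
import numpy as np
import GPy
X = np.random.randn(20, 2)
mapping = GPy.mappings.Kernel(X=X, output_dim=1)       # A and bias are randomized at construction
F = mapping.f(np.random.randn(5, 2))                   # (5, 1) array of mapped outputs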

53 GPy/mappings/linear.py Normal file

@@ -0,0 +1,53 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core.mapping import Mapping
class Linear(Mapping):
"""
Mapping based on a linear model.
.. math::
f(\mathbf{x}^*) = \mathbf{W}^\top\mathbf{x}^* + \mathbf{b}
:param input_dim: dimension of input.
:type input_dim: int
:param output_dim: dimension of output.
:type output_dim: int
"""
def __init__(self, input_dim=1, output_dim=1):
self.name = 'linear'
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
self.num_params = self.output_dim*(self.input_dim + 1)
self.W = np.empty((self.input_dim, self.output_dim)) # placeholder; filled by randomize() below
self.bias = np.empty(self.output_dim)
self.randomize()
def _get_param_names(self):
return sum([['W_%i_%i' % (n, d) for d in range(self.output_dim)] for n in range(self.input_dim)], []) + ['bias_%i' % d for d in range(self.output_dim)]
def _get_params(self):
return np.hstack((self.W.flatten(), self.bias))
def _set_params(self, x):
self.W = x[:self.input_dim * self.output_dim].reshape(self.input_dim, self.output_dim).copy()
self.bias = x[self.input_dim*self.output_dim:].copy()
def randomize(self):
self.W = np.random.randn(self.input_dim, self.output_dim)/np.sqrt(self.input_dim + 1)
self.bias = np.random.randn(self.output_dim)/np.sqrt(self.input_dim + 1)
def f(self, X):
return np.dot(X,self.W) + self.bias
def df_dtheta(self, dL_df, X):
self._df_dW = (dL_df[:, :, None]*X[:, None, :]).sum(0).T
self._df_dbias = (dL_df.sum(0))
return np.hstack((self._df_dW.flatten(), self._df_dbias))
def df_dX(self, dL_df, X):
return (dL_df[:, None, :]*self.W[None, :, :]).sum(2)

127 GPy/mappings/mlp.py Normal file

@@ -0,0 +1,127 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core.mapping import Mapping
class MLP(Mapping):
"""
Mapping based on a multi-layer perceptron neural network model.
.. math::
f(\mathbf{x}^*) = \mathbf{W}^0\boldsymbol{\phi}(\mathbf{W}^1\mathbf{x}^*+\mathbf{b}^1) + \mathbf{b}^0
where
.. math::
\phi(\cdot) = \text{tanh}(\cdot)
:param input_dim: dimension of input.
:type input_dim: int
:param output_dim: dimension of output.
:type output_dim: int
:param hidden_dim: dimension of hidden layer. If it is an int, there is one hidden layer of the given dimension. If it is a list of ints there are as many hidden layers as the length of the list, each with the given number of hidden nodes in it.
:type hidden_dim: int or list of ints.
"""
def __init__(self, input_dim=1, output_dim=1, hidden_dim=3):
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
self.name = 'mlp'
if isinstance(hidden_dim, int):
hidden_dim = [hidden_dim]
self.hidden_dim = hidden_dim
self.activation = [None]*len(self.hidden_dim)
self.W = []
self._dL_dW = []
self.bias = []
self._dL_dbias = []
self.W.append(np.zeros((self.input_dim, self.hidden_dim[0])))
self._dL_dW.append(np.zeros((self.input_dim, self.hidden_dim[0])))
self.bias.append(np.zeros(self.hidden_dim[0]))
self._dL_dbias.append(np.zeros(self.hidden_dim[0]))
self.num_params = self.hidden_dim[0]*(self.input_dim+1)
for h1, h0 in zip(hidden_dim[1:], hidden_dim[0:-1]):
self.W.append(np.zeros((h0, h1)))
self._dL_dW.append(np.zeros((h0, h1)))
self.bias.append(np.zeros(h1))
self._dL_dbias.append(np.zeros(h1))
self.num_params += h1*(h0+1)
self.W.append(np.zeros((self.hidden_dim[-1], self.output_dim)))
self._dL_dW.append(np.zeros((self.hidden_dim[-1], self.output_dim)))
self.bias.append(np.zeros(self.output_dim))
self._dL_dbias.append(np.zeros(self.output_dim))
self.num_params += self.output_dim*(self.hidden_dim[-1]+1)
self.randomize()
def _get_param_names(self):
return sum([['W%i_%i_%i' % (i, n, d) for n in range(self.W[i].shape[0]) for d in range(self.W[i].shape[1])] + ['bias%i_%i' % (i, d) for d in range(self.W[i].shape[1])] for i in range(len(self.W))], [])
def _get_params(self):
param = np.array([])
for W, bias in zip(self.W, self.bias):
param = np.hstack((param, W.flatten(), bias))
return param
def _set_params(self, x):
start = 0
for W, bias in zip(self.W, self.bias):
end = W.shape[0]*W.shape[1]+start
W[:] = x[start:end].reshape(W.shape[0], W.shape[1]).copy()
start = end
end = W.shape[1]+end
bias[:] = x[start:end].copy()
start = end
def randomize(self):
for W, bias in zip(self.W, self.bias):
W[:] = np.random.randn(W.shape[0], W.shape[1])/np.sqrt(W.shape[0]+1)
bias[:] = np.random.randn(W.shape[1])/np.sqrt(W.shape[0]+1)
def f(self, X):
self._f_computations(X)
return np.dot(np.tanh(self.activation[-1]), self.W[-1]) + self.bias[-1]
def _f_computations(self, X):
W = self.W[0]
bias = self.bias[0]
self.activation[0] = np.dot(X,W) + bias
for W, bias, index in zip(self.W[1:-1], self.bias[1:-1], range(1, len(self.activation))):
self.activation[index] = np.dot(np.tanh(self.activation[index-1]), W)+bias
def df_dtheta(self, dL_df, X):
self._df_computations(dL_df, X)
g = np.array([])
for gW, gbias in zip(self._dL_dW, self._dL_dbias):
g = np.hstack((g, gW.flatten(), gbias))
return g
def _df_computations(self, dL_df, X):
self._f_computations(X)
a0 = self.activation[-1]
W = self.W[-1]
self._dL_dW[-1] = (dL_df[:, :, None]*np.tanh(a0[:, None, :])).sum(0).T
dL_dta=(dL_df[:, None, :]*W[None, :, :]).sum(2)
self._dL_dbias[-1] = (dL_df.sum(0))
for dL_dW, dL_dbias, W, bias, a0, a1 in zip(self._dL_dW[-2:0:-1],
self._dL_dbias[-2:0:-1],
self.W[-2:0:-1],
self.bias[-2:0:-1],
self.activation[-2::-1],
self.activation[-1:0:-1]):
ta = np.tanh(a1)
dL_da = dL_dta*(1-ta*ta)
dL_dW[:] = (dL_da[:, :, None]*np.tanh(a0[:, None, :])).sum(0).T
dL_dbias[:] = (dL_da.sum(0))
dL_dta = (dL_da[:, None, :]*W[None, :, :]).sum(2)
ta = np.tanh(self.activation[0])
dL_da = dL_dta*(1-ta*ta)
W = self.W[0]
self._dL_dW[0] = (dL_da[:, :, None]*X[:, None, :]).sum(0).T
self._dL_dbias[0] = (dL_da.sum(0))
self._dL_dX = (dL_da[:, None, :]*W[None, :, :]).sum(2)
def df_dX(self, dL_df, X):
self._df_computations(dL_df, X)
return self._dL_dX
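The backpropagation above can be checked numerically with the GradientChecker model added elsewhere in this commit; a sketch for the gradient with respect to the inputs (L = sum of the outputs, so dL_df is a matrix of ones):
import numpy as np
import GPy
mlp = GPy.mappings.MLP(input_dim=2, output_dim=1, hidden_dim=4)
X0 = np.random.randn(10, 2)
ones = np.ones((X0.shape[0], mlp.output_dim))
grad = GPy.models.GradientChecker(mlp.f, lambda X: mlp.df_dX(ones, X), X0, names='X')
grad.checkgrad(verbose=1)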

16 GPy/models/__init__.py Normal file

@@ -0,0 +1,16 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from gp_regression import GPRegression
from gp_classification import GPClassification
from sparse_gp_regression import SparseGPRegression
from svigp_regression import SVIGPRegression
from sparse_gp_classification import SparseGPClassification
from fitc_classification import FITCClassification
from gplvm import GPLVM
from bcgplvm import BCGPLVM
from sparse_gplvm import SparseGPLVM
from warped_gp import WarpedGP
from bayesian_gplvm import BayesianGPLVM
from mrd import MRD
from gradient_checker import GradientChecker

334 GPy/models/bayesian_gplvm.py Normal file

@@ -0,0 +1,334 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core import SparseGP
from ..likelihoods import Gaussian
from .. import kern
import itertools
from matplotlib.colors import colorConverter
from GPy.inference.optimization import SCG
from GPy.util import plot_latent
from GPy.models.gplvm import GPLVM
from GPy.util.plot_latent import most_significant_input_dimensions
from matplotlib import pyplot
class BayesianGPLVM(SparseGP, GPLVM):
"""
Bayesian Gaussian Process Latent Variable Model
:param Y: observed data (np.ndarray) or GPy.likelihood
:type Y: np.ndarray| GPy.likelihood instance
:param input_dim: latent dimensionality
:type input_dim: int
:param init: initialisation method for the latent space
:type init: 'PCA'|'random'
"""
def __init__(self, likelihood_or_Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
Z=None, kernel=None, **kwargs):
if type(likelihood_or_Y) is np.ndarray:
likelihood = Gaussian(likelihood_or_Y)
else:
likelihood = likelihood_or_Y
if X is None:
X = self.initialise_latent(init, input_dim, likelihood.Y)
self.init = init
if X_variance is None:
X_variance = np.clip((np.ones_like(X) * 0.5) + .01 * np.random.randn(*X.shape), 0.001, 1)
if Z is None:
Z = np.random.permutation(X.copy())[:num_inducing]
assert Z.shape[1] == X.shape[1]
if kernel is None:
kernel = kern.rbf(input_dim) # + kern.white(input_dim)
SparseGP.__init__(self, X, likelihood, kernel, Z=Z, X_variance=X_variance, **kwargs)
self.ensure_default_constraints()
def getstate(self):
"""
Get the current state of the class,
here just all the indices, rest can get recomputed
"""
return SparseGP.getstate(self) + [self.init]
def setstate(self, state):
self._const_jitter = None
self.init = state.pop()
SparseGP.setstate(self, state)
def _get_param_names(self):
X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
return (X_names + S_names + SparseGP._get_param_names(self))
def _get_print_names(self):
return SparseGP._get_print_names(self)
def _get_params(self):
"""
Horizontally stacks the parameters in order to present them to the optimizer.
The resulting 1-input_dim array has this structure:
===============================================================
| mu | S | Z | theta | beta |
===============================================================
"""
x = np.hstack((self.X.flatten(), self.X_variance.flatten(), SparseGP._get_params(self)))
return x
def _set_params(self, x, save_old=True, save_count=0):
N, input_dim = self.num_data, self.input_dim
self.X = x[:self.X.size].reshape(N, input_dim).copy()
self.X_variance = x[(N * input_dim):(2 * N * input_dim)].reshape(N, input_dim).copy()
SparseGP._set_params(self, x[(2 * N * input_dim):])
def dKL_dmuS(self):
dKL_dS = (1. - (1. / (self.X_variance))) * 0.5
dKL_dmu = self.X
return dKL_dmu, dKL_dS
def dL_dmuS(self):
dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi0_dmuS(self.dL_dpsi0, self.Z, self.X, self.X_variance)
dL_dmu_psi1, dL_dS_psi1 = self.kern.dpsi1_dmuS(self.dL_dpsi1, self.Z, self.X, self.X_variance)
dL_dmu_psi2, dL_dS_psi2 = self.kern.dpsi2_dmuS(self.dL_dpsi2, self.Z, self.X, self.X_variance)
dL_dmu = dL_dmu_psi0 + dL_dmu_psi1 + dL_dmu_psi2
dL_dS = dL_dS_psi0 + dL_dS_psi1 + dL_dS_psi2
return dL_dmu, dL_dS
def KL_divergence(self):
var_mean = np.square(self.X).sum()
var_S = np.sum(self.X_variance - np.log(self.X_variance))
return 0.5 * (var_mean + var_S) - 0.5 * self.input_dim * self.num_data
def log_likelihood(self):
ll = SparseGP.log_likelihood(self)
kl = self.KL_divergence()
return ll - kl
def _log_likelihood_gradients(self):
dKL_dmu, dKL_dS = self.dKL_dmuS()
dL_dmu, dL_dS = self.dL_dmuS()
d_dmu = (dL_dmu - dKL_dmu).flatten()
d_dS = (dL_dS - dKL_dS).flatten()
self.dbound_dmuS = np.hstack((d_dmu, d_dS))
self.dbound_dZtheta = SparseGP._log_likelihood_gradients(self)
return np.hstack((self.dbound_dmuS.flatten(), self.dbound_dZtheta))
def plot_latent(self, plot_inducing=True, *args, **kwargs):
return plot_latent.plot_latent(self, plot_inducing=plot_inducing, *args, **kwargs)
def do_test_latents(self, Y):
"""
Compute the latent representation for a set of new points Y
Notes:
This will only work with a univariate Gaussian likelihood (for now)
"""
assert not self.likelihood.is_heteroscedastic
N_test = Y.shape[0]
input_dim = self.Z.shape[1]
means = np.zeros((N_test, input_dim))
covars = np.zeros((N_test, input_dim))
dpsi0 = -0.5 * self.input_dim * self.likelihood.precision
dpsi2 = self.dL_dpsi2[0][None, :, :] # TODO: this may change if we ignore het. likelihoods
V = self.likelihood.precision * Y
dpsi1 = np.dot(self.Cpsi1V, V.T)
start = np.zeros(self.input_dim * 2)
for n, dpsi1_n in enumerate(dpsi1.T[:, :, None]):
args = (self.kern, self.Z, dpsi0, dpsi1_n, dpsi2)
xopt, fopt, neval, status = SCG(f=latent_cost, gradf=latent_grad, x=start, optargs=args, display=False)
mu, log_S = xopt.reshape(2, 1, -1)
means[n] = mu[0].copy()
covars[n] = np.exp(log_S[0]).copy()
return means, covars
def dmu_dX(self, Xnew):
"""
Calculate the gradient of the prediction at Xnew w.r.t Xnew.
"""
dmu_dX = np.zeros_like(Xnew)
for i in range(self.Z.shape[0]):
dmu_dX += self.kern.dK_dX(self.Cpsi1Vf[i:i + 1, :], Xnew, self.Z[i:i + 1, :])
return dmu_dX
def dmu_dXnew(self, Xnew):
"""
Individual gradient of prediction at Xnew w.r.t. each sample in Xnew
"""
dK_dX = np.zeros((Xnew.shape[0], self.num_inducing))
ones = np.ones((1, 1))
for i in range(self.Z.shape[0]):
dK_dX[:, i] = self.kern.dK_dX(ones, Xnew, self.Z[i:i + 1, :]).sum(-1)
return np.dot(dK_dX, self.Cpsi1Vf)
def plot_steepest_gradient_map(self, fignum=None, ax=None, which_indices=None, labels=None, data_labels=None, data_marker='o', data_s=40, resolution=20, aspect='auto', updates=False, ** kwargs):
input_1, input_2 = significant_dims = most_significant_input_dimensions(self, which_indices)
X = np.zeros((resolution ** 2, self.input_dim))
indices = np.r_[:X.shape[0]]
if labels is None:
labels = range(self.output_dim)
def plot_function(x):
X[:, significant_dims] = x
dmu_dX = self.dmu_dXnew(X)
argmax = np.argmax(dmu_dX, 1)
return dmu_dX[indices, argmax], np.array(labels)[argmax]
if ax is None:
fig = pyplot.figure(num=fignum)
ax = fig.add_subplot(111)
if data_labels is None:
data_labels = np.ones(self.num_data)
ulabels = []
for lab in data_labels:
if not lab in ulabels:
ulabels.append(lab)
marker = itertools.cycle(list(data_marker))
from GPy.util import Tango
for i, ul in enumerate(ulabels):
if type(ul) is np.string_:
this_label = ul
elif type(ul) is np.int64:
this_label = 'class %i' % ul
else:
this_label = 'class %i' % i
m = marker.next()
index = np.nonzero(data_labels == ul)[0]
x = self.X[index, input_1]
y = self.X[index, input_2]
ax.scatter(x, y, marker=m, s=data_s, color=Tango.nextMedium(), label=this_label)
ax.set_xlabel('latent dimension %i' % input_1)
ax.set_ylabel('latent dimension %i' % input_2)
from matplotlib.cm import get_cmap
from GPy.util.latent_space_visualizations.controllers.imshow_controller import ImAnnotateController
controller = ImAnnotateController(ax,
plot_function,
tuple(self.X.min(0)[:, significant_dims]) + tuple(self.X.max(0)[:, significant_dims]),
resolution=resolution,
aspect=aspect,
cmap=get_cmap('jet'),
**kwargs)
ax.legend()
ax.figure.tight_layout()
if updates:
pyplot.show()
clear = raw_input('Enter to continue')
if clear.lower() in 'yes' or clear == '':
controller.deactivate()
return controller.view
def plot_X_1d(self, fignum=None, ax=None, colors=None):
"""
Plot latent space X in 1D:
-if fig is given, create input_dim subplots in fig and plot in these
-if ax is given plot input_dim 1D latent space plots of X into each `axis`
-if neither fig nor ax is given create a figure with fignum and plot in there
colors:
colors of different latent space dimensions input_dim
"""
import pylab
if ax is None:
fig = pylab.figure(num=fignum, figsize=(8, min(12, (2 * self.X.shape[1]))))
if colors is None:
colors = pylab.gca()._get_lines.color_cycle
pylab.clf()
else:
colors = iter(colors)
plots = []
x = np.arange(self.X.shape[0])
for i in range(self.X.shape[1]):
if ax is None:
a = fig.add_subplot(self.X.shape[1], 1, i + 1)
elif isinstance(ax, (tuple, list)):
a = ax[i]
else:
raise ValueError("Need one ax per latent dimnesion input_dim")
a.plot(self.X, c='k', alpha=.3)
plots.extend(a.plot(x, self.X.T[i], c=colors.next(), label=r"$\mathbf{{X_{{{}}}}}$".format(i)))
a.fill_between(x,
self.X.T[i] - 2 * np.sqrt(self.X_variance.T[i]),
self.X.T[i] + 2 * np.sqrt(self.X_variance.T[i]),
facecolor=plots[-1].get_color(),
alpha=.3)
a.legend(borderaxespad=0.)
a.set_xlim(x.min(), x.max())
if i < self.X.shape[1] - 1:
a.set_xticklabels('')
pylab.draw()
fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
return fig
def latent_cost_and_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
"""
objective function for fitting the latent variables for test points
(negative log-likelihood: should be minimised!)
"""
mu, log_S = mu_S.reshape(2, 1, -1)
S = np.exp(log_S)
psi0 = kern.psi0(Z, mu, S)
psi1 = kern.psi1(Z, mu, S)
psi2 = kern.psi2(Z, mu, S)
lik = dL_dpsi0 * psi0 + np.dot(dL_dpsi1.flatten(), psi1.flatten()) + np.dot(dL_dpsi2.flatten(), psi2.flatten()) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S)
mu0, S0 = kern.dpsi0_dmuS(dL_dpsi0, Z, mu, S)
mu1, S1 = kern.dpsi1_dmuS(dL_dpsi1, Z, mu, S)
mu2, S2 = kern.dpsi2_dmuS(dL_dpsi2, Z, mu, S)
dmu = mu0 + mu1 + mu2 - mu
# dS = S0 + S1 + S2 -0.5 + .5/S
dlnS = S * (S0 + S1 + S2 - 0.5) + .5
return -lik, -np.hstack((dmu.flatten(), dlnS.flatten()))
def latent_cost(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
"""
objective function for fitting the latent variables (negative log-likelihood: should be minimised!)
This is the same as latent_cost_and_grad but only for the objective
"""
mu, log_S = mu_S.reshape(2, 1, -1)
S = np.exp(log_S)
psi0 = kern.psi0(Z, mu, S)
psi1 = kern.psi1(Z, mu, S)
psi2 = kern.psi2(Z, mu, S)
lik = dL_dpsi0 * psi0 + np.dot(dL_dpsi1.flatten(), psi1.flatten()) + np.dot(dL_dpsi2.flatten(), psi2.flatten()) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S)
return -float(lik)
def latent_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
"""
This is the same as latent_cost_and_grad but only for the grad
"""
mu, log_S = mu_S.reshape(2, 1, -1)
S = np.exp(log_S)
mu0, S0 = kern.dpsi0_dmuS(dL_dpsi0, Z, mu, S)
mu1, S1 = kern.dpsi1_dmuS(dL_dpsi1, Z, mu, S)
mu2, S2 = kern.dpsi2_dmuS(dL_dpsi2, Z, mu, S)
dmu = mu0 + mu1 + mu2 - mu
# dS = S0 + S1 + S2 -0.5 + .5/S
dlnS = S * (S0 + S1 + S2 - 0.5) + .5
return -np.hstack((dmu.flatten(), dlnS.flatten()))
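A minimal fitting sketch for the model above (toy data; optimize() is assumed to come from the Model base class):
import numpy as np
import GPy
Y = np.random.randn(40, 5)
m = GPy.models.BayesianGPLVM(Y, input_dim=2, num_inducing=10)
m.optimize()
m.plot_latent()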

50 GPy/models/bcgplvm.py Normal file

@@ -0,0 +1,50 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
import sys, pdb
from ..core import GP
from ..models import GPLVM
from ..mappings import *
class BCGPLVM(GPLVM):
"""
Back constrained Gaussian Process Latent Variable Model
:param Y: observed data
:type Y: np.ndarray
:param input_dim: latent dimensionality
:type input_dim: int
:param init: initialisation method for the latent space
:type init: 'PCA'|'random'
:param mapping: mapping for back constraint
:type mapping: GPy.core.Mapping object
"""
def __init__(self, Y, input_dim, init='PCA', X=None, kernel=None, normalize_Y=False, mapping=None):
if mapping is None:
mapping = Kernel(X=Y, output_dim=input_dim)
self.mapping = mapping
GPLVM.__init__(self, Y, input_dim, init, X, kernel, normalize_Y)
self.X = self.mapping.f(self.likelihood.Y)
def _get_param_names(self):
return self.mapping._get_param_names() + GP._get_param_names(self)
def _get_params(self):
return np.hstack((self.mapping._get_params(), GP._get_params(self)))
def _set_params(self, x):
self.mapping._set_params(x[:self.mapping.num_params])
self.X = self.mapping.f(self.likelihood.Y)
GP._set_params(self, x[self.mapping.num_params:])
def _log_likelihood_gradients(self):
dL_df = 2.*self.kern.dK_dX(self.dL_dK, self.X)
dL_dtheta = self.mapping.df_dtheta(dL_df, self.likelihood.Y)
return np.hstack((dL_dtheta.flatten(), GP._log_likelihood_gradients(self)))

47 GPy/models/fitc_classification.py Normal file

@@ -0,0 +1,47 @@
# Copyright (c) 2013, Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core import FITC
from .. import likelihoods
from .. import kern
from ..likelihoods import likelihood
class FITCClassification(FITC):
"""
FITC approximation for classification
This is a thin wrapper around the FITC class, with a set of sensible defaults
:param X: input observations
:param Y: observed values
:param likelihood: a GPy likelihood, defaults to Binomial with probit link function
:param kernel: a GPy kernel, defaults to rbf+white
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_X: False|True
:param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_Y: False|True
:rtype: model object
"""
def __init__(self, X, Y=None, likelihood=None, kernel=None, normalize_X=False, normalize_Y=False, Z=None, num_inducing=10):
if kernel is None:
kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1],1e-3)
if likelihood is None:
distribution = likelihoods.likelihood_functions.Binomial()
likelihood = likelihoods.EP(Y, distribution)
elif Y is not None:
if not all(Y.flatten() == likelihood.data.flatten()):
raise Warning, 'likelihood.data and Y are different.'
if Z is None:
i = np.random.permutation(X.shape[0])[:num_inducing]
Z = X[i].copy()
else:
assert Z.shape[1]==X.shape[1]
FITC.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X)
self.ensure_default_constraints()

41 GPy/models/gp_classification.py Normal file

@@ -0,0 +1,41 @@
# Copyright (c) 2013, Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core import GP
from .. import likelihoods
from .. import kern
class GPClassification(GP):
"""
Gaussian Process classification
This is a thin wrapper around the models.GP class, with a set of sensible defaults
:param X: input observations
:param Y: observed values
:param likelihood: a GPy likelihood, defaults to Binomial with probit link_function
:param kernel: a GPy kernel, defaults to rbf
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_X: False|True
:param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_Y: False|True
.. Note:: Multiple independent outputs are allowed using columns of Y
"""
def __init__(self,X,Y=None,likelihood=None,kernel=None,normalize_X=False,normalize_Y=False):
if kernel is None:
kernel = kern.rbf(X.shape[1])
if likelihood is None:
distribution = likelihoods.likelihood_functions.Binomial()
likelihood = likelihoods.EP(Y, distribution)
elif Y is not None:
if not all(Y.flatten() == likelihood.data.flatten()):
raise Warning, 'likelihood.data and Y are different.'
GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
self.ensure_default_constraints()
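A minimal usage sketch for the classifier above (toy 1-d data; update_likelihood_approximation runs the EP approximation, as also called in mrd.py later in this commit):
import numpy as np
import GPy
X = np.random.randn(40, 1)
Y = np.where(X > 0, 1, 0)                              # labels in {0,1}, remapped to {-1,1} internally
m = GPy.models.GPClassification(X, Y)
m.update_likelihood_approximation()
m.optimize()
probs = m.predict(X)[0]                                # predictive class probabilities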

43 GPy/models/gp_regression.py Normal file

@@ -0,0 +1,43 @@
# Copyright (c) 2012, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core import GP
from .. import likelihoods
from .. import kern
class GPRegression(GP):
"""
Gaussian Process model for regression
This is a thin wrapper around the models.GP class, with a set of sensible defaults
:param X: input observations
:param Y: observed values
:param kernel: a GPy kernel, defaults to rbf
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_X: False|True
:param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_Y: False|True
.. Note:: Multiple independent outputs are allowed using columns of Y
"""
def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False):
if kernel is None:
kernel = kern.rbf(X.shape[1])
likelihood = likelihoods.Gaussian(Y, normalize=normalize_Y)
GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
self.ensure_default_constraints()
def getstate(self):
return GP.getstate(self)
def setstate(self, state):
return GP.setstate(self, state)
pass
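A minimal usage sketch for the regression wrapper above (toy 1-d data; assumes the four-value predict signature used in warped_gp.py later in this commit):
import numpy as np
import GPy
X = np.linspace(0., 10., 50)[:, None]
Y = np.sin(X) + .1*np.random.randn(50, 1)
m = GPy.models.GPRegression(X, Y)
m.optimize()
mu, var, lower, upper = m.predict(X)                   # mean, variance, 2.5 and 97.5 percentiles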

93 GPy/models/gplvm.py Normal file

@@ -0,0 +1,93 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
import sys, pdb
from .. import kern
from ..core import Model
from ..util.linalg import pdinv, PCA
from ..core.priors import Gaussian as Gaussian_prior
from ..core import GP
from ..likelihoods import Gaussian
from .. import util
from GPy.util import plot_latent
class GPLVM(GP):
"""
Gaussian Process Latent Variable Model
:param Y: observed data
:type Y: np.ndarray
:param input_dim: latent dimensionality
:type input_dim: int
:param init: initialisation method for the latent space
:type init: 'PCA'|'random'
"""
def __init__(self, Y, input_dim, init='PCA', X=None, kernel=None, normalize_Y=False):
if X is None:
X = self.initialise_latent(init, input_dim, Y)
if kernel is None:
kernel = kern.rbf(input_dim, ARD=input_dim > 1) + kern.bias(input_dim, np.exp(-2))
likelihood = Gaussian(Y, normalize=normalize_Y, variance=np.exp(-2.))
GP.__init__(self, X, likelihood, kernel, normalize_X=False)
self.set_prior('.*X', Gaussian_prior(0, 1))
self.ensure_default_constraints()
def initialise_latent(self, init, input_dim, Y):
Xr = np.random.randn(Y.shape[0], input_dim)
if init == 'PCA':
PC = PCA(Y, input_dim)[0]
Xr[:PC.shape[0], :PC.shape[1]] = PC
return Xr
def getstate(self):
return GP.getstate(self)
def setstate(self, state):
GP.setstate(self, state)
def _get_param_names(self):
return sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], []) + GP._get_param_names(self)
def _get_params(self):
return np.hstack((self.X.flatten(), GP._get_params(self)))
def _set_params(self, x):
self.X = x[:self.num_data * self.input_dim].reshape(self.num_data, self.input_dim).copy()
GP._set_params(self, x[self.X.size:])
def _log_likelihood_gradients(self):
dL_dX = 2.*self.kern.dK_dX(self.dL_dK, self.X)
return np.hstack((dL_dX.flatten(), GP._log_likelihood_gradients(self)))
def jacobian(self,X):
target = np.zeros((X.shape[0],X.shape[1],self.output_dim))
for i in range(self.output_dim):
target[:,:,i]=self.kern.dK_dX(np.dot(self.Ki,self.likelihood.Y[:,i])[None, :],X,self.X)
return target
def magnification(self,X):
target=np.zeros(X.shape[0])
J = np.zeros((X.shape[0],X.shape[1],self.output_dim))
J=self.jacobian(X)
for i in range(X.shape[0]):
target[i]=np.sqrt(pb.det(np.dot(J[i,:,:],np.transpose(J[i,:,:]))))
return target
def plot(self):
assert self.likelihood.Y.shape[1] == 2
pb.scatter(self.likelihood.Y[:, 0], self.likelihood.Y[:, 1], 40, self.X[:, 0].copy(), linewidth=0, cmap=pb.cm.jet)
Xnew = np.linspace(self.X.min(), self.X.max(), 200)[:, None]
mu, var, upper, lower = self.predict(Xnew)
pb.plot(mu[:, 0], mu[:, 1], 'k', linewidth=1.5)
def plot_latent(self, *args, **kwargs):
return util.plot_latent.plot_latent(self, *args, **kwargs)
def plot_magnification(self, *args, **kwargs):
return util.plot_latent.plot_magnification(self, *args, **kwargs)

114 GPy/models/gradient_checker.py Normal file

@@ -0,0 +1,114 @@
'''
Created on 17 Jul 2013
@author: maxz
'''
from GPy.core.model import Model
import itertools
import numpy
def get_shape(x):
if isinstance(x, numpy.ndarray):
return x.shape
return ()
def at_least_one_element(x):
if isinstance(x, (list, tuple)):
return x
return [x]
def flatten_if_needed(x):
return numpy.atleast_1d(x).flatten()
class GradientChecker(Model):
def __init__(self, f, df, x0, names=None, *args, **kwargs):
"""
:param f: Function to check gradient for
:param df: Gradient of function to check
:param x0:
Initial guess for inputs x (if it has a shape (a,b) this will be reflected in the parameter names).
Can be a list of arrays, if takes a list of arrays. This list will be passed
to f and df in the same order as given here.
If only one argument, make sure not to pass a list!!!
:type x0: [array-like] | array-like | float | int
:param names:
Names to print, when performing gradcheck. If a list was passed to x0
a list of names with the same length is expected.
:param args: Arguments passed as f(x, *args, **kwargs) and df(x, *args, **kwargs)
Examples:
---------
from GPy.models import GradientChecker
N, M, Q = 10, 5, 3
Sinusoid:
X = numpy.random.rand(N, Q)
grad = GradientChecker(numpy.sin,numpy.cos,X,'x')
grad.checkgrad(verbose=1)
Using GPy:
X, Z = numpy.random.randn(N,Q), numpy.random.randn(M,Q)
kern = GPy.kern.linear(Q, ARD=True) + GPy.kern.rbf(Q, ARD=True)
grad = GradientChecker(kern.K,
lambda x: 2*kern.dK_dX(numpy.ones((1,1)), x),
x0 = X.copy(),
names='X')
grad.checkgrad(verbose=1)
grad.randomize()
grad.checkgrad(verbose=1)
"""
Model.__init__(self)
if isinstance(x0, (list, tuple)) and names is None:
self.shapes = [get_shape(xi) for xi in x0]
self.names = ['X{i}'.format(i=i) for i in range(len(x0))]
elif isinstance(x0, (list, tuple)) and names is not None:
self.shapes = [get_shape(xi) for xi in x0]
self.names = names
elif names is None:
self.names = ['X']
self.shapes = [get_shape(x0)]
else:
self.names = names
self.shapes = [get_shape(x0)]
for name, xi in zip(self.names, at_least_one_element(x0)):
self.__setattr__(name, xi)
# self._param_names = []
# for name, shape in zip(self.names, self.shapes):
# self._param_names.extend(map(lambda nameshape: ('_'.join(nameshape)).strip('_'), itertools.izip(itertools.repeat(name), itertools.imap(lambda t: '_'.join(map(str, t)), itertools.product(*map(lambda xi: range(xi), shape))))))
self.args = args
self.kwargs = kwargs
self.f = f
self.df = df
def _get_x(self):
if len(self.names) > 1:
return [self.__getattribute__(name) for name in self.names] + list(self.args)
return [self.__getattribute__(self.names[0])] + list(self.args)
def log_likelihood(self):
return float(numpy.sum(self.f(*self._get_x(), **self.kwargs)))
def _log_likelihood_gradients(self):
return numpy.atleast_1d(self.df(*self._get_x(), **self.kwargs)).flatten()
def _get_params(self):
return numpy.atleast_1d(numpy.hstack(map(lambda name: flatten_if_needed(self.__getattribute__(name)), self.names)))
def _set_params(self, x):
current_index = 0
for name, shape in zip(self.names, self.shapes):
current_size = numpy.prod(shape)
self.__setattr__(name, x[current_index:current_index + current_size].reshape(shape))
current_index += current_size
def _get_param_names(self):
_param_names = []
for name, shape in zip(self.names, self.shapes):
_param_names.extend(map(lambda nameshape: ('_'.join(nameshape)).strip('_'), itertools.izip(itertools.repeat(name), itertools.imap(lambda t: '_'.join(map(str, t)), itertools.product(*map(lambda xi: range(xi), shape))))))
return _param_names

367 GPy/models/mrd.py Normal file

@@ -0,0 +1,367 @@
'''
Created on 10 Apr 2013
@author: Max Zwiessele
'''
from GPy.core import Model
from GPy.core import SparseGP
from GPy.util.linalg import PCA
import numpy
import itertools
import pylab
from GPy.kern.kern import kern
from GPy.models.bayesian_gplvm import BayesianGPLVM
class MRD(Model):
"""
Do MRD on given Datasets in Ylist.
All Ys in likelihood_list are in [N x Dn], where Dn can be different per Yn,
N must be shared across datasets though.
:param likelihood_list: list of observed datasets (:py:class:`~GPy.likelihoods.gaussian.Gaussian` if not supplied directly)
:type likelihood_list: [:py:class:`~GPy.likelihoods.likelihood.likelihood` | :py:class:`ndarray`]
:param names: names for different gplvm models
:type names: [str]
:param input_dim: latent dimensionality
:type input_dim: int
:param initx: initialisation method for the latent space :
* 'concat' - PCA on concatenation of all datasets
* 'single' - Concatenation of PCA on datasets, respectively
* 'random' - Random draw from a normal
:type initx: ['concat'|'single'|'random']
:param initz: initialisation method for inducing inputs
:type initz: 'permute'|'random'
:param X: Initial latent space
:param X_variance: Initial latent space variance
:param Z: initial inducing inputs
:param num_inducing: number of inducing inputs to use
:param kernels: list of kernels or kernel shared for all BGPLVMS
:type kernels: [GPy.kern.kern] | GPy.kern.kern | None (default)
"""
def __init__(self, likelihood_or_Y_list, input_dim, num_inducing=10, names=None,
kernels=None, initx='PCA',
initz='permute', _debug=False, **kw):
if names is None:
names = ["{}".format(i) for i in range(len(likelihood_or_Y_list))]
self.names = names
# sort out the kernels
if kernels is None:
kernels = [None] * len(likelihood_or_Y_list)
elif isinstance(kernels, kern):
kernels = [kernels.copy() for i in range(len(likelihood_or_Y_list))]
else:
assert len(kernels) == len(likelihood_or_Y_list), "need one kernel per output"
assert all([isinstance(k, kern) for k in kernels]), "invalid kernel object detected!"
assert not ('kernel' in kw), "pass kernels through `kernels` argument"
self.input_dim = input_dim
self._debug = _debug
self.num_inducing = num_inducing
self._init = True
X = self._init_X(initx, likelihood_or_Y_list)
Z = self._init_Z(initz, X)
self.num_inducing = Z.shape[0] # ensure M==N if M>N
self.bgplvms = [BayesianGPLVM(l, input_dim=input_dim, kernel=k, X=X, Z=Z, num_inducing=self.num_inducing, **kw) for l, k in zip(likelihood_or_Y_list, kernels)]
del self._init
self.gref = self.bgplvms[0]
nparams = numpy.array([0] + [SparseGP._get_params(g).size - g.Z.size for g in self.bgplvms])
self.nparams = nparams.cumsum()
self.num_data = self.gref.num_data
self.NQ = self.num_data * self.input_dim
self.MQ = self.num_inducing * self.input_dim
Model.__init__(self)
self.ensure_default_constraints()
def getstate(self):
return Model.getstate(self) + [self.names,
self.bgplvms,
self.gref,
self.nparams,
self.input_dim,
self.num_inducing,
self.num_data,
self.NQ,
self.MQ]
def setstate(self, state):
self.MQ = state.pop()
self.NQ = state.pop()
self.num_data = state.pop()
self.num_inducing = state.pop()
self.input_dim = state.pop()
self.nparams = state.pop()
self.gref = state.pop()
self.bgplvms = state.pop()
self.names = state.pop()
Model.setstate(self, state)
@property
def X(self):
return self.gref.X
@X.setter
def X(self, X):
try:
self.propagate_param(X=X)
except AttributeError:
if not self._init:
raise AttributeError("bgplvm list not initialized")
@property
def Z(self):
return self.gref.Z
@Z.setter
def Z(self, Z):
try:
self.propagate_param(Z=Z)
except AttributeError:
if not self._init:
raise AttributeError("bgplvm list not initialized")
@property
def X_variance(self):
return self.gref.X_variance
@X_variance.setter
def X_variance(self, X_var):
try:
self.propagate_param(X_variance=X_var)
except AttributeError:
if not self._init:
raise AttributeError("bgplvm list not initialized")
@property
def likelihood_list(self):
return [g.likelihood.Y for g in self.bgplvms]
@likelihood_list.setter
def likelihood_list(self, likelihood_list):
for g, Y in itertools.izip(self.bgplvms, likelihood_list):
g.likelihood.Y = Y
@property
def auto_scale_factor(self):
"""
set auto_scale_factor for all gplvms
:param b: auto_scale_factor
:type b:
"""
return self.gref.auto_scale_factor
@auto_scale_factor.setter
def auto_scale_factor(self, b):
self.propagate_param(auto_scale_factor=b)
def propagate_param(self, **kwargs):
for key, val in kwargs.iteritems():
for g in self.bgplvms:
g.__setattr__(key, val)
def randomize(self, initx='concat', initz='permute', *args, **kw):
super(MRD, self).randomize(*args, **kw)
self._init_X(initx, self.likelihood_list)
self._init_Z(initz, self.X)
def _get_latent_param_names(self):
n1 = self.gref._get_param_names()
n1var = n1[:self.NQ * 2 + self.MQ]
return n1var
def _get_kernel_names(self):
map_names = lambda ns, name: map(lambda x: "{1}_{0}".format(*x),
itertools.izip(ns,
itertools.repeat(name)))
kernel_names = (map_names(SparseGP._get_param_names(g)[self.MQ:], n) for g, n in zip(self.bgplvms, self.names))
return kernel_names
def _get_param_names(self):
# X_names = sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
# S_names = sum([['X_variance_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
n1var = self._get_latent_param_names()
kernel_names = self._get_kernel_names()
return list(itertools.chain(n1var, *kernel_names))
def _get_print_names(self):
return list(itertools.chain(*self._get_kernel_names()))
def _get_params(self):
"""
return parameter list containing private and shared parameters as follows:
=================================================================
| mu | S | Z || theta1 | theta2 | .. | thetaN |
=================================================================
"""
X = self.gref.X.ravel()
X_var = self.gref.X_variance.ravel()
Z = self.gref.Z.ravel()
thetas = [SparseGP._get_params(g)[g.Z.size:] for g in self.bgplvms]
params = numpy.hstack([X, X_var, Z, numpy.hstack(thetas)])
return params
# def _set_var_params(self, g, X, X_var, Z):
# g.X = X.reshape(self.num_data, self.input_dim)
# g.X_variance = X_var.reshape(self.num_data, self.input_dim)
# g.Z = Z.reshape(self.num_inducing, self.input_dim)
#
# def _set_kern_params(self, g, p):
# g.kern._set_params(p[:g.kern.Nparam])
# g.likelihood._set_params(p[g.kern.Nparam:])
def _set_params(self, x):
start = 0; end = self.NQ
X = x[start:end]
start = end; end += start
X_var = x[start:end]
start = end; end += self.MQ
Z = x[start:end]
thetas = x[end:]
# set params for all:
for g, s, e in itertools.izip(self.bgplvms, self.nparams, self.nparams[1:]):
g._set_params(numpy.hstack([X, X_var, Z, thetas[s:e]]))
# self._set_var_params(g, X, X_var, Z)
# self._set_kern_params(g, thetas[s:e].copy())
# g._compute_kernel_matrices()
# if self.auto_scale_factor:
# g.scale_factor = numpy.sqrt(g.psi2.sum(0).mean() * g.likelihood.precision)
# # self.scale_factor = numpy.sqrt(self.psi2.sum(0).mean() * self.likelihood.precision)
# g._computations()
def update_likelihood_approximation(self): # TODO: object oriented vs script base
for bgplvm in self.bgplvms:
bgplvm.update_likelihood_approximation()
def log_likelihood(self):
ll = -self.gref.KL_divergence()
for g in self.bgplvms:
ll += SparseGP.log_likelihood(g)
return ll
def _log_likelihood_gradients(self):
dLdmu, dLdS = reduce(lambda a, b: [a[0] + b[0], a[1] + b[1]], (g.dL_dmuS() for g in self.bgplvms))
dKLmu, dKLdS = self.gref.dKL_dmuS()
dLdmu -= dKLmu
dLdS -= dKLdS
dLdmuS = numpy.hstack((dLdmu.flatten(), dLdS.flatten())).flatten()
dldzt1 = reduce(lambda a, b: a + b, (SparseGP._log_likelihood_gradients(g)[:self.MQ] for g in self.bgplvms))
return numpy.hstack((dLdmuS,
dldzt1,
numpy.hstack([numpy.hstack([g.dL_dtheta(),
g.likelihood._gradients(\
partial=g.partial_for_likelihood)]) \
for g in self.bgplvms])))
def _init_X(self, init='PCA', likelihood_list=None):
if likelihood_list is None:
likelihood_list = self.likelihood_list
Ylist = []
for likelihood_or_Y in likelihood_list:
if type(likelihood_or_Y) is numpy.ndarray:
Ylist.append(likelihood_or_Y)
else:
Ylist.append(likelihood_or_Y.Y)
del likelihood_list
if init in "PCA_concat":
X = PCA(numpy.hstack(Ylist), self.input_dim)[0]
elif init in "PCA_single":
X = numpy.zeros((Ylist[0].shape[0], self.input_dim))
for qs, Y in itertools.izip(numpy.array_split(numpy.arange(self.input_dim), len(Ylist)), Ylist):
X[:, qs] = PCA(Y, len(qs))[0]
else: # init == 'random':
X = numpy.random.randn(Ylist[0].shape[0], self.input_dim)
self.X = X
return X
def _init_Z(self, init="permute", X=None):
if X is None:
X = self.X
if init in "permute":
Z = numpy.random.permutation(X.copy())[:self.num_inducing]
elif init in "random":
Z = numpy.random.randn(self.num_inducing, self.input_dim) * X.var()
self.Z = Z
return Z
def _handle_plotting(self, fignum, axes, plotf, sharex=False, sharey=False):
if axes is None:
fig = pylab.figure(num=fignum)
sharex_ax = None
sharey_ax = None
for i, g in enumerate(self.bgplvms):
try:
if sharex:
sharex_ax = ax # @UndefinedVariable
sharex = False # dont set twice
if sharey:
sharey_ax = ax # @UndefinedVariable
sharey = False # dont set twice
except:
pass
if axes is None:
ax = fig.add_subplot(1, len(self.bgplvms), i + 1, sharex=sharex_ax, sharey=sharey_ax)
elif isinstance(axes, (tuple, list)):
ax = axes[i]
else:
raise ValueError("Need one axes per latent dimension input_dim")
plotf(i, g, ax)
if sharey_ax is not None:
pylab.setp(ax.get_yticklabels(), visible=False)
pylab.draw()
if axes is None:
fig.tight_layout()
return fig
else:
return pylab.gcf()
def plot_X_1d(self, *a, **kw):
return self.gref.plot_X_1d(*a, **kw)
def plot_X(self, fignum=None, ax=None):
fig = self._handle_plotting(fignum, ax, lambda i, g, ax: ax.imshow(g.X))
return fig
def plot_predict(self, fignum=None, ax=None, sharex=False, sharey=False, **kwargs):
fig = self._handle_plotting(fignum,
ax,
lambda i, g, ax: ax.imshow(g.predict(g.X)[0], **kwargs),
sharex=sharex, sharey=sharey)
return fig
def plot_scales(self, fignum=None, ax=None, titles=None, sharex=False, sharey=True, *args, **kwargs):
"""
:param:`titles` :
titles for axes of datasets
"""
if titles is None:
titles = [r'${}$'.format(name) for name in self.names]
ymax = reduce(max, [numpy.ceil(max(g.input_sensitivity())) for g in self.bgplvms])
def plotf(i, g, ax):
ax.set_ylim([0,ymax])
g.kern.plot_ARD(ax=ax, title=titles[i], *args, **kwargs)
fig = self._handle_plotting(fignum, ax, plotf, sharex=sharex, sharey=sharey)
return fig
def plot_latent(self, fignum=None, ax=None, *args, **kwargs):
fig = self.gref.plot_latent(fignum=fignum, ax=ax, *args, **kwargs) # self._handle_plotting(fignum, ax, lambda i, g, ax: g.plot_latent(ax=ax, *args, **kwargs))
return fig
def _debug_plot(self):
self.plot_X_1d()
fig = pylab.figure("MRD DEBUG PLOT", figsize=(4 * len(self.bgplvms), 9))
fig.clf()
axes = [fig.add_subplot(3, len(self.bgplvms), i + 1) for i in range(len(self.bgplvms))]
self.plot_X(ax=axes)
axes = [fig.add_subplot(3, len(self.bgplvms), i + len(self.bgplvms) + 1) for i in range(len(self.bgplvms))]
self.plot_latent(ax=axes)
axes = [fig.add_subplot(3, len(self.bgplvms), i + 2 * len(self.bgplvms) + 1) for i in range(len(self.bgplvms))]
self.plot_scales(ax=axes)
pylab.draw()
fig.tight_layout()
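A minimal usage sketch for MRD above (two toy views sharing a latent space; names and num_inducing as documented in the constructor):
import numpy as np
import GPy
Y1 = np.random.randn(30, 4); Y2 = np.random.randn(30, 6)
m = GPy.models.MRD([Y1, Y2], input_dim=3, num_inducing=10, names=['view1', 'view2'])
m.optimize()
m.plot_scales()                                        # per-view ARD weights over the shared latent space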

56 GPy/models/sparse_gp_classification.py Normal file

@@ -0,0 +1,56 @@
# Copyright (c) 2013, Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core import SparseGP
from .. import likelihoods
from .. import kern
from ..likelihoods import likelihood
class SparseGPClassification(SparseGP):
"""
sparse Gaussian Process model for classification
This is a thin wrapper around the sparse_GP class, with a set of sensible defaults
:param X: input observations
:param Y: observed values
:param likelihood: a GPy likelihood, defaults to Binomial with probit link_function
:param kernel: a GPy kernel, defaults to rbf+white
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_X: False|True
:param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_Y: False|True
:rtype: model object
"""
def __init__(self, X, Y=None, likelihood=None, kernel=None, normalize_X=False, normalize_Y=False, Z=None, num_inducing=10):
if kernel is None:
kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1], 1e-3)
if likelihood is None:
distribution = likelihoods.likelihood_functions.Binomial()
likelihood = likelihoods.EP(Y, distribution)
elif Y is not None:
if not all(Y.flatten() == likelihood.data.flatten()):
raise Warning, 'likelihood.data and Y are different.'
if Z is None:
i = np.random.permutation(X.shape[0])[:num_inducing]
Z = X[i].copy()
else:
assert Z.shape[1] == X.shape[1]
SparseGP.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X)
self.ensure_default_constraints()
def getstate(self):
return SparseGP.getstate(self)
def setstate(self, state):
return SparseGP.setstate(self, state)
pass

55 GPy/models/sparse_gp_regression.py Normal file

@@ -0,0 +1,55 @@
# Copyright (c) 2012, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core import SparseGP
from .. import likelihoods
from .. import kern
class SparseGPRegression(SparseGP):
"""
Gaussian Process model for regression
This is a thin wrapper around the SparseGP class, with a set of sensible defaults
:param X: input observations
:param Y: observed values
:param kernel: a GPy kernel, defaults to rbf+white
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_X: False|True
:param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_Y: False|True
:rtype: model object
.. Note:: Multiple independent outputs are allowed using columns of Y
"""
def __init__(self, X, Y, kernel=None, normalize_X=False, normalize_Y=False, Z=None, num_inducing=10, X_variance=None):
# kern defaults to rbf (plus white for stability)
if kernel is None:
kernel = kern.rbf(X.shape[1]) # + kern.white(X.shape[1], 1e-3)
# Z defaults to a subset of the data
if Z is None:
i = np.random.permutation(X.shape[0])[:num_inducing]
Z = X[i].copy()
else:
assert Z.shape[1] == X.shape[1]
# likelihood defaults to Gaussian
likelihood = likelihoods.Gaussian(Y, normalize=normalize_Y)
SparseGP.__init__(self, X, likelihood, kernel, Z=Z, normalize_X=normalize_X, X_variance=X_variance)
self.ensure_default_constraints()
pass
def getstate(self):
return SparseGP.getstate(self)
def setstate(self, state):
return SparseGP.setstate(self, state)
pass

70 GPy/models/sparse_gplvm.py Normal file

@@ -0,0 +1,70 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
import sys, pdb
from GPy.models.sparse_gp_regression import SparseGPRegression
from GPy.models.gplvm import GPLVM
# from .. import kern
# from ..core import model
# from ..util.linalg import pdinv, PCA
class SparseGPLVM(SparseGPRegression, GPLVM):
"""
Sparse Gaussian Process Latent Variable Model
:param Y: observed data
:type Y: np.ndarray
:param input_dim: latent dimensionality
:type input_dim: int
:param init: initialisation method for the latent space
:type init: 'PCA'|'random'
"""
def __init__(self, Y, input_dim, kernel=None, init='PCA', num_inducing=10):
X = self.initialise_latent(init, input_dim, Y)
SparseGPRegression.__init__(self, X, Y, kernel=kernel, num_inducing=num_inducing)
self.ensure_default_constraints()
def getstate(self):
return SparseGPRegression.getstate(self)
def setstate(self, state):
return SparseGPRegression.setstate(self, state)
def _get_param_names(self):
return (sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
+ SparseGPRegression._get_param_names(self))
def _get_params(self):
return np.hstack((self.X.flatten(), SparseGPRegression._get_params(self)))
def _set_params(self, x):
self.X = x[:self.X.size].reshape(self.num_data, self.input_dim).copy()
SparseGPRegression._set_params(self, x[self.X.size:])
def log_likelihood(self):
return SparseGPRegression.log_likelihood(self)
def dL_dX(self):
dL_dX = self.kern.dKdiag_dX(self.dL_dpsi0, self.X)
dL_dX += self.kern.dK_dX(self.dL_dpsi1, self.X, self.Z)
return dL_dX
def _log_likelihood_gradients(self):
return np.hstack((self.dL_dX().flatten(), SparseGPRegression._log_likelihood_gradients(self)))
def plot(self):
GPLVM.plot(self)
# passing Z without a small amount of jitter will induce the white kernel where we don't want it!
mu, var, upper, lower = SparseGPRegression.predict(self, self.Z + np.random.randn(*self.Z.shape) * 0.0001)
pb.plot(mu[:, 0] , mu[:, 1], 'ko')
def plot_latent(self, *args, **kwargs):
input_1, input_2 = GPLVM.plot_latent(self, *args, **kwargs)
pb.plot(self.Z[:, input_1], self.Z[:, input_2], '^w')

52 GPy/models/svigp_regression.py Normal file

@@ -0,0 +1,52 @@
# Copyright (c) 2012, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core import SVIGP
from .. import likelihoods
from .. import kern
class SVIGPRegression(SVIGP):
"""
Gaussian Process model for regression
This is a thin wrapper around the SVIGP class, with a set of sensible defaults
:param X: input observations
:param Y: observed values
:param kernel: a GPy kernel, defaults to rbf+white
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_X: False|True
:param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_Y: False|True
:rtype: model object
.. Note:: Multiple independent outputs are allowed using columns of Y
"""
def __init__(self, X, Y, kernel=None, Z=None, num_inducing=10, q_u=None, batchsize=10):
# kern defaults to rbf (plus white for stability)
if kernel is None:
kernel = kern.rbf(X.shape[1], variance=1., lengthscale=4.) + kern.white(X.shape[1], 1e-3)
# Z defaults to a subset of the data
if Z is None:
i = np.random.permutation(X.shape[0])[:num_inducing]
Z = X[i].copy()
else:
assert Z.shape[1] == X.shape[1]
# likelihood defaults to Gaussian
likelihood = likelihoods.Gaussian(Y, normalize=False)
SVIGP.__init__(self, X, likelihood, kernel, Z, q_u=q_u, batchsize=batchsize)
self.load_batch()
def getstate(self):
return SVIGP.getstate(self)
def setstate(self, state):
return SVIGP.setstate(self, state)

107 GPy/models/warped_gp.py Normal file

@@ -0,0 +1,107 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..util.warping_functions import *
from ..core import GP
from .. import likelihoods
from GPy.util.warping_functions import TanhWarpingFunction_d
from GPy import kern
class WarpedGP(GP):
def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3, normalize_X=False, normalize_Y=False):
if kernel is None:
kernel = kern.rbf(X.shape[1])
if warping_function is None:
self.warping_function = TanhWarpingFunction_d(warping_terms)
self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1,) * 1)
self.scale_data = False
if self.scale_data:
Y = self._scale_data(Y)
self.has_uncertain_inputs = False
self.Y_untransformed = Y.copy()
self.predict_in_warped_space = False
likelihood = likelihoods.Gaussian(self.transform_data(), normalize=normalize_Y)
GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
self._set_params(self._get_params())
def getstate(self):
return GP.getstate(self)
def setstate(self, state):
return GP.setstate(self, state)
def _scale_data(self, Y):
self._Ymax = Y.max()
self._Ymin = Y.min()
return (Y - self._Ymin) / (self._Ymax - self._Ymin) - 0.5
def _unscale_data(self, Y):
return (Y + 0.5) * (self._Ymax - self._Ymin) + self._Ymin
def _set_params(self, x):
self.warping_params = x[:self.warping_function.num_parameters]
Y = self.transform_data()
self.likelihood.set_data(Y)
GP._set_params(self, x[self.warping_function.num_parameters:].copy())
def _get_params(self):
return np.hstack((self.warping_params.flatten().copy(), GP._get_params(self).copy()))
def _get_param_names(self):
warping_names = self.warping_function._get_param_names()
param_names = GP._get_param_names(self)
return warping_names + param_names
def transform_data(self):
Y = self.warping_function.f(self.Y_untransformed.copy(), self.warping_params).copy()
return Y
def log_likelihood(self):
ll = GP.log_likelihood(self)
jacobian = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
return ll + np.log(jacobian).sum()
def _log_likelihood_gradients(self):
ll_grads = GP._log_likelihood_gradients(self)
alpha = np.dot(self.Ki, self.likelihood.Y.flatten())
warping_grads = self.warping_function_gradients(alpha)
warping_grads = np.append(warping_grads[:, :-1].flatten(), warping_grads[0, -1])
return np.hstack((warping_grads.flatten(), ll_grads.flatten()))
def warping_function_gradients(self, Kiy):
grad_y = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed, self.warping_params,
return_covar_chain=True)
djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
return -dquad_dpsi + djac_dpsi
def plot_warping(self):
self.warping_function.plot(self.warping_params, self.Y_untransformed.min(), self.Y_untransformed.max())
def predict(self, Xnew, which_parts='all', full_cov=False, pred_init=None):
# normalize X values
Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
mu, var = GP._raw_predict(self, Xnew, full_cov=full_cov, which_parts=which_parts)
# now push through likelihood
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
if self.predict_in_warped_space:
mean = self.warping_function.f_inv(mean, self.warping_params, y=pred_init)
var = self.warping_function.f_inv(var, self.warping_params)
if self.scale_data:
mean = self._unscale_data(mean)
return mean, var, _025pm, _975pm
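A minimal usage sketch for the warped GP above (toy positive-valued targets; warping_terms as documented in the constructor):
import numpy as np
import GPy
X = np.linspace(0., 10., 50)[:, None]
Y = np.exp(np.sin(X) + .05*np.random.randn(50, 1))     # positive, non-Gaussian targets
m = GPy.models.WarpedGP(X, Y, warping_terms=3)
m.optimize()
m.plot_warping()                                       # inspect the learned tanh warping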

80 GPy/notes.txt Normal file

@@ -0,0 +1,80 @@
Prod.py kernel could also take a list of kernels rather than two arguments for kernels.
transformations.py should have limits on what is fed into exp() particularly for the negative log logistic (done -neil).
Load in a model with mlp kernel, plot it, change a parameter, plot it again. It doesn't update the plot.
Tests for kernels which work directly on the kernel implementation (not through GP).
Should stationary covariances have their own kernpart type? I think so, and likewise inner product kernels. That way the caching so carefully constructed for RBF or linear could be shared.
Where do we declare default kernel parameters. In constructors.py or in the definition file for the kernel?
When printing to stdout, can we check that our approach is also working nicely for the ipython notebook? I like the way our optimization ticks over, but at the moment this doesn't seem to work in the ipython notebook, it would be nice if it did. My problems may be due to using ipython 0.12, I've had a poke around at fixing this and I can't do it for 0.12.
When we print a model should we also include information such as number of inputs and number of outputs?
Let's not use N for giving the number of data in the model. When it pops up as a help tip it's not as clear as num_samples or num_data. Prefer the second, but oddly I've been using first.
Loving the fact that the * has been overloaded on the kernels (oddly never thought to check this before). Although naming can be a bit confusing. Can we think how to deal with the names in a clearer way when we use a kernel like this one:
kern = GPy.kern.rbf(30)*(GPy.kern.mlp(30)+GPy.kern.poly(30, degree=5)) + GPy.kern.bias(30). There seems to be some tying of parameters going on ... should there be? (you can try it as the kernel for the robot wireless model).
Can we comment up some of the list comprehensions in hierarchical.py?
Need to tidy up classification.py,
many examples include help that doesn't apply
(it is suggested that you can try different approximation types)
Shall we overload the ** operator to have tensor products? (I've done this; now we can see if we like it.)
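For reference, the explicit spelling the sugar stands for; coregionalise and prod(..., tensor=True) are exactly the calls kernel_tests.py uses (whether ** should forward extra options is open):
import GPy
k1 = GPy.kern.rbf(1) + GPy.kern.bias(1)
k2 = GPy.kern.coregionalise(2, 1)
k = k1.prod(k2, tensor=True)  # with the overload this would read k1 ** k2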
People aren't filling the doc strings in as they go; *everyone* needs to get into the habit of this (and of modifying them as they edit, or correcting them when there is a problem).
Need some nice way of explaining how to compile documentation and run the unit tests, could this be in a readme or FAQ somewhere? Maybe it's there already somewhere and I've missed it.
Shouldn't EP be in the inference package (not likelihoods)?
When using bfgs in ipython notebook, text appears in the original console, not in the notebook.
In sparse GPs wouldn't it be clearer to call Z inducing?
In the coregionalisation matrix, setting W to all ones will (surely?) ensure that symmetry isn't broken. Also, by allowing it to scale like that, the output variance increases as the rank is increased (and if the user sets the rank to more than the output dimension they could get very different results).
We are inconsistent about our use of ise and ize e.g. optimize and normalize_X, but coregionalise, we should choose one and stick to it. Suggest -ize.
Exceptions: we need to provide a list of exceptions we throw and specify what is thrown where.
Why is it get_params() but getstate()? It should be get_state(). Why is it get_gradient instead of get_gradients? We need to be consistent!! It doesn't matter which way we choose as long as it's consistent.
In likelihood Nparams should be num_params
In likelihood N should be num_data
The Gaussian target in likelihood should be F. What is V doing here?
Need to check for nan values in likelihoods. These should be treated as missing values. If the likelihood can't handle missing values an error should be thrown.
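A minimal numpy-only sketch of the intended check (validate_targets is a hypothetical name, not GPy API):
import numpy as np
def validate_targets(Y):
    missing = np.isnan(Y)
    if missing.any():
        # treat as missing where supported; otherwise fail loudly
        raise ValueError("%d nan values found in Y" % missing.sum())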
Sometimes you want to print kernpart objects, for diagnosis etc. This isn't possible currently.
Why do likelihoods still have YYT everywhere? Didn't we agree to set the observed data to Y and the latent function to F?
For some reason a stub of _get_param_names(self) wasn't available in the Parameterized base class. Have put it in (is this right?)
Is there a quick FAQ or something on how to build the documentation? I did it once, but can't remember! Have started a FAQ.txt file where we can add this type of information.
Similar for the nosetests ... even ran them last week but can't remember the command!
Now added Gaussian priors to GPLVM latent variables by default. When running the GPy.examples.dimensionality_reduction.stick() example the print out from print model has the same value for the prior+likelihood as for the prior.
For the back constrained GP-LVM need priors to be on the Xs not on the model parameters (because they aren't parameters, they are constraints). Need to work out how to do this, perhaps by creating the full GP-LVM model then constraining around it, rather than overriding inside the GP-LVM model.
This code fails:
kern = GPy.kern.rbf(2)
GPy.kern.Kern_check_dK_dX(kern, X=np.random.randn(10, 2), X2=None).checkgrad(verbose=True)
because X2 is now equal to X, so there is a factor of 2 missing. Does this ever come up? Yes: in the GP-LVM (gplvm.py, line 64), where it is called with a corrective factor of 2, and on line 241 of sparse_gp.py, where it is also called with a corrective factor of 2. The original MATLAB GPLVM didn't allow gradients with respect to X alone; it multiplied by 2 in the base code, but then added the diagonal across those elements. This is missing in the new code.
In white.py, line 41: need to check whether X and X2 refer to the same object too, because somewhere up the pipeline someone may have set X2=X when X2 originally arrived equal to None.
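To see the factor of 2 in isolation (numpy only, with a toy unit-variance rbf, nothing from GPy): for f(X) = sum_ij k(x_i, x_j) the chain rule hits X through both arguments, so a dK_dX routine that holds X2 fixed reports exactly half the true gradient:
import numpy as np
def rbf_K(X, X2):
    return np.exp(-0.5 * ((X[:, None, :] - X2[None, :, :]) ** 2).sum(-1))
def dsumK_dX(X, X2):
    # gradient of K(X, X2).sum() w.r.t. X, treating X2 as fixed
    diff = X[:, None, :] - X2[None, :, :]
    return -(rbf_K(X, X2)[:, :, None] * diff).sum(1)
X = np.random.randn(10, 2)
g_half = dsumK_dX(X, X)  # X2 passed as a fixed copy of X
eps, g_num = 1e-6, np.zeros_like(X)
for i in range(X.shape[0]):
    for j in range(X.shape[1]):
        Xp, Xm = X.copy(), X.copy()
        Xp[i, j] += eps
        Xm[i, j] -= eps
        g_num[i, j] = (rbf_K(Xp, Xp).sum() - rbf_K(Xm, Xm).sum()) / (2 * eps)
print np.allclose(g_num, 2 * g_half, atol=1e-4)  # True: the missing factor of 2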

12
GPy/testing/__init__.py Normal file
View file

@@ -0,0 +1,12 @@
"""
MaxZ
"""
import unittest
import sys
def deepTest(reason):
if reason:
return lambda x:x
return unittest.skip("Not deep scanning, enable deepscan by adding 'deep' argument")
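
# Usage sketch (illustrative, not part of the original file): deepTest skips a
# whole TestCase unless a 'deep' flag is passed on the command line, as the
# psi-statistics tests later in this diff do.
#
# import sys
# import unittest
# from GPy import testing
#
# @testing.deepTest('deep' in sys.argv)
# class DeepOnlyTests(unittest.TestCase):
#     def test_expensive_monte_carlo(self):
#         pass  # only runs when 'deep' is in sys.argv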

View file

@@ -0,0 +1,74 @@
# Copyright (c) 2012, Nicolo Fusi
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import unittest
import numpy as np
import GPy
from GPy.models.bayesian_gplvm import BayesianGPLVM


class BGPLVMTests(unittest.TestCase):
    def test_bias_kern(self):
        N, num_inducing, input_dim, D = 10, 3, 2, 4
        X = np.random.rand(N, input_dim)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
        Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
        Y -= Y.mean(axis=0)
        k = GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.randomize()
        self.assertTrue(m.checkgrad())

    def test_linear_kern(self):
        N, num_inducing, input_dim, D = 10, 3, 2, 4
        X = np.random.rand(N, input_dim)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
        Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
        Y -= Y.mean(axis=0)
        k = GPy.kern.linear(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.randomize()
        self.assertTrue(m.checkgrad())

    def test_rbf_kern(self):
        N, num_inducing, input_dim, D = 10, 3, 2, 4
        X = np.random.rand(N, input_dim)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
        Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
        Y -= Y.mean(axis=0)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.randomize()
        self.assertTrue(m.checkgrad())

    def test_rbf_bias_kern(self):
        N, num_inducing, input_dim, D = 10, 3, 2, 4
        X = np.random.rand(N, input_dim)
        k = GPy.kern.rbf(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
        Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
        Y -= Y.mean(axis=0)
        k = GPy.kern.rbf(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.randomize()
        self.assertTrue(m.checkgrad())

    #@unittest.skip('psi2 cross terms are NotImplemented for this combination')
    def test_linear_bias_kern(self):
        N, num_inducing, input_dim, D = 30, 5, 4, 30
        X = np.random.rand(N, input_dim)
        k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
        Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
        Y -= Y.mean(axis=0)
        k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.randomize()
        self.assertTrue(m.checkgrad())


if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
    unittest.main()

111
GPy/testing/cgd_tests.py Normal file
View file

@@ -0,0 +1,111 @@
'''
Created on 26 Apr 2013

@author: maxz
'''
import unittest
import numpy
from GPy.inference.conjugate_gradient_descent import CGD, RUNNING
import pylab
from scipy.optimize.optimize import rosen, rosen_der
from GPy.inference.gradient_descent_update_rules import PolakRibiere


class Test(unittest.TestCase):
    def testMinimizeSquare(self):
        N = 100
        A = numpy.random.rand(N) * numpy.eye(N)
        b = numpy.random.rand(N) * 0
        f = lambda x: numpy.dot(x.T.dot(A), x) - numpy.dot(x.T, b)
        df = lambda x: numpy.dot(A, x) - b
        opt = CGD()
        restarts = 10
        for _ in range(restarts):
            try:
                x0 = numpy.random.randn(N) * 10
                res = opt.opt(f, df, x0, messages=0, maxiter=1000, gtol=1e-15)
                assert numpy.allclose(res[0], 0, atol=1e-5)
                break
            except AssertionError:
                # RESTART (a stray ipdb.set_trace() here used to block the retry loop)
                pass
        else:
            raise AssertionError("Test failed for {} restarts".format(restarts))

    def testRosen(self):
        N = 20
        f = rosen
        df = rosen_der
        opt = CGD()
        restarts = 10
        for _ in range(restarts):
            try:
                x0 = (numpy.random.randn(N) * .5) + numpy.ones(N)
                res = opt.opt(f, df, x0, messages=0,
                              maxiter=1e3, gtol=1e-12)
                assert numpy.allclose(res[0], 1, atol=.1)
                break
            except:
                # RESTART
                pass
        else:
            raise AssertionError("Test failed for {} restarts".format(restarts))


if __name__ == "__main__":
#    import sys;sys.argv = ['',
#                           'Test.testMinimizeSquare',
#                           'Test.testRosen',
#                           ]
#    unittest.main()
    N = 2
    A = numpy.random.rand(N) * numpy.eye(N)
    b = numpy.random.rand(N) * 0
    f = lambda x: numpy.dot(x.T.dot(A), x) - numpy.dot(x.T, b)
    df = lambda x: numpy.dot(A, x) - b
#    f = rosen
#    df = rosen_der
    x0 = (numpy.random.randn(N) * .5) + numpy.ones(N)
    print x0
    opt = CGD()
    pylab.ion()
    fig = pylab.figure("cgd optimize")
    if fig.axes:
        ax = fig.axes[0]
        ax.cla()
    else:
        ax = fig.add_subplot(111, projection='3d')
    interpolation = 40
#    x, y = numpy.linspace(.5, 1.5, interpolation)[:, None], numpy.linspace(.5, 1.5, interpolation)[:, None]
    x, y = numpy.linspace(-1, 1, interpolation)[:, None], numpy.linspace(-1, 1, interpolation)[:, None]
    X, Y = numpy.meshgrid(x, y)
    fXY = numpy.array([f(numpy.array([x, y])) for x, y in zip(X.flatten(), Y.flatten())]).reshape(interpolation, interpolation)
    ax.plot_wireframe(X, Y, fXY)
    xopts = [x0.copy()]
    optplts, = ax.plot3D([x0[0]], [x0[1]], zs=f(x0), marker='', color='r')
    raw_input("enter to start optimize")
    res = [0]

    def callback(*r):
        xopts.append(r[0].copy())
        # time.sleep(.3)
        optplts._verts3d = [numpy.array(xopts)[:, 0], numpy.array(xopts)[:, 1], [f(xs) for xs in xopts]]
        fig.canvas.draw()
        if r[-1] != RUNNING:
            res[0] = r

    res[0] = opt.opt(f, df, x0.copy(), callback, messages=True, maxiter=1000,
                     report_every=7, gtol=1e-12, update_rule=PolakRibiere)

View file

@@ -0,0 +1,99 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import unittest
import numpy as np
import GPy
import inspect
import pkgutil
import os
import random
from nose.tools import nottest
import sys


class ExamplesTests(unittest.TestCase):
    def _checkgrad(self, Model):
        self.assertTrue(Model.checkgrad())

    def _model_instance(self, Model):
        self.assertTrue(isinstance(Model, GPy.core.Model))

"""
def model_instance_generator(model):
    def check_model_returned(self):
        self._model_instance(model)
    return check_model_returned

def checkgrads_generator(model):
    def model_checkgrads(self):
        self._checkgrad(model)
    return model_checkgrads
"""

def model_checkgrads(model):
    model.randomize()
    #assert model.checkgrad()
    return model.checkgrad()

def model_instance(model):
    #assert isinstance(model, GPy.core.Model)
    return isinstance(model, GPy.core.Model)

@nottest
def test_models():
    examples_path = os.path.dirname(GPy.examples.__file__)
    # Load modules
    failing_models = {}
    for loader, module_name, is_pkg in pkgutil.iter_modules([examples_path]):
        # Load examples
        module_examples = loader.find_module(module_name).load_module(module_name)
        print "MODULE", module_examples
        print "Before"
        print inspect.getmembers(module_examples, predicate=inspect.isfunction)
        functions = [func for func in inspect.getmembers(module_examples, predicate=inspect.isfunction) if func[0].startswith('_') is False][::-1]
        print "After"
        print functions
        for example in functions:
            if example[0] in ['oil', 'silhouette', 'GPLVM_oil_100']:
                print "SKIPPING"
                continue
            print "Testing example: ", example[0]
            # Generate model
            try:
                model = example[1]()
            except Exception as e:
                failing_models[example[0]] = "Cannot make model: \n{e}".format(e=e)
            else:
                print model
                model_checkgrads.description = 'test_checkgrads_%s' % example[0]
                try:
                    if not model_checkgrads(model):
                        failing_models[model_checkgrads.description] = False
                except Exception as e:
                    failing_models[model_checkgrads.description] = e
                model_instance.description = 'test_instance_%s' % example[0]
                try:
                    if not model_instance(model):
                        failing_models[model_instance.description] = False
                except Exception as e:
                    failing_models[model_instance.description] = e
                #yield model_checkgrads, model
                #yield model_instance, model
        print "Finished checking module {m}".format(m=module_name)
    if len(failing_models.keys()) > 0:
        print "Failing models: "
        print failing_models
        raise Exception(failing_models)

if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
#    unittest.main()
    test_models()

View file

@@ -0,0 +1,44 @@
# Copyright (c) 2012, Nicolo Fusi
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import unittest
import numpy as np
import GPy


class GPLVMTests(unittest.TestCase):
    def test_bias_kern(self):
        num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
        X = np.random.rand(num_data, input_dim)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
        Y = np.random.multivariate_normal(np.zeros(num_data), K, output_dim).T
        k = GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = GPy.models.GPLVM(Y, input_dim, kernel=k)
        m.randomize()
        self.assertTrue(m.checkgrad())

    def test_linear_kern(self):
        num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
        X = np.random.rand(num_data, input_dim)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
        Y = np.random.multivariate_normal(np.zeros(num_data), K, output_dim).T
        k = GPy.kern.linear(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = GPy.models.GPLVM(Y, input_dim, kernel=k)
        m.randomize()
        self.assertTrue(m.checkgrad())

    def test_rbf_kern(self):
        num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
        X = np.random.rand(num_data, input_dim)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
        Y = np.random.multivariate_normal(np.zeros(num_data), K, output_dim).T
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = GPy.models.GPLVM(Y, input_dim, kernel=k)
        m.randomize()
        self.assertTrue(m.checkgrad())


if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
    unittest.main()

View file

@@ -0,0 +1,74 @@
# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import unittest
import numpy as np
import GPy


class KernelTests(unittest.TestCase):
    def test_kerneltie(self):
        K = GPy.kern.rbf(5, ARD=True)
        K.tie_params('.*[01]')
        K.constrain_fixed('2')
        X = np.random.rand(5, 5)
        Y = np.ones((5, 1))
        m = GPy.models.GPRegression(X, Y, K)
        self.assertTrue(m.checkgrad())

    def test_rbfkernel(self):
        verbose = False
        kern = GPy.kern.rbf(5)
        self.assertTrue(GPy.kern.Kern_check_model(kern).is_positive_definite())
        self.assertTrue(GPy.kern.Kern_check_dK_dtheta(kern).checkgrad(verbose=verbose))
        self.assertTrue(GPy.kern.Kern_check_dKdiag_dtheta(kern).checkgrad(verbose=verbose))
        self.assertTrue(GPy.kern.Kern_check_dK_dX(kern).checkgrad(verbose=verbose))

    def test_gibbskernel(self):
        verbose = False
        kern = GPy.kern.gibbs(5, mapping=GPy.mappings.Linear(5, 1))
        self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))

    def test_mlpkernel(self):
        verbose = False
        kern = GPy.kern.mlp(5)
        self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))

    def test_polykernel(self):
        verbose = False
        kern = GPy.kern.poly(5, degree=4)
        self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))

    def test_fixedkernel(self):
        """
        Fixed effect kernel test
        """
        X = np.random.rand(30, 4)
        K = np.dot(X, X.T)
        kernel = GPy.kern.fixed(4, K)
        Y = np.ones((30, 1))
        m = GPy.models.GPRegression(X, Y, kernel=kernel)
        self.assertTrue(m.checkgrad())

    def test_coregionalisation(self):
        X1 = np.random.rand(50, 1) * 8
        X2 = np.random.rand(30, 1) * 5
        index = np.vstack((np.zeros_like(X1), np.ones_like(X2)))
        X = np.hstack((np.vstack((X1, X2)), index))
        Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
        Y2 = np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + 2.
        Y = np.vstack((Y1, Y2))
        k1 = GPy.kern.rbf(1) + GPy.kern.bias(1)
        k2 = GPy.kern.coregionalise(2, 1)
        k = k1.prod(k2, tensor=True)
        m = GPy.models.GPRegression(X, Y, kernel=k)
        self.assertTrue(m.checkgrad())


if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
    unittest.main()

View file

@@ -0,0 +1,34 @@
# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import unittest
import numpy as np
import GPy


class MappingTests(unittest.TestCase):
    def test_kernelmapping(self):
        verbose = False
        mapping = GPy.mappings.Kernel(np.random.rand(10, 3), 2)
        self.assertTrue(GPy.core.mapping.Mapping_check_df_dtheta(mapping=mapping).checkgrad(verbose=verbose))
        self.assertTrue(GPy.core.mapping.Mapping_check_df_dX(mapping=mapping).checkgrad(verbose=verbose))

    def test_linearmapping(self):
        verbose = False
        mapping = GPy.mappings.Linear(3, 2)
        self.assertTrue(GPy.core.Mapping_check_df_dtheta(mapping=mapping).checkgrad(verbose=verbose))
        self.assertTrue(GPy.core.Mapping_check_df_dX(mapping=mapping).checkgrad(verbose=verbose))

    def test_mlpmapping(self):
        verbose = False
        mapping = GPy.mappings.MLP(input_dim=2, hidden_dim=[3, 4, 8, 2], output_dim=2)
        self.assertTrue(GPy.core.Mapping_check_df_dtheta(mapping=mapping).checkgrad(verbose=verbose))
        self.assertTrue(GPy.core.Mapping_check_df_dX(mapping=mapping).checkgrad(verbose=verbose))


if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
    unittest.main()

32
GPy/testing/mrd_tests.py Normal file
View file

@@ -0,0 +1,32 @@
# Copyright (c) 2013, Max Zwiessele
# Licensed under the BSD 3-clause license (see LICENSE.txt)
'''
Created on 10 Apr 2013

@author: maxz
'''
import unittest
import numpy as np
import GPy


class MRDTests(unittest.TestCase):
    def test_gradients(self):
        num_m = 3
        N, num_inducing, input_dim, D = 20, 8, 6, 20
        X = np.random.rand(N, input_dim)
        k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim)
        K = k.K(X)
        Ylist = [np.random.multivariate_normal(np.zeros(N), K, input_dim).T for _ in range(num_m)]
        likelihood_list = [GPy.likelihoods.Gaussian(Y) for Y in Ylist]
        m = GPy.models.MRD(likelihood_list, input_dim=input_dim, kernels=k, num_inducing=num_inducing)
        self.assertTrue(m.checkgrad())


if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
    unittest.main()

View file

@@ -0,0 +1,58 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import unittest
import numpy as np
import GPy


class PriorTests(unittest.TestCase):
    def test_lognormal(self):
        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
        y = b * X + C + 1 * np.sin(X)
        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
        lognormal = GPy.priors.LogGaussian(1, 2)
        m.set_prior('rbf', lognormal)
        m.randomize()
        self.assertTrue(m.checkgrad())

    def test_Gamma(self):
        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
        y = b * X + C + 1 * np.sin(X)
        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
        Gamma = GPy.priors.Gamma(1, 1)
        m.set_prior('rbf', Gamma)
        m.randomize()
        self.assertTrue(m.checkgrad())

    def test_incompatibility(self):
        xmin, xmax = 1, 2.5 * np.pi
        b, C, SNR = 1, 0, 0.1
        X = np.linspace(xmin, xmax, 500)
        y = b * X + C + 1 * np.sin(X)
        y += 0.05 * np.random.randn(len(X))
        X, y = X[:, None], y[:, None]
        m = GPy.models.GPRegression(X, y)
        gaussian = GPy.priors.Gaussian(1, 1)
        # setting a Gaussian prior on non-negative parameters
        # should raise an AssertionError
        self.assertRaises(AssertionError, m.set_prior, 'rbf', gaussian)


if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
    unittest.main()

View file

@@ -0,0 +1,141 @@
'''
Created on 26 Apr 2013

@author: maxz
'''
import unittest
import GPy
import numpy as np
from GPy import testing
import sys
import numpy
from GPy.kern.parts.rbf import RBF
from GPy.kern.parts.linear import Linear
from copy import deepcopy

__test__ = lambda: 'deep' in sys.argv

# np.random.seed(0)


def ard(p):
    try:
        if p.ARD:
            return "ARD"
    except:
        pass
    return ""


@testing.deepTest(__test__())
class Test(unittest.TestCase):
    input_dim = 9
    num_inducing = 4
    N = 3
    Nsamples = 5e6

    def setUp(self):
        i_s_dim_list = [2, 4, 3]
        indices = numpy.cumsum(i_s_dim_list).tolist()
        input_slices = [slice(a, b) for a, b in zip([None] + indices, indices)]
        #input_slices[2] = deepcopy(input_slices[1])
        input_slice_kern = GPy.kern.kern(9,
                [
                 RBF(i_s_dim_list[0], np.random.rand(), np.random.rand(i_s_dim_list[0]), ARD=True),
                 RBF(i_s_dim_list[1], np.random.rand(), np.random.rand(i_s_dim_list[1]), ARD=True),
                 Linear(i_s_dim_list[2], np.random.rand(i_s_dim_list[2]), ARD=True)
                ],
                input_slices=input_slices
                )
        self.kerns = (
            input_slice_kern,
#            (GPy.kern.rbf(self.input_dim, ARD=True) +
#             GPy.kern.linear(self.input_dim, ARD=True) +
#             GPy.kern.bias(self.input_dim) +
#             GPy.kern.white(self.input_dim)),
#            (GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
#             GPy.kern.rbf(self.input_dim, np.random.rand(), np.random.rand(self.input_dim), ARD=True) +
#             GPy.kern.linear(self.input_dim, np.random.rand(self.input_dim), ARD=True) +
#             GPy.kern.bias(self.input_dim) +
#             GPy.kern.white(self.input_dim)),
#            GPy.kern.rbf(self.input_dim), GPy.kern.rbf(self.input_dim, ARD=True),
#            GPy.kern.linear(self.input_dim, ARD=False), GPy.kern.linear(self.input_dim, ARD=True),
#            GPy.kern.linear(self.input_dim) + GPy.kern.bias(self.input_dim),
#            GPy.kern.rbf(self.input_dim) + GPy.kern.bias(self.input_dim),
#            GPy.kern.linear(self.input_dim) + GPy.kern.bias(self.input_dim) + GPy.kern.white(self.input_dim),
#            GPy.kern.rbf(self.input_dim) + GPy.kern.bias(self.input_dim) + GPy.kern.white(self.input_dim),
#            GPy.kern.bias(self.input_dim), GPy.kern.white(self.input_dim),
            )
        self.q_x_mean = np.random.randn(self.input_dim)
        self.q_x_variance = np.exp(np.random.randn(self.input_dim))
        self.q_x_samples = np.random.randn(self.Nsamples, self.input_dim) * np.sqrt(self.q_x_variance) + self.q_x_mean
        self.Z = np.random.randn(self.num_inducing, self.input_dim)
        self.q_x_mean.shape = (1, self.input_dim)
        self.q_x_variance.shape = (1, self.input_dim)

    def test_psi0(self):
        # psi0 = E_q(x)[k(x, x)], checked against a Monte Carlo average of Kdiag
        for kern in self.kerns:
            psi0 = kern.psi0(self.Z, self.q_x_mean, self.q_x_variance)
            Kdiag = kern.Kdiag(self.q_x_samples)
            self.assertAlmostEqual(psi0, np.mean(Kdiag), 1)
            # print kern.parts[0].name, np.allclose(psi0, np.mean(Kdiag))

    def test_psi1(self):
        for kern in self.kerns:
            Nsamples = np.floor(self.Nsamples / 300.)
            psi1 = kern.psi1(self.Z, self.q_x_mean, self.q_x_variance)
            K_ = np.zeros((Nsamples, self.num_inducing))
            diffs = []
            for i, q_x_sample_stripe in enumerate(np.array_split(self.q_x_samples, self.Nsamples / Nsamples)):
                K = kern.K(q_x_sample_stripe[:Nsamples], self.Z)
                K_ += K
                diffs.append((np.abs(psi1 - (K_ / (i + 1))) ** 2).mean())
            K_ /= self.Nsamples / Nsamples
            msg = "psi1: " + "+".join([p.name + ard(p) for p in kern.parts])
            try:
                import pylab
                pylab.figure(msg)
                pylab.plot(diffs)
                # print msg, ((psi1.squeeze() - K_)**2).mean() < .01
                self.assertTrue(((psi1.squeeze() - K_) ** 2).mean() < .01,
                                msg=msg + ": not matching")
                # sys.stdout.write(".")
            except:
                # import ipdb;ipdb.set_trace()
                # kern.psi2(self.Z, self.q_x_mean, self.q_x_variance)
                # sys.stdout.write("E") # msg + ": not matching"
                pass

    def test_psi2(self):
        for kern in self.kerns:
            Nsamples = self.Nsamples / 300.
            psi2 = kern.psi2(self.Z, self.q_x_mean, self.q_x_variance)
            K_ = np.zeros((self.num_inducing, self.num_inducing))
            diffs = []
            for i, q_x_sample_stripe in enumerate(np.array_split(self.q_x_samples, self.Nsamples / Nsamples)):
                K = kern.K(q_x_sample_stripe, self.Z)
                K = (K[:, :, None] * K[:, None, :]).mean(0)
                K_ += K
                diffs.append(((psi2 - (K_ / (i + 1))) ** 2).mean())
            K_ /= self.Nsamples / Nsamples
            msg = "psi2: {}".format("+".join([p.name + ard(p) for p in kern.parts]))
            try:
                import pylab
                pylab.figure(msg)
                pylab.plot(diffs)
                # print msg, np.allclose(psi2.squeeze(), K_, rtol=1e-1, atol=.1)
                self.assertTrue(np.allclose(psi2.squeeze(), K_,
                                            rtol=1e-1, atol=.1),
                                msg=msg + ": not matching")
                # sys.stdout.write(".")
            except:
                # import ipdb;ipdb.set_trace()
                # kern.psi2(self.Z, self.q_x_mean, self.q_x_variance)
                # sys.stdout.write("E")
                print msg + ": not matching"
                pass


if __name__ == "__main__":
    sys.argv = ['',
                #'Test.test_psi0',
                'Test.test_psi1',
                'Test.test_psi2',
                ]
    unittest.main()

View file

@@ -0,0 +1,152 @@
'''
Created on 22 Apr 2013

@author: maxz
'''
import unittest
import numpy
import GPy
import itertools
from GPy.core import Model


class PsiStatModel(Model):
    def __init__(self, which, X, X_variance, Z, num_inducing, kernel):
        self.which = which
        self.X = X
        self.X_variance = X_variance
        self.Z = Z
        self.N, self.input_dim = X.shape
        self.num_inducing, input_dim = Z.shape
        assert self.input_dim == input_dim, "shape mismatch: Z:{!s} X:{!s}".format(Z.shape, X.shape)
        self.kern = kernel
        super(PsiStatModel, self).__init__()
        self.psi_ = self.kern.__getattribute__(self.which)(self.Z, self.X, self.X_variance)

    def _get_param_names(self):
        Xnames = ["{}_{}_{}".format(what, i, j) for what, i, j in itertools.product(['X', 'X_variance'], range(self.N), range(self.input_dim))]
        Znames = ["Z_{}_{}".format(i, j) for i, j in itertools.product(range(self.num_inducing), range(self.input_dim))]
        return Xnames + Znames + self.kern._get_param_names()

    def _get_params(self):
        return numpy.hstack([self.X.flatten(), self.X_variance.flatten(), self.Z.flatten(), self.kern._get_params()])

    def _set_params(self, x, save_old=True, save_count=0):
        start, end = 0, self.X.size
        self.X = x[start:end].reshape(self.N, self.input_dim)
        start, end = end, end + self.X_variance.size
        self.X_variance = x[start: end].reshape(self.N, self.input_dim)
        start, end = end, end + self.Z.size
        self.Z = x[start: end].reshape(self.num_inducing, self.input_dim)
        self.kern._set_params(x[end:])

    def log_likelihood(self):
        return self.kern.__getattribute__(self.which)(self.Z, self.X, self.X_variance).sum()

    def _log_likelihood_gradients(self):
        psimu, psiS = self.kern.__getattribute__("d" + self.which + "_dmuS")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance)
        try:
            psiZ = self.kern.__getattribute__("d" + self.which + "_dZ")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance)
        except AttributeError:
            psiZ = numpy.zeros(self.num_inducing * self.input_dim)
        thetagrad = self.kern.__getattribute__("d" + self.which + "_dtheta")(numpy.ones_like(self.psi_), self.Z, self.X, self.X_variance).flatten()
        return numpy.hstack((psimu.flatten(), psiS.flatten(), psiZ.flatten(), thetagrad))


class DPsiStatTest(unittest.TestCase):
    N = 50
    num_inducing = 10
    input_dim = 20  # note: a dead `input_dim = 5` that this line silently overrode has been dropped
    X = numpy.random.randn(N, input_dim)
    X_var = .5 * numpy.ones_like(X) + .4 * numpy.clip(numpy.random.randn(*X.shape), 0, 1)
    Z = numpy.random.permutation(X)[:num_inducing]
    Y = X.dot(numpy.random.randn(input_dim, input_dim))
#    kernels = [GPy.kern.linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)), GPy.kern.rbf(input_dim, ARD=True), GPy.kern.bias(input_dim)]
    kernels = [GPy.kern.linear(input_dim), GPy.kern.rbf(input_dim), GPy.kern.bias(input_dim),
               GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim),
               GPy.kern.rbf(input_dim) + GPy.kern.bias(input_dim)]

    def testPsi0(self):
        for k in self.kernels:
            m = PsiStatModel('psi0', X=self.X, X_variance=self.X_var, Z=self.Z,
                             num_inducing=self.num_inducing, kernel=k)
            assert m.checkgrad(), "{} x psi0".format("+".join(map(lambda x: x.name, k.parts)))

#    def testPsi1(self):
#        for k in self.kernels:
#            m = PsiStatModel('psi1', X=self.X, X_variance=self.X_var, Z=self.Z,
#                             num_inducing=self.num_inducing, kernel=k)
#            assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k.parts)))

    def testPsi2_lin(self):
        k = self.kernels[0]
        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                         num_inducing=self.num_inducing, kernel=k)
        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))

    def testPsi2_lin_bia(self):
        k = self.kernels[3]
        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                         num_inducing=self.num_inducing, kernel=k)
        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))

    def testPsi2_rbf(self):
        k = self.kernels[1]
        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                         num_inducing=self.num_inducing, kernel=k)
        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))

    def testPsi2_rbf_bia(self):
        k = self.kernels[-1]
        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                         num_inducing=self.num_inducing, kernel=k)
        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))

    def testPsi2_bia(self):
        k = self.kernels[2]
        m = PsiStatModel('psi2', X=self.X, X_variance=self.X_var, Z=self.Z,
                         num_inducing=self.num_inducing, kernel=k)
        assert m.checkgrad(), "{} x psi2".format("+".join(map(lambda x: x.name, k.parts)))


if __name__ == "__main__":
    import sys
    interactive = 'i' in sys.argv
    if interactive:
#        N, num_inducing, input_dim, input_dim = 30, 5, 4, 30
#        X = numpy.random.rand(N, input_dim)
#        k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
#        K = k.K(X)
#        Y = numpy.random.multivariate_normal(numpy.zeros(N), K, input_dim).T
#        Y -= Y.mean(axis=0)
#        k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
#        m = GPy.models.Bayesian_GPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
#        m.randomize()
#        # self.assertTrue(m.checkgrad())
        numpy.random.seed(0)
        input_dim = 5
        N = 50
        num_inducing = 10
        D = 15
        X = numpy.random.randn(N, input_dim)
        X_var = .5 * numpy.ones_like(X) + .1 * numpy.clip(numpy.random.randn(*X.shape), 0, 1)
        Z = numpy.random.permutation(X)[:num_inducing]
        Y = X.dot(numpy.random.randn(input_dim, D))
#        kernel = GPy.kern.bias(input_dim)
#
#        kernels = [GPy.kern.linear(input_dim), GPy.kern.rbf(input_dim), GPy.kern.bias(input_dim),
#                   GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim),
#                   GPy.kern.rbf(input_dim) + GPy.kern.bias(input_dim)]
#        for k in kernels:
#            m = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
#                             num_inducing=num_inducing, kernel=k)
#            assert m.checkgrad(), "{} x psi1".format("+".join(map(lambda x: x.name, k.parts)))
#
#        m0 = PsiStatModel('psi0', X=X, X_variance=X_var, Z=Z,
#                          num_inducing=num_inducing, kernel=GPy.kern.linear(input_dim))
#        m1 = PsiStatModel('psi1', X=X, X_variance=X_var, Z=Z,
#                          num_inducing=num_inducing, kernel=kernel)
#        m2 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
#                          num_inducing=num_inducing, kernel=GPy.kern.rbf(input_dim))
        m3 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
                          num_inducing=num_inducing, kernel=GPy.kern.linear(input_dim, ARD=True, variances=numpy.random.rand(input_dim)))
#                          + GPy.kern.bias(input_dim))
#        m4 = PsiStatModel('psi2', X=X, X_variance=X_var, Z=Z,
#                          num_inducing=num_inducing, kernel=GPy.kern.rbf(input_dim) + GPy.kern.bias(input_dim))
    else:
        unittest.main()

View file

@@ -0,0 +1,45 @@
# Copyright (c) 2012, Nicolo Fusi, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import unittest
import numpy as np
import GPy
from GPy.models.sparse_gplvm import SparseGPLVM


class sparse_GPLVMTests(unittest.TestCase):
    def test_bias_kern(self):
        N, num_inducing, input_dim, D = 10, 3, 2, 4
        X = np.random.rand(N, input_dim)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
        Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
        k = GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = SparseGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.randomize()
        self.assertTrue(m.checkgrad())

    def test_linear_kern(self):
        N, num_inducing, input_dim, D = 10, 3, 2, 4
        X = np.random.rand(N, input_dim)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
        Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
        k = GPy.kern.linear(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = SparseGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.randomize()
        self.assertTrue(m.checkgrad())

    def test_rbf_kern(self):
        N, num_inducing, input_dim, D = 10, 3, 2, 4
        X = np.random.rand(N, input_dim)
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        K = k.K(X)
        Y = np.random.multivariate_normal(np.zeros(N), K, input_dim).T
        k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
        m = SparseGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
        m.randomize()
        self.assertTrue(m.checkgrad())


if __name__ == "__main__":
    print "Running unit tests, please be (very) patient..."
    unittest.main()

Some files were not shown because too many files have changed in this diff.