Mirror of https://github.com/SheffieldML/GPy.git (synced 2026-05-10 12:32:40 +02:00)
Huge merge correcting upstream master
Commit 34932f8746: 319 changed files with 26201 additions and 26660 deletions
.travis.yml (21 changes)
@@ -2,14 +2,14 @@ language: python
python:
- "2.7"
#Set virtual env with system-site-packages to true
virtualenv:
system_site_packages: true
# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
before_install:
- sudo apt-get install -qq python-scipy python-pip
- sudo apt-get install -qq python-matplotlib
#Install a mini version of anaconda such that we can easily install our dependencies
- wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
- chmod +x miniconda.sh
- ./miniconda.sh -b
- export PATH=/home/travis/miniconda/bin:$PATH
- conda update --yes conda
# Workaround for a permissions issue with Travis virtual machine images
# that breaks Python's multiprocessing:
# https://github.com/travis-ci/travis-cookbooks/issues/155
@@ -17,11 +17,10 @@ before_install:
- sudo ln -s /run/shm /dev/shm

install:
- pip install --upgrade numpy==1.7.1
- pip install sphinx
- pip install nose
- pip install . --use-mirrors
- conda install --yes python=$TRAVIS_PYTHON_VERSION atlas numpy=1.7 scipy=0.12 matplotlib nose sphinx pip nose
- pip install .
#--use-mirrors
#
# command to run tests, e.g. python setup.py test
script:
- nosetests GPy/testing
#- yes | nosetests GPy/testing
@@ -1,8 +0,0 @@
Frequently Asked Questions
--------------------------

Unit tests are run through Travis CI. They can be run locally by entering the GPy root directory and running

nosetests testing/
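As an alternative to the shell command above, the test suite can also be invoked from Python through the tests() helper defined in GPy/__init__.py (shown below); this is a minimal sketch assuming GPy is importable:

    import GPy
    GPy.tests()  # runs the GPy/testing suite through numpy's Tester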
Documentation is handled by Sphinx. To build the documentation:

GPy/__init__.py
@@ -2,15 +2,10 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import os


def read(fname):
with open(os.path.join(os.path.dirname(__file__), fname)) as f:
return f.read()
__version__ = read('version')

import core
from core.parameterization import transformations, priors
constraints = transformations
import models
import mappings
import inference
@@ -19,27 +14,36 @@ import examples
import likelihoods
import testing
from numpy.testing import Tester
from nose.tools import nottest
import kern
from core import priors
import plotting

@nottest
def tests():
# Direct imports for convenience:
from core import Model
from core.parameterization import Param, Parameterized, ObsAr

#@nottest
try:
#Get rid of nose dependency by only ignoring if you have nose installed
from nose.tools import nottest
@nottest
def tests():
Tester(testing).test(verbose=10)
except:
def tests():
Tester(testing).test(verbose=10)

if os.name == 'nt':
def load(file_path):
"""
Fortran seems to like to intercept keyboard interrupts on windows.
This means that when a model is optimizing and the user presses Ctrl-C,
the program will crash. Since it's kind of nice to be able to stop
the optimization at any time, we define our own handler below.
Load a previously pickled model, using `m.pickle('path/to/file.pickle')`

:param file_path: path/to/file.pickle
"""
import win32api
import thread

def handler(sig, hook=thread.interrupt_main):
hook()
return 1

win32api.SetConsoleCtrlHandler(handler, 1)
import cPickle as pickle
try:
with open(file_path, 'rb') as f:
m = pickle.load(f)
except:
import pickle as pickle
with open(file_path, 'rb') as f:
m = pickle.load(f)
return m
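A minimal sketch of the save/load round trip that the load() helper above supports (the path and the model object m are illustrative, and on non-Windows platforms loading may go through a different code path than the Windows-only load() defined above):

    import GPy
    # m is an already-fitted GPy model; m.pickle writes it to disk
    m.pickle('path/to/file.pickle')
    # restore it later
    m2 = GPy.load('path/to/file.pickle')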
@@ -1,10 +0,0 @@
In this text document we describe the coding conventions used in GPy, to keep things consistent.

All arrays containing data are two dimensional. The first dimension is the number of data points, the second dimension is the number of features. This keeps things consistent with the idea of a design matrix.

Input matrices are either X or t, output matrices are Y.

Input dimensionality is input_dim, output dimensionality is output_dim, and the number of data points is num_data.

Data sets are preprocessed in the datasets.py file. This file also records where each data set was obtained, in a dictionary stored in the file. Long term, we should move this dictionary to sqlite or similar.
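A tiny sketch of the array-shape convention described above (the sizes are arbitrary examples):

    import numpy as np
    num_data, input_dim, output_dim = 100, 3, 1
    X = np.random.rand(num_data, input_dim)    # inputs: one row per data point
    Y = np.random.randn(num_data, output_dim)  # outputs: always 2D, even for a single output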
GPy/core/__init__.py
@@ -1,11 +1,11 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)

from model import *
from parameterized import *
import priors
from parameterization.parameterized import adjust_name_for_printing, Parameterizable
from parameterization.param import Param, ParamConcatenation
from parameterization.observable_array import ObsAr

from gp import GP
from sparse_gp import SparseGP
from fitc import FITC
from svigp import SVIGP
from mapping import *
GPy/core/domains.py
@@ -1,26 +0,0 @@
'''
Created on 4 Jun 2013

@author: maxz

(Hyper-)Parameter domains defined for :py:mod:`~GPy.core.priors` and :py:mod:`~GPy.kern`.
These domains specify the legitimate realm of the parameters to live in.

:const:`~GPy.core.domains.REAL` :
real domain, all values in the real numbers are allowed

:const:`~GPy.core.domains.POSITIVE`:
positive domain, only positive real values are allowed

:const:`~GPy.core.domains.NEGATIVE`:
same as :const:`~GPy.core.domains.POSITIVE`, but only negative values are allowed

:const:`~GPy.core.domains.BOUNDED`:
only values within the bounded range are allowed,
the bounds are specified within the object with the bounded range
'''

REAL = 'real'
POSITIVE = "positive"
NEGATIVE = 'negative'
BOUNDED = 'bounded'
GPy/core/fitc.py (248 changes)
@@ -1,248 +0,0 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
import pylab as pb
|
||||
from ..util.linalg import mdot, jitchol, chol_inv, tdot, symmetrify, pdinv, dtrtrs
|
||||
from ..util.plot import gpplot
|
||||
from .. import kern
|
||||
from scipy import stats
|
||||
from sparse_gp import SparseGP
|
||||
|
||||
class FITC(SparseGP):
|
||||
"""
|
||||
|
||||
Sparse FITC approximation
|
||||
|
||||
:param X: inputs
|
||||
:type X: np.ndarray (num_data x Q)
|
||||
:param likelihood: a likelihood instance, containing the observed data
|
||||
:type likelihood: GPy.likelihood.(Gaussian | EP)
|
||||
:param kernel: the kernel (covariance function). See link kernels
|
||||
:type kernel: a GPy.kern.kern instance
|
||||
:param Z: inducing inputs (optional, see note)
|
||||
:type Z: np.ndarray (M x Q) | None
|
||||
:param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales)
|
||||
:type normalize_(X|Y): bool
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, X, likelihood, kernel, Z, normalize_X=False):
|
||||
SparseGP.__init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False)
|
||||
assert self.output_dim == 1, "FITC model is not defined for handling multiple outputs"
|
||||
|
||||
def update_likelihood_approximation(self, **kwargs):
|
||||
"""
|
||||
Approximates a non-Gaussian likelihood using Expectation Propagation
|
||||
|
||||
For a Gaussian likelihood, no iteration is required:
|
||||
this function does nothing
|
||||
"""
|
||||
self.likelihood.restart()
|
||||
self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0, **kwargs)
|
||||
self._set_params(self._get_params())
|
||||
|
||||
def _compute_kernel_matrices(self):
|
||||
# kernel computations, using BGPLVM notation
|
||||
self.Kmm = self.kern.K(self.Z)
|
||||
self.psi0 = self.kern.Kdiag(self.X)
|
||||
self.psi1 = self.kern.K(self.Z, self.X)
|
||||
self.psi2 = None
|
||||
|
||||
def _computations(self):
|
||||
#factor Kmm
|
||||
self.Lm = jitchol(self.Kmm)
|
||||
self.Lmi,info = dtrtrs(self.Lm,np.eye(self.num_inducing),lower=1)
|
||||
Lmipsi1 = np.dot(self.Lmi,self.psi1)
|
||||
self.Qnn = np.dot(Lmipsi1.T,Lmipsi1).copy()
|
||||
self.Diag0 = self.psi0 - np.diag(self.Qnn)
|
||||
self.beta_star = self.likelihood.precision/(1. + self.likelihood.precision*self.Diag0[:,None]) #NOTE: beta_star contains Diag0 and the precision
|
||||
self.V_star = self.beta_star * self.likelihood.Y
|
||||
|
||||
# The rather complex computations of self.A
|
||||
tmp = self.psi1 * (np.sqrt(self.beta_star.flatten().reshape(1, self.num_data)))
|
||||
tmp, _ = dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1)
|
||||
self.A = tdot(tmp)
|
||||
|
||||
# factor B
|
||||
self.B = np.eye(self.num_inducing) + self.A
|
||||
self.LB = jitchol(self.B)
|
||||
self.LBi = chol_inv(self.LB)
|
||||
self.psi1V = np.dot(self.psi1, self.V_star)
|
||||
|
||||
Lmi_psi1V, info = dtrtrs(self.Lm, np.asfortranarray(self.psi1V), lower=1, trans=0)
|
||||
self._LBi_Lmi_psi1V, _ = dtrtrs(self.LB, np.asfortranarray(Lmi_psi1V), lower=1, trans=0)
|
||||
|
||||
Kmmipsi1 = np.dot(self.Lmi.T,Lmipsi1)
|
||||
b_psi1_Ki = self.beta_star * Kmmipsi1.T
|
||||
Ki_pbp_Ki = np.dot(Kmmipsi1,b_psi1_Ki)
|
||||
Kmmi = np.dot(self.Lmi.T,self.Lmi)
|
||||
LBiLmi = np.dot(self.LBi,self.Lmi)
|
||||
LBL_inv = np.dot(LBiLmi.T,LBiLmi)
|
||||
VVT = np.outer(self.V_star,self.V_star)
|
||||
VV_p_Ki = np.dot(VVT,Kmmipsi1.T)
|
||||
Ki_pVVp_Ki = np.dot(Kmmipsi1,VV_p_Ki)
|
||||
psi1beta = self.psi1*self.beta_star.T
|
||||
H = self.Kmm + mdot(self.psi1,psi1beta.T)
|
||||
LH = jitchol(H)
|
||||
LHi = chol_inv(LH)
|
||||
Hi = np.dot(LHi.T,LHi)
|
||||
|
||||
betapsi1TLmiLBi = np.dot(psi1beta.T,LBiLmi.T)
|
||||
alpha = np.array([np.dot(a.T,a) for a in betapsi1TLmiLBi])[:,None]
|
||||
gamma_1 = mdot(VVT,self.psi1.T,Hi)
|
||||
pHip = mdot(self.psi1.T,Hi,self.psi1)
|
||||
gamma_2 = mdot(self.beta_star*pHip,self.V_star)
|
||||
gamma_3 = self.V_star * gamma_2
|
||||
|
||||
self._dL_dpsi0 = -0.5 * self.beta_star#dA_dpsi0: logdet(self.beta_star)
|
||||
self._dL_dpsi0 += .5 * self.V_star**2 #dA_psi0: yT*beta_star*y
|
||||
self._dL_dpsi0 += .5 *alpha #dC_dpsi0
|
||||
self._dL_dpsi0 += 0.5*mdot(self.beta_star*pHip,self.V_star)**2 - self.V_star * mdot(self.V_star.T,pHip*self.beta_star).T #dD_dpsi0
|
||||
|
||||
self._dL_dpsi1 = b_psi1_Ki.copy() #dA_dpsi1: logdet(self.beta_star)
|
||||
self._dL_dpsi1 += -np.dot(psi1beta.T,LBL_inv) #dC_dpsi1
|
||||
self._dL_dpsi1 += gamma_1 - mdot(psi1beta.T,Hi,self.psi1,gamma_1) #dD_dpsi1
|
||||
|
||||
self._dL_dKmm = -0.5 * np.dot(Kmmipsi1,b_psi1_Ki) #dA_dKmm: logdet(self.beta_star)
|
||||
self._dL_dKmm += .5*(LBL_inv - Kmmi) + mdot(LBL_inv,psi1beta,Kmmipsi1.T) #dC_dKmm
|
||||
self._dL_dKmm += -.5 * mdot(Hi,self.psi1,gamma_1) #dD_dKmm
|
||||
|
||||
self._dpsi1_dtheta = 0
|
||||
self._dpsi1_dX = 0
|
||||
self._dKmm_dtheta = 0
|
||||
self._dKmm_dX = 0
|
||||
|
||||
self._dpsi1_dX_jkj = 0
|
||||
self._dpsi1_dtheta_jkj = 0
|
||||
|
||||
for i,V_n,alpha_n,gamma_n,gamma_k in zip(range(self.num_data),self.V_star,alpha,gamma_2,gamma_3):
|
||||
K_pp_K = np.dot(Kmmipsi1[:,i:(i+1)],Kmmipsi1[:,i:(i+1)].T)
|
||||
_dpsi1 = (-V_n**2 - alpha_n + 2.*gamma_k - gamma_n**2) * Kmmipsi1.T[i:(i+1),:]
|
||||
_dKmm = .5*(V_n**2 + alpha_n + gamma_n**2 - 2.*gamma_k) * K_pp_K #Diag_dD_dKmm
|
||||
self._dpsi1_dtheta += self.kern.dK_dtheta(_dpsi1,self.X[i:i+1,:],self.Z)
|
||||
self._dKmm_dtheta += self.kern.dK_dtheta(_dKmm,self.Z)
|
||||
self._dKmm_dX += self.kern.dK_dX(_dKmm ,self.Z)
|
||||
self._dpsi1_dX += self.kern.dK_dX(_dpsi1.T,self.Z,self.X[i:i+1,:])
|
||||
|
||||
# the partial derivative vector for the likelihood
|
||||
if self.likelihood.num_params == 0:
|
||||
# save computation here.
|
||||
self.partial_for_likelihood = None
|
||||
elif self.likelihood.is_heteroscedastic:
|
||||
raise NotImplementedError, "heteroscedastic derivatives not implemented."
|
||||
else:
|
||||
# likelihood is not heteroscedastic
|
||||
dbstar_dnoise = self.likelihood.precision * (self.beta_star**2 * self.Diag0[:,None] - self.beta_star)
|
||||
Lmi_psi1 = mdot(self.Lmi,self.psi1)
|
||||
LBiLmipsi1 = np.dot(self.LBi,Lmi_psi1)
|
||||
aux_0 = np.dot(self._LBi_Lmi_psi1V.T,LBiLmipsi1)
|
||||
aux_1 = self.likelihood.Y.T * np.dot(self._LBi_Lmi_psi1V.T,LBiLmipsi1)
|
||||
aux_2 = np.dot(LBiLmipsi1.T,self._LBi_Lmi_psi1V)
|
||||
|
||||
dA_dnoise = 0.5 * self.input_dim * (dbstar_dnoise/self.beta_star).sum() - 0.5 * self.input_dim * np.sum(self.likelihood.Y**2 * dbstar_dnoise)
|
||||
dC_dnoise = -0.5 * np.sum(mdot(self.LBi.T,self.LBi,Lmi_psi1) * Lmi_psi1 * dbstar_dnoise.T)
|
||||
|
||||
dD_dnoise_1 = mdot(self.V_star*LBiLmipsi1.T,LBiLmipsi1*dbstar_dnoise.T*self.likelihood.Y.T)
|
||||
alpha = mdot(LBiLmipsi1,self.V_star)
|
||||
alpha_ = mdot(LBiLmipsi1.T,alpha)
|
||||
dD_dnoise_2 = -0.5 * self.input_dim * np.sum(alpha_**2 * dbstar_dnoise )
|
||||
|
||||
dD_dnoise_1 = mdot(self.V_star.T,self.psi1.T,self.Lmi.T,self.LBi.T,self.LBi,self.Lmi,self.psi1,dbstar_dnoise*self.likelihood.Y)
|
||||
dD_dnoise_2 = 0.5*mdot(self.V_star.T,self.psi1.T,Hi,self.psi1,dbstar_dnoise*self.psi1.T,Hi,self.psi1,self.V_star)
|
||||
dD_dnoise = dD_dnoise_1 + dD_dnoise_2
|
||||
|
||||
self.partial_for_likelihood = dA_dnoise + dC_dnoise + dD_dnoise
|
||||
|
||||
def log_likelihood(self):
|
||||
""" Compute the (lower bound on the) log marginal likelihood """
|
||||
A = -0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.beta_star)) - 0.5 * np.sum(self.V_star * self.likelihood.Y)
|
||||
C = -self.output_dim * (np.sum(np.log(np.diag(self.LB))))
|
||||
D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V))
|
||||
return A + C + D + self.likelihood.Z
|
||||
|
||||
def _log_likelihood_gradients(self):
|
||||
pass
|
||||
return np.hstack((self.dL_dZ().flatten(), self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))
|
||||
|
||||
def dL_dtheta(self):
|
||||
dL_dtheta = self.kern.dKdiag_dtheta(self._dL_dpsi0,self.X)
|
||||
dL_dtheta += self.kern.dK_dtheta(self._dL_dpsi1,self.X,self.Z)
|
||||
dL_dtheta += self.kern.dK_dtheta(self._dL_dKmm,X=self.Z)
|
||||
dL_dtheta += self._dKmm_dtheta
|
||||
dL_dtheta += self._dpsi1_dtheta
|
||||
return dL_dtheta
|
||||
|
||||
def dL_dZ(self):
|
||||
dL_dZ = self.kern.dK_dX(self._dL_dpsi1.T,self.Z,self.X)
|
||||
dL_dZ += self.kern.dK_dX(self._dL_dKmm,X=self.Z)
|
||||
dL_dZ += self._dpsi1_dX
|
||||
dL_dZ += self._dKmm_dX
|
||||
return dL_dZ
|
||||
|
||||
def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
|
||||
assert X_variance_new is None, "FITC model is not defined for handling uncertain inputs."
|
||||
|
||||
if self.likelihood.is_heteroscedastic:
|
||||
Iplus_Dprod_i = 1./(1.+ self.Diag0 * self.likelihood.precision.flatten())
|
||||
self.Diag = self.Diag0 * Iplus_Dprod_i
|
||||
self.P = Iplus_Dprod_i[:,None] * self.psi1.T
|
||||
self.RPT0 = np.dot(self.Lmi,self.psi1)
|
||||
self.L = np.linalg.cholesky(np.eye(self.num_inducing) + np.dot(self.RPT0,((1. - Iplus_Dprod_i)/self.Diag0)[:,None]*self.RPT0.T))
|
||||
self.R,info = dtrtrs(self.L,self.Lmi,lower=1)
|
||||
self.RPT = np.dot(self.R,self.P.T)
|
||||
self.Sigma = np.diag(self.Diag) + np.dot(self.RPT.T,self.RPT)
|
||||
self.w = self.Diag * self.likelihood.v_tilde
|
||||
self.Gamma = np.dot(self.R.T, np.dot(self.RPT,self.likelihood.v_tilde))
|
||||
self.mu = self.w + np.dot(self.P,self.Gamma)
|
||||
|
||||
"""
|
||||
Make a prediction for the generalized FITC model
|
||||
|
||||
Arguments
|
||||
---------
|
||||
X : Input prediction data - Nx1 numpy array (floats)
|
||||
"""
|
||||
# q(u|f) = N(u| R0i*mu_u*f, R0i*C*R0i.T)
|
||||
|
||||
# Ci = I + (RPT0)Di(RPT0).T
|
||||
# C = I - [RPT0] * (input_dim+[RPT0].T*[RPT0])^-1*[RPT0].T
|
||||
# = I - [RPT0] * (input_dim + self.Qnn)^-1 * [RPT0].T
|
||||
# = I - [RPT0] * (U*U.T)^-1 * [RPT0].T
|
||||
# = I - V.T * V
|
||||
U = np.linalg.cholesky(np.diag(self.Diag0) + self.Qnn)
|
||||
V,info = dtrtrs(U,self.RPT0.T,lower=1)
|
||||
C = np.eye(self.num_inducing) - np.dot(V.T,V)
|
||||
mu_u = np.dot(C,self.RPT0)*(1./self.Diag0[None,:])
|
||||
#self.C = C
|
||||
#self.RPT0 = np.dot(self.R0,self.Knm.T) P0.T
|
||||
#self.mu_u = mu_u
|
||||
#self.U = U
|
||||
# q(u|y) = N(u| R0i*mu_H,R0i*Sigma_H*R0i.T)
|
||||
mu_H = np.dot(mu_u,self.mu)
|
||||
self.mu_H = mu_H
|
||||
Sigma_H = C + np.dot(mu_u,np.dot(self.Sigma,mu_u.T))
|
||||
# q(f_star|y) = N(f_star|mu_star,sigma2_star)
|
||||
Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
|
||||
KR0T = np.dot(Kx.T,self.Lmi.T)
|
||||
mu_star = np.dot(KR0T,mu_H)
|
||||
if full_cov:
|
||||
Kxx = self.kern.K(Xnew,which_parts=which_parts)
|
||||
var = Kxx + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.num_inducing),KR0T.T))
|
||||
else:
|
||||
Kxx = self.kern.Kdiag(Xnew,which_parts=which_parts)
|
||||
var = (Kxx + np.sum(KR0T.T*np.dot(Sigma_H - np.eye(self.num_inducing),KR0T.T),0))[:,None]
|
||||
return mu_star[:,None],var
|
||||
else:
|
||||
raise NotImplementedError, "Heteroscedastic case not implemented."
|
||||
"""
|
||||
Kx = self.kern.K(self.Z, Xnew)
|
||||
mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V)
|
||||
if full_cov:
|
||||
Kxx = self.kern.K(Xnew)
|
||||
var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) #NOTE this won't work for plotting
|
||||
else:
|
||||
Kxx = self.kern.Kdiag(Xnew)
|
||||
var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0)
|
||||
return mu,var[:,None]
|
||||
"""
|
||||
GPy/core/gp.py (547 changes)
@@ -1,204 +1,459 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
import pylab as pb
|
||||
import sys
|
||||
from .. import kern
|
||||
from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs
|
||||
from ..likelihoods import EP, Laplace
|
||||
from gp_base import GPBase
|
||||
from model import Model
|
||||
from parameterization import ObsAr
|
||||
from .. import likelihoods
|
||||
from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
|
||||
from parameterization.variational import VariationalPosterior
|
||||
|
||||
class GP(GPBase):
|
||||
import logging
|
||||
from GPy.util.normalizer import MeanNorm
|
||||
logger = logging.getLogger("GP")
|
||||
|
||||
class GP(Model):
|
||||
"""
|
||||
Gaussian Process model for regression and EP
|
||||
General purpose Gaussian process model
|
||||
|
||||
:param X: input observations
|
||||
:param Y: output observations
|
||||
:param kernel: a GPy kernel, defaults to rbf+white
|
||||
:param likelihood: a GPy likelihood
|
||||
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
|
||||
:type normalize_X: False|True
|
||||
:param inference_method: The :class:`~GPy.inference.latent_function_inference.LatentFunctionInference` inference method to use for this GP
|
||||
:rtype: model object
|
||||
:param Norm normalizer:
|
||||
normalize the outputs Y.
|
||||
Prediction will be un-normalized using this normalizer.
|
||||
If normalizer is None, we will normalize using MeanNorm.
|
||||
If normalizer is False, no normalization will be done.
|
||||
|
||||
.. Note:: Multiple independent outputs are allowed using columns of Y
|
||||
|
||||
|
||||
"""
|
||||
def __init__(self, X, likelihood, kernel, normalize_X=False):
|
||||
GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
|
||||
self.update_likelihood_approximation()
|
||||
def __init__(self, X, Y, kernel, likelihood, inference_method=None, name='gp', Y_metadata=None, normalizer=False):
|
||||
super(GP, self).__init__(name)
|
||||
|
||||
assert X.ndim == 2
|
||||
if isinstance(X, (ObsAr, VariationalPosterior)):
|
||||
self.X = X.copy()
|
||||
else: self.X = ObsAr(X)
|
||||
|
||||
def _set_params(self, p):
|
||||
new_kern_params = p[:self.kern.num_params_transformed()]
|
||||
new_likelihood_params = p[self.kern.num_params_transformed():]
|
||||
old_likelihood_params = self.likelihood._get_params()
|
||||
self.num_data, self.input_dim = self.X.shape
|
||||
|
||||
self.kern._set_params_transformed(new_kern_params)
|
||||
self.likelihood._set_params_transformed(new_likelihood_params)
|
||||
assert Y.ndim == 2
|
||||
logger.info("initializing Y")
|
||||
|
||||
self.K = self.kern.K(self.X)
|
||||
|
||||
#Re fit likelihood approximation (if it is an approx), as parameters have changed
|
||||
if isinstance(self.likelihood, Laplace):
|
||||
self.likelihood.fit_full(self.K)
|
||||
|
||||
self.K += self.likelihood.covariance_matrix
|
||||
|
||||
self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
|
||||
|
||||
# the gradient of the likelihood wrt the covariance matrix
|
||||
if self.likelihood.YYT is None:
|
||||
# alpha = np.dot(self.Ki, self.likelihood.Y)
|
||||
alpha, _ = dpotrs(self.L, self.likelihood.Y, lower=1)
|
||||
|
||||
self.dL_dK = 0.5 * (tdot(alpha) - self.output_dim * self.Ki)
|
||||
if normalizer is True:
|
||||
self.normalizer = MeanNorm()
|
||||
elif normalizer is False:
|
||||
self.normalizer = None
|
||||
else:
|
||||
# tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki)
|
||||
tmp, _ = dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1)
|
||||
tmp, _ = dpotrs(self.L, np.asfortranarray(tmp.T), lower=1)
|
||||
self.dL_dK = 0.5 * (tmp - self.output_dim * self.Ki)
|
||||
self.normalizer = normalizer
|
||||
|
||||
#Adding dZ_dK (0 for a non-approximate likelihood, compensates for
|
||||
#additional gradients of K when log-likelihood has non-zero Z term)
|
||||
self.dL_dK += self.likelihood.dZ_dK
|
||||
|
||||
def _get_params(self):
|
||||
return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params()))
|
||||
|
||||
def _get_param_names(self):
|
||||
return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
|
||||
|
||||
def update_likelihood_approximation(self, **kwargs):
|
||||
"""
|
||||
Approximates a non-gaussian likelihood using Expectation Propagation
|
||||
|
||||
For a Gaussian likelihood, no iteration is required:
|
||||
this function does nothing
|
||||
"""
|
||||
self.likelihood.restart()
|
||||
self.likelihood.fit_full(self.kern.K(self.X), **kwargs)
|
||||
self._set_params(self._get_params()) # update the GP
|
||||
|
||||
def _model_fit_term(self):
|
||||
"""
|
||||
Computes the model fit using YYT if it's available
|
||||
"""
|
||||
if self.likelihood.YYT is None:
|
||||
tmp, _ = dtrtrs(self.L, np.asfortranarray(self.likelihood.Y), lower=1)
|
||||
return -0.5 * np.sum(np.square(tmp))
|
||||
# return -0.5 * np.sum(np.square(np.dot(self.Li, self.likelihood.Y)))
|
||||
if self.normalizer is not None:
|
||||
self.normalizer.scale_by(Y)
|
||||
self.Y_normalized = ObsAr(self.normalizer.normalize(Y))
|
||||
self.Y = Y
|
||||
else:
|
||||
return -0.5 * np.sum(np.multiply(self.Ki, self.likelihood.YYT))
|
||||
self.Y = ObsAr(Y)
|
||||
self.Y_normalized = self.Y
|
||||
|
||||
assert Y.shape[0] == self.num_data
|
||||
_, self.output_dim = self.Y.shape
|
||||
|
||||
#TODO: check the type of this is okay?
|
||||
self.Y_metadata = Y_metadata
|
||||
|
||||
assert isinstance(kernel, kern.Kern)
|
||||
#assert self.input_dim == kernel.input_dim
|
||||
self.kern = kernel
|
||||
|
||||
assert isinstance(likelihood, likelihoods.Likelihood)
|
||||
self.likelihood = likelihood
|
||||
|
||||
#find a sensible inference method
|
||||
logger.info("initializing inference method")
|
||||
if inference_method is None:
|
||||
if isinstance(likelihood, likelihoods.Gaussian) or isinstance(likelihood, likelihoods.MixedNoise):
|
||||
inference_method = exact_gaussian_inference.ExactGaussianInference()
|
||||
else:
|
||||
inference_method = expectation_propagation.EP()
|
||||
print "defaulting to ", inference_method, "for latent function inference"
|
||||
self.inference_method = inference_method
|
||||
|
||||
logger.info("adding kernel and likelihood as parameters")
|
||||
self.link_parameter(self.kern)
|
||||
self.link_parameter(self.likelihood)
|
||||
|
||||
def set_XY(self, X=None, Y=None):
|
||||
"""
|
||||
Set the input / output data of the model
|
||||
This is useful if we wish to change our existing data but maintain the same model
|
||||
|
||||
:param X: input observations
|
||||
:type X: np.ndarray
|
||||
:param Y: output observations
|
||||
:type Y: np.ndarray
|
||||
"""
|
||||
self.update_model(False)
|
||||
if Y is not None:
|
||||
if self.normalizer is not None:
|
||||
self.normalizer.scale_by(Y)
|
||||
self.Y_normalized = ObsAr(self.normalizer.normalize(Y))
|
||||
self.Y = Y
|
||||
else:
|
||||
self.Y = ObsAr(Y)
|
||||
self.Y_normalized = self.Y
|
||||
if X is not None:
|
||||
if self.X in self.parameters:
|
||||
# LVM models
|
||||
if isinstance(self.X, VariationalPosterior):
|
||||
assert isinstance(X, type(self.X)), "The given X must have the same type as the X in the model!"
|
||||
self.unlink_parameter(self.X)
|
||||
self.X = X
|
||||
self.link_parameters(self.X)
|
||||
else:
|
||||
self.unlink_parameter(self.X)
|
||||
from ..core import Param
|
||||
self.X = Param('latent mean',X)
|
||||
self.link_parameters(self.X)
|
||||
else:
|
||||
self.X = ObsAr(X)
|
||||
self.update_model(True)
|
||||
|
||||
def set_X(self,X):
|
||||
"""
|
||||
Set the input data of the model
|
||||
|
||||
:param X: input observations
|
||||
:type X: np.ndarray
|
||||
"""
|
||||
self.set_XY(X=X)
|
||||
|
||||
def set_Y(self,Y):
|
||||
"""
|
||||
Set the output data of the model
|
||||
|
||||
:param X: output observations
|
||||
:type X: np.ndarray
|
||||
"""
|
||||
self.set_XY(Y=Y)
|
||||
|
||||
def parameters_changed(self):
|
||||
"""
|
||||
Method that is called upon any changes to :class:`~GPy.core.parameterization.param.Param` variables within the model.
|
||||
In particular in the GP class this method reperforms inference, recalculating the posterior and log marginal likelihood and gradients of the model
|
||||
|
||||
.. warning::
|
||||
This method is not designed to be called manually, the framework is set up to automatically call this method upon changes to parameters, if you call
|
||||
this method yourself, there may be unexpected consequences.
|
||||
"""
|
||||
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.Y_metadata)
|
||||
self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
|
||||
self.kern.update_gradients_full(self.grad_dict['dL_dK'], self.X)
|
||||
|
||||
def log_likelihood(self):
|
||||
"""
|
||||
The log marginal likelihood of the GP.
|
||||
The log marginal likelihood of the model, :math:`p(\mathbf{y})`, this is the objective function of the model being optimised
|
||||
"""
|
||||
return self._log_marginal_likelihood
|
||||
|
||||
For an EP model, can be written as the log likelihood of a regression
|
||||
model for a new variable Y* = v_tilde/tau_tilde, with a covariance
|
||||
matrix K* = K + diag(1./tau_tilde) plus a normalization term.
|
||||
def _raw_predict(self, _Xnew, full_cov=False, kern=None):
|
||||
"""
|
||||
return (-0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) -
|
||||
0.5 * self.output_dim * self.K_logdet + self._model_fit_term() + self.likelihood.Z)
|
||||
For making predictions, does not account for normalization or likelihood
|
||||
|
||||
def _log_likelihood_gradients(self):
|
||||
"""
|
||||
The gradient of all parameters.
|
||||
full_cov is a boolean which defines whether the full covariance matrix
|
||||
of the prediction is computed. If full_cov is False (default), only the
|
||||
diagonal of the covariance is returned.
|
||||
|
||||
Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
|
||||
.. math::
|
||||
p(f*|X*, X, Y) = \int_{-\infty}^{\infty} p(f*|f,X*)p(f|X,Y) df
= N(f*| K_{x*x}(K_{xx} + \Sigma)^{-1}Y, K_{x*x*} - K_{xx*}(K_{xx} + \Sigma)^{-1}K_{xx*})
|
||||
\Sigma := \texttt{Likelihood.variance / Approximate likelihood covariance}
|
||||
"""
|
||||
return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
|
||||
if kern is None:
|
||||
kern = self.kern
|
||||
|
||||
def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
|
||||
"""
|
||||
Internal helper function for making predictions, does not account
|
||||
for normalization or likelihood
|
||||
"""
|
||||
Kx = self.kern.K(_Xnew, self.X, which_parts=which_parts).T
|
||||
# KiKx = np.dot(self.Ki, Kx)
|
||||
KiKx, _ = dpotrs(self.L, np.asfortranarray(Kx), lower=1)
|
||||
mu = np.dot(KiKx.T, self.likelihood.Y)
|
||||
Kx = kern.K(_Xnew, self.X).T
|
||||
WiKx = np.dot(self.posterior.woodbury_inv, Kx)
|
||||
mu = np.dot(Kx.T, self.posterior.woodbury_vector)
|
||||
if full_cov:
|
||||
Kxx = self.kern.K(_Xnew, which_parts=which_parts)
|
||||
var = Kxx - np.dot(KiKx.T, Kx)
|
||||
Kxx = kern.K(_Xnew)
|
||||
var = Kxx - np.dot(Kx.T, WiKx)
|
||||
else:
|
||||
Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
|
||||
var = Kxx - np.sum(np.multiply(KiKx, Kx), 0)
|
||||
var = var[:, None]
|
||||
if stop:
|
||||
debug_this # @UndefinedVariable
|
||||
Kxx = kern.Kdiag(_Xnew)
|
||||
var = Kxx - np.sum(WiKx*Kx, 0)
|
||||
var = var.reshape(-1, 1)
|
||||
|
||||
#force mu to be a column vector
|
||||
if len(mu.shape)==1: mu = mu[:,None]
|
||||
return mu, var
|
||||
|
||||
def predict(self, Xnew, which_parts='all', full_cov=False, **likelihood_args):
|
||||
def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None):
|
||||
"""
|
||||
Predict the function(s) at the new point(s) Xnew.
|
||||
|
||||
:param Xnew: The points at which to make a prediction
|
||||
:type Xnew: np.ndarray, Nnew x self.input_dim
|
||||
:param which_parts: specifies which outputs kernel(s) to use in prediction
|
||||
:type which_parts: ('all', list of bools)
|
||||
:param full_cov: whether to return the full covariance matrix, or just the diagonal
|
||||
:type Xnew: np.ndarray (Nnew x self.input_dim)
|
||||
:param full_cov: whether to return the full covariance matrix, or just
|
||||
the diagonal
|
||||
:type full_cov: bool
|
||||
:returns: mean: posterior mean, a Numpy array, Nnew x self.input_dim
|
||||
:returns: var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
|
||||
:returns: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim
|
||||
|
||||
:param Y_metadata: metadata about the predicting point to pass to the likelihood
|
||||
:param kern: The kernel to use for prediction (defaults to the model
|
||||
kern). this is useful for examining e.g. subprocesses.
|
||||
:returns: (mean, var, lower_upper):
|
||||
mean: posterior mean, a Numpy array, Nnew x self.input_dim
|
||||
var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
|
||||
lower_upper: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim
|
||||
|
||||
If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew.
|
||||
This is to allow for different normalizations of the output dimensions.
|
||||
|
||||
"""
|
||||
# normalize X values
|
||||
Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
|
||||
mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts)
|
||||
#predict the latent function values
|
||||
mu, var = self._raw_predict(Xnew, full_cov=full_cov, kern=kern)
|
||||
if self.normalizer is not None:
|
||||
mu, var = self.normalizer.inverse_mean(mu), self.normalizer.inverse_variance(var)
|
||||
|
||||
# now push through likelihood
|
||||
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, **likelihood_args)
|
||||
return mean, var, _025pm, _975pm
|
||||
mean, var = self.likelihood.predictive_values(mu, var, full_cov, Y_metadata)
|
||||
return mean, var
|
||||
|
||||
def _raw_predict_single_output(self, _Xnew, output, which_parts='all', full_cov=False,stop=False):
|
||||
def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None):
|
||||
"""
|
||||
For a specific output, calls _raw_predict() at the new point(s) _Xnew.
|
||||
This function calls _add_output_index(), so _Xnew should not have an index column specifying the output.
|
||||
---------
|
||||
Get the predictive quantiles around the prediction at X
|
||||
|
||||
:param Xnew: The points at which to make a prediction
|
||||
:type Xnew: np.ndarray, Nnew x self.input_dim
|
||||
:param output: output to predict
|
||||
:type output: integer in {0,..., output_dim-1}
|
||||
:param which_parts: specifies which outputs kernel(s) to use in prediction
|
||||
:type which_parts: ('all', list of bools)
|
||||
:param full_cov: whether to return the full covariance matrix, or just the diagonal
|
||||
|
||||
.. Note:: For multiple non-independent outputs models only.
|
||||
:param X: The points at which to make a prediction
|
||||
:type X: np.ndarray (Xnew x self.input_dim)
|
||||
:param quantiles: tuple of quantiles, default is (2.5, 97.5) which is the 95% interval
|
||||
:type quantiles: tuple
|
||||
:returns: list of quantiles for each X and predictive quantiles for interval combination
|
||||
:rtype: [np.ndarray (Xnew x self.input_dim), np.ndarray (Xnew x self.input_dim)]
|
||||
"""
|
||||
_Xnew = self._add_output_index(_Xnew, output)
|
||||
return self._raw_predict(_Xnew, which_parts=which_parts,full_cov=full_cov, stop=stop)
|
||||
m, v = self._raw_predict(X, full_cov=False)
|
||||
if self.normalizer is not None:
|
||||
m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
|
||||
return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata)
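A minimal usage sketch of the predict / predict_quantiles API documented above (the GPRegression model and the toy data are illustrative assumptions, not part of this diff):

    import numpy as np
    import GPy
    X = np.random.rand(20, 1)
    Y = np.sin(X) + 0.05 * np.random.randn(20, 1)
    m = GPy.models.GPRegression(X, Y)          # assumed convenience model: RBF kernel, Gaussian likelihood
    Xtest = np.linspace(0, 1, 50)[:, None]
    mean, var = m.predict(Xtest)               # predictive mean and variance, each (50, 1)
    lower, upper = m.predict_quantiles(Xtest)  # 2.5% and 97.5% quantiles by default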
|
||||
|
||||
def predict_single_output(self, Xnew,output=0, which_parts='all', full_cov=False, likelihood_args=dict()):
|
||||
def predictive_gradients(self, Xnew):
|
||||
"""
|
||||
For a specific output, calls predict() at the new point(s) Xnew.
|
||||
This function calls _add_output_index(), so Xnew should not have an index column specifying the output.
|
||||
Compute the derivatives of the latent function with respect to X*
|
||||
|
||||
:param Xnew: The points at which to make a prediction
|
||||
:type Xnew: np.ndarray, Nnew x self.input_dim
|
||||
:param which_parts: specifies which outputs kernel(s) to use in prediction
|
||||
:type which_parts: ('all', list of bools)
|
||||
:param full_cov: whether to return the full covariance matrix, or just the diagonal
|
||||
:type full_cov: bool
|
||||
:returns: mean: posterior mean, a Numpy array, Nnew x self.input_dim
|
||||
:returns: var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
|
||||
:returns: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim
|
||||
Given a set of points at which to predict X* (size [N*,Q]), compute the
|
||||
derivatives of the mean and variance. Resulting arrays are sized:
|
||||
dmu_dX* -- [N*, Q ,D], where D is the number of outputs in this GP (usually one).
|
||||
|
||||
dv_dX* -- [N*, Q], (since all outputs have the same variance)
|
||||
:param X: The points at which to get the predictive gradients
|
||||
:type X: np.ndarray (Xnew x self.input_dim)
|
||||
:returns: dmu_dX, dv_dX
|
||||
:rtype: [np.ndarray (N*, Q ,D), np.ndarray (N*,Q) ]
|
||||
|
||||
.. Note:: For multiple non-independent outputs models only.
|
||||
"""
|
||||
Xnew = self._add_output_index(Xnew, output)
|
||||
return self.predict(Xnew, which_parts=which_parts, full_cov=full_cov, likelihood_args=likelihood_args)
|
||||
dmu_dX = np.empty((Xnew.shape[0],Xnew.shape[1],self.output_dim))
|
||||
for i in range(self.output_dim):
|
||||
dmu_dX[:,:,i] = self.kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1].T, Xnew, self.X)
|
||||
|
||||
def getstate(self):
|
||||
return GPBase.getstate(self)
|
||||
# gradients wrt the diagonal part k_{xx}
|
||||
dv_dX = self.kern.gradients_X(np.eye(Xnew.shape[0]), Xnew)
|
||||
#grads wrt 'Schur' part K_{xf}K_{ff}^{-1}K_{fx}
|
||||
alpha = -2.*np.dot(self.kern.K(Xnew, self.X),self.posterior.woodbury_inv)
|
||||
dv_dX += self.kern.gradients_X(alpha, Xnew, self.X)
|
||||
return dmu_dX, dv_dX
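Continuing the illustrative model m from the predict sketch above, predictive_gradients returns the derivatives of the latent mean and variance with respect to the test inputs:

    dmu_dX, dv_dX = m.predictive_gradients(Xtest)  # shapes (50, 1, 1) and (50, 1)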
|
||||
|
||||
def setstate(self, state):
|
||||
GPBase.setstate(self, state)
|
||||
self._set_params(self._get_params())
|
||||
|
||||
def posterior_samples_f(self,X,size=10, full_cov=True):
|
||||
"""
|
||||
Samples the posterior GP at the points X.
|
||||
|
||||
:param X: The points at which to take the samples.
|
||||
:type X: np.ndarray (Nnew x self.input_dim)
|
||||
:param size: the number of a posteriori samples.
|
||||
:type size: int.
|
||||
:param full_cov: whether to return the full covariance matrix, or just the diagonal.
|
||||
:type full_cov: bool.
|
||||
:returns: Ysim: set of simulations
|
||||
:rtype: np.ndarray (N x samples)
|
||||
"""
|
||||
m, v = self._raw_predict(X, full_cov=full_cov)
|
||||
if self.normalizer is not None:
|
||||
m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
|
||||
v = v.reshape(m.size,-1) if len(v.shape)==3 else v
|
||||
if not full_cov:
|
||||
Ysim = np.random.multivariate_normal(m.flatten(), np.diag(v.flatten()), size).T
|
||||
else:
|
||||
Ysim = np.random.multivariate_normal(m.flatten(), v, size).T
|
||||
|
||||
return Ysim
|
||||
|
||||
def posterior_samples(self, X, size=10, full_cov=False, Y_metadata=None):
|
||||
"""
|
||||
Samples the posterior GP at the points X.
|
||||
|
||||
:param X: the points at which to take the samples.
|
||||
:type X: np.ndarray (Nnew x self.input_dim.)
|
||||
:param size: the number of a posteriori samples.
|
||||
:type size: int.
|
||||
:param full_cov: whether to return the full covariance matrix, or just the diagonal.
|
||||
:type full_cov: bool.
|
||||
:param noise_model: for mixed noise likelihood, the noise model to use in the samples.
|
||||
:type noise_model: integer.
|
||||
:returns: Ysim: set of simulations, a Numpy array (N x samples).
|
||||
"""
|
||||
Ysim = self.posterior_samples_f(X, size, full_cov=full_cov)
|
||||
Ysim = self.likelihood.samples(Ysim, Y_metadata)
|
||||
|
||||
return Ysim
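And a short sketch of the two sampling routines above, again continuing the illustrative model m:

    F = m.posterior_samples_f(Xtest, size=3)     # latent-function samples, shape (50, 3)
    Ysamp = m.posterior_samples(Xtest, size=3)   # samples pushed through the likelihood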
|
||||
|
||||
def plot_f(self, plot_limits=None, which_data_rows='all',
|
||||
which_data_ycols='all', fixed_inputs=[],
|
||||
levels=20, samples=0, fignum=None, ax=None, resolution=None,
|
||||
plot_raw=True,
|
||||
linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx'):
|
||||
"""
|
||||
Plot the GP's view of the world, where the data is normalized and before applying a likelihood.
|
||||
This is a call to plot with plot_raw=True.
|
||||
Data will not be plotted in this, as the GP's view of the world
|
||||
may live in another space, or in different units, than the data.
|
||||
|
||||
Can plot only part of the data and part of the posterior functions
|
||||
using which_data_rows, which_data_ycols.
|
||||
|
||||
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
|
||||
:type plot_limits: np.array
|
||||
:param which_data_rows: which of the training data to plot (default all)
|
||||
:type which_data_rows: 'all' or a slice object to slice model.X, model.Y
|
||||
:param which_data_ycols: when the data has several columns (independent outputs), only plot these
|
||||
:type which_data_ycols: 'all' or a list of integers
|
||||
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
|
||||
:type fixed_inputs: a list of tuples
|
||||
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
|
||||
:type resolution: int
|
||||
:param levels: number of levels to plot in a contour plot.
|
||||
:param levels: for 2D plotting, the number of contour levels to use; if ax is None, a new figure is created
|
||||
:type levels: int
|
||||
:param samples: the number of a posteriori samples to plot
|
||||
:type samples: int
|
||||
:param fignum: figure to plot on.
|
||||
:type fignum: figure number
|
||||
:param ax: axes to plot on.
|
||||
:type ax: axes handle
|
||||
:param linecol: color of line to plot [Tango.colorsHex['darkBlue']]
|
||||
:type linecol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib
|
||||
:param fillcol: color of fill [Tango.colorsHex['lightBlue']]
|
||||
:type fillcol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib
|
||||
:param Y_metadata: additional data associated with Y which may be needed
|
||||
:type Y_metadata: dict
|
||||
:param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx')
|
||||
:type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib.
|
||||
"""
|
||||
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
|
||||
from ..plotting.matplot_dep import models_plots
|
||||
kw = {}
|
||||
if linecol is not None:
|
||||
kw['linecol'] = linecol
|
||||
if fillcol is not None:
|
||||
kw['fillcol'] = fillcol
|
||||
return models_plots.plot_fit(self, plot_limits, which_data_rows,
|
||||
which_data_ycols, fixed_inputs,
|
||||
levels, samples, fignum, ax, resolution,
|
||||
plot_raw=plot_raw, Y_metadata=Y_metadata,
|
||||
data_symbol=data_symbol, **kw)
|
||||
|
||||
def plot(self, plot_limits=None, which_data_rows='all',
|
||||
which_data_ycols='all', fixed_inputs=[],
|
||||
levels=20, samples=0, fignum=None, ax=None, resolution=None,
|
||||
plot_raw=False,
|
||||
linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx'):
|
||||
"""
|
||||
Plot the posterior of the GP.
|
||||
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
|
||||
- In two dimensions, a contour-plot shows the mean predicted function
|
||||
- In higher dimensions, use fixed_inputs to plot the GP with some of the inputs fixed.
|
||||
|
||||
Can plot only part of the data and part of the posterior functions
|
||||
using which_data_rows, which_data_ycols.
|
||||
|
||||
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
|
||||
:type plot_limits: np.array
|
||||
:param which_data_rows: which of the training data to plot (default all)
|
||||
:type which_data_rows: 'all' or a slice object to slice model.X, model.Y
|
||||
:param which_data_ycols: when the data has several columns (independent outputs), only plot these
|
||||
:type which_data_ycols: 'all' or a list of integers
|
||||
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
|
||||
:type fixed_inputs: a list of tuples
|
||||
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
|
||||
:type resolution: int
|
||||
:param levels: number of levels to plot in a contour plot.
|
||||
:param levels: for 2D plotting, the number of contour levels to use; if ax is None, a new figure is created
|
||||
:type levels: int
|
||||
:param samples: the number of a posteriori samples to plot
|
||||
:type samples: int
|
||||
:param fignum: figure to plot on.
|
||||
:type fignum: figure number
|
||||
:param ax: axes to plot on.
|
||||
:type ax: axes handle
|
||||
:param linecol: color of line to plot [Tango.colorsHex['darkBlue']]
|
||||
:type linecol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib
|
||||
:param fillcol: color of fill [Tango.colorsHex['lightBlue']]
|
||||
:type fillcol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib
|
||||
:param Y_metadata: additional data associated with Y which may be needed
|
||||
:type Y_metadata: dict
|
||||
:param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx')
|
||||
:type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib.
|
||||
"""
|
||||
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
|
||||
from ..plotting.matplot_dep import models_plots
|
||||
kw = {}
|
||||
if linecol is not None:
|
||||
kw['linecol'] = linecol
|
||||
if fillcol is not None:
|
||||
kw['fillcol'] = fillcol
|
||||
return models_plots.plot_fit(self, plot_limits, which_data_rows,
|
||||
which_data_ycols, fixed_inputs,
|
||||
levels, samples, fignum, ax, resolution,
|
||||
plot_raw=plot_raw, Y_metadata=Y_metadata,
|
||||
data_symbol=data_symbol, **kw)
|
||||
|
||||
def input_sensitivity(self, summarize=True):
|
||||
"""
|
||||
Returns the sensitivity for each dimension of this model
|
||||
"""
|
||||
return self.kern.input_sensitivity(summarize=summarize)
|
||||
|
||||
def optimize(self, optimizer=None, start=None, **kwargs):
|
||||
"""
|
||||
Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
|
||||
kwargs are passed to the optimizer. They can be:
|
||||
|
||||
:param max_f_eval: maximum number of function evaluations
|
||||
:type max_f_eval: int
|
||||
:param messages: whether to display messages during optimisation
|
||||
:type messages: bool
|
||||
:param optimizer: which optimizer to use (defaults to self.preferred optimizer), a range of optimisers can be found in :module:`~GPy.inference.optimization`, they include 'scg', 'lbfgs', 'tnc'.
|
||||
:type optimizer: string
|
||||
"""
|
||||
self.inference_method.on_optimization_start()
|
||||
try:
|
||||
super(GP, self).optimize(optimizer, start, **kwargs)
|
||||
except KeyboardInterrupt:
|
||||
print "KeyboardInterrupt caught, calling on_optimization_end() to round things up"
|
||||
self.inference_method.on_optimization_end()
|
||||
raise
|
||||
|
||||
def infer_newX(self, Y_new, optimize=True, ):
|
||||
"""
|
||||
Infer the distribution of X for the new observed data *Y_new*.
|
||||
|
||||
:param Y_new: the new observed data for inference
|
||||
:type Y_new: numpy.ndarray
|
||||
:param optimize: whether to optimize the location of new X (True by default)
|
||||
:type optimize: boolean
|
||||
:return: a tuple containing the posterior estimation of X and the model that optimize X
|
||||
:rtype: (:class:`~GPy.core.parameterization.variational.VariationalPosterior` or numpy.ndarray, :class:`~GPy.core.model.Model`)
|
||||
"""
|
||||
from ..inference.latent_function_inference.inferenceX import infer_newX
|
||||
return infer_newX(self, Y_new, optimize=optimize)
|
||||
|
GPy/core/gp_base.py
@@ -1,276 +0,0 @@
|
|||
import numpy as np
|
||||
from .. import kern
|
||||
from ..util.plot import gpplot, Tango, x_frame1D, x_frame2D
|
||||
import pylab as pb
|
||||
from GPy.core.model import Model
|
||||
import warnings
|
||||
from ..likelihoods import Gaussian, Gaussian_Mixed_Noise
|
||||
|
||||
class GPBase(Model):
|
||||
"""
|
||||
Gaussian process base model for holding shared behaviour between
|
||||
sparse_GP and GP models, and potentially other models in the future.
|
||||
|
||||
Here we define some functions that are used by these models.
|
||||
"""
|
||||
def __init__(self, X, likelihood, kernel, normalize_X=False):
|
||||
if len(X.shape)==1:
|
||||
X = X.reshape(-1,1)
|
||||
warnings.warn("One dimension output (N,) being reshaped to (N,1)")
|
||||
self.X = X
|
||||
assert len(self.X.shape) == 2, "too many dimensions for X input"
|
||||
self.num_data, self.input_dim = self.X.shape
|
||||
assert isinstance(kernel, kern.kern)
|
||||
self.kern = kernel
|
||||
self.likelihood = likelihood
|
||||
assert self.X.shape[0] == self.likelihood.data.shape[0]
|
||||
self.num_data, self.output_dim = self.likelihood.data.shape
|
||||
|
||||
if normalize_X:
|
||||
self._Xoffset = X.mean(0)[None, :]
|
||||
self._Xscale = X.std(0)[None, :]
|
||||
self._Xscale[np.where(self._Xscale==0)] = 1
|
||||
self.X = (X.copy() - self._Xoffset) / self._Xscale
|
||||
else:
|
||||
self._Xoffset = np.zeros((1, self.input_dim))
|
||||
self._Xscale = np.ones((1, self.input_dim))
|
||||
|
||||
super(GPBase, self).__init__()
|
||||
# Model.__init__(self)
|
||||
# All leaf nodes should call self._set_params(self._get_params()) at
|
||||
# the end
|
||||
|
||||
|
||||
def posterior_samples_f(self,X,size=10,which_parts='all'):
|
||||
"""
|
||||
Samples the posterior GP at the points X.
|
||||
|
||||
:param X: The points at which to take the samples.
|
||||
:type X: np.ndarray, Nnew x self.input_dim.
|
||||
:param size: the number of a posteriori samples to plot.
|
||||
:type size: int.
|
||||
:param which_parts: which of the kernel functions to plot (additively).
|
||||
:type which_parts: 'all', or list of bools.
|
||||
:param full_cov: whether to return the full covariance matrix, or just the diagonal.
|
||||
:type full_cov: bool.
|
||||
:returns: Ysim: set of simulations, a Numpy array (N x samples).
|
||||
"""
|
||||
m, v = self._raw_predict(X, which_parts=which_parts, full_cov=True)
|
||||
v = v.reshape(m.size,-1) if len(v.shape)==3 else v
|
||||
Ysim = np.random.multivariate_normal(m.flatten(), v, size).T
|
||||
|
||||
return Ysim
|
||||
|
||||
def posterior_samples(self,X,size=10,which_parts='all',noise_model=None):
|
||||
"""
|
||||
Samples the posterior GP at the points X.
|
||||
|
||||
:param X: the points at which to take the samples.
|
||||
:type X: np.ndarray, Nnew x self.input_dim.
|
||||
:param size: the number of a posteriori samples to plot.
|
||||
:type size: int.
|
||||
:param which_parts: which of the kernel functions to plot (additively).
|
||||
:type which_parts: 'all', or list of bools.
|
||||
:param full_cov: whether to return the full covariance matrix, or just the diagonal.
|
||||
:type full_cov: bool.
|
||||
:param noise_model: for mixed noise likelihood, the noise model to use in the samples.
|
||||
:type noise_model: integer.
|
||||
:returns: Ysim: set of simulations, a Numpy array (N x samples).
|
||||
"""
|
||||
Ysim = self.posterior_samples_f(X, size, which_parts=which_parts)
|
||||
if isinstance(self.likelihood,Gaussian):
|
||||
noise_std = np.sqrt(self.likelihood._get_params())
|
||||
Ysim += np.random.normal(0,noise_std,Ysim.shape)
|
||||
elif isinstance(self.likelihood,Gaussian_Mixed_Noise):
|
||||
assert noise_model is not None, "A noise model must be specified."
|
||||
noise_std = np.sqrt(self.likelihood._get_params()[noise_model])
|
||||
Ysim += np.random.normal(0,noise_std,Ysim.shape)
|
||||
else:
|
||||
Ysim = self.likelihood.noise_model.samples(Ysim)
|
||||
|
||||
return Ysim
|
||||
|
||||
def plot_f(self, *args, **kwargs):
|
||||
"""
|
||||
Plot the GP's view of the world, where the data is normalized and before applying a likelihood.
|
||||
|
||||
This is a convenience function: we simply call self.plot with the
|
||||
argument use_raw_predict set True. All args and kwargs are passed on to
|
||||
plot.
|
||||
|
||||
see also: gp_base.plot
|
||||
"""
|
||||
kwargs['plot_raw'] = True
|
||||
self.plot(*args, **kwargs)
|
||||
|
||||
def plot(self, plot_limits=None, which_data_rows='all',
|
||||
which_data_ycols='all', which_parts='all', fixed_inputs=[],
|
||||
levels=20, samples=0, fignum=None, ax=None, resolution=None,
|
||||
plot_raw=False,
|
||||
linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
|
||||
"""
|
||||
Plot the posterior of the GP.
|
||||
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
|
||||
- In two dimensions, a contour-plot shows the mean predicted function
|
||||
- In higher dimensions, use fixed_inputs to plot the GP with some of the inputs fixed.
|
||||
|
||||
Can plot only part of the data and part of the posterior functions
|
||||
using which_data_rows, which_data_ycols and which_parts
|
||||
|
||||
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
|
||||
:type plot_limits: np.array
|
||||
:param which_data_rows: which of the training data to plot (default all)
|
||||
:type which_data_rows: 'all' or a slice object to slice self.X, self.Y
|
||||
:param which_data_ycols: when the data has several columns (independent outputs), only plot these
|
||||
:type which_data_rows: 'all' or a list of integers
|
||||
:param which_parts: which of the kernel functions to plot (additively)
|
||||
:type which_parts: 'all', or list of bools
|
||||
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
|
||||
:type fixed_inputs: a list of tuples
|
||||
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
|
||||
:type resolution: int
|
||||
:param levels: number of levels to plot in a contour plot.
|
||||
:type levels: int
|
||||
:param samples: the number of a posteriori samples to plot
|
||||
:type samples: int
|
||||
:param fignum: figure to plot on.
|
||||
:type fignum: figure number
|
||||
:param ax: axes to plot on.
|
||||
:type ax: axes handle
|
||||
:type output: integer (first output is 0)
|
||||
:param linecol: color of line to plot.
|
||||
:type linecol:
|
||||
:param fillcol: color of fill
|
||||
:param levels: for 2D plotting, the number of contour levels to use; if ax is None, a new figure is created
|
||||
"""
|
||||
#deal with optional arguments
|
||||
if which_data_rows == 'all':
|
||||
which_data_rows = slice(None)
|
||||
if which_data_ycols == 'all':
|
||||
which_data_ycols = np.arange(self.output_dim)
|
||||
if len(which_data_ycols)==0:
|
||||
raise ValueError('No data selected for plotting')
|
||||
if ax is None:
|
||||
fig = pb.figure(num=fignum)
|
||||
ax = fig.add_subplot(111)
|
||||
|
||||
#work out what the inputs are for plotting (1D or 2D)
|
||||
fixed_dims = np.array([i for i,v in fixed_inputs])
|
||||
free_dims = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
|
||||
|
||||
#one dimensional plotting
|
||||
if len(free_dims) == 1:
|
||||
|
||||
#define the frame on which to plot
|
||||
resolution = resolution or 200
|
||||
Xu = self.X * self._Xscale + self._Xoffset #NOTE self.X are the normalized values now
|
||||
Xnew, xmin, xmax = x_frame1D(Xu[:,free_dims], plot_limits=plot_limits)
|
||||
Xgrid = np.empty((Xnew.shape[0],self.input_dim))
|
||||
Xgrid[:,free_dims] = Xnew
|
||||
for i,v in fixed_inputs:
|
||||
Xgrid[:,i] = v
|
||||
|
||||
#make a prediction on the frame and plot it
|
||||
if plot_raw:
|
||||
m, v = self._raw_predict(Xgrid, which_parts=which_parts)
|
||||
lower = m - 2*np.sqrt(v)
|
||||
upper = m + 2*np.sqrt(v)
|
||||
Y = self.likelihood.Y
|
||||
else:
|
||||
m, v, lower, upper = self.predict(Xgrid, which_parts=which_parts, sampling=False) #Compute the exact mean
|
||||
m_, v_, lower, upper = self.predict(Xgrid, which_parts=which_parts, sampling=True, num_samples=15000) #Approximate the percentiles
|
||||
Y = self.likelihood.data
|
||||
for d in which_data_ycols:
|
||||
gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
|
||||
ax.plot(Xu[which_data_rows,free_dims], Y[which_data_rows, d], 'kx', mew=1.5)
|
||||
|
||||
#optionally plot some samples
|
||||
if samples: #NOTE not tested with fixed_inputs
|
||||
Ysim = self.posterior_samples(Xgrid, samples, which_parts=which_parts)
|
||||
for yi in Ysim.T:
|
||||
ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
|
||||
#ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
|
||||
|
||||
#set the limits of the plot to some sensible values
|
||||
ymin, ymax = min(np.append(Y[which_data_rows, which_data_ycols].flatten(), lower)), max(np.append(Y[which_data_rows, which_data_ycols].flatten(), upper))
|
||||
ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
|
||||
ax.set_xlim(xmin, xmax)
|
||||
ax.set_ylim(ymin, ymax)
|
||||
|
||||
#2D plotting
|
||||
elif len(free_dims) == 2:
|
||||
|
||||
#define the frame for plotting on
|
||||
resolution = resolution or 50
|
||||
Xu = self.X * self._Xscale + self._Xoffset #NOTE self.X are the normalized values now
|
||||
Xnew, _, _, xmin, xmax = x_frame2D(Xu[:,free_dims], plot_limits, resolution)
|
||||
Xgrid = np.empty((Xnew.shape[0],self.input_dim))
|
||||
Xgrid[:,free_dims] = Xnew
|
||||
for i,v in fixed_inputs:
|
||||
Xgrid[:,i] = v
|
||||
x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
|
||||
|
||||
#predict on the frame and plot
|
||||
if plot_raw:
|
||||
m, _ = self._raw_predict(Xgrid, which_parts=which_parts)
|
||||
Y = self.likelihood.Y
|
||||
else:
|
||||
m, _, _, _ = self.predict(Xgrid, which_parts=which_parts,sampling=False)
|
||||
Y = self.likelihood.data
|
||||
for d in which_data_ycols:
|
||||
m_d = m[:,d].reshape(resolution, resolution).T
|
||||
contour = ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
|
||||
scatter = ax.scatter(self.X[which_data_rows, free_dims[0]], self.X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
|
||||
|
||||
#set the limits of the plot to some sensible values
|
||||
ax.set_xlim(xmin[0], xmax[0])
|
||||
ax.set_ylim(xmin[1], xmax[1])
|
||||
|
||||
if samples:
|
||||
warnings.warn("Samples are rather difficult to plot for 2D inputs...")
|
||||
return contour, scatter
|
||||
else:
|
||||
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
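For orientation, a minimal usage sketch of the plotting interface documented above; the data, the model construction and the keyword values are placeholders, not part of this diff.

# Hedged sketch: a two-input regression model plotted as a 1D slice and a 2D contour.
import numpy as np
import GPy
X = np.random.rand(50, 2)
Y = np.sin(6 * X[:, :1]) + 0.05 * np.random.randn(50, 1)
m = GPy.models.GPRegression(X, Y)     # default kernel; constructor details assumed
m.optimize()
m.plot(fixed_inputs=[(1, 0.5)], resolution=400)   # 1D slice with input 1 pinned to 0.5
m.plot(levels=30)                                  # full 2D contour plot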
|
||||
|
||||
def getstate(self):
|
||||
"""
|
||||
Get the current state of the class. This is only used to efficiently
|
||||
pickle the model. See also self.setstate
|
||||
"""
|
||||
return Model.getstate(self) + [self.X,
|
||||
self.num_data,
|
||||
self.input_dim,
|
||||
self.kern,
|
||||
self.likelihood,
|
||||
self.output_dim,
|
||||
self._Xoffset,
|
||||
self._Xscale]
|
||||
|
||||
def setstate(self, state):
|
||||
"""
|
||||
Set the state of the model. Used for efficient pickling
|
||||
"""
|
||||
self._Xscale = state.pop()
|
||||
self._Xoffset = state.pop()
|
||||
self.output_dim = state.pop()
|
||||
self.likelihood = state.pop()
|
||||
self.kern = state.pop()
|
||||
self.input_dim = state.pop()
|
||||
self.num_data = state.pop()
|
||||
self.X = state.pop()
|
||||
Model.setstate(self, state)
|
||||
|
||||
def log_predictive_density(self, x_test, y_test):
|
||||
"""
|
||||
Calculation of the log predictive density
|
||||
|
||||
.. math::
|
||||
    p(y_{*}|D) = \int p(y_{*}|f_{*}) p(f_{*}|\mu_{*},\sigma^{2}_{*}) df_{*}
|
||||
|
||||
:param x_test: test observations (x_{*})
|
||||
:type x_test: (Nx1) array
|
||||
:param y_test: test observations (y_{*})
|
||||
:type y_test: (Nx1) array
|
||||
"""
|
||||
mu_star, var_star = self._raw_predict(x_test)
|
||||
return self.likelihood.log_predictive_density(y_test, mu_star, var_star)
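A hedged usage sketch of the method above; the fitted single-input model `m` and the test data are placeholders.

# Sketch: score held-out observations with the log predictive density defined above.
import numpy as np
x_test = np.linspace(0., 1., 20)[:, None]
y_test = np.sin(6. * x_test)
lpd = m.log_predictive_density(x_test, y_test)   # per-point log densities (shape set by the likelihood)
print lpd.mean()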
|
||||
|
|
@ -1,24 +1,19 @@
|
|||
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||
# Copyright (c) 2013,2014, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from ..util.plot import Tango, x_frame1D, x_frame2D
|
||||
from parameterized import Parameterized
|
||||
import sys
|
||||
from parameterization import Parameterized
|
||||
import numpy as np
|
||||
import pylab as pb
|
||||
|
||||
class Mapping(Parameterized):
|
||||
"""
|
||||
Base model for shared behavior between models that can act like a mapping.
|
||||
"""
|
||||
|
||||
def __init__(self, input_dim, output_dim):
|
||||
def __init__(self, input_dim, output_dim, name='mapping'):
|
||||
self.input_dim = input_dim
|
||||
self.output_dim = output_dim
|
||||
|
||||
super(Mapping, self).__init__()
|
||||
# Model.__init__(self)
|
||||
# All leaf nodes should call self._set_params(self._get_params()) at
|
||||
# the end
|
||||
super(Mapping, self).__init__(name=name)
|
||||
|
||||
def f(self, X):
|
||||
raise NotImplementedError
|
||||
|
|
@ -35,7 +30,8 @@ class Mapping(Parameterized):
|
|||
raise NotImplementedError
|
||||
|
||||
def df_dtheta(self, dL_df, X):
|
||||
"""The gradient of the outputs of the multi-layer perceptron with respect to each of the parameters.
|
||||
"""The gradient of the outputs of the mapping with respect to each of the parameters.
|
||||
|
||||
:param dL_df: gradient of the objective with respect to the function.
|
||||
:type dL_df: ndarray (num_data x output_dim)
|
||||
:param X: input locations where the function is evaluated.
|
||||
|
|
@ -43,85 +39,42 @@ class Mapping(Parameterized):
|
|||
:returns: Matrix containing gradients with respect to parameters of each output for each input data.
|
||||
:rtype: ndarray (num_params length)
|
||||
"""
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue']):
|
||||
def plot(self, *args):
|
||||
"""
|
||||
|
||||
Plot the mapping.
|
||||
|
||||
Plots the mapping associated with the model.
|
||||
- In one dimension, the function is plotted.
|
||||
- In two dimsensions, a contour-plot shows the function
|
||||
- In two dimensions, a contour-plot shows the function
|
||||
- In higher dimensions, we've not implemented this yet !TODO!
|
||||
|
||||
Can plot only part of the data and part of the posterior functions
|
||||
using which_data and which_functions
|
||||
|
||||
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
|
||||
:type plot_limits: np.array
|
||||
:param which_data: which of the training data to plot (default all)
|
||||
:type which_data: 'all' or a slice object to slice self.X, self.Y
|
||||
:param which_parts: which of the kernel functions to plot (additively)
|
||||
:type which_parts: 'all', or list of bools
|
||||
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
|
||||
:type resolution: int
|
||||
:param levels: number of levels to plot in a contour plot.
|
||||
:type levels: int
|
||||
:param samples: the number of a posteriori samples to plot
|
||||
:type samples: int
|
||||
:param fignum: figure to plot on.
|
||||
:type fignum: figure number
|
||||
:param ax: axes to plot on.
|
||||
:type ax: axes handle
|
||||
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
|
||||
:type fixed_inputs: a list of tuples
|
||||
:param linecol: color of line to plot.
|
||||
:type linecol:
|
||||
:param levels: for 2D plotting, the number of contour levels to use (if ax is None, a new figure is created)
|
||||
|
||||
This is a convenience function: arguments are passed to
|
||||
GPy.plotting.matplot_dep.models_plots.plot_mapping
|
||||
"""
|
||||
# TODO include samples
|
||||
if which_data == 'all':
|
||||
which_data = slice(None)
|
||||
|
||||
if ax is None:
|
||||
fig = pb.figure(num=fignum)
|
||||
ax = fig.add_subplot(111)
|
||||
|
||||
plotdims = self.input_dim - len(fixed_inputs)
|
||||
|
||||
if plotdims == 1:
|
||||
|
||||
Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
|
||||
|
||||
fixed_dims = np.array([i for i,v in fixed_inputs])
|
||||
freedim = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
|
||||
|
||||
Xnew, xmin, xmax = x_frame1D(Xu[:,freedim], plot_limits=plot_limits)
|
||||
Xgrid = np.empty((Xnew.shape[0],self.input_dim))
|
||||
Xgrid[:,freedim] = Xnew
|
||||
for i,v in fixed_inputs:
|
||||
Xgrid[:,i] = v
|
||||
|
||||
f = self.predict(Xgrid, which_parts=which_parts)
|
||||
for d in range(f.shape[1]):
|
||||
ax.plot(Xnew, f[:, d], edgecol=linecol)
|
||||
|
||||
elif self.X.shape[1] == 2:
|
||||
resolution = resolution or 50
|
||||
Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
|
||||
x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
|
||||
f = self.predict(Xnew, which_parts=which_parts)
|
||||
f = f.reshape(resolution, resolution).T
ax.contour(x, y, f, levels, vmin=f.min(), vmax=f.max(), cmap=pb.cm.jet) # @UndefinedVariable
|
||||
ax.set_xlim(xmin[0], xmax[0])
|
||||
ax.set_ylim(xmin[1], xmax[1])
|
||||
|
||||
if "matplotlib" in sys.modules:
|
||||
from ..plotting.matplot_dep import models_plots
|
||||
models_plots.plot_mapping(self, *args)
|
||||
else:
|
||||
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
|
||||
raise NameError, "matplotlib package has not been imported."
|
||||
|
||||
from GPy.core.model import Model
|
||||
class Bijective_mapping(Mapping):
|
||||
"""
|
||||
This is a mapping that is bijective, i.e. you can go from X to f and
|
||||
also back from f to X. The inverse mapping is called g().
|
||||
"""
|
||||
def __init__(self, input_dim, output_dim, name='bijective_mapping'):
|
||||
super(Bijective_mapping, self).__init__(input_dim, output_dim, name=name)
|
||||
|
||||
def g(self, f):
|
||||
"""Inverse mapping from output domain of the function to the inputs."""
|
||||
raise NotImplementedError
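A minimal sketch of a concrete Mapping subclass under the refactored base class above; the linear map is purely illustrative and not part of GPy.

# Illustrative subclass (assumption: only f() is needed by the caller).
import numpy as np
class LinearMap(Mapping):
    def __init__(self, input_dim, output_dim):
        super(LinearMap, self).__init__(input_dim, output_dim, name='linear_map')
        self.W = np.random.randn(input_dim, output_dim)   # fixed, unparameterized weights
    def f(self, X):
        return np.dot(X, self.W)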
|
||||
|
||||
from model import Model
|
||||
|
||||
class Mapping_check_model(Model):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -1,182 +1,35 @@
|
|||
# Copyright (c) 2012, 2013, GPy authors (see AUTHORS.txt).
|
||||
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
from .. import likelihoods
|
||||
from ..inference import optimization
|
||||
from ..util.linalg import jitchol
|
||||
from GPy.util.misc import opt_wrapper
|
||||
from parameterized import Parameterized
|
||||
from ..util.misc import opt_wrapper
|
||||
from parameterization import Parameterized
|
||||
import multiprocessing as mp
|
||||
import numpy as np
|
||||
from GPy.core.domains import POSITIVE, REAL
|
||||
from numpy.linalg.linalg import LinAlgError
|
||||
import itertools
|
||||
# import numdifftools as ndt
|
||||
|
||||
class Model(Parameterized):
|
||||
_fail_count = 0 # Count of failed optimization steps (see objective)
|
||||
_allowed_failures = 10 # number of allowed failures
|
||||
def __init__(self):
|
||||
Parameterized.__init__(self)
|
||||
self.priors = None
|
||||
|
||||
def __init__(self, name):
|
||||
super(Model, self).__init__(name) # Parameterized.__init__(self)
|
||||
self.optimization_runs = []
|
||||
self.sampling_runs = []
|
||||
self.preferred_optimizer = 'scg'
|
||||
# self._set_params(self._get_params()) has been taken out as it should only be called on leaf nodes
|
||||
self.preferred_optimizer = 'bfgs'
|
||||
from .parameterization.ties_and_remappings import Tie
|
||||
self.tie = Tie()
|
||||
self.link_parameter(self.tie, -1)
|
||||
self.add_observer(self.tie, self.tie._parameters_changed_notification, priority=-500)
|
||||
|
||||
def log_likelihood(self):
|
||||
raise NotImplementedError, "this needs to be implemented to use the model class"
|
||||
def _log_likelihood_gradients(self):
|
||||
raise NotImplementedError, "this needs to be implemented to use the model class"
|
||||
|
||||
def getstate(self):
|
||||
"""
|
||||
Get the current state of the class.
|
||||
Inherited from Parameterized, so add those parameters to the state
|
||||
|
||||
:return: list of states from the model.
|
||||
|
||||
"""
|
||||
return Parameterized.getstate(self) + \
|
||||
[self.priors, self.optimization_runs,
|
||||
self.sampling_runs, self.preferred_optimizer]
|
||||
|
||||
def setstate(self, state):
|
||||
"""
|
||||
set state from previous call to getstate
|
||||
call Parameterized with the rest of the state
|
||||
|
||||
:param state: the state of the model.
|
||||
:type state: list as returned from getstate.
|
||||
|
||||
"""
|
||||
self.preferred_optimizer = state.pop()
|
||||
self.sampling_runs = state.pop()
|
||||
self.optimization_runs = state.pop()
|
||||
self.priors = state.pop()
|
||||
Parameterized.setstate(self, state)
|
||||
|
||||
def set_prior(self, regexp, what):
|
||||
"""
|
||||
|
||||
Sets priors on the model parameters.
|
||||
|
||||
**Notes**
|
||||
|
||||
Asserts that the prior is suitable for the constraint. If the
|
||||
wrong constraint is in place, an error is raised. If no
|
||||
constraint is in place, one is added (warning printed).
|
||||
|
||||
For tied parameters, the prior will only be "counted" once, thus
|
||||
a prior object is only inserted on the first tied index
|
||||
|
||||
:param regexp: regular expression of parameters on which priors need to be set.
|
||||
:type param: string, regexp, or integer array
|
||||
:param what: prior to set on parameter.
|
||||
:type what: GPy.core.Prior type
|
||||
|
||||
"""
|
||||
if self.priors is None:
|
||||
self.priors = [None for i in range(self._get_params().size)]
|
||||
|
||||
which = self.grep_param_names(regexp)
|
||||
|
||||
# check tied situation
|
||||
tie_partial_matches = [tie for tie in self.tied_indices if (not set(tie).isdisjoint(set(which))) & (not set(tie) == set(which))]
|
||||
if len(tie_partial_matches):
|
||||
raise ValueError, "cannot place prior across partial ties"
|
||||
tie_matches = [tie for tie in self.tied_indices if set(which) == set(tie) ]
|
||||
if len(tie_matches) > 1:
|
||||
raise ValueError, "cannot place prior across multiple ties"
|
||||
elif len(tie_matches) == 1:
|
||||
which = which[:1] # just place a prior object on the first parameter
|
||||
|
||||
|
||||
# check constraints are okay
|
||||
|
||||
if what.domain is POSITIVE:
|
||||
constrained_positive_indices = [i for i, t in zip(self.constrained_indices, self.constraints) if t.domain is POSITIVE]
|
||||
if len(constrained_positive_indices):
|
||||
constrained_positive_indices = np.hstack(constrained_positive_indices)
|
||||
else:
|
||||
constrained_positive_indices = np.zeros(shape=(0,))
|
||||
bad_constraints = np.setdiff1d(self.all_constrained_indices(), constrained_positive_indices)
|
||||
assert not np.any(which[:, None] == bad_constraints), "constraint and prior incompatible"
|
||||
unconst = np.setdiff1d(which, constrained_positive_indices)
|
||||
if len(unconst):
|
||||
print "Warning: constraining parameters to be positive:"
|
||||
print '\n'.join([n for i, n in enumerate(self._get_param_names()) if i in unconst])
|
||||
print '\n'
|
||||
self.constrain_positive(unconst)
|
||||
elif what.domain is REAL:
|
||||
assert not np.any(which[:, None] == self.all_constrained_indices()), "constraint and prior incompatible"
|
||||
else:
|
||||
raise ValueError, "prior not recognised"
|
||||
|
||||
# store the prior in a local list
|
||||
for w in which:
|
||||
self.priors[w] = what
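A short, hedged usage sketch of the prior interface documented above; the Gamma prior constructor and the parameter regular expression are assumptions, not taken from this diff.

# Sketch: place a Gamma prior on every parameter whose name contains 'variance'.
m.set_prior('.*variance', GPy.priors.Gamma(1., 0.1))   # prior class/signature assumed
print m   # the printed table gains a 'prior' column, see __str__ below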
|
||||
|
||||
def get_gradient(self, name, return_names=False):
|
||||
"""
|
||||
Get model gradient(s) by name. The name is applied as a regular expression and all parameters that match that regular expression are returned.
|
||||
|
||||
:param name: the name of parameters required (as a regular expression).
|
||||
:type name: regular expression
|
||||
:param return_names: whether or not to return the names matched (default False)
|
||||
:type return_names: bool
|
||||
"""
|
||||
matches = self.grep_param_names(name)
|
||||
if len(matches):
|
||||
if return_names:
|
||||
return self._log_likelihood_gradients()[matches], np.asarray(self._get_param_names())[matches].tolist()
|
||||
else:
|
||||
return self._log_likelihood_gradients()[matches]
|
||||
else:
|
||||
raise AttributeError, "no parameter matches %s" % name
|
||||
|
||||
def log_prior(self):
|
||||
"""evaluate the prior"""
|
||||
if self.priors is not None:
|
||||
return np.sum([p.lnpdf(x) for p, x in zip(self.priors, self._get_params()) if p is not None])
|
||||
else:
|
||||
return 0.
|
||||
|
||||
def _log_prior_gradients(self):
|
||||
"""evaluate the gradients of the priors"""
|
||||
if self.priors is None:
|
||||
return 0.
|
||||
x = self._get_params()
|
||||
ret = np.zeros(x.size)
|
||||
[np.put(ret, i, p.lnpdf_grad(xx)) for i, (p, xx) in enumerate(zip(self.priors, x)) if not p is None]
|
||||
return ret
|
||||
|
||||
def _transform_gradients(self, g):
|
||||
x = self._get_params()
|
||||
for index, constraint in zip(self.constrained_indices, self.constraints):
|
||||
g[index] = g[index] * constraint.gradfactor(x[index])
|
||||
[np.put(g, i, v) for i, v in [(t[0], np.sum(g[t])) for t in self.tied_indices]]
|
||||
if len(self.tied_indices) or len(self.fixed_indices):
|
||||
to_remove = np.hstack((self.fixed_indices + [t[1:] for t in self.tied_indices]))
|
||||
return np.delete(g, to_remove)
|
||||
else:
|
||||
return g
|
||||
|
||||
def randomize(self):
|
||||
"""
|
||||
Randomize the model.
|
||||
Make this draw from the prior if one exists, else draw from N(0,1)
|
||||
"""
|
||||
# first take care of all parameters (from N(0,1))
|
||||
x = self._get_params_transformed()
|
||||
x = np.random.randn(x.size)
|
||||
self._set_params_transformed(x)
|
||||
# now draw from prior where possible
|
||||
x = self._get_params()
|
||||
if self.priors is not None:
|
||||
[np.put(x, i, p.rvs(1)) for i, p in enumerate(self.priors) if not p is None]
|
||||
self._set_params(x)
|
||||
self._set_params_transformed(self._get_params_transformed()) # makes sure all of the tied parameters get the same init (since there's only one prior object...)
|
||||
|
||||
return self.gradient
|
||||
|
||||
def optimize_restarts(self, num_restarts=10, robust=False, verbose=True, parallel=False, num_processes=None, **kwargs):
|
||||
"""
|
||||
|
|
@ -207,10 +60,12 @@ class Model(Parameterized):
|
|||
:param messages: whether to display during optimisation
|
||||
:type messages: bool
|
||||
|
||||
.. note:: If num_processes is None, the number of workes in the multiprocessing pool is automatically set to the number of processors on the current machine.
|
||||
.. note:: If num_processes is None, the number of workers in the
|
||||
multiprocessing pool is automatically set to the number of processors
|
||||
on the current machine.
|
||||
|
||||
"""
|
||||
initial_parameters = self._get_params_transformed()
|
||||
initial_parameters = self.optimizer_array.copy()
|
||||
|
||||
if parallel:
|
||||
try:
|
||||
|
|
@ -246,11 +101,11 @@ class Model(Parameterized):
|
|||
|
||||
if len(self.optimization_runs):
|
||||
i = np.argmin([o.f_opt for o in self.optimization_runs])
|
||||
self._set_params_transformed(self.optimization_runs[i].x_opt)
|
||||
self.optimizer_array = self.optimization_runs[i].x_opt
|
||||
else:
|
||||
self._set_params_transformed(initial_parameters)
|
||||
self.optimizer_array = initial_parameters
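A hedged usage example of optimize_restarts as described above:

# Sketch: several random restarts, keeping the best run; robust=True swallows
# failed restarts instead of raising.
m.optimize_restarts(num_restarts=5, robust=True, parallel=False)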
|
||||
|
||||
def ensure_default_constraints(self):
|
||||
def ensure_default_constraints(self, warning=True):
|
||||
"""
|
||||
Ensure that any variables which should clearly be positive
|
||||
have been constrained somehow. The method performs a regular
|
||||
|
|
@ -258,109 +113,153 @@ class Model(Parameterized):
|
|||
'variance', 'lengthscale', 'precision' and 'kappa'. If any of
|
||||
these terms are present in the name the parameter is
|
||||
constrained positive.
|
||||
"""
|
||||
positive_strings = ['variance', 'lengthscale', 'precision', 'decay', 'kappa']
|
||||
# param_names = self._get_param_names()
|
||||
currently_constrained = self.all_constrained_indices()
|
||||
to_make_positive = []
|
||||
for s in positive_strings:
|
||||
for i in self.grep_param_names(".*" + s):
|
||||
if not (i in currently_constrained):
|
||||
to_make_positive.append(i)
|
||||
if len(to_make_positive):
|
||||
self.constrain_positive(np.asarray(to_make_positive))
|
||||
|
||||
def objective_function(self, x):
|
||||
DEPRECATED.
|
||||
"""
|
||||
raise DeprecationWarning, 'parameters now have default constraints'
|
||||
|
||||
def objective_function(self):
|
||||
"""
|
||||
The objective function for the given algorithm.
|
||||
|
||||
This function is the true objective, which wants to be minimized.
|
||||
Note that all parameters are already set and in place, so you just need
|
||||
to return the objective function here.
|
||||
|
||||
For probabilistic models this is the negative log_likelihood
|
||||
(including the MAP prior), so we return it here. If your model is not
|
||||
probabilistic, just return your objective to minimize here!
|
||||
"""
|
||||
return -float(self.log_likelihood()) - self.log_prior()
|
||||
|
||||
def objective_function_gradients(self):
|
||||
"""
|
||||
The gradients for the objective function for the given algorithm.
|
||||
The gradients are w.r.t. the *negative* objective function, as
|
||||
this framework works with *negative* log-likelihoods as a default.
|
||||
|
||||
You can find the gradient for the parameters in self.gradient at all times.
|
||||
This is the place, where gradients get stored for parameters.
|
||||
|
||||
This function is the true objective, which wants to be minimized.
|
||||
Note that all parameters are already set and in place, so you just need
|
||||
to return the gradient here.
|
||||
|
||||
For probabilistic models this is the gradient of the negative log_likelihood
|
||||
(including the MAP prior), so we return it here. If your model is not
|
||||
probabilistic, just return your *negative* gradient here!
|
||||
"""
|
||||
return -(self._log_likelihood_gradients() + self._log_prior_gradients())
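A hedged illustration of what the two methods above hand to the optimizer; `m` is any model instance (placeholder).

# Sketch: the quantities the optimizer minimizes, as defined above.
obj = m.objective_function()              # == -log_likelihood() - log_prior()
grad = m.objective_function_gradients()   # sign-flipped gradient w.r.t. the parameters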
|
||||
|
||||
def _grads(self, x):
|
||||
"""
|
||||
Gets the gradients from the likelihood and the priors.
|
||||
|
||||
Failures are handled robustly. The algorithm will try several times to
|
||||
return the gradients, and will raise the original exception if
|
||||
the objective cannot be computed.
|
||||
|
||||
:param x: the parameters of the model.
|
||||
:type x: np.array
|
||||
"""
|
||||
try:
|
||||
# self._set_params_transformed(x)
|
||||
self.optimizer_array = x
|
||||
obj_grads = self._transform_gradients(self.objective_function_gradients())
|
||||
self._fail_count = 0
|
||||
except (LinAlgError, ZeroDivisionError, ValueError):
|
||||
if self._fail_count >= self._allowed_failures:
|
||||
raise
|
||||
self._fail_count += 1
|
||||
obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e100, 1e100)
|
||||
return obj_grads
|
||||
|
||||
def _objective(self, x):
|
||||
"""
|
||||
The objective function passed to the optimizer. It combines
|
||||
the likelihood and the priors.
|
||||
|
||||
Failures are handled robustly. The algorithm will try several times to
|
||||
return the objective, and will raise the original exception if it
|
||||
return the objective, and will raise the original exception if
|
||||
the objective cannot be computed.
|
||||
|
||||
:param x: the parameters of the model.
|
||||
:type x: np.array
|
||||
"""
|
||||
try:
|
||||
self._set_params_transformed(x)
|
||||
self.optimizer_array = x
|
||||
obj = self.objective_function()
|
||||
self._fail_count = 0
|
||||
except (LinAlgError, ZeroDivisionError, ValueError) as e:
|
||||
except (LinAlgError, ZeroDivisionError, ValueError):
|
||||
if self._fail_count >= self._allowed_failures:
|
||||
raise e
|
||||
raise
|
||||
self._fail_count += 1
|
||||
return np.inf
|
||||
return -self.log_likelihood() - self.log_prior()
|
||||
return obj
|
||||
|
||||
def objective_function_gradients(self, x):
|
||||
"""
|
||||
Gets the gradients from the likelihood and the priors.
|
||||
|
||||
Failures are handled robustly. The algorithm will try several times to
|
||||
return the gradients, and will raise the original exception if it
|
||||
the objective cannot be computed.
|
||||
|
||||
:param x: the parameters of the model.
|
||||
:type x: np.array
|
||||
"""
|
||||
def _objective_grads(self, x):
|
||||
try:
|
||||
self._set_params_transformed(x)
|
||||
obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
|
||||
self.optimizer_array = x
|
||||
obj_f, obj_grads = self.objective_function(), self._transform_gradients(self.objective_function_gradients())
|
||||
self._fail_count = 0
|
||||
except (LinAlgError, ZeroDivisionError, ValueError) as e:
|
||||
except (LinAlgError, ZeroDivisionError, ValueError):
|
||||
if self._fail_count >= self._allowed_failures:
|
||||
raise e
|
||||
self._fail_count += 1
|
||||
obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100)
|
||||
return obj_grads
|
||||
|
||||
def objective_and_gradients(self, x):
|
||||
"""
|
||||
Compute the objective function of the model and the gradient of the model at the point given by x.
|
||||
|
||||
:param x: the point at which gradients are to be computed.
|
||||
:type np.array:
|
||||
"""
|
||||
|
||||
try:
|
||||
self._set_params_transformed(x)
|
||||
obj_f = -self.log_likelihood() - self.log_prior()
|
||||
self._fail_count = 0
|
||||
obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
|
||||
except (LinAlgError, ZeroDivisionError, ValueError) as e:
|
||||
if self._fail_count >= self._allowed_failures:
|
||||
raise e
|
||||
raise
|
||||
self._fail_count += 1
|
||||
obj_f = np.inf
|
||||
obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100)
|
||||
obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e100, 1e100)
|
||||
return obj_f, obj_grads
|
||||
|
||||
def optimize(self, optimizer=None, start=None, **kwargs):
|
||||
"""
|
||||
Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
|
||||
|
||||
kwargs are passed to the optimizer. They can be:
|
||||
|
||||
:param max_f_eval: maximum number of function evaluations
|
||||
:type max_f_eval: int
|
||||
:messages: whether to display during optimisation
|
||||
:type messages: bool
|
||||
:param optimzer: which optimizer to use (defaults to self.preferred optimizer)
|
||||
:type optimzer: string TODO: valid strings?
|
||||
:param optimizer: which optimizer to use (defaults to self.preferred optimizer)
|
||||
:type optimizer: string
|
||||
|
||||
Valid optimizers are:
|
||||
- 'scg': scaled conjugate gradient method, recommended for stability.
|
||||
See also GPy.inference.optimization.scg
|
||||
- 'fmin_tnc': truncated Newton method (see scipy.optimize.fmin_tnc)
|
||||
- 'simplex': the Nelder-Mead simplex method (see scipy.optimize.fmin),
|
||||
- 'lbfgsb': the l-bfgs-b method (see scipy.optimize.fmin_l_bfgs_b),
|
||||
- 'sgd': stochastic gradient descent (see scipy.optimize.sgd). For experts only!
|
||||
|
||||
|
||||
"""
|
||||
if self.is_fixed:
|
||||
print 'nothing to optimize'
|
||||
if self.size == 0:
|
||||
print 'nothing to optimize'
|
||||
|
||||
if not self.update_model():
|
||||
print "setting updates on again"
|
||||
self.update_model(True)
|
||||
|
||||
if start == None:
|
||||
start = self.optimizer_array
|
||||
|
||||
if optimizer is None:
|
||||
optimizer = self.preferred_optimizer
|
||||
|
||||
if start == None:
|
||||
start = self._get_params_transformed()
|
||||
|
||||
if isinstance(optimizer, optimization.Optimizer):
|
||||
opt = optimizer
|
||||
opt.model = self
|
||||
else:
|
||||
optimizer = optimization.get_optimizer(optimizer)
|
||||
opt = optimizer(start, model=self, **kwargs)
|
||||
|
||||
opt.run(f_fp=self.objective_and_gradients, f=self.objective_function, fp=self.objective_function_gradients)
|
||||
opt.run(f_fp=self._objective_grads, f=self._objective, fp=self._grads)
|
||||
|
||||
self.optimization_runs.append(opt)
|
||||
|
||||
self._set_params_transformed(opt.x_opt)
|
||||
self.optimizer_array = opt.x_opt
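A hedged example of the call documented above, using the optimizer names listed in the docstring:

# Sketch: optimize with an explicit optimizer and progress messages.
m.optimize(optimizer='scg', messages=True, max_f_eval=1000)
print m.log_likelihood()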
|
||||
|
||||
def optimize_SGD(self, momentum=0.1, learning_rate=0.01, iterations=20, **kwargs):
|
||||
# assert self.Y.shape[1] > 1, "SGD only works with D > 1"
|
||||
|
|
@ -368,73 +267,10 @@ class Model(Parameterized):
|
|||
sgd.run()
|
||||
self.optimization_runs.append(sgd)
|
||||
|
||||
def Laplace_covariance(self):
|
||||
"""return the covariance matrix of a Laplace approximation at the current (stationary) point."""
|
||||
# TODO add in the prior contributions for MAP estimation
|
||||
# TODO fix the hessian for tied, constrained and fixed components
|
||||
if hasattr(self, 'log_likelihood_hessian'):
|
||||
A = -self.log_likelihood_hessian()
|
||||
|
||||
else:
|
||||
print "numerically calculating Hessian. please be patient!"
|
||||
x = self._get_params()
|
||||
def f(x):
|
||||
self._set_params(x)
|
||||
return self.log_likelihood()
|
||||
h = ndt.Hessian(f) # @UndefinedVariable
|
||||
A = -h(x)
|
||||
self._set_params(x)
|
||||
# check for almost zero components on the diagonal which screw up the cholesky
|
||||
aa = np.nonzero((np.diag(A) < 1e-6) & (np.diag(A) > 0.))[0]
|
||||
A[aa, aa] = 0.
|
||||
return A
|
||||
|
||||
def Laplace_evidence(self):
|
||||
"""Returns an estiamte of the model evidence based on the Laplace approximation.
|
||||
Uses a numerical estimate of the Hessian if none is available analytically."""
|
||||
A = self.Laplace_covariance()
|
||||
try:
|
||||
hld = np.sum(np.log(np.diag(jitchol(A)[0])))
|
||||
except:
|
||||
return np.nan
|
||||
return 0.5 * self._get_params().size * np.log(2 * np.pi) + self.log_likelihood() - hld
|
||||
|
||||
def __str__(self):
|
||||
s = Parameterized.__str__(self).split('\n')
|
||||
#def __str__(self, names=None):
|
||||
# if names is None:
|
||||
# names = self._get_print_names()
|
||||
#s = Parameterized.__str__(self, names=names).split('\n')
|
||||
# add priors to the string
|
||||
if self.priors is not None:
|
||||
strs = [str(p) if p is not None else '' for p in self.priors]
|
||||
else:
|
||||
strs = [''] * len(self._get_params())
|
||||
# strs = [''] * len(self._get_param_names())
|
||||
# name_indices = self.grep_param_names("|".join(names))
|
||||
# strs = np.array(strs)[name_indices]
|
||||
width = np.array(max([len(p) for p in strs] + [5])) + 4
|
||||
|
||||
log_like = self.log_likelihood()
|
||||
log_prior = self.log_prior()
|
||||
obj_funct = '\nLog-likelihood: {0:.3e}'.format(log_like)
|
||||
if len(''.join(strs)) != 0:
|
||||
obj_funct += ', Log prior: {0:.3e}, LL+prior = {1:.3e}'.format(log_prior, log_like + log_prior)
|
||||
obj_funct += '\n\n'
|
||||
s[0] = obj_funct + s[0]
|
||||
s[0] += "|{h:^{col}}".format(h='prior', col=width)
|
||||
s[1] += '-' * (width + 1)
|
||||
|
||||
for p in range(2, len(strs) + 2):
|
||||
s[p] += '|{prior:^{width}}'.format(prior=strs[p - 2], width=width)
|
||||
|
||||
return '\n'.join(s)
|
||||
|
||||
|
||||
def checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3):
|
||||
def _checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3, df_tolerance=1e-12):
|
||||
"""
|
||||
Check the gradient of the model by comparing to a numerical
|
||||
estimate. If the verbose flag is passed, invividual
|
||||
estimate. If the verbose flag is passed, individual
|
||||
components are tested (and printed)
|
||||
|
||||
:param verbose: If True, print a "full" checking of each parameter
|
||||
|
|
@ -447,37 +283,54 @@ class Model(Parameterized):
|
|||
Note:-
|
||||
The gradient is considered correct if the ratio of the analytical
|
||||
and numerical gradients is within <tolerance> of unity.
|
||||
"""
|
||||
|
||||
x = self._get_params_transformed().copy()
|
||||
The *dF_ratio* indicates the limit of numerical accuracy of numerical gradients.
|
||||
If it is too small, e.g., smaller than 1e-12, the numerical gradients are usually
|
||||
not accurate enough for the tests (shown with blue).
|
||||
"""
|
||||
x = self.optimizer_array.copy()
|
||||
|
||||
if not verbose:
|
||||
# just check the global ratio
|
||||
|
||||
#choose a random direction to find the linear approximation in
|
||||
if x.size==2:
|
||||
dx = step * np.ones(2) # random direction for 2 parameters can fail due to symmetry
|
||||
# make sure only to test the selected parameters
|
||||
if target_param is None:
|
||||
transformed_index = range(len(x))
|
||||
else:
|
||||
dx = step * np.sign(np.random.uniform(-1, 1, x.size))
|
||||
transformed_index = self._raveled_index_for(target_param)
|
||||
if self._has_fixes():
|
||||
indices = np.r_[:self.size]
|
||||
which = (transformed_index[:, None] == indices[self._fixes_][None, :]).nonzero()
|
||||
transformed_index = (indices - (~self._fixes_).cumsum())[transformed_index[which[0]]]
|
||||
|
||||
if transformed_index.size == 0:
|
||||
print "No free parameters to check"
|
||||
return
|
||||
|
||||
# just check the global ratio
|
||||
dx = np.zeros(x.shape)
|
||||
dx[transformed_index] = step * (np.sign(np.random.uniform(-1, 1, transformed_index.size)) if transformed_index.size != 2 else 1.)
|
||||
|
||||
# evaluate around the point x
|
||||
f1, g1 = self.objective_and_gradients(x + dx)
|
||||
f2, g2 = self.objective_and_gradients(x - dx)
|
||||
gradient = self.objective_function_gradients(x)
|
||||
f1 = self._objective(x + dx)
|
||||
f2 = self._objective(x - dx)
|
||||
gradient = self._grads(x)
|
||||
|
||||
numerical_gradient = (f1 - f2) / (2 * dx)
|
||||
global_ratio = (f1 - f2) / (2 * np.dot(dx, np.where(gradient==0, 1e-32, gradient)))
|
||||
dx = dx[transformed_index]
|
||||
gradient = gradient[transformed_index]
|
||||
|
||||
return (np.abs(1. - global_ratio) < tolerance) or (np.abs(gradient - numerical_gradient).mean() < tolerance)
|
||||
denominator = (2 * np.dot(dx, gradient))
|
||||
global_ratio = (f1 - f2) / np.where(denominator == 0., 1e-32, denominator)
|
||||
global_diff = np.abs(f1 - f2) < tolerance and np.allclose(gradient, 0, atol=tolerance)
|
||||
if global_ratio is np.nan:
|
||||
global_ratio = 0
|
||||
return np.abs(1. - global_ratio) < tolerance or global_diff
|
||||
else:
|
||||
# check the gradient of each parameter individually, and do some pretty printing
|
||||
try:
|
||||
names = self._get_param_names_transformed()
|
||||
names = self._get_param_names()
|
||||
except NotImplementedError:
|
||||
names = ['Variable %i' % i for i in range(len(x))]
|
||||
|
||||
# Prepare for pretty-printing
|
||||
header = ['Name', 'Ratio', 'Difference', 'Analytical', 'Numerical']
|
||||
header = ['Name', 'Ratio', 'Difference', 'Analytical', 'Numerical', 'dF_ratio']
|
||||
max_names = max([len(names[i]) for i in range(len(names))] + [len(header[0])])
|
||||
float_len = 10
|
||||
cols = [max_names]
|
||||
|
|
@ -487,115 +340,77 @@ class Model(Parameterized):
|
|||
header_string = map(lambda x: '|'.join(x), [header_string])
|
||||
separator = '-' * len(header_string[0])
|
||||
print '\n'.join([header_string[0], separator])
|
||||
|
||||
if target_param is None:
|
||||
param_list = range(len(x))
|
||||
param_index = range(len(x))
|
||||
transformed_index = param_index
|
||||
else:
|
||||
param_list = self.grep_param_names(target_param, transformed=True, search=True)
|
||||
if not np.any(param_list):
|
||||
param_index = self._raveled_index_for(target_param)
|
||||
if self._has_fixes():
|
||||
indices = np.r_[:self.size]
|
||||
which = (param_index[:, None] == indices[self._fixes_][None, :]).nonzero()
|
||||
param_index = param_index[which[0]]
|
||||
transformed_index = (indices - (~self._fixes_).cumsum())[param_index]
|
||||
# print param_index, transformed_index
|
||||
else:
|
||||
transformed_index = param_index
|
||||
|
||||
if param_index.size == 0:
|
||||
print "No free parameters to check"
|
||||
return
|
||||
|
||||
|
||||
for i in param_list:
|
||||
gradient = self._grads(x).copy()
|
||||
np.where(gradient == 0, 1e-312, gradient)
|
||||
ret = True
|
||||
for nind, xind in itertools.izip(param_index, transformed_index):
|
||||
xx = x.copy()
|
||||
xx[i] += step
|
||||
f1, g1 = self.objective_and_gradients(xx)
|
||||
xx[i] -= 2.*step
|
||||
f2, g2 = self.objective_and_gradients(xx)
|
||||
gradient = self.objective_function_gradients(x)[i]
|
||||
|
||||
xx[xind] += step
|
||||
f1 = self._objective(xx)
|
||||
xx[xind] -= 2.*step
|
||||
f2 = self._objective(xx)
|
||||
df_ratio = np.abs((f1-f2)/min(f1,f2))
|
||||
df_unstable = df_ratio<df_tolerance
|
||||
numerical_gradient = (f1 - f2) / (2 * step)
|
||||
ratio = (f1 - f2) / (2 * step * np.where(gradient==0, 1e-312, gradient))
|
||||
difference = np.abs((f1 - f2) / 2 / step - gradient)
|
||||
if np.all(gradient[xind] == 0): ratio = (f1 - f2) == gradient[xind]
|
||||
else: ratio = (f1 - f2) / (2 * step * gradient[xind])
|
||||
difference = np.abs(numerical_gradient - gradient[xind])
|
||||
|
||||
if (np.abs(1. - ratio) < tolerance) or np.abs(difference) < tolerance:
|
||||
formatted_name = "\033[92m {0} \033[0m".format(names[i])
|
||||
formatted_name = "\033[92m {0} \033[0m".format(names[nind])
|
||||
ret &= True
|
||||
else:
|
||||
formatted_name = "\033[91m {0} \033[0m".format(names[i])
|
||||
formatted_name = "\033[91m {0} \033[0m".format(names[nind])
|
||||
ret &= False
|
||||
if df_unstable:
|
||||
formatted_name = "\033[94m {0} \033[0m".format(names[nind])
|
||||
|
||||
r = '%.6f' % float(ratio)
|
||||
d = '%.6f' % float(difference)
|
||||
g = '%.6f' % gradient
|
||||
g = '%.6f' % gradient[xind]
|
||||
ng = '%.6f' % float(numerical_gradient)
|
||||
grad_string = "{0:^{c0}}|{1:^{c1}}|{2:^{c2}}|{3:^{c3}}|{4:^{c4}}".format(formatted_name, r, d, g, ng, c0=cols[0] + 9, c1=cols[1], c2=cols[2], c3=cols[3], c4=cols[4])
|
||||
df = '%1.e' % float(df_ratio)
|
||||
grad_string = "{0:<{c0}}|{1:^{c1}}|{2:^{c2}}|{3:^{c3}}|{4:^{c4}}|{5:^{c5}}".format(formatted_name, r, d, g, ng, df, c0=cols[0] + 9, c1=cols[1], c2=cols[2], c3=cols[3], c4=cols[4], c5=cols[5])
|
||||
print grad_string
|
||||
|
||||
def input_sensitivity(self):
|
||||
"""
|
||||
return an array describing the sensitivity of the model to each input
|
||||
self.optimizer_array = x
|
||||
return ret
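The check above is usually driven through the public checkgrad wrapper; a hedged sketch, assuming the wrapper keeps these keyword names:

# Sketch: verify analytic gradients against finite differences before optimizing.
# Green rows pass, red rows fail, blue rows are numerically unstable (small dF_ratio).
assert m.checkgrad()                                   # quick global ratio test
m.checkgrad(verbose=True, step=1e-6, tolerance=1e-3)   # per-parameter table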
|
||||
|
||||
NB. Right now, we're basing this on the lengthscales (or
|
||||
variances) of the kernel. TODO: proper sensitivity analysis
|
||||
where we integrate across the model inputs and evaluate the
|
||||
effect on the variance of the model output. """
|
||||
def _repr_html_(self):
|
||||
"""Representation of the model in html for notebook display."""
|
||||
model_details = [['<b>Model</b>', self.name + '<br>'],
|
||||
['<b>Log-likelihood</b>', '{}<br>'.format(float(self.log_likelihood()))],
|
||||
["<b>Number of Parameters</b>", '{}<br>'.format(self.size)]]
|
||||
from operator import itemgetter
|
||||
to_print = [""] + ["{}: {}".format(name, detail) for name, detail in model_details] + ["<br><b>Parameters</b>:"]
|
||||
to_print.append(super(Model, self)._repr_html_())
|
||||
return "\n".join(to_print)
|
||||
|
||||
if not hasattr(self, 'kern'):
|
||||
raise ValueError, "this model has no kernel"
|
||||
def __str__(self):
|
||||
model_details = [['Name', self.name],
|
||||
['Log-likelihood', '{}'.format(float(self.log_likelihood()))],
|
||||
["Number of Parameters", '{}'.format(self.size)]]
|
||||
from operator import itemgetter
|
||||
max_len = reduce(lambda a, b: max(len(b[0]), a), model_details, 0)
|
||||
to_print = [""] + ["{0:{l}} : {1}".format(name, detail, l=max_len) for name, detail in model_details] + ["Parameters:"]
|
||||
to_print.append(super(Model, self).__str__())
|
||||
return "\n".join(to_print)
|
||||
|
||||
k = [p for p in self.kern.parts if p.name in ['rbf', 'linear', 'rbf_inv']]
|
||||
if (not len(k) == 1) or (not k[0].ARD):
|
||||
raise ValueError, "cannot determine sensitivity for this kernel"
|
||||
k = k[0]
|
||||
|
||||
if k.name == 'rbf':
|
||||
return 1. / k.lengthscale
|
||||
elif k.name == 'rbf_inv':
|
||||
return k.inv_lengthscale
|
||||
elif k.name == 'linear':
|
||||
return k.variances
|
||||
|
||||
|
||||
def pseudo_EM(self, stop_crit=.1, **kwargs):
|
||||
"""
|
||||
EM - like algorithm for Expectation Propagation and Laplace approximation
|
||||
|
||||
:param stop_crit: convergence criterion
|
||||
:type stop_crit: float
|
||||
|
||||
.. Note: kwargs are passed to update_likelihood and optimize functions.
|
||||
"""
|
||||
assert isinstance(self.likelihood, (likelihoods.EP, likelihoods.EP_Mixed_Noise, likelihoods.Laplace)), "pseudo_EM is only available for approximate likelihoods"
|
||||
ll_change = stop_crit + 1.
|
||||
iteration = 0
|
||||
last_ll = -np.inf
|
||||
|
||||
convergence = False
|
||||
alpha = 0
|
||||
stop = False
|
||||
|
||||
#Handle **kwargs
|
||||
ep_args = {}
|
||||
for arg in kwargs.keys():
|
||||
if arg in ('epsilon','power_ep'):
|
||||
ep_args[arg] = kwargs[arg]
|
||||
del kwargs[arg]
|
||||
|
||||
while not stop:
|
||||
last_approximation = self.likelihood.copy()
|
||||
last_params = self._get_params()
|
||||
if len(ep_args) == 2:
|
||||
self.update_likelihood_approximation(epsilon=ep_args['epsilon'],power_ep=ep_args['power_ep'])
|
||||
elif len(ep_args) == 1:
|
||||
if ep_args.keys()[0] == 'epsilon':
|
||||
self.update_likelihood_approximation(epsilon=ep_args['epsilon'])
|
||||
elif ep_args.keys()[0] == 'power_ep':
|
||||
self.update_likelihood_approximation(power_ep=ep_args['power_ep'])
|
||||
else:
|
||||
self.update_likelihood_approximation()
|
||||
new_ll = self.log_likelihood()
|
||||
ll_change = new_ll - last_ll
|
||||
|
||||
if ll_change < 0:
|
||||
self.likelihood = last_approximation # restore previous likelihood approximation
|
||||
self._set_params(last_params) # restore model parameters
|
||||
print "Log-likelihood decrement: %s \nLast likelihood update discarded." % ll_change
|
||||
stop = True
|
||||
else:
|
||||
self.optimize(**kwargs)
|
||||
last_ll = self.log_likelihood()
|
||||
if ll_change < stop_crit:
|
||||
stop = True
|
||||
iteration += 1
|
||||
if stop:
|
||||
print "%s iterations." % iteration
|
||||
self.update_likelihood_approximation()
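A hedged usage sketch of the pseudo-EM loop above, assuming a model with an EP or Laplace likelihood approximation:

# Sketch: alternate likelihood-approximation updates with hyperparameter
# optimization until the log-likelihood gain drops below stop_crit.
m.pseudo_EM(stop_crit=0.01, epsilon=1e-3)   # epsilon is forwarded to the EP update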
|
||||
|
|
|
|||
5
GPy/core/parameterization/__init__.py
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from param import Param, ObsAr
|
||||
from parameterized import Parameterized
|
||||
25
GPy/core/parameterization/domains.py
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
"""
|
||||
(Hyper-)Parameter domains defined for :py:mod:`~GPy.core.priors` and :py:mod:`~GPy.kern`.
|
||||
These domains specify the legitimate realm of the parameters to live in.
|
||||
|
||||
:const:`~GPy.core.domains._REAL` :
|
||||
real domain, all values in the real numbers are allowed
|
||||
|
||||
:const:`~GPy.core.domains._POSITIVE`:
|
||||
positive domain, only positive real values are allowed
|
||||
|
||||
:const:`~GPy.core.domains._NEGATIVE`:
|
||||
same as :const:`~GPy.core.domains._POSITIVE`, but only negative values are allowed
|
||||
|
||||
:const:`~GPy.core.domains._BOUNDED`:
|
||||
only values within the bounded range are allowed,
|
||||
the bounds are specified within the object with the bounded range
|
||||
"""
|
||||
|
||||
_REAL = 'real'
|
||||
_POSITIVE = "positive"
|
||||
_NEGATIVE = 'negative'
|
||||
_BOUNDED = 'bounded'
|
||||
302
GPy/core/parameterization/index_operations.py
Normal file
|
|
@ -0,0 +1,302 @@
|
|||
# Copyright (c) 2014, Max Zwiessele
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy
|
||||
from numpy.lib.function_base import vectorize
|
||||
from lists_and_dicts import IntArrayDict
|
||||
|
||||
def extract_properties_to_index(index, props):
|
||||
prop_index = dict()
|
||||
for i, cl in enumerate(props):
|
||||
for c in cl:
|
||||
ind = prop_index.get(c, list())
|
||||
ind.append(index[i])
|
||||
prop_index[c] = ind
|
||||
|
||||
for c, i in prop_index.items():
|
||||
prop_index[c] = numpy.array(i, dtype=int)
|
||||
|
||||
return prop_index
|
||||
|
||||
|
||||
class ParameterIndexOperations(object):
|
||||
"""
|
||||
This object wraps a dictionary, whose keys are _operations_ that we'd like
|
||||
to apply to a parameter array, and whose values are np integer arrays which
|
||||
index the parameter array appropriately.
|
||||
|
||||
A model instance will contain one instance of this class for each thing
|
||||
that needs indexing (i.e. constraints, ties and priors). Parameters within
|
||||
the model contain instances of the ParameterIndexOperationsView class,
|
||||
which can map from a 'local' index (starting 0) to this global index.
|
||||
|
||||
Here's an illustration:
|
||||
|
||||
#=======================================================================
|
||||
model : 0 1 2 3 4 5 6 7 8 9
|
||||
key1: 4 5
|
||||
key2: 7 8
|
||||
|
||||
param1: 0 1 2 3 4 5
|
||||
key1: 2 3
|
||||
key2: 5
|
||||
|
||||
param2: 0 1 2 3 4
|
||||
key1: 0
|
||||
key2: 2 3
|
||||
#=======================================================================
|
||||
|
||||
The views of this global index have a subset of the keys in this global
|
||||
(model) index.
|
||||
|
||||
Adding a new key (e.g. a constraint) to a view will cause the view to pass
|
||||
the new key to the global index, along with the local index and an offset.
|
||||
This global index then stores the key and the appropriate global index
|
||||
(which can be seen by the view).
|
||||
|
||||
See also:
|
||||
ParameterIndexOperationsView
|
||||
|
||||
"""
|
||||
_offset = 0
|
||||
def __init__(self, constraints=None):
|
||||
self._properties = IntArrayDict()
|
||||
if constraints is not None:
|
||||
for t, i in constraints.iteritems():
|
||||
self.add(t, i)
|
||||
|
||||
def iteritems(self):
|
||||
return self._properties.iteritems()
|
||||
|
||||
def items(self):
|
||||
return self._properties.items()
|
||||
|
||||
def properties(self):
|
||||
return self._properties.keys()
|
||||
|
||||
def iterproperties(self):
|
||||
return self._properties.iterkeys()
|
||||
|
||||
def shift_right(self, start, size):
|
||||
for ind in self.iterindices():
|
||||
toshift = ind>=start
|
||||
ind[toshift] += size
|
||||
|
||||
def shift_left(self, start, size):
|
||||
for v, ind in self.items():
|
||||
todelete = (ind>=start) * (ind<start+size)
|
||||
if todelete.size != 0:
|
||||
ind = ind[~todelete]
|
||||
toshift = ind>=start
|
||||
if toshift.size != 0:
|
||||
ind[toshift] -= size
|
||||
if ind.size != 0: self._properties[v] = ind
|
||||
else: del self._properties[v]
|
||||
|
||||
def clear(self):
|
||||
self._properties.clear()
|
||||
|
||||
@property
|
||||
def size(self):
|
||||
return reduce(lambda a,b: a+b.size, self.iterindices(), 0)
|
||||
|
||||
def iterindices(self):
|
||||
return self._properties.itervalues()
|
||||
|
||||
def indices(self):
|
||||
return self._properties.values()
|
||||
|
||||
def properties_for(self, index):
|
||||
"""
|
||||
Returns a list of properties, such that each entry in the list corresponds
|
||||
to the element of the index given.
|
||||
|
||||
Example:
|
||||
let properties: 'one':[1,2,3,4], 'two':[3,5,6]
|
||||
|
||||
>>> properties_for([2,3,5])
|
||||
[['one'], ['one', 'two'], ['two']]
|
||||
"""
|
||||
return vectorize(lambda i: [prop for prop in self.iterproperties() if i in self[prop]], otypes=[list])(index)
|
||||
|
||||
def properties_to_index_dict(self, index):
|
||||
"""
|
||||
Return a dictionary mapping each property to the indices (taken from the given
index) at which it applies; the indices for every contained constraint are
thus collected into one dictionary.
|
||||
|
||||
Example:
|
||||
let properties: 'one':[1,2,3,4], 'two':[3,5,6]
|
||||
|
||||
>>> properties_to_index_dict([2,3,5])
|
||||
{'one':[2,3], 'two':[3,5]}
|
||||
"""
|
||||
props = self.properties_for(index)
|
||||
prop_index = extract_properties_to_index(index, props)
|
||||
return prop_index
|
||||
|
||||
def add(self, prop, indices):
|
||||
self._properties[prop] = combine_indices(self._properties[prop], indices)
|
||||
|
||||
def remove(self, prop, indices):
|
||||
if prop in self._properties:
|
||||
diff = remove_indices(self[prop], indices)
|
||||
removed = numpy.intersect1d(self[prop], indices, True)
|
||||
if not index_empty(diff):
|
||||
self._properties[prop] = diff
|
||||
else:
|
||||
del self._properties[prop]
|
||||
return removed.astype(int)
|
||||
return numpy.array([]).astype(int)
|
||||
|
||||
def update(self, parameter_index_view, offset=0):
|
||||
for i, v in parameter_index_view.iteritems():
|
||||
self.add(i, v+offset)
|
||||
|
||||
def copy(self):
|
||||
return self.__deepcopy__(None)
|
||||
|
||||
def __deepcopy__(self, memo):
|
||||
return ParameterIndexOperations(dict(self.iteritems()))
|
||||
|
||||
def __getitem__(self, prop):
|
||||
return self._properties[prop]
|
||||
|
||||
def __delitem__(self, prop):
|
||||
del self._properties[prop]
|
||||
|
||||
def __str__(self, *args, **kwargs):
|
||||
import pprint
|
||||
return pprint.pformat(dict(self._properties))
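To make the indexing scheme in the class docstring concrete, a hedged sketch of direct usage; the string keys stand in for real constraint objects:

# Sketch: a global index with two 'constraints' over a 10-parameter model.
import numpy as np
pio = ParameterIndexOperations()
pio.add('key1', np.array([4, 5]))
pio.add('key2', np.array([7, 8]))
print pio.properties_for(np.array([4, 7, 9]))    # roughly [['key1'], ['key2'], []]
# A view exposing parameters 3..8 sees the same keys under local indices:
view = ParameterIndexOperationsView(pio, 3, 6)
print view['key1']                               # array([1, 2])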
|
||||
|
||||
def combine_indices(arr1, arr2):
|
||||
return numpy.union1d(arr1, arr2)
|
||||
|
||||
def remove_indices(arr, to_remove):
|
||||
return numpy.setdiff1d(arr, to_remove, True)
|
||||
|
||||
def index_empty(index):
|
||||
return numpy.size(index) == 0
|
||||
|
||||
class ParameterIndexOperationsView(object):
|
||||
def __init__(self, param_index_operations, offset, size):
|
||||
self._param_index_ops = param_index_operations
|
||||
self._offset = offset
|
||||
self._size = size
|
||||
|
||||
def __getstate__(self):
|
||||
return [self._param_index_ops, self._offset, self._size]
|
||||
|
||||
def __setstate__(self, state):
|
||||
self._param_index_ops = state[0]
|
||||
self._offset = state[1]
|
||||
self._size = state[2]
|
||||
|
||||
def _filter_index(self, ind):
|
||||
return ind[(ind >= self._offset) * (ind < (self._offset + self._size))] - self._offset
|
||||
|
||||
|
||||
def iteritems(self):
|
||||
for i, ind in self._param_index_ops.iteritems():
|
||||
ind2 = self._filter_index(ind)
|
||||
if ind2.size > 0:
|
||||
yield i, ind2
|
||||
|
||||
def items(self):
|
||||
return [[i,v] for i,v in self.iteritems()]
|
||||
|
||||
def properties(self):
|
||||
return [i for i in self.iterproperties()]
|
||||
|
||||
|
||||
def iterproperties(self):
|
||||
for i, _ in self.iteritems():
|
||||
yield i
|
||||
|
||||
|
||||
def shift_right(self, start, size):
|
||||
self._param_index_ops.shift_right(start+self._offset, size)
|
||||
|
||||
def shift_left(self, start, size):
|
||||
self._param_index_ops.shift_left(start+self._offset, size)
|
||||
|
||||
def clear(self):
|
||||
for i, ind in self.items():
|
||||
self._param_index_ops.remove(i, ind+self._offset)
|
||||
|
||||
@property
|
||||
def size(self):
|
||||
return reduce(lambda a,b: a+b.size, self.iterindices(), 0)
|
||||
|
||||
|
||||
def iterindices(self):
|
||||
for _, ind in self.iteritems():
|
||||
yield ind
|
||||
|
||||
|
||||
def indices(self):
|
||||
return [ind for ind in self.iterindices()]
|
||||
|
||||
|
||||
def properties_for(self, index):
|
||||
"""
|
||||
Returns a list of properties, such that each entry in the list corresponds
|
||||
to the element of the index given.
|
||||
|
||||
Example:
|
||||
let properties: 'one':[1,2,3,4], 'two':[3,5,6]
|
||||
|
||||
>>> properties_for([2,3,5])
|
||||
[['one'], ['one', 'two'], ['two']]
|
||||
"""
|
||||
return vectorize(lambda i: [prop for prop in self.iterproperties() if i in self[prop]], otypes=[list])(index)
|
||||
|
||||
def properties_to_index_dict(self, index):
|
||||
"""
|
||||
Return a dictionary mapping each property to the indices (taken from the given
index) at which it applies; the indices for every contained constraint are
thus collected into one dictionary.
|
||||
|
||||
Example:
|
||||
let properties: 'one':[1,2,3,4], 'two':[3,5,6]
|
||||
|
||||
>>> properties_to_index_dict([2,3,5])
|
||||
{'one':[2,3], 'two':[3,5]}
|
||||
"""
|
||||
return extract_properties_to_index(index, self.properties_for(index))
|
||||
|
||||
|
||||
def add(self, prop, indices):
|
||||
self._param_index_ops.add(prop, indices+self._offset)
|
||||
|
||||
|
||||
def remove(self, prop, indices):
|
||||
removed = self._param_index_ops.remove(prop, numpy.array(indices)+self._offset)
|
||||
if removed.size > 0:
|
||||
return removed-self._offset
|
||||
return removed
|
||||
|
||||
|
||||
def __getitem__(self, prop):
|
||||
ind = self._filter_index(self._param_index_ops[prop])
|
||||
return ind
|
||||
|
||||
def __delitem__(self, prop):
|
||||
self.remove(prop, self[prop])
|
||||
|
||||
def __str__(self, *args, **kwargs):
|
||||
import pprint
|
||||
return pprint.pformat(dict(self.iteritems()))
|
||||
|
||||
def update(self, parameter_index_view, offset=0):
|
||||
for i, v in parameter_index_view.iteritems():
|
||||
self.add(i, v+offset)
|
||||
|
||||
|
||||
def copy(self):
|
||||
return self.__deepcopy__(None)
|
||||
|
||||
def __deepcopy__(self, memo):
|
||||
return ParameterIndexOperations(dict(self.iteritems()))
|
||||
pass
|
||||
|
||||
139
GPy/core/parameterization/lists_and_dicts.py
Normal file
|
|
@ -0,0 +1,139 @@
|
|||
# Copyright (c) 2014, Max Zwiessele
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from collections import defaultdict
|
||||
import weakref
|
||||
|
||||
def intarray_default_factory():
|
||||
import numpy as np
|
||||
return np.int_([])
|
||||
|
||||
class IntArrayDict(defaultdict):
|
||||
def __init__(self, default_factory=None):
|
||||
"""
|
||||
Default will be self._default, if not set otherwise
|
||||
"""
|
||||
defaultdict.__init__(self, intarray_default_factory)
|
||||
|
||||
class ArrayList(list):
|
||||
"""
|
||||
List to store ndarray-likes in.
|
||||
It will look for 'is' instead of calling __eq__ on each element.
|
||||
"""
|
||||
def __contains__(self, other):
|
||||
for el in self:
|
||||
if el is other:
|
||||
return True
|
||||
return False
|
||||
|
||||
def index(self, item):
|
||||
index = 0
|
||||
for el in self:
|
||||
if el is item:
|
||||
return index
|
||||
index += 1
|
||||
raise ValueError, "{} is not in list".format(item)
|
||||
pass
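A brief sketch of why ArrayList exists: numpy arrays make `in` ambiguous under `==`, so membership here is by object identity.

# Sketch: identity-based containment, as implemented above.
import numpy as np
a = np.arange(3)
l = ArrayList([a])
print a in l              # True: same object
print np.arange(3) in l   # False: equal values, different object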
|
||||
|
||||
class ObserverList(object):
|
||||
"""
|
||||
A list which contains the observers.
|
||||
It only holds weak references to observers, such that unbound
|
||||
observers don't dangle in memory.
|
||||
"""
|
||||
def __init__(self):
|
||||
self._poc = []
|
||||
|
||||
def __getitem__(self, ind):
|
||||
p,o,c = self._poc[ind]
|
||||
return p, o(), c
|
||||
|
||||
def remove(self, priority, observer, callble):
|
||||
"""
|
||||
Remove one observer, which had priority and callble.
|
||||
"""
|
||||
self.flush()
|
||||
for i in range(len(self) - 1, -1, -1):
|
||||
p,o,c = self[i]
|
||||
if priority==p and observer==o and callble==c:
|
||||
del self._poc[i]
|
||||
|
||||
def __repr__(self):
|
||||
return self._poc.__repr__()
|
||||
|
||||
def add(self, priority, observer, callble):
|
||||
"""
|
||||
Add an observer with priority and callble
|
||||
"""
|
||||
if observer is not None:
|
||||
ins = 0
|
||||
for pr, _, _ in self:
|
||||
if priority > pr:
|
||||
break
|
||||
ins += 1
|
||||
self._poc.insert(ins, (priority, weakref.ref(observer), callble))
|
||||
|
||||
def __str__(self):
|
||||
from . import ObsAr, Param
|
||||
from parameter_core import Parameterizable
|
||||
ret = []
|
||||
curr_p = None
|
||||
|
||||
def frmt(o):
|
||||
if isinstance(o, ObsAr):
|
||||
return 'ObsArr <{}>'.format(hex(id(o)))
|
||||
elif isinstance(o, (Param,Parameterizable)):
|
||||
return '{}'.format(o.hierarchy_name())
|
||||
else:
|
||||
return repr(o)
|
||||
for p, o, c in self:
|
||||
curr = ''
|
||||
if curr_p != p:
|
||||
pre = "{!s}: ".format(p)
|
||||
curr_pre = pre
|
||||
else: curr_pre = " "*len(pre)
|
||||
curr_p = p
|
||||
curr += curr_pre
|
||||
|
||||
ret.append(curr + ", ".join([frmt(o), str(c)]))
|
||||
return '\n'.join(ret)
|
||||
|
||||
def flush(self):
|
||||
"""
|
||||
Make sure all weak references, which point to nothing are flushed (deleted)
|
||||
"""
|
||||
self._poc = [(p,o,c) for p,o,c in self._poc if o() is not None]
|
||||
|
||||
def __iter__(self):
|
||||
self.flush()
|
||||
for p, o, c in self._poc:
|
||||
yield p, o(), c
|
||||
|
||||
def __len__(self):
|
||||
self.flush()
|
||||
return self._poc.__len__()
|
||||
|
||||
def __deepcopy__(self, memo):
|
||||
s = ObserverList()
|
||||
for p,o,c in self:
|
||||
import copy
|
||||
s.add(p, copy.deepcopy(o, memo), copy.deepcopy(c, memo))
|
||||
s.flush()
|
||||
return s
|
||||
|
||||
def __getstate__(self):
|
||||
self.flush()
|
||||
from ...util.caching import Cacher
|
||||
obs = []
|
||||
for p, o, c in self:
|
||||
if (getattr(o, c.__name__, None) is not None
|
||||
and not isinstance(o, Cacher)):
|
||||
obs.append((p,o,c.__name__))
|
||||
return obs
|
||||
|
||||
def __setstate__(self, state):
|
||||
self._poc = []
|
||||
for p, o, c in state:
|
||||
self.add(p,o,getattr(o, c))
|
||||
|
||||
pass
|
||||
66
GPy/core/parameterization/observable.py
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
# Copyright (c) 2014, Max Zwiessele
# Licensed under the BSD 3-clause license (see LICENSE.txt)


class Observable(object):
    """
    Observable pattern for parameterization.

    This object allows observers to register themselves with a (bound!) function
    as the callback. Every time the observable changes, it sends a notification with
    self as the only argument to all its observers.
    """
    def __init__(self, *args, **kwargs):
        super(Observable, self).__init__()
        from lists_and_dicts import ObserverList
        self.observers = ObserverList()

    def add_observer(self, observer, callble, priority=0):
        """
        Add an observer `observer` with the callback `callble`
        and priority `priority` to this observers list.
        """
        self.observers.add(priority, observer, callble)

    def remove_observer(self, observer, callble=None):
        """
        Either (if callble is None) remove all callables
        which were added alongside observer,
        or remove the callable `callble` which was added alongside
        the observer `observer`.
        """
        to_remove = []
        for poc in self.observers:
            _, obs, clble = poc
            if callble is not None:
                if (obs is observer) and (callble == clble):
                    to_remove.append(poc)
            else:
                if obs is observer:
                    to_remove.append(poc)
        for r in to_remove:
            self.observers.remove(*r)

    def notify_observers(self, which=None, min_priority=None):
        """
        Notifies all observers. `which` is the element that kicked off this
        notification loop. The first argument will be self, the second `which`.

        NOTE: notifies only observers with priority p > min_priority!

        :param min_priority: only notify observers with priority > min_priority;
                             if min_priority is None, notify all observers in order
        """
        if which is None:
            which = self
        if min_priority is None:
            [callble(self, which=which) for _, _, callble in self.observers]
        else:
            for p, _, callble in self.observers:
                if p <= min_priority:
                    break
                callble(self, which=which)

    def change_priority(self, observer, callble, priority):
        self.remove_observer(observer, callble)
        self.add_observer(observer, callble, priority)
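A minimal usage sketch (not part of this commit) of observer registration and the min_priority threshold described in notify_observers above; the Logger class is hypothetical:

    class Logger(object):
        def changed(self, obs, which=None):
            print 'changed:', which

    o = Observable()
    log = Logger()
    o.add_observer(log, log.changed, priority=0)
    o.notify_observers()                 # calls log.changed(o, which=o)
    o.notify_observers(min_priority=0)   # nothing fires: only priorities > 0 would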
GPy/core/parameterization/observable_array.py (new file, 147 lines)
@@ -0,0 +1,147 @@
# Copyright (c) 2014, Max Zwiessele
# Licensed under the BSD 3-clause license (see LICENSE.txt)


import numpy as np
from parameter_core import Pickleable
from observable import Observable

class ObsAr(np.ndarray, Pickleable, Observable):
    """
    An ndarray which reports changes to its observers.
    The observers can add themselves with a callable, which
    will be called every time this array changes. The callable
    takes exactly one argument, which is this array itself.
    """
    __array_priority__ = -1  # Never give back ObsAr
    def __new__(cls, input_array, *a, **kw):
        # always make a copy of the input parameters, as we need it to be in C order:
        if not isinstance(input_array, ObsAr):
            obj = np.atleast_1d(np.require(input_array, dtype=np.float64, requirements=['W', 'C'])).view(cls)
        else: obj = input_array
        super(ObsAr, obj).__init__(*a, **kw)
        return obj

    def __array_finalize__(self, obj):
        # see InfoArray.__array_finalize__ for comments
        if obj is None: return
        self.observers = getattr(obj, 'observers', None)

    def __array_wrap__(self, out_arr, context=None):
        return out_arr.view(np.ndarray)

    def _setup_observers(self):
        # do not set up anything, as observable arrays do not have default observers
        pass

    @property
    def values(self):
        return self.view(np.ndarray)

    def copy(self):
        from lists_and_dicts import ObserverList
        memo = {}
        memo[id(self)] = self
        memo[id(self.observers)] = ObserverList()
        return self.__deepcopy__(memo)

    def __deepcopy__(self, memo):
        s = self.__new__(self.__class__, input_array=self.view(np.ndarray).copy())
        memo[id(self)] = s
        import copy
        Pickleable.__setstate__(s, copy.deepcopy(self.__getstate__(), memo))
        return s

    def __reduce__(self):
        func, args, state = super(ObsAr, self).__reduce__()
        return func, args, (state, Pickleable.__getstate__(self))

    def __setstate__(self, state):
        np.ndarray.__setstate__(self, state[0])
        Pickleable.__setstate__(self, state[1])

    def __setitem__(self, s, val):
        super(ObsAr, self).__setitem__(s, val)
        self.notify_observers()

    def __getslice__(self, start, stop):
        return self.__getitem__(slice(start, stop))

    def __setslice__(self, start, stop, val):
        return self.__setitem__(slice(start, stop), val)

    def __ilshift__(self, *args, **kwargs):
        r = np.ndarray.__ilshift__(self, *args, **kwargs)
        self.notify_observers()
        return r

    def __irshift__(self, *args, **kwargs):
        r = np.ndarray.__irshift__(self, *args, **kwargs)
        self.notify_observers()
        return r

    def __ixor__(self, *args, **kwargs):
        r = np.ndarray.__ixor__(self, *args, **kwargs)
        self.notify_observers()
        return r

    def __ipow__(self, *args, **kwargs):
        r = np.ndarray.__ipow__(self, *args, **kwargs)
        self.notify_observers()
        return r

    def __ifloordiv__(self, *args, **kwargs):
        r = np.ndarray.__ifloordiv__(self, *args, **kwargs)
        self.notify_observers()
        return r

    def __isub__(self, *args, **kwargs):
        r = np.ndarray.__isub__(self, *args, **kwargs)
        self.notify_observers()
        return r

    def __ior__(self, *args, **kwargs):
        r = np.ndarray.__ior__(self, *args, **kwargs)
        self.notify_observers()
        return r

    def __itruediv__(self, *args, **kwargs):
        r = np.ndarray.__itruediv__(self, *args, **kwargs)
        self.notify_observers()
        return r

    def __idiv__(self, *args, **kwargs):
        r = np.ndarray.__idiv__(self, *args, **kwargs)
        self.notify_observers()
        return r

    def __iand__(self, *args, **kwargs):
        r = np.ndarray.__iand__(self, *args, **kwargs)
        self.notify_observers()
        return r

    def __imod__(self, *args, **kwargs):
        r = np.ndarray.__imod__(self, *args, **kwargs)
        self.notify_observers()
        return r

    def __iadd__(self, *args, **kwargs):
        r = np.ndarray.__iadd__(self, *args, **kwargs)
        self.notify_observers()
        return r

    def __imul__(self, *args, **kwargs):
        r = np.ndarray.__imul__(self, *args, **kwargs)
        self.notify_observers()
        return r
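A minimal sketch (not part of this commit) of how ObsAr reports changes; the Watcher class is hypothetical and assumes the observers list is set up as in Observable:

    import numpy as np

    class Watcher(object):
        def __init__(self):
            self.count = 0
        def changed(self, arr, which=None):
            self.count += 1

    x = ObsAr(np.arange(3.0))
    w = Watcher()
    x.add_observer(w, w.changed)
    x[0] = 42.0   # __setitem__ notifies observers -> w.count == 1
    x += 1        # in-place operators notify too  -> w.count == 2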
GPy/core/parameterization/param.py (new file, 476 lines)
@@ -0,0 +1,476 @@
# Copyright (c) 2014, Max Zwiessele
# Licensed under the BSD 3-clause license (see LICENSE.txt)

import itertools
import numpy
np = numpy
from parameter_core import Parameterizable, adjust_name_for_printing, Pickleable
from observable_array import ObsAr

###### printing
__constraints_name__ = "Constraint"
__index_name__ = "Index"
__tie_name__ = "Tied to"
__priors_name__ = "Prior"
__precision__ = numpy.get_printoptions()['precision']  # numpy printing precision used, subclassing numpy ndarray after all
__print_threshold__ = 5
######

class Param(Parameterizable, ObsAr):
    """
    Parameter object for GPy models.

    :param str name: name of the parameter to be printed
    :param input_array: array which this parameter handles
    :type input_array: numpy.ndarray
    :param default_constraint: the default constraint for this parameter
    :type default_constraint:

    You can add/remove constraints by calling constrain on the parameter itself, e.g.:

    - self[:,1].constrain_positive()
    - self[0].tie_to(other)
    - self.untie()
    - self[:3,:].unconstrain()
    - self[1].fix()

    Fixing parameters will fix them to the value they are right now. If you change
    the fixed value, it will be fixed to the new value!

    See :py:class:`GPy.core.parameterized.Parameterized` for more details on constraining etc.
    """
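    # A hypothetical sketch (not part of this commit) of the constraining calls
    # listed in the docstring above; 'lengthscale' is only an illustrative name:
    #
    #   p = Param('lengthscale', np.ones(3))
    #   p[:2].constrain_positive()   # constrain a view of the parameter
    #   p[2].fix()                   # fix the last entry at its current value
    #   p.unconstrain()              # remove all constraints again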
__array_priority__ = -1 # Never give back Param
|
||||
_fixes_ = None
|
||||
parameters = []
|
||||
def __new__(cls, name, input_array, default_constraint=None):
|
||||
obj = numpy.atleast_1d(super(Param, cls).__new__(cls, input_array=input_array))
|
||||
obj._current_slice_ = (slice(obj.shape[0]),)
|
||||
obj._realshape_ = obj.shape
|
||||
obj._realsize_ = obj.size
|
||||
obj._realndim_ = obj.ndim
|
||||
obj._original_ = obj
|
||||
return obj
|
||||
|
||||
def __init__(self, name, input_array, default_constraint=None, *a, **kw):
|
||||
self._in_init_ = True
|
||||
super(Param, self).__init__(name=name, default_constraint=default_constraint, *a, **kw)
|
||||
self._in_init_ = False
|
||||
|
||||
def build_pydot(self,G):
|
||||
import pydot
|
||||
node = pydot.Node(id(self), shape='trapezium', label=self.name)#, fontcolor='white', color='white')
|
||||
G.add_node(node)
|
||||
for _, o, _ in self.observers:
|
||||
label = o.name if hasattr(o, 'name') else str(o)
|
||||
observed_node = pydot.Node(id(o), label=label)
|
||||
G.add_node(observed_node)
|
||||
edge = pydot.Edge(str(id(self)), str(id(o)), color='darkorange2', arrowhead='vee')
|
||||
G.add_edge(edge)
|
||||
|
||||
return node
|
||||
|
||||
def __array_finalize__(self, obj):
|
||||
# see InfoArray.__array_finalize__ for comments
|
||||
if obj is None: return
|
||||
super(Param, self).__array_finalize__(obj)
|
||||
self._parent_ = getattr(obj, '_parent_', None)
|
||||
self._parent_index_ = getattr(obj, '_parent_index_', None)
|
||||
self._default_constraint_ = getattr(obj, '_default_constraint_', None)
|
||||
self._current_slice_ = getattr(obj, '_current_slice_', None)
|
||||
self._realshape_ = getattr(obj, '_realshape_', None)
|
||||
self._realsize_ = getattr(obj, '_realsize_', None)
|
||||
self._realndim_ = getattr(obj, '_realndim_', None)
|
||||
self._original_ = getattr(obj, '_original_', None)
|
||||
self._name = getattr(obj, '_name', None)
|
||||
self._gradient_array_ = getattr(obj, '_gradient_array_', None)
|
||||
self.constraints = getattr(obj, 'constraints', None)
|
||||
self.priors = getattr(obj, 'priors', None)
|
||||
|
||||
@property
|
||||
def param_array(self):
|
||||
"""
|
||||
As we are a leaf, this just returns self
|
||||
"""
|
||||
return self
|
||||
|
||||
@property
|
||||
def values(self):
|
||||
"""
|
||||
Return self as numpy array view
|
||||
"""
|
||||
return self.view(np.ndarray)
|
||||
|
||||
@property
|
||||
def gradient(self):
|
||||
"""
|
||||
Return a view on the gradient, which is in the same shape as this parameter is.
|
||||
Note: this is not the real gradient array, it is just a view on it.
|
||||
|
||||
To work on the real gradient array use: self.full_gradient
|
||||
"""
|
||||
if getattr(self, '_gradient_array_', None) is None:
|
||||
self._gradient_array_ = numpy.empty(self._realshape_, dtype=numpy.float64)
|
||||
return self._gradient_array_#[self._current_slice_]
|
||||
|
||||
@gradient.setter
|
||||
def gradient(self, val):
|
||||
self._gradient_array_[:] = val
|
||||
|
||||
#===========================================================================
|
||||
# Array operations -> done
|
||||
#===========================================================================
|
||||
def __getitem__(self, s, *args, **kwargs):
|
||||
if not isinstance(s, tuple):
|
||||
s = (s,)
|
||||
#if not reduce(lambda a, b: a or numpy.any(b is Ellipsis), s, False) and len(s) <= self.ndim:
|
||||
# s += (Ellipsis,)
|
||||
new_arr = super(Param, self).__getitem__(s, *args, **kwargs)
|
||||
try:
|
||||
new_arr._current_slice_ = s
|
||||
new_arr._gradient_array_ = self.gradient[s]
|
||||
new_arr._original_ = self._original_
|
||||
except AttributeError: pass # returning 0d array or float, double etc
|
||||
return new_arr
|
||||
|
||||
def _raveled_index(self, slice_index=None):
|
||||
# return an index array on the raveled array, which is formed by the current_slice
|
||||
# of this object
|
||||
extended_realshape = numpy.cumprod((1,) + self._realshape_[:0:-1])[::-1]
|
||||
ind = self._indices(slice_index)
|
||||
if ind.ndim < 2: ind = ind[:, None]
|
||||
return numpy.asarray(numpy.apply_along_axis(lambda x: numpy.sum(extended_realshape * x), 1, ind), dtype=int)
|
||||
|
||||
def _raveled_index_for(self, obj):
|
||||
return self._raveled_index()
|
||||
|
||||
#===========================================================================
|
||||
# Constrainable
|
||||
#===========================================================================
|
||||
def _ensure_fixes(self):
|
||||
if not self._has_fixes(): self._fixes_ = numpy.ones(self._realsize_, dtype=bool)
|
||||
|
||||
#===========================================================================
|
||||
# Convenience
|
||||
#===========================================================================
|
||||
@property
|
||||
def is_fixed(self):
|
||||
from transformations import __fixed__
|
||||
return self.constraints[__fixed__].size == self.size
|
||||
|
||||
def _get_original(self, param):
|
||||
return self._original_
|
||||
|
||||
#===========================================================================
|
||||
# Pickling and copying
|
||||
#===========================================================================
|
||||
def copy(self):
|
||||
return Parameterizable.copy(self, which=self)
|
||||
|
||||
def __deepcopy__(self, memo):
|
||||
s = self.__new__(self.__class__, name=self.name, input_array=self.view(numpy.ndarray).copy())
|
||||
memo[id(self)] = s
|
||||
import copy
|
||||
Pickleable.__setstate__(s, copy.deepcopy(self.__getstate__(), memo))
|
||||
return s
|
||||
def _setup_observers(self):
|
||||
"""
|
||||
Setup the default observers
|
||||
|
||||
1: pass through to parent, if present
|
||||
"""
|
||||
if self.has_parent():
|
||||
self.add_observer(self._parent_, self._parent_._pass_through_notify_observers, -np.inf)
|
||||
|
||||
#===========================================================================
|
||||
# Printing -> done
|
||||
#===========================================================================
|
||||
@property
|
||||
def _description_str(self):
|
||||
if self.size <= 1:
|
||||
return [str(self.view(numpy.ndarray)[0])]
|
||||
else: return [str(self.shape)]
|
||||
def parameter_names(self, add_self=False, adjust_for_printing=False, recursive=True):
|
||||
# this is just overwriting the parameterized calls to parameter names, in order to maintain OOP
|
||||
if adjust_for_printing:
|
||||
return [adjust_name_for_printing(self.name)]
|
||||
return [self.name]
|
||||
@property
|
||||
def flattened_parameters(self):
|
||||
return [self]
|
||||
@property
|
||||
def parameter_shapes(self):
|
||||
return [self.shape]
|
||||
@property
|
||||
def num_params(self):
|
||||
return 0
|
||||
@property
|
||||
def _constraints_str(self):
|
||||
return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.constraints.iteritems()))]
|
||||
@property
|
||||
def _priors_str(self):
|
||||
return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.priors.iteritems()))]
|
||||
@property
|
||||
def _ties_str(self):
|
||||
return ['']
|
||||
def _ties_for(self, ravi):
|
||||
return [['N/A']]*ravi.size
|
||||
def __repr__(self, *args, **kwargs):
|
||||
name = "\033[1m{x:s}\033[0;0m:\n".format(
|
||||
x=self.hierarchy_name())
|
||||
return name + super(Param, self).__repr__(*args, **kwargs)
|
||||
def _indices(self, slice_index=None):
|
||||
# get a int-array containing all indices in the first axis.
|
||||
if slice_index is None:
|
||||
slice_index = self._current_slice_
|
||||
try:
|
||||
indices = np.indices(self._realshape_, dtype=int)
|
||||
indices = indices[(slice(None),)+slice_index]
|
||||
indices = np.rollaxis(indices, 0, indices.ndim).reshape(-1,self._realndim_)
|
||||
#print indices_
|
||||
#if not np.all(indices==indices__):
|
||||
# import ipdb; ipdb.set_trace()
|
||||
except:
|
||||
indices = np.indices(self._realshape_, dtype=int)
|
||||
indices = indices[(slice(None),)+slice_index]
|
||||
indices = np.rollaxis(indices, 0, indices.ndim)
|
||||
return indices
|
||||
def _max_len_names(self, gen, header):
|
||||
gen = map(lambda x: " ".join(map(str, x)), gen)
|
||||
return reduce(lambda a, b:max(a, len(b)), gen, len(header))
|
||||
def _max_len_values(self):
|
||||
return reduce(lambda a, b:max(a, len("{x:=.{0}g}".format(__precision__, x=b))), self.flat, len(self.hierarchy_name()))
|
||||
def _max_len_index(self, ind):
|
||||
return reduce(lambda a, b:max(a, len(str(b))), ind, len(__index_name__))
|
||||
def _short(self):
|
||||
# short string to print
|
||||
name = self.hierarchy_name()
|
||||
if self._realsize_ < 2:
|
||||
return name
|
||||
ind = self._indices()
|
||||
if ind.size > 4: indstr = ','.join(map(str, ind[:2])) + "..." + ','.join(map(str, ind[-2:]))
|
||||
else: indstr = ','.join(map(str, ind))
|
||||
return name + '[' + indstr + ']'
|
||||
|
||||
def _repr_html_(self, constr_matrix=None, indices=None, prirs=None, ties=None):
|
||||
"""Representation of the parameter in html for notebook display."""
|
||||
filter_ = self._current_slice_
|
||||
vals = self.flat
|
||||
if indices is None: indices = self._indices(filter_)
|
||||
ravi = self._raveled_index(filter_)
|
||||
if constr_matrix is None: constr_matrix = self.constraints.properties_for(ravi)
|
||||
if prirs is None: prirs = self.priors.properties_for(ravi)
|
||||
if ties is None: ties = self._ties_for(ravi)
|
||||
ties = [' '.join(map(lambda x: x, t)) for t in ties]
|
||||
header_format = """
|
||||
<tr>
|
||||
<td><b>{i}</b></td>
|
||||
<td><b>{x}</b></td>
|
||||
<td><b>{c}</b></td>
|
||||
<td><b>{p}</b></td>
|
||||
<td><b>{t}</b></td>
|
||||
</tr>"""
|
||||
header = header_format.format(x=self.hierarchy_name(), c=__constraints_name__, i=__index_name__, t=__tie_name__, p=__priors_name__) # nice header for printing
|
||||
if not ties: ties = itertools.cycle([''])
|
||||
return "\n".join(['<table>'] + [header] + ["<tr><td>{i}</td><td align=\"right\">{x}</td><td>{c}</td><td>{p}</td><td>{t}</td></tr>".format(x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in itertools.izip(indices, vals, constr_matrix, ties, prirs)] + ["</table>"])
|
||||
|
||||
def __str__(self, constr_matrix=None, indices=None, prirs=None, ties=None, lc=None, lx=None, li=None, lp=None, lt=None, only_name=False):
|
||||
filter_ = self._current_slice_
|
||||
vals = self.flat
|
||||
if indices is None: indices = self._indices(filter_)
|
||||
ravi = self._raveled_index(filter_)
|
||||
if constr_matrix is None: constr_matrix = self.constraints.properties_for(ravi)
|
||||
if prirs is None: prirs = self.priors.properties_for(ravi)
|
||||
if ties is None: ties = self._ties_for(ravi)
|
||||
ties = [' '.join(map(lambda x: x, t)) for t in ties]
|
||||
if lc is None: lc = self._max_len_names(constr_matrix, __constraints_name__)
|
||||
if lx is None: lx = self._max_len_values()
|
||||
if li is None: li = self._max_len_index(indices)
|
||||
if lt is None: lt = self._max_len_names(ties, __tie_name__)
|
||||
if lp is None: lp = self._max_len_names(prirs, __tie_name__)
|
||||
sep = '-'
|
||||
header_format = " {i:{5}^{2}s} | \033[1m{x:{5}^{1}s}\033[0;0m | {c:{5}^{0}s} | {p:{5}^{4}s} | {t:{5}^{3}s}"
|
||||
if only_name: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.hierarchy_name(), c=sep*lc, i=sep*li, t=sep*lt, p=sep*lp) # nice header for printing
|
||||
else: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.hierarchy_name(), c=__constraints_name__, i=__index_name__, t=__tie_name__, p=__priors_name__) # nice header for printing
|
||||
if not ties: ties = itertools.cycle([''])
|
||||
return "\n".join([header] + [" {i!s:^{3}s} | {x: >{1}.{2}g} | {c:^{0}s} | {p:^{5}s} | {t:^{4}s} ".format(lc, lx, __precision__, li, lt, lp, x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in itertools.izip(indices, vals, constr_matrix, ties, prirs)]) # return all the constraints with right indices
|
||||
# except: return super(Param, self).__str__()
|
||||
|
||||
class ParamConcatenation(object):
|
||||
def __init__(self, params):
|
||||
"""
|
||||
Parameter concatenation for convenience of printing regular expression matched arrays
|
||||
you can index this concatenation as if it was the flattened concatenation
|
||||
of all the parameters it contains, same for setting parameters (Broadcasting enabled).
|
||||
|
||||
See :py:class:`GPy.core.parameter.Param` for more details on constraining.
|
||||
"""
|
||||
# self.params = params
|
||||
from lists_and_dicts import ArrayList
|
||||
self.params = ArrayList([])
|
||||
for p in params:
|
||||
for p in p.flattened_parameters:
|
||||
if p not in self.params:
|
||||
self.params.append(p)
|
||||
self._param_sizes = [p.size for p in self.params]
|
||||
startstops = numpy.cumsum([0] + self._param_sizes)
|
||||
self._param_slices_ = [slice(start, stop) for start,stop in zip(startstops, startstops[1:])]
|
||||
|
||||
parents = dict()
|
||||
for p in self.params:
|
||||
if p.has_parent():
|
||||
parent = p._parent_
|
||||
level = 0
|
||||
while parent is not None:
|
||||
if parent in parents:
|
||||
parents[parent] = max(level, parents[parent])
|
||||
else:
|
||||
parents[parent] = level
|
||||
level += 1
|
||||
parent = parent._parent_
|
||||
import operator
|
||||
self.parents = map(lambda x: x[0], sorted(parents.iteritems(), key=operator.itemgetter(1)))
|
||||
#===========================================================================
|
||||
# Get/set items, enable broadcasting
|
||||
#===========================================================================
|
||||
def __getitem__(self, s):
|
||||
ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True;
|
||||
params = [p.param_array.flat[ind[ps]] for p,ps in zip(self.params, self._param_slices_) if numpy.any(p.param_array.flat[ind[ps]])]
|
||||
if len(params)==1: return params[0]
|
||||
return ParamConcatenation(params)
|
||||
def __setitem__(self, s, val, update=True):
|
||||
if isinstance(val, ParamConcatenation):
|
||||
val = val.values()
|
||||
ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True;
|
||||
vals = self.values(); vals[s] = val
|
||||
for p, ps in zip(self.params, self._param_slices_):
|
||||
p.flat[ind[ps]] = vals[ps]
|
||||
if update:
|
||||
self.update_all_params()
|
||||
def values(self):
|
||||
return numpy.hstack([p.param_array.flat for p in self.params])
|
||||
#===========================================================================
|
||||
# parameter operations:
|
||||
#===========================================================================
|
||||
def update_all_params(self):
|
||||
for par in self.parents:
|
||||
par.notify_observers()
|
||||
|
||||
def constrain(self, constraint, warning=True):
|
||||
[param.constrain(constraint, trigger_parent=False) for param in self.params]
|
||||
self.update_all_params()
|
||||
constrain.__doc__ = Param.constrain.__doc__
|
||||
|
||||
def constrain_positive(self, warning=True):
|
||||
[param.constrain_positive(warning, trigger_parent=False) for param in self.params]
|
||||
self.update_all_params()
|
||||
constrain_positive.__doc__ = Param.constrain_positive.__doc__
|
||||
|
||||
def constrain_fixed(self, value=None, warning=True, trigger_parent=True):
|
||||
[param.constrain_fixed(value, warning, trigger_parent) for param in self.params]
|
||||
constrain_fixed.__doc__ = Param.constrain_fixed.__doc__
|
||||
fix = constrain_fixed
|
||||
|
||||
def constrain_negative(self, warning=True):
|
||||
[param.constrain_negative(warning, trigger_parent=False) for param in self.params]
|
||||
self.update_all_params()
|
||||
constrain_negative.__doc__ = Param.constrain_negative.__doc__
|
||||
|
||||
def constrain_bounded(self, lower, upper, warning=True):
|
||||
[param.constrain_bounded(lower, upper, warning, trigger_parent=False) for param in self.params]
|
||||
self.update_all_params()
|
||||
constrain_bounded.__doc__ = Param.constrain_bounded.__doc__
|
||||
|
||||
def unconstrain(self, *constraints):
|
||||
[param.unconstrain(*constraints) for param in self.params]
|
||||
unconstrain.__doc__ = Param.unconstrain.__doc__
|
||||
|
||||
def unconstrain_negative(self):
|
||||
[param.unconstrain_negative() for param in self.params]
|
||||
unconstrain_negative.__doc__ = Param.unconstrain_negative.__doc__
|
||||
|
||||
def unconstrain_positive(self):
|
||||
[param.unconstrain_positive() for param in self.params]
|
||||
unconstrain_positive.__doc__ = Param.unconstrain_positive.__doc__
|
||||
|
||||
def unconstrain_fixed(self):
|
||||
[param.unconstrain_fixed() for param in self.params]
|
||||
unconstrain_fixed.__doc__ = Param.unconstrain_fixed.__doc__
|
||||
unfix = unconstrain_fixed
|
||||
|
||||
def unconstrain_bounded(self, lower, upper):
|
||||
[param.unconstrain_bounded(lower, upper) for param in self.params]
|
||||
unconstrain_bounded.__doc__ = Param.unconstrain_bounded.__doc__
|
||||
|
||||
def untie(self, *ties):
|
||||
[param.untie(*ties) for param in self.params]
|
||||
|
||||
def checkgrad(self, verbose=0, step=1e-6, tolerance=1e-3):
|
||||
return self.params[0]._highest_parent_._checkgrad(self, verbose, step, tolerance)
|
||||
#checkgrad.__doc__ = Gradcheckable.checkgrad.__doc__
|
||||
|
||||
__lt__ = lambda self, val: self.values() < val
|
||||
__le__ = lambda self, val: self.values() <= val
|
||||
__eq__ = lambda self, val: self.values() == val
|
||||
__ne__ = lambda self, val: self.values() != val
|
||||
__gt__ = lambda self, val: self.values() > val
|
||||
__ge__ = lambda self, val: self.values() >= val
|
||||
def __str__(self, *args, **kwargs):
|
||||
def f(p):
|
||||
ind = p._raveled_index()
|
||||
return p.constraints.properties_for(ind), p._ties_for(ind), p.priors.properties_for(ind)
|
||||
params = self.params
|
||||
constr_matrices, ties_matrices, prior_matrices = zip(*map(f, params))
|
||||
indices = [p._indices() for p in params]
|
||||
lc = max([p._max_len_names(cm, __constraints_name__) for p, cm in itertools.izip(params, constr_matrices)])
|
||||
lx = max([p._max_len_values() for p in params])
|
||||
li = max([p._max_len_index(i) for p, i in itertools.izip(params, indices)])
|
||||
lt = max([p._max_len_names(tm, __tie_name__) for p, tm in itertools.izip(params, ties_matrices)])
|
||||
lp = max([p._max_len_names(pm, __constraints_name__) for p, pm in itertools.izip(params, prior_matrices)])
|
||||
strings = []
|
||||
start = True
|
||||
for p, cm, i, tm, pm in itertools.izip(params,constr_matrices,indices,ties_matrices,prior_matrices):
|
||||
strings.append(p.__str__(constr_matrix=cm, indices=i, prirs=pm, ties=tm, lc=lc, lx=lx, li=li, lp=lp, lt=lt, only_name=(1-start)))
|
||||
start = False
|
||||
return "\n".join(strings)
|
||||
def __repr__(self):
|
||||
return "\n".join(map(repr,self.params))
|
||||
|
||||
def __ilshift__(self, *args, **kwargs):
|
||||
self[:] = np.ndarray.__ilshift__(self.values(), *args, **kwargs)
|
||||
|
||||
def __irshift__(self, *args, **kwargs):
|
||||
self[:] = np.ndarray.__irshift__(self.values(), *args, **kwargs)
|
||||
|
||||
def __ixor__(self, *args, **kwargs):
|
||||
self[:] = np.ndarray.__ixor__(self.values(), *args, **kwargs)
|
||||
|
||||
def __ipow__(self, *args, **kwargs):
|
||||
self[:] = np.ndarray.__ipow__(self.values(), *args, **kwargs)
|
||||
|
||||
def __ifloordiv__(self, *args, **kwargs):
|
||||
self[:] = np.ndarray.__ifloordiv__(self.values(), *args, **kwargs)
|
||||
|
||||
def __isub__(self, *args, **kwargs):
|
||||
self[:] = np.ndarray.__isub__(self.values(), *args, **kwargs)
|
||||
|
||||
def __ior__(self, *args, **kwargs):
|
||||
self[:] = np.ndarray.__ior__(self.values(), *args, **kwargs)
|
||||
|
||||
def __itruediv__(self, *args, **kwargs):
|
||||
self[:] = np.ndarray.__itruediv__(self.values(), *args, **kwargs)
|
||||
|
||||
def __idiv__(self, *args, **kwargs):
|
||||
self[:] = np.ndarray.__idiv__(self.values(), *args, **kwargs)
|
||||
|
||||
def __iand__(self, *args, **kwargs):
|
||||
self[:] = np.ndarray.__iand__(self.values(), *args, **kwargs)
|
||||
|
||||
def __imod__(self, *args, **kwargs):
|
||||
self[:] = np.ndarray.__imod__(self.values(), *args, **kwargs)
|
||||
|
||||
def __iadd__(self, *args, **kwargs):
|
||||
self[:] = np.ndarray.__iadd__(self.values(), *args, **kwargs)
|
||||
|
||||
def __imul__(self, *args, **kwargs):
|
||||
self[:] = np.ndarray.__imul__(self.values(), *args, **kwargs)
|
||||
GPy/core/parameterization/parameter_core.py (new file, 1037 lines; diff suppressed because it is too large)
GPy/core/parameterization/parameterized.py (new file, 418 lines)
@@ -0,0 +1,418 @@
# Copyright (c) 2014, Max Zwiessele, James Hensman
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy; np = numpy
|
||||
import itertools
|
||||
from re import compile, _pattern_type
|
||||
from param import ParamConcatenation
|
||||
from parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing
|
||||
|
||||
import logging
|
||||
from GPy.core.parameterization.index_operations import ParameterIndexOperationsView
|
||||
logger = logging.getLogger("parameters changed meta")
|
||||
|
||||
class ParametersChangedMeta(type):
|
||||
def __call__(self, *args, **kw):
|
||||
self._in_init_ = True
|
||||
#import ipdb;ipdb.set_trace()
|
||||
self = super(ParametersChangedMeta, self).__call__(*args, **kw)
|
||||
logger.debug("finished init")
|
||||
self._in_init_ = False
|
||||
logger.debug("connecting parameters")
|
||||
self._highest_parent_._connect_parameters()
|
||||
#self._highest_parent_._notify_parent_change()
|
||||
self._highest_parent_._connect_fixes()
|
||||
logger.debug("calling parameters changed")
|
||||
self.parameters_changed()
|
||||
return self
|
||||
|
||||
class Parameterized(Parameterizable):
    """
    Parameterized class

    Say m is a handle to a parameterized class.

    Printing parameters:

    - print m: prints a nice summary over all parameters
    - print m.name: prints details for param with name 'name'
    - print m[regexp]: prints details for all the parameters
      which match (!) regexp
    - print m['']: prints details for all parameters

    Fields:

    Name: The name of the param, can be renamed!
    Value: Shape or value, if one-valued
    Constrain: constraint of the param, curly "{c}" brackets indicate
               some parameters are constrained by c. See detailed print
               to get exact constraints.
    Tied_to: which parameter it is tied to.

    Getting and setting parameters:

    Set all values in param to one:

        m.name.to.param = 1

    Handling of constraining, fixing and tying parameters:

    You can constrain parameters by calling constrain on the param itself, e.g.:

    - m.name[:,1].constrain_positive()
    - m.name[0].tie_to(m.name[1])

    Fixing parameters will fix them to the value they are right now. If you change
    the parameter's value, the param will be fixed to the new value!

    If you want to operate on all parameters, use m[''] to wildcard-select all parameters
    and concatenate them (a sketch follows below). Printing m[''] will result in printing
    of all parameters in detail.
    """
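    # Hypothetical sketch (not part of this commit) of the regexp indexing
    # described above; the parameter names are only illustrative:
    #
    #   m['.*lengthscale']       # ParamConcatenation of every matching parameter
    #   m['.*variance'] = 1.     # broadcast-set all matching parameters to 1
    #   print m['']              # detailed listing of all parameters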
#===========================================================================
|
||||
# Metaclass for parameters changed after init.
|
||||
# This makes sure, that parameters changed will always be called after __init__
|
||||
# **Never** call parameters_changed() yourself
|
||||
__metaclass__ = ParametersChangedMeta
|
||||
#===========================================================================
|
||||
def __init__(self, name=None, parameters=[], *a, **kw):
|
||||
super(Parameterized, self).__init__(name=name, *a, **kw)
|
||||
self.size = sum(p.size for p in self.parameters)
|
||||
self.add_observer(self, self._parameters_changed_notification, -100)
|
||||
if not self._has_fixes():
|
||||
self._fixes_ = None
|
||||
self._param_slices_ = []
|
||||
#self._connect_parameters()
|
||||
self.link_parameters(*parameters)
|
||||
|
||||
def build_pydot(self, G=None):
|
||||
import pydot # @UnresolvedImport
|
||||
iamroot = False
|
||||
if G is None:
|
||||
G = pydot.Dot(graph_type='digraph', bgcolor=None)
|
||||
iamroot=True
|
||||
node = pydot.Node(id(self), shape='box', label=self.name)#, color='white')
|
||||
G.add_node(node)
|
||||
for child in self.parameters:
|
||||
child_node = child.build_pydot(G)
|
||||
G.add_edge(pydot.Edge(node, child_node))#, color='white'))
|
||||
|
||||
for _, o, _ in self.observers:
|
||||
label = o.name if hasattr(o, 'name') else str(o)
|
||||
observed_node = pydot.Node(id(o), label=label)
|
||||
G.add_node(observed_node)
|
||||
edge = pydot.Edge(str(id(self)), str(id(o)), color='darkorange2', arrowhead='vee')
|
||||
G.add_edge(edge)
|
||||
|
||||
if iamroot:
|
||||
return G
|
||||
return node
|
||||
|
||||
#===========================================================================
|
||||
# Add remove parameters:
|
||||
#===========================================================================
|
||||
def link_parameter(self, param, index=None, _ignore_added_names=False):
|
||||
"""
|
||||
:param parameters: the parameters to add
|
||||
:type parameters: list of or one :py:class:`GPy.core.param.Param`
|
||||
:param [index]: index of where to put parameters
|
||||
|
||||
:param bool _ignore_added_names: whether the name of the parameter overrides a possibly existing field
|
||||
|
||||
Add all parameters to this param class, you can insert parameters
|
||||
at any given index using the :func:`list.insert` syntax
|
||||
"""
|
||||
if param in self.parameters and index is not None:
|
||||
self.unlink_parameter(param)
|
||||
self.link_parameter(param, index)
|
||||
# elif param.has_parent():
|
||||
# raise HierarchyError, "parameter {} already in another model ({}), create new object (or copy) for adding".format(param._short(), param._highest_parent_._short())
|
||||
elif param not in self.parameters:
|
||||
if param.has_parent():
|
||||
def visit(parent, self):
|
||||
if parent is self:
|
||||
raise HierarchyError, "You cannot add a parameter twice into the hierarchy"
|
||||
param.traverse_parents(visit, self)
|
||||
param._parent_.unlink_parameter(param)
|
||||
# make sure the size is set
|
||||
if index is None:
|
||||
start = sum(p.size for p in self.parameters)
|
||||
self.constraints.shift_right(start, param.size)
|
||||
self.priors.shift_right(start, param.size)
|
||||
self.constraints.update(param.constraints, self.size)
|
||||
self.priors.update(param.priors, self.size)
|
||||
param._parent_ = self
|
||||
param._parent_index_ = len(self.parameters)
|
||||
self.parameters.append(param)
|
||||
else:
|
||||
start = sum(p.size for p in self.parameters[:index])
|
||||
self.constraints.shift_right(start, param.size)
|
||||
self.priors.shift_right(start, param.size)
|
||||
self.constraints.update(param.constraints, start)
|
||||
self.priors.update(param.priors, start)
|
||||
param._parent_ = self
|
||||
param._parent_index_ = index if index>=0 else len(self.parameters[:index])
|
||||
for p in self.parameters[index:]:
|
||||
p._parent_index_ += 1
|
||||
self.parameters.insert(index, param)
|
||||
|
||||
param.add_observer(self, self._pass_through_notify_observers, -np.inf)
|
||||
|
||||
parent = self
|
||||
while parent is not None:
|
||||
parent.size += param.size
|
||||
parent = parent._parent_
|
||||
self._notify_parent_change()
|
||||
|
||||
if not self._in_init_:
|
||||
#self._connect_parameters()
|
||||
#self._notify_parent_change()
|
||||
|
||||
self._highest_parent_._connect_parameters(ignore_added_names=_ignore_added_names)
|
||||
self._highest_parent_._notify_parent_change()
|
||||
self._highest_parent_._connect_fixes()
|
||||
|
||||
else:
|
||||
raise HierarchyError, """Parameter exists already, try making a copy"""
|
||||
|
||||
|
||||
def link_parameters(self, *parameters):
|
||||
"""
|
||||
convenience method for adding several
|
||||
parameters without gradient specification
|
||||
"""
|
||||
[self.link_parameter(p) for p in parameters]
|
||||
|
||||
def unlink_parameter(self, param):
|
||||
"""
|
||||
:param param: param object to remove from being a parameter of this parameterized object.
|
||||
"""
|
||||
if not param in self.parameters:
|
||||
try:
|
||||
raise RuntimeError, "{} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name)
|
||||
except AttributeError:
|
||||
raise RuntimeError, "{} does not seem to be a parameter, remove parameters directly from their respective parents".format(str(param))
|
||||
|
||||
start = sum([p.size for p in self.parameters[:param._parent_index_]])
|
||||
self._remove_parameter_name(param)
|
||||
self.size -= param.size
|
||||
del self.parameters[param._parent_index_]
|
||||
|
||||
param._disconnect_parent()
|
||||
param.remove_observer(self, self._pass_through_notify_observers)
|
||||
self.constraints.shift_left(start, param.size)
|
||||
|
||||
self._connect_parameters()
|
||||
self._notify_parent_change()
|
||||
|
||||
parent = self._parent_
|
||||
while parent is not None:
|
||||
parent.size -= param.size
|
||||
parent = parent._parent_
|
||||
|
||||
self._highest_parent_._connect_parameters()
|
||||
self._highest_parent_._connect_fixes()
|
||||
self._highest_parent_._notify_parent_change()
|
||||
|
||||
def add_parameter(self, *args, **kwargs):
|
||||
raise DeprecationWarning, "add_parameter was renamed to link_parameter to avoid confusion of setting variables"
|
||||
def remove_parameter(self, *args, **kwargs):
|
||||
raise DeprecationWarning, "remove_parameter was renamed to link_parameter to avoid confusion of setting variables"
|
||||
|
||||
def _connect_parameters(self, ignore_added_names=False):
|
||||
# connect parameterlist to this parameterized object
|
||||
# This just sets up the right connection for the params objects
|
||||
# to be used as parameters
|
||||
# it also sets the constraints for each parameter to the constraints
|
||||
# of their respective parents
|
||||
if not hasattr(self, "parameters") or len(self.parameters) < 1:
|
||||
# no parameters for this class
|
||||
return
|
||||
if self.param_array.size != self.size:
|
||||
self._param_array_ = np.empty(self.size, dtype=np.float64)
|
||||
if self.gradient.size != self.size:
|
||||
self._gradient_array_ = np.empty(self.size, dtype=np.float64)
|
||||
|
||||
old_size = 0
|
||||
self._param_slices_ = []
|
||||
for i, p in enumerate(self.parameters):
|
||||
if not p.param_array.flags['C_CONTIGUOUS']:
|
||||
raise ValueError, "This should not happen! Please write an email to the developers with the code, which reproduces this error. All parameter arrays must be C_CONTIGUOUS"
|
||||
|
||||
p._parent_ = self
|
||||
p._parent_index_ = i
|
||||
|
||||
pslice = slice(old_size, old_size + p.size)
|
||||
|
||||
# first connect all children
|
||||
p._propagate_param_grad(self.param_array[pslice], self.gradient_full[pslice])
|
||||
|
||||
# then connect children to self
|
||||
self.param_array[pslice] = p.param_array.flat # , requirements=['C', 'W']).ravel(order='C')
|
||||
self.gradient_full[pslice] = p.gradient_full.flat # , requirements=['C', 'W']).ravel(order='C')
|
||||
|
||||
p.param_array.data = self.param_array[pslice].data
|
||||
p.gradient_full.data = self.gradient_full[pslice].data
|
||||
|
||||
self._param_slices_.append(pslice)
|
||||
|
||||
self._add_parameter_name(p, ignore_added_names=ignore_added_names)
|
||||
old_size += p.size
|
||||
|
||||
#===========================================================================
|
||||
# Get/set parameters:
|
||||
#===========================================================================
|
||||
def grep_param_names(self, regexp):
|
||||
"""
|
||||
create a list of parameters, matching regular expression regexp
|
||||
"""
|
||||
if not isinstance(regexp, _pattern_type): regexp = compile(regexp)
|
||||
found_params = []
|
||||
for n, p in itertools.izip(self.parameter_names(False, False, True), self.flattened_parameters):
|
||||
if regexp.match(n) is not None:
|
||||
found_params.append(p)
|
||||
return found_params
|
||||
|
||||
def __getitem__(self, name, paramlist=None):
|
||||
if isinstance(name, (int, slice, tuple, np.ndarray)):
|
||||
return self.param_array[name]
|
||||
else:
|
||||
if paramlist is None:
|
||||
paramlist = self.grep_param_names(name)
|
||||
if len(paramlist) < 1: raise AttributeError, name
|
||||
if len(paramlist) == 1:
|
||||
if isinstance(paramlist[-1], Parameterized):
|
||||
paramlist = paramlist[-1].flattened_parameters
|
||||
if len(paramlist) != 1:
|
||||
return ParamConcatenation(paramlist)
|
||||
return paramlist[-1]
|
||||
return ParamConcatenation(paramlist)
|
||||
|
||||
def __setitem__(self, name, value, paramlist=None):
|
||||
if value is None:
|
||||
return # nothing to do here
|
||||
if isinstance(name, (slice, tuple, np.ndarray)):
|
||||
try:
|
||||
self.param_array[name] = value
|
||||
except:
|
||||
raise ValueError, "Setting by slice or index only allowed with array-like"
|
||||
self._trigger_params_changed()
|
||||
else:
|
||||
try: param = self.__getitem__(name, paramlist)
|
||||
except: raise
|
||||
param[:] = value
|
||||
|
||||
def __setattr__(self, name, val):
|
||||
# override the default behaviour, if setting a param, so broadcasting can be used
|
||||
if hasattr(self, "parameters"):
|
||||
try:
|
||||
pnames = self.parameter_names(False, adjust_for_printing=True, recursive=False)
|
||||
if name in pnames:
|
||||
param = self.parameters[pnames.index(name)]
|
||||
param[:] = val; return
|
||||
except AttributeError:
|
||||
pass
|
||||
object.__setattr__(self, name, val);
|
||||
|
||||
#===========================================================================
|
||||
# Pickling
|
||||
#===========================================================================
|
||||
def __setstate__(self, state):
|
||||
super(Parameterized, self).__setstate__(state)
|
||||
try:
|
||||
self._connect_parameters()
|
||||
self._connect_fixes()
|
||||
self._notify_parent_change()
|
||||
self.parameters_changed()
|
||||
except Exception as e:
|
||||
print "WARNING: caught exception {!s}, trying to continue".format(e)
|
||||
|
||||
def copy(self, memo=None):
|
||||
if memo is None:
|
||||
memo = {}
|
||||
memo[id(self.optimizer_array)] = None # and param_array
|
||||
memo[id(self.param_array)] = None # and param_array
|
||||
copy = super(Parameterized, self).copy(memo)
|
||||
copy._connect_parameters()
|
||||
copy._connect_fixes()
|
||||
copy._notify_parent_change()
|
||||
return copy
|
||||
|
||||
#===========================================================================
|
||||
# Printing:
|
||||
#===========================================================================
|
||||
def _short(self):
|
||||
return self.hierarchy_name()
|
||||
@property
|
||||
def flattened_parameters(self):
|
||||
return [xi for x in self.parameters for xi in x.flattened_parameters]
|
||||
@property
|
||||
def _parameter_sizes_(self):
|
||||
return [x.size for x in self.parameters]
|
||||
@property
|
||||
def parameter_shapes(self):
|
||||
return [xi for x in self.parameters for xi in x.parameter_shapes]
|
||||
@property
|
||||
def _constraints_str(self):
|
||||
return [cs for p in self.parameters for cs in p._constraints_str]
|
||||
@property
|
||||
def _priors_str(self):
|
||||
return [cs for p in self.parameters for cs in p._priors_str]
|
||||
@property
|
||||
def _description_str(self):
|
||||
return [xi for x in self.parameters for xi in x._description_str]
|
||||
@property
|
||||
def _ties_str(self):
|
||||
return [','.join(x._ties_str) for x in self.flattened_parameters]
|
||||
|
||||
def _repr_html_(self, header=True):
|
||||
"""Representation of the parameters in html for notebook display."""
|
||||
name = adjust_name_for_printing(self.name) + "."
|
||||
constrs = self._constraints_str;
|
||||
ts = self._ties_str
|
||||
prirs = self._priors_str
|
||||
desc = self._description_str; names = self.parameter_names()
|
||||
nl = max([len(str(x)) for x in names + [name]])
|
||||
sl = max([len(str(x)) for x in desc + ["Value"]])
|
||||
cl = max([len(str(x)) if x else 0 for x in constrs + ["Constraint"]])
|
||||
tl = max([len(str(x)) if x else 0 for x in ts + ["Tied to"]])
|
||||
pl = max([len(str(x)) if x else 0 for x in prirs + ["Prior"]])
|
||||
format_spec = "<tr><td>{{name:<{0}s}}</td><td align=\"right\">{{desc:>{1}s}}</td><td>{{const:^{2}s}}</td><td>{{pri:^{3}s}}</td><td>{{t:^{4}s}}</td></tr>".format(nl, sl, cl, pl, tl)
|
||||
to_print = []
|
||||
for n, d, c, t, p in itertools.izip(names, desc, constrs, ts, prirs):
|
||||
to_print.append(format_spec.format(name=n, desc=d, const=c, t=t, pri=p))
|
||||
sep = '-' * (nl + sl + cl + + pl + tl + 8 * 2 + 3)
|
||||
if header:
|
||||
header = """
|
||||
<tr>
|
||||
<td><b>{name}</b>
|
||||
<td><b>Value</b></td>
|
||||
<td><b>Constraint</b></td>
|
||||
<td><b>Prior</b></td>
|
||||
<td><b>Tied to</b></td>""".format(name=name)
|
||||
to_print.insert(0, header)
|
||||
return '<table>' + '\n'.format(sep).join(to_print) + '\n</table>'
|
||||
|
||||
def __str__(self, header=True):
|
||||
name = adjust_name_for_printing(self.name) + "."
|
||||
constrs = self._constraints_str;
|
||||
ts = self._ties_str
|
||||
prirs = self._priors_str
|
||||
desc = self._description_str; names = self.parameter_names()
|
||||
nl = max([len(str(x)) for x in names + [name]])
|
||||
sl = max([len(str(x)) for x in desc + ["Value"]])
|
||||
cl = max([len(str(x)) if x else 0 for x in constrs + ["Constraint"]])
|
||||
tl = max([len(str(x)) if x else 0 for x in ts + ["Tied to"]])
|
||||
pl = max([len(str(x)) if x else 0 for x in prirs + ["Prior"]])
|
||||
format_spec = " \033[1m{{name:<{0}s}}\033[0;0m | {{desc:>{1}s}} | {{const:^{2}s}} | {{pri:^{3}s}} | {{t:^{4}s}}".format(nl, sl, cl, pl, tl)
|
||||
to_print = []
|
||||
for n, d, c, t, p in itertools.izip(names, desc, constrs, ts, prirs):
|
||||
to_print.append(format_spec.format(name=n, desc=d, const=c, t=t, pri=p))
|
||||
sep = '-' * (nl + sl + cl + + pl + tl + 8 * 2 + 3)
|
||||
if header:
|
||||
header = " {{0:<{0}s}} | {{1:^{1}s}} | {{2:^{2}s}} | {{3:^{3}s}} | {{4:^{4}s}}".format(nl, sl, cl, pl, tl).format(name, "Value", "Constraint", "Prior", "Tied to")
|
||||
to_print.insert(0, header)
|
||||
return '\n'.format(sep).join(to_print)
|
||||
pass
|
||||
|
||||
|
||||
GPy/core/parameterization/priors.py (new file, 771 lines)
@@ -0,0 +1,771 @@
# Copyright (c) 2012 - 2014, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
from scipy.special import gammaln, digamma
|
||||
from ...util.linalg import pdinv
|
||||
from domains import _REAL, _POSITIVE
|
||||
import warnings
|
||||
import weakref
|
||||
|
||||
|
||||
class Prior(object):
|
||||
domain = None
|
||||
_instance = None
|
||||
def __new__(cls, *args, **kwargs):
|
||||
if not cls._instance or cls._instance.__class__ is not cls:
|
||||
cls._instance = super(Prior, cls).__new__(cls, *args, **kwargs)
|
||||
return cls._instance
|
||||
|
||||
def pdf(self, x):
|
||||
return np.exp(self.lnpdf(x))
|
||||
|
||||
def plot(self):
|
||||
import sys
|
||||
|
||||
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
|
||||
from ...plotting.matplot_dep import priors_plots
|
||||
|
||||
priors_plots.univariate_plot(self)
|
||||
|
||||
def __repr__(self, *args, **kwargs):
|
||||
return self.__str__()
|
||||
|
||||
|
||||
class Gaussian(Prior):
|
||||
"""
|
||||
Implementation of the univariate Gaussian probability function, coupled with random variables.
|
||||
|
||||
:param mu: mean
|
||||
:param sigma: standard deviation
|
||||
|
||||
.. Note:: Bishop 2006 notation is used throughout the code
|
||||
|
||||
"""
|
||||
domain = _REAL
|
||||
_instances = []
|
||||
|
||||
def __new__(cls, mu=0, sigma=1): # Singleton:
|
||||
if cls._instances:
|
||||
cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
for instance in cls._instances:
|
||||
if instance().mu == mu and instance().sigma == sigma:
|
||||
return instance()
|
||||
o = super(Prior, cls).__new__(cls, mu, sigma)
|
||||
cls._instances.append(weakref.ref(o))
|
||||
return cls._instances[-1]()
|
||||
|
||||
def __init__(self, mu, sigma):
|
||||
self.mu = float(mu)
|
||||
self.sigma = float(sigma)
|
||||
self.sigma2 = np.square(self.sigma)
|
||||
self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)
|
||||
|
||||
def __str__(self):
|
||||
return "N({:.2g}, {:.2g})".format(self.mu, self.sigma)
|
||||
|
||||
def lnpdf(self, x):
|
||||
return self.constant - 0.5 * np.square(x - self.mu) / self.sigma2
|
||||
|
||||
def lnpdf_grad(self, x):
|
||||
return -(x - self.mu) / self.sigma2
|
||||
|
||||
def rvs(self, n):
|
||||
return np.random.randn(n) * self.sigma + self.mu
|
||||
|
||||
# def __getstate__(self):
|
||||
# return self.mu, self.sigma
|
||||
#
|
||||
# def __setstate__(self, state):
|
||||
# self.mu = state[0]
|
||||
# self.sigma = state[1]
|
||||
# self.sigma2 = np.square(self.sigma)
|
||||
# self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)
|
||||
|
||||
class Uniform(Prior):
|
||||
domain = _REAL
|
||||
_instances = []
|
||||
|
||||
def __new__(cls, lower=0, upper=1): # Singleton:
|
||||
if cls._instances:
|
||||
cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
for instance in cls._instances:
|
||||
if instance().lower == lower and instance().upper == upper:
|
||||
return instance()
|
||||
o = super(Prior, cls).__new__(cls, lower, upper)
|
||||
cls._instances.append(weakref.ref(o))
|
||||
return cls._instances[-1]()
|
||||
|
||||
def __init__(self, lower, upper):
|
||||
self.lower = float(lower)
|
||||
self.upper = float(upper)
|
||||
|
||||
def __str__(self):
|
||||
return "[{:.2g}, {:.2g}]".format(self.lower, self.upper)
|
||||
|
||||
def lnpdf(self, x):
|
||||
region = (x >= self.lower) * (x <= self.upper)
|
||||
return region
|
||||
|
||||
def lnpdf_grad(self, x):
|
||||
return np.zeros(x.shape)
|
||||
|
||||
def rvs(self, n):
|
||||
return np.random.uniform(self.lower, self.upper, size=n)
|
||||
|
||||
# def __getstate__(self):
|
||||
# return self.lower, self.upper
|
||||
#
|
||||
# def __setstate__(self, state):
|
||||
# self.lower = state[0]
|
||||
# self.upper = state[1]
|
||||
|
||||
class LogGaussian(Gaussian):
|
||||
"""
|
||||
Implementation of the univariate *log*-Gaussian probability function, coupled with random variables.
|
||||
|
||||
:param mu: mean
|
||||
:param sigma: standard deviation
|
||||
|
||||
.. Note:: Bishop 2006 notation is used throughout the code
|
||||
|
||||
"""
|
||||
domain = _POSITIVE
|
||||
_instances = []
|
||||
|
||||
def __new__(cls, mu=0, sigma=1): # Singleton:
|
||||
if cls._instances:
|
||||
cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
for instance in cls._instances:
|
||||
if instance().mu == mu and instance().sigma == sigma:
|
||||
return instance()
|
||||
o = super(Prior, cls).__new__(cls, mu, sigma)
|
||||
cls._instances.append(weakref.ref(o))
|
||||
return cls._instances[-1]()
|
||||
|
||||
def __init__(self, mu, sigma):
|
||||
self.mu = float(mu)
|
||||
self.sigma = float(sigma)
|
||||
self.sigma2 = np.square(self.sigma)
|
||||
self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)
|
||||
|
||||
def __str__(self):
|
||||
return "lnN({:.2g}, {:.2g})".format(self.mu, self.sigma)
|
||||
|
||||
def lnpdf(self, x):
|
||||
return self.constant - 0.5 * np.square(np.log(x) - self.mu) / self.sigma2 - np.log(x)
|
||||
|
||||
def lnpdf_grad(self, x):
|
||||
return -((np.log(x) - self.mu) / self.sigma2 + 1.) / x
|
||||
|
||||
def rvs(self, n):
|
||||
return np.exp(np.random.randn(n) * self.sigma + self.mu)
|
||||
|
||||
|
||||
class MultivariateGaussian(Prior):
|
||||
"""
|
||||
Implementation of the multivariate Gaussian probability function, coupled with random variables.
|
||||
|
||||
:param mu: mean (N-dimensional array)
|
||||
:param var: covariance matrix (NxN)
|
||||
|
||||
.. Note:: Bishop 2006 notation is used throughout the code
|
||||
|
||||
"""
|
||||
domain = _REAL
|
||||
_instances = []
|
||||
|
||||
def __new__(cls, mu=0, var=1): # Singleton:
|
||||
if cls._instances:
|
||||
cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
for instance in cls._instances:
|
||||
if np.all(instance().mu == mu) and np.all(instance().var == var):
|
||||
return instance()
|
||||
o = super(Prior, cls).__new__(cls, mu, var)
|
||||
cls._instances.append(weakref.ref(o))
|
||||
return cls._instances[-1]()
|
||||
|
||||
def __init__(self, mu, var):
|
||||
self.mu = np.array(mu).flatten()
|
||||
self.var = np.array(var)
|
||||
assert len(self.var.shape) == 2
|
||||
assert self.var.shape[0] == self.var.shape[1]
|
||||
assert self.var.shape[0] == self.mu.size
|
||||
self.input_dim = self.mu.size
|
||||
self.inv, self.hld = pdinv(self.var)
|
||||
self.constant = -0.5 * self.input_dim * np.log(2 * np.pi) - self.hld
|
||||
|
||||
def summary(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def pdf(self, x):
|
||||
return np.exp(self.lnpdf(x))
|
||||
|
||||
def lnpdf(self, x):
|
||||
d = x - self.mu
|
||||
return self.constant - 0.5 * np.sum(d * np.dot(d, self.inv), 1)
|
||||
|
||||
def lnpdf_grad(self, x):
|
||||
d = x - self.mu
|
||||
return -np.dot(self.inv, d)
|
||||
|
||||
def rvs(self, n):
|
||||
return np.random.multivariate_normal(self.mu, self.var, n)
|
||||
|
||||
def plot(self):
|
||||
import sys
|
||||
|
||||
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
|
||||
from ..plotting.matplot_dep import priors_plots
|
||||
|
||||
priors_plots.multivariate_plot(self)
|
||||
|
||||
def __getstate__(self):
|
||||
return self.mu, self.var
|
||||
|
||||
def __setstate__(self, state):
|
||||
self.mu = state[0]
|
||||
self.var = state[1]
|
||||
assert len(self.var.shape) == 2
|
||||
assert self.var.shape[0] == self.var.shape[1]
|
||||
assert self.var.shape[0] == self.mu.size
|
||||
self.input_dim = self.mu.size
|
||||
self.inv, self.hld = pdinv(self.var)
|
||||
self.constant = -0.5 * self.input_dim * np.log(2 * np.pi) - self.hld
|
||||
|
||||
def gamma_from_EV(E, V):
|
||||
warnings.warn("use Gamma.from_EV to create Gamma Prior", FutureWarning)
|
||||
return Gamma.from_EV(E, V)
|
||||
|
||||
|
||||
class Gamma(Prior):
|
||||
"""
|
||||
Implementation of the Gamma probability function, coupled with random variables.
|
||||
|
||||
:param a: shape parameter
|
||||
:param b: rate parameter (warning: it's the *inverse* of the scale)
|
||||
|
||||
.. Note:: Bishop 2006 notation is used throughout the code
|
||||
|
||||
"""
|
||||
domain = _POSITIVE
|
||||
_instances = []
|
||||
|
||||
def __new__(cls, a=1, b=.5): # Singleton:
|
||||
if cls._instances:
|
||||
cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
for instance in cls._instances:
|
||||
if instance().a == a and instance().b == b:
|
||||
return instance()
|
||||
o = super(Prior, cls).__new__(cls, a, b)
|
||||
cls._instances.append(weakref.ref(o))
|
||||
return cls._instances[-1]()
|
||||
|
||||
def __init__(self, a, b):
|
||||
self.a = float(a)
|
||||
self.b = float(b)
|
||||
self.constant = -gammaln(self.a) + a * np.log(b)
|
||||
|
||||
def __str__(self):
|
||||
return "Ga({:.2g}, {:.2g})".format(self.a, self.b)
|
||||
|
||||
def summary(self):
|
||||
ret = {"E[x]": self.a / self.b, \
|
||||
"E[ln x]": digamma(self.a) - np.log(self.b), \
|
||||
"var[x]": self.a / self.b / self.b, \
|
||||
"Entropy": gammaln(self.a) - (self.a - 1.) * digamma(self.a) - np.log(self.b) + self.a}
|
||||
if self.a > 1:
|
||||
ret['Mode'] = (self.a - 1.) / self.b
|
||||
else:
|
||||
ret['Mode'] = np.nan
|
||||
return ret
|
||||
|
||||
def lnpdf(self, x):
|
||||
return self.constant + (self.a - 1) * np.log(x) - self.b * x
|
||||
|
||||
def lnpdf_grad(self, x):
|
||||
return (self.a - 1.) / x - self.b
|
||||
|
||||
def rvs(self, n):
|
||||
return np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
|
||||
|
||||
@staticmethod
|
||||
def from_EV(E, V):
|
||||
"""
|
||||
Creates an instance of a Gamma Prior by specifying the Expected value(s)
|
||||
and Variance(s) of the distribution.
|
||||
|
||||
:param E: expected value
|
||||
:param V: variance
|
||||
"""
|
||||
a = np.square(E) / V
|
||||
b = E / V
|
||||
return Gamma(a, b)
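# Usage sketch: moment matching gives a = E**2 / V and b = E / V, so for example
#
#     g = Gamma.from_EV(1., 10.)   # mean 1, variance 10  ->  Ga(0.1, 0.1)
#     # g.a == 0.1 and g.b == 0.1, hence g.a / g.b == 1. and g.a / g.b**2 == 10.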
|
||||
|
||||
def __getstate__(self):
|
||||
return self.a, self.b
|
||||
|
||||
def __setstate__(self, state):
|
||||
self.a = state[0]
|
||||
self.b = state[1]
|
||||
self.constant = -gammaln(self.a) + self.a * np.log(self.b)
|
||||
|
||||
class InverseGamma(Gamma):
|
||||
"""
|
||||
Implementation of the inverse-Gamma probability function, coupled with random variables.
|
||||
|
||||
:param a: shape parameter
|
||||
:param b: scale parameter (it equals the rate parameter of the corresponding Gamma distribution)
|
||||
|
||||
.. Note:: Bishop 2006 notation is used throughout the code
|
||||
|
||||
"""
|
||||
domain = _POSITIVE
|
||||
_instances = []
|
||||
def __new__(cls, a=1, b=.5): # Singleton:
|
||||
if cls._instances:
|
||||
cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
for instance in cls._instances:
|
||||
if instance().a == a and instance().b == b:
|
||||
return instance()
|
||||
o = super(Prior, cls).__new__(cls, a, b)
|
||||
cls._instances.append(weakref.ref(o))
|
||||
return cls._instances[-1]()
|
||||
|
||||
def __init__(self, a, b):
|
||||
self.a = float(a)
|
||||
self.b = float(b)
|
||||
self.constant = -gammaln(self.a) + a * np.log(b)
|
||||
|
||||
def __str__(self):
|
||||
return "iGa({:.2g}, {:.2g})".format(self.a, self.b)
|
||||
|
||||
def lnpdf(self, x):
|
||||
return self.constant - (self.a + 1) * np.log(x) - self.b / x
|
||||
|
||||
def lnpdf_grad(self, x):
|
||||
return -(self.a + 1.) / x + self.b / x ** 2
|
||||
|
||||
def rvs(self, n):
|
||||
return 1. / np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
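# Note (sketch): if X ~ Ga(a, b) then 1/X ~ iGa(a, b), which is exactly how rvs
# draws samples above; e.g. InverseGamma(3., 2.) has mean b / (a - 1) = 1.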
|
||||
|
||||
class DGPLVM_KFDA(Prior):
|
||||
"""
|
||||
Implementation of the Discriminative Gaussian Process Latent Variable Model (DGPLVM) prior using
|
||||
Kernel Fisher Discriminant Analysis (following Seung-Jean Kim), as used in the
|
||||
face verification paper by Chaochao Lu.
|
||||
|
||||
:param lambdaa: constant
|
||||
:param sigma2: constant
|
||||
|
||||
.. Note:: DGPLVM implementation for the 'Surpassing Human-Level Face Verification' paper
|
||||
|
||||
"""
|
||||
domain = _REAL
|
||||
# _instances = []
|
||||
# def __new__(cls, lambdaa, sigma2): # Singleton:
|
||||
# if cls._instances:
|
||||
# cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
# for instance in cls._instances:
|
||||
# if instance().mu == mu and instance().sigma == sigma:
|
||||
# return instance()
|
||||
# o = super(Prior, cls).__new__(cls, mu, sigma)
|
||||
# cls._instances.append(weakref.ref(o))
|
||||
# return cls._instances[-1]()
|
||||
|
||||
def __init__(self, lambdaa, sigma2, lbl, kern, x_shape):
|
||||
"""A description for init"""
|
||||
self.datanum = lbl.shape[0]
|
||||
self.classnum = lbl.shape[1]
|
||||
self.lambdaa = lambdaa
|
||||
self.sigma2 = sigma2
|
||||
self.lbl = lbl
|
||||
self.kern = kern
|
||||
lst_ni = self.compute_lst_ni()
|
||||
self.a = self.compute_a(lst_ni)
|
||||
self.A = self.compute_A(lst_ni)
|
||||
self.x_shape = x_shape
|
||||
|
||||
def get_class_label(self, y):
|
||||
for idx, v in enumerate(y):
|
||||
if v == 1:
|
||||
return idx
|
||||
return -1
|
||||
|
||||
# This function assigns each data point to its own class
|
||||
# and returns the dictionary which contains the class name and parameters.
|
||||
def compute_cls(self, x):
|
||||
cls = {}
|
||||
# Appending each data point to its proper class
|
||||
for j in xrange(self.datanum):
|
||||
class_label = self.get_class_label(self.lbl[j])
|
||||
if class_label not in cls:
|
||||
cls[class_label] = []
|
||||
cls[class_label].append(x[j])
|
||||
if len(cls) > 2:
|
||||
for i in range(2, self.classnum):
|
||||
del cls[i]
|
||||
return cls
|
||||
|
||||
def x_reduced(self, cls):
|
||||
x1 = cls[0]
|
||||
x2 = cls[1]
|
||||
x = np.concatenate((x1, x2), axis=0)
|
||||
return x
|
||||
|
||||
def compute_lst_ni(self):
|
||||
lst_ni = []
|
||||
lst_ni1 = []
|
||||
lst_ni2 = []
|
||||
f1 = (np.where(self.lbl[:, 0] == 1)[0])
|
||||
f2 = (np.where(self.lbl[:, 1] == 1)[0])
|
||||
for idx in f1:
|
||||
lst_ni1.append(idx)
|
||||
for idx in f2:
|
||||
lst_ni2.append(idx)
|
||||
lst_ni.append(len(lst_ni1))
|
||||
lst_ni.append(len(lst_ni2))
|
||||
return lst_ni
|
||||
|
||||
def compute_a(self, lst_ni):
|
||||
a = np.ones((self.datanum, 1))
|
||||
count = 0
|
||||
for N_i in lst_ni:
|
||||
if N_i == lst_ni[0]:
|
||||
a[count:count + N_i] = (float(1) / N_i) * a[count]
|
||||
count += N_i
|
||||
else:
|
||||
if N_i == lst_ni[1]:
|
||||
a[count: count + N_i] = -(float(1) / N_i) * a[count]
|
||||
count += N_i
|
||||
return a
|
||||
|
||||
def compute_A(self, lst_ni):
|
||||
A = np.zeros((self.datanum, self.datanum))
|
||||
idx = 0
|
||||
for N_i in lst_ni:
|
||||
B = float(1) / np.sqrt(N_i) * (np.eye(N_i) - ((float(1) / N_i) * np.ones((N_i, N_i))))
|
||||
A[idx:idx + N_i, idx:idx + N_i] = B
|
||||
idx += N_i
|
||||
return A
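# Note (sketch): A is block diagonal with one block per class, each block being the
# scaled centering matrix (1/sqrt(N_i)) * (eye(N_i) - (1/N_i) * ones((N_i, N_i))),
# so A.dot(K).dot(A), used in lnpdf below, is the per-class centred and scaled kernel matrix.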
|
||||
|
||||
# Log of the prior (up to the normalising constant)
|
||||
def lnpdf(self, x):
|
||||
x = x.reshape(self.x_shape)
|
||||
K = self.kern.K(x)
|
||||
a_trans = np.transpose(self.a)
|
||||
paran = self.lambdaa * np.eye(x.shape[0]) + self.A.dot(K).dot(self.A)
|
||||
inv_part = pdinv(paran)[0]
|
||||
J = a_trans.dot(K).dot(self.a) - a_trans.dot(K).dot(self.A).dot(inv_part).dot(self.A).dot(K).dot(self.a)
|
||||
J_star = (1. / self.lambdaa) * J
|
||||
return (-1. / self.sigma2) * J_star
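# In matrix form (a sketch of what the lines above compute):
#     J* = (1/lambda) * a' (K - K A (lambda*I + A K A)^-1 A K) a
# i.e. the regularised KFDA criterion, and the log prior returned is -J* / sigma2.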
|
||||
|
||||
# Gradient of the log prior with respect to the latent points x
|
||||
def lnpdf_grad(self, x):
|
||||
x = x.reshape(self.x_shape)
|
||||
K = self.kern.K(x)
|
||||
paran = self.lambdaa * np.eye(x.shape[0]) + self.A.dot(K).dot(self.A)
|
||||
inv_part = pdinv(paran)[0]
|
||||
b = self.A.dot(inv_part).dot(self.A).dot(K).dot(self.a)
|
||||
a_Minus_b = self.a - b
|
||||
a_b_trans = np.transpose(a_Minus_b)
|
||||
DJ_star_DK = (1. / self.lambdaa) * (a_Minus_b.dot(a_b_trans))
|
||||
DJ_star_DX = self.kern.gradients_X(DJ_star_DK, x)
|
||||
return (-1. / self.sigma2) * DJ_star_DX
|
||||
|
||||
def rvs(self, n):
|
||||
return np.random.rand(n) # A WRONG implementation
|
||||
|
||||
def __str__(self):
|
||||
return 'DGPLVM_prior'
|
||||
|
||||
def __getstate__(self):
|
||||
return self.lbl, self.lambdaa, self.sigma2, self.kern, self.x_shape
|
||||
|
||||
def __setstate__(self, state):
|
||||
lbl, lambdaa, sigma2, kern, x_shape = state
|
||||
self.datanum = lbl.shape[0]
|
||||
self.classnum = lbl.shape[1]
|
||||
self.lambdaa = lambdaa
|
||||
self.sigma2 = sigma2
|
||||
self.lbl = lbl
|
||||
self.kern = kern
|
||||
lst_ni = self.compute_lst_ni()
|
||||
self.a = self.compute_a(lst_ni)
|
||||
self.A = self.compute_A(lst_ni)
|
||||
self.x_shape = x_shape
|
||||
|
||||
class DGPLVM(Prior):
|
||||
"""
|
||||
Implementation of the Discriminative Gaussian Process Latent Variable Model (DGPLVM) prior, following the paper by Raquel Urtasun.
|
||||
|
||||
:param sigma2: constant
|
||||
|
||||
.. Note:: DGPLVM for Classification paper implementation
|
||||
|
||||
"""
|
||||
domain = _REAL
|
||||
# _instances = []
|
||||
# def __new__(cls, mu, sigma): # Singleton:
|
||||
# if cls._instances:
|
||||
# cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
# for instance in cls._instances:
|
||||
# if instance().mu == mu and instance().sigma == sigma:
|
||||
# return instance()
|
||||
# o = super(Prior, cls).__new__(cls, mu, sigma)
|
||||
# cls._instances.append(weakref.ref(o))
|
||||
# return cls._instances[-1]()
|
||||
|
||||
def __init__(self, sigma2, lbl, x_shape):
|
||||
self.sigma2 = sigma2
|
||||
# self.x = x
|
||||
self.lbl = lbl
|
||||
self.classnum = lbl.shape[1]
|
||||
self.datanum = lbl.shape[0]
|
||||
self.x_shape = x_shape
|
||||
self.dim = x_shape[1]
|
||||
|
||||
def get_class_label(self, y):
|
||||
for idx, v in enumerate(y):
|
||||
if v == 1:
|
||||
return idx
|
||||
return -1
|
||||
|
||||
# This function assigns each data point to its own class
|
||||
# and returns the dictionary which contains the class name and parameters.
|
||||
def compute_cls(self, x):
|
||||
cls = {}
|
||||
# Appending each data point to its proper class
|
||||
for j in xrange(self.datanum):
|
||||
class_label = self.get_class_label(self.lbl[j])
|
||||
if class_label not in cls:
|
||||
cls[class_label] = []
|
||||
cls[class_label].append(x[j])
|
||||
return cls
|
||||
|
||||
# This function computes the mean of each class; the mean is calculated per dimension
|
||||
def compute_Mi(self, cls):
|
||||
M_i = np.zeros((self.classnum, self.dim))
|
||||
for i in cls:
|
||||
# Mean of each class
|
||||
M_i[i] = np.mean(cls[i], axis=0)
|
||||
return M_i
|
||||
|
||||
# Adding data points as (index, point) tuples to the dictionary so that we can access the indices
|
||||
def compute_indices(self, x):
|
||||
data_idx = {}
|
||||
for j in xrange(self.datanum):
|
||||
class_label = self.get_class_label(self.lbl[j])
|
||||
if class_label not in data_idx:
|
||||
data_idx[class_label] = []
|
||||
t = (j, x[j])
|
||||
data_idx[class_label].append(t)
|
||||
return data_idx
|
||||
|
||||
# Adding indices to the list so we can access all the indices
|
||||
def compute_listIndices(self, data_idx):
|
||||
lst_idx = []
|
||||
lst_idx_all = []
|
||||
for i in data_idx:
|
||||
if len(lst_idx) == 0:
|
||||
pass
|
||||
# Do nothing: the list has just been created, so it is still empty
|
||||
else:
|
||||
lst_idx = []
|
||||
# Here we put the indices of each class into the list called lst_idx_all
|
||||
for m in xrange(len(data_idx[i])):
|
||||
lst_idx.append(data_idx[i][m][0])
|
||||
lst_idx_all.append(lst_idx)
|
||||
return lst_idx_all
|
||||
|
||||
# This function calculates between classes variances
|
||||
def compute_Sb(self, cls, M_i, M_0):
|
||||
Sb = np.zeros((self.dim, self.dim))
|
||||
for i in cls:
|
||||
B = (M_i[i] - M_0).reshape(self.dim, 1)
|
||||
B_trans = B.transpose()
|
||||
Sb += (float(len(cls[i])) / self.datanum) * B.dot(B_trans)
|
||||
return Sb
|
||||
|
||||
# This function calculates within classes variances
|
||||
def compute_Sw(self, cls, M_i):
|
||||
Sw = np.zeros((self.dim, self.dim))
|
||||
for i in cls:
|
||||
N_i = float(len(cls[i]))
|
||||
W_WT = np.zeros((self.dim, self.dim))
|
||||
for xk in cls[i]:
|
||||
W = (xk - M_i[i])
|
||||
W_WT += np.outer(W, W)
|
||||
Sw += (N_i / self.datanum) * ((1. / N_i) * W_WT)
|
||||
return Sw
|
||||
|
||||
# Calculating beta and Bi for Sb
|
||||
def compute_sig_beta_Bi(self, data_idx, M_i, M_0, lst_idx_all):
|
||||
# import pdb
|
||||
# pdb.set_trace()
|
||||
B_i = np.zeros((self.classnum, self.dim))
|
||||
Sig_beta_B_i_all = np.zeros((self.datanum, self.dim))
|
||||
for i in data_idx:
|
||||
# pdb.set_trace()
|
||||
# Calculating Bi
|
||||
B_i[i] = (M_i[i] - M_0).reshape(1, self.dim)
|
||||
for k in xrange(self.datanum):
|
||||
for i in data_idx:
|
||||
N_i = float(len(data_idx[i]))
|
||||
if k in lst_idx_all[i]:
|
||||
beta = (float(1) / N_i) - (float(1) / self.datanum)
|
||||
Sig_beta_B_i_all[k] += float(N_i) / self.datanum * (beta * B_i[i])
|
||||
else:
|
||||
beta = -(float(1) / self.datanum)
|
||||
Sig_beta_B_i_all[k] += float(N_i) / self.datanum * (beta * B_i[i])
|
||||
Sig_beta_B_i_all = Sig_beta_B_i_all.transpose()
|
||||
return Sig_beta_B_i_all
|
||||
|
||||
|
||||
# Calculating W_j s separately so we can access all the W_j s anytime
|
||||
def compute_wj(self, data_idx, M_i):
|
||||
W_i = np.zeros((self.datanum, self.dim))
|
||||
for i in data_idx:
|
||||
N_i = float(len(data_idx[i]))
|
||||
for tpl in data_idx[i]:
|
||||
xj = tpl[1]
|
||||
j = tpl[0]
|
||||
W_i[j] = (xj - M_i[i])
|
||||
return W_i
|
||||
|
||||
# Calculating alpha and Wj for Sw
|
||||
def compute_sig_alpha_W(self, data_idx, lst_idx_all, W_i):
|
||||
Sig_alpha_W_i = np.zeros((self.datanum, self.dim))
|
||||
for i in data_idx:
|
||||
N_i = float(len(data_idx[i]))
|
||||
for tpl in data_idx[i]:
|
||||
k = tpl[0]
|
||||
for j in lst_idx_all[i]:
|
||||
if k == j:
|
||||
alpha = 1 - (float(1) / N_i)
|
||||
Sig_alpha_W_i[k] += (alpha * W_i[j])
|
||||
else:
|
||||
alpha = 0 - (float(1) / N_i)
|
||||
Sig_alpha_W_i[k] += (alpha * W_i[j])
|
||||
Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i)
|
||||
return Sig_alpha_W_i
|
||||
|
||||
# This function calculates log of our prior
|
||||
def lnpdf(self, x):
|
||||
x = x.reshape(self.x_shape)
|
||||
cls = self.compute_cls(x)
|
||||
M_0 = np.mean(x, axis=0)
|
||||
M_i = self.compute_Mi(cls)
|
||||
Sb = self.compute_Sb(cls, M_i, M_0)
|
||||
Sw = self.compute_Sw(cls, M_i)
|
||||
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
|
||||
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
|
||||
Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
|
||||
return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
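# Interpretation (sketch): Sb and Sw are the between- and within-class scatter matrices
# of the latent points, so this log prior, -(1/sigma2) * tr(Sb^-1 Sw), favours latent
# configurations with small within-class spread relative to the between-class spread
# (the jitter added to Sb only keeps the inverse numerically stable).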
|
||||
|
||||
# This function calculates derivative of the log of prior function
|
||||
def lnpdf_grad(self, x):
|
||||
x = x.reshape(self.x_shape)
|
||||
cls = self.compute_cls(x)
|
||||
M_0 = np.mean(x, axis=0)
|
||||
M_i = self.compute_Mi(cls)
|
||||
Sb = self.compute_Sb(cls, M_i, M_0)
|
||||
Sw = self.compute_Sw(cls, M_i)
|
||||
data_idx = self.compute_indices(x)
|
||||
lst_idx_all = self.compute_listIndices(data_idx)
|
||||
Sig_beta_B_i_all = self.compute_sig_beta_Bi(data_idx, M_i, M_0, lst_idx_all)
|
||||
W_i = self.compute_wj(data_idx, M_i)
|
||||
Sig_alpha_W_i = self.compute_sig_alpha_W(data_idx, lst_idx_all, W_i)
|
||||
|
||||
# Calculating inverse of Sb and its transpose and minus
|
||||
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
|
||||
# Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
|
||||
Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
|
||||
Sb_inv_N_trans = np.transpose(Sb_inv_N)
|
||||
Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
|
||||
Sw_trans = np.transpose(Sw)
|
||||
|
||||
# Calculating DJ/DXk
|
||||
DJ_Dxk = 2 * (
|
||||
Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot(
|
||||
Sig_alpha_W_i))
|
||||
# Calculating derivative of the log of the prior
|
||||
DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk)
|
||||
return DPx_Dx.T
|
||||
|
||||
# def frb(self, x):
|
||||
# from functools import partial
|
||||
# from GPy.models import GradientChecker
|
||||
# f = partial(self.lnpdf)
|
||||
# df = partial(self.lnpdf_grad)
|
||||
# grad = GradientChecker(f, df, x, 'X')
|
||||
# grad.checkgrad(verbose=1)
|
||||
|
||||
def rvs(self, n):
|
||||
return np.random.rand(n) # A WRONG implementation
|
||||
|
||||
def __str__(self):
|
||||
return 'DGPLVM_prior'
|
||||
|
||||
class HalfT(Prior):
|
||||
"""
|
||||
Implementation of the half student t probability function, coupled with random variables.
|
||||
|
||||
:param A: scale parameter
|
||||
:param nu: degrees of freedom
|
||||
|
||||
"""
|
||||
domain = _POSITIVE
|
||||
_instances = []
|
||||
def __new__(cls, A, nu): # Singleton:
|
||||
if cls._instances:
|
||||
cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
for instance in cls._instances:
|
||||
if instance().A == A and instance().nu == nu:
|
||||
return instance()
|
||||
o = super(Prior, cls).__new__(cls, A, nu)
|
||||
cls._instances.append(weakref.ref(o))
|
||||
return cls._instances[-1]()
|
||||
def __init__(self, A, nu):
|
||||
self.A = float(A)
|
||||
self.nu = float(nu)
|
||||
self.constant = gammaln(.5*(self.nu+1.)) - gammaln(.5*self.nu) - .5*np.log(np.pi*self.A*self.nu)
|
||||
|
||||
def __str__(self):
|
||||
return "hT({:.2g}, {:.2g})".format(self.A, self.nu)
|
||||
|
||||
def lnpdf(self,theta):
|
||||
return (theta>0) * ( self.constant -.5*(self.nu+1) * np.log( 1.+ (1./self.nu) * (theta/self.A)**2 ) )
|
||||
|
||||
#theta = theta if isinstance(theta,np.ndarray) else np.array([theta])
|
||||
#lnpdfs = np.zeros_like(theta)
|
||||
#theta = np.array([theta])
|
||||
#above_zero = theta.flatten()>1e-6
|
||||
#v = self.nu
|
||||
#sigma2=self.A
|
||||
#stop
|
||||
#lnpdfs[above_zero] = (+ gammaln((v + 1) * 0.5)
|
||||
# - gammaln(v * 0.5)
|
||||
# - 0.5*np.log(sigma2 * v * np.pi)
|
||||
# - 0.5*(v + 1)*np.log(1 + (1/np.float(v))*((theta[above_zero][0]**2)/sigma2))
|
||||
#)
|
||||
#return lnpdfs
|
||||
|
||||
def lnpdf_grad(self,theta):
|
||||
theta = theta if isinstance(theta,np.ndarray) else np.array([theta])
|
||||
grad = np.zeros_like(theta)
|
||||
above_zero = theta>1e-6
|
||||
v = self.nu
|
||||
sigma2=self.A
|
||||
grad[above_zero] = -0.5*(v+1)*(2*theta[above_zero])/(v*sigma2 + theta[above_zero][0]**2)
|
||||
return grad
|
||||
|
||||
def rvs(self, n):
|
||||
#return np.random.randn(n) * self.sigma + self.mu
|
||||
from scipy.stats import t
|
||||
#[np.abs(x) for x in t.rvs(df=4,loc=0,scale=50, size=10000)])
|
||||
ret = t.rvs(self.nu,loc=0,scale=self.A, size=n)
|
||||
ret[ret<0] = 0
|
||||
return ret
|
||||
|
||||
225
GPy/core/parameterization/ties_and_remappings.py
Normal file
|
|
@ -0,0 +1,225 @@
|
|||
# Copyright (c) 2014, James Hensman, Max Zwiessele
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from parameterized import Parameterized
|
||||
from param import Param
|
||||
|
||||
class Remapping(Parameterized):
|
||||
def mapping(self):
|
||||
"""
|
||||
The return value of this function gives the values which the re-mapped
|
||||
parameters should take. Implement in sub-classes.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def callback(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def __str__(self):
|
||||
return self.name
|
||||
|
||||
def parameters_changed(self):
|
||||
# ensure all our parameters have the correct value, as specified by our mapping
|
||||
index = self._highest_parent_.constraints[self]
|
||||
self._highest_parent_.param_array[index] = self.mapping()
|
||||
[p.notify_observers(which=self) for p in self.tied_parameters]
|
||||
|
||||
class Fix(Remapping):
|
||||
pass
|
||||
|
||||
|
||||
|
||||
|
||||
class Tie(Parameterized):
|
||||
"""
|
||||
The new parameter tie framework. (under development)
|
||||
|
||||
All the parameters tied together get a new parameter inside the *Tie* object.
|
||||
Its value should always be equal to all the tied parameters, and its gradient
|
||||
is the sum of all the tied parameters.
|
||||
|
||||
=====Implementation Details=====
|
||||
The *Tie* object should only exist at the top of the param tree (the highest parent).
|
||||
|
||||
self.label_buf:
|
||||
It uses a label buffer that has the same length as all the parameters (self._highest_parent_.param_array).
|
||||
The buffer keeps track of all the tied parameters. All the tied parameters have a label (an integer) higher
|
||||
than 0, and the parameters that have the same label are tied together.
|
||||
|
||||
self.buf_index:
|
||||
An auxiliary index list for the global index of the tie parameter inside the *Tie* object.
|
||||
|
||||
================================
|
||||
|
||||
TODO:
|
||||
* EVERYTHING
|
||||
|
||||
"""
|
||||
def __init__(self, name='tie'):
|
||||
super(Tie, self).__init__(name)
|
||||
self.tied_param = None
|
||||
# The buffer keeps track of tie status
|
||||
self.label_buf = None
|
||||
# The global indices of the 'tied' param
|
||||
self.buf_idx = None
|
||||
# A boolean array indicating non-tied parameters
|
||||
self._tie_ = None
|
||||
|
||||
def getTieFlag(self, p=None):
|
||||
if self.tied_param is None:
|
||||
if self._tie_ is None or self._tie_.size != self._highest_parent_.param_array.size:
|
||||
self._tie_ = np.ones((self._highest_parent_.param_array.size,),dtype=np.bool)
|
||||
if p is not None:
|
||||
return self._tie_[p._highest_parent_._raveled_index_for(p)]
|
||||
return self._tie_
|
||||
|
||||
def _init_labelBuf(self):
|
||||
if self.label_buf is None:
|
||||
self.label_buf = np.zeros(self._highest_parent_.param_array.shape, dtype=np.int)
|
||||
if self._tie_ is None or self._tie_.size != self._highest_parent_.param_array.size:
|
||||
self._tie_ = np.ones((self._highest_parent_.param_array.size,),dtype=np.bool)
|
||||
|
||||
def _updateTieFlag(self):
|
||||
if self._tie_.size != self.label_buf.size:
|
||||
self._tie_ = np.ones((self._highest_parent_.param_array.size,),dtype=np.bool)
|
||||
self._tie_[self.label_buf>0] = False
|
||||
self._tie_[self.buf_idx] = True
|
||||
|
||||
def add_tied_parameter(self, p, p2=None):
|
||||
"""
|
||||
Tie the list of parameters p together (p2==None) or
|
||||
Tie the list of parameters p with the list of parameters p2 (p2!=None)
|
||||
"""
|
||||
self._init_labelBuf()
|
||||
if p2 is None:
|
||||
idx = self._highest_parent_._raveled_index_for(p)
|
||||
val = self._sync_val_group(idx)
|
||||
if np.all(self.label_buf[idx]==0):
|
||||
# None of p has been tied before.
|
||||
tie_idx = self._expandTieParam(1)
|
||||
print tie_idx
|
||||
tie_id = self.label_buf.max()+1
|
||||
self.label_buf[tie_idx] = tie_id
|
||||
else:
|
||||
b = self.label_buf[idx]
|
||||
ids = np.unique(b[b>0])
|
||||
tie_id, tie_idx = self._merge_tie_param(ids)
|
||||
self._highest_parent_.param_array[tie_idx] = val
|
||||
idx = self._highest_parent_._raveled_index_for(p)
|
||||
self.label_buf[idx] = tie_id
|
||||
else:
|
||||
pass
|
||||
self._updateTieFlag()
|
||||
|
||||
def _merge_tie_param(self, ids):
|
||||
"""Merge the tie parameters with ids in the list."""
|
||||
if len(ids)==1:
|
||||
id_final_idx = self.buf_idx[self.label_buf[self.buf_idx]==ids[0]][0]
|
||||
return ids[0],id_final_idx
|
||||
id_final = ids[0]
|
||||
ids_rm = ids[1:]
|
||||
label_buf_param = self.label_buf[self.buf_idx]
|
||||
idx_param = [np.where(label_buf_param==i)[0][0] for i in ids_rm]
|
||||
self._removeTieParam(idx_param)
|
||||
[np.put(self.label_buf, np.where(self.label_buf==i), id_final) for i in ids_rm]
|
||||
id_final_idx = self.buf_idx[self.label_buf[self.buf_idx]==id_final][0]
|
||||
return id_final, id_final_idx
|
||||
|
||||
def _sync_val_group(self, idx):
|
||||
self._highest_parent_.param_array[idx] = self._highest_parent_.param_array[idx].mean()
|
||||
return self._highest_parent_.param_array[idx][0]
|
||||
|
||||
def _expandTieParam(self, num):
|
||||
"""Expand the tie param with the number of *num* parameters"""
|
||||
if self.tied_param is None:
|
||||
new_buf = np.empty((num,))
|
||||
else:
|
||||
new_buf = np.empty((self.tied_param.size+num,))
|
||||
new_buf[:self.tied_param.size] = self.tied_param.param_array.copy()
|
||||
self.remove_parameter(self.tied_param)
|
||||
self.tied_param = Param('tied',new_buf)
|
||||
self.add_parameter(self.tied_param)
|
||||
buf_idx_new = self._highest_parent_._raveled_index_for(self.tied_param)
|
||||
self._expand_label_buf(self.buf_idx, buf_idx_new)
|
||||
self.buf_idx = buf_idx_new
|
||||
return self.buf_idx[-num:]
|
||||
|
||||
def _removeTieParam(self, idx):
|
||||
"""idx within tied_param"""
|
||||
new_buf = np.empty((self.tied_param.size-len(idx),))
|
||||
bool_list = np.ones((self.tied_param.size,),dtype=np.bool)
|
||||
bool_list[idx] = False
|
||||
new_buf[:] = self.tied_param.param_array[bool_list]
|
||||
self.remove_parameter(self.tied_param)
|
||||
self.tied_param = Param('tied',new_buf)
|
||||
self.add_parameter(self.tied_param)
|
||||
buf_idx_new = self._highest_parent_._raveled_index_for(self.tied_param)
|
||||
self._shrink_label_buf(self.buf_idx, buf_idx_new, bool_list)
|
||||
self.buf_idx = buf_idx_new
|
||||
|
||||
def _expand_label_buf(self, idx_old, idx_new):
|
||||
"""Expand label buffer accordingly"""
|
||||
if idx_old is None:
|
||||
self.label_buf = np.zeros(self._highest_parent_.param_array.shape, dtype=np.int)
|
||||
else:
|
||||
bool_old = np.zeros((self.label_buf.size,),dtype=np.bool)
|
||||
bool_old[idx_old] = True
|
||||
bool_new = np.zeros((self._highest_parent_.param_array.size,),dtype=np.bool)
|
||||
bool_new[idx_new] = True
|
||||
label_buf_new = np.zeros(self._highest_parent_.param_array.shape, dtype=np.int)
|
||||
label_buf_new[np.logical_not(bool_new)] = self.label_buf[np.logical_not(bool_old)]
|
||||
label_buf_new[idx_new[:len(idx_old)]] = self.label_buf[idx_old]
|
||||
self.label_buf = label_buf_new
|
||||
|
||||
def _shrink_label_buf(self, idx_old, idx_new, bool_list):
|
||||
bool_old = np.zeros((self.label_buf.size,),dtype=np.bool)
|
||||
bool_old[idx_old] = True
|
||||
bool_new = np.zeros((self._highest_parent_.param_array.size,),dtype=np.bool)
|
||||
bool_new[idx_new] = True
|
||||
label_buf_new = np.empty(self._highest_parent_.param_array.shape, dtype=np.int)
|
||||
label_buf_new[np.logical_not(bool_new)] = self.label_buf[np.logical_not(bool_old)]
|
||||
label_buf_new[idx_new] = self.label_buf[idx_old[bool_list]]
|
||||
self.label_buf = label_buf_new
|
||||
|
||||
def _check_change(self):
|
||||
changed = False
|
||||
if self.tied_param is not None:
|
||||
for i in xrange(self.tied_param.size):
|
||||
b0 = self.label_buf==self.label_buf[self.buf_idx[i]]
|
||||
b = self._highest_parent_.param_array[b0]!=self.tied_param[i]
|
||||
if b.sum()==0:
|
||||
print 'XXX'
|
||||
continue
|
||||
elif b.sum()==1:
|
||||
print '!!!'
|
||||
val = self._highest_parent_.param_array[b0][b][0]
|
||||
self._highest_parent_.param_array[b0] = val
|
||||
else:
|
||||
print '@@@'
|
||||
self._highest_parent_.param_array[b0] = self.tied_param[i]
|
||||
changed = True
|
||||
return changed
|
||||
|
||||
def parameters_changed(self):
|
||||
# ensure all our parameters have the correct value, as specified by our mapping
|
||||
changed = self._check_change()
|
||||
if changed:
|
||||
self._highest_parent_._trigger_params_changed()
|
||||
self.collate_gradient()
|
||||
|
||||
def collate_gradient(self):
|
||||
if self.tied_param is not None:
|
||||
self.tied_param.gradient = 0.
|
||||
[np.put(self.tied_param.gradient, i, self._highest_parent_.gradient[self.label_buf==self.label_buf[self.buf_idx[i]]].sum())
|
||||
for i in xrange(self.tied_param.size)]
|
||||
|
||||
def propagate_val(self):
|
||||
if self.tied_param is not None:
|
||||
for i in xrange(self.tied_param.size):
|
||||
self._highest_parent_.param_array[self.label_buf==self.label_buf[self.buf_idx[i]]] = self.tied_param[i]
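# Usage sketch (the framework above is still under development, so the exact call
# sequence is an assumption based on the docstring, not a documented API):
#
#     # tie = Tie(); model.link_parameter(tie)        # attach to the highest parent
#     # tie.add_tied_parameter(model.some_params)     # tie a group of parameters together
#     # tie.propagate_val()                           # push the shared value back out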
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
433
GPy/core/parameterization/transformations.py
Normal file
|
|
@ -0,0 +1,433 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
from domains import _POSITIVE,_NEGATIVE, _BOUNDED
|
||||
import weakref
|
||||
|
||||
import sys
|
||||
|
||||
_exp_lim_val = np.finfo(np.float64).max
|
||||
_lim_val = 36.0
|
||||
epsilon = np.finfo(np.float64).resolution
|
||||
|
||||
#===============================================================================
|
||||
# Fixing constants
|
||||
__fixed__ = "fixed"
|
||||
FIXED = False
|
||||
UNFIXED = True
|
||||
#===============================================================================
|
||||
|
||||
|
||||
class Transformation(object):
|
||||
domain = None
|
||||
_instance = None
|
||||
def __new__(cls, *args, **kwargs):
|
||||
if not cls._instance or cls._instance.__class__ is not cls:
|
||||
cls._instance = super(Transformation, cls).__new__(cls, *args, **kwargs)
|
||||
return cls._instance
|
||||
def f(self, opt_param):
|
||||
raise NotImplementedError
|
||||
def finv(self, model_param):
|
||||
raise NotImplementedError
|
||||
def gradfactor(self, model_param, dL_dmodel_param):
|
||||
""" df(opt_param)_dopt_param evaluated at self.f(opt_param)=model_param, times the gradient dL_dmodel_param,
|
||||
|
||||
i.e.:
|
||||
define
|
||||
|
||||
.. math::
|
||||
|
||||
\frac{\partial L}{\partial f}\left.\frac{\partial f(x)}{\partial x}\right|_{x=f^{-1}(f)}
|
||||
"""
|
||||
raise NotImplementedError
|
||||
def initialize(self, f):
|
||||
""" produce a sensible initial value for f(x)"""
|
||||
raise NotImplementedError
|
||||
def plot(self, xlabel=r'transformed $\theta$', ylabel=r'$\theta$', axes=None, *args,**kw):
|
||||
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
|
||||
import matplotlib.pyplot as plt
|
||||
from ...plotting.matplot_dep import base_plots
|
||||
x = np.linspace(-8,8)
|
||||
base_plots.meanplot(x, self.f(x), axes=axes, *args, **kw)
|
||||
axes = plt.gca()
|
||||
axes.set_xlabel(xlabel)
|
||||
axes.set_ylabel(ylabel)
|
||||
def __str__(self):
|
||||
raise NotImplementedError
|
||||
def __repr__(self):
|
||||
return self.__class__.__name__
|
||||
|
||||
class Logexp(Transformation):
|
||||
domain = _POSITIVE
|
||||
def f(self, x):
|
||||
return np.where(x>_lim_val, x, np.log1p(np.exp(np.clip(x, -_lim_val, _lim_val)))) + epsilon
|
||||
#raises overflow warning: return np.where(x>_lim_val, x, np.log(1. + np.exp(x)))
|
||||
def finv(self, f):
|
||||
return np.where(f>_lim_val, f, np.log(np.exp(f+1e-20) - 1.))
|
||||
def gradfactor(self, f, df):
|
||||
return np.einsum('i,i->i', df, np.where(f>_lim_val, 1., 1. - np.exp(-f)))
|
||||
def initialize(self, f):
|
||||
if np.any(f < 0.):
|
||||
print "Warning: changing parameters to satisfy constraints"
|
||||
return np.abs(f)
|
||||
def __str__(self):
|
||||
return '+ve'
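# Example (sketch): Logexp is the softplus map used for '+ve' constraints;
# f(x) = log(1 + exp(x)), finv(f) = log(exp(f) - 1), and gradfactor applies the
# chain rule dL/dx = dL/df * df/dx with df/dx = 1 - exp(-f(x)):
#
#     t = Logexp()
#     x = np.array([-2., 0., 3.])
#     assert np.allclose(t.finv(t.f(x)), x, atol=1e-6)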
|
||||
|
||||
|
||||
class NormalTheta(Transformation):
|
||||
_instances = []
|
||||
def __new__(cls, mu_indices, var_indices):
|
||||
if cls._instances:
|
||||
cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
for instance in cls._instances:
|
||||
if np.all(instance().mu_indices==mu_indices, keepdims=False) and np.all(instance().var_indices==var_indices, keepdims=False):
|
||||
return instance()
|
||||
o = super(Transformation, cls).__new__(cls, mu_indices, var_indices)
|
||||
cls._instances.append(weakref.ref(o))
|
||||
return cls._instances[-1]()
|
||||
|
||||
def __init__(self, mu_indices, var_indices):
|
||||
self.mu_indices = mu_indices
|
||||
self.var_indices = var_indices
|
||||
|
||||
def f(self, theta):
|
||||
# In here abs is only a trick to make sure the numerics are ok.
|
||||
# The variance will never go below zero, but at initialization we need to make sure
|
||||
# that the values are ok
|
||||
# Before:
|
||||
theta[self.var_indices] = np.abs(-.5/theta[self.var_indices])
|
||||
theta[self.mu_indices] *= theta[self.var_indices]
|
||||
return theta # which is now {mu, var}
|
||||
|
||||
def finv(self, muvar):
|
||||
# before:
|
||||
varp = muvar[self.var_indices]
|
||||
muvar[self.mu_indices] /= varp
|
||||
muvar[self.var_indices] = -.5/varp
|
||||
|
||||
return muvar # which is now {theta1, theta2}
|
||||
|
||||
def gradfactor(self, muvar, dmuvar):
|
||||
mu = muvar[self.mu_indices]
|
||||
var = muvar[self.var_indices]
|
||||
#=======================================================================
|
||||
# theta gradients
|
||||
# This works and the gradient checks!
|
||||
dmuvar[self.mu_indices] *= var
|
||||
dmuvar[self.var_indices] *= 2*(var)**2
|
||||
dmuvar[self.var_indices] += 2*dmuvar[self.mu_indices]*mu
|
||||
#=======================================================================
|
||||
|
||||
return dmuvar # which is now the gradient multiplicator for {theta1, theta2}
|
||||
|
||||
def initialize(self, f):
|
||||
if np.any(f[self.var_indices] < 0.):
|
||||
print "Warning: changing parameters to satisfy constraints"
|
||||
f[self.var_indices] = np.abs(f[self.var_indices])
|
||||
return f
|
||||
|
||||
def __str__(self):
|
||||
return "theta"
|
||||
|
||||
def __getstate__(self):
|
||||
return [self.mu_indices, self.var_indices]
|
||||
|
||||
def __setstate__(self, state):
|
||||
self.mu_indices = state[0]
|
||||
self.var_indices = state[1]
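# Note (sketch): this transformation maps between the natural parameters of a Gaussian,
# theta1 = mu/var and theta2 = -1/(2*var), and the moment parameters (mu, var):
# f() turns {theta1, theta2} into {mu, var} (var = -0.5/theta2, mu = theta1*var) and
# finv() is the exact inverse, as used for natural-gradient style updates.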
|
||||
|
||||
class NormalNaturalAntti(NormalTheta):
|
||||
_instances = []
|
||||
_logexp = Logexp()
|
||||
def __new__(cls, mu_indices, var_indices):
|
||||
if cls._instances:
|
||||
cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
for instance in cls._instances:
|
||||
if np.all(instance().mu_indices==mu_indices, keepdims=False) and np.all(instance().var_indices==var_indices, keepdims=False):
|
||||
return instance()
|
||||
o = super(Transformation, cls).__new__(cls, mu_indices, var_indices)
|
||||
cls._instances.append(weakref.ref(o))
|
||||
return cls._instances[-1]()
|
||||
|
||||
def __init__(self, mu_indices, var_indices):
|
||||
self.mu_indices = mu_indices
|
||||
self.var_indices = var_indices
|
||||
|
||||
def gradfactor(self, muvar, dmuvar):
|
||||
mu = muvar[self.mu_indices]
|
||||
var = muvar[self.var_indices]
|
||||
|
||||
#=======================================================================
|
||||
# theta gradients
|
||||
# This works and the gradient checks!
|
||||
dmuvar[self.mu_indices] *= var
|
||||
dmuvar[self.var_indices] *= 2*var**2#np.einsum('i,i,i,i->i', dmuvar[self.var_indices], [2], var, var)
|
||||
#=======================================================================
|
||||
|
||||
return dmuvar # which is now the gradient multiplicator
|
||||
|
||||
def initialize(self, f):
|
||||
if np.any(f[self.var_indices] < 0.):
|
||||
print "Warning: changing parameters to satisfy constraints"
|
||||
f[self.var_indices] = np.abs(f[self.var_indices])
|
||||
return f
|
||||
|
||||
def __str__(self):
|
||||
return "natantti"
|
||||
|
||||
class NormalEta(Transformation):
|
||||
_instances = []
|
||||
def __new__(cls, mu_indices, var_indices):
|
||||
if cls._instances:
|
||||
cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
for instance in cls._instances:
|
||||
if np.all(instance().mu_indices==mu_indices, keepdims=False) and np.all(instance().var_indices==var_indices, keepdims=False):
|
||||
return instance()
|
||||
o = super(Transformation, cls).__new__(cls, mu_indices, var_indices)
|
||||
cls._instances.append(weakref.ref(o))
|
||||
return cls._instances[-1]()
|
||||
|
||||
def __init__(self, mu_indices, var_indices):
|
||||
self.mu_indices = mu_indices
|
||||
self.var_indices = var_indices
|
||||
|
||||
def f(self, theta):
|
||||
theta[self.var_indices] = np.abs(theta[self.var_indices] - theta[self.mu_indices]**2)
|
||||
return theta # which is now {mu, var}
|
||||
|
||||
def finv(self, muvar):
|
||||
muvar[self.var_indices] += muvar[self.mu_indices]**2
|
||||
return muvar # which is now {eta1, eta2}
|
||||
|
||||
def gradfactor(self, muvar, dmuvar):
|
||||
mu = muvar[self.mu_indices]
|
||||
#=======================================================================
|
||||
# Let's try natural gradients instead: not working with BFGS... try stochastic!
|
||||
dmuvar[self.mu_indices] -= 2*mu*dmuvar[self.var_indices]
|
||||
#=======================================================================
|
||||
return dmuvar # which is now the gradient multiplicator
|
||||
|
||||
def initialize(self, f):
|
||||
if np.any(f[self.var_indices] < 0.):
|
||||
print "Warning: changing parameters to satisfy constraints"
|
||||
f[self.var_indices] = np.abs(f[self.var_indices])
|
||||
return f
|
||||
|
||||
def __str__(self):
|
||||
return "eta"
|
||||
|
||||
class NormalNaturalThroughTheta(NormalTheta):
|
||||
_instances = []
|
||||
def __new__(cls, mu_indices, var_indices):
|
||||
if cls._instances:
|
||||
cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
for instance in cls._instances:
|
||||
if np.all(instance().mu_indices==mu_indices, keepdims=False) and np.all(instance().var_indices==var_indices, keepdims=False):
|
||||
return instance()
|
||||
o = super(Transformation, cls).__new__(cls, mu_indices, var_indices)
|
||||
cls._instances.append(weakref.ref(o))
|
||||
return cls._instances[-1]()
|
||||
|
||||
def __init__(self, mu_indices, var_indices):
|
||||
self.mu_indices = mu_indices
|
||||
self.var_indices = var_indices
|
||||
|
||||
def gradfactor(self, muvar, dmuvar):
|
||||
mu = muvar[self.mu_indices]
|
||||
var = muvar[self.var_indices]
|
||||
|
||||
#=======================================================================
|
||||
# This is just eta direction:
|
||||
dmuvar[self.mu_indices] -= 2*mu*dmuvar[self.var_indices]
|
||||
#=======================================================================
|
||||
|
||||
|
||||
#=======================================================================
|
||||
# This is by going through theta fully and then going into eta direction:
|
||||
#dmu = dmuvar[self.mu_indices]
|
||||
#dmuvar[self.var_indices] += dmu*mu*(var + 4/var)
|
||||
#=======================================================================
|
||||
return dmuvar # which is now the gradient multiplicator
|
||||
|
||||
def __str__(self):
|
||||
return "natgrad"
|
||||
|
||||
class NormalNaturalThroughEta(NormalEta):
|
||||
_instances = []
|
||||
def __new__(cls, mu_indices, var_indices):
|
||||
if cls._instances:
|
||||
cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
for instance in cls._instances:
|
||||
if np.all(instance().mu_indices==mu_indices, keepdims=False) and np.all(instance().var_indices==var_indices, keepdims=False):
|
||||
return instance()
|
||||
o = super(Transformation, cls).__new__(cls, mu_indices, var_indices)
|
||||
cls._instances.append(weakref.ref(o))
|
||||
return cls._instances[-1]()
|
||||
|
||||
def __init__(self, mu_indices, var_indices):
|
||||
self.mu_indices = mu_indices
|
||||
self.var_indices = var_indices
|
||||
|
||||
def gradfactor(self, muvar, dmuvar):
|
||||
mu = muvar[self.mu_indices]
|
||||
var = muvar[self.var_indices]
|
||||
#=======================================================================
|
||||
# theta gradients
|
||||
# This works and the gradient checks!
|
||||
dmuvar[self.mu_indices] *= var
|
||||
dmuvar[self.var_indices] *= 2*(var)**2
|
||||
dmuvar[self.var_indices] += 2*dmuvar[self.mu_indices]*mu
|
||||
#=======================================================================
|
||||
return dmuvar
|
||||
|
||||
def __str__(self):
|
||||
return "natgrad"
|
||||
|
||||
|
||||
class LogexpNeg(Transformation):
|
||||
domain = _POSITIVE
|
||||
def f(self, x):
|
||||
return np.where(x>_lim_val, -x, -np.log(1. + np.exp(np.clip(x, -np.inf, _lim_val))))
|
||||
#raises overflow warning: return np.where(x>_lim_val, x, np.log(1. + np.exp(x)))
|
||||
def finv(self, f):
|
||||
return np.where(f>_lim_val, 0, np.log(np.exp(-f) - 1.))
|
||||
def gradfactor(self, f, df):
|
||||
return np.einsum('i,i->i', df, np.where(f>_lim_val, -1, -1 + np.exp(-f)))
|
||||
def initialize(self, f):
|
||||
if np.any(f < 0.):
|
||||
print "Warning: changing parameters to satisfy constraints"
|
||||
return np.abs(f)
|
||||
def __str__(self):
|
||||
return '+ve'
|
||||
|
||||
|
||||
class NegativeLogexp(Transformation):
|
||||
domain = _NEGATIVE
|
||||
logexp = Logexp()
|
||||
def f(self, x):
|
||||
return -self.logexp.f(x) # -np.log(1. + np.exp(x))
|
||||
def finv(self, f):
|
||||
return self.logexp.finv(-f) # np.log(np.exp(-f) - 1.)
|
||||
def gradfactor(self, f, df):
|
||||
return np.einsum('i,i->i', df, -self.logexp.gradfactor(-f))
|
||||
def initialize(self, f):
|
||||
return -self.logexp.initialize(f) # np.abs(f)
|
||||
def __str__(self):
|
||||
return '-ve'
|
||||
|
||||
class LogexpClipped(Logexp):
|
||||
max_bound = 1e100
|
||||
min_bound = 1e-10
|
||||
log_max_bound = np.log(max_bound)
|
||||
log_min_bound = np.log(min_bound)
|
||||
domain = _POSITIVE
|
||||
_instances = []
|
||||
def __new__(cls, lower=1e-6, *args, **kwargs):
|
||||
if cls._instances:
|
||||
cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
for instance in cls._instances:
|
||||
if instance().lower == lower:
|
||||
return instance()
|
||||
o = super(Transformation, cls).__new__(cls, lower, *args, **kwargs)
|
||||
cls._instances.append(weakref.ref(o))
|
||||
return cls._instances[-1]()
|
||||
def __init__(self, lower=1e-6):
|
||||
self.lower = lower
|
||||
def f(self, x):
|
||||
exp = np.exp(np.clip(x, self.log_min_bound, self.log_max_bound))
|
||||
f = np.log(1. + exp)
|
||||
# if np.isnan(f).any():
|
||||
# import ipdb;ipdb.set_trace()
|
||||
return np.clip(f, self.min_bound, self.max_bound)
|
||||
def finv(self, f):
|
||||
return np.log(np.exp(f) - 1.)
|
||||
def gradfactor(self, f, df):
|
||||
ef = np.exp(f) # np.clip(f, self.min_bound, self.max_bound))
|
||||
gf = (ef - 1.) / ef
|
||||
return np.einsum('i,i->i', df, gf) # np.where(f < self.lower, 0, gf)
|
||||
def initialize(self, f):
|
||||
if np.any(f < 0.):
|
||||
print "Warning: changing parameters to satisfy constraints"
|
||||
return np.abs(f)
|
||||
def __str__(self):
|
||||
return '+ve_c'
|
||||
|
||||
class Exponent(Transformation):
|
||||
# TODO: can't allow this to go to zero, need to set a lower bound. Similar with negative Exponent below. See old MATLAB code.
|
||||
domain = _POSITIVE
|
||||
def f(self, x):
|
||||
return np.where(x<_lim_val, np.where(x>-_lim_val, np.exp(x), np.exp(-_lim_val)), np.exp(_lim_val))
|
||||
def finv(self, x):
|
||||
return np.log(x)
|
||||
def gradfactor(self, f, df):
|
||||
return np.einsum('i,i->i', df, f)
|
||||
def initialize(self, f):
|
||||
if np.any(f < 0.):
|
||||
print "Warning: changing parameters to satisfy constraints"
|
||||
return np.abs(f)
|
||||
def __str__(self):
|
||||
return '+ve'
|
||||
|
||||
class NegativeExponent(Exponent):
|
||||
domain = _NEGATIVE
|
||||
def f(self, x):
|
||||
return -Exponent.f(self, x)
|
||||
def finv(self, f):
|
||||
return Exponent.finv(self, -f)
|
||||
def gradfactor(self, f, df):
|
||||
return np.einsum('i,i->i', df, f)
|
||||
def initialize(self, f):
|
||||
return -Exponent.initialize(self, f) # np.abs(f)
|
||||
def __str__(self):
|
||||
return '-ve'
|
||||
|
||||
class Square(Transformation):
|
||||
domain = _POSITIVE
|
||||
def f(self, x):
|
||||
return x ** 2
|
||||
def finv(self, x):
|
||||
return np.sqrt(x)
|
||||
def gradfactor(self, f, df):
|
||||
return np.einsum('i,i->i', df, 2 * np.sqrt(f))
|
||||
def initialize(self, f):
|
||||
return np.abs(f)
|
||||
def __str__(self):
|
||||
return '+sq'
|
||||
|
||||
class Logistic(Transformation):
|
||||
domain = _BOUNDED
|
||||
_instances = []
|
||||
def __new__(cls, lower=1e-6, upper=1e-6, *args, **kwargs):
|
||||
if cls._instances:
|
||||
cls._instances[:] = [instance for instance in cls._instances if instance()]
|
||||
for instance in cls._instances:
|
||||
if instance().lower == lower and instance().upper == upper:
|
||||
return instance()
|
||||
o = super(Transformation, cls).__new__(cls, lower, upper, *args, **kwargs)
|
||||
cls._instances.append(weakref.ref(o))
|
||||
return cls._instances[-1]()
|
||||
def __init__(self, lower, upper):
|
||||
assert lower < upper
|
||||
self.lower, self.upper = float(lower), float(upper)
|
||||
self.difference = self.upper - self.lower
|
||||
def f(self, x):
|
||||
if (x<-300.).any():
|
||||
x = x.copy()
|
||||
x[x<-300.] = -300.
|
||||
return self.lower + self.difference / (1. + np.exp(-x))
|
||||
def finv(self, f):
|
||||
return np.log(np.clip(f - self.lower, 1e-10, np.inf) / np.clip(self.upper - f, 1e-10, np.inf))
|
||||
def gradfactor(self, f, df):
|
||||
return np.einsum('i,i->i', df, (f - self.lower) * (self.upper - f) / self.difference)
|
||||
def initialize(self, f):
|
||||
if np.any(np.logical_or(f < self.lower, f > self.upper)):
|
||||
print "Warning: changing parameters to satisfy constraints"
|
||||
#return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(f * 0.), f)
|
||||
#FIXME: Max, zeros_like right?
|
||||
return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(np.zeros_like(f)), f)
|
||||
def __str__(self):
|
||||
return '{},{}'.format(self.lower, self.upper)
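# Example (sketch): Logistic squashes the real line into (lower, upper) via a scaled
# sigmoid, so a parameter constrained with Logistic(0., 1.) stays inside (0, 1):
#
#     t = Logistic(0., 1.)
#     assert np.allclose(t.f(np.array([0.])), 0.5)
#     assert np.allclose(t.finv(t.f(np.array([-3., 2.]))), [-3., 2.])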
|
||||
|
||||
|
||||
63
GPy/core/parameterization/updateable.py
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
'''
|
||||
Created on 11 Nov 2014
|
||||
|
||||
@author: maxz
|
||||
'''
|
||||
from observable import Observable
|
||||
|
||||
|
||||
class Updateable(Observable):
|
||||
"""
|
||||
A model can be updated or not.
|
||||
Make sure updates can be switched on and off.
|
||||
"""
|
||||
_updates = True
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(Updateable, self).__init__(*args, **kwargs)
|
||||
|
||||
@property
|
||||
def updates(self):
|
||||
raise DeprecationWarning("updates is now a function, see update(True|False|None)")
|
||||
|
||||
@updates.setter
|
||||
def updates(self, ups):
|
||||
raise DeprecationWarning("updates is now a function, see update(True|False|None)")
|
||||
|
||||
def update_model(self, updates=None):
|
||||
"""
|
||||
Get or set whether automatic updates are performed. When updates are
|
||||
off, the model might be in a non-working state. To make the model work
|
||||
turn updates on again.
|
||||
|
||||
:param bool|None updates:
|
||||
|
||||
bool: whether to do updates
|
||||
None: get the current update state
|
||||
"""
|
||||
if updates is None:
|
||||
p = getattr(self, '_highest_parent_', None)
|
||||
if p is not None:
|
||||
self._updates = p._updates
|
||||
return self._updates
|
||||
assert isinstance(updates, bool), "updates are either on (True) or off (False)"
|
||||
p = getattr(self, '_highest_parent_', None)
|
||||
if p is not None:
|
||||
p._updates = updates
|
||||
self._updates = updates
|
||||
self.trigger_update()
|
||||
|
||||
def toggle_update(self):
|
||||
self.update_model(not self.update_model())
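# Usage sketch (assuming ``m`` is a model inheriting from Updateable): switching
# updates off avoids recomputing the model after every single parameter change,
#
#     # m.update_model(False)    # batch several parameter assignments cheaply
#     # ... set parameters ...
#     # m.update_model(True)     # turn updates back on; this also triggers an update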
|
||||
|
||||
def trigger_update(self, trigger_parent=True):
|
||||
"""
|
||||
Update the model from the current state.
|
||||
Make sure that updates are on, otherwise this
|
||||
method will do nothing
|
||||
|
||||
:param bool trigger_parent: Whether to trigger the parent, after self has updated
|
||||
"""
|
||||
if not self.update_model() or (hasattr(self, "_in_init_") and self._in_init_):
|
||||
#print "Warning: updates are off, updating the model will do nothing"
|
||||
return
|
||||
self._trigger_params_changed(trigger_parent)
|
||||
220
GPy/core/parameterization/variational.py
Normal file
|
|
@ -0,0 +1,220 @@
|
|||
'''
|
||||
Created on 6 Nov 2013
|
||||
|
||||
@author: maxz
|
||||
'''
|
||||
|
||||
import numpy as np
|
||||
from parameterized import Parameterized
|
||||
from param import Param
|
||||
from transformations import Logexp, Logistic,__fixed__
|
||||
from GPy.util.misc import param_to_array
|
||||
from GPy.util.caching import Cache_this
|
||||
|
||||
class VariationalPrior(Parameterized):
|
||||
def __init__(self, name='latent space', **kw):
|
||||
super(VariationalPrior, self).__init__(name=name, **kw)
|
||||
|
||||
def KL_divergence(self, variational_posterior):
|
||||
raise NotImplementedError, "override this for variational inference of latent space"
|
||||
|
||||
def update_gradients_KL(self, variational_posterior):
|
||||
"""
|
||||
updates the gradients for mean and variance **in place**
|
||||
"""
|
||||
raise NotImplementedError, "override this for variational inference of latent space"
|
||||
|
||||
class NormalPrior(VariationalPrior):
|
||||
def KL_divergence(self, variational_posterior):
|
||||
var_mean = np.square(variational_posterior.mean).sum()
|
||||
var_S = (variational_posterior.variance - np.log(variational_posterior.variance)).sum()
|
||||
return 0.5 * (var_mean + var_S) - 0.5 * variational_posterior.input_dim * variational_posterior.num_data
|
||||
|
||||
def update_gradients_KL(self, variational_posterior):
|
||||
# dL:
|
||||
variational_posterior.mean.gradient -= variational_posterior.mean
|
||||
variational_posterior.variance.gradient -= (1. - (1. / (variational_posterior.variance))) * 0.5
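# Note (sketch): for a factorising q = N(mean, variance) and a standard normal prior,
# KL(q||p) = 0.5 * sum(mean**2 + variance - log(variance) - 1), which is exactly what
# KL_divergence above evaluates; the gradient updates here are its derivatives with
# respect to the means and variances.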
|
||||
|
||||
class SpikeAndSlabPrior(VariationalPrior):
|
||||
def __init__(self, pi=None, learnPi=False, variance = 1.0, name='SpikeAndSlabPrior', **kw):
|
||||
super(SpikeAndSlabPrior, self).__init__(name=name, **kw)
|
||||
self.variance = Param('variance',variance)
|
||||
self.learnPi = learnPi
|
||||
if learnPi:
|
||||
self.pi = Param('Pi', pi, Logistic(1e-10,1.-1e-10))
|
||||
else:
|
||||
self.pi = Param('Pi', pi, __fixed__)
|
||||
self.link_parameter(self.pi)
|
||||
|
||||
|
||||
def KL_divergence(self, variational_posterior):
|
||||
mu = variational_posterior.mean
|
||||
S = variational_posterior.variance
|
||||
gamma,gamma1 = variational_posterior.gamma_probabilities()
|
||||
log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
|
||||
if len(self.pi.shape)==2:
|
||||
idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
|
||||
pi = self.pi[idx]
|
||||
else:
|
||||
pi = self.pi
|
||||
|
||||
var_mean = np.square(mu)/self.variance
|
||||
var_S = (S/self.variance - np.log(S))
|
||||
var_gamma = (gamma*(log_gamma-np.log(pi))).sum()+(gamma1*(log_gamma1-np.log(1-pi))).sum()
|
||||
return var_gamma+ (gamma* (np.log(self.variance)-1. +var_mean + var_S)).sum()/2.
|
||||
|
||||
def update_gradients_KL(self, variational_posterior):
|
||||
mu = variational_posterior.mean
|
||||
S = variational_posterior.variance
|
||||
gamma,gamma1 = variational_posterior.gamma_probabilities()
|
||||
log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
|
||||
if len(self.pi.shape)==2:
|
||||
idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
|
||||
pi = self.pi[idx]
|
||||
else:
|
||||
pi = self.pi
|
||||
|
||||
variational_posterior.binary_prob.gradient -= (np.log((1-pi)/pi)+log_gamma-log_gamma1+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.)*gamma*gamma1
|
||||
mu.gradient -= gamma*mu/self.variance
|
||||
S.gradient -= (1./self.variance - 1./S) * gamma /2.
|
||||
if self.learnPi:
|
||||
if len(self.pi)==1:
|
||||
self.pi.gradient = (gamma/self.pi - (1.-gamma)/(1.-self.pi)).sum()
|
||||
elif len(self.pi.shape)==1:
|
||||
self.pi.gradient = (gamma/self.pi - (1.-gamma)/(1.-self.pi)).sum(axis=0)
|
||||
else:
|
||||
self.pi[idx].gradient = (gamma/self.pi[idx] - (1.-gamma)/(1.-self.pi[idx]))
|
||||
|
||||
class VariationalPosterior(Parameterized):
|
||||
def __init__(self, means=None, variances=None, name='latent space', *a, **kw):
|
||||
super(VariationalPosterior, self).__init__(name=name, *a, **kw)
|
||||
self.mean = Param("mean", means)
|
||||
self.variance = Param("variance", variances, Logexp())
|
||||
self.ndim = self.mean.ndim
|
||||
self.shape = self.mean.shape
|
||||
self.num_data, self.input_dim = self.mean.shape
|
||||
self.link_parameters(self.mean, self.variance)
|
||||
self.num_data, self.input_dim = self.mean.shape
|
||||
if self.has_uncertain_inputs():
|
||||
assert self.variance.shape == self.mean.shape, "need one variance per sample and dimension"
|
||||
|
||||
def set_gradients(self, grad):
|
||||
self.mean.gradient, self.variance.gradient = grad
|
||||
|
||||
def _raveled_index(self):
|
||||
index = np.empty(dtype=int, shape=0)
|
||||
size = 0
|
||||
for p in self.parameters:
|
||||
index = np.hstack((index, p._raveled_index()+size))
|
||||
size += p._realsize_ if hasattr(p, '_realsize_') else p.size
|
||||
return index
|
||||
|
||||
def has_uncertain_inputs(self):
|
||||
return self.variance is not None
|
||||
|
||||
def __getitem__(self, s):
|
||||
if isinstance(s, (int, slice, tuple, list, np.ndarray)):
|
||||
import copy
|
||||
n = self.__new__(self.__class__, self.name)
|
||||
dc = self.__dict__.copy()
|
||||
dc['mean'] = self.mean[s]
|
||||
dc['variance'] = self.variance[s]
|
||||
dc['parameters'] = copy.copy(self.parameters)
|
||||
n.__dict__.update(dc)
|
||||
n.parameters[dc['mean']._parent_index_] = dc['mean']
|
||||
n.parameters[dc['variance']._parent_index_] = dc['variance']
|
||||
n._gradient_array_ = None
|
||||
oversize = self.size - self.mean.size - self.variance.size
|
||||
n.size = n.mean.size + n.variance.size + oversize
|
||||
n.ndim = n.mean.ndim
|
||||
n.shape = n.mean.shape
|
||||
n.num_data = n.mean.shape[0]
|
||||
n.input_dim = n.mean.shape[1] if n.ndim != 1 else 1
|
||||
return n
|
||||
else:
|
||||
return super(VariationalPosterior, self).__getitem__(s)
|
||||
|
||||
class NormalPosterior(VariationalPosterior):
|
||||
'''
|
||||
NormalPosterior distribution for variational approximations.
|
||||
|
||||
holds the means and variances for a factorizing multivariate normal distribution
|
||||
'''
|
||||
|
||||
def plot(self, *args):
|
||||
"""
|
||||
Plot latent space X in 1D:
|
||||
|
||||
See GPy.plotting.matplot_dep.variational_plots
|
||||
"""
|
||||
import sys
|
||||
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
|
||||
from ...plotting.matplot_dep import variational_plots
|
||||
import matplotlib
|
||||
return variational_plots.plot(self,*args)
|
||||
|
||||
class SpikeAndSlabPosterior(VariationalPosterior):
|
||||
'''
|
||||
The SpikeAndSlab distribution for variational approximations.
|
||||
'''
|
||||
def __init__(self, means, variances, binary_prob, name='latent space'):
|
||||
"""
|
||||
binary_prob : the probability of the distribution on the slab part.
|
||||
"""
|
||||
super(SpikeAndSlabPosterior, self).__init__(means, variances, name)
|
||||
self.gamma = Param("binary_prob",binary_prob)
|
||||
self.link_parameter(self.gamma)
|
||||
|
||||
@Cache_this(limit=5)
|
||||
def gamma_probabilities(self):
|
||||
prob = np.zeros_like(param_to_array(self.gamma))
|
||||
prob[self.gamma>-710] = 1./(1.+np.exp(-self.gamma[self.gamma>-710]))
|
||||
prob1 = -np.zeros_like(param_to_array(self.gamma))
|
||||
prob1[self.gamma<710] = 1./(1.+np.exp(self.gamma[self.gamma<710]))
|
||||
return prob, prob1
|
||||
|
||||
@Cache_this(limit=5)
|
||||
def gamma_log_prob(self):
|
||||
loggamma = param_to_array(self.gamma).copy()
|
||||
loggamma[loggamma>-40] = -np.log1p(np.exp(-loggamma[loggamma>-40]))
|
||||
loggamma1 = -param_to_array(self.gamma).copy()
|
||||
loggamma1[loggamma1>-40] = -np.log1p(np.exp(-loggamma1[loggamma1>-40]))
|
||||
return loggamma,loggamma1
|
||||
|
||||
def set_gradients(self, grad):
|
||||
self.mean.gradient, self.variance.gradient, self.gamma.gradient = grad
|
||||
|
||||
def __getitem__(self, s):
|
||||
if isinstance(s, (int, slice, tuple, list, np.ndarray)):
|
||||
import copy
|
||||
n = self.__new__(self.__class__, self.name)
|
||||
dc = self.__dict__.copy()
|
||||
dc['mean'] = self.mean[s]
|
||||
dc['variance'] = self.variance[s]
|
||||
dc['binary_prob'] = self.binary_prob[s]
|
||||
dc['parameters'] = copy.copy(self.parameters)
|
||||
n.__dict__.update(dc)
|
||||
n.parameters[dc['mean']._parent_index_] = dc['mean']
|
||||
n.parameters[dc['variance']._parent_index_] = dc['variance']
|
||||
n.parameters[dc['binary_prob']._parent_index_] = dc['binary_prob']
|
||||
n._gradient_array_ = None
|
||||
oversize = self.size - self.mean.size - self.variance.size
|
||||
n.size = n.mean.size + n.variance.size + oversize
|
||||
n.ndim = n.mean.ndim
|
||||
n.shape = n.mean.shape
|
||||
n.num_data = n.mean.shape[0]
|
||||
n.input_dim = n.mean.shape[1] if n.ndim != 1 else 1
|
||||
return n
|
||||
else:
|
||||
return super(SpikeAndSlabPosterior, self).__getitem__(s)
|
||||
|
||||
def plot(self, *args, **kwargs):
|
||||
"""
|
||||
Plot latent space X in 1D:
|
||||
|
||||
See GPy.plotting.matplot_dep.variational_plots
|
||||
"""
|
||||
import sys
|
||||
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
|
||||
from ...plotting.matplot_dep import variational_plots
|
||||
return variational_plots.plot_SpikeSlab(self,*args, **kwargs)
|
||||
|
|
@ -1,465 +0,0 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
import re
|
||||
import copy
|
||||
import cPickle
|
||||
import warnings
|
||||
import transformations
|
||||
|
||||
class Parameterized(object):
|
||||
def __init__(self):
|
||||
"""
|
||||
This is the base class for model and kernel. Mostly just handles tying and constraining of parameters
|
||||
"""
|
||||
self.tied_indices = []
|
||||
self.fixed_indices = []
|
||||
self.fixed_values = []
|
||||
self.constrained_indices = []
|
||||
self.constraints = []
|
||||
|
||||
def _get_params(self):
|
||||
raise NotImplementedError, "this needs to be implemented to use the Parameterized class"
|
||||
def _set_params(self, x):
|
||||
raise NotImplementedError, "this needs to be implemented to use the Parameterized class"
|
||||
|
||||
def _get_param_names(self):
|
||||
raise NotImplementedError, "this needs to be implemented to use the Parameterized class"
|
||||
#def _get_print_names(self):
|
||||
# """ Override for which names to print out, when using print m """
|
||||
# return self._get_param_names()
|
||||
|
||||
def pickle(self, filename, protocol=-1):
|
||||
with open(filename, 'wb') as f:
|
||||
cPickle.dump(self, f, protocol=protocol)
|
||||
|
||||
def copy(self):
|
||||
"""Returns a (deep) copy of the current model """
|
||||
return copy.deepcopy(self)
|
||||
|
||||
def __getstate__(self):
|
||||
if self._has_get_set_state():
|
||||
return self.getstate()
|
||||
return self.__dict__
|
||||
|
||||
def __setstate__(self, state):
|
||||
if self._has_get_set_state():
|
||||
self.setstate(state) # set state
|
||||
self._set_params(self._get_params()) # restore all values
|
||||
return
|
||||
self.__dict__ = state
|
||||
|
||||
def _has_get_set_state(self):
|
||||
return 'getstate' in vars(self.__class__) and 'setstate' in vars(self.__class__)
|
||||
|
||||
def getstate(self):
|
||||
"""
|
||||
Get the current state of the class,
|
||||
here just all the indices, rest can get recomputed
|
||||
For inheriting from Parameterized:
|
||||
|
||||
Always append the state of the inherited object
|
||||
and call down to the inherited object in setstate!!
|
||||
"""
|
||||
return [self.tied_indices,
|
||||
self.fixed_indices,
|
||||
self.fixed_values,
|
||||
self.constrained_indices,
|
||||
self.constraints]
|
||||
|
||||
def setstate(self, state):
|
||||
self.constraints = state.pop()
|
||||
self.constrained_indices = state.pop()
|
||||
self.fixed_values = state.pop()
|
||||
self.fixed_indices = state.pop()
|
||||
self.tied_indices = state.pop()
|
||||
|
||||
def __getitem__(self, regexp, return_names=False):
|
||||
"""
|
||||
Get a model parameter by name. The name is applied as a regular
|
||||
expression and all parameters that match that regular expression are
|
||||
returned.
|
||||
"""
|
||||
matches = self.grep_param_names(regexp)
|
||||
if len(matches):
|
||||
if return_names:
|
||||
return self._get_params()[matches], np.asarray(self._get_param_names())[matches].tolist()
|
||||
else:
|
||||
return self._get_params()[matches]
|
||||
else:
|
||||
raise AttributeError, "no parameter matches %s" % regexp
|
||||
|
||||
def __setitem__(self, name, val):
|
||||
"""
|
||||
Set model parameter(s) by name. The name is provided as a regular
|
||||
expression. All parameters matching that regular expression are set to
|
||||
the given value.
|
||||
"""
|
||||
matches = self.grep_param_names(name)
|
||||
if len(matches):
|
||||
val = np.array(val)
|
||||
assert (val.size == 1) or val.size == len(matches), "Shape mismatch: {}:({},)".format(val.size, len(matches))
|
||||
x = self._get_params()
|
||||
x[matches] = val
|
||||
self._set_params(x)
|
||||
else:
|
||||
raise AttributeError, "no parameter matches %s" % name
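# Editor's note, an illustrative aside (not part of the commit): with the regexp-based
# __getitem__/__setitem__ above, parameters of the old-style models were read and set
# by name; the parameter names used here are hypothetical.
#
#   m['rbf_lengthscale']        # values of every parameter matching the expression
#   m['rbf_.*'] = 1.0           # set all matching parameters to the same value
#   m['noise_variance'] = 0.01
#
# An expression that matches nothing raises AttributeError("no parameter matches ...").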
|
||||
|
||||
def tie_params(self, regexp):
|
||||
"""
|
||||
Tie (all!) parameters matching the regular expression `regexp`.
|
||||
"""
|
||||
matches = self.grep_param_names(regexp)
|
||||
assert matches.size > 0, "need at least something to tie together"
|
||||
if len(self.tied_indices):
|
||||
assert not np.any(matches[:, None] == np.hstack(self.tied_indices)), "Some indices are already tied!"
|
||||
self.tied_indices.append(matches)
|
||||
# TODO only one of the priors will be evaluated. Give a warning message if the priors are not identical
|
||||
if hasattr(self, 'prior'):
|
||||
pass
|
||||
|
||||
self._set_params_transformed(self._get_params_transformed()) # sets tied parameters to single value
|
||||
|
||||
def untie_everything(self):
|
||||
"""Unties all parameters by setting tied_indices to an empty list."""
|
||||
self.tied_indices = []
|
||||
|
||||
def grep_param_names(self, regexp, transformed=False, search=False):
|
||||
"""
|
||||
:param regexp: regular expression to select parameter names
|
||||
:type regexp: re | str | int
|
||||
:rtype: the indices of self._get_param_names which match the regular expression.
|
||||
|
||||
Note:-
|
||||
Other objects are passed through - i.e. integers which weren't meant for grepping
|
||||
"""
|
||||
|
||||
if transformed:
|
||||
names = self._get_param_names_transformed()
|
||||
else:
|
||||
names = self._get_param_names()
|
||||
|
||||
if type(regexp) in [str, np.string_, np.str]:
|
||||
regexp = re.compile(regexp)
|
||||
elif type(regexp) is re._pattern_type:
|
||||
pass
|
||||
else:
|
||||
return regexp
|
||||
if search:
|
||||
return np.nonzero([regexp.search(name) for name in names])[0]
|
||||
else:
|
||||
return np.nonzero([regexp.match(name) for name in names])[0]
|
||||
|
||||
def num_params_transformed(self):
|
||||
removed = 0
|
||||
for tie in self.tied_indices:
|
||||
removed += tie.size - 1
|
||||
|
||||
for fix in self.fixed_indices:
|
||||
removed += fix.size
|
||||
|
||||
return len(self._get_params()) - removed
|
||||
|
||||
def unconstrain(self, regexp):
|
||||
"""Unconstrain matching parameters. Does not untie parameters"""
|
||||
matches = self.grep_param_names(regexp)
|
||||
|
||||
# transformed constraints:
|
||||
for match in matches:
|
||||
self.constrained_indices = [i[i <> match] for i in self.constrained_indices]
|
||||
|
||||
# remove empty constraints
|
||||
tmp = zip(*[(i, t) for i, t in zip(self.constrained_indices, self.constraints) if len(i)])
|
||||
if tmp:
|
||||
self.constrained_indices, self.constraints = zip(*[(i, t) for i, t in zip(self.constrained_indices, self.constraints) if len(i)])
|
||||
self.constrained_indices, self.constraints = list(self.constrained_indices), list(self.constraints)
|
||||
|
||||
# fixed:
|
||||
self.fixed_values = [np.delete(values, np.nonzero(np.sum(indices[:, None] == matches[None, :], 1))[0]) for indices, values in zip(self.fixed_indices, self.fixed_values)]
|
||||
self.fixed_indices = [np.delete(indices, np.nonzero(np.sum(indices[:, None] == matches[None, :], 1))[0]) for indices in self.fixed_indices]
|
||||
|
||||
# remove empty elements
|
||||
tmp = [(i, v) for i, v in zip(self.fixed_indices, self.fixed_values) if len(i)]
|
||||
if tmp:
|
||||
self.fixed_indices, self.fixed_values = zip(*tmp)
|
||||
self.fixed_indices, self.fixed_values = list(self.fixed_indices), list(self.fixed_values)
|
||||
else:
|
||||
self.fixed_indices, self.fixed_values = [], []
|
||||
|
||||
def constrain_negative(self, regexp, warning=True):
|
||||
""" Set negative constraints. """
|
||||
self.constrain(regexp, transformations.negative_logexp(), warning=warning)
|
||||
|
||||
def constrain_positive(self, regexp, warning=True):
|
||||
""" Set positive constraints. """
|
||||
self.constrain(regexp, transformations.logexp(), warning=warning)
|
||||
|
||||
def constrain_bounded(self, regexp, lower, upper, warning=True):
|
||||
""" Set bounded constraints. """
|
||||
self.constrain(regexp, transformations.logistic(lower, upper), warning=warning)
|
||||
|
||||
def all_constrained_indices(self):
|
||||
if len(self.constrained_indices) or len(self.fixed_indices):
|
||||
return np.hstack(self.constrained_indices + self.fixed_indices)
|
||||
else:
|
||||
return np.empty(shape=(0,))
|
||||
|
||||
def constrain(self, regexp, transform, warning=True):
|
||||
assert isinstance(transform, transformations.transformation)
|
||||
|
||||
matches = self.grep_param_names(regexp)
|
||||
if warning:
|
||||
overlap = set(matches).intersection(set(self.all_constrained_indices()))
|
||||
if overlap:
|
||||
self.unconstrain(np.asarray(list(overlap)))
|
||||
print 'Warning: re-constraining these parameters'
|
||||
pn = self._get_param_names()
|
||||
for i in overlap:
|
||||
print pn[i]
|
||||
|
||||
self.constrained_indices.append(matches)
|
||||
self.constraints.append(transform)
|
||||
x = self._get_params()
|
||||
x[matches] = transform.initialize(x[matches])
|
||||
self._set_params(x)
|
||||
|
||||
def constrain_fixed(self, regexp, value=None, warning=True):
|
||||
"""
|
||||
|
||||
:param regexp: which parameters need to be fixed.
|
||||
:type regexp: ndarray(dtype=int) or regular expression object or string
|
||||
:param value: the value to fix the parameters to. If the value is not specified,
|
||||
the parameter is fixed to the current value
|
||||
:type value: float
|
||||
|
||||
**Notes**
|
||||
|
||||
Fixing a parameter which is tied to another, or constrained in some way will result in an error.
|
||||
|
||||
To fix multiple parameters to the same value, simply pass a regular expression which matches both parameter names, or pass both of the indexes.
|
||||
|
||||
"""
|
||||
matches = self.grep_param_names(regexp)
|
||||
if warning:
|
||||
overlap = set(matches).intersection(set(self.all_constrained_indices()))
|
||||
if overlap:
|
||||
self.unconstrain(np.asarray(list(overlap)))
|
||||
print 'Warning: re-constraining these parameters'
|
||||
pn = self._get_param_names()
|
||||
for i in overlap:
|
||||
print pn[i]
|
||||
|
||||
self.fixed_indices.append(matches)
|
||||
if value != None:
|
||||
self.fixed_values.append(value)
|
||||
else:
|
||||
self.fixed_values.append(self._get_params()[self.fixed_indices[-1]])
|
||||
|
||||
# self.fixed_values.append(value)
|
||||
self._set_params_transformed(self._get_params_transformed())
|
||||
|
||||
def _get_params_transformed(self):
|
||||
"""use self._get_params to get the 'true' parameters of the model, which are then tied, constrained and fixed"""
|
||||
x = self._get_params()
|
||||
[np.put(x, i, t.finv(x[i])) for i, t in zip(self.constrained_indices, self.constraints)]
|
||||
|
||||
to_remove = self.fixed_indices + [t[1:] for t in self.tied_indices]
|
||||
if len(to_remove):
|
||||
return np.delete(x, np.hstack(to_remove))
|
||||
else:
|
||||
return x
|
||||
|
||||
def _set_params_transformed(self, x):
|
||||
""" takes the vector x, which is then modified (by untying, reparameterising or inserting fixed values), and then call self._set_params"""
|
||||
self._set_params(self._untransform_params(x))
|
||||
|
||||
def _untransform_params(self, x):
|
||||
"""
|
||||
The transformation required for _set_params_transformed.
|
||||
|
||||
This moves the vector x seen by the optimiser (unconstrained) to the
|
||||
valid parameter vector seen by the model
|
||||
|
||||
Note:
|
||||
- This function is separate from _set_params_transformed for downstream flexibility
|
||||
"""
|
||||
# work out how many places are fixed, and where they are. tricky logic!
|
||||
fix_places = self.fixed_indices + [t[1:] for t in self.tied_indices]
|
||||
if len(fix_places):
|
||||
fix_places = np.hstack(fix_places)
|
||||
Nfix_places = fix_places.size
|
||||
else:
|
||||
Nfix_places = 0
|
||||
|
||||
free_places = np.setdiff1d(np.arange(Nfix_places + x.size, dtype=np.int), fix_places)
|
||||
|
||||
# put the models values in the vector xx
|
||||
xx = np.zeros(Nfix_places + free_places.size, dtype=np.float64)
|
||||
|
||||
xx[free_places] = x
|
||||
[np.put(xx, i, v) for i, v in zip(self.fixed_indices, self.fixed_values)]
|
||||
[np.put(xx, i, v) for i, v in [(t[1:], xx[t[0]]) for t in self.tied_indices] ]
|
||||
|
||||
[np.put(xx, i, t.f(xx[i])) for i, t in zip(self.constrained_indices, self.constraints)]
|
||||
if hasattr(self, 'debug'):
|
||||
stop # @UndefinedVariable
|
||||
|
||||
return xx
|
||||
|
||||
def _get_param_names_transformed(self):
|
||||
"""
|
||||
Returns the parameter names as propagated after constraining,
|
||||
tying or fixing, i.e. a list of the same length as _get_params_transformed()
|
||||
"""
|
||||
n = self._get_param_names()
|
||||
|
||||
# remove/concatenate the tied parameter names
|
||||
if len(self.tied_indices):
|
||||
for t in self.tied_indices:
|
||||
n[t[0]] = "<tie>".join([n[tt] for tt in t])
|
||||
remove = np.hstack([t[1:] for t in self.tied_indices])
|
||||
else:
|
||||
remove = np.empty(shape=(0,), dtype=np.int)
|
||||
|
||||
# also remove the fixed params
|
||||
if len(self.fixed_indices):
|
||||
remove = np.hstack((remove, np.hstack(self.fixed_indices)))
|
||||
|
||||
# add markers to show that some variables are constrained
|
||||
for i, t in zip(self.constrained_indices, self.constraints):
|
||||
for ii in i:
|
||||
n[ii] = n[ii] + t.__str__()
|
||||
|
||||
n = [nn for i, nn in enumerate(n) if not i in remove]
|
||||
return n
|
||||
|
||||
#@property
|
||||
#def all(self):
|
||||
# return self.__str__(self._get_param_names())
|
||||
|
||||
|
||||
#def __str__(self, names=None, nw=30):
|
||||
def __str__(self, nw=30):
|
||||
"""
|
||||
Return a string describing the parameter names and their ties and constraints
|
||||
"""
|
||||
names = self._get_param_names()
|
||||
#if names is None:
|
||||
# names = self._get_print_names()
|
||||
#name_indices = self.grep_param_names("|".join(names))
|
||||
N = len(names)
|
||||
|
||||
if not N:
|
||||
return "This object has no free parameters."
|
||||
header = ['Name', 'Value', 'Constraints', 'Ties']
|
||||
values = self._get_params() # map(str,self._get_params())
|
||||
#values = self._get_params()[name_indices] # map(str,self._get_params())
|
||||
# sort out the constraints
|
||||
constraints = [''] * len(names)
|
||||
#constraints = [''] * len(self._get_param_names())
|
||||
for i, t in zip(self.constrained_indices, self.constraints):
|
||||
for ii in i:
|
||||
constraints[ii] = t.__str__()
|
||||
for i in self.fixed_indices:
|
||||
for ii in i:
|
||||
constraints[ii] = 'Fixed'
|
||||
# sort out the ties
|
||||
ties = [''] * len(names)
|
||||
for i, tie in enumerate(self.tied_indices):
|
||||
for j in tie:
|
||||
ties[j] = '(' + str(i) + ')'
|
||||
|
||||
if values.size == 1:
|
||||
values = ['%.4f' %float(values)]
|
||||
else:
|
||||
values = ['%.4f' % float(v) for v in values]
|
||||
max_names = max([len(names[i]) for i in range(len(names))] + [len(header[0])])
|
||||
max_values = max([len(values[i]) for i in range(len(values))] + [len(header[1])])
|
||||
max_constraint = max([len(constraints[i]) for i in range(len(constraints))] + [len(header[2])])
|
||||
max_ties = max([len(ties[i]) for i in range(len(ties))] + [len(header[3])])
|
||||
cols = np.array([max_names, max_values, max_constraint, max_ties]) + 4
|
||||
# columns = cols.sum()
|
||||
|
||||
header_string = ["{h:^{col}}".format(h=header[i], col=cols[i]) for i in range(len(cols))]
|
||||
header_string = map(lambda x: '|'.join(x), [header_string])
|
||||
separator = '-' * len(header_string[0])
|
||||
param_string = ["{n:^{c0}}|{v:^{c1}}|{c:^{c2}}|{t:^{c3}}".format(n=names[i], v=values[i], c=constraints[i], t=ties[i], c0=cols[0], c1=cols[1], c2=cols[2], c3=cols[3]) for i in range(len(values))]
|
||||
|
||||
|
||||
return ('\n'.join([header_string[0], separator] + param_string)) + '\n'
|
||||
|
||||
def grep_model(self,regexp):
|
||||
regexp_indices = self.grep_param_names(regexp)
|
||||
all_names = self._get_param_names()
|
||||
|
||||
names = [all_names[pj] for pj in regexp_indices]
|
||||
N = len(names)
|
||||
|
||||
if not N:
|
||||
return "Match not found."
|
||||
|
||||
header = ['Name', 'Value', 'Constraints', 'Ties']
|
||||
all_values = self._get_params()
|
||||
values = np.array([all_values[pj] for pj in regexp_indices])
|
||||
constraints = [''] * len(names)
|
||||
|
||||
_constrained_indices,aux = self._pick_elements(regexp_indices,self.constrained_indices)
|
||||
_constraints = [self.constraints[pj] for pj in aux]
|
||||
|
||||
for i, t in zip(_constrained_indices, _constraints):
|
||||
for ii in i:
|
||||
iii = regexp_indices.tolist().index(ii)
|
||||
constraints[iii] = t.__str__()
|
||||
|
||||
_fixed_indices,aux = self._pick_elements(regexp_indices,self.fixed_indices)
|
||||
for i in _fixed_indices:
|
||||
for ii in i:
|
||||
iii = regexp_indices.tolist().index(ii)
|
||||
constraints[ii] = 'Fixed'
|
||||
|
||||
_tied_indices,aux = self._pick_elements(regexp_indices,self.tied_indices)
|
||||
ties = [''] * len(names)
|
||||
for i,ti in zip(_tied_indices,aux):
|
||||
for ii in i:
|
||||
iii = regexp_indices.tolist().index(ii)
|
||||
ties[iii] = '(' + str(ti) + ')'
|
||||
|
||||
if values.size == 1:
|
||||
values = ['%.4f' %float(values)]
|
||||
else:
|
||||
values = ['%.4f' % float(v) for v in values]
|
||||
|
||||
max_names = max([len(names[i]) for i in range(len(names))] + [len(header[0])])
|
||||
max_values = max([len(values[i]) for i in range(len(values))] + [len(header[1])])
|
||||
max_constraint = max([len(constraints[i]) for i in range(len(constraints))] + [len(header[2])])
|
||||
max_ties = max([len(ties[i]) for i in range(len(ties))] + [len(header[3])])
|
||||
cols = np.array([max_names, max_values, max_constraint, max_ties]) + 4
|
||||
|
||||
header_string = ["{h:^{col}}".format(h=header[i], col=cols[i]) for i in range(len(cols))]
|
||||
header_string = map(lambda x: '|'.join(x), [header_string])
|
||||
separator = '-' * len(header_string[0])
|
||||
param_string = ["{n:^{c0}}|{v:^{c1}}|{c:^{c2}}|{t:^{c3}}".format(n=names[i], v=values[i], c=constraints[i], t=ties[i], c0=cols[0], c1=cols[1], c2=cols[2], c3=cols[3]) for i in range(len(values))]
|
||||
|
||||
print header_string[0]
|
||||
print separator
|
||||
for string in param_string:
|
||||
print string
|
||||
|
||||
def _pick_elements(self,regexp_ind,array_list):
|
||||
"""Removes from array_list the elements different from regexp_ind"""
|
||||
new_array_list = [] #New list with elements matching regexp_ind
|
||||
array_indices = [] #Indices that matches the arrays in new_array_list and array_list
|
||||
|
||||
array_index = 0
|
||||
for array in array_list:
|
||||
_new = []
|
||||
for ai in array:
|
||||
if ai in regexp_ind:
|
||||
_new.append(ai)
|
||||
if len(_new):
|
||||
new_array_list.append(np.array(_new))
|
||||
array_indices.append(array_index)
|
||||
array_index += 1
|
||||
return new_array_list, array_indices
|
||||
|
|
@ -1,217 +0,0 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
import pylab as pb
|
||||
from scipy.special import gammaln, digamma
|
||||
from ..util.linalg import pdinv
|
||||
from GPy.core.domains import REAL, POSITIVE
|
||||
import warnings
|
||||
|
||||
class Prior:
|
||||
domain = None
|
||||
def pdf(self, x):
|
||||
return np.exp(self.lnpdf(x))
|
||||
|
||||
def plot(self):
|
||||
rvs = self.rvs(1000)
|
||||
pb.hist(rvs, 100, normed=True)
|
||||
xmin, xmax = pb.xlim()
|
||||
xx = np.linspace(xmin, xmax, 1000)
|
||||
pb.plot(xx, self.pdf(xx), 'r', linewidth=2)
|
||||
|
||||
|
||||
class Gaussian(Prior):
|
||||
"""
|
||||
Implementation of the univariate Gaussian probability function, coupled with random variables.
|
||||
|
||||
:param mu: mean
|
||||
:param sigma: standard deviation
|
||||
|
||||
.. Note:: Bishop 2006 notation is used throughout the code
|
||||
|
||||
"""
|
||||
domain = REAL
|
||||
def __init__(self, mu, sigma):
|
||||
self.mu = float(mu)
|
||||
self.sigma = float(sigma)
|
||||
self.sigma2 = np.square(self.sigma)
|
||||
self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)
|
||||
|
||||
def __str__(self):
|
||||
return "N(" + str(np.round(self.mu)) + ', ' + str(np.round(self.sigma2)) + ')'
|
||||
|
||||
def lnpdf(self, x):
|
||||
return self.constant - 0.5 * np.square(x - self.mu) / self.sigma2
|
||||
|
||||
def lnpdf_grad(self, x):
|
||||
return -(x - self.mu) / self.sigma2
|
||||
|
||||
def rvs(self, n):
|
||||
return np.random.randn(n) * self.sigma + self.mu
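# Editor's sketch (not part of the commit), assuming scipy is available: a quick check
# that the closed form used by Gaussian.lnpdf above, constant - 0.5*(x-mu)**2/sigma2
# with constant = -0.5*log(2*pi*sigma2), matches scipy's normal log-density.
import numpy as np
from scipy.stats import norm

mu, sigma = 1.0, 2.0
x = np.linspace(-5., 5., 11)
constant = -0.5 * np.log(2 * np.pi * sigma ** 2)
assert np.allclose(constant - 0.5 * (x - mu) ** 2 / sigma ** 2,
                   norm.logpdf(x, loc=mu, scale=sigma))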
|
||||
|
||||
|
||||
class LogGaussian(Prior):
|
||||
"""
|
||||
Implementation of the univariate *log*-Gaussian probability function, coupled with random variables.
|
||||
|
||||
:param mu: mean
|
||||
:param sigma: standard deviation
|
||||
|
||||
.. Note:: Bishop 2006 notation is used throughout the code
|
||||
|
||||
"""
|
||||
domain = POSITIVE
|
||||
def __init__(self, mu, sigma):
|
||||
self.mu = float(mu)
|
||||
self.sigma = float(sigma)
|
||||
self.sigma2 = np.square(self.sigma)
|
||||
self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)
|
||||
|
||||
def __str__(self):
|
||||
return "lnN(" + str(np.round(self.mu)) + ', ' + str(np.round(self.sigma2)) + ')'
|
||||
|
||||
def lnpdf(self, x):
|
||||
return self.constant - 0.5 * np.square(np.log(x) - self.mu) / self.sigma2 - np.log(x)
|
||||
|
||||
def lnpdf_grad(self, x):
|
||||
return -((np.log(x) - self.mu) / self.sigma2 + 1.) / x
|
||||
|
||||
def rvs(self, n):
|
||||
return np.exp(np.random.randn(n) * self.sigma + self.mu)
|
||||
|
||||
|
||||
class MultivariateGaussian:
|
||||
"""
|
||||
Implementation of the multivariate Gaussian probability function, coupled with random variables.
|
||||
|
||||
:param mu: mean (N-dimensional array)
|
||||
:param var: covariance matrix (NxN)
|
||||
|
||||
.. Note:: Bishop 2006 notation is used throughout the code
|
||||
|
||||
"""
|
||||
domain = REAL
|
||||
def __init__(self, mu, var):
|
||||
self.mu = np.array(mu).flatten()
|
||||
self.var = np.array(var)
|
||||
assert len(self.var.shape) == 2
|
||||
assert self.var.shape[0] == self.var.shape[1]
|
||||
assert self.var.shape[0] == self.mu.size
|
||||
self.input_dim = self.mu.size
|
||||
self.inv, self.hld = pdinv(self.var)
|
||||
self.constant = -0.5 * self.input_dim * np.log(2 * np.pi) - self.hld
|
||||
|
||||
def summary(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def pdf(self, x):
|
||||
return np.exp(self.lnpdf(x))
|
||||
|
||||
def lnpdf(self, x):
|
||||
d = x - self.mu
|
||||
return self.constant - 0.5 * np.sum(d * np.dot(d, self.inv), 1)
|
||||
|
||||
def lnpdf_grad(self, x):
|
||||
d = x - self.mu
|
||||
return -np.dot(self.inv, d)
|
||||
|
||||
def rvs(self, n):
|
||||
return np.random.multivariate_normal(self.mu, self.var, n)
|
||||
|
||||
def plot(self):
|
||||
if self.input_dim == 2:
|
||||
rvs = self.rvs(200)
|
||||
pb.plot(rvs[:, 0], rvs[:, 1], 'kx', mew=1.5)
|
||||
xmin, xmax = pb.xlim()
|
||||
ymin, ymax = pb.ylim()
|
||||
xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
|
||||
xflat = np.vstack((xx.flatten(), yy.flatten())).T
|
||||
zz = self.pdf(xflat).reshape(100, 100)
|
||||
pb.contour(xx, yy, zz, linewidths=2)
|
||||
|
||||
|
||||
def gamma_from_EV(E, V):
|
||||
warnings.warn("use Gamma.from_EV to create Gamma Prior", FutureWarning)
|
||||
return Gamma.from_EV(E, V)
|
||||
|
||||
|
||||
class Gamma(Prior):
|
||||
"""
|
||||
Implementation of the Gamma probability function, coupled with random variables.
|
||||
|
||||
:param a: shape parameter
|
||||
:param b: rate parameter (warning: it's the *inverse* of the scale)
|
||||
|
||||
.. Note:: Bishop 2006 notation is used throughout the code
|
||||
|
||||
"""
|
||||
domain = POSITIVE
|
||||
def __init__(self, a, b):
|
||||
self.a = float(a)
|
||||
self.b = float(b)
|
||||
self.constant = -gammaln(self.a) + a * np.log(b)
|
||||
|
||||
def __str__(self):
|
||||
return "Ga(" + str(np.round(self.a)) + ', ' + str(np.round(self.b)) + ')'
|
||||
|
||||
def summary(self):
|
||||
ret = {"E[x]": self.a / self.b, \
|
||||
"E[ln x]": digamma(self.a) - np.log(self.b), \
|
||||
"var[x]": self.a / self.b / self.b, \
|
||||
"Entropy": gammaln(self.a) - (self.a - 1.) * digamma(self.a) - np.log(self.b) + self.a}
|
||||
if self.a > 1:
|
||||
ret['Mode'] = (self.a - 1.) / self.b
|
||||
else:
|
||||
ret['Mode'] = np.nan
|
||||
return ret
|
||||
|
||||
def lnpdf(self, x):
|
||||
return self.constant + (self.a - 1) * np.log(x) - self.b * x
|
||||
|
||||
def lnpdf_grad(self, x):
|
||||
return (self.a - 1.) / x - self.b
|
||||
|
||||
def rvs(self, n):
|
||||
return np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
|
||||
@staticmethod
|
||||
def from_EV(E, V):
|
||||
"""
|
||||
Creates an instance of a Gamma Prior by specifying the Expected value(s)
|
||||
and Variance(s) of the distribution.
|
||||
|
||||
:param E: expected value
|
||||
:param V: variance
|
||||
"""
|
||||
a = np.square(E) / V
|
||||
b = E / V
|
||||
return Gamma(a, b)
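# Editor's note (not part of the commit): a worked example of the moment matching in
# from_EV above. A Gamma(a, b) with rate b has mean a/b and variance a/b**2, so
# a = E**2/V and b = E/V. For E = 1.0, V = 0.5 this gives a = 2.0, b = 2.0, and
# indeed Gamma(2, 2) has mean 2/2 = 1.0 and variance 2/2**2 = 0.5.
#
#   prior = Gamma.from_EV(1.0, 0.5)     # equivalent to Gamma(2.0, 2.0)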
|
||||
|
||||
class inverse_gamma(Prior):
|
||||
"""
|
||||
Implementation of the inverse-Gamma probability function, coupled with random variables.
|
||||
|
||||
:param a: shape parameter
|
||||
:param b: rate parameter (warning: it's the *inverse* of the scale)
|
||||
|
||||
.. Note:: Bishop 2006 notation is used throughout the code
|
||||
|
||||
"""
|
||||
domain = POSITIVE
|
||||
def __init__(self, a, b):
|
||||
self.a = float(a)
|
||||
self.b = float(b)
|
||||
self.constant = -gammaln(self.a) + a * np.log(b)
|
||||
|
||||
def __str__(self):
|
||||
return "iGa(" + str(np.round(self.a)) + ', ' + str(np.round(self.b)) + ')'
|
||||
|
||||
def lnpdf(self, x):
|
||||
return self.constant - (self.a + 1) * np.log(x) - self.b / x
|
||||
|
||||
def lnpdf_grad(self, x):
|
||||
return -(self.a + 1.) / x + self.b / x ** 2
|
||||
|
||||
def rvs(self, n):
|
||||
return 1. / np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
|
||||
|
|
@ -1,16 +1,28 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
import pylab as pb
|
||||
from ..util.linalg import mdot, jitchol, tdot, symmetrify, backsub_both_sides, chol_inv, dtrtrs, dpotrs, dpotri
|
||||
from scipy import linalg
|
||||
from ..likelihoods import Gaussian, EP,EP_Mixed_Noise
|
||||
from gp_base import GPBase
|
||||
from gp import GP
|
||||
from parameterization.param import Param
|
||||
from ..inference.latent_function_inference import var_dtc
|
||||
from .. import likelihoods
|
||||
from parameterization.variational import VariationalPosterior
|
||||
|
||||
class SparseGP(GPBase):
|
||||
import logging
|
||||
from GPy.inference.latent_function_inference.posterior import Posterior
|
||||
from GPy.inference.optimization.stochastics import SparseGPStochastics,\
|
||||
SparseGPMissing
|
||||
#no stochastics.py file added! from GPy.inference.optimization.stochastics import SparseGPStochastics,\
|
||||
#SparseGPMissing
|
||||
logger = logging.getLogger("sparse gp")
|
||||
|
||||
class SparseGP(GP):
|
||||
"""
|
||||
Variational sparse GP model
|
||||
A general purpose Sparse GP model
|
||||
|
||||
This model allows (approximate) inference using variational DTC or FITC
|
||||
(Gaussian likelihoods) as well as non-conjugate sparse methods based on
|
||||
these.
|
||||
|
||||
:param X: inputs
|
||||
:type X: np.ndarray (num_data x input_dim)
|
||||
|
|
@ -20,478 +32,101 @@ class SparseGP(GPBase):
|
|||
:type kernel: a GPy.kern.kern instance
|
||||
:param X_variance: The uncertainty in the measurements of X (Gaussian variance)
|
||||
:type X_variance: np.ndarray (num_data x input_dim) | None
|
||||
:param Z: inducing inputs (optional, see note)
|
||||
:type Z: np.ndarray (num_inducing x input_dim) | None
|
||||
:param Z: inducing inputs
|
||||
:type Z: np.ndarray (num_inducing x input_dim)
|
||||
:param num_inducing: Number of inducing points (optional, default 10. Ignored if Z is not None)
|
||||
:type num_inducing: int
|
||||
:param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales)
|
||||
:type normalize_(X|Y): bool
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False):
|
||||
GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
|
||||
def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
|
||||
name='sparse gp', Y_metadata=None, normalizer=False):
|
||||
#pick a sensible inference method
|
||||
if inference_method is None:
|
||||
if isinstance(likelihood, likelihoods.Gaussian):
|
||||
inference_method = var_dtc.VarDTC(limit=1 if not self.missing_data else Y.shape[1])
|
||||
else:
|
||||
#inference_method = ??
|
||||
raise NotImplementedError, "what to do what to do?"
|
||||
print "defaulting to ", inference_method, "for latent function inference"
|
||||
|
||||
self.Z = Z
|
||||
self.Z = Param('inducing inputs', Z)
|
||||
self.num_inducing = Z.shape[0]
|
||||
self.backsub = 0
|
||||
|
||||
if X_variance is None:
|
||||
self.has_uncertain_inputs = False
|
||||
self.X_variance = None
|
||||
GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
|
||||
|
||||
logger.info("Adding Z as parameter")
|
||||
self.link_parameter(self.Z, index=0)
|
||||
self.posterior = None
|
||||
|
||||
def has_uncertain_inputs(self):
|
||||
return isinstance(self.X, VariationalPosterior)
|
||||
|
||||
def parameters_changed(self):
|
||||
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata)
|
||||
|
||||
self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
|
||||
|
||||
if isinstance(self.X, VariationalPosterior):
|
||||
#gradients wrt kernel
|
||||
dL_dKmm = self.grad_dict['dL_dKmm']
|
||||
self.kern.update_gradients_full(dL_dKmm, self.Z, None)
|
||||
kerngrad = self.kern.gradient.copy()
|
||||
self.kern.update_gradients_expectations(variational_posterior=self.X,
|
||||
Z=self.Z,
|
||||
dL_dpsi0=self.grad_dict['dL_dpsi0'],
|
||||
dL_dpsi1=self.grad_dict['dL_dpsi1'],
|
||||
dL_dpsi2=self.grad_dict['dL_dpsi2'])
|
||||
self.kern.gradient += kerngrad
|
||||
|
||||
#gradients wrt Z
|
||||
self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z)
|
||||
self.Z.gradient += self.kern.gradients_Z_expectations(
|
||||
self.grad_dict['dL_dpsi0'],
|
||||
self.grad_dict['dL_dpsi1'],
|
||||
self.grad_dict['dL_dpsi2'],
|
||||
Z=self.Z,
|
||||
variational_posterior=self.X)
|
||||
else:
|
||||
assert X_variance.shape == X.shape
|
||||
self.has_uncertain_inputs = True
|
||||
self.X_variance = X_variance
|
||||
|
||||
if normalize_X:
|
||||
self.Z = (self.Z.copy() - self._Xoffset) / self._Xscale
|
||||
|
||||
# normalize X uncertainty also
|
||||
if self.has_uncertain_inputs:
|
||||
self.X_variance /= np.square(self._Xscale)
|
||||
|
||||
self._const_jitter = None
|
||||
|
||||
def _compute_kernel_matrices(self):
|
||||
# kernel computations, using BGPLVM notation
|
||||
self.Kmm = self.kern.K(self.Z)
|
||||
if self.has_uncertain_inputs:
|
||||
self.psi0 = self.kern.psi0(self.Z, self.X, self.X_variance)
|
||||
self.psi1 = self.kern.psi1(self.Z, self.X, self.X_variance)
|
||||
self.psi2 = self.kern.psi2(self.Z, self.X, self.X_variance)
|
||||
else:
|
||||
self.psi0 = self.kern.Kdiag(self.X)
|
||||
self.psi1 = self.kern.K(self.X, self.Z)
|
||||
self.psi2 = None
|
||||
|
||||
def _computations(self):
|
||||
if self._const_jitter is None or not(self._const_jitter.shape[0] == self.num_inducing):
|
||||
self._const_jitter = np.eye(self.num_inducing) * 1e-7
|
||||
|
||||
# factor Kmm
|
||||
self._Lm = jitchol(self.Kmm + self._const_jitter)
|
||||
if not self.backsub:
|
||||
self._LmInv = linalg.lapack.dtrtri(self._Lm, lower=1)[0] # TODO: not needed in old version
|
||||
|
||||
# The rather complex computations of self._A
|
||||
if self.has_uncertain_inputs:
|
||||
if self.likelihood.is_heteroscedastic:
|
||||
psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.num_data, 1, 1))).sum(0)
|
||||
else:
|
||||
psi2_beta = self.psi2.sum(0) * self.likelihood.precision
|
||||
if self.backsub:
|
||||
evals, evecs = linalg.eigh(psi2_beta)
|
||||
clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
|
||||
if not np.array_equal(evals, clipped_evals):
|
||||
pass # print evals
|
||||
tmp = evecs * np.sqrt(clipped_evals)
|
||||
tmp = tmp.T
|
||||
tmp, _ = dtrtrs(self._Lm, np.asfortranarray(tmp.T), lower=1)
|
||||
self._A = tdot(tmp)
|
||||
else:
|
||||
self._A = np.dot(np.dot(self._LmInv,
|
||||
psi2_beta),
|
||||
self._LmInv.T)
|
||||
else:
|
||||
if self.likelihood.is_heteroscedastic:
|
||||
tmp = self.psi1 * (np.sqrt(self.likelihood.precision.flatten().reshape(self.num_data, 1)))
|
||||
else:
|
||||
tmp = self.psi1 * (np.sqrt(self.likelihood.precision))
|
||||
tmp, _ = dtrtrs(self._Lm, np.asfortranarray(tmp.T), lower=1)
|
||||
self._A = tdot(tmp)
|
||||
|
||||
# factor B
|
||||
self.B = np.eye(self.num_inducing) + self._A
|
||||
self.LB = jitchol(self.B)
|
||||
|
||||
# VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
|
||||
self.psi1Vf = np.dot(self.psi1.T, self.likelihood.VVT_factor)
|
||||
|
||||
if 1:#self.backsub:
|
||||
# back substitute C into psi1Vf
|
||||
tmp, info1 = dtrtrs(self._Lm, np.asfortranarray(self.psi1Vf), lower=1, trans=0)
|
||||
self._LBi_Lmi_psi1Vf, _ = dtrtrs(self.LB, np.asfortranarray(tmp), lower=1, trans=0)
|
||||
# tmp, info2 = dpotrs(self.LB, tmp, lower=1)
|
||||
tmp, info2 = dtrtrs(self.LB, self._LBi_Lmi_psi1Vf, lower=1, trans=1)
|
||||
self.Cpsi1Vf, info3 = dtrtrs(self._Lm, tmp, lower=1, trans=1)
|
||||
else:
|
||||
# slower, but more stable (?) version:
|
||||
tmp = np.dot(self._LmInv, self.psi1Vf)
|
||||
self._LBInv = linalg.lapack.dtrtri(self.LB, lower=True)[0]
|
||||
self._LBi_Lmi_psi1Vf = np.dot(self._LBInv, tmp)
|
||||
tmp = np.dot(self._LBInv.T, self._LBi_Lmi_psi1Vf)
|
||||
self.Cpsi1Vf = np.dot(self._LmInv.T, tmp)
|
||||
|
||||
#import ipdb;ipdb.set_trace()
|
||||
|
||||
# Compute dL_dKmm
|
||||
tmp = tdot(self._LBi_Lmi_psi1Vf)
|
||||
self.data_fit = np.trace(tmp)
|
||||
self.DBi_plus_BiPBi = backsub_both_sides(self.LB, self.output_dim * np.eye(self.num_inducing) + tmp)
|
||||
tmp = -0.5 * self.DBi_plus_BiPBi
|
||||
tmp += -0.5 * self.B * self.output_dim
|
||||
tmp += self.output_dim * np.eye(self.num_inducing)
|
||||
self.dL_dKmm = backsub_both_sides(self._Lm, tmp)
|
||||
|
||||
# Compute dL_dpsi # FIXME: this is untested for the heteroscedastic + uncertain inputs case
|
||||
self.dL_dpsi0 = -0.5 * self.output_dim * (self.likelihood.precision * np.ones([self.num_data, 1])).flatten()
|
||||
self.dL_dpsi1 = np.dot(self.likelihood.VVT_factor, self.Cpsi1Vf.T)
|
||||
dL_dpsi2_beta = 0.5 * backsub_both_sides(self._Lm, self.output_dim * np.eye(self.num_inducing) - self.DBi_plus_BiPBi)
|
||||
|
||||
if self.likelihood.is_heteroscedastic:
|
||||
|
||||
if self.has_uncertain_inputs:
|
||||
self.dL_dpsi2 = self.likelihood.precision.flatten()[:, None, None] * dL_dpsi2_beta[None, :, :]
|
||||
else:
|
||||
self.dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (self.psi1 * self.likelihood.precision.reshape(self.num_data, 1)).T).T
|
||||
self.dL_dpsi2 = None
|
||||
else:
|
||||
dL_dpsi2 = self.likelihood.precision * dL_dpsi2_beta
|
||||
if self.has_uncertain_inputs:
|
||||
# repeat for each of the N psi_2 matrices
|
||||
self.dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], self.num_data, axis=0)
|
||||
else:
|
||||
# subsume back into psi1 (==Kmn)
|
||||
self.dL_dpsi1 += 2.*np.dot(self.psi1, dL_dpsi2)
|
||||
self.dL_dpsi2 = None
|
||||
#gradients wrt kernel
|
||||
self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X)
|
||||
kerngrad = self.kern.gradient.copy()
|
||||
self.kern.update_gradients_full(self.grad_dict['dL_dKnm'], self.X, self.Z)
|
||||
kerngrad += self.kern.gradient
|
||||
self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z, None)
|
||||
self.kern.gradient += kerngrad
|
||||
#gradients wrt Z
|
||||
self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
|
||||
self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X)
|
||||
|
||||
|
||||
# the partial derivative vector for the likelihood
|
||||
if self.likelihood.num_params == 0:
|
||||
# save computation here.
|
||||
self.partial_for_likelihood = None
|
||||
elif self.likelihood.is_heteroscedastic:
|
||||
|
||||
if self.has_uncertain_inputs:
|
||||
raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented"
|
||||
|
||||
else:
|
||||
|
||||
LBi = chol_inv(self.LB)
|
||||
Lmi_psi1, nil = dtrtrs(self._Lm, np.asfortranarray(self.psi1.T), lower=1, trans=0)
|
||||
_LBi_Lmi_psi1, _ = dtrtrs(self.LB, np.asfortranarray(Lmi_psi1), lower=1, trans=0)
|
||||
|
||||
|
||||
self.partial_for_likelihood = -0.5 * self.likelihood.precision + 0.5 * self.likelihood.V**2
|
||||
self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0 - np.sum(Lmi_psi1**2,0))[:,None] * self.likelihood.precision**2
|
||||
|
||||
self.partial_for_likelihood += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*self.likelihood.precision**2
|
||||
|
||||
self.partial_for_likelihood += -np.dot(self._LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * self.likelihood.Y * self.likelihood.precision**2
|
||||
self.partial_for_likelihood += 0.5*np.dot(self._LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * self.likelihood.precision**2
|
||||
|
||||
else:
|
||||
# likelihood is not heteroscedastic
|
||||
self.partial_for_likelihood = -0.5 * self.num_data * self.output_dim * self.likelihood.precision + 0.5 * self.likelihood.trYYT * self.likelihood.precision ** 2
|
||||
self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0.sum() * self.likelihood.precision ** 2 - np.trace(self._A) * self.likelihood.precision)
|
||||
self.partial_for_likelihood += self.likelihood.precision * (0.5 * np.sum(self._A * self.DBi_plus_BiPBi) - self.data_fit)
|
||||
|
||||
def log_likelihood(self):
|
||||
""" Compute the (lower bound on the) log marginal likelihood """
|
||||
if self.likelihood.is_heteroscedastic:
|
||||
A = -0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.likelihood.V * self.likelihood.Y)
|
||||
B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self._A))
|
||||
else:
|
||||
A = -0.5 * self.num_data * self.output_dim * (np.log(2.*np.pi) - np.log(self.likelihood.precision)) - 0.5 * self.likelihood.precision * self.likelihood.trYYT
|
||||
B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self._A))
|
||||
C = -self.output_dim * (np.sum(np.log(np.diag(self.LB)))) # + 0.5 * self.num_inducing * np.log(sf2))
|
||||
D = 0.5 * self.data_fit
|
||||
self._A_part, self._B_part, self._C_part, self._D_part = A, B, C, D
|
||||
return A + B + C + D + self.likelihood.Z
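# Editor's note (not part of the commit): for reference, in the homoscedastic Gaussian
# case the terms assembled above are, with noise precision beta, N data points, D
# outputs, A = Lm^{-1} (beta*psi2) Lm^{-T} and B = I + A with Cholesky factor LB:
#   A_term = -0.5*N*D*(log(2*pi) - log(beta)) - 0.5*beta*tr(Y Y^T)
#   B_term = -0.5*D*(beta*sum(psi0) - tr(A))
#   C_term = -D*sum(log(diag(LB)))          # i.e. -0.5*D*log det(B)
#   D_term =  0.5*data_fit                  # the data-fit trace term
# and log_likelihood() returns their sum (plus likelihood.Z).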
|
||||
|
||||
def _set_params(self, p):
|
||||
self.Z = p[:self.num_inducing * self.input_dim].reshape(self.num_inducing, self.input_dim)
|
||||
self.kern._set_params(p[self.Z.size:self.Z.size + self.kern.num_params])
|
||||
self.likelihood._set_params(p[self.Z.size + self.kern.num_params:])
|
||||
self._compute_kernel_matrices()
|
||||
self._computations()
|
||||
self.Cpsi1V = None
|
||||
|
||||
def _get_params(self):
|
||||
return np.hstack([self.Z.flatten(), self.kern._get_params_transformed(), self.likelihood._get_params()])
|
||||
|
||||
def _get_param_names(self):
|
||||
return sum([['iip_%i_%i' % (i, j) for j in range(self.Z.shape[1])] for i in range(self.Z.shape[0])], [])\
|
||||
+ self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
|
||||
|
||||
#def _get_print_names(self):
|
||||
# return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
|
||||
|
||||
def update_likelihood_approximation(self, **kwargs):
|
||||
def _raw_predict(self, Xnew, full_cov=False, kern=None):
|
||||
"""
|
||||
Approximates a non-gaussian likelihood using Expectation Propagation
|
||||
|
||||
For a Gaussian likelihood, no iteration is required:
|
||||
this function does nothing
|
||||
"""
|
||||
if not isinstance(self.likelihood, Gaussian): # Updates not needed for Gaussian likelihood
|
||||
self.likelihood.restart()
|
||||
if self.has_uncertain_inputs:
|
||||
Lmi = chol_inv(self._Lm)
|
||||
Kmmi = tdot(Lmi.T)
|
||||
diag_tr_psi2Kmmi = np.array([np.trace(psi2_Kmmi) for psi2_Kmmi in np.dot(self.psi2, Kmmi)])
|
||||
|
||||
self.likelihood.fit_FITC(self.Kmm, self.psi1.T, diag_tr_psi2Kmmi, **kwargs) # This uses the fit_FITC code, but does not perform a FITC-EP. #TODO solve potential confusion
|
||||
# raise NotImplementedError, "EP approximation not implemented for uncertain inputs"
|
||||
else:
|
||||
self.likelihood.fit_DTC(self.Kmm, self.psi1.T, **kwargs)
|
||||
# self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0)
|
||||
self._set_params(self._get_params()) # update the GP
|
||||
|
||||
def _log_likelihood_gradients(self):
|
||||
return np.hstack((self.dL_dZ().flatten(), self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))
|
||||
|
||||
def dL_dtheta(self):
|
||||
"""
|
||||
Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel
|
||||
"""
|
||||
dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm, self.Z)
|
||||
if self.has_uncertain_inputs:
|
||||
dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X, self.X_variance)
|
||||
dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1, self.Z, self.X, self.X_variance)
|
||||
dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X, self.X_variance)
|
||||
else:
|
||||
dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1, self.X, self.Z)
|
||||
dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X)
|
||||
|
||||
return dL_dtheta
|
||||
|
||||
def dL_dZ(self):
|
||||
"""
|
||||
The derivative of the bound wrt the inducing inputs Z
|
||||
"""
|
||||
dL_dZ = self.kern.dK_dX(self.dL_dKmm, self.Z)
|
||||
if self.has_uncertain_inputs:
|
||||
dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1, self.Z, self.X, self.X_variance)
|
||||
dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2, self.Z, self.X, self.X_variance)
|
||||
else:
|
||||
dL_dZ += self.kern.dK_dX(self.dL_dpsi1.T, self.Z, self.X)
|
||||
return dL_dZ
|
||||
|
||||
def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
|
||||
"""
|
||||
Internal helper function for making predictions, does not account for
|
||||
normalization or likelihood function
|
||||
Make a prediction for the latent function values
|
||||
"""
|
||||
|
||||
Bi, _ = dpotri(self.LB, lower=0) # WTH? this lower switch should be 1, but that doesn't work!
|
||||
symmetrify(Bi)
|
||||
Kmmi_LmiBLmi = backsub_both_sides(self._Lm, np.eye(self.num_inducing) - Bi)
|
||||
if kern is None: kern = self.kern
|
||||
|
||||
if self.Cpsi1V is None:
|
||||
psi1V = np.dot(self.psi1.T, self.likelihood.V)
|
||||
tmp, _ = dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0)
|
||||
tmp, _ = dpotrs(self.LB, tmp, lower=1)
|
||||
self.Cpsi1V, _ = dtrtrs(self._Lm, tmp, lower=1, trans=1)
|
||||
|
||||
if X_variance_new is None:
|
||||
Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
|
||||
mu = np.dot(Kx.T, self.Cpsi1V)
|
||||
if not isinstance(Xnew, VariationalPosterior):
|
||||
Kx = kern.K(self.Z, Xnew)
|
||||
mu = np.dot(Kx.T, self.posterior.woodbury_vector)
|
||||
if full_cov:
|
||||
Kxx = self.kern.K(Xnew, which_parts=which_parts)
|
||||
var = Kxx - mdot(Kx.T, Kmmi_LmiBLmi, Kx) # NOTE this won't work for plotting
|
||||
Kxx = kern.K(Xnew)
|
||||
if self.posterior.woodbury_inv.ndim == 2:
|
||||
var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
|
||||
elif self.posterior.woodbury_inv.ndim == 3:
|
||||
var = Kxx[:,:,None] - np.tensordot(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx).T, Kx, [1,0]).swapaxes(1,2)
|
||||
var = var
|
||||
else:
|
||||
Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts)
|
||||
var = Kxx - np.sum(Kx * np.dot(Kmmi_LmiBLmi, Kx), 0)
|
||||
Kxx = kern.Kdiag(Xnew)
|
||||
var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
|
||||
else:
|
||||
# assert which_parts=='all', "swithching out parts of variational kernels is not implemented"
|
||||
Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts
|
||||
mu = np.dot(Kx, self.Cpsi1V)
|
||||
Kx = kern.psi1(self.Z, Xnew)
|
||||
mu = np.dot(Kx, self.posterior.woodbury_vector)
|
||||
if full_cov:
|
||||
raise NotImplementedError, "TODO"
|
||||
else:
|
||||
Kxx = self.kern.psi0(self.Z, Xnew, X_variance_new)
|
||||
psi2 = self.kern.psi2(self.Z, Xnew, X_variance_new)
|
||||
Kxx = kern.psi0(self.Z, Xnew)
|
||||
psi2 = kern.psi2(self.Z, Xnew)
|
||||
var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
|
||||
|
||||
return mu, var[:, None]
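# Editor's sketch (not part of the commit): the predictive equations used above for the
# certain-input branch, written out with plain numpy. Shapes are assumptions: Kx is the
# (M x Nnew) cross-covariance to the inducing points, Kxx_diag the (Nnew,) prior
# variances, and woodbury_vector / woodbury_inv come from the fitted posterior object.
import numpy as np

def raw_predict_sketch(Kx, Kxx_diag, woodbury_vector, woodbury_inv):
    mu = np.dot(Kx.T, woodbury_vector)                               # (Nnew x D)
    var = Kxx_diag - np.sum(Kx * np.dot(woodbury_inv, Kx), axis=0)   # (Nnew,)
    return mu, var[:, None]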
|
||||
|
||||
def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False, **likelihood_args):
|
||||
"""
|
||||
Predict the function(s) at the new point(s) Xnew.
|
||||
|
||||
**Arguments**
|
||||
|
||||
:param Xnew: The points at which to make a prediction
|
||||
:type Xnew: np.ndarray, Nnew x self.input_dim
|
||||
:param X_variance_new: The uncertainty in the prediction points
|
||||
:type X_variance_new: np.ndarray, Nnew x self.input_dim
|
||||
:param which_parts: specifies which outputs kernel(s) to use in prediction
|
||||
:type which_parts: ('all', list of bools)
|
||||
:param full_cov: whether to return the full covariance matrix, or just the diagonal
|
||||
:type full_cov: bool
|
||||
:rtype: posterior mean, a Numpy array, Nnew x self.input_dim
|
||||
:rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
|
||||
:rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim
|
||||
|
||||
|
||||
If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew.
|
||||
This is to allow for different normalizations of the output dimensions.
|
||||
|
||||
"""
|
||||
# normalize X values
|
||||
Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
|
||||
if X_variance_new is not None:
|
||||
X_variance_new = X_variance_new / self._Xscale ** 2
|
||||
|
||||
# here's the actual prediction by the GP model
|
||||
mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts)
|
||||
|
||||
# now push through likelihood
|
||||
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, **likelihood_args)
|
||||
|
||||
return mean, var, _025pm, _975pm
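# Editor's note (not part of the commit): an illustrative call of the predict() API
# documented above; Xnew is any (Nnew x input_dim) array, and the last two returns are
# the bounds of the 95% predictive interval.
#
#   Xnew = np.linspace(0., 1., 200)[:, None]
#   mean, var, lower, upper = m.predict(Xnew)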
|
||||
|
||||
|
||||
def plot_f(self, samples=0, plot_limits=None, which_data_rows='all',
|
||||
which_data_ycols='all', which_parts='all', resolution=None,
|
||||
full_cov=False, fignum=None, ax=None):
|
||||
|
||||
"""
|
||||
Plot the GP's view of the world, where the data is normalized and the
|
||||
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
|
||||
- In two dimensions, a contour-plot shows the mean predicted function
|
||||
- Not implemented in higher dimensions
|
||||
|
||||
:param samples: the number of a posteriori samples to plot
|
||||
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
|
||||
:param which_data_rows: which of the training data to plot (default all)
|
||||
:type which_data_rows: 'all' or a slice object to slice self.X, self.Y
|
||||
:param which_parts: which of the kernel functions to plot (additively)
|
||||
:type which_parts: 'all', or list of bools
|
||||
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
|
||||
:type resolution: int
|
||||
:param full_cov:
|
||||
:type full_cov: bool
|
||||
:param fignum: figure to plot on.
|
||||
:type fignum: figure number
|
||||
:param ax: axes to plot on.
|
||||
:type ax: axes handle
|
||||
|
||||
:param output: which output to plot (for multiple output models only)
|
||||
:type output: integer (first output is 0)
|
||||
"""
|
||||
if ax is None:
|
||||
fig = pb.figure(num=fignum)
|
||||
ax = fig.add_subplot(111)
|
||||
if fignum is None and ax is None:
|
||||
fignum = fig.num
|
||||
if which_data_rows == 'all':
|
||||
which_data_rows = slice(None)
|
||||
|
||||
GPBase.plot_f(self, samples=samples, plot_limits=plot_limits, which_data_rows=which_data_rows, which_data_ycols=which_data_ycols, which_parts=which_parts, resolution=resolution, fignum=fignum, ax=ax)
|
||||
|
||||
if self.X.shape[1] == 1:
|
||||
if self.has_uncertain_inputs:
|
||||
Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
|
||||
ax.errorbar(Xu[which_data_rows, 0], self.likelihood.data[which_data_rows, 0],
xerr=2 * np.sqrt(self.X_variance[which_data_rows, 0]),
|
||||
ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
|
||||
Zu = self.Z * self._Xscale + self._Xoffset
|
||||
ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
|
||||
|
||||
elif self.X.shape[1] == 2:
|
||||
Zu = self.Z * self._Xscale + self._Xoffset
|
||||
ax.plot(Zu[:, 0], Zu[:, 1], 'wo')
|
||||
|
||||
else:
|
||||
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
|
||||
|
||||
def plot(self, plot_limits=None, which_data_rows='all',
|
||||
which_data_ycols='all', which_parts='all', fixed_inputs=[],
|
||||
plot_raw=False,
|
||||
levels=20, samples=0, fignum=None, ax=None, resolution=None):
|
||||
"""
|
||||
Plot the posterior of the sparse GP.
|
||||
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
|
||||
- In two dimensions, a contour-plot shows the mean predicted function
|
||||
- In higher dimensions, use fixed_inputs to plot the GP with some of the inputs fixed.
|
||||
|
||||
Can plot only part of the data and part of the posterior functions
|
||||
using which_data_rows, which_data_ycols and which_parts
|
||||
|
||||
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
|
||||
:type plot_limits: np.array
|
||||
:param which_data_rows: which of the training data to plot (default all)
|
||||
:type which_data_rows: 'all' or a slice object to slice self.X, self.Y
|
||||
:param which_data_ycols: when the data has several columns (independent outputs), only plot these
|
||||
:type which_data_ycols: 'all' or a list of integers
|
||||
:param which_parts: which of the kernel functions to plot (additively)
|
||||
:type which_parts: 'all', or list of bools
|
||||
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
|
||||
:type fixed_inputs: a list of tuples
|
||||
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
|
||||
:type resolution: int
|
||||
:param levels: number of levels to plot in a contour plot.
|
||||
:type levels: int
|
||||
:param samples: the number of a posteriori samples to plot
|
||||
:type samples: int
|
||||
:param fignum: figure to plot on.
|
||||
:type fignum: figure number
|
||||
:param ax: axes to plot on.
|
||||
:type ax: axes handle
|
||||
:type output: integer (first output is 0)
|
||||
:param linecol: color of line to plot.
|
||||
:type linecol:
|
||||
:param fillcol: color of fill
|
||||
:param levels: for 2D plotting, the number of contour levels to use. If ax is None, create a new figure.
|
||||
"""
|
||||
# work out which ax to plot on
|
||||
#Need these because we use which_data_rows in this function not just base
|
||||
if which_data_rows == 'all':
|
||||
which_data_rows = slice(None)
|
||||
if which_data_ycols == 'all':
|
||||
which_data_ycols = np.arange(self.output_dim)
|
||||
if ax is None:
|
||||
fig = pb.figure(num=fignum)
|
||||
ax = fig.add_subplot(111)
|
||||
|
||||
#work out what the inputs are for plotting (1D or 2D)
|
||||
fixed_dims = np.array([i for i,v in fixed_inputs])
|
||||
free_dims = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
|
||||
|
||||
#call the base plotting
|
||||
GPBase.plot(self, samples=samples, plot_limits=plot_limits,
|
||||
which_data_rows=which_data_rows,
|
||||
which_data_ycols=which_data_ycols, fixed_inputs=fixed_inputs,
|
||||
which_parts=which_parts, resolution=resolution, levels=20,
|
||||
fignum=fignum, ax=ax)
|
||||
|
||||
if len(free_dims) == 1:
|
||||
#plot errorbars for the uncertain inputs
|
||||
if self.has_uncertain_inputs:
|
||||
Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
|
||||
ax.errorbar(Xu[which_data_rows, 0], self.likelihood.data[which_data_rows, 0],
|
||||
xerr=2 * np.sqrt(self.X_variance[which_data_rows, 0]),
|
||||
ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
|
||||
|
||||
#plot the inducing inputs
|
||||
Zu = self.Z * self._Xscale + self._Xoffset
|
||||
ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
|
||||
|
||||
elif len(free_dims) == 2:
|
||||
Zu = self.Z * self._Xscale + self._Xoffset
|
||||
ax.plot(Zu[:, 0], Zu[:, 1], 'wo')
|
||||
|
||||
else:
|
||||
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
|
||||
|
||||
def getstate(self):
|
||||
"""
|
||||
Get the current state of the class,
|
||||
here just all the indices, rest can get recomputed
|
||||
"""
|
||||
return GPBase.getstate(self) + [self.Z,
|
||||
self.num_inducing,
|
||||
self.has_uncertain_inputs,
|
||||
self.X_variance]
|
||||
|
||||
def setstate(self, state):
|
||||
self.X_variance = state.pop()
|
||||
self.has_uncertain_inputs = state.pop()
|
||||
self.num_inducing = state.pop()
|
||||
self.Z = state.pop()
|
||||
GPBase.setstate(self, state)
|
||||
|
||||
|
||||
return mu, var
|
||||
|
|
|
|||
120
GPy/core/sparse_gp_mpi.py
Normal file
|
|
@ -0,0 +1,120 @@
|
|||
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from sparse_gp import SparseGP
|
||||
from numpy.linalg.linalg import LinAlgError
|
||||
from ..inference.latent_function_inference.var_dtc_parallel import update_gradients, VarDTC_minibatch
|
||||
|
||||
import logging
|
||||
logger = logging.getLogger("sparse gp mpi")
|
||||
|
||||
class SparseGP_MPI(SparseGP):
|
||||
"""
|
||||
A general purpose Sparse GP model with MPI parallelization support
|
||||
|
||||
This model allows (approximate) inference using variational DTC or FITC
|
||||
(Gaussian likelihoods) as well as non-conjugate sparse methods based on
|
||||
these.
|
||||
|
||||
:param X: inputs
|
||||
:type X: np.ndarray (num_data x input_dim)
|
||||
:param likelihood: a likelihood instance, containing the observed data
|
||||
:type likelihood: GPy.likelihood.(Gaussian | EP | Laplace)
|
||||
:param kernel: the kernel (covariance function). See link kernels
|
||||
:type kernel: a GPy.kern.kern instance
|
||||
:param X_variance: The uncertainty in the measurements of X (Gaussian variance)
|
||||
:type X_variance: np.ndarray (num_data x input_dim) | None
|
||||
:param Z: inducing inputs
|
||||
:type Z: np.ndarray (num_inducing x input_dim)
|
||||
:param num_inducing: Number of inducing points (optional, default 10. Ignored if Z is not None)
|
||||
:type num_inducing: int
|
||||
:param mpi_comm: The communication group of MPI, e.g. mpi4py.MPI.COMM_WORLD
|
||||
:type mpi_comm: mpi4py.MPI.Intracomm
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp mpi', Y_metadata=None, mpi_comm=None, normalizer=False):
|
||||
self._IN_OPTIMIZATION_ = False
|
||||
if mpi_comm != None:
|
||||
if inference_method is None:
|
||||
inference_method = VarDTC_minibatch(mpi_comm=mpi_comm)
|
||||
else:
|
||||
assert isinstance(inference_method, VarDTC_minibatch), 'inference_method has to support MPI!'
|
||||
|
||||
super(SparseGP_MPI, self).__init__(X, Y, Z, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
|
||||
self.update_model(False)
|
||||
|
||||
if variational_prior is not None:
|
||||
self.link_parameter(variational_prior)
|
||||
|
||||
self.mpi_comm = mpi_comm
|
||||
# Manage the data (Y) division
|
||||
if mpi_comm != None:
|
||||
from ..util.parallel import divide_data
|
||||
N_start, N_end, N_list = divide_data(Y.shape[0], mpi_comm.rank, mpi_comm.size)
|
||||
self.N_range = (N_start, N_end)
|
||||
self.N_list = np.array(N_list)
|
||||
self.Y_local = self.Y[N_start:N_end]
|
||||
print 'MPI RANK '+str(self.mpi_comm.rank)+' with the data range '+str(self.N_range)
|
||||
mpi_comm.Bcast(self.param_array, root=0)
|
||||
self.update_model(True)
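# Editor's sketch (not part of the commit): the row-wise split performed above. The
# exact signature of util.parallel.divide_data is not shown in this diff, so this is an
# equivalent hand-rolled partition over `size` MPI ranks.
#
#   N = Y.shape[0]
#   counts = [N // size + (1 if r < N % size else 0) for r in range(size)]
#   starts = np.cumsum([0] + counts[:-1])
#   N_start, N_end = int(starts[rank]), int(starts[rank]) + counts[rank]
#   Y_local = Y[N_start:N_end]      # each rank keeps only its block of rows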
|
||||
|
||||
def __getstate__(self):
|
||||
dc = super(SparseGP_MPI, self).__getstate__()
|
||||
dc['mpi_comm'] = None
|
||||
if self.mpi_comm != None:
|
||||
del dc['N_range']
|
||||
del dc['N_list']
|
||||
del dc['Y_local']
|
||||
if 'normalizer' not in dc:
|
||||
dc['normalizer'] = None
|
||||
dc['Y_normalized'] = dc['Y']
|
||||
return dc
|
||||
|
||||
#=====================================================
|
||||
# The MPI parallelization
|
||||
# - can move to model at some point
|
||||
#=====================================================
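# Editor's note (not part of the commit): the synchronisation below follows a simple
# master/worker protocol. Rank 0 runs the optimiser; whenever it receives a new
# parameter vector it broadcasts the flag 1 followed by the vector, and the other ranks
# apply it and recompute their share of the gradients. When optimisation finishes,
# rank 0 broadcasts the flag -1 and the workers leave their receive loop; any other
# flag raises an exception.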
|
||||
|
||||
@SparseGP.optimizer_array.setter
|
||||
def optimizer_array(self, p):
|
||||
if self.mpi_comm != None:
|
||||
if self._IN_OPTIMIZATION_ and self.mpi_comm.rank==0:
|
||||
self.mpi_comm.Bcast(np.int32(1),root=0)
|
||||
self.mpi_comm.Bcast(p, root=0)
|
||||
SparseGP.optimizer_array.fset(self,p)
|
||||
|
||||
def optimize(self, optimizer=None, start=None, **kwargs):
|
||||
self._IN_OPTIMIZATION_ = True
|
||||
if self.mpi_comm==None:
|
||||
super(SparseGP_MPI, self).optimize(optimizer,start,**kwargs)
|
||||
elif self.mpi_comm.rank==0:
|
||||
super(SparseGP_MPI, self).optimize(optimizer,start,**kwargs)
|
||||
self.mpi_comm.Bcast(np.int32(-1),root=0)
|
||||
elif self.mpi_comm.rank>0:
|
||||
x = self.optimizer_array.copy()
|
||||
flag = np.empty(1,dtype=np.int32)
|
||||
while True:
|
||||
self.mpi_comm.Bcast(flag,root=0)
|
||||
if flag==1:
|
||||
try:
|
||||
self.optimizer_array = x
|
||||
self._fail_count = 0
|
||||
except (LinAlgError, ZeroDivisionError, ValueError):
|
||||
if self._fail_count >= self._allowed_failures:
|
||||
raise
|
||||
self._fail_count += 1
|
||||
elif flag==-1:
|
||||
break
|
||||
else:
|
||||
self._IN_OPTIMIZATION_ = False
|
||||
raise Exception("Unrecognizable flag for synchronization!")
|
||||
self._IN_OPTIMIZATION_ = False
|
||||
|
||||
def parameters_changed(self):
|
||||
if isinstance(self.inference_method,VarDTC_minibatch):
|
||||
update_gradients(self, mpi_comm=self.mpi_comm)
|
||||
else:
|
||||
super(SparseGP_MPI,self).parameters_changed()
|
||||
|
||||
|
|
@ -1,512 +0,0 @@
|
|||
# Copyright (c) 2012, James Hensman and Nicolo' Fusi
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
import pylab as pb
|
||||
from .. import kern
|
||||
from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs, jitchol, backsub_both_sides
|
||||
from ..likelihoods import EP
|
||||
from gp_base import GPBase
|
||||
from model import Model
|
||||
import time
|
||||
import sys
|
||||
|
||||
|
||||
class SVIGP(GPBase):
|
||||
"""
|
||||
|
||||
Stochastic Variational inference in a Gaussian Process
|
||||
|
||||
:param X: inputs
|
||||
:type X: np.ndarray (num_data x num_inputs)
|
||||
:param Y: observed data
|
||||
:type Y: np.ndarray of observations (num_data x output_dim)
|
||||
:param batchsize: the size of a minibatch
|
||||
:param q_u: canonical parameters of the distribution q(u), squashed into a 1D array
|
||||
:type q_u: np.ndarray
|
||||
:param kernel: the kernel/covariance function. See link kernels
|
||||
:type kernel: a GPy kernel
|
||||
:param Z: inducing inputs
|
||||
:type Z: np.ndarray (num_inducing x num_inputs)
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, X, likelihood, kernel, Z, q_u=None, batchsize=10, X_variance=None):
|
||||
GPBase.__init__(self, X, likelihood, kernel, normalize_X=False)
|
||||
self.batchsize=batchsize
|
||||
self.Y = self.likelihood.Y.copy()
|
||||
self.Z = Z
|
||||
self.num_inducing = Z.shape[0]
|
||||
self.batchcounter = 0
|
||||
self.epochs = 0
|
||||
self.iterations = 0
|
||||
|
||||
self.vb_steplength = 0.05
|
||||
self.param_steplength = 1e-5
|
||||
self.momentum = 0.9
|
||||
|
||||
if X_variance is None:
|
||||
self.has_uncertain_inputs = False
|
||||
else:
|
||||
self.has_uncertain_inputs = True
|
||||
self.X_variance = X_variance
|
||||
|
||||
|
||||
if q_u is None:
|
||||
q_u = np.hstack((np.random.randn(self.num_inducing*self.output_dim),-.5*np.eye(self.num_inducing).flatten()))
|
||||
self.set_vb_param(q_u)
|
||||
|
||||
self._permutation = np.random.permutation(self.num_data)
|
||||
self.load_batch()
|
||||
|
||||
self._param_trace = []
|
||||
self._ll_trace = []
|
||||
self._grad_trace = []
|
||||
|
||||
#set the adaptive steplength parameters
|
||||
self.hbar_t = 0.0
|
||||
self.tau_t = 100.0
|
||||
self.gbar_t = 0.0
|
||||
self.gbar_t1 = 0.0
|
||||
self.gbar_t2 = 0.0
|
||||
self.hbar_tp = 0.0
|
||||
self.tau_tp = 10000.0
|
||||
self.gbar_tp = 0.0
|
||||
self.adapt_param_steplength = True
|
||||
self.adapt_vb_steplength = True
|
||||
self._param_steplength_trace = []
|
||||
self._vb_steplength_trace = []
|
||||
|
||||
self.ensure_default_constraints()
|
||||
|
||||
def getstate(self):
|
||||
steplength_params = [self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength]
|
||||
return GPBase.getstate(self) + \
|
||||
[self.get_vb_param(),
|
||||
self.Z,
|
||||
self.num_inducing,
|
||||
self.has_uncertain_inputs,
|
||||
self.X_variance,
|
||||
self.X_batch,
|
||||
self.X_variance_batch,
|
||||
steplength_params,
|
||||
self.batchcounter,
|
||||
self.batchsize,
|
||||
self.epochs,
|
||||
self.momentum,
|
||||
self.data_prop,
|
||||
self._param_trace,
|
||||
self._param_steplength_trace,
|
||||
self._vb_steplength_trace,
|
||||
self._ll_trace,
|
||||
self._grad_trace,
|
||||
self.Y,
|
||||
self._permutation,
|
||||
self.iterations
|
||||
]
|
||||
|
||||
def setstate(self, state):
|
||||
self.iterations = state.pop()
|
||||
self._permutation = state.pop()
|
||||
self.Y = state.pop()
|
||||
self._grad_trace = state.pop()
|
||||
self._ll_trace = state.pop()
|
||||
self._vb_steplength_trace = state.pop()
|
||||
self._param_steplength_trace = state.pop()
|
||||
self._param_trace = state.pop()
|
||||
self.data_prop = state.pop()
|
||||
self.momentum = state.pop()
|
||||
self.epochs = state.pop()
|
||||
self.batchsize = state.pop()
|
||||
self.batchcounter = state.pop()
|
||||
steplength_params = state.pop()
|
||||
(self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength) = steplength_params
|
||||
self.X_variance_batch = state.pop()
|
||||
self.X_batch = state.pop()
|
||||
self.X_variance = state.pop()
|
||||
self.has_uncertain_inputs = state.pop()
|
||||
self.num_inducing = state.pop()
|
||||
self.Z = state.pop()
|
||||
vb_param = state.pop()
|
||||
GPBase.setstate(self, state)
|
||||
self.set_vb_param(vb_param)
|
||||
|
||||
def _compute_kernel_matrices(self):
|
||||
# kernel computations, using BGPLVM notation
|
||||
self.Kmm = self.kern.K(self.Z)
|
||||
if self.has_uncertain_inputs:
|
||||
self.psi0 = self.kern.psi0(self.Z, self.X_batch, self.X_variance_batch)
|
||||
self.psi1 = self.kern.psi1(self.Z, self.X_batch, self.X_variance_batch)
|
||||
self.psi2 = self.kern.psi2(self.Z, self.X_batch, self.X_variance_batch)
|
||||
else:
|
||||
self.psi0 = self.kern.Kdiag(self.X_batch)
|
||||
self.psi1 = self.kern.K(self.X_batch, self.Z)
|
||||
self.psi2 = None
|
||||
|
||||
def dL_dtheta(self):
|
||||
dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm, self.Z)
|
||||
if self.has_uncertain_inputs:
|
||||
dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X_batch, self.X_variance_batch)
|
||||
dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1, self.Z, self.X_batch, self.X_variance_batch)
|
||||
dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X_batch, self.X_variance_batch)
|
||||
else:
|
||||
dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1, self.X_batch, self.Z)
|
||||
dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X_batch)
|
||||
return dL_dtheta
|
||||
|
||||
def _set_params(self, p, computations=True):
|
||||
self.kern._set_params_transformed(p[:self.kern.num_params])
|
||||
self.likelihood._set_params(p[self.kern.num_params:])
|
||||
if computations:
|
||||
self._compute_kernel_matrices()
|
||||
self._computations()
|
||||
|
||||
def _get_params(self):
|
||||
return np.hstack((self.kern._get_params_transformed() , self.likelihood._get_params()))
|
||||
|
||||
def _get_param_names(self):
|
||||
return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
|
||||
|
||||
def load_batch(self):
|
||||
"""
|
||||
load a batch of data (set self.X_batch and self.likelihood.Y from self.X, self.Y)
|
||||
"""
|
||||
|
||||
#if we've seen all the data, start again with them in a new random order
|
||||
if self.batchcounter+self.batchsize > self.num_data:
|
||||
self.batchcounter = 0
|
||||
self.epochs += 1
|
||||
self._permutation = np.random.permutation(self.num_data)
|
||||
|
||||
this_perm = self._permutation[self.batchcounter:self.batchcounter+self.batchsize]
|
||||
|
||||
self.X_batch = self.X[this_perm]
|
||||
self.likelihood.set_data(self.Y[this_perm])
|
||||
if self.has_uncertain_inputs:
|
||||
self.X_variance_batch = self.X_variance[this_perm]
|
||||
|
||||
self.batchcounter += self.batchsize
|
||||
|
||||
self.data_prop = float(self.batchsize)/self.num_data
|
||||
|
||||
self._compute_kernel_matrices()
|
||||
self._computations()
|
||||
|
||||
def _computations(self,do_Kmm=True, do_Kmm_grad=True):
|
||||
"""
|
||||
All of the computations needed. Some are optional, see kwargs.
|
||||
"""
|
||||
|
||||
if do_Kmm:
|
||||
self.Lm = jitchol(self.Kmm)
|
||||
|
||||
# The rather complex computations of self.A
|
||||
if self.has_uncertain_inputs:
|
||||
if self.likelihood.is_heteroscedastic:
|
||||
psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.batchsize, 1, 1))).sum(0)
|
||||
else:
|
||||
psi2_beta = self.psi2.sum(0) * self.likelihood.precision
|
||||
evals, evecs = np.linalg.eigh(psi2_beta)
|
||||
clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
|
||||
tmp = evecs * np.sqrt(clipped_evals)
|
||||
else:
|
||||
if self.likelihood.is_heteroscedastic:
|
||||
tmp = self.psi1.T * (np.sqrt(self.likelihood.precision.flatten().reshape(1, self.batchsize)))
|
||||
else:
|
||||
tmp = self.psi1.T * (np.sqrt(self.likelihood.precision))
|
||||
tmp, _ = dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1)
|
||||
self.A = tdot(tmp)
|
||||
|
||||
self.V = self.likelihood.precision*self.likelihood.Y
|
||||
self.VmT = np.dot(self.V,self.q_u_expectation[0].T)
|
||||
self.psi1V = np.dot(self.psi1.T, self.V)
|
||||
|
||||
self.B = np.eye(self.num_inducing)*self.data_prop + self.A
|
||||
self.Lambda = backsub_both_sides(self.Lm, self.B.T)
|
||||
self.LQL = backsub_both_sides(self.Lm,self.q_u_expectation[1].T,transpose='right')
|
||||
|
||||
self.trace_K = self.psi0.sum() - np.trace(self.A)/self.likelihood.precision
|
||||
self.Kmmi_m, _ = dpotrs(self.Lm, self.q_u_expectation[0], lower=1)
|
||||
self.projected_mean = np.dot(self.psi1,self.Kmmi_m)
|
||||
|
||||
# Compute dL_dpsi
|
||||
self.dL_dpsi0 = - 0.5 * self.output_dim * self.likelihood.precision * np.ones(self.batchsize)
|
||||
self.dL_dpsi1, _ = dpotrs(self.Lm,np.asfortranarray(self.VmT.T),lower=1)
|
||||
self.dL_dpsi1 = self.dL_dpsi1.T
|
||||
|
||||
dL_dpsi2 = -0.5 * self.likelihood.precision * backsub_both_sides(self.Lm, self.LQL - self.output_dim * np.eye(self.num_inducing))
|
||||
if self.has_uncertain_inputs:
|
||||
self.dL_dpsi2 = np.repeat(dL_dpsi2[None,:,:],self.batchsize,axis=0)
|
||||
else:
|
||||
self.dL_dpsi1 += 2.*np.dot(dL_dpsi2,self.psi1.T).T
|
||||
self.dL_dpsi2 = None
|
||||
|
||||
# Compute dL_dKmm
|
||||
if do_Kmm_grad:
|
||||
tmp = np.dot(self.LQL,self.A) - backsub_both_sides(self.Lm,np.dot(self.q_u_expectation[0],self.psi1V.T),transpose='right')
|
||||
tmp += tmp.T
|
||||
tmp += -self.output_dim*self.B
|
||||
tmp += self.data_prop*self.LQL
|
||||
self.dL_dKmm = 0.5*backsub_both_sides(self.Lm,tmp)
|
||||
|
||||
#Compute the gradient of the log likelihood wrt noise variance
|
||||
self.partial_for_likelihood = -0.5*(self.batchsize*self.output_dim - np.sum(self.A*self.LQL))*self.likelihood.precision
|
||||
self.partial_for_likelihood += (0.5*self.output_dim*self.trace_K + 0.5 * self.likelihood.trYYT - np.sum(self.likelihood.Y*self.projected_mean))*self.likelihood.precision**2
|
||||
|
||||
|
||||
def log_likelihood(self):
|
||||
"""
|
||||
As for uncollapsed sparse GP, but account for the proportion of data we're looking at right now.
|
||||
|
||||
NB. self.batchsize is the size of the batch, not the size of X_all
|
||||
"""
|
||||
assert not self.likelihood.is_heteroscedastic
|
||||
A = -0.5*self.batchsize*self.output_dim*(np.log(2.*np.pi) - np.log(self.likelihood.precision))
|
||||
B = -0.5*self.likelihood.precision*self.output_dim*self.trace_K
|
||||
Kmm_logdet = 2.*np.sum(np.log(np.diag(self.Lm)))
|
||||
C = -0.5*self.output_dim*self.data_prop*(Kmm_logdet-self.q_u_logdet - self.num_inducing)
|
||||
C += -0.5*np.sum(self.LQL * self.B)
|
||||
D = -0.5*self.likelihood.precision*self.likelihood.trYYT
|
||||
E = np.sum(self.V*self.projected_mean)
|
||||
return (A+B+C+D+E)/self.data_prop
|
||||
|
||||
def _log_likelihood_gradients(self):
|
||||
return np.hstack((self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))/self.data_prop
|
||||
|
||||
def vb_grad_natgrad(self):
|
||||
"""
|
||||
Compute the gradients of the lower bound wrt the canonical and
|
||||
expectation parameters of u.
|
||||
|
||||
Note that the natural gradient in either parameterisation is given by the ordinary gradient in the other (see Hensman et al. 2012, "Fast Variational Inference in the Conjugate Exponential Family").
|
||||
"""
|
||||
|
||||
# Gradient for eta
|
||||
dL_dmmT_S = -0.5*self.Lambda/self.data_prop + 0.5*self.q_u_prec
|
||||
Kmmipsi1V,_ = dpotrs(self.Lm,self.psi1V,lower=1)
|
||||
dL_dm = (Kmmipsi1V - np.dot(self.Lambda,self.q_u_mean))/self.data_prop
|
||||
|
||||
# Gradients for theta
|
||||
S = self.q_u_cov
|
||||
Si = self.q_u_prec
|
||||
m = self.q_u_mean
|
||||
dL_dSi = -mdot(S,dL_dmmT_S, S)
|
||||
|
||||
dL_dmhSi = -2*dL_dSi
|
||||
dL_dSim = np.dot(dL_dSi,m) + np.dot(Si, dL_dm)
|
||||
|
||||
return np.hstack((dL_dm.flatten(),dL_dmmT_S.flatten())) , np.hstack((dL_dSim.flatten(), dL_dmhSi.flatten()))
|
||||
|
||||
|
||||
def optimize(self, iterations, print_interval=10, callback=lambda:None, callback_interval=5):
|
||||
|
||||
param_step = 0.
|
||||
|
||||
#Iterate!
|
||||
for i in range(iterations):
|
||||
|
||||
#store the current configuration for plotting later
|
||||
self._param_trace.append(self._get_params())
|
||||
self._ll_trace.append(self.log_likelihood() + self.log_prior())
|
||||
|
||||
#load a batch and do the appropriate computations (kernel matrices, etc)
|
||||
self.load_batch()
|
||||
|
||||
#compute the (stochastic) gradient
|
||||
natgrads = self.vb_grad_natgrad()
|
||||
grads = self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
|
||||
self._grad_trace.append(grads)
|
||||
|
||||
#compute the steps in all parameters
|
||||
vb_step = self.vb_steplength*natgrads[0]
|
||||
#only move the parameters after the first epoch and only if the steplength is nonzero
|
||||
if (self.epochs>=1) and (self.param_steplength > 0):
|
||||
param_step = self.momentum*param_step + self.param_steplength*grads
|
||||
else:
|
||||
param_step = 0.
|
||||
|
||||
self.set_vb_param(self.get_vb_param() + vb_step)
|
||||
#Note: don't recompute everything here, wait until the next iteration when we have a new batch
|
||||
self._set_params(self._untransform_params(self._get_params_transformed() + param_step), computations=False)
|
||||
|
||||
#print messages if desired
|
||||
if i and (not i%print_interval):
|
||||
print i, np.mean(self._ll_trace[-print_interval:]) #, self.log_likelihood()
|
||||
print np.round(np.mean(self._grad_trace[-print_interval:],0),3)
|
||||
sys.stdout.flush()
|
||||
|
||||
#callback
|
||||
if i and not i%callback_interval:
|
||||
callback(self) # Change this to callback()
|
||||
time.sleep(0.01)
|
||||
|
||||
if self.epochs > 10:
|
||||
self._adapt_steplength()
|
||||
self._vb_steplength_trace.append(self.vb_steplength)
|
||||
self._param_steplength_trace.append(self.param_steplength)
|
||||
|
||||
self.iterations += 1
|
||||
|
||||
|
||||
def _adapt_steplength(self):
|
||||
if self.adapt_vb_steplength:
|
||||
# self._adaptive_vb_steplength()
|
||||
self._adaptive_vb_steplength_KL()
|
||||
#self._vb_steplength_trace.append(self.vb_steplength)
|
||||
assert self.vb_steplength >= 0
|
||||
|
||||
if self.adapt_param_steplength:
|
||||
self._adaptive_param_steplength()
|
||||
# self._adaptive_param_steplength_log()
|
||||
# self._adaptive_param_steplength_from_vb()
|
||||
#self._param_steplength_trace.append(self.param_steplength)
|
||||
|
||||
def _adaptive_param_steplength(self):
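# A sketch of the scheme (cf. adaptive learning rates for stochastic variational
# inference, Ranganath et al. 2013): keep running averages gbar of the stochastic
# gradient and hbar of its squared norm over an effective window tau, set the step
# to (gbar^T gbar) / hbar, and then grow tau by tau <- tau * (1 - step) + 1.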
|
||||
if hasattr(self, 'adapt_param_steplength_decr'):
|
||||
decr_factor = self.adapt_param_steplength_decr
|
||||
else:
|
||||
decr_factor = 0.02
|
||||
g_tp = self._transform_gradients(self._log_likelihood_gradients())
|
||||
self.gbar_tp = (1-1/self.tau_tp)*self.gbar_tp + 1/self.tau_tp * g_tp
|
||||
self.hbar_tp = (1-1/self.tau_tp)*self.hbar_tp + 1/self.tau_tp * np.dot(g_tp.T, g_tp)
|
||||
new_param_steplength = np.dot(self.gbar_tp.T, self.gbar_tp) / self.hbar_tp
|
||||
#- hack
|
||||
new_param_steplength *= decr_factor
|
||||
self.param_steplength = (self.param_steplength + new_param_steplength)/2
|
||||
#-
|
||||
self.tau_tp = self.tau_tp*(1-self.param_steplength) + 1
|
||||
|
||||
def _adaptive_param_steplength_log(self):
|
||||
stp = np.logspace(np.log(0.0001), np.log(1e-6), base=np.e, num=18000)
|
||||
self.param_steplength = stp[self.iterations]
|
||||
|
||||
def _adaptive_param_steplength_log2(self):
|
||||
self.param_steplength = (self.iterations + 0.001)**-0.5
|
||||
|
||||
def _adaptive_param_steplength_from_vb(self):
|
||||
self.param_steplength = self.vb_steplength * 0.01
|
||||
|
||||
def _adaptive_vb_steplength(self):
|
||||
decr_factor = 0.1
|
||||
g_t = self.vb_grad_natgrad()[0]
|
||||
self.gbar_t = (1-1/self.tau_t)*self.gbar_t + 1/self.tau_t * g_t
|
||||
self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t.T, g_t)
|
||||
new_vb_steplength = np.dot(self.gbar_t.T, self.gbar_t) / self.hbar_t
|
||||
#- hack
|
||||
new_vb_steplength *= decr_factor
|
||||
self.vb_steplength = (self.vb_steplength + new_vb_steplength)/2
|
||||
#-
|
||||
self.tau_t = self.tau_t*(1-self.vb_steplength) + 1
|
||||
|
||||
def _adaptive_vb_steplength_KL(self):
|
||||
decr_factor = 0.1
|
||||
natgrad = self.vb_grad_natgrad()
|
||||
g_t1 = natgrad[0]
|
||||
g_t2 = natgrad[1]
|
||||
self.gbar_t1 = (1-1/self.tau_t)*self.gbar_t1 + 1/self.tau_t * g_t1
|
||||
self.gbar_t2 = (1-1/self.tau_t)*self.gbar_t2 + 1/self.tau_t * g_t2
|
||||
self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t1.T, g_t2)
|
||||
self.vb_steplength = np.dot(self.gbar_t1.T, self.gbar_t2) / self.hbar_t
|
||||
self.vb_steplength *= decr_factor
|
||||
self.tau_t = self.tau_t*(1-self.vb_steplength) + 1
|
||||
|
||||
def _raw_predict(self, X_new, X_variance_new=None, which_parts='all',full_cov=False):
|
||||
"""Internal helper function for making predictions, does not account for normalization"""
|
||||
|
||||
#TODO: make this more efficient!
|
||||
self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
|
||||
tmp = self.Kmmi- mdot(self.Kmmi,self.q_u_cov,self.Kmmi)
|
||||
|
||||
if X_variance_new is None:
|
||||
Kx = self.kern.K(X_new,self.Z)
|
||||
mu = np.dot(Kx,self.Kmmi_m)
|
||||
if full_cov:
|
||||
Kxx = self.kern.K(X_new)
|
||||
var = Kxx - mdot(Kx,tmp,Kx.T)
|
||||
else:
|
||||
Kxx = self.kern.Kdiag(X_new)
|
||||
var = (Kxx - np.sum(Kx*np.dot(Kx,tmp),1))[:,None]
|
||||
return mu, var
|
||||
else:
|
||||
assert X_variance_new.shape == X_new.shape
|
||||
Kx = self.kern.psi1(self.Z,X_new, X_variance_new)
|
||||
mu = np.dot(Kx,self.Kmmi_m)
|
||||
Kxx = self.kern.psi0(self.Z,X_new,X_variance_new)
|
||||
psi2 = self.kern.psi2(self.Z,X_new,X_variance_new)
|
||||
diag_var = Kxx - np.sum(np.sum(psi2*tmp[None,:,:],1),1)
|
||||
if full_cov:
|
||||
raise NotImplementedError
|
||||
else:
|
||||
return mu, diag_var[:,None]
|
||||
|
||||
def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False, sampling=False, num_samples=15000):
|
||||
# normalize X values
|
||||
Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
|
||||
if X_variance_new is not None:
|
||||
X_variance_new = X_variance_new / self._Xscale ** 2
|
||||
|
||||
# here's the actual prediction by the GP model
|
||||
mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts)
|
||||
|
||||
# now push through likelihood
|
||||
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, sampling=sampling, num_samples=num_samples)
|
||||
|
||||
return mean, var, _025pm, _975pm
|
||||
|
||||
|
||||
def set_vb_param(self,vb_param):
|
||||
"""set the distribution q(u) from the canonical parameters"""
|
||||
self.q_u_canonical_flat = vb_param.copy()
|
||||
self.q_u_canonical = self.q_u_canonical_flat[:self.num_inducing*self.output_dim].reshape(self.num_inducing,self.output_dim),self.q_u_canonical_flat[self.num_inducing*self.output_dim:].reshape(self.num_inducing,self.num_inducing)
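# These are the natural (canonical) parameters of a Gaussian q(u): the first
# num_inducing*output_dim entries hold Sigma^{-1} mu and the remaining block holds
# -0.5 * Sigma^{-1}, which is undone below to recover the mean and covariance.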
|
||||
|
||||
self.q_u_prec = -2.*self.q_u_canonical[1]
|
||||
self.q_u_cov, q_u_Li, q_u_L, tmp = pdinv(self.q_u_prec)
|
||||
self.q_u_Li = q_u_Li
|
||||
self.q_u_logdet = -tmp
|
||||
self.q_u_mean, _ = dpotrs(q_u_Li, np.asfortranarray(self.q_u_canonical[0]),lower=1)
|
||||
|
||||
self.q_u_expectation = (self.q_u_mean, np.dot(self.q_u_mean,self.q_u_mean.T)+self.q_u_cov*self.output_dim)
|
||||
|
||||
|
||||
def get_vb_param(self):
|
||||
"""
|
||||
Return the canonical parameters of the distribution q(u)
|
||||
"""
|
||||
return self.q_u_canonical_flat
|
||||
|
||||
|
||||
def plot(self, ax=None, fignum=None, Z_height=None, **kwargs):
|
||||
|
||||
if ax is None:
|
||||
fig = pb.figure(num=fignum)
|
||||
ax = fig.add_subplot(111)
|
||||
|
||||
#horrible hack here:
|
||||
data = self.likelihood.data.copy()
|
||||
self.likelihood.data = self.Y
|
||||
GPBase.plot(self, ax=ax, **kwargs)
|
||||
self.likelihood.data = data
|
||||
|
||||
Zu = self.Z * self._Xscale + self._Xoffset
|
||||
if self.input_dim==1:
|
||||
ax.plot(self.X_batch, self.likelihood.data, 'gx',mew=2)
|
||||
if Z_height is None:
|
||||
Z_height = ax.get_ylim()[0]
|
||||
ax.plot(Zu, np.zeros_like(Zu) + Z_height, 'r|', mew=1.5, markersize=12)
|
||||
|
||||
if self.input_dim==2:
|
||||
ax.scatter(self.X[:,0], self.X[:,1], 20., self.Y[:,0], linewidth=0, cmap=pb.cm.jet)
|
||||
ax.plot(Zu[:,0], Zu[:,1], 'w^')
|
||||
|
||||
def plot_traces(self):
|
||||
pb.figure()
|
||||
t = np.array(self._param_trace)
|
||||
pb.subplot(2,1,1)
|
||||
for l,ti in zip(self._get_param_names(),t.T):
|
||||
if not l[:3]=='iip':
|
||||
pb.plot(ti,label=l)
|
||||
pb.legend(loc=0)
|
||||
|
||||
pb.subplot(2,1,2)
|
||||
pb.plot(np.asarray(self._ll_trace),label='stochastic likelihood')
|
||||
pb.legend(loc=0)
|
||||
420
GPy/core/symbolic.py
Normal file
|
|
@ -0,0 +1,420 @@
|
|||
# Copyright (c) 2014, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import sys
|
||||
import re
|
||||
from ..core.parameterization import Parameterized
|
||||
import numpy as np
|
||||
import sympy as sym
|
||||
from ..core.parameterization import Param
|
||||
from sympy.utilities.lambdify import lambdastr, _imp_namespace, _get_namespace
|
||||
from sympy.utilities.iterables import numbered_symbols
|
||||
import scipy
|
||||
import GPy
|
||||
|
||||
|
||||
def getFromDict(dataDict, mapList):
|
||||
return reduce(lambda d, k: d[k], mapList, dataDict)
|
||||
|
||||
def setInDict(dataDict, mapList, value):
|
||||
getFromDict(dataDict, mapList[:-1])[mapList[-1]] = value
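# Example: with d = {'a': {'b': 1}}, getFromDict(d, ['a', 'b']) returns 1 and
# setInDict(d, ['a', 'b'], 2) sets d['a']['b'] = 2.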
|
||||
|
||||
class Symbolic_core():
|
||||
"""
|
||||
Base model symbolic class.
|
||||
"""
|
||||
|
||||
def __init__(self, expressions, cacheable, derivatives=None, parameters=None, func_modules=[]):
|
||||
# Base class init, do some basic derivatives etc.
|
||||
|
||||
# Func_modules sets up the right mapping for functions.
|
||||
func_modules += [{'gamma':scipy.special.gamma,
|
||||
'gammaln':scipy.special.gammaln,
|
||||
'erf':scipy.special.erf, 'erfc':scipy.special.erfc,
|
||||
'erfcx':scipy.special.erfcx,
|
||||
'polygamma':scipy.special.polygamma,
|
||||
'normcdf':GPy.util.functions.normcdf,
|
||||
'normcdfln':GPy.util.functions.normcdfln,
|
||||
'logistic':GPy.util.functions.logistic,
|
||||
'logisticln':GPy.util.functions.logisticln},
|
||||
'numpy']
|
||||
|
||||
self._set_expressions(expressions)
|
||||
self._set_variables(cacheable)
|
||||
self._set_derivatives(derivatives)
|
||||
self._set_parameters(parameters)
|
||||
# Convert the expressions to a list for common sub expression elimination
|
||||
# We should find the following type of expressions: 'function', 'derivative', 'second_derivative', 'third_derivative'.
|
||||
self.update_expression_list()
|
||||
|
||||
# Apply any global stabilisation operations to expressions.
|
||||
self.global_stabilize()
|
||||
|
||||
# Helper functions to get data in and out of dictionaries.
|
||||
# this code from http://stackoverflow.com/questions/14692690/access-python-nested-dictionary-items-via-a-list-of-keys
|
||||
|
||||
self.extract_sub_expressions()
|
||||
self._gen_code()
|
||||
self._set_namespace(func_modules)
|
||||
|
||||
def _set_namespace(self, namespaces):
|
||||
"""Set the name space for use when calling eval. This needs to contain all the relvant functions for mapping from symbolic python to the numerical python. It also contains variables, cached portions etc."""
|
||||
self.namespace = {}
|
||||
for m in namespaces[::-1]:
|
||||
buf = _get_namespace(m)
|
||||
self.namespace.update(buf)
|
||||
self.namespace.update(self.__dict__)
|
||||
|
||||
def _set_expressions(self, expressions):
|
||||
"""Extract expressions and variables from the user provided expressions."""
|
||||
self.expressions = {}
|
||||
for key, item in expressions.items():
|
||||
self.expressions[key] = {'function': item}
|
||||
|
||||
def _set_variables(self, cacheable):
|
||||
"""Pull the variable names out of the provided expressions and separate into cacheable expressions and normal parameters. Those that are only stored in the cache, the parameters are stored in this object."""
|
||||
# pull the parameters and inputs out of the symbolic pdf
|
||||
def extract_vars(expr):
|
||||
return [e for e in expr.atoms() if e.is_Symbol and e not in vars]
|
||||
self.cacheable = cacheable
|
||||
self.variables = {}
|
||||
vars = []
|
||||
for expression in self.expressions.values():
|
||||
vars += extract_vars(expression['function'])
|
||||
# inputs are assumed to be those things that are
|
||||
# cacheable. I.e. those things that aren't stored within the
|
||||
# object except as cached. For covariance functions this is X
|
||||
# and Z, for likelihoods F and for mapping functions X.
|
||||
self.cacheable_vars = [] # list of everything that's cacheable
|
||||
for var in cacheable:
|
||||
self.variables[var] = [e for e in vars if e.name.split('_')[0]==var.lower()]
|
||||
self.cacheable_vars += self.variables[var]
|
||||
for var in cacheable:
|
||||
if not self.variables[var]:
|
||||
raise ValueError('Variable ' + var + ' was specified as cacheable but is not in expression. Expected to find symbols of the form ' + var.lower() + '_0 to represent ' + var)
|
||||
|
||||
# things that aren't cacheable are assumed to be parameters.
|
||||
self.variables['theta'] = sorted([e for e in vars if not e in self.cacheable_vars],key=lambda e:e.name)
|
||||
|
||||
def _set_derivatives(self, derivatives):
|
||||
# these are arguments for computing derivatives.
|
||||
def extract_derivative(function, derivative_arguments):
|
||||
return {theta.name : self.stabilize(sym.diff(function,theta)) for theta in derivative_arguments}
|
||||
derivative_arguments = []
|
||||
if derivatives is not None:
|
||||
for derivative in derivatives:
|
||||
derivative_arguments += self.variables[derivative]
|
||||
|
||||
# Do symbolic work to compute derivatives.
|
||||
for key, func in self.expressions.items():
|
||||
# if func['function'].is_Matrix:
|
||||
# rows = func['function'].shape[0]
|
||||
# cols = func['function'].shape[1]
|
||||
# self.expressions[key]['derivative'] = sym.zeros(rows, cols)
|
||||
# for i in xrange(rows):
|
||||
# for j in xrange(cols):
|
||||
# self.expressions[key]['derivative'][i, j] = extract_derivative(func['function'][i, j], derivative_arguments)
|
||||
# else:
|
||||
self.expressions[key]['derivative'] = extract_derivative(func['function'], derivative_arguments)
|
||||
|
||||
def _set_parameters(self, parameters):
|
||||
"""Add parameters to the model and initialize with given values."""
|
||||
for theta in self.variables['theta']:
|
||||
val = 1.0
|
||||
# TODO: improve approach for initializing parameters.
|
||||
if parameters is not None:
|
||||
if parameters.has_key(theta.name):
|
||||
val = parameters[theta.name]
|
||||
# Add parameter.
|
||||
|
||||
self.link_parameters(Param(theta.name, val, None))
|
||||
#self._set_attribute(theta.name, )
|
||||
|
||||
def eval_parameters_changed(self):
|
||||
# TODO: place checks for inf/nan in here
|
||||
# do all the precomputation codes.
|
||||
self.eval_update_cache()
|
||||
|
||||
def eval_update_cache(self, **kwargs):
|
||||
# TODO: place checks for inf/nan in here
|
||||
# for all provided keywords
|
||||
|
||||
for var, code in self.variable_sort(self.code['parameters_changed']):
|
||||
self._set_attribute(var, eval(code, self.namespace))
|
||||
|
||||
for var, value in kwargs.items():
|
||||
# update their cached values
|
||||
if value is not None:
|
||||
if var == 'X' or var == 'F' or var == 'M':
|
||||
value = np.atleast_2d(value)
|
||||
for i, theta in enumerate(self.variables[var]):
|
||||
self._set_attribute(theta.name, value[:, i][:, None])
|
||||
elif var == 'Y':
|
||||
# Y values can be missing.
|
||||
value = np.atleast_2d(value)
|
||||
for i, theta in enumerate(self.variables[var]):
|
||||
self._set_attribute('missing' + str(i), np.isnan(value[:, i]))
|
||||
self._set_attribute(theta.name, value[:, i][:, None])
|
||||
elif var == 'Z':
|
||||
value = np.atleast_2d(value)
|
||||
for i, theta in enumerate(self.variables[var]):
|
||||
self._set_attribute(theta.name, value[:, i][None, :])
|
||||
else:
|
||||
value = np.atleast_1d(value)
|
||||
for i, theta in enumerate(self.variables[var]):
|
||||
self._set_attribute(theta.name, value[i])
|
||||
for var, code in self.variable_sort(self.code['update_cache']):
|
||||
self._set_attribute(var, eval(code, self.namespace))
|
||||
|
||||
def eval_update_gradients(self, function, partial, **kwargs):
|
||||
# TODO: place checks for inf/nan in here?
|
||||
self.eval_update_cache(**kwargs)
|
||||
gradient = {}
|
||||
for theta in self.variables['theta']:
|
||||
code = self.code[function]['derivative'][theta.name]
|
||||
gradient[theta.name] = (partial*eval(code, self.namespace)).sum()
|
||||
return gradient
|
||||
|
||||
def eval_gradients_X(self, function, partial, **kwargs):
|
||||
if kwargs.has_key('X'):
|
||||
gradients_X = np.zeros_like(kwargs['X'])
|
||||
self.eval_update_cache(**kwargs)
|
||||
for i, theta in enumerate(self.variables['X']):
|
||||
code = self.code[function]['derivative'][theta.name]
|
||||
gradients_X[:, i:i+1] = partial*eval(code, self.namespace)
|
||||
return gradients_X
|
||||
|
||||
def eval_function(self, function, **kwargs):
|
||||
self.eval_update_cache(**kwargs)
|
||||
return eval(self.code[function]['function'], self.namespace)
|
||||
|
||||
def code_parameters_changed(self):
|
||||
# do all the precomputation codes.
|
||||
lcode = ''
|
||||
for variable, code in self.variable_sort(self.code['parameters_changed']):
|
||||
lcode += self._print_code(variable) + ' = ' + self._print_code(code) + '\n'
|
||||
return lcode
|
||||
|
||||
def code_update_cache(self):
|
||||
lcode = ''
|
||||
for var in self.cacheable:
|
||||
lcode += 'if ' + var + ' is not None:\n'
|
||||
if var == 'X':
|
||||
reorder = '[:, None]'
|
||||
elif var == 'Z':
|
||||
reorder = '[None, :]'
|
||||
else:
|
||||
reorder = ''
|
||||
for i, theta in enumerate(self.variables[var]):
|
||||
lcode+= "\t" + var + '= np.atleast_2d(' + var + ')\n'
|
||||
lcode+= "\t" + self._print_code(theta.name) + ' = ' + var + '[:, ' + str(i) + "]" + reorder + "\n"
|
||||
|
||||
for variable, code in self.variable_sort(self.code['update_cache']):
|
||||
lcode+= self._print_code(variable) + ' = ' + self._print_code(code) + "\n"
|
||||
|
||||
return lcode
|
||||
|
||||
def code_update_gradients(self, function):
|
||||
lcode = ''
|
||||
for theta in self.variables['theta']:
|
||||
code = self.code[function]['derivative'][theta.name]
|
||||
lcode += self._print_code(theta.name) + '.gradient = (partial*(' + self._print_code(code) + ')).sum()\n'
|
||||
return lcode
|
||||
|
||||
def code_gradients_cacheable(self, function, variable):
|
||||
if variable not in self.cacheable:
|
||||
raise RuntimeError, variable + ' must be cacheable.'
|
||||
lcode = 'gradients_' + variable + ' = np.zeros_like(' + variable + ')\n'
|
||||
lcode += 'self.update_cache(' + ', '.join(self.cacheable) + ')\n'
|
||||
for i, theta in enumerate(self.variables[variable]):
|
||||
code = self.code[function]['derivative'][theta.name]
|
||||
lcode += 'gradients_' + variable + '[:, ' + str(i) + ':' + str(i) + '+1] = partial*' + self._print_code(code) + '\n'
|
||||
lcode += 'return gradients_' + variable + '\n'
|
||||
return lcode
|
||||
|
||||
def code_function(self, function):
|
||||
lcode = 'self.update_cache(' + ', '.join(self.cacheable) + ')\n'
|
||||
lcode += 'return ' + self._print_code(self.code[function]['function'])
|
||||
return lcode
|
||||
|
||||
def stabilize(self, expr):
|
||||
"""Stabilize the code in the model."""
|
||||
# this code is applied to expressions in the model in an attempt to stabilize them.
|
||||
return expr
|
||||
|
||||
def global_stabilize(self):
|
||||
"""Stabilize all code in the model."""
|
||||
pass
|
||||
|
||||
def _set_attribute(self, name, value):
|
||||
"""Make sure namespace gets updated when setting attributes."""
|
||||
setattr(self, name, value)
|
||||
self.namespace.update({name: getattr(self, name)})
|
||||
|
||||
|
||||
def update_expression_list(self):
|
||||
"""Extract a list of expressions from the dictionary of expressions."""
|
||||
self.expression_list = [] # code arrives in dictionary, but is passed in this list
|
||||
self.expression_keys = [] # Keep track of the dictionary keys.
|
||||
self.expression_order = [] # This may be unnecessary. It's to give an ordering for cse
|
||||
for fname, fexpressions in self.expressions.items():
|
||||
for type, texpressions in fexpressions.items():
|
||||
if type == 'function':
|
||||
self.expression_list.append(texpressions)
|
||||
self.expression_keys.append([fname, type])
|
||||
self.expression_order.append(1)
|
||||
elif type[-10:] == 'derivative':
|
||||
for dtype, expression in texpressions.items():
|
||||
self.expression_list.append(expression)
|
||||
self.expression_keys.append([fname, type, dtype])
|
||||
if type[:-10] == 'first_' or type[:-10] == '':
|
||||
self.expression_order.append(3) #sym.count_ops(self.expressions[type][dtype]))
|
||||
elif type[:-10] == 'second_':
|
||||
self.expression_order.append(4) #sym.count_ops(self.expressions[type][dtype]))
|
||||
elif type[:-10] == 'third_':
|
||||
self.expression_order.append(5) #sym.count_ops(self.expressions[type][dtype]))
|
||||
else:
|
||||
self.expression_list.append(fexpressions[type])
|
||||
self.expression_keys.append([fname, type])
|
||||
self.expression_order.append(2)
|
||||
|
||||
# This step may be unnecessary.
|
||||
# Not 100% sure if the sub expression elimination is order sensitive. This step orders the list with the 'function' code first and derivatives after.
|
||||
self.expression_order, self.expression_list, self.expression_keys = zip(*sorted(zip(self.expression_order, self.expression_list, self.expression_keys)))
|
||||
|
||||
def extract_sub_expressions(self, cache_prefix='cache', sub_prefix='sub', prefix='XoXoXoX'):
|
||||
# Do the common sub expression elimination.
|
||||
common_sub_expressions, expression_substituted_list = sym.cse(self.expression_list, numbered_symbols(prefix=prefix))
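# sym.cse returns a pair (replacements, reduced_expressions); e.g. for the input
# [x**2 + y, 2*(x**2 + y)] it would give ([(XoXoXoX0, x**2 + y)], [XoXoXoX0, 2*XoXoXoX0])
# using the numbered-symbol prefix chosen above.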
|
||||
|
||||
self.variables[cache_prefix] = []
|
||||
self.variables[sub_prefix] = []
|
||||
|
||||
# Create dictionary of new sub expressions
|
||||
sub_expression_dict = {}
|
||||
for var, void in common_sub_expressions:
|
||||
sub_expression_dict[var.name] = var
|
||||
|
||||
# Sort out any expression that's dependent on something that scales with data size (these are listed in cacheable).
|
||||
cacheable_list = []
|
||||
params_change_list = []
|
||||
# common_sub_expressions contains a list of paired tuples with the new variable and what it equals
|
||||
for var, expr in common_sub_expressions:
|
||||
arg_list = [e for e in expr.atoms() if e.is_Symbol]
|
||||
# List any cacheable dependencies of the sub-expression
|
||||
cacheable_symbols = [e for e in arg_list if e in cacheable_list or e in self.cacheable_vars]
|
||||
if cacheable_symbols:
|
||||
# list which ensures dependencies are cacheable.
|
||||
cacheable_list.append(var)
|
||||
else:
|
||||
params_change_list.append(var)
|
||||
|
||||
replace_dict = {}
|
||||
for i, expr in enumerate(cacheable_list):
|
||||
sym_var = sym.var(cache_prefix + str(i))
|
||||
self.variables[cache_prefix].append(sym_var)
|
||||
replace_dict[expr.name] = sym_var
|
||||
|
||||
for i, expr in enumerate(params_change_list):
|
||||
sym_var = sym.var(sub_prefix + str(i))
|
||||
self.variables[sub_prefix].append(sym_var)
|
||||
replace_dict[expr.name] = sym_var
|
||||
|
||||
for replace, void in common_sub_expressions:
|
||||
for expr, keys in zip(expression_substituted_list, self.expression_keys):
|
||||
setInDict(self.expressions, keys, expr.subs(replace, replace_dict[replace.name]))
|
||||
for void, expr in common_sub_expressions:
|
||||
expr = expr.subs(replace, replace_dict[replace.name])
|
||||
|
||||
# Replace original code with code including subexpressions.
|
||||
for keys in self.expression_keys:
|
||||
for replace, void in common_sub_expressions:
|
||||
setInDict(self.expressions, keys, getFromDict(self.expressions, keys).subs(replace, replace_dict[replace.name]))
|
||||
|
||||
self.expressions['parameters_changed'] = {}
|
||||
self.expressions['update_cache'] = {}
|
||||
for var, expr in common_sub_expressions:
|
||||
for replace, void in common_sub_expressions:
|
||||
expr = expr.subs(replace, replace_dict[replace.name])
|
||||
if var in cacheable_list:
|
||||
self.expressions['update_cache'][replace_dict[var.name].name] = expr
|
||||
else:
|
||||
self.expressions['parameters_changed'][replace_dict[var.name].name] = expr
|
||||
|
||||
|
||||
def _gen_code(self):
|
||||
"""Generate code for the list of expressions provided using the common sub-expression eliminator to separate out portions that are computed multiple times."""
|
||||
# This is the dictionary that stores all the generated code.
|
||||
|
||||
self.code = {}
|
||||
def match_key(expr):
|
||||
if type(expr) is dict:
|
||||
code = {}
|
||||
for key in expr.keys():
|
||||
code[key] = match_key(expr[key])
|
||||
else:
|
||||
arg_list = [e for e in expr.atoms() if e.is_Symbol]
|
||||
code = self._expr2code(arg_list, expr)
|
||||
return code
|
||||
|
||||
self.code = match_key(self.expressions)
|
||||
|
||||
|
||||
def _expr2code(self, arg_list, expr):
|
||||
"""Convert the given symbolic expression into code."""
|
||||
code = lambdastr(arg_list, expr)
|
||||
function_code = code.split(':')[1].strip()
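# lambdastr produces a string of the form "lambda x_0: (x_0**2)"; splitting on ':'
# and stripping keeps only the expression body, which is later run through eval().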
|
||||
#for arg in arg_list:
|
||||
# function_code = function_code.replace(arg.name, 'self.'+arg.name)
|
||||
|
||||
return function_code
|
||||
|
||||
def _print_code(self, code):
|
||||
"""Prepare code for string writing."""
|
||||
# This needs a rewrite --- it doesn't check for name clashes! Replacing 'sub1' would also match inside 'sub11', so longer variable names can be corrupted.
|
||||
for key in self.variables.keys():
|
||||
for arg in self.variables[key]:
|
||||
code = code.replace(arg.name, 'self.'+arg.name)
|
||||
return code
|
||||
|
||||
def _display_expression(self, keys, user_substitutes={}):
|
||||
"""Helper function for human friendly display of the symbolic components."""
|
||||
# Create some pretty maths symbols for the display.
|
||||
sigma, alpha, nu, omega, l, variance = sym.var('\sigma, \alpha, \nu, \omega, \ell, \sigma^2')
|
||||
substitutes = {'scale': sigma, 'shape': alpha, 'lengthscale': l, 'variance': variance}
|
||||
substitutes.update(user_substitutes)
|
||||
|
||||
function_substitutes = {normcdfln : lambda arg : sym.log(normcdf(arg)),
|
||||
logisticln : lambda arg : -sym.log(1+sym.exp(-arg)),
|
||||
logistic : lambda arg : 1/(1+sym.exp(-arg)),
|
||||
erfcx : lambda arg : erfc(arg)/sym.exp(arg*arg),
|
||||
gammaln : lambda arg : sym.log(sym.gamma(arg))}
|
||||
expr = getFromDict(self.expressions, keys)
|
||||
for var_name, sub in self.variable_sort(self.expressions['update_cache'], reverse=True):
|
||||
for var in self.variables['cache']:
|
||||
if var_name == var.name:
|
||||
expr = expr.subs(var, sub)
|
||||
break
|
||||
for var_name, sub in self.variable_sort(self.expressions['parameters_changed'], reverse=True):
|
||||
for var in self.variables['sub']:
|
||||
if var_name == var.name:
|
||||
expr = expr.subs(var, sub)
|
||||
break
|
||||
|
||||
for var_name, sub in self.variable_sort(substitutes, reverse=True):
|
||||
for var in self.variables['theta']:
|
||||
if var_name == var.name:
|
||||
expr = expr.subs(var, sub)
|
||||
break
|
||||
for m, r in function_substitutes.iteritems():
|
||||
expr = expr.replace(m, r)#normcdfln, lambda arg : sym.log(normcdf(arg)))
|
||||
return expr.simplify()
|
||||
|
||||
def variable_sort(self, var_dict, reverse=False):
|
||||
def sort_key(x):
|
||||
digits = re.findall(r'\d+$', x[0])
|
||||
if digits:
|
||||
return int(digits[0])
|
||||
else:
|
||||
return x[0]
|
||||
|
||||
return sorted(var_dict.iteritems(), key=sort_key, reverse=reverse)
|
||||
|
|
@ -1,143 +0,0 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
from GPy.core.domains import POSITIVE, NEGATIVE, BOUNDED
|
||||
import sys
|
||||
lim_val = -np.log(sys.float_info.epsilon)
|
||||
|
||||
class transformation(object):
|
||||
domain = None
|
||||
def f(self, x):
|
||||
raise NotImplementedError
|
||||
|
||||
def finv(self, x):
|
||||
raise NotImplementedError
|
||||
|
||||
def gradfactor(self, f):
|
||||
""" df_dx evaluated at self.f(x)=f"""
|
||||
raise NotImplementedError
|
||||
|
||||
def initialize(self, f):
|
||||
""" produce a sensible initial value for f(x)"""
|
||||
raise NotImplementedError
|
||||
|
||||
def __str__(self):
|
||||
raise NotImplementedError
|
||||
|
||||
class logexp(transformation):
|
||||
domain = POSITIVE
|
||||
def f(self, x):
|
||||
return np.where(x<-lim_val, np.log(1+np.exp(-lim_val)), np.where(x>lim_val, x, np.log(1. + np.exp(x))))
|
||||
def finv(self, f):
|
||||
return np.where(f>lim_val, f, np.log(np.exp(f) - 1.))
|
||||
def gradfactor(self, f):
|
||||
return np.where(f>lim_val, 1., 1 - np.exp(-f))
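# logexp is the softplus map f(x) = log(1 + exp(x)); its inverse is log(exp(f) - 1)
# and df/dx expressed in terms of f is sigmoid(x) = 1 - exp(-f), which is what
# gradfactor returns (the large-f branch just returns the limit 1).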
|
||||
def initialize(self, f):
|
||||
if np.any(f < 0.):
|
||||
print "Warning: changing parameters to satisfy constraints"
|
||||
return np.abs(f)
|
||||
def __str__(self):
|
||||
return '(+ve)'
|
||||
|
||||
class negative_logexp(transformation):
|
||||
domain = NEGATIVE
|
||||
def f(self, x):
|
||||
return -logexp.f(x)
|
||||
def finv(self, f):
|
||||
return logexp.finv(-f)
|
||||
def gradfactor(self, f):
|
||||
return -logexp.gradfactor(-f)
|
||||
def initialize(self, f):
|
||||
return -logexp.initialize(f)
|
||||
def __str__(self):
|
||||
return '(-ve)'
|
||||
|
||||
class logexp_clipped(logexp):
|
||||
max_bound = 1e100
|
||||
min_bound = 1e-10
|
||||
log_max_bound = np.log(max_bound)
|
||||
log_min_bound = np.log(min_bound)
|
||||
domain = POSITIVE
|
||||
def __init__(self, lower=1e-6):
|
||||
self.lower = lower
|
||||
def f(self, x):
|
||||
exp = np.exp(np.clip(x, self.log_min_bound, self.log_max_bound))
|
||||
f = np.log(1. + exp)
|
||||
# if np.isnan(f).any():
|
||||
# import ipdb;ipdb.set_trace()
|
||||
return np.clip(f, self.min_bound, self.max_bound)
|
||||
def finv(self, f):
|
||||
return np.log(np.exp(f - 1.))
|
||||
def gradfactor(self, f):
|
||||
ef = np.exp(f) # np.clip(f, self.min_bound, self.max_bound))
|
||||
gf = (ef - 1.) / ef
|
||||
return gf # np.where(f < self.lower, 0, gf)
|
||||
def initialize(self, f):
|
||||
if np.any(f < 0.):
|
||||
print "Warning: changing parameters to satisfy constraints"
|
||||
return np.abs(f)
|
||||
def __str__(self):
|
||||
return '(+ve_c)'
|
||||
|
||||
class exponent(transformation):
|
||||
domain = POSITIVE
|
||||
def f(self, x):
|
||||
return np.where(x<lim_val, np.where(x>-lim_val, np.exp(x), np.exp(-lim_val)), np.exp(lim_val))
|
||||
def finv(self, x):
|
||||
return np.log(x)
|
||||
def gradfactor(self, f):
|
||||
return f
|
||||
def initialize(self, f):
|
||||
if np.any(f < 0.):
|
||||
print "Warning: changing parameters to satisfy constraints"
|
||||
return np.abs(f)
|
||||
def __str__(self):
|
||||
return '(+ve)'
|
||||
|
||||
class negative_exponent(exponent):
|
||||
domain = NEGATIVE
|
||||
def f(self, x):
|
||||
return -exponent.f(x)
|
||||
def finv(self, f):
|
||||
return exponent.finv(-f)
|
||||
def gradfactor(self, f):
|
||||
return f
|
||||
def initialize(self, f):
|
||||
return -exponent.initialize(f) #np.abs(f)
|
||||
def __str__(self):
|
||||
return '(-ve)'
|
||||
|
||||
class square(transformation):
|
||||
domain = POSITIVE
|
||||
def f(self, x):
|
||||
return x ** 2
|
||||
def finv(self, x):
|
||||
return np.sqrt(x)
|
||||
def gradfactor(self, f):
|
||||
return 2 * np.sqrt(f)
|
||||
def initialize(self, f):
|
||||
return np.abs(f)
|
||||
def __str__(self):
|
||||
return '(+sq)'
|
||||
|
||||
class logistic(transformation):
|
||||
domain = BOUNDED
|
||||
def __init__(self, lower, upper):
|
||||
assert lower < upper
|
||||
self.lower, self.upper = float(lower), float(upper)
|
||||
self.difference = self.upper - self.lower
|
||||
def f(self, x):
|
||||
return self.lower + self.difference / (1. + np.exp(-x))
|
||||
def finv(self, f):
|
||||
return np.log(np.clip(f - self.lower, 1e-10, np.inf) / np.clip(self.upper - f, 1e-10, np.inf))
|
||||
def gradfactor(self, f):
|
||||
return (f - self.lower) * (self.upper - f) / self.difference
|
||||
def initialize(self, f):
|
||||
if np.any(np.logical_or(f < self.lower, f > self.upper)):
|
||||
print "Warning: changing parameters to satisfy constraints"
|
||||
return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(f * 0.), f)
|
||||
def __str__(self):
|
||||
return '({},{})'.format(self.lower, self.upper)
|
||||
|
||||
27
GPy/defaults.cfg
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
# This is the default configuration file for GPy
|
||||
|
||||
# Do not edit this file.
|
||||
|
||||
# For machine specific changes (i.e. those specific to a given installation) edit GPy/installation.cfg
|
||||
|
||||
# For user specific changes edit $HOME/.gpy_user.cfg
|
||||
[parallel]
|
||||
# Enable openmp support. This speeds up some computations, depending on the number
|
||||
# of cores available. Setting up a compiler with openmp support can be difficult on
|
||||
# some platforms, hence by default it is off.
|
||||
openmp=False
|
||||
|
||||
[datasets]
|
||||
# location for the local data cache
|
||||
dir=$HOME/tmp/GPy-datasets/
|
||||
|
||||
[anaconda]
|
||||
# if you have an anaconda python installation please specify it here.
|
||||
installed = False
|
||||
location = None
|
||||
# set this to true if you have the MKL optimizations installed:
|
||||
MKL = False
|
||||
|
||||
[weave]
|
||||
# If true, try to use weave and fall back to numpy; if false, just use numpy.
|
||||
working = True
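# Example of a user override (hypothetical contents of $HOME/.gpy_user.cfg): only
# the keys that differ from the defaults above need to be listed, e.g.
#
#     [parallel]
#     openmp=True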
|
||||
|
|
@ -1,8 +1,7 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import classification
|
||||
import regression
|
||||
import dimensionality_reduction
|
||||
import tutorials
|
||||
import stochastic
|
||||
import non_gaussian
|
||||
|
|
|
|||
|
|
@ -1,11 +1,10 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
"""
|
||||
Gaussian Processes classification
|
||||
Gaussian Processes classification examples
|
||||
"""
|
||||
import pylab as pb
|
||||
import GPy
|
||||
|
||||
default_seed = 10000
|
||||
|
|
@ -15,7 +14,9 @@ def oil(num_inducing=50, max_iters=100, kernel=None, optimize=True, plot=True):
|
|||
Run a Gaussian process classification on the three phase oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
|
||||
|
||||
"""
|
||||
data = GPy.util.datasets.oil()
|
||||
try:import pods
|
||||
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
data = pods.datasets.oil()
|
||||
X = data['X']
|
||||
Xtest = data['Xtest']
|
||||
Y = data['Y'][:, 0:1]
|
||||
|
|
@ -27,13 +28,13 @@ def oil(num_inducing=50, max_iters=100, kernel=None, optimize=True, plot=True):
|
|||
m = GPy.models.SparseGPClassification(X, Y, kernel=kernel, num_inducing=num_inducing)
|
||||
|
||||
# Constrain all parameters to be positive
|
||||
m.tie_params('.*len')
|
||||
#m.tie_params('.*len')
|
||||
m['.*len'] = 10.
|
||||
m.update_likelihood_approximation()
|
||||
|
||||
# Optimize
|
||||
if optimize:
|
||||
m.optimize(max_iters=max_iters)
|
||||
for _ in range(5):
|
||||
m.optimize(max_iters=int(max_iters/5))
|
||||
print(m)
|
||||
|
||||
#Test
|
||||
|
|
@ -50,7 +51,9 @@ def toy_linear_1d_classification(seed=default_seed, optimize=True, plot=True):
|
|||
|
||||
"""
|
||||
|
||||
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
|
||||
try:import pods
|
||||
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
data = pods.datasets.toy_linear_1d_classification(seed=seed)
|
||||
Y = data['Y'][:, 0:1]
|
||||
Y[Y.flatten() == -1] = 0
|
||||
|
||||
|
|
@ -61,13 +64,14 @@ def toy_linear_1d_classification(seed=default_seed, optimize=True, plot=True):
|
|||
if optimize:
|
||||
#m.update_likelihood_approximation()
|
||||
# Parameters optimization:
|
||||
#m.optimize()
|
||||
m.optimize()
|
||||
#m.update_likelihood_approximation()
|
||||
m.pseudo_EM()
|
||||
#m.pseudo_EM()
|
||||
|
||||
# Plot
|
||||
if plot:
|
||||
fig, axes = pb.subplots(2, 1)
|
||||
from matplotlib import pyplot as plt
|
||||
fig, axes = plt.subplots(2, 1)
|
||||
m.plot_f(ax=axes[0])
|
||||
m.plot(ax=axes[1])
|
||||
|
||||
|
|
@ -83,27 +87,30 @@ def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=
|
|||
|
||||
"""
|
||||
|
||||
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
|
||||
try:import pods
|
||||
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
data = pods.datasets.toy_linear_1d_classification(seed=seed)
|
||||
Y = data['Y'][:, 0:1]
|
||||
Y[Y.flatten() == -1] = 0
|
||||
|
||||
bern_noise_model = GPy.likelihoods.bernoulli()
|
||||
laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), bern_noise_model)
|
||||
likelihood = GPy.likelihoods.Bernoulli()
|
||||
laplace_inf = GPy.inference.latent_function_inference.Laplace()
|
||||
kernel = GPy.kern.RBF(1)
|
||||
|
||||
# Model definition
|
||||
m = GPy.models.GPClassification(data['X'], Y, likelihood=laplace_likelihood)
|
||||
print m
|
||||
m = GPy.core.GP(data['X'], Y, kernel=kernel, likelihood=likelihood, inference_method=laplace_inf)
|
||||
|
||||
# Optimize
|
||||
if optimize:
|
||||
#m.update_likelihood_approximation()
|
||||
# Parameters optimization:
|
||||
m.optimize('bfgs', messages=1)
|
||||
#m.pseudo_EM()
|
||||
try:
|
||||
m.optimize('scg', messages=1)
|
||||
except Exception as e:
|
||||
return m
|
||||
|
||||
# Plot
|
||||
if plot:
|
||||
fig, axes = pb.subplots(2, 1)
|
||||
from matplotlib import pyplot as plt
|
||||
fig, axes = plt.subplots(2, 1)
|
||||
m.plot_f(ax=axes[0])
|
||||
m.plot(ax=axes[1])
|
||||
|
||||
|
|
@ -119,7 +126,9 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti
|
|||
|
||||
"""
|
||||
|
||||
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
|
||||
try:import pods
|
||||
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
data = pods.datasets.toy_linear_1d_classification(seed=seed)
|
||||
Y = data['Y'][:, 0:1]
|
||||
Y[Y.flatten() == -1] = 0
|
||||
|
||||
|
|
@ -129,21 +138,19 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti
|
|||
|
||||
# Optimize
|
||||
if optimize:
|
||||
#m.update_likelihood_approximation()
|
||||
# Parameters optimization:
|
||||
#m.optimize()
|
||||
m.pseudo_EM()
|
||||
m.optimize()
|
||||
|
||||
# Plot
|
||||
if plot:
|
||||
fig, axes = pb.subplots(2, 1)
|
||||
from matplotlib import pyplot as plt
|
||||
fig, axes = plt.subplots(2, 1)
|
||||
m.plot_f(ax=axes[0])
|
||||
m.plot(ax=axes[1])
|
||||
|
||||
print m
|
||||
return m
|
||||
|
||||
def toy_heaviside(seed=default_seed, optimize=True, plot=True):
|
||||
def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True):
|
||||
"""
|
||||
Simple 1D classification example using a Heaviside GP transformation
|
||||
|
||||
|
|
@ -152,25 +159,30 @@ def toy_heaviside(seed=default_seed, optimize=True, plot=True):
|
|||
|
||||
"""
|
||||
|
||||
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
|
||||
try:import pods
|
||||
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
data = pods.datasets.toy_linear_1d_classification(seed=seed)
|
||||
Y = data['Y'][:, 0:1]
|
||||
Y[Y.flatten() == -1] = 0
|
||||
|
||||
# Model definition
|
||||
noise_model = GPy.likelihoods.bernoulli(GPy.likelihoods.noise_models.gp_transformations.Heaviside())
|
||||
likelihood = GPy.likelihoods.EP(Y, noise_model)
|
||||
m = GPy.models.GPClassification(data['X'], likelihood=likelihood)
|
||||
kernel = GPy.kern.RBF(1)
|
||||
likelihood = GPy.likelihoods.Bernoulli(gp_link=GPy.likelihoods.link_functions.Heaviside())
|
||||
ep = GPy.inference.latent_function_inference.expectation_propagation.EP()
|
||||
m = GPy.core.GP(X=data['X'], Y=Y, kernel=kernel, likelihood=likelihood, inference_method=ep, name='gp_classification_heaviside')
|
||||
#m = GPy.models.GPClassification(data['X'], likelihood=likelihood)
|
||||
|
||||
# Optimize
|
||||
if optimize:
|
||||
m.update_likelihood_approximation()
|
||||
# Parameters optimization:
|
||||
m.optimize()
|
||||
#m.pseudo_EM()
|
||||
for _ in range(5):
|
||||
m.optimize(max_iters=int(max_iters/5))
|
||||
print m
|
||||
|
||||
# Plot
|
||||
if plot:
|
||||
fig, axes = pb.subplots(2, 1)
|
||||
from matplotlib import pyplot as plt
|
||||
fig, axes = plt.subplots(2, 1)
|
||||
m.plot_f(ax=axes[0])
|
||||
m.plot(ax=axes[1])
|
||||
|
||||
|
|
@ -189,7 +201,9 @@ def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=
|
|||
:param kernel: kernel to use in the model
|
||||
:type kernel: a GPy kernel
|
||||
"""
|
||||
data = GPy.util.datasets.crescent_data(seed=seed)
|
||||
try:import pods
|
||||
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
data = pods.datasets.crescent_data(seed=seed)
|
||||
Y = data['Y']
|
||||
Y[Y.flatten()==-1] = 0
|
||||
|
||||
|
|
|
|||
89
GPy/examples/coreg_example.py
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
try:
|
||||
import pylab as pb
|
||||
except:
|
||||
pass
|
||||
import GPy
|
||||
pb.ion()
|
||||
pb.close('all')
|
||||
|
||||
X1 = np.arange(3)[:,None]
|
||||
X2 = np.arange(4)[:,None]
|
||||
I1 = np.zeros_like(X1)
|
||||
I2 = np.ones_like(X2)
|
||||
|
||||
_X = np.vstack([ X1, X2 ])
|
||||
_I = np.vstack([ I1, I2 ])
|
||||
|
||||
X = np.hstack([ _X, _I ])
|
||||
|
||||
Y1 = np.sin(X1/8.)
|
||||
Y2 = np.cos(X2/8.)
|
||||
|
||||
Bias = GPy.kern.Bias(1,active_dims=[0])
|
||||
Coreg = GPy.kern.Coregionalize(1,2,active_dims=[1])
|
||||
K = Bias.prod(Coreg,name='X')
|
||||
|
||||
#K.coregion.W = 0
|
||||
#print K.coregion.W
|
||||
#print Bias.K(_X,_X)
|
||||
#print K.K(X,X)
|
||||
#pb.matshow(K.K(X,X))
|
||||
|
||||
Mlist = [GPy.kern.Matern32(1,lengthscale=20.,name="Mat")]
|
||||
kern = GPy.util.multioutput.LCM(input_dim=1,num_outputs=2,kernels_list=Mlist,name='H')
|
||||
kern.B.W = 0
|
||||
kern.B.kappa = 1.
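# With W = 0 and kappa = 1 the coregionalization matrix B = W W^T + diag(kappa) is
# the identity, i.e. the two outputs are modelled as independent draws sharing the
# same Matern32 kernel.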
|
||||
#kern.B.W.fix()
|
||||
#kern.B.kappa.fix()
|
||||
#m = GPy.models.GPCoregionalizedRegression(X_list=[X1,X2], Y_list=[Y1,Y2], kernel=kern)
|
||||
|
||||
|
||||
Z1 = np.array([1.5,2.5])[:,None]
|
||||
|
||||
m = GPy.models.SparseGPCoregionalizedRegression(X_list=[X1], Y_list=[Y1], Z_list = [Z1], kernel=kern)
|
||||
#m.optimize()
|
||||
m.checkgrad(verbose=1)
|
||||
|
||||
"""
|
||||
fig = pb.figure()
|
||||
ax0 = fig.add_subplot(211)
|
||||
ax1 = fig.add_subplot(212)
|
||||
slices = GPy.util.multioutput.get_slices([Y1,Y2])
|
||||
m.plot(fixed_inputs=[(1,0)],which_data_rows=slices[0],ax=ax0)
|
||||
#m.plot(fixed_inputs=[(1,1)],which_data_rows=slices[1],ax=ax1)
|
||||
"""
|
||||
|
||||
|
||||
|
||||
"""
|
||||
|
||||
X1 = 100 * np.random.rand(100)[:,None]
|
||||
X2 = 100 * np.random.rand(100)[:,None]
|
||||
#X1.sort()
|
||||
#X2.sort()
|
||||
|
||||
Y1 = np.sin(X1/10.) + np.random.rand(100)[:,None]
|
||||
Y2 = np.cos(X2/10.) + np.random.rand(100)[:,None]
|
||||
|
||||
|
||||
|
||||
|
||||
Mlist = [GPy.kern.Matern32(1,lengthscale=20.,name="Mat")]
|
||||
kern = GPy.util.multioutput.LCM(input_dim=1,num_outputs=12,kernels_list=Mlist,name='H')
|
||||
|
||||
|
||||
m = GPy.models.GPCoregionalizedRegression(X_list=[X1,X2], Y_list=[Y1,Y2], kernel=kern)
|
||||
m.optimize()
|
||||
|
||||
fig = pb.figure()
|
||||
ax0 = fig.add_subplot(211)
|
||||
ax1 = fig.add_subplot(212)
|
||||
slices = GPy.util.multioutput.get_slices([Y1,Y2])
|
||||
m.plot(fixed_inputs=[(1,0)],which_data_rows=slices[0],ax=ax0)
|
||||
m.plot(fixed_inputs=[(1,1)],which_data_rows=slices[1],ax=ax1)
|
||||
|
||||
"""
|
||||
|
|
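coreg_example.py above only runs a gradient check on the sparse coregionalized model. As a usage note, here is a small sketch (toy data sizes chosen here for illustration) that builds the same LCM kernel and actually fits the full coregionalized regression model, using only the calls that appear in the file.

import numpy as np
import GPy

# Hypothetical toy outputs; the LCM / coregionalized calls are the ones used above.
X1 = np.random.rand(50, 1) * 8
X2 = np.random.rand(30, 1) * 5
Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
Y2 = np.cos(X2) + np.random.randn(*X2.shape) * 0.05

Mlist = [GPy.kern.Matern32(1, lengthscale=20., name="Mat")]
kern = GPy.util.multioutput.LCM(input_dim=1, num_outputs=2, kernels_list=Mlist, name='H')
m = GPy.models.GPCoregionalizedRegression(X_list=[X1, X2], Y_list=[Y1, Y2], kernel=kern)
m.optimize('bfgs', max_iters=100)
print(m)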
@@ -1,75 +1,80 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as _np
default_seed = 123344

def bgplvm_test_model(seed=default_seed, optimize=False, verbose=1, plot=False):
# default_seed = _np.random.seed(123344)

def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan=False):
    """
    model for testing purposes. Samples from a GP with rbf kernel and learns
    the samples with a new kernel. Normally not for optimization, just model cheking
    """
    from GPy.likelihoods.gaussian import Gaussian
    import GPy

    num_inputs = 13
    num_inducing = 5
    if plot:
        output_dim = 1
        input_dim = 2
        input_dim = 3
    else:
        input_dim = 2
        output_dim = 25
        output_dim = output_dim

    # generate GPLVM-like data
    X = _np.random.rand(num_inputs, input_dim)
    lengthscales = _np.random.rand(input_dim)
    k = (GPy.kern.rbf(input_dim, .5, lengthscales, ARD=True)
         + GPy.kern.white(input_dim, 0.01))
    k = GPy.kern.RBF(input_dim, .5, lengthscales, ARD=True)
    K = k.K(X)
    Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, output_dim).T
    lik = Gaussian(Y, normalize=True)
    Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, (output_dim,)).T

    k = GPy.kern.rbf_inv(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim)
    # k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
    # k = GPy.kern.rbf(input_dim, ARD = False) + GPy.kern.white(input_dim, 0.00001)
    # k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.rbf(input_dim, .3, _np.ones(input_dim) * .2, ARD=True)
    # k = GPy.kern.rbf(input_dim, .5, 2., ARD=0) + GPy.kern.rbf(input_dim, .3, .2, ARD=0)
    # k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.linear(input_dim, _np.ones(input_dim) * .2, ARD=True)
    # k = GPy.kern.RBF_inv(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim)
    # k = GPy.kern.linear(input_dim)# + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
    # k = GPy.kern.RBF(input_dim, ARD = False) + GPy.kern.white(input_dim, 0.00001)
    # k = GPy.kern.RBF(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.RBF(input_dim, .3, _np.ones(input_dim) * .2, ARD=True)
    # k = GPy.kern.RBF(input_dim, .5, 2., ARD=0) + GPy.kern.RBF(input_dim, .3, .2, ARD=0)
    # k = GPy.kern.RBF(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.linear(input_dim, _np.ones(input_dim) * .2, ARD=True)

    p = .3

    m = GPy.models.BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)

    if nan:
        m.inference_method = GPy.inference.latent_function_inference.var_dtc.VarDTCMissingData()
        m.Y[_np.random.binomial(1, p, size=(Y.shape)).astype(bool)] = _np.nan
        m.parameters_changed()

    m = GPy.models.BayesianGPLVM(lik, input_dim, kernel=k, num_inducing=num_inducing)
    #===========================================================================
    # randomly obstruct data with percentage p
    p = .8
    Y_obstruct = Y.copy()
    Y_obstruct[_np.random.uniform(size=(Y.shape)) < p] = _np.nan
    #===========================================================================
    m2 = GPy.models.BayesianGPLVMWithMissingData(Y_obstruct, input_dim, kernel=k, num_inducing=num_inducing)
    m.lengthscales = lengthscales
    # m2 = GPy.models.BayesianGPLVMWithMissingData(Y_obstruct, input_dim, kernel=k, num_inducing=num_inducing)
    # m.lengthscales = lengthscales

    if plot:
        import matplotlib.pyplot as pb
        m.plot()
        pb.title('PCA initialisation')
        m2.plot()
        pb.title('PCA initialisation')
        # m2.plot()
        # pb.title('PCA initialisation')

    if optimize:
        m.optimize('scg', messages=verbose)
        m2.optimize('scg', messages=verbose)
        # m2.optimize('scg', messages=verbose)
    if plot:
        m.plot()
        pb.title('After optimisation')
        m2.plot()
        pb.title('After optimisation')
        # m2.plot()
        # pb.title('After optimisation')

    return m, m2
    return m

def gplvm_oil_100(optimize=True, verbose=1, plot=True):
    import GPy
    data = GPy.util.datasets.oil_100()
    import pods
    data = pods.datasets.oil_100()
    Y = data['X']
    # create simple GP model
    kernel = GPy.kern.rbf(6, ARD=True) + GPy.kern.bias(6)
    kernel = GPy.kern.RBF(6, ARD=True) + GPy.kern.Bias(6)
    m = GPy.models.GPLVM(Y, 6, kernel=kernel)
    m.data_labels = data['Y'].argmax(axis=1)
    if optimize: m.optimize('scg', messages=verbose)
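bgplvm_test_model above samples from a known ARD RBF kernel and then re-learns the latent space with a fresh kernel. A condensed sketch of that generate-then-fit loop, using only calls that appear in the hunk; the sizes are illustrative rather than the ones hard-coded in the example.

import numpy as np
import GPy

num_inputs, input_dim, output_dim, num_inducing = 13, 3, 200, 5

# Generate GPLVM-like data from a known ARD RBF kernel...
X = np.random.rand(num_inputs, input_dim)
lengthscales = np.random.rand(input_dim)
k_true = GPy.kern.RBF(input_dim, .5, lengthscales, ARD=True)
K = k_true.K(X)
Y = np.random.multivariate_normal(np.zeros(num_inputs), K, (output_dim,)).T

# ...and fit a Bayesian GPLVM with a fresh kernel to recover the latent space.
k_fit = GPy.kern.RBF(input_dim, ARD=True)
m = GPy.models.BayesianGPLVM(Y, input_dim, kernel=k_fit, num_inducing=num_inducing)
m.optimize('scg', messages=0)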
@@ -78,13 +83,15 @@ def gplvm_oil_100(optimize=True, verbose=1, plot=True):

def sparse_gplvm_oil(optimize=True, verbose=0, plot=True, N=100, Q=6, num_inducing=15, max_iters=50):
    import GPy
    import pods

    _np.random.seed(0)
    data = GPy.util.datasets.oil()
    data = pods.datasets.oil()
    Y = data['X'][:N]
    Y = Y - Y.mean(0)
    Y /= Y.std(0)
    # Create the model
    kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q)
    kernel = GPy.kern.RBF(Q, ARD=True) + GPy.kern.Bias(Q)
    m = GPy.models.SparseGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing)
    m.data_labels = data['Y'][:N].argmax(axis=1)

@@ -94,9 +101,9 @@ def sparse_gplvm_oil(optimize=True, verbose=0, plot=True, N=100, Q=6, num_induci
        m.kern.plot_ARD()
    return m

def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=15, Q=4, sigma=.2):
def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=25, Q=4, sigma=.2):
    import GPy
    from GPy.util.datasets import swiss_roll_generated
    from pods.datasets import swiss_roll_generated
    from GPy.models import BayesianGPLVM

    data = swiss_roll_generated(num_samples=N, sigma=sigma)
@ -134,93 +141,103 @@ def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=15, Q=4
|
|||
(1 - var))) + .001
|
||||
Z = _np.random.permutation(X)[:num_inducing]
|
||||
|
||||
kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2))
|
||||
kernel = GPy.kern.RBF(Q, ARD=True) + GPy.kern.Bias(Q, _np.exp(-2)) + GPy.kern.White(Q, _np.exp(-2))
|
||||
|
||||
m = BayesianGPLVM(Y, Q, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel)
|
||||
m.data_colors = c
|
||||
m.data_t = t
|
||||
m['noise_variance'] = Y.var() / 100.
|
||||
|
||||
if optimize:
|
||||
m.optimize('scg', messages=verbose, max_iters=2e3)
|
||||
m.optimize('bfgs', messages=verbose, max_iters=2e3)
|
||||
|
||||
if plot:
|
||||
fig = plt.figure('fitted')
|
||||
ax = fig.add_subplot(111)
|
||||
s = m.input_sensitivity().argsort()[::-1][:2]
|
||||
ax.scatter(*m.X.T[s], c=c)
|
||||
ax.scatter(*m.X.mean.T[s], c=c)
|
||||
|
||||
return m
|
||||
|
||||
def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, max_iters=1000, **k):
|
||||
import GPy
|
||||
from GPy.likelihoods import Gaussian
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
import numpy as np
|
||||
_np.random.seed(0)
|
||||
try:
|
||||
import pods
|
||||
data = pods.datasets.oil()
|
||||
except ImportError:
|
||||
data = GPy.util.datasets.oil()
|
||||
|
||||
kernel = GPy.kern.rbf_inv(Q, 1., [.1] * Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2))
|
||||
|
||||
kernel = GPy.kern.RBF(Q, 1., 1. / _np.random.uniform(0, 1, (Q,)), ARD=True) # + GPy.kern.Bias(Q, _np.exp(-2))
|
||||
Y = data['X'][:N]
|
||||
Yn = Gaussian(Y, normalize=True)
|
||||
m = GPy.models.BayesianGPLVM(Yn, Q, kernel=kernel, num_inducing=num_inducing, **k)
|
||||
m = GPy.models.BayesianGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing, **k)
|
||||
m.data_labels = data['Y'][:N].argmax(axis=1)
|
||||
m['noise'] = Yn.Y.var() / 100.
|
||||
|
||||
if optimize:
|
||||
m.optimize('scg', messages=verbose, max_iters=max_iters, gtol=.05)
|
||||
m.optimize('bfgs', messages=verbose, max_iters=max_iters, gtol=.05)
|
||||
|
||||
if plot:
|
||||
y = m.likelihood.Y[0, :]
|
||||
fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
|
||||
m.plot_latent(ax=latent_axes)
|
||||
data_show = GPy.util.visualize.vector_show(y)
|
||||
lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :], # @UnusedVariable
|
||||
m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
|
||||
m.plot_latent(ax=latent_axes, labels=m.data_labels)
|
||||
data_show = GPy.plotting.matplot_dep.visualize.vector_show((m.Y[0, :]))
|
||||
lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean.values[0:1, :], # @UnusedVariable
|
||||
m, data_show, latent_axes=latent_axes, sense_axes=sense_axes, labels=m.data_labels)
|
||||
raw_input('Press enter to finish')
|
||||
plt.close(fig)
|
||||
return m
|
||||
|
||||
def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
|
||||
x = _np.linspace(0, 4 * _np.pi, N)[:, None]
|
||||
s1 = _np.vectorize(lambda x: _np.sin(x))
|
||||
s2 = _np.vectorize(lambda x: _np.cos(x))
|
||||
s3 = _np.vectorize(lambda x:-_np.exp(-_np.cos(2 * x)))
|
||||
sS = _np.vectorize(lambda x: _np.sin(2 * x))
|
||||
def ssgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, max_iters=1000, **k):
|
||||
import GPy
|
||||
from matplotlib import pyplot as plt
|
||||
import pods
|
||||
|
||||
s1 = s1(x)
|
||||
s2 = s2(x)
|
||||
s3 = s3(x)
|
||||
sS = sS(x)
|
||||
_np.random.seed(0)
|
||||
data = pods.datasets.oil()
|
||||
|
||||
S1 = _np.hstack([s1, sS])
|
||||
S2 = _np.hstack([s2, s3, sS])
|
||||
S3 = _np.hstack([s3, sS])
|
||||
kernel = GPy.kern.RBF(Q, 1., 1. / _np.random.uniform(0, 1, (Q,)), ARD=True) # + GPy.kern.Bias(Q, _np.exp(-2))
|
||||
Y = data['X'][:N]
|
||||
m = GPy.models.SSGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing, **k)
|
||||
m.data_labels = data['Y'][:N].argmax(axis=1)
|
||||
|
||||
Y1 = S1.dot(_np.random.randn(S1.shape[1], D1))
|
||||
Y2 = S2.dot(_np.random.randn(S2.shape[1], D2))
|
||||
Y3 = S3.dot(_np.random.randn(S3.shape[1], D3))
|
||||
if optimize:
|
||||
m.optimize('bfgs', messages=verbose, max_iters=max_iters, gtol=.05)
|
||||
|
||||
Y1 += .3 * _np.random.randn(*Y1.shape)
|
||||
Y2 += .2 * _np.random.randn(*Y2.shape)
|
||||
Y3 += .25 * _np.random.randn(*Y3.shape)
|
||||
if plot:
|
||||
fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
|
||||
m.plot_latent(ax=latent_axes, labels=m.data_labels)
|
||||
data_show = GPy.plotting.matplot_dep.visualize.vector_show((m.Y[0, :]))
|
||||
lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean.values[0:1, :], # @UnusedVariable
|
||||
m, data_show, latent_axes=latent_axes, sense_axes=sense_axes, labels=m.data_labels)
|
||||
raw_input('Press enter to finish')
|
||||
plt.close(fig)
|
||||
return m
|
||||
|
||||
Y1 -= Y1.mean(0)
|
||||
Y2 -= Y2.mean(0)
|
||||
Y3 -= Y3.mean(0)
|
||||
Y1 /= Y1.std(0)
|
||||
Y2 /= Y2.std(0)
|
||||
Y3 /= Y3.std(0)
|
||||
def _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim=False):
|
||||
Q_signal = 4
|
||||
import GPy
|
||||
import numpy as np
|
||||
np.random.seed(3000)
|
||||
|
||||
k = GPy.kern.Matern32(Q_signal, 1., lengthscale=(np.random.uniform(1, 6, Q_signal)), ARD=1)
|
||||
for i in range(Q_signal):
|
||||
k += GPy.kern.PeriodicExponential(1, variance=1., active_dims=[i], period=3., lower=-2, upper=6)
|
||||
t = np.c_[[np.linspace(-1, 5, N) for _ in range(Q_signal)]].T
|
||||
K = k.K(t)
|
||||
s2, s1, s3, sS = np.random.multivariate_normal(np.zeros(K.shape[0]), K, size=(4))[:, :, None]
|
||||
|
||||
Y1, Y2, Y3, S1, S2, S3 = _generate_high_dimensional_output(D1, D2, D3, s1, s2, s3, sS)
|
||||
|
||||
slist = [sS, s1, s2, s3]
|
||||
slist_names = ["sS", "s1", "s2", "s3"]
|
||||
Ylist = [Y1, Y2, Y3]
|
||||
|
||||
if plot_sim:
|
||||
import pylab
|
||||
from matplotlib import pyplot as plt
|
||||
import matplotlib.cm as cm
|
||||
import itertools
|
||||
fig = pylab.figure("MRD Simulation Data", figsize=(8, 6))
|
||||
fig = plt.figure("MRD Simulation Data", figsize=(8, 6))
|
||||
fig.clf()
|
||||
ax = fig.add_subplot(2, 1, 1)
|
||||
labls = slist_names
|
||||
|
|
@ -231,28 +248,73 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
|
|||
ax = fig.add_subplot(2, len(Ylist), len(Ylist) + 1 + i)
|
||||
ax.imshow(Y, aspect='auto', cmap=cm.gray) # @UndefinedVariable
|
||||
ax.set_title("Y{}".format(i + 1))
|
||||
pylab.draw()
|
||||
pylab.tight_layout()
|
||||
plt.draw()
|
||||
plt.tight_layout()
|
||||
|
||||
return slist, [S1, S2, S3], Ylist
|
||||
|
||||
# def bgplvm_simulation_matlab_compare():
|
||||
# from GPy.util.datasets import simulation_BGPLVM
|
||||
# from GPy import kern
|
||||
# from GPy.models import BayesianGPLVM
|
||||
#
|
||||
# sim_data = simulation_BGPLVM()
|
||||
# Y = sim_data['Y']
|
||||
# mu = sim_data['mu']
|
||||
# num_inducing, [_, Q] = 3, mu.shape
|
||||
#
|
||||
# k = kern.linear(Q, ARD=True) + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2))
|
||||
# m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k,
|
||||
# _debug=False)
|
||||
# m.auto_scale_factor = True
|
||||
# m['noise'] = Y.var() / 100.
|
||||
# m['linear_variance'] = .01
|
||||
# return m
|
||||
def _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim=False):
|
||||
_np.random.seed(1234)
|
||||
|
||||
x = _np.linspace(0, 4 * _np.pi, N)[:, None]
|
||||
s1 = _np.vectorize(lambda x: _np.sin(x))
|
||||
s2 = _np.vectorize(lambda x: _np.cos(x) ** 2)
|
||||
s3 = _np.vectorize(lambda x:-_np.exp(-_np.cos(2 * x)))
|
||||
sS = _np.vectorize(lambda x: _np.cos(x))
|
||||
|
||||
s1 = s1(x)
|
||||
s2 = s2(x)
|
||||
s3 = s3(x)
|
||||
sS = sS(x)
|
||||
|
||||
s1 -= s1.mean(); s1 /= s1.std(0)
|
||||
s2 -= s2.mean(); s2 /= s2.std(0)
|
||||
s3 -= s3.mean(); s3 /= s3.std(0)
|
||||
sS -= sS.mean(); sS /= sS.std(0)
|
||||
|
||||
Y1, Y2, Y3, S1, S2, S3 = _generate_high_dimensional_output(D1, D2, D3, s1, s2, s3, sS)
|
||||
|
||||
slist = [sS, s1, s2, s3]
|
||||
slist_names = ["sS", "s1", "s2", "s3"]
|
||||
Ylist = [Y1, Y2, Y3]
|
||||
|
||||
if plot_sim:
|
||||
from matplotlib import pyplot as plt
|
||||
import matplotlib.cm as cm
|
||||
import itertools
|
||||
fig = plt.figure("MRD Simulation Data", figsize=(8, 6))
|
||||
fig.clf()
|
||||
ax = fig.add_subplot(2, 1, 1)
|
||||
labls = slist_names
|
||||
for S, lab in itertools.izip(slist, labls):
|
||||
ax.plot(S, label=lab)
|
||||
ax.legend()
|
||||
for i, Y in enumerate(Ylist):
|
||||
ax = fig.add_subplot(2, len(Ylist), len(Ylist) + 1 + i)
|
||||
ax.imshow(Y, aspect='auto', cmap=cm.gray) # @UndefinedVariable
|
||||
ax.set_title("Y{}".format(i + 1))
|
||||
plt.draw()
|
||||
plt.tight_layout()
|
||||
|
||||
return slist, [S1, S2, S3], Ylist
|
||||
|
||||
def _generate_high_dimensional_output(D1, D2, D3, s1, s2, s3, sS):
|
||||
S1 = _np.hstack([s1, sS])
|
||||
S2 = _np.hstack([s2, s3, sS])
|
||||
S3 = _np.hstack([s3, sS])
|
||||
Y1 = S1.dot(_np.random.randn(S1.shape[1], D1))
|
||||
Y2 = S2.dot(_np.random.randn(S2.shape[1], D2))
|
||||
Y3 = S3.dot(_np.random.randn(S3.shape[1], D3))
|
||||
Y1 += .3 * _np.random.randn(*Y1.shape)
|
||||
Y2 += .2 * _np.random.randn(*Y2.shape)
|
||||
Y3 += .25 * _np.random.randn(*Y3.shape)
|
||||
Y1 -= Y1.mean(0)
|
||||
Y2 -= Y2.mean(0)
|
||||
Y3 -= Y3.mean(0)
|
||||
Y1 /= Y1.std(0)
|
||||
Y2 /= Y2.std(0)
|
||||
Y3 /= Y3.std(0)
|
||||
return Y1, Y2, Y3, S1, S2, S3
|
||||
|
||||
def bgplvm_simulation(optimize=True, verbose=1,
|
||||
plot=True, plot_sim=False,
|
||||
|
|
@ -261,95 +323,181 @@ def bgplvm_simulation(optimize=True, verbose=1,
|
|||
from GPy import kern
|
||||
from GPy.models import BayesianGPLVM
|
||||
|
||||
D1, D2, D3, N, num_inducing, Q = 49, 30, 10, 12, 3, 10
|
||||
_, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
|
||||
D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 45, 3, 9
|
||||
_, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
|
||||
Y = Ylist[0]
|
||||
k = kern.linear(Q, ARD=True)
|
||||
k = kern.Linear(Q, ARD=True) # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
|
||||
# k = kern.RBF(Q, ARD=True, lengthscale=10.)
|
||||
m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k)
|
||||
m.X_variance = m.X_variance * .7
|
||||
m['noise'] = Y.var() / 100.
|
||||
m.X.variance[:] = _np.random.uniform(0, .01, m.X.shape)
|
||||
m.likelihood.variance = .1
|
||||
|
||||
if optimize:
|
||||
print "Optimizing model:"
|
||||
m.optimize('bfgs', messages=verbose, max_iters=max_iters,
|
||||
gtol=.05)
|
||||
if plot:
|
||||
m.X.plot("BGPLVM Latent Space 1D")
|
||||
m.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
|
||||
return m
|
||||
|
||||
def ssgplvm_simulation(optimize=True, verbose=1,
|
||||
plot=True, plot_sim=False,
|
||||
max_iters=2e4, useGPU=False
|
||||
):
|
||||
from GPy import kern
|
||||
from GPy.models import SSGPLVM
|
||||
|
||||
D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 45, 3, 9
|
||||
_, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
|
||||
Y = Ylist[0]
|
||||
k = kern.Linear(Q, ARD=True) # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
|
||||
# k = kern.RBF(Q, ARD=True, lengthscale=10.)
|
||||
m = SSGPLVM(Y, Q, init="pca", num_inducing=num_inducing, kernel=k)
|
||||
m.X.variance[:] = _np.random.uniform(0, .01, m.X.shape)
|
||||
m.likelihood.variance = .1
|
||||
|
||||
if optimize:
|
||||
print "Optimizing model:"
|
||||
m.optimize('scg', messages=verbose, max_iters=max_iters,
|
||||
gtol=.05)
|
||||
if plot:
|
||||
m.plot_X_1d("BGPLVM Latent Space 1D")
|
||||
m.X.plot("SSGPLVM Latent Space 1D")
|
||||
m.kern.plot_ARD('SSGPLVM Simulation ARD Parameters')
|
||||
return m
|
||||
|
||||
def bgplvm_simulation_missing_data(optimize=True, verbose=1,
|
||||
plot=True, plot_sim=False,
|
||||
max_iters=2e4, percent_missing=.1,
|
||||
):
|
||||
from GPy import kern
|
||||
from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch
|
||||
|
||||
D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 400, 3, 4
|
||||
_, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
|
||||
Y = Ylist[0]
|
||||
k = kern.Linear(Q, ARD=True) # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
|
||||
|
||||
inan = _np.random.binomial(1, percent_missing, size=Y.shape).astype(bool) # 80% missing data
|
||||
Ymissing = Y.copy()
|
||||
Ymissing[inan] = _np.nan
|
||||
|
||||
m = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
|
||||
kernel=k, missing_data=True)
|
||||
|
||||
m.Yreal = Y
|
||||
|
||||
if optimize:
|
||||
print "Optimizing model:"
|
||||
m.optimize('bfgs', messages=verbose, max_iters=max_iters,
|
||||
gtol=.05)
|
||||
if plot:
|
||||
m.X.plot("BGPLVM Latent Space 1D")
|
||||
m.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
|
||||
return m
|
||||
|
||||
|
||||
def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
|
||||
from GPy import kern
|
||||
from GPy.models import MRD
|
||||
from GPy.likelihoods import Gaussian
|
||||
|
||||
D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
|
||||
_, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
|
||||
likelihood_list = [Gaussian(x, normalize=True) for x in Ylist]
|
||||
_, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
|
||||
|
||||
k = kern.linear(Q, ARD=True)# + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2))
|
||||
m = MRD(likelihood_list, input_dim=Q, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw)
|
||||
m.ensure_default_constraints()
|
||||
# Ylist = [Ylist[0]]
|
||||
k = kern.Linear(Q, ARD=True)
|
||||
m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing, kernel=k, initx="PCA_concat", initz='permute', **kw)
|
||||
|
||||
m['.*noise'] = [Y.var() / 40. for Y in Ylist]
|
||||
|
||||
for i, bgplvm in enumerate(m.bgplvms):
|
||||
m['{}_noise'.format(i)] = 1 #bgplvm.likelihood.Y.var() / 500.
|
||||
bgplvm.X_variance = bgplvm.X_variance #* .1
|
||||
if optimize:
|
||||
print "Optimizing Model:"
|
||||
m.optimize(messages=verbose, max_iters=8e3, gtol=.1)
|
||||
m.optimize(messages=verbose, max_iters=8e3)
|
||||
if plot:
|
||||
m.plot_X_1d("MRD Latent Space 1D")
|
||||
m.X.plot("MRD Latent Space 1D")
|
||||
m.plot_scales("MRD Scales")
|
||||
return m
|
||||
|
||||
def mrd_simulation_missing_data(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
|
||||
from GPy import kern
|
||||
from GPy.models import MRD
|
||||
|
||||
D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
|
||||
_, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
|
||||
|
||||
# Ylist = [Ylist[0]]
|
||||
k = kern.Linear(Q, ARD=True)
|
||||
inanlist = []
|
||||
|
||||
for Y in Ylist:
|
||||
inan = _np.random.binomial(1, .6, size=Y.shape).astype(bool)
|
||||
inanlist.append(inan)
|
||||
Y[inan] = _np.nan
|
||||
|
||||
m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing,
|
||||
kernel=k, inference_method=None,
|
||||
initx="random", initz='permute', **kw)
|
||||
|
||||
if optimize:
|
||||
print "Optimizing Model:"
|
||||
m.optimize('bfgs', messages=verbose, max_iters=8e3, gtol=.1)
|
||||
if plot:
|
||||
m.X.plot("MRD Latent Space 1D")
|
||||
m.plot_scales("MRD Scales")
|
||||
return m
|
||||
|
||||
def brendan_faces(optimize=True, verbose=True, plot=True):
|
||||
import GPy
|
||||
import pods
|
||||
|
||||
data = GPy.util.datasets.brendan_faces()
|
||||
data = pods.datasets.brendan_faces()
|
||||
Q = 2
|
||||
Y = data['Y']
|
||||
Yn = Y - Y.mean()
|
||||
Yn /= Yn.std()
|
||||
|
||||
m = GPy.models.GPLVM(Yn, Q)
|
||||
m = GPy.models.BayesianGPLVM(Yn, Q, num_inducing=20)
|
||||
|
||||
# optimize
|
||||
m.constrain('rbf|noise|white', GPy.core.transformations.logexp_clipped())
|
||||
|
||||
if optimize: m.optimize('scg', messages=verbose, max_iters=1000)
|
||||
if optimize: m.optimize('bfgs', messages=verbose, max_iters=1000)
|
||||
|
||||
if plot:
|
||||
ax = m.plot_latent(which_indices=(0, 1))
|
||||
y = m.likelihood.Y[0, :]
|
||||
data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, order='F', invert=False, scale=False)
|
||||
GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||
y = m.Y[0, :]
|
||||
data_show = GPy.plotting.matplot_dep.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, order='F', invert=False, scale=False)
|
||||
lvm = GPy.plotting.matplot_dep.visualize.lvm(m.X.mean[0, :].copy(), m, data_show, ax)
|
||||
raw_input('Press enter to finish')
|
||||
|
||||
return m
|
||||
|
||||
def olivetti_faces(optimize=True, verbose=True, plot=True):
|
||||
import GPy
|
||||
import pods
|
||||
|
||||
data = GPy.util.datasets.olivetti_faces()
|
||||
data = pods.datasets.olivetti_faces()
|
||||
Q = 2
|
||||
Y = data['Y']
|
||||
Yn = Y - Y.mean()
|
||||
Yn /= Yn.std()
|
||||
|
||||
m = GPy.models.GPLVM(Yn, Q)
|
||||
if optimize: m.optimize('scg', messages=verbose, max_iters=1000)
|
||||
m = GPy.models.BayesianGPLVM(Yn, Q, num_inducing=20)
|
||||
|
||||
if optimize: m.optimize('bfgs', messages=verbose, max_iters=1000)
|
||||
if plot:
|
||||
ax = m.plot_latent(which_indices=(0, 1))
|
||||
y = m.likelihood.Y[0, :]
|
||||
data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False)
|
||||
GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||
y = m.Y[0, :]
|
||||
data_show = GPy.plotting.matplot_dep.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False)
|
||||
lvm = GPy.plotting.matplot_dep.visualize.lvm(m.X.mean[0, :].copy(), m, data_show, ax)
|
||||
raw_input('Press enter to finish')
|
||||
|
||||
return m
|
||||
|
||||
def stick_play(range=None, frame_rate=15, optimize=False, verbose=True, plot=True):
|
||||
import GPy
|
||||
data = GPy.util.datasets.osu_run1()
|
||||
import pods
|
||||
|
||||
data = pods.datasets.osu_run1()
|
||||
# optimize
|
||||
if range == None:
|
||||
Y = data['Y'].copy()
|
||||
|
|
@ -357,43 +505,46 @@ def stick_play(range=None, frame_rate=15, optimize=False, verbose=True, plot=Tru
|
|||
Y = data['Y'][range[0]:range[1], :].copy()
|
||||
if plot:
|
||||
y = Y[0, :]
|
||||
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
|
||||
GPy.util.visualize.data_play(Y, data_show, frame_rate)
|
||||
data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
|
||||
GPy.plotting.matplot_dep.visualize.data_play(Y, data_show, frame_rate)
|
||||
return Y
|
||||
|
||||
def stick(kernel=None, optimize=True, verbose=True, plot=True):
|
||||
from matplotlib import pyplot as plt
|
||||
import GPy
|
||||
import pods
|
||||
|
||||
data = GPy.util.datasets.osu_run1()
|
||||
data = pods.datasets.osu_run1()
|
||||
# optimize
|
||||
m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel)
|
||||
if optimize: m.optimize(messages=verbose, max_f_eval=10000)
|
||||
if plot and GPy.util.visualize.visual_available:
|
||||
if optimize: m.optimize('bfgs', messages=verbose, max_f_eval=10000)
|
||||
if plot:
|
||||
plt.clf
|
||||
ax = m.plot_latent()
|
||||
y = m.likelihood.Y[0, :]
|
||||
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
|
||||
GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||
y = m.Y[0, :]
|
||||
data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
|
||||
lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[:1, :].copy(), m, data_show, latent_axes=ax)
|
||||
raw_input('Press enter to finish')
|
||||
|
||||
lvm_visualizer.close()
|
||||
data_show.close()
|
||||
return m
|
||||
|
||||
def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True):
|
||||
from matplotlib import pyplot as plt
|
||||
import GPy
|
||||
import pods
|
||||
|
||||
data = GPy.util.datasets.osu_run1()
|
||||
data = pods.datasets.osu_run1()
|
||||
# optimize
|
||||
mapping = GPy.mappings.Linear(data['Y'].shape[1], 2)
|
||||
m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
|
||||
if optimize: m.optimize(messages=verbose, max_f_eval=10000)
|
||||
if plot and GPy.util.visualize.visual_available:
|
||||
if plot and GPy.plotting.matplot_dep.visualize.visual_available:
|
||||
plt.clf
|
||||
ax = m.plot_latent()
|
||||
y = m.likelihood.Y[0, :]
|
||||
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
|
||||
GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||
data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
|
||||
GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||
raw_input('Press enter to finish')
|
||||
|
||||
return m
|
||||
|
|
@ -401,32 +552,33 @@ def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True):
|
|||
def bcgplvm_stick(kernel=None, optimize=True, verbose=True, plot=True):
|
||||
from matplotlib import pyplot as plt
|
||||
import GPy
|
||||
import pods
|
||||
|
||||
data = GPy.util.datasets.osu_run1()
|
||||
data = pods.datasets.osu_run1()
|
||||
# optimize
|
||||
back_kernel=GPy.kern.rbf(data['Y'].shape[1], lengthscale=5.)
|
||||
back_kernel = GPy.kern.RBF(data['Y'].shape[1], lengthscale=5.)
|
||||
mapping = GPy.mappings.Kernel(X=data['Y'], output_dim=2, kernel=back_kernel)
|
||||
m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
|
||||
if optimize: m.optimize(messages=verbose, max_f_eval=10000)
|
||||
if plot and GPy.util.visualize.visual_available:
|
||||
if plot and GPy.plotting.matplot_dep.visualize.visual_available:
|
||||
plt.clf
|
||||
ax = m.plot_latent()
|
||||
y = m.likelihood.Y[0, :]
|
||||
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
|
||||
GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||
raw_input('Press enter to finish')
|
||||
data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
|
||||
GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||
# raw_input('Press enter to finish')
|
||||
|
||||
return m
|
||||
|
||||
def robot_wireless(optimize=True, verbose=True, plot=True):
|
||||
from matplotlib import pyplot as plt
|
||||
import GPy
|
||||
import pods
|
||||
|
||||
data = GPy.util.datasets.robot_wireless()
|
||||
data = pods.datasets.robot_wireless()
|
||||
# optimize
|
||||
m = GPy.models.GPLVM(data['Y'], 2)
|
||||
m = GPy.models.BayesianGPLVM(data['Y'], 4, num_inducing=25)
|
||||
if optimize: m.optimize(messages=verbose, max_f_eval=10000)
|
||||
m._set_params(m._get_params())
|
||||
if plot:
|
||||
m.plot_latent()
|
||||
|
||||
|
|
@ -435,23 +587,33 @@ def robot_wireless(optimize=True, verbose=True, plot=True):
|
|||
def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
|
||||
from GPy.models import BayesianGPLVM
|
||||
from matplotlib import pyplot as plt
|
||||
import numpy as np
|
||||
import GPy
|
||||
import pods
|
||||
|
||||
data = GPy.util.datasets.osu_run1()
|
||||
data = pods.datasets.osu_run1()
|
||||
Q = 6
|
||||
kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2))
|
||||
kernel = GPy.kern.RBF(Q, lengthscale=np.repeat(.5, Q), ARD=True)
|
||||
m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel)
|
||||
|
||||
m.data = data
|
||||
m.likelihood.variance = 0.001
|
||||
|
||||
# optimize
|
||||
m.ensure_default_constraints()
|
||||
if optimize: m.optimize('scg', messages=verbose, max_iters=200, xtol=1e-300, ftol=1e-300)
|
||||
m._set_params(m._get_params())
|
||||
try:
|
||||
if optimize: m.optimize('bfgs', messages=verbose, max_iters=5e3, bfgs_factor=10)
|
||||
except KeyboardInterrupt:
|
||||
print "Keyboard interrupt, continuing to plot and return"
|
||||
|
||||
if plot:
|
||||
plt.clf, (latent_axes, sense_axes) = plt.subplots(1, 2)
|
||||
fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
|
||||
plt.sca(latent_axes)
|
||||
m.plot_latent()
|
||||
y = m.likelihood.Y[0, :].copy()
|
||||
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect'])
|
||||
GPy.util.visualize.lvm_dimselect(m.X[0, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
|
||||
m.plot_latent(ax=latent_axes)
|
||||
y = m.Y[:1, :].copy()
|
||||
data_show = GPy.plotting.matplot_dep.visualize.stick_show(y, connect=data['connect'])
|
||||
dim_select = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean[:1, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
|
||||
fig.canvas.draw()
|
||||
fig.canvas.show()
|
||||
raw_input('Press enter to finish')
|
||||
|
||||
return m
|
||||
|
|
@ -459,20 +621,50 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
|
|||
|
||||
def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose=True, plot=True):
|
||||
import GPy
|
||||
import pods
|
||||
|
||||
data = GPy.util.datasets.cmu_mocap(subject, motion)
|
||||
data = pods.datasets.cmu_mocap(subject, motion)
|
||||
if in_place:
|
||||
# Make figure move in place.
|
||||
data['Y'][:, 0:3] = 0.0
|
||||
m = GPy.models.GPLVM(data['Y'], 2, normalize_Y=True)
|
||||
Y = data['Y']
|
||||
Y_mean = Y.mean(0)
|
||||
Y_std = Y.std(0)
|
||||
m = GPy.models.GPLVM((Y - Y_mean) / Y_std, 2)
|
||||
|
||||
if optimize: m.optimize(messages=verbose, max_f_eval=10000)
|
||||
if plot:
|
||||
ax = m.plot_latent()
|
||||
y = m.likelihood.Y[0, :]
|
||||
data_show = GPy.util.visualize.skeleton_show(y[None, :], data['skel'])
|
||||
lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
|
||||
y = m.Y[0, :]
|
||||
data_show = GPy.plotting.matplot_dep.visualize.skeleton_show(y[None, :], data['skel'])
|
||||
lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[0].copy(), m, data_show, latent_axes=ax)
|
||||
raw_input('Press enter to finish')
|
||||
lvm_visualizer.close()
|
||||
data_show.close()
|
||||
|
||||
return m
|
||||
|
||||
def ssgplvm_simulation_linear():
|
||||
import numpy as np
|
||||
import GPy
|
||||
N, D, Q = 1000, 20, 5
|
||||
pi = 0.2
|
||||
|
||||
def sample_X(Q, pi):
|
||||
x = np.empty(Q)
|
||||
dies = np.random.rand(Q)
|
||||
for q in xrange(Q):
|
||||
if dies[q] < pi:
|
||||
x[q] = np.random.randn()
|
||||
else:
|
||||
x[q] = 0.
|
||||
return x
|
||||
|
||||
Y = np.empty((N, D))
|
||||
X = np.empty((N, Q))
|
||||
# Generate data from random sampled weight matrices
|
||||
for n in xrange(N):
|
||||
X[n] = sample_X(Q, pi)
|
||||
w = np.random.randn(D, Q)
|
||||
Y[n] = np.dot(w, X[n])
|
||||
|
||||
|
|
|
|||
|
|
@@ -1,7 +1,13 @@
# Copyright (c) 2014, Alan Saul
# Licensed under the BSD 3-clause license (see LICENSE.txt)

import GPy
import numpy as np
import matplotlib.pyplot as plt
from GPy.util import datasets
try:
    import matplotlib.pyplot as plt
except:
    pass

def student_t_approx(optimize=True, plot=True):
    """
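The student_t_approx hunks that follow replace the old noise_model_constructors API with GPy.likelihoods.StudentT combined with Laplace inference. A minimal sketch of that combination on toy data, assuming only the class and module paths shown in the diff; the data and settings are invented here for illustration.

import numpy as np
import GPy

# Hypothetical 1D data with a few outliers to motivate the heavy-tailed likelihood.
X = np.linspace(0, 10, 50)[:, None]
Y = np.sin(X) + np.random.randn(50, 1) * 0.1
Y[::10] += 2.0  # corrupt every tenth observation

kernel = GPy.kern.RBF(1)
t_distribution = GPy.likelihoods.StudentT(deg_free=5, sigma2=0.5)
laplace_inf = GPy.inference.latent_function_inference.Laplace()
m = GPy.core.GP(X, Y, kernel, likelihood=t_distribution, inference_method=laplace_inf)
m.optimize('bfgs', messages=0)
print(m)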
@ -30,47 +36,53 @@ def student_t_approx(optimize=True, plot=True):
|
|||
#Yc = Yc/Yc.max()
|
||||
|
||||
#Add student t random noise to datapoints
|
||||
deg_free = 5
|
||||
deg_free = 1
|
||||
print "Real noise: ", real_std
|
||||
initial_var_guess = 0.5
|
||||
edited_real_sd = initial_var_guess
|
||||
|
||||
# Kernel object
|
||||
kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1])
|
||||
kernel2 = kernel1.copy()
|
||||
kernel3 = kernel1.copy()
|
||||
kernel4 = kernel1.copy()
|
||||
kernel1 = GPy.kern.RBF(X.shape[1]) + GPy.kern.White(X.shape[1])
|
||||
kernel2 = GPy.kern.RBF(X.shape[1]) + GPy.kern.White(X.shape[1])
|
||||
kernel3 = GPy.kern.RBF(X.shape[1]) + GPy.kern.White(X.shape[1])
|
||||
kernel4 = GPy.kern.RBF(X.shape[1]) + GPy.kern.White(X.shape[1])
|
||||
|
||||
#Gaussian GP model on clean data
|
||||
m1 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel1)
|
||||
# optimize
|
||||
m1.ensure_default_constraints()
|
||||
m1.constrain_fixed('white', 1e-5)
|
||||
m1['.*white'].constrain_fixed(1e-5)
|
||||
m1.randomize()
|
||||
|
||||
#Gaussian GP model on corrupt data
|
||||
m2 = GPy.models.GPRegression(X, Yc.copy(), kernel=kernel2)
|
||||
m2.ensure_default_constraints()
|
||||
m2.constrain_fixed('white', 1e-5)
|
||||
m2['.*white'].constrain_fixed(1e-5)
|
||||
m2.randomize()
|
||||
|
||||
#Student t GP model on clean data
|
||||
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
|
||||
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution)
|
||||
m3 = GPy.models.GPRegression(X, Y.copy(), kernel3, likelihood=stu_t_likelihood)
|
||||
m3.ensure_default_constraints()
|
||||
m3.constrain_bounded('t_noise', 1e-6, 10.)
|
||||
m3.constrain_fixed('white', 1e-5)
|
||||
t_distribution = GPy.likelihoods.StudentT(deg_free=deg_free, sigma2=edited_real_sd)
|
||||
laplace_inf = GPy.inference.latent_function_inference.Laplace()
|
||||
m3 = GPy.core.GP(X, Y.copy(), kernel3, likelihood=t_distribution, inference_method=laplace_inf)
|
||||
m3['.*t_scale2'].constrain_bounded(1e-6, 10.)
|
||||
m3['.*white'].constrain_fixed(1e-5)
|
||||
m3.randomize()
|
||||
|
||||
#Student t GP model on corrupt data
|
||||
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd)
|
||||
corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution)
|
||||
m4 = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood)
|
||||
m4.ensure_default_constraints()
|
||||
m4.constrain_bounded('t_noise', 1e-6, 10.)
|
||||
m4.constrain_fixed('white', 1e-5)
|
||||
t_distribution = GPy.likelihoods.StudentT(deg_free=deg_free, sigma2=edited_real_sd)
|
||||
laplace_inf = GPy.inference.latent_function_inference.Laplace()
|
||||
m4 = GPy.core.GP(X, Yc.copy(), kernel4, likelihood=t_distribution, inference_method=laplace_inf)
|
||||
m4['.*t_scale2'].constrain_bounded(1e-6, 10.)
|
||||
m4['.*white'].constrain_fixed(1e-5)
|
||||
m4.randomize()
|
||||
print m4
|
||||
debug=True
|
||||
if debug:
|
||||
m4.optimize(messages=1)
|
||||
import pylab as pb
|
||||
pb.plot(m4.X, m4.inference_method.f_hat)
|
||||
pb.plot(m4.X, m4.Y, 'rx')
|
||||
m4.plot()
|
||||
print m4
|
||||
return m4
|
||||
|
||||
if optimize:
|
||||
optimizer='scg'
|
||||
|
|
@ -115,6 +127,7 @@ def student_t_approx(optimize=True, plot=True):
|
|||
return m1, m2, m3, m4
|
||||
|
||||
def boston_example(optimize=True, plot=True):
|
||||
raise NotImplementedError("Needs updating")
|
||||
import sklearn
|
||||
from sklearn.cross_validation import KFold
|
||||
optimizer='bfgs'
|
||||
|
|
@ -143,8 +156,8 @@ def boston_example(optimize=True, plot=True):
|
|||
noise = 1e-1 #np.exp(-2)
|
||||
rbf_len = 0.5
|
||||
data_axis_plot = 4
|
||||
kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
|
||||
kernelgp = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
|
||||
kernelstu = GPy.kern.RBF(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
|
||||
kernelgp = GPy.kern.RBF(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
|
||||
|
||||
#Baseline
|
||||
score_folds[0, n] = rmse(Y_test, np.mean(Y_train))
|
||||
|
|
@ -152,10 +165,9 @@ def boston_example(optimize=True, plot=True):
|
|||
#Gaussian GP
|
||||
print "Gauss GP"
|
||||
mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp.copy())
|
||||
mgp.ensure_default_constraints()
|
||||
mgp.constrain_fixed('white', 1e-5)
|
||||
mgp['rbf_len'] = rbf_len
|
||||
mgp['noise'] = noise
|
||||
mgp.constrain_fixed('.*white', 1e-5)
|
||||
mgp['.*len'] = rbf_len
|
||||
mgp['.*noise'] = noise
|
||||
print mgp
|
||||
if optimize:
|
||||
mgp.optimize(optimizer=optimizer, messages=messages)
|
||||
|
|
@ -170,9 +182,8 @@ def boston_example(optimize=True, plot=True):
|
|||
g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=noise, N=N, D=D)
|
||||
g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution)
|
||||
mg = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=g_likelihood)
|
||||
mg.ensure_default_constraints()
|
||||
mg.constrain_positive('noise_variance')
|
||||
mg.constrain_fixed('white', 1e-5)
|
||||
mg.constrain_fixed('.*white', 1e-5)
|
||||
mg['rbf_len'] = rbf_len
|
||||
mg['noise'] = noise
|
||||
print mg
|
||||
|
|
@ -190,11 +201,10 @@ def boston_example(optimize=True, plot=True):
|
|||
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=df, sigma2=noise)
|
||||
stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
|
||||
mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
|
||||
mstu_t.ensure_default_constraints()
|
||||
mstu_t.constrain_fixed('white', 1e-5)
|
||||
mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
|
||||
mstu_t.constrain_fixed('.*white', 1e-5)
|
||||
mstu_t.constrain_bounded('.*t_scale2', 0.0001, 1000)
|
||||
mstu_t['rbf_len'] = rbf_len
|
||||
mstu_t['t_noise'] = noise
|
||||
mstu_t['.*t_scale2'] = noise
|
||||
print mstu_t
|
||||
if optimize:
|
||||
mstu_t.optimize(optimizer=optimizer, messages=messages)
|
||||
|
|
|
|||
|
|
@@ -1,22 +1,29 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)

"""
Gaussian Processes regression examples
"""
import pylab as pb
try:
    import pylab as pb
except:
    pass
import numpy as np
import GPy

def olympic_marathon_men(optimize=True, plot=True):
    """Run a standard Gaussian process regression on the Olympic marathon data."""
    data = GPy.util.datasets.olympic_marathon_men()
    try:import pods
    except ImportError:
        print 'pods unavailable, see https://github.com/sods/ods for example datasets'
        return
    data = pods.datasets.olympic_marathon_men()

    # create simple GP Model
    m = GPy.models.GPRegression(data['X'], data['Y'])

    # set the lengthscale to be something sensible (defaults to 1)
    m['rbf_lengthscale'] = 10
    m.kern.lengthscale = 10.

    if optimize:
        m.optimize('bfgs', max_iters=200)

@@ -25,79 +32,51 @@ def olympic_marathon_men(optimize=True, plot=True):

    return m

def coregionalization_toy2(optimize=True, plot=True):
def coregionalization_toy(optimize=True, plot=True):
    """
    A simple demonstration of coregionalization on two sinusoidal functions.
    """
    #build a design matrix with a column of integers indicating the output
    X1 = np.random.rand(50, 1) * 8
    X2 = np.random.rand(30, 1) * 5
    index = np.vstack((np.zeros_like(X1), np.ones_like(X2)))
    X = np.hstack((np.vstack((X1, X2)), index))

    #build a suitable set of observed variables
    Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
    Y2 = np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + 2.
    Y = np.vstack((Y1, Y2))

    #build the kernel
    k1 = GPy.kern.rbf(1) + GPy.kern.bias(1)
    k2 = GPy.kern.coregionalize(2,1)
    k = k1**k2
    m = GPy.models.GPRegression(X, Y, kernel=k)
    m.constrain_fixed('.*rbf_var', 1.)
    m = GPy.models.GPCoregionalizedRegression(X_list=[X1,X2], Y_list=[Y1,Y2])

    if optimize:
        m.optimize('bfgs', max_iters=100)

    if plot:
        m.plot(fixed_inputs=[(1,0)])
        m.plot(fixed_inputs=[(1,1)], ax=pb.gca())

        slices = GPy.util.multioutput.get_slices([X1,X2])
        m.plot(fixed_inputs=[(1,0)],which_data_rows=slices[0],Y_metadata={'output_index':0})
        m.plot(fixed_inputs=[(1,1)],which_data_rows=slices[1],Y_metadata={'output_index':1},ax=pb.gca())
    return m

#FIXME: Needs recovering once likelihoods are consolidated
#def coregionalization_toy(optimize=True, plot=True):
# """
# A simple demonstration of coregionalization on two sinusoidal functions.
# """
# X1 = np.random.rand(50, 1) * 8
# X2 = np.random.rand(30, 1) * 5
# X = np.vstack((X1, X2))
# Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
# Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
# Y = np.vstack((Y1, Y2))
#
# k1 = GPy.kern.rbf(1)
# m = GPy.models.GPMultioutputRegression(X_list=[X1,X2],Y_list=[Y1,Y2],kernel_list=[k1])
# m.constrain_fixed('.*rbf_var', 1.)
# m.optimize(max_iters=100)
#
# fig, axes = pb.subplots(2,1)
# m.plot(fixed_inputs=[(1,0)],ax=axes[0])
# m.plot(fixed_inputs=[(1,1)],ax=axes[1])
# axes[0].set_title('Output 0')
# axes[1].set_title('Output 1')
# return m

def coregionalization_sparse(optimize=True, plot=True):
    """
    A simple demonstration of coregionalization on two sinusoidal functions using sparse approximations.
    """
    #fetch the data from the non sparse examples
    m = coregionalization_toy2(optimize=False, plot=False)
    X, Y = m.X, m.likelihood.Y
    #build a design matrix with a column of integers indicating the output
    X1 = np.random.rand(50, 1) * 8
    X2 = np.random.rand(30, 1) * 5

    #construct a model
    m = GPy.models.SparseGPRegression(X,Y)
    m.constrain_fixed('iip_\d+_1') # don't optimize the inducing input indexes
    #build a suitable set of observed variables
    Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
    Y2 = np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + 2.

    m = GPy.models.SparseGPCoregionalizedRegression(X_list=[X1,X2], Y_list=[Y1,Y2])

    if optimize:
        m.optimize('bfgs', max_iters=100, messages=1)
        m.optimize('bfgs', max_iters=100)

    if plot:
        m.plot(fixed_inputs=[(1,0)])
        m.plot(fixed_inputs=[(1,1)], ax=pb.gca())
        slices = GPy.util.multioutput.get_slices([X1,X2])
        m.plot(fixed_inputs=[(1,0)],which_data_rows=slices[0],Y_metadata={'output_index':0})
        m.plot(fixed_inputs=[(1,1)],which_data_rows=slices[1],Y_metadata={'output_index':1},ax=pb.gca())
        pb.ylim(-3,)

    return m
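The regression hunks above illustrate the parameter-access change from string indexing (m['rbf_lengthscale'] = 10) to attribute access (m.kern.lengthscale = 10.). A short sketch of the new style on synthetic data; the data and settings here are illustrative only.

import numpy as np
import GPy

# Hypothetical 1D regression data.
X = np.linspace(0, 10, 30)[:, None]
Y = np.sin(X) + np.random.randn(30, 1) * 0.05

m = GPy.models.GPRegression(X, Y)
m.kern.lengthscale = 10.      # attribute access replaces m['rbf_lengthscale'] = 10
m.optimize('bfgs', max_iters=200)
print(m)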
||||
|
|
@ -107,7 +86,11 @@ def epomeo_gpx(max_iters=200, optimize=True, plot=True):
|
|||
from the Mount Epomeo runs. Requires gpxpy to be installed on your system
|
||||
to load in the data.
|
||||
"""
|
||||
data = GPy.util.datasets.epomeo_gpx()
|
||||
try:import pods
|
||||
except ImportError:
|
||||
print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
return
|
||||
data = pods.datasets.epomeo_gpx()
|
||||
num_data_list = []
|
||||
for Xpart in data['X']:
|
||||
num_data_list.append(Xpart.shape[0])
|
||||
|
|
@ -127,14 +110,14 @@ def epomeo_gpx(max_iters=200, optimize=True, plot=True):
|
|||
Z = np.hstack((np.linspace(t[:,0].min(), t[:, 0].max(), num_inducing)[:, None],
|
||||
np.random.randint(0, 4, num_inducing)[:, None]))
|
||||
|
||||
k1 = GPy.kern.rbf(1)
|
||||
k2 = GPy.kern.coregionalize(output_dim=5, rank=5)
|
||||
k1 = GPy.kern.RBF(1)
|
||||
k2 = GPy.kern.Coregionalize(output_dim=5, rank=5)
|
||||
k = k1**k2
|
||||
|
||||
m = GPy.models.SparseGPRegression(t, Y, kernel=k, Z=Z, normalize_Y=True)
|
||||
m.constrain_fixed('.*rbf_var', 1.)
|
||||
m.constrain_fixed('iip')
|
||||
m.constrain_bounded('noise_variance', 1e-3, 1e-1)
|
||||
m.constrain_fixed('.*variance', 1.)
|
||||
m.inducing_inputs.constrain_fixed()
|
||||
m.Gaussian_noise.variance.constrain_bounded(1e-3, 1e-1)
|
||||
m.optimize(max_iters=max_iters,messages=True)
|
||||
|
||||
return m
|
||||
|
|
@ -150,13 +133,17 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
|
|||
length_scales = np.linspace(0.1, 60., resolution)
|
||||
log_SNRs = np.linspace(-3., 4., resolution)
|
||||
|
||||
data = GPy.util.datasets.della_gatta_TRP63_gene_expression(data_set='della_gatta',gene_number=gene_number)
|
||||
try:import pods
|
||||
except ImportError:
|
||||
print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
return
|
||||
data = pods.datasets.della_gatta_TRP63_gene_expression(data_set='della_gatta',gene_number=gene_number)
|
||||
# data['Y'] = data['Y'][0::2, :]
|
||||
# data['X'] = data['X'][0::2, :]
|
||||
|
||||
data['Y'] = data['Y'] - np.mean(data['Y'])
|
||||
|
||||
lls = GPy.examples.regression._contour_data(data, length_scales, log_SNRs, GPy.kern.rbf)
|
||||
lls = GPy.examples.regression._contour_data(data, length_scales, log_SNRs, GPy.kern.RBF)
|
||||
if plot:
|
||||
pb.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=pb.cm.jet)
|
||||
ax = pb.gca()
|
||||
|
|
@ -172,20 +159,20 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
|
|||
optim_point_y = np.empty(2)
|
||||
np.random.seed(seed=seed)
|
||||
for i in range(0, model_restarts):
|
||||
# kern = GPy.kern.rbf(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.))
|
||||
kern = GPy.kern.rbf(1, variance=np.random.uniform(1e-3, 1), lengthscale=np.random.uniform(5, 50))
|
||||
# kern = GPy.kern.RBF(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.))
|
||||
kern = GPy.kern.RBF(1, variance=np.random.uniform(1e-3, 1), lengthscale=np.random.uniform(5, 50))
|
||||
|
||||
m = GPy.models.GPRegression(data['X'], data['Y'], kernel=kern)
|
||||
m['noise_variance'] = np.random.uniform(1e-3, 1)
|
||||
optim_point_x[0] = m['rbf_lengthscale']
|
||||
optim_point_y[0] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);
|
||||
m.likelihood.variance = np.random.uniform(1e-3, 1)
|
||||
optim_point_x[0] = m.rbf.lengthscale
|
||||
optim_point_y[0] = np.log10(m.rbf.variance) - np.log10(m.likelihood.variance);
|
||||
|
||||
# optimize
|
||||
if optimize:
|
||||
m.optimize('scg', xtol=1e-6, ftol=1e-6, max_iters=max_iters)
|
||||
|
||||
optim_point_x[1] = m['rbf_lengthscale']
|
||||
optim_point_y[1] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']);
|
||||
optim_point_x[1] = m.rbf.lengthscale
|
||||
optim_point_y[1] = np.log10(m.rbf.variance) - np.log10(m.likelihood.variance);
|
||||
|
||||
if plot:
|
||||
pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1] - optim_point_x[0], optim_point_y[1] - optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k')
|
||||
|
|
@ -196,7 +183,7 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
|
|||
ax.set_ylim(ylim)
|
||||
return m # (models, lls)
|
||||
|
||||
def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf):
|
||||
def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.RBF):
|
||||
"""
|
||||
Evaluate the GP objective function for a given data set for a range of
|
||||
signal to noise ratios and a range of lengthscales.
|
||||
|
|
@ -216,7 +203,7 @@ def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf):
|
|||
noise_var = total_var / (1. + SNR)
|
||||
signal_var = total_var - noise_var
|
||||
model.kern['.*variance'] = signal_var
|
||||
model['noise_variance'] = noise_var
|
||||
model.likelihood.variance = noise_var
|
||||
length_scale_lls = []
|
||||
|
||||
for length_scale in length_scales:
|
||||
|
|
@ -230,13 +217,17 @@ def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf):
|
|||
|
||||
def olympic_100m_men(optimize=True, plot=True):
|
||||
"""Run a standard Gaussian process regression on the Rogers and Girolami olympics data."""
|
||||
data = GPy.util.datasets.olympic_100m_men()
|
||||
try:import pods
|
||||
except ImportError:
|
||||
print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
return
|
||||
data = pods.datasets.olympic_100m_men()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(data['X'], data['Y'])
|
||||
|
||||
# set the lengthscale to be something sensible (defaults to 1)
|
||||
m['rbf_lengthscale'] = 10
|
||||
m.rbf.lengthscale = 10
|
||||
|
||||
if optimize:
|
||||
m.optimize('bfgs', max_iters=200)
|
||||
|
|
@ -247,7 +238,11 @@ def olympic_100m_men(optimize=True, plot=True):
|
|||
|
||||
def toy_rbf_1d(optimize=True, plot=True):
|
||||
"""Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
|
||||
data = GPy.util.datasets.toy_rbf_1d()
|
||||
try:import pods
|
||||
except ImportError:
|
||||
print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
return
|
||||
data = pods.datasets.toy_rbf_1d()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(data['X'], data['Y'])
|
||||
|
|
@ -261,7 +256,11 @@ def toy_rbf_1d(optimize=True, plot=True):
|
|||
|
||||
def toy_rbf_1d_50(optimize=True, plot=True):
|
||||
"""Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
|
||||
data = GPy.util.datasets.toy_rbf_1d_50()
|
||||
try:import pods
|
||||
except ImportError:
|
||||
print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
return
|
||||
data = pods.datasets.toy_rbf_1d_50()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(data['X'], data['Y'])
|
||||
|
|
@ -278,14 +277,15 @@ def toy_poisson_rbf_1d_laplace(optimize=True, plot=True):
|
|||
optimizer='scg'
|
||||
x_len = 30
|
||||
X = np.linspace(0, 10, x_len)[:, None]
|
||||
f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.rbf(1).K(X))
|
||||
f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.RBF(1).K(X))
|
||||
Y = np.array([np.random.poisson(np.exp(f)) for f in f_true])[:,None]
|
||||
|
||||
noise_model = GPy.likelihoods.poisson()
|
||||
likelihood = GPy.likelihoods.Laplace(Y,noise_model)
|
||||
kern = GPy.kern.RBF(1)
|
||||
poisson_lik = GPy.likelihoods.Poisson()
|
||||
laplace_inf = GPy.inference.latent_function_inference.Laplace()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(X, Y, likelihood=likelihood)
|
||||
m = GPy.core.GP(X, Y, kernel=kern, likelihood=poisson_lik, inference_method=laplace_inf)
|
||||
|
||||
if optimize:
|
||||
m.optimize(optimizer)
|
||||
|
|
@ -316,23 +316,22 @@ def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize
|
|||
Y /= Y.std()
|
||||
|
||||
if kernel_type == 'linear':
|
||||
kernel = GPy.kern.linear(X.shape[1], ARD=1)
|
||||
kernel = GPy.kern.Linear(X.shape[1], ARD=1)
|
||||
elif kernel_type == 'rbf_inv':
|
||||
kernel = GPy.kern.rbf_inv(X.shape[1], ARD=1)
|
||||
kernel = GPy.kern.RBF_inv(X.shape[1], ARD=1)
|
||||
else:
|
||||
kernel = GPy.kern.rbf(X.shape[1], ARD=1)
|
||||
kernel += GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
|
||||
kernel = GPy.kern.RBF(X.shape[1], ARD=1)
|
||||
kernel += GPy.kern.White(X.shape[1]) + GPy.kern.Bias(X.shape[1])
|
||||
m = GPy.models.GPRegression(X, Y, kernel)
|
||||
# len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25
|
||||
# m.set_prior('.*lengthscale',len_prior)
|
||||
|
||||
if optimize:
|
||||
m.optimize(optimizer='scg', max_iters=max_iters, messages=1)
|
||||
m.optimize(optimizer='scg', max_iters=max_iters)
|
||||
|
||||
if plot:
|
||||
m.kern.plot_ARD()
|
||||
|
||||
print m
|
||||
return m
|
||||
|
||||
def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize=True, plot=True):
|
||||
|
|
@ -355,36 +354,39 @@ def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4, o
|
|||
Y /= Y.std()
|
||||
|
||||
if kernel_type == 'linear':
|
||||
kernel = GPy.kern.linear(X.shape[1], ARD=1)
|
||||
kernel = GPy.kern.Linear(X.shape[1], ARD=1)
|
||||
elif kernel_type == 'rbf_inv':
|
||||
kernel = GPy.kern.rbf_inv(X.shape[1], ARD=1)
|
||||
kernel = GPy.kern.RBF_inv(X.shape[1], ARD=1)
|
||||
else:
|
||||
kernel = GPy.kern.rbf(X.shape[1], ARD=1)
|
||||
kernel += GPy.kern.bias(X.shape[1])
|
||||
kernel = GPy.kern.RBF(X.shape[1], ARD=1)
|
||||
#kernel += GPy.kern.Bias(X.shape[1])
|
||||
X_variance = np.ones(X.shape) * 0.5
|
||||
m = GPy.models.SparseGPRegression(X, Y, kernel, X_variance=X_variance)
|
||||
# len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25
|
||||
# m.set_prior('.*lengthscale',len_prior)
|
||||
|
||||
if optimize:
|
||||
m.optimize(optimizer='scg', max_iters=max_iters, messages=1)
|
||||
m.optimize(optimizer='scg', max_iters=max_iters)
|
||||
|
||||
if plot:
|
||||
m.kern.plot_ARD()
|
||||
|
||||
print m
|
||||
return m
|
||||
|
||||
def robot_wireless(max_iters=100, kernel=None, optimize=True, plot=True):
|
||||
"""Predict the location of a robot given wirelss signal strength readings."""
|
||||
data = GPy.util.datasets.robot_wireless()
|
||||
try:import pods
|
||||
except ImportError:
|
||||
print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
return
|
||||
data = pods.datasets.robot_wireless()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(data['Y'], data['X'], kernel=kernel)
|
||||
|
||||
# optimize
|
||||
if optimize:
|
||||
m.optimize(messages=True, max_iters=max_iters)
|
||||
m.optimize(max_iters=max_iters)
|
||||
|
||||
Xpredict = m.predict(data['Ytest'])[0]
|
||||
if plot:
|
||||
|
|
@ -396,13 +398,16 @@ def robot_wireless(max_iters=100, kernel=None, optimize=True, plot=True):
|
|||
|
||||
sse = ((data['Xtest'] - Xpredict)**2).sum()
|
||||
|
||||
print m
|
||||
print('Sum of squares error on test data: ' + str(sse))
|
||||
return m
|
||||
|
||||
def silhouette(max_iters=100, optimize=True, plot=True):
|
||||
"""Predict the pose of a figure given a silhouette. This is a task from Agarwal and Triggs 2004 ICML paper."""
|
||||
data = GPy.util.datasets.silhouette()
|
||||
try:import pods
|
||||
except ImportError:
|
||||
print 'pods unavailable, see https://github.com/sods/ods for example datasets'
|
||||
return
|
||||
data = pods.datasets.silhouette()
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.GPRegression(data['X'], data['Y'])
|
||||
|
|
@ -414,32 +419,38 @@ def silhouette(max_iters=100, optimize=True, plot=True):
|
|||
print m
|
||||
return m
|
||||
|
||||
def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, optimize=True, plot=True):
|
||||
def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, optimize=True, plot=True, checkgrad=False):
|
||||
"""Run a 1D example of a sparse GP regression."""
|
||||
# sample inputs and outputs
|
||||
X = np.random.uniform(-3., 3., (num_samples, 1))
|
||||
Y = np.sin(X) + np.random.randn(num_samples, 1) * 0.05
|
||||
# construct kernel
|
||||
rbf = GPy.kern.rbf(1)
|
||||
rbf = GPy.kern.RBF(1)
|
||||
# create simple GP Model
|
||||
m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing)
|
||||
m.checkgrad(verbose=1)
|
||||
|
||||
if checkgrad:
|
||||
m.checkgrad()
|
||||
|
||||
if optimize:
|
||||
m.optimize('tnc', messages=1, max_iters=max_iters)
|
||||
m.optimize('tnc', max_iters=max_iters)
|
||||
|
||||
if plot:
|
||||
m.plot()
|
||||
|
||||
return m
|
||||
|
||||
def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100, optimize=True, plot=True):
|
||||
def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100, optimize=True, plot=True, nan=False):
|
||||
"""Run a 2D example of a sparse GP regression."""
|
||||
np.random.seed(1234)
|
||||
X = np.random.uniform(-3., 3., (num_samples, 2))
|
||||
Y = np.sin(X[:, 0:1]) * np.sin(X[:, 1:2]) + np.random.randn(num_samples, 1) * 0.05
|
||||
if nan:
|
||||
inan = np.random.binomial(1,.2,size=Y.shape)
|
||||
Y[inan] = np.nan
|
||||
|
||||
# construct kernel
|
||||
rbf = GPy.kern.rbf(2)
|
||||
rbf = GPy.kern.RBF(2)
|
||||
|
||||
# create simple GP Model
|
||||
m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing)
|
||||
|
|
@ -462,7 +473,7 @@ def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100, opt
|
|||
|
||||
def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
|
||||
"""Run a 1D example of a sparse GP regression with uncertain inputs."""
|
||||
fig, axes = pb.subplots(1, 2, figsize=(12, 5))
|
||||
fig, axes = pb.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)
|
||||
|
||||
# sample inputs and outputs
|
||||
S = np.ones((20, 1))
|
||||
|
|
@ -471,8 +482,7 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
|
|||
# likelihood = GPy.likelihoods.Gaussian(Y)
|
||||
Z = np.random.uniform(-3., 3., (7, 1))
|
||||
|
||||
k = GPy.kern.rbf(1)
|
||||
|
||||
k = GPy.kern.RBF(1)
|
||||
# create simple GP Model - no input uncertainty on this one
|
||||
m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z)
|
||||
|
||||
|
|
@ -485,7 +495,7 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
|
|||
print m
|
||||
|
||||
# the same Model with uncertainty
|
||||
m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z, X_variance=S)
|
||||
m = GPy.models.SparseGPRegression(X, Y, kernel=GPy.kern.RBF(1), Z=Z, X_variance=S)
|
||||
if optimize:
|
||||
m.optimize('scg', messages=1, max_iters=max_iters)
|
||||
if plot:
|
||||
|
|
|
|||
|
|
@ -1,37 +0,0 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import pylab as pb
|
||||
import numpy as np
|
||||
import GPy
|
||||
|
||||
def toy_1d(optimize=True, plot=True):
|
||||
N = 2000
|
||||
M = 20
|
||||
|
||||
#create data
|
||||
X = np.linspace(0,32,N)[:,None]
|
||||
Z = np.linspace(0,32,M)[:,None]
|
||||
Y = np.sin(X) + np.cos(0.3*X) + np.random.randn(*X.shape)/np.sqrt(50.)
|
||||
|
||||
m = GPy.models.SVIGPRegression(X,Y, batchsize=10, Z=Z)
|
||||
m.constrain_bounded('noise_variance',1e-3,1e-1)
|
||||
m.constrain_bounded('white_variance',1e-3,1e-1)
|
||||
|
||||
m.param_steplength = 1e-4
|
||||
|
||||
if plot:
|
||||
fig = pb.figure()
|
||||
ax = fig.add_subplot(111)
|
||||
def cb(foo):
|
||||
ax.cla()
|
||||
m.plot(ax=ax,Z_height=-3)
|
||||
ax.set_ylim(-3,3)
|
||||
fig.canvas.draw()
|
||||
|
||||
if optimize:
|
||||
m.optimize(500, callback=cb, callback_interval=1)
|
||||
|
||||
if plot:
|
||||
m.plot_traces()
|
||||
return m
|
||||
|
|
@ -1,153 +0,0 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
"""
|
||||
Code of Tutorials
|
||||
"""
|
||||
|
||||
import pylab as pb
|
||||
pb.ion()
|
||||
import numpy as np
|
||||
import GPy
|
||||
|
||||
def tuto_GP_regression(optimize=True, plot=True):
|
||||
"""The detailed explanations of the commands used in this file can be found in the tutorial section"""
|
||||
|
||||
X = np.random.uniform(-3.,3.,(20,1))
|
||||
Y = np.sin(X) + np.random.randn(20,1)*0.05
|
||||
|
||||
kernel = GPy.kern.rbf(input_dim=1, variance=1., lengthscale=1.)
|
||||
|
||||
m = GPy.models.GPRegression(X, Y, kernel)
|
||||
|
||||
print m
|
||||
if plot:
|
||||
m.plot()
|
||||
|
||||
m.constrain_positive('')
|
||||
|
||||
m.unconstrain('') # may be used to remove the previous constrains
|
||||
m.constrain_positive('.*rbf_variance')
|
||||
m.constrain_bounded('.*lengthscale',1.,10. )
|
||||
m.constrain_fixed('.*noise',0.0025)
|
||||
|
||||
if optimize:
|
||||
m.optimize()
|
||||
m.optimize_restarts(num_restarts = 10)
|
||||
|
||||
#######################################################
|
||||
#######################################################
|
||||
# sample inputs and outputs
|
||||
X = np.random.uniform(-3.,3.,(50,2))
|
||||
Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(50,1)*0.05
|
||||
|
||||
# define kernel
|
||||
ker = GPy.kern.Matern52(2,ARD=True) + GPy.kern.white(2)
|
||||
|
||||
# create simple GP model
|
||||
m = GPy.models.GPRegression(X, Y, ker)
|
||||
|
||||
# constrain all parameters to be positive
|
||||
m.constrain_positive('')
|
||||
|
||||
# optimize and plot
|
||||
if optimize:
|
||||
m.optimize('tnc', max_f_eval = 1000)
|
||||
if plot:
|
||||
m.plot()
|
||||
|
||||
print m
|
||||
return(m)
|
||||
|
||||
def tuto_kernel_overview(optimize=True, plot=True):
|
||||
"""The detailed explanations of the commands used in this file can be found in the tutorial section"""
|
||||
ker1 = GPy.kern.rbf(1) # Equivalent to ker1 = GPy.kern.rbf(input_dim=1, variance=1., lengthscale=1.)
|
||||
ker2 = GPy.kern.rbf(input_dim=1, variance = .75, lengthscale=2.)
|
||||
ker3 = GPy.kern.rbf(1, .5, .5)
|
||||
|
||||
print ker2
|
||||
|
||||
if plot:
|
||||
ker1.plot()
|
||||
ker2.plot()
|
||||
ker3.plot()
|
||||
|
||||
k1 = GPy.kern.rbf(1,1.,2.)
|
||||
k2 = GPy.kern.Matern32(1, 0.5, 0.2)
|
||||
|
||||
# Product of kernels
|
||||
k_prod = k1.prod(k2) # By default, tensor=False
|
||||
k_prodtens = k1.prod(k2,tensor=True)
|
||||
|
||||
# Sum of kernels
|
||||
k_add = k1.add(k2) # By default, tensor=False
|
||||
k_addtens = k1.add(k2,tensor=True)
|
||||
|
||||
k1 = GPy.kern.rbf(1,1.,2)
|
||||
k2 = GPy.kern.periodic_Matern52(1,variance=1e3, lengthscale=1, period = 1.5, lower=-5., upper = 5)
|
||||
|
||||
k = k1 * k2 # equivalent to k = k1.prod(k2)
|
||||
print k
|
||||
|
||||
# Simulate sample paths
|
||||
X = np.linspace(-5,5,501)[:,None]
|
||||
Y = np.random.multivariate_normal(np.zeros(501),k.K(X),1)
|
||||
|
||||
k1 = GPy.kern.rbf(1)
|
||||
k2 = GPy.kern.Matern32(1)
|
||||
k3 = GPy.kern.white(1)
|
||||
|
||||
k = k1 + k2 + k3
|
||||
print k
|
||||
|
||||
k.constrain_positive('.*var')
|
||||
k.constrain_fixed(np.array([1]),1.75)
|
||||
k.tie_params('.*len')
|
||||
k.unconstrain('white')
|
||||
k.constrain_bounded('white',lower=1e-5,upper=.5)
|
||||
print k
|
||||
|
||||
k_cst = GPy.kern.bias(1,variance=1.)
|
||||
k_mat = GPy.kern.Matern52(1,variance=1., lengthscale=3)
|
||||
Kanova = (k_cst + k_mat).prod(k_cst + k_mat,tensor=True)
|
||||
print Kanova
|
||||
|
||||
# sample inputs and outputs
|
||||
X = np.random.uniform(-3.,3.,(40,2))
|
||||
Y = 0.5*X[:,:1] + 0.5*X[:,1:] + 2*np.sin(X[:,:1]) * np.sin(X[:,1:])
|
||||
|
||||
# Create GP regression model
|
||||
m = GPy.models.GPRegression(X, Y, Kanova)
|
||||
|
||||
if plot:
|
||||
fig = pb.figure(figsize=(5,5))
|
||||
ax = fig.add_subplot(111)
|
||||
m.plot(ax=ax)
|
||||
|
||||
pb.figure(figsize=(20,3))
|
||||
pb.subplots_adjust(wspace=0.5)
|
||||
axs = pb.subplot(1,5,1)
|
||||
m.plot(ax=axs)
|
||||
pb.subplot(1,5,2)
|
||||
pb.ylabel("= ",rotation='horizontal',fontsize='30')
|
||||
axs = pb.subplot(1,5,3)
|
||||
m.plot(ax=axs, which_parts=[False,True,False,False])
|
||||
pb.ylabel("cst +",rotation='horizontal',fontsize='30')
|
||||
axs = pb.subplot(1,5,4)
|
||||
m.plot(ax=axs, which_parts=[False,False,True,False])
|
||||
pb.ylabel("+ ",rotation='horizontal',fontsize='30')
|
||||
axs = pb.subplot(1,5,5)
|
||||
pb.ylabel("+ ",rotation='horizontal',fontsize='30')
|
||||
m.plot(ax=axs, which_parts=[False,False,False,True])
|
||||
|
||||
return(m)
|
||||
|
||||
|
||||
def model_interaction(optimize=True, plot=True):
|
||||
X = np.random.randn(20,1)
|
||||
Y = np.sin(X) + np.random.randn(*X.shape)*0.01 + 5.
|
||||
k = GPy.kern.rbf(1) + GPy.kern.bias(1)
|
||||
m = GPy.models.GPRegression(X, Y, kernel=k)
|
||||
return m
|
||||
|
||||
|
|
@ -1,7 +0,0 @@
|
|||
# This is the configuration file for GPy
|
||||
|
||||
[parallel]
|
||||
# Enable openmp support. This speeds up some computations, depending on the number
|
||||
# of cores available. Setting up a compiler with openmp support can be difficult on
|
||||
# some platforms, hence this option.
|
||||
openmp=False
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
import latent_function_inference
|
||||
import optimization
|
||||
GPy/inference/latent_function_inference/__init__.py  (new file, 98 lines)
|
|
@ -0,0 +1,98 @@
|
|||
# Copyright (c) 2012, James Hensman
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
__doc__ = """
Inference over Gaussian process latent functions

In all our GP models, the consistency property means that we have a Gaussian
prior over a finite set of points f. This prior is

.. math:: N(f | 0, K)

where K is the kernel matrix.

We also have a likelihood (see GPy.likelihoods) which defines how the data are
related to the latent function: p(y | f). If the likelihood is also Gaussian,
the inference over f is tractable (see exact_gaussian_inference.py).

If the likelihood object is something other than Gaussian, then exact inference
is not tractable. We then resort to a Laplace approximation (laplace.py) or
expectation propagation (ep.py).

The inference methods return a
:class:`~GPy.inference.latent_function_inference.posterior.Posterior`
instance, which is a simple structure containing a summary of the posterior.
The model classes can then use this posterior object for making predictions,
optimizing hyper-parameters, etc.

"""
|
||||
|
||||
class LatentFunctionInference(object):
|
||||
def on_optimization_start(self):
|
||||
"""
|
||||
This function gets called, just before the optimization loop to start.
|
||||
"""
|
||||
pass
|
||||
|
||||
def on_optimization_end(self):
|
||||
"""
|
||||
This function gets called, just after the optimization loop ended.
|
||||
"""
|
||||
pass
|
||||
|
||||
class InferenceMethodList(LatentFunctionInference, list):
|
||||
|
||||
def on_optimization_start(self):
|
||||
for inf in self:
|
||||
inf.on_optimization_start()
|
||||
|
||||
def on_optimization_end(self):
|
||||
for inf in self:
|
||||
inf.on_optimization_end()
|
||||
|
||||
def __getstate__(self):
|
||||
state = []
|
||||
for inf in self:
|
||||
state.append(inf)
|
||||
return state
|
||||
|
||||
def __setstate__(self, state):
|
||||
for inf in state:
|
||||
self.append(inf)
|
||||
|
||||
from exact_gaussian_inference import ExactGaussianInference
|
||||
from laplace import Laplace
|
||||
from GPy.inference.latent_function_inference.var_dtc import VarDTC
|
||||
from expectation_propagation import EP
|
||||
from expectation_propagation_dtc import EPDTC
|
||||
from dtc import DTC
|
||||
from fitc import FITC
|
||||
from var_dtc_parallel import VarDTC_minibatch
|
||||
|
||||
# class FullLatentFunctionData(object):
|
||||
#
|
||||
#
|
||||
|
||||
# class EMLikeLatentFunctionInference(LatentFunctionInference):
|
||||
# def update_approximation(self):
|
||||
# """
|
||||
# This function gets called when the
|
||||
# """
|
||||
#
|
||||
# def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
|
||||
# """
|
||||
# Do inference on the latent functions given a covariance function `kern`,
|
||||
# inputs and outputs `X` and `Y`, inducing_inputs `Z`, and a likelihood `likelihood`.
|
||||
# Additional metadata for the outputs `Y` can be given in `Y_metadata`.
|
||||
# """
|
||||
# raise NotImplementedError, "Abstract base class for full inference"
|
||||
#
|
||||
# class VariationalLatentFunctionInference(LatentFunctionInference):
|
||||
# def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
|
||||
# """
|
||||
# Do inference on the latent functions given a covariance function `kern`,
|
||||
# inputs and outputs `X` and `Y`, inducing_inputs `Z`, and a likelihood `likelihood`.
|
||||
# Additional metadata for the outputs `Y` can be given in `Y_metadata`.
|
||||
# """
|
||||
# raise NotImplementedError, "Abstract base class for full inference"
|
||||
GPy/inference/latent_function_inference/dtc.py  (new file, 162 lines)
|
|
@ -0,0 +1,162 @@
|
|||
# Copyright (c) 2012-2014, James Hensman
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from posterior import Posterior
|
||||
from ...util.linalg import jitchol, tdot, dtrtrs, dpotri, pdinv
|
||||
import numpy as np
|
||||
from . import LatentFunctionInference
|
||||
log_2_pi = np.log(2*np.pi)
|
||||
|
||||
class DTC(LatentFunctionInference):
|
||||
"""
|
||||
An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
|
||||
|
||||
The function self.inference returns a Posterior object, which summarizes
|
||||
the posterior.
|
||||
|
||||
NB. It's not recommended to use this function! It's here for historical purposes.
|
||||
|
||||
"""
|
||||
def __init__(self):
|
||||
self.const_jitter = 1e-6
|
||||
|
||||
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
|
||||
assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."
|
||||
|
||||
num_inducing, _ = Z.shape
|
||||
num_data, output_dim = Y.shape
|
||||
|
||||
#make sure the noise is not hetero
|
||||
beta = 1./likelihood.gaussian_variance(Y_metadata)
|
||||
if beta.size > 1:
|
||||
raise NotImplementedError, "no hetero noise with this implementation of DTC"
|
||||
|
||||
Kmm = kern.K(Z)
|
||||
Knn = kern.Kdiag(X)
|
||||
Knm = kern.K(X, Z)
|
||||
U = Knm
|
||||
Uy = np.dot(U.T,Y)
|
||||
|
||||
#factor Kmm
|
||||
Kmmi, L, Li, _ = pdinv(Kmm)
|
||||
|
||||
# Compute A
|
||||
LiUTbeta = np.dot(Li, U.T)*np.sqrt(beta)
|
||||
A = tdot(LiUTbeta) + np.eye(num_inducing)
|
||||
|
||||
# factor A
|
||||
LA = jitchol(A)
|
||||
|
||||
# back substitute to get b, P, v
|
||||
tmp, _ = dtrtrs(L, Uy, lower=1)
|
||||
b, _ = dtrtrs(LA, tmp*beta, lower=1)
|
||||
tmp, _ = dtrtrs(LA, b, lower=1, trans=1)
|
||||
v, _ = dtrtrs(L, tmp, lower=1, trans=1)
|
||||
tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
|
||||
P = tdot(tmp.T)
|
||||
|
||||
#compute log marginal
|
||||
log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \
|
||||
-np.sum(np.log(np.diag(LA)))*output_dim + \
|
||||
0.5*num_data*output_dim*np.log(beta) + \
|
||||
-0.5*beta*np.sum(np.square(Y)) + \
|
||||
0.5*np.sum(np.square(b))
|
||||
|
||||
# Compute dL_dKmm
|
||||
vvT_P = tdot(v.reshape(-1,1)) + P
|
||||
dL_dK = 0.5*(Kmmi - vvT_P)
|
||||
|
||||
# Compute dL_dU
|
||||
vY = np.dot(v.reshape(-1,1),Y.T)
|
||||
dL_dU = vY - np.dot(vvT_P, U.T)
|
||||
dL_dU *= beta
|
||||
|
||||
#compute dL_dR
|
||||
Uv = np.dot(U, v)
|
||||
dL_dR = 0.5*(np.sum(U*np.dot(U,P), 1) - 1./beta + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1) + np.sum(np.square(Uv), 1))*beta**2
|
||||
|
||||
dL_dthetaL = likelihood.exact_inference_gradients(dL_dR)
|
||||
|
||||
grad_dict = {'dL_dKmm': dL_dK, 'dL_dKdiag':np.zeros_like(Knn), 'dL_dKnm':dL_dU.T, 'dL_dthetaL':dL_dthetaL}
|
||||
|
||||
#construct a posterior object
|
||||
post = Posterior(woodbury_inv=Kmmi-P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=L)
|
||||
|
||||
return post, log_marginal, grad_dict
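# Added annotation (a restatement of the code above, not from the upstream file):
# the collapsed DTC log marginal computed above is
#
#   log p(Y) = -0.5*N*D*log(2*pi) + 0.5*N*D*log(beta) - D*sum(log(diag(LA)))
#              - 0.5*beta*sum(Y**2) + 0.5*sum(b**2)
#
# where LA = chol(I + beta*Li*Knm.T*Knm*Li.T) and b is the back-substituted
# projection of the data onto the inducing basis.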
|
||||
|
||||
class vDTC(object):
|
||||
def __init__(self):
|
||||
self.const_jitter = 1e-6
|
||||
|
||||
def inference(self, kern, X, X_variance, Z, likelihood, Y, Y_metadata):
|
||||
assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."
|
||||
|
||||
num_inducing, _ = Z.shape
|
||||
num_data, output_dim = Y.shape
|
||||
|
||||
#make sure the noise is not hetero
|
||||
beta = 1./likelihood.gaussian_variance(Y_metadata)
|
||||
if beta.size > 1:
|
||||
raise NotImplementedError, "no hetero noise with this implementation of DTC"
|
||||
|
||||
Kmm = kern.K(Z)
|
||||
Knn = kern.Kdiag(X)
|
||||
Knm = kern.K(X, Z)
|
||||
U = Knm
|
||||
Uy = np.dot(U.T,Y)
|
||||
|
||||
#factor Kmm
|
||||
Kmmi, L, Li, _ = pdinv(Kmm)
|
||||
|
||||
# Compute A
|
||||
LiUTbeta = np.dot(Li, U.T)*np.sqrt(beta)
|
||||
A_ = tdot(LiUTbeta)
|
||||
trace_term = -0.5*(np.sum(Knn)*beta - np.trace(A_))
|
||||
A = A_ + np.eye(num_inducing)
|
||||
|
||||
# factor A
|
||||
LA = jitchol(A)
|
||||
|
||||
# back substitute to get b, P, v
|
||||
tmp, _ = dtrtrs(L, Uy, lower=1)
|
||||
b, _ = dtrtrs(LA, tmp*beta, lower=1)
|
||||
tmp, _ = dtrtrs(LA, b, lower=1, trans=1)
|
||||
v, _ = dtrtrs(L, tmp, lower=1, trans=1)
|
||||
tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
|
||||
P = tdot(tmp.T)
|
||||
stop
|
||||
|
||||
#compute log marginal
|
||||
log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \
|
||||
-np.sum(np.log(np.diag(LA)))*output_dim + \
|
||||
0.5*num_data*output_dim*np.log(beta) + \
|
||||
-0.5*beta*np.sum(np.square(Y)) + \
|
||||
0.5*np.sum(np.square(b)) + \
|
||||
trace_term
|
||||
|
||||
# Compute dL_dKmm
|
||||
vvT_P = tdot(v.reshape(-1,1)) + P
|
||||
LAL = Li.T.dot(A).dot(Li)
|
||||
dL_dK = Kmmi - 0.5*(vvT_P + LAL)
|
||||
|
||||
# Compute dL_dU
|
||||
vY = np.dot(v.reshape(-1,1),Y.T)
|
||||
#dL_dU = vY - np.dot(vvT_P, U.T)
|
||||
dL_dU = vY - np.dot(vvT_P - Kmmi, U.T)
|
||||
dL_dU *= beta
|
||||
|
||||
#compute dL_dR
|
||||
Uv = np.dot(U, v)
|
||||
dL_dR = 0.5*(np.sum(U*np.dot(U,P), 1) - 1./beta + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1) + np.sum(np.square(Uv), 1) )*beta**2
|
||||
dL_dR -=beta*trace_term/num_data
|
||||
|
||||
dL_dthetaL = likelihood.exact_inference_gradients(dL_dR)
|
||||
grad_dict = {'dL_dKmm': dL_dK, 'dL_dKdiag':np.zeros_like(Knn) + -0.5*beta, 'dL_dKnm':dL_dU.T, 'dL_dthetaL':dL_dthetaL}
|
||||
|
||||
#construct a posterior object
|
||||
post = Posterior(woodbury_inv=Kmmi-P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=L)
|
||||
|
||||
|
||||
return post, log_marginal, grad_dict
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,59 @@
|
|||
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from posterior import Posterior
|
||||
from ...util.linalg import pdinv, dpotrs, tdot
|
||||
from ...util import diag
|
||||
import numpy as np
|
||||
from . import LatentFunctionInference
|
||||
log_2_pi = np.log(2*np.pi)
|
||||
|
||||
|
||||
class ExactGaussianInference(LatentFunctionInference):
|
||||
"""
|
||||
An object for inference when the likelihood is Gaussian.
|
||||
|
||||
The function self.inference returns a Posterior object, which summarizes
|
||||
the posterior.
|
||||
|
||||
For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.
|
||||
|
||||
"""
|
||||
def __init__(self):
|
||||
pass#self._YYTfactor_cache = caching.cache()
|
||||
|
||||
def get_YYTfactor(self, Y):
|
||||
"""
|
||||
find a matrix L which satisfies LL^T = YY^T.
|
||||
|
||||
Note that L may have fewer columns than Y, else L=Y.
|
||||
"""
|
||||
N, D = Y.shape
|
||||
if (N>D):
|
||||
return Y
|
||||
else:
|
||||
#if Y in self.cache, return self.Cache[Y], else store Y in cache and return L.
|
||||
#print "WARNING: N>D of Y, we need caching of L, such that L*L^T = Y, returning Y still!"
|
||||
return Y
|
||||
|
||||
def inference(self, kern, X, likelihood, Y, Y_metadata=None):
|
||||
"""
|
||||
Returns a Posterior class containing essential quantities of the posterior
|
||||
"""
|
||||
YYT_factor = self.get_YYTfactor(Y)
|
||||
|
||||
K = kern.K(X)
|
||||
|
||||
Ky = K.copy()
|
||||
diag.add(Ky, likelihood.gaussian_variance(Y_metadata))
|
||||
Wi, LW, LWi, W_logdet = pdinv(Ky)
|
||||
|
||||
alpha, _ = dpotrs(LW, YYT_factor, lower=1)
|
||||
|
||||
log_marginal = 0.5*(-Y.size * log_2_pi - Y.shape[1] * W_logdet - np.sum(alpha * YYT_factor))
|
||||
|
||||
dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi)
|
||||
|
||||
dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK),Y_metadata)
|
||||
|
||||
return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}
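# Added annotation (restating the computation above, not from the upstream file):
# with Ky = K + sigma^2*I and alpha = Ky^{-1} Y, the quantities returned are
#
#   log p(Y|X) = -0.5*( N*D*log(2*pi) + D*log|Ky| + trace(Y' Ky^{-1} Y) )
#   dL/dK      =  0.5*( alpha*alpha' - D*Ky^{-1} )
#
# i.e. the standard Gaussian-process marginal likelihood and its gradient.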
|
||||
|
|
@ -0,0 +1,122 @@
|
|||
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
import numpy as np
|
||||
from ...util.linalg import pdinv,jitchol,DSYR,tdot,dtrtrs, dpotrs
|
||||
from posterior import Posterior
|
||||
from . import LatentFunctionInference
|
||||
log_2_pi = np.log(2*np.pi)
|
||||
|
||||
class EP(LatentFunctionInference):
|
||||
def __init__(self, epsilon=1e-6, eta=1., delta=1.):
|
||||
"""
|
||||
The expectation-propagation algorithm.
|
||||
For nomenclature see Rasmussen & Williams 2006.
|
||||
|
||||
:param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
|
||||
:type epsilon: float
|
||||
:param eta: parameter for fractional EP updates.
|
||||
:type eta: float64
|
||||
:param delta: damping EP updates factor.
|
||||
:type delta: float64
|
||||
"""
|
||||
self.epsilon, self.eta, self.delta = epsilon, eta, delta
|
||||
self.reset()
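# Added usage sketch (an assumption, not from the upstream file): EP is typically
# used for GP classification, e.g.
#
#   m = GPy.core.GP(X, Y, kernel=GPy.kern.RBF(1),
#                   likelihood=GPy.likelihoods.Bernoulli(),
#                   inference_method=EP(epsilon=1e-6, eta=1., delta=1.))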
|
||||
|
||||
def reset(self):
|
||||
self.old_mutilde, self.old_vtilde = None, None
|
||||
self._ep_approximation = None
|
||||
|
||||
def on_optimization_start(self):
|
||||
self._ep_approximation = None
|
||||
|
||||
def on_optimization_end(self):
|
||||
# TODO: update approximation in the end as well? Maybe even with a switch?
|
||||
pass
|
||||
|
||||
def inference(self, kern, X, likelihood, Y, Y_metadata=None, Z=None):
|
||||
num_data, output_dim = Y.shape
|
||||
assert output_dim ==1, "ep in 1D only (for now!)"
|
||||
|
||||
K = kern.K(X)
|
||||
|
||||
if self._ep_approximation is None:
|
||||
mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata)
|
||||
else:
|
||||
mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation
|
||||
|
||||
Wi, LW, LWi, W_logdet = pdinv(K + np.diag(1./tau_tilde))
|
||||
|
||||
alpha, _ = dpotrs(LW, mu_tilde, lower=1)
|
||||
|
||||
log_marginal = 0.5*(-num_data * log_2_pi - W_logdet - np.sum(alpha * mu_tilde)) # TODO: add log Z_hat??
|
||||
|
||||
dL_dK = 0.5 * (tdot(alpha[:,None]) - Wi)
|
||||
|
||||
dL_dthetaL = np.zeros(likelihood.size)#TODO: derivatives of the likelihood parameters
|
||||
|
||||
return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}
|
||||
|
||||
def expectation_propagation(self, K, Y, likelihood, Y_metadata):
|
||||
|
||||
num_data, data_dim = Y.shape
|
||||
assert data_dim == 1, "This EP methods only works for 1D outputs"
|
||||
|
||||
|
||||
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
|
||||
mu = np.zeros(num_data)
|
||||
Sigma = K.copy()
|
||||
|
||||
#Initial values - Marginal moments
|
||||
Z_hat = np.empty(num_data,dtype=np.float64)
|
||||
mu_hat = np.empty(num_data,dtype=np.float64)
|
||||
sigma2_hat = np.empty(num_data,dtype=np.float64)
|
||||
|
||||
#initial values - Gaussian factors
|
||||
if self.old_mutilde is None:
|
||||
tau_tilde, mu_tilde, v_tilde = np.zeros((3, num_data))
|
||||
else:
|
||||
assert old_mutilde.size == num_data, "data size mis-match: did you change the data? try resetting!"
|
||||
mu_tilde, v_tilde = self.old_mutilde, self.old_vtilde
|
||||
tau_tilde = v_tilde/mu_tilde
|
||||
|
||||
#Approximation
|
||||
tau_diff = self.epsilon + 1.
|
||||
v_diff = self.epsilon + 1.
|
||||
iterations = 0
|
||||
while (tau_diff > self.epsilon) or (v_diff > self.epsilon):
|
||||
update_order = np.random.permutation(num_data)
|
||||
for i in update_order:
|
||||
#Cavity distribution parameters
|
||||
tau_cav = 1./Sigma[i,i] - self.eta*tau_tilde[i]
|
||||
v_cav = mu[i]/Sigma[i,i] - self.eta*v_tilde[i]
|
||||
#Marginal moments
|
||||
Z_hat[i], mu_hat[i], sigma2_hat[i] = likelihood.moments_match_ep(Y[i], tau_cav, v_cav)#, Y_metadata=None)#=(None if Y_metadata is None else Y_metadata[i]))
|
||||
#Site parameters update
|
||||
delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
|
||||
delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
|
||||
tau_tilde[i] += delta_tau
|
||||
v_tilde[i] += delta_v
|
||||
#Posterior distribution parameters update
|
||||
DSYR(Sigma, Sigma[:,i].copy(), -delta_tau/(1.+ delta_tau*Sigma[i,i]))
|
||||
mu = np.dot(Sigma, v_tilde)
|
||||
|
||||
#(re) compute Sigma and mu using a full Cholesky decomposition
|
||||
tau_tilde_root = np.sqrt(tau_tilde)
|
||||
Sroot_tilde_K = tau_tilde_root[:,None] * K
|
||||
B = np.eye(num_data) + Sroot_tilde_K * tau_tilde_root[None,:]
|
||||
L = jitchol(B)
|
||||
V, _ = dtrtrs(L, Sroot_tilde_K, lower=1)
|
||||
Sigma = K - np.dot(V.T,V)
|
||||
mu = np.dot(Sigma,v_tilde)
|
||||
|
||||
#monitor convergence
|
||||
if iterations>0:
|
||||
tau_diff = np.mean(np.square(tau_tilde-tau_tilde_old))
|
||||
v_diff = np.mean(np.square(v_tilde-v_tilde_old))
|
||||
tau_tilde_old = tau_tilde.copy()
|
||||
v_tilde_old = v_tilde.copy()
|
||||
|
||||
iterations += 1
|
||||
|
||||
mu_tilde = v_tilde/tau_tilde
|
||||
return mu, Sigma, mu_tilde, tau_tilde, Z_hat
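# Added annotation (restating the loop above, not from the upstream file): each
# site update uses the cavity/site equations (Rasmussen & Williams notation)
#
#   tau_cav_i = 1/Sigma_ii - eta*tau_tilde_i
#   v_cav_i   = mu_i/Sigma_ii - eta*v_tilde_i
#   delta_tau = (delta/eta)*(1/sigma2_hat_i - 1/Sigma_ii)
#   delta_v   = (delta/eta)*(mu_hat_i/sigma2_hat_i - mu_i/Sigma_ii)
#
# with eta giving fractional (power) EP updates and delta damping them.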
|
||||
|
|
@ -0,0 +1,351 @@
|
|||
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from ...util import diag
|
||||
from ...util.linalg import mdot, jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify, DSYR
|
||||
from ...core.parameterization.variational import VariationalPosterior
|
||||
from . import LatentFunctionInference
|
||||
from posterior import Posterior
|
||||
log_2_pi = np.log(2*np.pi)
|
||||
|
||||
class EPDTC(LatentFunctionInference):
|
||||
const_jitter = 1e-6
|
||||
def __init__(self, epsilon=1e-6, eta=1., delta=1., limit=1):
|
||||
from ...util.caching import Cacher
|
||||
self.limit = limit
|
||||
self.get_trYYT = Cacher(self._get_trYYT, limit)
|
||||
self.get_YYTfactor = Cacher(self._get_YYTfactor, limit)
|
||||
|
||||
self.epsilon, self.eta, self.delta = epsilon, eta, delta
|
||||
self.reset()
|
||||
|
||||
def set_limit(self, limit):
|
||||
self.get_trYYT.limit = limit
|
||||
self.get_YYTfactor.limit = limit
|
||||
|
||||
def on_optimization_start(self):
|
||||
self._ep_approximation = None
|
||||
|
||||
def on_optimization_end(self):
|
||||
# TODO: update approximation in the end as well? Maybe even with a switch?
|
||||
pass
|
||||
|
||||
def _get_trYYT(self, Y):
|
||||
return np.sum(np.square(Y))
|
||||
|
||||
def __getstate__(self):
|
||||
# has to be overridden, as Cacher objects cannot be pickled.
|
||||
return self.limit
|
||||
|
||||
def __setstate__(self, state):
|
||||
# has to be overridden, as Cacher objects cannot be pickled.
|
||||
self.limit = state
|
||||
from ...util.caching import Cacher
|
||||
self.get_trYYT = Cacher(self._get_trYYT, self.limit)
|
||||
self.get_YYTfactor = Cacher(self._get_YYTfactor, self.limit)
|
||||
|
||||
def _get_YYTfactor(self, Y):
|
||||
"""
|
||||
find a matrix L which satisfies LLT = YYT.
|
||||
|
||||
Note that L may have fewer columns than Y.
|
||||
"""
|
||||
N, D = Y.shape
|
||||
if (N>=D):
|
||||
return Y
|
||||
else:
|
||||
return jitchol(tdot(Y))
|
||||
|
||||
def get_VVTfactor(self, Y, prec):
|
||||
return Y * prec # TODO: cache this, and make it efficient
|
||||
|
||||
def reset(self):
|
||||
self.old_mutilde, self.old_vtilde = None, None
|
||||
self._ep_approximation = None
|
||||
|
||||
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
|
||||
num_data, output_dim = Y.shape
|
||||
assert output_dim ==1, "ep in 1D only (for now!)"
|
||||
|
||||
Kmm = kern.K(Z)
|
||||
Kmn = kern.K(Z,X)
|
||||
|
||||
if self._ep_approximation is None:
|
||||
mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata)
|
||||
else:
|
||||
mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation
|
||||
|
||||
|
||||
if isinstance(X, VariationalPosterior):
|
||||
uncertain_inputs = True
|
||||
psi0 = kern.psi0(Z, X)
|
||||
psi1 = Kmn.T#kern.psi1(Z, X)
|
||||
psi2 = kern.psi2(Z, X)
|
||||
else:
|
||||
uncertain_inputs = False
|
||||
psi0 = kern.Kdiag(X)
|
||||
psi1 = Kmn.T#kern.K(X, Z)
|
||||
psi2 = None
|
||||
|
||||
#see whether we're using variational uncertain inputs
|
||||
|
||||
_, output_dim = Y.shape
|
||||
|
||||
#see whether we've got a different noise variance for each datum
|
||||
#beta = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6)
|
||||
beta = tau_tilde
|
||||
VVT_factor = beta[:,None]*mu_tilde[:,None]
|
||||
trYYT = self.get_trYYT(mu_tilde[:,None])
|
||||
|
||||
# do the inference:
|
||||
het_noise = beta.size > 1
|
||||
num_inducing = Z.shape[0]
|
||||
num_data = Y.shape[0]
|
||||
# kernel computations, using BGPLVM notation
|
||||
|
||||
Kmm = kern.K(Z).copy()
|
||||
diag.add(Kmm, self.const_jitter)
|
||||
Lm = jitchol(Kmm)
|
||||
|
||||
# The rather complex computations of A
|
||||
if uncertain_inputs:
|
||||
if het_noise:
|
||||
psi2_beta = psi2 * (beta.flatten().reshape(num_data, 1, 1)).sum(0)
|
||||
else:
|
||||
psi2_beta = psi2.sum(0) * beta
|
||||
LmInv = dtrtri(Lm)
|
||||
A = LmInv.dot(psi2_beta.dot(LmInv.T))
|
||||
else:
|
||||
if het_noise:
|
||||
tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
|
||||
else:
|
||||
tmp = psi1 * (np.sqrt(beta))
|
||||
tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
|
||||
A = tdot(tmp) #print A.sum()
|
||||
|
||||
# factor B
|
||||
B = np.eye(num_inducing) + A
|
||||
LB = jitchol(B)
|
||||
psi1Vf = np.dot(psi1.T, VVT_factor)
|
||||
# back substitute C into psi1Vf
|
||||
tmp, _ = dtrtrs(Lm, psi1Vf, lower=1, trans=0)
|
||||
_LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
|
||||
tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
|
||||
Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
|
||||
|
||||
# data fit and derivative of L w.r.t. Kmm
|
||||
delit = tdot(_LBi_Lmi_psi1Vf)
|
||||
data_fit = np.trace(delit)
|
||||
DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
|
||||
delit = -0.5 * DBi_plus_BiPBi
|
||||
delit += -0.5 * B * output_dim
|
||||
delit += output_dim * np.eye(num_inducing)
|
||||
# Compute dL_dKmm
|
||||
dL_dKmm = backsub_both_sides(Lm, delit)
|
||||
|
||||
# derivatives of L w.r.t. psi
|
||||
dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm,
|
||||
VVT_factor, Cpsi1Vf, DBi_plus_BiPBi,
|
||||
psi1, het_noise, uncertain_inputs)
|
||||
|
||||
# log marginal likelihood
|
||||
log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise,
|
||||
psi0, A, LB, trYYT, data_fit, VVT_factor)
|
||||
|
||||
#put the gradients in the right places
|
||||
dL_dR = _compute_dL_dR(likelihood,
|
||||
het_noise, uncertain_inputs, LB,
|
||||
_LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A,
|
||||
psi0, psi1, beta,
|
||||
data_fit, num_data, output_dim, trYYT, mu_tilde[:,None])
|
||||
|
||||
dL_dthetaL = 0#likelihood.exact_inference_gradients(dL_dR,Y_metadata)
|
||||
|
||||
if uncertain_inputs:
|
||||
grad_dict = {'dL_dKmm': dL_dKmm,
|
||||
'dL_dpsi0':dL_dpsi0,
|
||||
'dL_dpsi1':dL_dpsi1,
|
||||
'dL_dpsi2':dL_dpsi2,
|
||||
'dL_dthetaL':dL_dthetaL}
|
||||
else:
|
||||
grad_dict = {'dL_dKmm': dL_dKmm,
|
||||
'dL_dKdiag':dL_dpsi0,
|
||||
'dL_dKnm':dL_dpsi1,
|
||||
'dL_dthetaL':dL_dthetaL}
|
||||
|
||||
#get sufficient things for posterior prediction
|
||||
#TODO: do we really want to do this in the loop?
|
||||
if VVT_factor.shape[1] == Y.shape[1]:
|
||||
woodbury_vector = Cpsi1Vf # == Cpsi1V
|
||||
else:
|
||||
print 'foobar'
|
||||
psi1V = np.dot(mu_tilde[:,None].T*beta, psi1).T
|
||||
tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
|
||||
tmp, _ = dpotrs(LB, tmp, lower=1)
|
||||
woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
|
||||
Bi, _ = dpotri(LB, lower=1)
|
||||
symmetrify(Bi)
|
||||
Bi = -dpotri(LB, lower=1)[0]
|
||||
diag.add(Bi, 1)
|
||||
|
||||
woodbury_inv = backsub_both_sides(Lm, Bi)
|
||||
|
||||
#construct a posterior object
|
||||
post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
|
||||
return post, log_marginal, grad_dict
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def expectation_propagation(self, Kmm, Kmn, Y, likelihood, Y_metadata):
|
||||
|
||||
num_data, data_dim = Y.shape
|
||||
assert data_dim == 1, "This EP methods only works for 1D outputs"
|
||||
|
||||
KmnKnm = np.dot(Kmn,Kmn.T)
|
||||
Lm = jitchol(Kmm)
|
||||
Lmi = dtrtrs(Lm,np.eye(Lm.shape[0]))[0] #chol_inv(Lm)
|
||||
Kmmi = np.dot(Lmi.T,Lmi)
|
||||
KmmiKmn = np.dot(Kmmi,Kmn)
|
||||
Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
|
||||
LLT0 = Kmm.copy()
|
||||
|
||||
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
|
||||
mu = np.zeros(num_data)
|
||||
LLT = Kmm.copy() #Sigma = K.copy()
|
||||
Sigma_diag = Qnn_diag.copy()
|
||||
|
||||
#Initial values - Marginal moments
|
||||
Z_hat = np.empty(num_data,dtype=np.float64)
|
||||
mu_hat = np.empty(num_data,dtype=np.float64)
|
||||
sigma2_hat = np.empty(num_data,dtype=np.float64)
|
||||
|
||||
#initial values - Gaussian factors
|
||||
if self.old_mutilde is None:
|
||||
tau_tilde, mu_tilde, v_tilde = np.zeros((3, num_data))
|
||||
else:
|
||||
assert old_mutilde.size == num_data, "data size mis-match: did you change the data? try resetting!"
|
||||
mu_tilde, v_tilde = self.old_mutilde, self.old_vtilde
|
||||
tau_tilde = v_tilde/mu_tilde
|
||||
|
||||
#Approximation
|
||||
tau_diff = self.epsilon + 1.
|
||||
v_diff = self.epsilon + 1.
|
||||
iterations = 0
|
||||
while (tau_diff > self.epsilon) or (v_diff > self.epsilon):
|
||||
update_order = np.random.permutation(num_data)
|
||||
for i in update_order:
|
||||
#Cavity distribution parameters
|
||||
tau_cav = 1./Sigma_diag[i] - self.eta*tau_tilde[i]
|
||||
v_cav = mu[i]/Sigma_diag[i] - self.eta*v_tilde[i]
|
||||
#Marginal moments
|
||||
Z_hat[i], mu_hat[i], sigma2_hat[i] = likelihood.moments_match_ep(Y[i], tau_cav, v_cav)#, Y_metadata=None)#=(None if Y_metadata is None else Y_metadata[i]))
|
||||
#Site parameters update
|
||||
delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
|
||||
delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
|
||||
tau_tilde[i] += delta_tau
|
||||
v_tilde[i] += delta_v
|
||||
#Posterior distribution parameters update
|
||||
|
||||
#DSYR(Sigma, Sigma[:,i].copy(), -delta_tau/(1.+ delta_tau*Sigma[i,i]))
|
||||
DSYR(LLT,Kmn[:,i].copy(),delta_tau)
|
||||
L = jitchol(LLT)
|
||||
|
||||
V,info = dtrtrs(L,Kmn,lower=1)
|
||||
Sigma_diag = np.sum(V*V,-2)
|
||||
si = np.sum(V.T*V[:,i],-1)
|
||||
mu += (delta_v-delta_tau*mu[i])*si
|
||||
#mu = np.dot(Sigma, v_tilde)
|
||||
|
||||
#(re) compute Sigma and mu using a full Cholesky decomposition
|
||||
LLT = LLT0 + np.dot(Kmn*tau_tilde[None,:],Kmn.T)
|
||||
L = jitchol(LLT)
|
||||
V,info = dtrtrs(L,Kmn,lower=1)
|
||||
V2,info = dtrtrs(L.T,V,lower=0)
|
||||
#Sigma_diag = np.sum(V*V,-2)
|
||||
#Knmv_tilde = np.dot(Kmn,v_tilde)
|
||||
#mu = np.dot(V2.T,Knmv_tilde)
|
||||
Sigma = np.dot(V2.T,V2)
|
||||
mu = np.dot(Sigma,v_tilde)
|
||||
|
||||
#monitor convergence
|
||||
if iterations>0:
|
||||
tau_diff = np.mean(np.square(tau_tilde-tau_tilde_old))
|
||||
v_diff = np.mean(np.square(v_tilde-v_tilde_old))
|
||||
tau_tilde_old = tau_tilde.copy()
|
||||
v_tilde_old = v_tilde.copy()
|
||||
|
||||
tau_diff = 0
|
||||
v_diff = 0
|
||||
iterations += 1
|
||||
|
||||
mu_tilde = v_tilde/tau_tilde
|
||||
return mu, Sigma, mu_tilde, tau_tilde, Z_hat
|
||||
|
||||
def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, psi1, het_noise, uncertain_inputs):
|
||||
dL_dpsi0 = -0.5 * output_dim * (beta[:,None] * np.ones([num_data, 1])).flatten()
|
||||
dL_dpsi1 = np.dot(VVT_factor, Cpsi1Vf.T)
|
||||
dL_dpsi2_beta = 0.5 * backsub_both_sides(Lm, output_dim * np.eye(num_inducing) - DBi_plus_BiPBi)
|
||||
if het_noise:
|
||||
if uncertain_inputs:
|
||||
dL_dpsi2 = beta[:, None, None] * dL_dpsi2_beta[None, :, :]
|
||||
else:
|
||||
dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * beta.reshape(num_data, 1)).T).T
|
||||
dL_dpsi2 = None
|
||||
else:
|
||||
dL_dpsi2 = beta * dL_dpsi2_beta
|
||||
if uncertain_inputs:
|
||||
# repeat for each of the N psi_2 matrices
|
||||
dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], num_data, axis=0)
|
||||
else:
|
||||
# subsume back into psi1 (==Kmn)
|
||||
dL_dpsi1 += 2.*np.dot(psi1, dL_dpsi2)
|
||||
dL_dpsi2 = None
|
||||
|
||||
return dL_dpsi0, dL_dpsi1, dL_dpsi2
|
||||
|
||||
|
||||
def _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, psi0, psi1, beta, data_fit, num_data, output_dim, trYYT, Y):
|
||||
# the partial derivative vector for the likelihood
|
||||
if likelihood.size == 0:
|
||||
# save computation here.
|
||||
dL_dR = None
|
||||
elif het_noise:
|
||||
if uncertain_inputs:
|
||||
raise NotImplementedError, "heteroscedastic derivatives with uncertain inputs not implemented"
|
||||
else:
|
||||
#from ...util.linalg import chol_inv
|
||||
#LBi = chol_inv(LB)
|
||||
LBi, _ = dtrtrs(LB,np.eye(LB.shape[0]))
|
||||
|
||||
Lmi_psi1, nil = dtrtrs(Lm, psi1.T, lower=1, trans=0)
|
||||
_LBi_Lmi_psi1, _ = dtrtrs(LB, Lmi_psi1, lower=1, trans=0)
|
||||
|
||||
dL_dR = -0.5 * beta + 0.5 * (beta*Y)**2
|
||||
dL_dR += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * beta**2
|
||||
|
||||
dL_dR += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*beta**2
|
||||
|
||||
dL_dR += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * Y * beta**2
|
||||
dL_dR += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * beta**2
|
||||
else:
|
||||
# likelihood is not heteroscedastic
|
||||
dL_dR = -0.5 * num_data * output_dim * beta + 0.5 * trYYT * beta ** 2
|
||||
dL_dR += 0.5 * output_dim * (psi0.sum() * beta ** 2 - np.trace(A) * beta)
|
||||
dL_dR += beta * (0.5 * np.sum(A * DBi_plus_BiPBi) - data_fit)
|
||||
return dL_dR
|
||||
|
||||
def _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, psi0, A, LB, trYYT, data_fit,Y):
|
||||
#compute log marginal likelihood
|
||||
if het_noise:
|
||||
lik_1 = -0.5 * num_data * output_dim * np.log(2. * np.pi) + 0.5 * np.sum(np.log(beta)) - 0.5 * np.sum(beta * np.square(Y).sum(axis=-1))
|
||||
lik_2 = -0.5 * output_dim * (np.sum(beta.flatten() * psi0) - np.trace(A))
|
||||
else:
|
||||
lik_1 = -0.5 * num_data * output_dim * (np.log(2. * np.pi) - np.log(beta)) - 0.5 * beta * trYYT
|
||||
lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A))
|
||||
lik_3 = -output_dim * (np.sum(np.log(np.diag(LB))))
|
||||
lik_4 = 0.5 * data_fit
|
||||
log_marginal = lik_1 + lik_2 + lik_3 + lik_4
|
||||
return log_marginal
|
||||
GPy/inference/latent_function_inference/fitc.py  (new file, 89 lines)
|
|
@ -0,0 +1,89 @@
|
|||
# Copyright (c) 2012, James Hensman
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from posterior import Posterior
|
||||
from ...util.linalg import jitchol, tdot, dtrtrs, dpotri, pdinv
|
||||
from ...util import diag
|
||||
import numpy as np
|
||||
from . import LatentFunctionInference
|
||||
log_2_pi = np.log(2*np.pi)
|
||||
|
||||
class FITC(LatentFunctionInference):
|
||||
"""
|
||||
An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
|
||||
|
||||
The function self.inference returns a Posterior object, which summarizes
|
||||
the posterior.
|
||||
|
||||
"""
|
||||
const_jitter = 1e-6
|
||||
|
||||
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
|
||||
|
||||
num_inducing, _ = Z.shape
|
||||
num_data, output_dim = Y.shape
|
||||
|
||||
#make sure the noise is not hetero
|
||||
sigma_n = likelihood.gaussian_variance(Y_metadata)
|
||||
if sigma_n.size >1:
|
||||
raise NotImplementedError, "no hetero noise with this implementation of FITC"
|
||||
|
||||
Kmm = kern.K(Z)
|
||||
Knn = kern.Kdiag(X)
|
||||
Knm = kern.K(X, Z)
|
||||
U = Knm
|
||||
|
||||
#factor Kmm
|
||||
diag.add(Kmm, self.const_jitter)
|
||||
Kmmi, L, Li, _ = pdinv(Kmm)
|
||||
|
||||
#compute beta_star, the effective noise precision
|
||||
LiUT = np.dot(Li, U.T)
|
||||
sigma_star = Knn + sigma_n - np.sum(np.square(LiUT),0)
|
||||
beta_star = 1./sigma_star
|
||||
|
||||
# Compute and factor A
|
||||
A = tdot(LiUT*np.sqrt(beta_star)) + np.eye(num_inducing)
|
||||
LA = jitchol(A)
|
||||
|
||||
# back substitute to get b, P, v
|
||||
URiy = np.dot(U.T*beta_star,Y)
|
||||
tmp, _ = dtrtrs(L, URiy, lower=1)
|
||||
b, _ = dtrtrs(LA, tmp, lower=1)
|
||||
tmp, _ = dtrtrs(LA, b, lower=1, trans=1)
|
||||
v, _ = dtrtrs(L, tmp, lower=1, trans=1)
|
||||
tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
|
||||
P = tdot(tmp.T)
|
||||
|
||||
#compute log marginal
|
||||
log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \
|
||||
-np.sum(np.log(np.diag(LA)))*output_dim + \
|
||||
0.5*output_dim*np.sum(np.log(beta_star)) + \
|
||||
-0.5*np.sum(np.square(Y.T*np.sqrt(beta_star))) + \
|
||||
0.5*np.sum(np.square(b))
|
||||
#compute dL_dR
|
||||
Uv = np.dot(U, v)
|
||||
dL_dR = 0.5*(np.sum(U*np.dot(U,P), 1) - 1./beta_star + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1) + np.sum(np.square(Uv), 1))*beta_star**2
|
||||
|
||||
|
||||
# Compute dL_dKmm
|
||||
vvT_P = tdot(v.reshape(-1,1)) + P
|
||||
dL_dK = 0.5*(Kmmi - vvT_P)
|
||||
KiU = np.dot(Kmmi, U.T)
|
||||
dL_dK += np.dot(KiU*dL_dR, KiU.T)
|
||||
|
||||
# Compute dL_dU
|
||||
vY = np.dot(v.reshape(-1,1),Y.T)
|
||||
dL_dU = vY - np.dot(vvT_P, U.T)
|
||||
dL_dU *= beta_star
|
||||
dL_dU -= 2.*KiU*dL_dR
|
||||
|
||||
dL_dthetaL = likelihood.exact_inference_gradients(dL_dR)
|
||||
grad_dict = {'dL_dKmm': dL_dK, 'dL_dKdiag':dL_dR, 'dL_dKnm':dL_dU.T, 'dL_dthetaL':dL_dthetaL}
|
||||
|
||||
#construct a posterior object
|
||||
post = Posterior(woodbury_inv=Kmmi-P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=L)
|
||||
|
||||
return post, log_marginal, grad_dict
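# Added annotation (restating the code above, not from the upstream file): FITC
# approximates the full covariance by K ~ Q + diag(K - Q) + sigma^2*I with
# Q = Knm Kmm^{-1} Kmn, so each data point gets an effective noise precision
#
#   beta_star_i = 1 / (Knn_ii - Q_ii + sigma^2)
#
# which is exactly the sigma_star/beta_star computed in inference() above.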
|
||||
|
||||
|
||||
GPy/inference/latent_function_inference/inferenceX.py  (new file, 162 lines)
|
|
@ -0,0 +1,162 @@
|
|||
# Copyright (c) 2014, Zhenwen Dai
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from ...core import Model
|
||||
from ...core.parameterization import variational
|
||||
|
||||
def infer_newX(model, Y_new, optimize=True, init='L2'):
|
||||
"""
|
||||
Infer the distribution of X for the new observed data *Y_new*.
|
||||
|
||||
:param model: the GPy model used in inference
|
||||
:type model: GPy.core.Model
|
||||
:param Y_new: the new observed data for inference
|
||||
:type Y_new: numpy.ndarray
|
||||
:param optimize: whether to optimize the location of new X (True by default)
|
||||
:type optimize: boolean
|
||||
:return: a tuple containing the estimated posterior distribution of X and the model that optimize X
|
||||
:rtype: (GPy.core.parameterization.variational.VariationalPosterior, GPy.core.Model)
|
||||
"""
|
||||
infr_m = InferenceX(model, Y_new, init=init)
|
||||
|
||||
if optimize:
|
||||
infr_m.optimize()
|
||||
|
||||
return infr_m.X, infr_m
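# Added usage sketch (an assumed call pattern, not from the upstream file):
#
#   m = GPy.models.BayesianGPLVM(Y_train, input_dim=2)
#   m.optimize()
#   X_new, inference_model = infer_newX(m, Y_test)
#
# X_new is the (variational) posterior over the latent locations of Y_test.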
|
||||
|
||||
class InferenceX(Model):
|
||||
"""
|
||||
The class for inference of new X with given new Y. (do_test_latent)
|
||||
|
||||
:param model: the GPy model used in inference
|
||||
:type model: GPy.core.Model
|
||||
:param Y: the new observed data for inference
|
||||
:type Y: numpy.ndarray
|
||||
"""
|
||||
def __init__(self, model, Y, name='inferenceX', init='L2'):
|
||||
if np.isnan(Y).any() or getattr(model, 'missing_data', False):
|
||||
assert Y.shape[0]==1, "The current implementation of inference X only supports one data point at a time with missing data!"
|
||||
self.missing_data = True
|
||||
self.valid_dim = np.logical_not(np.isnan(Y[0]))
|
||||
self.ninan = getattr(model, 'ninan', None)
|
||||
else:
|
||||
self.missing_data = False
|
||||
super(InferenceX, self).__init__(name)
|
||||
self.likelihood = model.likelihood.copy()
|
||||
self.kern = model.kern.copy()
|
||||
if model.kern.useGPU:
|
||||
from ...models import SSGPLVM
|
||||
if isinstance(model, SSGPLVM):
|
||||
self.kern.GPU_SSRBF(True)
|
||||
else:
|
||||
self.kern.GPU(True)
|
||||
from copy import deepcopy
|
||||
self.posterior = deepcopy(model.posterior)
|
||||
if hasattr(model, 'variational_prior'):
|
||||
self.uncertain_input = True
|
||||
self.variational_prior = model.variational_prior.copy()
|
||||
else:
|
||||
self.uncertain_input = False
|
||||
if hasattr(model, 'inducing_inputs'):
|
||||
self.sparse_gp = True
|
||||
self.Z = model.Z.copy()
|
||||
else:
|
||||
self.sparse_gp = False
|
||||
self.uncertain_input = False
|
||||
self.Z = model.X.copy()
|
||||
self.Y = Y
|
||||
self.X = self._init_X(model, Y, init=init)
|
||||
self.compute_dL()
|
||||
|
||||
self.link_parameter(self.X)
|
||||
|
||||
def _init_X(self, model, Y_new, init='L2'):
|
||||
# Initialize the new X by finding the nearest point in Y space.
|
||||
|
||||
Y = model.Y
|
||||
if self.missing_data:
|
||||
Y = Y[:,self.valid_dim]
|
||||
Y_new = Y_new[:,self.valid_dim]
|
||||
dist = -2.*Y_new.dot(Y.T) + np.square(Y_new).sum(axis=1)[:,None]+ np.square(Y).sum(axis=1)[None,:]
|
||||
else:
|
||||
if init=='L2':
|
||||
dist = -2.*Y_new.dot(Y.T) + np.square(Y_new).sum(axis=1)[:,None]+ np.square(Y).sum(axis=1)[None,:]
|
||||
elif init=='NCC':
|
||||
dist = Y_new.dot(Y.T)
|
||||
elif init=='rand':
|
||||
dist = np.random.rand(Y_new.shape[0],Y.shape[0])
|
||||
idx = dist.argmin(axis=1)
|
||||
|
||||
from ...models import SSGPLVM
|
||||
from ...util.misc import param_to_array
|
||||
if isinstance(model, SSGPLVM):
|
||||
X = variational.SpikeAndSlabPosterior(param_to_array(model.X.mean[idx]), param_to_array(model.X.variance[idx]), param_to_array(model.X.gamma[idx]))
|
||||
if model.group_spike:
|
||||
X.gamma.fix()
|
||||
else:
|
||||
if self.uncertain_input and self.sparse_gp:
|
||||
X = variational.NormalPosterior(param_to_array(model.X.mean[idx]), param_to_array(model.X.variance[idx]))
|
||||
else:
|
||||
from ...core import Param
|
||||
X = Param('latent mean',param_to_array(model.X[idx]).copy())
|
||||
|
||||
return X
|
||||
|
||||
def compute_dL(self):
|
||||
# Common computation
|
||||
beta = 1./np.fmax(self.likelihood.variance, 1e-6)
|
||||
output_dim = self.Y.shape[-1]
|
||||
wv = self.posterior.woodbury_vector
|
||||
if self.missing_data:
|
||||
wv = wv[:,self.valid_dim]
|
||||
output_dim = self.valid_dim.sum()
|
||||
if self.ninan is not None:
|
||||
self.dL_dpsi2 = beta/2.*(self.posterior.woodbury_inv[:,:,self.valid_dim] - np.einsum('md,od->mo',wv, wv)[:, :, None]).sum(-1)
|
||||
else:
|
||||
self.dL_dpsi2 = beta/2.*(output_dim*self.posterior.woodbury_inv - np.einsum('md,od->mo',wv, wv))
|
||||
self.dL_dpsi1 = beta*np.dot(self.Y[:,self.valid_dim], wv.T)
|
||||
self.dL_dpsi0 = - beta/2.* np.ones(self.Y.shape[0])
|
||||
else:
|
||||
self.dL_dpsi2 = beta*(output_dim*self.posterior.woodbury_inv - np.einsum('md,od->mo',wv, wv))/2.
|
||||
self.dL_dpsi1 = beta*np.dot(self.Y, wv.T)
|
||||
self.dL_dpsi0 = -beta/2.*output_dim* np.ones(self.Y.shape[0])
|
||||
|
||||
def parameters_changed(self):
|
||||
if self.uncertain_input:
|
||||
psi0 = self.kern.psi0(self.Z, self.X)
|
||||
psi1 = self.kern.psi1(self.Z, self.X)
|
||||
psi2 = self.kern.psi2(self.Z, self.X)
|
||||
else:
|
||||
psi0 = self.kern.Kdiag(self.X)
|
||||
psi1 = self.kern.K(self.X, self.Z)
|
||||
psi2 = np.dot(psi1.T,psi1)
|
||||
|
||||
self._log_marginal_likelihood = (self.dL_dpsi2*psi2).sum()+(self.dL_dpsi1*psi1).sum()+(self.dL_dpsi0*psi0).sum()
|
||||
|
||||
if self.uncertain_input:
|
||||
X_grad = self.kern.gradients_qX_expectations(variational_posterior=self.X, Z=self.Z, dL_dpsi0=self.dL_dpsi0, dL_dpsi1=self.dL_dpsi1, dL_dpsi2=self.dL_dpsi2)
|
||||
self.X.set_gradients(X_grad)
|
||||
else:
|
||||
dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(psi1,self.dL_dpsi2)
|
||||
X_grad = self.kern.gradients_X_diag(self.dL_dpsi0, self.X)
|
||||
X_grad += self.kern.gradients_X(dL_dpsi1, self.X, self.Z)
|
||||
self.X.gradient = X_grad
|
||||
|
||||
if self.uncertain_input:
|
||||
from ...core.parameterization.variational import SpikeAndSlabPrior
|
||||
if isinstance(self.variational_prior, SpikeAndSlabPrior):
|
||||
# Update Log-likelihood
|
||||
KL_div = self.variational_prior.KL_divergence(self.X, N=self.Y.shape[0])
|
||||
# update for the KL divergence
|
||||
self.variational_prior.update_gradients_KL(self.X, N=self.Y.shape[0])
|
||||
else:
|
||||
# Update Log-likelihood
|
||||
KL_div = self.variational_prior.KL_divergence(self.X)
|
||||
# update for the KL divergence
|
||||
self.variational_prior.update_gradients_KL(self.X)
|
||||
self._log_marginal_likelihood += -KL_div
|
||||
|
||||
def log_likelihood(self):
|
||||
return self._log_marginal_likelihood
|
||||
|
||||
GPy/inference/latent_function_inference/laplace.py  (new file, 251 lines)
|
|
@ -0,0 +1,251 @@
|
|||
# Copyright (c) 2013, 2014 Alan Saul
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
#
|
||||
#Parts of this file were influenced by the Matlab GPML framework written by
|
||||
#Carl Edward Rasmussen & Hannes Nickisch, however all bugs are our own.
|
||||
#
|
||||
#The GPML code is released under the FreeBSD License.
|
||||
#Copyright (c) 2005-2013 Carl Edward Rasmussen & Hannes Nickisch. All rights reserved.
|
||||
#
|
||||
#The code and associated documentation is available from
|
||||
#http://gaussianprocess.org/gpml/code.
|
||||
|
||||
import numpy as np
|
||||
from ...util.linalg import mdot, jitchol, dpotrs, dtrtrs, dpotri, symmetrify, pdinv
|
||||
from posterior import Posterior
|
||||
import warnings
|
||||
def warning_on_one_line(message, category, filename, lineno, file=None, line=None):
|
||||
return ' %s:%s: %s:%s\n' % (filename, lineno, category.__name__, message)
|
||||
warnings.formatwarning = warning_on_one_line
|
||||
from scipy import optimize
|
||||
from . import LatentFunctionInference
|
||||
|
||||
class Laplace(LatentFunctionInference):
|
||||
|
||||
def __init__(self):
|
||||
"""
|
||||
Laplace Approximation
|
||||
|
||||
Find the mode \hat{f} of the unnormalised posterior, and the Hessian at
that point, using Newton-Raphson.
|
||||
|
||||
"""
|
||||
|
||||
self._mode_finding_tolerance = 1e-7
|
||||
self._mode_finding_max_iter = 60
|
||||
self.bad_fhat = False
|
||||
#Store whether it is the first run of the inference so that we can choose whether we need
|
||||
#to calculate things or reuse old variables
|
||||
self.first_run = True
|
||||
self._previous_Ki_fhat = None
|
||||
|
||||
def inference(self, kern, X, likelihood, Y, Y_metadata=None):
|
||||
"""
|
||||
Returns a Posterior class containing essential quantities of the posterior
|
||||
"""
|
||||
|
||||
# Compute K
|
||||
K = kern.K(X)
|
||||
|
||||
#Find mode
|
||||
if self.bad_fhat or self.first_run:
|
||||
Ki_f_init = np.zeros_like(Y)
|
||||
first_run = False
|
||||
else:
|
||||
Ki_f_init = self._previous_Ki_fhat
|
||||
|
||||
f_hat, Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)
|
||||
self.f_hat = f_hat
|
||||
self.Ki_fhat = Ki_fhat
|
||||
self.K = K.copy()
|
||||
#Compute hessian and other variables at mode
|
||||
log_marginal, woodbury_inv, dL_dK, dL_dthetaL = self.mode_computations(f_hat, Ki_fhat, K, Y, likelihood, kern, Y_metadata)
|
||||
|
||||
self._previous_Ki_fhat = Ki_fhat.copy()
|
||||
return Posterior(woodbury_vector=Ki_fhat, woodbury_inv=woodbury_inv, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}
|
||||
|
||||
def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None):
|
||||
"""
|
||||
Rasmussen's numerically stable mode finding
|
||||
For nomenclature see Rasmussen & Williams 2006
|
||||
Influenced by GPML (BSD) code, all errors are our own
|
||||
|
||||
:param K: Covariance matrix evaluated at locations X
|
||||
:type K: NxD matrix
|
||||
:param Y: The data
|
||||
:type Y: np.ndarray
|
||||
:param likelihood: the likelihood of the latent function value for the given data
|
||||
:type likelihood: a GPy.likelihood object
|
||||
:param Ki_f_init: the initial guess at the mode
|
||||
:type Ki_f_init: np.ndarray
|
||||
:param Y_metadata: information about the data, e.g. which likelihood to take from a multi-likelihood object
|
||||
:type Y_metadata: np.ndarray | None
|
||||
:returns: f_hat, the mode at which to make the Laplace approximation
|
||||
:rtype: np.ndarray
|
||||
"""
|
||||
|
||||
Ki_f = Ki_f_init.copy()
|
||||
f = np.dot(K, Ki_f)
|
||||
|
||||
#define the objective function (to be maximised)
|
||||
def obj(Ki_f, f):
|
||||
return -0.5*np.dot(Ki_f.flatten(), f.flatten()) + np.sum(likelihood.logpdf(f, Y, Y_metadata=Y_metadata))
|
||||
|
||||
difference = np.inf
|
||||
iteration = 0
|
||||
while difference > self._mode_finding_tolerance and iteration < self._mode_finding_max_iter:
|
||||
W = -likelihood.d2logpdf_df2(f, Y, Y_metadata=Y_metadata)
|
||||
if np.any(np.isnan(W)):
|
||||
raise ValueError('One or more element(s) of W is NaN')
|
||||
grad = likelihood.dlogpdf_df(f, Y, Y_metadata=Y_metadata)
|
||||
if np.any(np.isnan(grad)):
|
||||
raise ValueError('One or more element(s) of grad is NaN')
|
||||
|
||||
W_f = W*f
|
||||
|
||||
b = W_f + grad # R+W p46 line 6.
|
||||
W12BiW12, _, _ = self._compute_B_statistics(K, W, likelihood.log_concave)
|
||||
W12BiW12Kb = np.dot(W12BiW12, np.dot(K, b))
|
||||
|
||||
#Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
|
||||
full_step_Ki_f = b - W12BiW12Kb # full_step_Ki_f = a in R&W p46 line 6.
|
||||
dKi_f = full_step_Ki_f - Ki_f
|
||||
|
||||
#define an objective for the line search (minimize this one)
|
||||
def inner_obj(step_size):
|
||||
Ki_f_trial = Ki_f + step_size*dKi_f
|
||||
f_trial = np.dot(K, Ki_f_trial)
|
||||
return -obj(Ki_f_trial, f_trial)
|
||||
|
||||
#use scipy for the line search, the compute new values of f, Ki_f
|
||||
step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
|
||||
Ki_f_new = Ki_f + step*dKi_f
|
||||
f_new = np.dot(K, Ki_f_new)
|
||||
|
||||
difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
|
||||
Ki_f = Ki_f_new
|
||||
f = f_new
|
||||
iteration += 1
|
||||
|
||||
#Warn of bad fits
|
||||
if difference > self._mode_finding_tolerance:
|
||||
if not self.bad_fhat:
|
||||
warnings.warn("Not perfect mode found (f_hat). difference: {}, iteration: {} out of max {}".format(difference, iteration, self._mode_finding_max_iter))
|
||||
self.bad_fhat = True
|
||||
elif self.bad_fhat:
|
||||
self.bad_fhat = False
|
||||
warnings.warn("f_hat now fine again. difference: {}, iteration: {} out of max {}".format(difference, iteration, self._mode_finding_max_iter))
|
||||
|
||||
return f, Ki_f
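# Added annotation (restating the loop above in Rasmussen & Williams
# Algorithm 3.1 notation, not from the upstream file): the full Newton step is
#
#   b = W*f + dlogp(y|f)/df
#   a = b - W^{1/2} B^{-1} W^{1/2} K b,   with B = I + W^{1/2} K W^{1/2}
#   f_new = K*a
#
# and the Brent line search above only chooses how far to move from the
# current K^{-1}*f towards a.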
|
||||
|
||||
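For orientation, the full Newton step computed above follows Rasmussen & Williams (2006), Algorithm 3.1: with b = W f + grad log p(y|f), the new value of K^{-1}f is a = b - W^{1/2} B^{-1} W^{1/2} K b. Below is a minimal standalone numpy sketch of one such direction on toy data; newton_direction is an illustrative helper invented for this note, not a GPy function.

import numpy as np

def newton_direction(K, f, grad, W):
    # One stable Newton direction for the Laplace mode search (R&W 2006, eq. 3.18).
    # W is the (positive) diagonal of the negative Hessian of log p(y|f).
    W12 = np.sqrt(W)[:, None]                      # N x 1
    B = np.eye(K.shape[0]) + W12 * K * W12.T       # B = I + W^0.5 K W^0.5
    L = np.linalg.cholesky(B)
    b = W * f + grad                               # R&W p46 line 6
    W12Kb = W12.flatten() * np.dot(K, b)
    tmp = np.linalg.solve(L, W12Kb)
    a = b - W12.flatten() * np.linalg.solve(L.T, tmp)
    return a                                       # full-step value of K^{-1} f_hat

# toy usage
rng = np.random.RandomState(0)
X = np.linspace(0, 1, 5)[:, None]
K = np.exp(-0.5 * (X - X.T)**2) + 1e-8 * np.eye(5)
f = np.zeros(5); grad = rng.randn(5); W = np.ones(5)
print(newton_direction(K, f, grad, W))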
    def mode_computations(self, f_hat, Ki_f, K, Y, likelihood, kern, Y_metadata):
        """
        At the mode, compute the hessian and effective covariance matrix.

        returns: log_marginal : approximation to the marginal likelihood
                 woodbury_inv : variable required for calculating the approximation to the covariance matrix
                 dL_dK : array of derivatives (1 x num_kernel_params)
                 dL_dthetaL : array of derivatives (1 x num_likelihood_params)
        """
        #At this point get the hessian matrix (or vector as W is diagonal)
        W = -likelihood.d2logpdf_df2(f_hat, Y, Y_metadata=Y_metadata)
        if np.any(np.isnan(W)):
            raise ValueError('One or more element(s) of W is NaN')

        K_Wi_i, L, LiW12 = self._compute_B_statistics(K, W, likelihood.log_concave)

        #compute vital matrices
        C = np.dot(LiW12, K)
        Ki_W_i = K - C.T.dot(C)

        #compute the log marginal
        log_marginal = -0.5*np.dot(Ki_f.flatten(), f_hat.flatten()) + np.sum(likelihood.logpdf(f_hat, Y, Y_metadata=Y_metadata)) - np.sum(np.log(np.diag(L)))

        # Compute matrices for derivatives
        dW_df = -likelihood.d3logpdf_df3(f_hat, Y, Y_metadata=Y_metadata) # -d3lik_d3fhat
        if np.any(np.isnan(dW_df)):
            raise ValueError('One or more element(s) of dW_df is NaN')

        dL_dfhat = -0.5*(np.diag(Ki_W_i)[:, None]*dW_df) # s2 in R&W p126 line 9.
        #BiK, _ = dpotrs(L, K, lower=1)
        #dL_dfhat = 0.5*np.diag(BiK)[:, None]*dW_df
        I_KW_i = np.eye(Y.shape[0]) - np.dot(K, K_Wi_i)

        ####################
        #  compute dL_dK   #
        ####################
        if kern.size > 0 and not kern.is_fixed:
            #Explicit
            explicit_part = 0.5*(np.dot(Ki_f, Ki_f.T) - K_Wi_i)

            #Implicit
            implicit_part = np.dot(Ki_f, dL_dfhat.T).dot(I_KW_i)

            dL_dK = explicit_part + implicit_part
        else:
            dL_dK = np.zeros(likelihood.size)

        ####################
        #compute dL_dthetaL#
        ####################
        if likelihood.size > 0 and not likelihood.is_fixed:
            dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = likelihood._laplace_gradients(f_hat, Y, Y_metadata=Y_metadata)

            num_params = likelihood.size
            # make space for one derivative for each likelihood parameter
            dL_dthetaL = np.zeros(num_params)
            for thetaL_i in range(num_params):
                #Explicit
                dL_dthetaL_exp = ( np.sum(dlik_dthetaL[thetaL_i])
                                   # The + comes from the fact that dlik_hess_dthetaL == -dW_dthetaL
                                   + 0.5*np.sum(np.diag(Ki_W_i).flatten()*dlik_hess_dthetaL[:, thetaL_i].flatten())
                                 )

                #Implicit
                dfhat_dthetaL = mdot(I_KW_i, K, dlik_grad_dthetaL[:, thetaL_i])
                #dfhat_dthetaL = mdot(Ki_W_i, dlik_grad_dthetaL[:, thetaL_i])
                dL_dthetaL_imp = np.dot(dL_dfhat.T, dfhat_dthetaL)
                dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp

        else:
            dL_dthetaL = np.zeros(likelihood.size)

        return log_marginal, K_Wi_i, dL_dK, dL_dthetaL
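A note on the explicit/implicit split used above (paraphrasing R&W 2006, Section 5.5.1, not a statement lifted from the code): because the mode f_hat itself depends on the hyperparameters, the total derivative of the approximate log marginal Z has two parts,

$$
\frac{dZ}{d\theta} = \underbrace{\frac{\partial Z}{\partial \theta}}_{\text{explicit}} + \sum_i \underbrace{\frac{\partial Z}{\partial \hat f_i}\,\frac{\partial \hat f_i}{\partial \theta}}_{\text{implicit}},
$$

which is why explicit_part and implicit_part are accumulated separately for both dL_dK and dL_dthetaL.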
    def _compute_B_statistics(self, K, W, log_concave):
        """
        Rasmussen suggests the use of a numerically stable positive definite matrix B,
        which has positive diagonal elements and can be easily inverted.

        :param K: Prior covariance matrix evaluated at locations X
        :type K: NxN matrix
        :param W: Negative hessian at a point (diagonal matrix)
        :type W: Vector of diagonal values of the Hessian (1xN)
        :returns: (W12BiW12, L_B, Li_W12)
        """
        if not log_concave:
            #print "Under 1e-10: {}".format(np.sum(W < 1e-6))
            W[W<1e-6] = 1e-6
            # NOTE: setting a parameter inside parameters_changed will always lead to closed update loops!
            #W.__setitem__(W < 1e-6, 1e-6, update=False) # FIXME-HACK: This is a hack since GPy can't handle negative variances, which can occur here.
            # If the likelihood is non-log-concave we would like to allow a negative variance,
            # which causes the posterior to become less certain than the prior and likelihood.
            # This is a property only held by non-log-concave likelihoods.
        if np.any(np.isnan(W)):
            raise ValueError('One or more element(s) of W is NaN')
        #W is diagonal so its sqrt is just the sqrt of the diagonal elements
        W_12 = np.sqrt(W)
        B = np.eye(K.shape[0]) + W_12*K*W_12.T
        L = jitchol(B)

        LiW12, _ = dtrtrs(L, np.diagflat(W_12), lower=1, trans=0)
        K_Wi_i = np.dot(LiW12.T, LiW12) # R = W12BiW12, in R&W p 126, eq 5.25

        #here's a better way to compute the required matrix:
        # you could do the mode finding with a backsub, instead of a dot...
        #L2 = L/W_12
        #K_Wi_i_2 , _= dpotri(L2)
        #symmetrify(K_Wi_i_2)

        return K_Wi_i, L, LiW12
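A toy illustration (not GPy code) of why B = I + W^0.5 K W^0.5 is the matrix to factorise: K can be numerically close to singular, but B has eigenvalues bounded below by 1, so its Cholesky factorisation is stable, and W^0.5 B^{-1} W^0.5 equals (K + W^{-1})^{-1}.

import numpy as np

N = 4
X = np.linspace(0, 0.1, N)[:, None]          # nearly identical inputs -> near-singular K
K = np.exp(-0.5 * (X - X.T)**2)
W = 2.0 * np.ones(N)                         # diagonal of the negative Hessian
W12 = np.sqrt(W)[:, None]
B = np.eye(N) + W12 * K * W12.T

print(np.linalg.cond(K))                     # very large
print(np.linalg.cond(B))                     # modest
L = np.linalg.cholesky(B)                    # succeeds without jitter
LiW12 = np.linalg.solve(L, np.diag(W12.flatten()))
K_Wi_i = LiW12.T.dot(LiW12)                  # equals (K + W^{-1})^{-1}
print(np.allclose(K_Wi_i, np.linalg.inv(K + np.diag(1.0/W))))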
186
GPy/inference/latent_function_inference/posterior.py
Normal file
@ -0,0 +1,186 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)

import numpy as np
from ...util.linalg import pdinv, dpotrs, dpotri, symmetrify, jitchol

class Posterior(object):
    """
    An object to represent a Gaussian posterior over latent function values, p(f|D).
    This may be computed exactly for Gaussian likelihoods, or approximated for
    non-Gaussian likelihoods.

    The purpose of this class is to serve as an interface between the inference
    schemes and the model classes. The model class can make predictions for
    the function at any new point x_* by integrating over this posterior.
    """
    def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None):
        """
        woodbury_chol : a lower triangular matrix L that satisfies posterior_covariance = K - K L^{-T} L^{-1} K
        woodbury_vector : a matrix (or vector, as an Nx1 matrix) M which satisfies posterior_mean = K M
        K : the prior covariance (required for lazy computation of various quantities)
        mean : the posterior mean
        cov : the posterior covariance

        Not all of the above need to be supplied! You *must* supply:

            K (for lazy computation)
        or
            K_chol (for lazy computation)

        You may supply either:

            woodbury_chol
            woodbury_vector

        Or:

            mean
            cov

        Of course, you can supply more than that, but this class will lazily
        compute all other quantities on demand.
        """
        #obligatory
        self._K = K

        if ((woodbury_chol is not None) and (woodbury_vector is not None))\
                or ((woodbury_inv is not None) and (woodbury_vector is not None))\
                or ((woodbury_inv is not None) and (mean is not None))\
                or ((mean is not None) and (cov is not None)):
            pass # we have sufficient information to compute the posterior
        else:
            raise ValueError, "insufficient information to compute the posterior"

        self._K_chol = K_chol
        self._K = K
        #option 1: woodbury chol and woodbury vector
        self._woodbury_chol = woodbury_chol
        self._woodbury_vector = woodbury_vector

        #option 2: woodbury inv (and woodbury vector)
        self._woodbury_inv = woodbury_inv

        #option 3: posterior mean and covariance
        self._mean = mean
        self._covariance = cov

        #compute this lazily
        self._precision = None
    @property
    def mean(self):
        """
        Posterior mean
        $$
        K_{xx}v
        v := \texttt{Woodbury vector}
        $$
        """
        if self._mean is None:
            self._mean = np.dot(self._K, self.woodbury_vector)
        return self._mean

    @property
    def covariance(self):
        """
        Posterior covariance
        $$
        K_{xx} - K_{xx}W_{xx}^{-1}K_{xx}
        W_{xx}^{-1} := \texttt{Woodbury inv}
        $$
        """
        if self._covariance is None:
            #LiK, _ = dtrtrs(self.woodbury_chol, self._K, lower=1)
            self._covariance = (np.atleast_3d(self._K) - np.tensordot(np.dot(np.atleast_3d(self.woodbury_inv).T, self._K), self._K, [1,0]).T).squeeze()
            #self._covariance = self._K - self._K.dot(self.woodbury_inv).dot(self._K)
        return self._covariance

    @property
    def precision(self):
        """
        Inverse of posterior covariance
        """
        if self._precision is None:
            cov = np.atleast_3d(self.covariance)
            self._precision = np.zeros(cov.shape) # if one covariance per dimension
            for p in xrange(cov.shape[-1]):
                self._precision[:,:,p] = pdinv(cov[:,:,p])[0]
        return self._precision
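As a sanity check on these identities: for an exact GP with a Gaussian likelihood the Woodbury quantities reduce to (K + sigma^2 I)^{-1} y and (K + sigma^2 I)^{-1}, and the mean/covariance properties above recover the textbook conditional. A small standalone numpy sketch (illustrative only, not GPy internals):

import numpy as np

rng = np.random.RandomState(1)
X = np.linspace(0, 1, 6)[:, None]
K = np.exp(-0.5 * (X - X.T)**2 / 0.3**2)
y = rng.randn(6, 1)
noise = 0.1

woodbury_inv = np.linalg.inv(K + noise * np.eye(6))   # (K + sigma^2 I)^{-1}
woodbury_vector = woodbury_inv.dot(y)                 # (K + sigma^2 I)^{-1} y

mean = K.dot(woodbury_vector)                         # posterior mean at the training inputs
cov = K - K.dot(woodbury_inv).dot(K)                  # posterior covariance

# agrees with the standard conditional of the joint Gaussian
print(np.allclose(mean, K.dot(np.linalg.solve(K + noise*np.eye(6), y))))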
    @property
    def woodbury_chol(self):
        """
        return $L_{W}$ where L is the lower triangular Cholesky decomposition of the Woodbury matrix
        $$
        L_{W}L_{W}^{\top} = W^{-1}
        W^{-1} := \texttt{Woodbury inv}
        $$
        """
        if self._woodbury_chol is None:
            #compute woodbury chol from the woodbury inverse
            if self._woodbury_inv is not None:
                winv = np.atleast_3d(self._woodbury_inv)
                self._woodbury_chol = np.zeros(winv.shape)
                for p in xrange(winv.shape[-1]):
                    self._woodbury_chol[:,:,p] = pdinv(winv[:,:,p])[2]
                #Li = jitchol(self._woodbury_inv)
                #self._woodbury_chol, _ = dtrtri(Li)
                #W, _, _, _, = pdinv(self._woodbury_inv)
                #symmetrify(W)
                #self._woodbury_chol = jitchol(W)
            #try computing woodbury chol from cov
            elif self._covariance is not None:
                raise NotImplementedError, "TODO: check code here"
                B = self._K - self._covariance
                tmp, _ = dpotrs(self.K_chol, B)
                self._woodbury_inv, _ = dpotrs(self.K_chol, tmp.T)
                _, _, self._woodbury_chol, _ = pdinv(self._woodbury_inv)
            else:
                raise ValueError, "insufficient information to compute posterior"
        return self._woodbury_chol
    @property
    def woodbury_inv(self):
        """
        The inverse of the Woodbury matrix; in the Gaussian likelihood case it is defined as
        $$
        (K_{xx} + \Sigma_{xx})^{-1}
        \Sigma_{xx} := \texttt{Likelihood.variance / Approximate likelihood covariance}
        $$
        """
        if self._woodbury_inv is None:
            if self._woodbury_chol is not None:
                self._woodbury_inv, _ = dpotri(self._woodbury_chol, lower=1)
                #self._woodbury_inv, _ = dpotrs(self.woodbury_chol, np.eye(self.woodbury_chol.shape[0]), lower=1)
                symmetrify(self._woodbury_inv)
            elif self._covariance is not None:
                B = self._K - self._covariance
                tmp, _ = dpotrs(self.K_chol, B)
                self._woodbury_inv, _ = dpotrs(self.K_chol, tmp.T)
        return self._woodbury_inv

    @property
    def woodbury_vector(self):
        """
        The Woodbury vector; in the Gaussian likelihood case it is defined as
        $$
        (K_{xx} + \Sigma)^{-1}Y
        \Sigma := \texttt{Likelihood.variance / Approximate likelihood covariance}
        $$
        """
        if self._woodbury_vector is None:
            self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean)
        return self._woodbury_vector

    @property
    def K_chol(self):
        """
        Cholesky of the prior covariance K
        """
        if self._K_chol is None:
            self._K_chol = jitchol(self._K)
        return self._K_chol
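A minimal usage sketch of the class above, assuming the import path GPy.inference.latent_function_inference.posterior matches your checkout (illustrative only; this mirrors how the inference objects in this file hand results back to the model):

import numpy as np
# assumed import path; adjust to where Posterior lives in your checkout
from GPy.inference.latent_function_inference.posterior import Posterior

X = np.linspace(0, 1, 6)[:, None]
K = np.exp(-0.5 * (X - X.T)**2 / 0.3**2)
y = np.sin(6*X)
woodbury_inv = np.linalg.inv(K + 0.1*np.eye(6))
post = Posterior(woodbury_inv=woodbury_inv,
                 woodbury_vector=woodbury_inv.dot(y),
                 K=K)
print(post.mean.shape, post.covariance.shape)   # remaining quantities are computed lazily on demand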
249
GPy/inference/latent_function_inference/var_dtc.py
Normal file
@ -0,0 +1,249 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from posterior import Posterior
|
||||
from ...util.linalg import mdot, jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify
|
||||
from ...util import diag
|
||||
from ...core.parameterization.variational import VariationalPosterior
|
||||
import numpy as np
|
||||
from . import LatentFunctionInference
|
||||
log_2_pi = np.log(2*np.pi)
|
||||
import logging, itertools
|
||||
logger = logging.getLogger('vardtc')
|
||||
|
||||
class VarDTC(LatentFunctionInference):
|
||||
"""
|
||||
An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
|
||||
|
||||
The function self.inference returns a Posterior object, which summarizes
|
||||
the posterior.
|
||||
|
||||
For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.
|
||||
|
||||
"""
|
||||
const_jitter = 1e-6
|
||||
def __init__(self, limit=1):
|
||||
#self._YYTfactor_cache = caching.cache()
|
||||
from ...util.caching import Cacher
|
||||
self.limit = limit
|
||||
self.get_trYYT = Cacher(self._get_trYYT, limit)
|
||||
self.get_YYTfactor = Cacher(self._get_YYTfactor, limit)
|
||||
|
||||
def set_limit(self, limit):
|
||||
self.get_trYYT.limit = limit
|
||||
self.get_YYTfactor.limit = limit
|
||||
|
||||
def _get_trYYT(self, Y):
|
||||
return np.einsum("ij,ij->", Y, Y)
|
||||
# faster than, but same as:
|
||||
# return np.sum(np.square(Y))
|
||||
|
||||
def __getstate__(self):
|
||||
# has to be overridden, as Cacher objects cannot be pickled.
|
||||
return self.limit
|
||||
|
||||
def __setstate__(self, state):
|
||||
# has to be overridden, as Cacher objects cannot be pickled.
|
||||
self.limit = state
|
||||
from ...util.caching import Cacher
|
||||
self.get_trYYT = Cacher(self._get_trYYT, self.limit)
|
||||
self.get_YYTfactor = Cacher(self._get_YYTfactor, self.limit)
|
||||
|
||||
def _get_YYTfactor(self, Y):
|
||||
"""
|
||||
find a matrix L which satisfies LLT = YYT.
|
||||
|
||||
Note that L may have fewer columns than Y.
|
||||
"""
|
||||
N, D = Y.shape
|
||||
if (N>=D):
|
||||
return Y.view(np.ndarray)
|
||||
else:
|
||||
return jitchol(tdot(Y))
|
||||
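A toy illustration (not GPy code) of the YY^T-factor trick used above: when D > N, Y (N x D) may be replaced by any L (N x N) with L L^T = Y Y^T, because the Gaussian likelihood only touches Y through Y Y^T, and in particular the trace term is preserved.

import numpy as np

rng = np.random.RandomState(2)
Y = rng.randn(3, 50)                        # N=3 data points, D=50 outputs
L = np.linalg.cholesky(Y.dot(Y.T))          # N x N factor
print(np.allclose(L.dot(L.T), Y.dot(Y.T)))  # True
print(np.isclose(np.sum(Y**2), np.sum(L**2)))  # tr(YY^T) is preserved too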
|
||||
def get_VVTfactor(self, Y, prec):
|
||||
return Y * prec # TODO: cache this, and make it efficient
|
||||
|
||||
|
||||
|
||||
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None):
|
||||
|
||||
_, output_dim = Y.shape
|
||||
uncertain_inputs = isinstance(X, VariationalPosterior)
|
||||
|
||||
#see whether we've got a different noise variance for each datum
|
||||
beta = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6)
|
||||
# VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
|
||||
#self.YYTfactor = self.get_YYTfactor(Y)
|
||||
#VVT_factor = self.get_VVTfactor(self.YYTfactor, beta)
|
||||
het_noise = beta.size > 1
|
||||
if beta.ndim == 1:
|
||||
beta = beta[:, None]
|
||||
VVT_factor = beta*Y
|
||||
#VVT_factor = beta*Y
|
||||
trYYT = self.get_trYYT(Y)
|
||||
|
||||
# do the inference:
|
||||
num_inducing = Z.shape[0]
|
||||
num_data = Y.shape[0]
|
||||
# kernel computations, using BGPLVM notation
|
||||
|
||||
Kmm = kern.K(Z).copy()
|
||||
diag.add(Kmm, self.const_jitter)
|
||||
if Lm is None:
|
||||
Lm = jitchol(Kmm)
|
||||
|
||||
# The rather complex computations of A, and the psi stats
|
||||
if uncertain_inputs:
|
||||
psi0 = kern.psi0(Z, X)
|
||||
psi1 = kern.psi1(Z, X)
|
||||
if het_noise:
|
||||
psi2_beta = np.sum([kern.psi2(Z,X[i:i+1,:]) * beta_i for i,beta_i in enumerate(beta)],0)
|
||||
else:
|
||||
psi2_beta = kern.psi2(Z,X) * beta
|
||||
LmInv = dtrtri(Lm)
|
||||
A = LmInv.dot(psi2_beta.dot(LmInv.T))
|
||||
else:
|
||||
psi0 = kern.Kdiag(X)
|
||||
psi1 = kern.K(X, Z)
|
||||
if het_noise:
|
||||
tmp = psi1 * (np.sqrt(beta))
|
||||
else:
|
||||
tmp = psi1 * (np.sqrt(beta))
|
||||
tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
|
||||
A = tdot(tmp) #print A.sum()
|
||||
|
||||
# factor B
|
||||
B = np.eye(num_inducing) + A
|
||||
LB = jitchol(B)
|
||||
psi1Vf = np.dot(psi1.T, VVT_factor)
|
||||
# back substitute C into psi1Vf
|
||||
tmp, _ = dtrtrs(Lm, psi1Vf, lower=1, trans=0)
|
||||
_LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
|
||||
tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
|
||||
Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
|
||||
|
||||
# data fit and derivative of L w.r.t. Kmm
|
||||
delit = tdot(_LBi_Lmi_psi1Vf)
|
||||
data_fit = np.trace(delit)
|
||||
DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
|
||||
if dL_dKmm is None:
|
||||
delit = -0.5 * DBi_plus_BiPBi
|
||||
delit += -0.5 * B * output_dim
|
||||
delit += output_dim * np.eye(num_inducing)
|
||||
# Compute dL_dKmm
|
||||
dL_dKmm = backsub_both_sides(Lm, delit)
|
||||
|
||||
# derivatives of L w.r.t. psi
|
||||
dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm,
|
||||
VVT_factor, Cpsi1Vf, DBi_plus_BiPBi,
|
||||
psi1, het_noise, uncertain_inputs)
|
||||
|
||||
# log marginal likelihood
|
||||
log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise,
|
||||
psi0, A, LB, trYYT, data_fit, Y)
|
||||
|
||||
#noise derivatives
|
||||
dL_dR = _compute_dL_dR(likelihood,
|
||||
het_noise, uncertain_inputs, LB,
|
||||
_LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A,
|
||||
psi0, psi1, beta,
|
||||
data_fit, num_data, output_dim, trYYT, Y, VVT_factor)
|
||||
|
||||
dL_dthetaL = likelihood.exact_inference_gradients(dL_dR,Y_metadata)
|
||||
|
||||
#put the gradients in the right places
|
||||
if uncertain_inputs:
|
||||
grad_dict = {'dL_dKmm': dL_dKmm,
|
||||
'dL_dpsi0':dL_dpsi0,
|
||||
'dL_dpsi1':dL_dpsi1,
|
||||
'dL_dpsi2':dL_dpsi2,
|
||||
'dL_dthetaL':dL_dthetaL}
|
||||
else:
|
||||
grad_dict = {'dL_dKmm': dL_dKmm,
|
||||
'dL_dKdiag':dL_dpsi0,
|
||||
'dL_dKnm':dL_dpsi1,
|
||||
'dL_dthetaL':dL_dthetaL}
|
||||
|
||||
#get sufficient things for posterior prediction
|
||||
#TODO: do we really want to do this in the loop?
|
||||
if VVT_factor.shape[1] == Y.shape[1]:
|
||||
woodbury_vector = Cpsi1Vf # == Cpsi1V
|
||||
else:
|
||||
print 'foobar'
|
||||
import ipdb; ipdb.set_trace()
|
||||
psi1V = np.dot(Y.T*beta, psi1).T
|
||||
tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
|
||||
tmp, _ = dpotrs(LB, tmp, lower=1)
|
||||
woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
|
||||
Bi, _ = dpotri(LB, lower=1)
|
||||
symmetrify(Bi)
|
||||
Bi = -dpotri(LB, lower=1)[0]
|
||||
diag.add(Bi, 1)
|
||||
|
||||
woodbury_inv = backsub_both_sides(Lm, Bi)
|
||||
|
||||
#construct a posterior object
|
||||
post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
|
||||
return post, log_marginal, grad_dict
|
||||
|
||||
def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, psi1, het_noise, uncertain_inputs):
|
||||
dL_dpsi0 = -0.5 * output_dim * (beta* np.ones([num_data, 1])).flatten()
|
||||
dL_dpsi1 = np.dot(VVT_factor, Cpsi1Vf.T)
|
||||
dL_dpsi2_beta = 0.5 * backsub_both_sides(Lm, output_dim * np.eye(num_inducing) - DBi_plus_BiPBi)
|
||||
if het_noise:
|
||||
if uncertain_inputs:
|
||||
dL_dpsi2 = beta[:, None] * dL_dpsi2_beta[None, :, :]
|
||||
else:
|
||||
dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * beta).T).T
|
||||
dL_dpsi2 = None
|
||||
else:
|
||||
dL_dpsi2 = beta * dL_dpsi2_beta
|
||||
if not uncertain_inputs:
|
||||
# subsume back into psi1 (==Kmn)
|
||||
dL_dpsi1 += 2.*np.dot(psi1, dL_dpsi2)
|
||||
dL_dpsi2 = None
|
||||
return dL_dpsi0, dL_dpsi1, dL_dpsi2
|
||||
|
||||
|
||||
def _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, psi0, psi1, beta, data_fit, num_data, output_dim, trYYT, Y, VVT_factr=None):
|
||||
# the partial derivative vector for the likelihood
|
||||
if likelihood.size == 0:
|
||||
# save computation here.
|
||||
dL_dR = None
|
||||
elif het_noise:
|
||||
if uncertain_inputs:
|
||||
raise NotImplementedError, "heteroscedastic derivatives with uncertain inputs not implemented"
|
||||
else:
|
||||
#from ...util.linalg import chol_inv
|
||||
#LBi = chol_inv(LB)
|
||||
LBi, _ = dtrtrs(LB,np.eye(LB.shape[0]))
|
||||
|
||||
Lmi_psi1, nil = dtrtrs(Lm, psi1.T, lower=1, trans=0)
|
||||
_LBi_Lmi_psi1, _ = dtrtrs(LB, Lmi_psi1, lower=1, trans=0)
|
||||
dL_dR = -0.5 * beta + 0.5 * VVT_factr**2
|
||||
dL_dR += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * beta**2
|
||||
|
||||
dL_dR += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*beta**2
|
||||
|
||||
dL_dR += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * Y * beta**2
|
||||
dL_dR += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * beta**2
|
||||
else:
|
||||
# likelihood is not heteroscedastic
|
||||
dL_dR = -0.5 * num_data * output_dim * beta + 0.5 * trYYT * beta ** 2
|
||||
dL_dR += 0.5 * output_dim * (psi0.sum() * beta ** 2 - np.trace(A) * beta)
|
||||
dL_dR += beta * (0.5 * np.sum(A * DBi_plus_BiPBi) - data_fit)
|
||||
return dL_dR
|
||||
|
||||
def _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, psi0, A, LB, trYYT, data_fit, Y):
|
||||
#compute log marginal likelihood
|
||||
if het_noise:
|
||||
lik_1 = -0.5 * num_data * output_dim * np.log(2. * np.pi) + 0.5 * output_dim * np.sum(np.log(beta)) - 0.5 * np.sum(beta.ravel() * np.square(Y).sum(axis=-1))
|
||||
lik_2 = -0.5 * output_dim * (np.sum(beta.flatten() * psi0) - np.trace(A))
|
||||
else:
|
||||
lik_1 = -0.5 * num_data * output_dim * (np.log(2. * np.pi) - np.log(beta)) - 0.5 * beta * trYYT
|
||||
lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A))
|
||||
lik_3 = -output_dim * (np.sum(np.log(np.diag(LB))))
|
||||
lik_4 = 0.5 * data_fit
|
||||
log_marginal = lik_1 + lik_2 + lik_3 + lik_4
|
||||
return log_marginal
|
||||
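For the homoscedastic branch, the four terms assembled above correspond to the standard variational DTC bound on the log marginal likelihood (Titsias, 2009); written out for orientation (this is a paraphrase, not a statement lifted from the code):

$$
\log p(Y) \;\ge\; -\tfrac{ND}{2}\log(2\pi\sigma^2) \;-\; \tfrac{1}{2\sigma^2}\mathrm{tr}(YY^\top)
\;-\; \tfrac{D}{2}\Big(\tfrac{\psi_0}{\sigma^2} - \mathrm{tr}(A)\Big)
\;-\; D\sum_i \log (L_B)_{ii} \;+\; \tfrac{1}{2}\,\mathrm{data\_fit},
$$

with A, L_B and data_fit as computed in inference() and beta = 1/sigma^2.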
479
GPy/inference/latent_function_inference/var_dtc_parallel.py
Normal file
@ -0,0 +1,479 @@
# Copyright (c) 2014, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from posterior import Posterior
|
||||
from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri,pdinv
|
||||
from ...util import diag
|
||||
from ...core.parameterization.variational import VariationalPosterior
|
||||
import numpy as np
|
||||
from . import LatentFunctionInference
|
||||
log_2_pi = np.log(2*np.pi)
|
||||
|
||||
try:
|
||||
from mpi4py import MPI
|
||||
except:
|
||||
pass
|
||||
|
||||
class VarDTC_minibatch(LatentFunctionInference):
|
||||
"""
|
||||
An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
|
||||
|
||||
The function self.inference returns a Posterior object, which summarizes
|
||||
the posterior.
|
||||
|
||||
For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.
|
||||
|
||||
"""
|
||||
const_jitter = 1e-6
|
||||
def __init__(self, batchsize=None, limit=1, mpi_comm=None):
|
||||
|
||||
self.batchsize = batchsize
|
||||
self.mpi_comm = mpi_comm
|
||||
self.limit = limit
|
||||
|
||||
# Cache functions
|
||||
from ...util.caching import Cacher
|
||||
self.get_trYYT = Cacher(self._get_trYYT, limit)
|
||||
self.get_YYTfactor = Cacher(self._get_YYTfactor, limit)
|
||||
|
||||
self.midRes = {}
|
||||
self.batch_pos = 0 # the starting position of the current mini-batch
|
||||
self.Y_speedup = False # Replace Y with the cholesky factor of YY.T, but the computation of posterior object will be skipped.
|
||||
|
||||
def __getstate__(self):
|
||||
# has to be overridden, as Cacher objects cannot be pickled.
|
||||
return self.batchsize, self.limit, self.Y_speedup
|
||||
|
||||
def __setstate__(self, state):
|
||||
# has to be overridden, as Cacher objects cannot be pickled.
|
||||
self.batchsize, self.limit, self.Y_speedup = state
|
||||
self.mpi_comm = None
|
||||
self.midRes = {}
|
||||
self.batch_pos = 0
|
||||
from ...util.caching import Cacher
|
||||
self.get_trYYT = Cacher(self._get_trYYT, self.limit)
|
||||
self.get_YYTfactor = Cacher(self._get_YYTfactor, self.limit)
|
||||
|
||||
def set_limit(self, limit):
|
||||
self.get_trYYT.limit = limit
|
||||
self.get_YYTfactor.limit = limit
|
||||
|
||||
def _get_trYYT(self, Y):
|
||||
return np.sum(np.square(Y))
|
||||
|
||||
def _get_YYTfactor(self, Y):
|
||||
"""
|
||||
find a matrix L which satisfies LLT = YYT.
|
||||
|
||||
Note that L may have fewer columns than Y.
|
||||
"""
|
||||
N, D = Y.shape
|
||||
if (N>=D):
|
||||
return Y.view(np.ndarray)
|
||||
else:
|
||||
return jitchol(tdot(Y))
|
||||
|
||||
def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs):
|
||||
|
||||
het_noise = beta.size > 1
|
||||
|
||||
assert beta.size == 1
|
||||
|
||||
trYYT = self.get_trYYT(Y)
|
||||
if self.Y_speedup and not het_noise:
|
||||
Y = self.get_YYTfactor(Y)
|
||||
|
||||
num_inducing = Z.shape[0]
|
||||
num_data, output_dim = Y.shape
|
||||
batchsize = num_data if self.batchsize is None else self.batchsize
|
||||
|
||||
psi2_full = np.zeros((num_inducing,num_inducing)) # MxM
|
||||
psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM
|
||||
psi0_full = 0.
|
||||
YRY_full = 0.
|
||||
|
||||
for n_start in xrange(0,num_data,batchsize):
|
||||
n_end = min(batchsize+n_start, num_data)
|
||||
if batchsize==num_data:
|
||||
Y_slice = Y
|
||||
X_slice = X
|
||||
else:
|
||||
Y_slice = Y[n_start:n_end]
|
||||
X_slice = X[n_start:n_end]
|
||||
|
||||
if het_noise:
|
||||
b = beta[n_start]
|
||||
YRY_full += np.inner(Y_slice, Y_slice)*b
|
||||
else:
|
||||
b = beta
|
||||
|
||||
if uncertain_inputs:
|
||||
psi0 = kern.psi0(Z, X_slice)
|
||||
psi1 = kern.psi1(Z, X_slice)
|
||||
psi2_full += kern.psi2(Z, X_slice)*b
|
||||
else:
|
||||
psi0 = kern.Kdiag(X_slice)
|
||||
psi1 = kern.K(X_slice, Z)
|
||||
psi2_full += np.dot(psi1.T,psi1)*b
|
||||
|
||||
psi0_full += psi0.sum()*b
|
||||
psi1Y_full += np.dot(Y_slice.T,psi1)*b # DxM
|
||||
|
||||
if not het_noise:
|
||||
YRY_full = trYYT*beta
|
||||
|
||||
if self.mpi_comm != None:
|
||||
psi0_all = np.array(psi0_full)
|
||||
psi1Y_all = psi1Y_full.copy()
|
||||
psi2_all = psi2_full.copy()
|
||||
YRY_all = np.array(YRY_full)
|
||||
self.mpi_comm.Allreduce([psi0_full, MPI.DOUBLE], [psi0_all, MPI.DOUBLE])
|
||||
self.mpi_comm.Allreduce([psi1Y_full, MPI.DOUBLE], [psi1Y_all, MPI.DOUBLE])
|
||||
self.mpi_comm.Allreduce([psi2_full, MPI.DOUBLE], [psi2_all, MPI.DOUBLE])
|
||||
self.mpi_comm.Allreduce([YRY_full, MPI.DOUBLE], [YRY_all, MPI.DOUBLE])
|
||||
return psi0_all, psi1Y_all, psi2_all, YRY_all
|
||||
|
||||
return psi0_full, psi1Y_full, psi2_full, YRY_full
|
||||
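The accumulation pattern in gatherPsiStat above (sum the per-batch psi statistics, each weighted by the noise precision) can be seen with a toy stand-in for the kernel expectations; toy_psi1 is invented for this sketch and simply plays the role of kern.psi1 / kern.K(X, Z).

import numpy as np

def toy_psi1(Xb, Z):
    # stand-in for kern.psi1 / kern.K(X, Z): any (batch x M) feature map will do
    return np.exp(-0.5 * (Xb - Z.T)**2)

X = np.random.RandomState(3).rand(10, 1)
Z = np.linspace(0, 1, 4)[:, None]
beta = 2.0
batchsize = 3

psi2_full = np.zeros((4, 4))
for n_start in range(0, X.shape[0], batchsize):
    Xb = X[n_start:n_start + batchsize]
    psi1 = toy_psi1(Xb, Z)
    psi2_full += psi1.T.dot(psi1) * beta      # accumulate sum_n beta * psi1_n^T psi1_n

# same result as processing everything in one pass
psi1_all = toy_psi1(X, Z)
print(np.allclose(psi2_full, psi1_all.T.dot(psi1_all) * beta))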
|
||||
def inference_likelihood(self, kern, X, Z, likelihood, Y):
|
||||
"""
|
||||
The first phase of inference:
|
||||
Compute: log-likelihood, dL_dKmm
|
||||
|
||||
Cached intermediate results: Kmm, KmmInv,
|
||||
"""
|
||||
|
||||
num_data, output_dim = Y.shape
|
||||
input_dim = Z.shape[0]
|
||||
if self.mpi_comm != None:
|
||||
num_data_all = np.array(num_data,dtype=np.int32)
|
||||
self.mpi_comm.Allreduce([np.int32(num_data), MPI.INT], [num_data_all, MPI.INT])
|
||||
num_data = num_data_all
|
||||
|
||||
if isinstance(X, VariationalPosterior):
|
||||
uncertain_inputs = True
|
||||
else:
|
||||
uncertain_inputs = False
|
||||
|
||||
#see whether we've got a different noise variance for each datum
|
||||
beta = 1./np.fmax(likelihood.variance, 1e-6)
|
||||
het_noise = beta.size > 1
|
||||
if het_noise:
|
||||
self.batchsize = 1
|
||||
|
||||
psi0_full, psi1Y_full, psi2_full, YRY_full = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)
|
||||
|
||||
#======================================================================
|
||||
# Compute Common Components
|
||||
#======================================================================
|
||||
|
||||
Kmm = kern.K(Z).copy()
|
||||
diag.add(Kmm, self.const_jitter)
|
||||
Lm = jitchol(Kmm, maxtries=100)
|
||||
|
||||
LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right')
|
||||
Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT
|
||||
LL = jitchol(Lambda, maxtries=100)
|
||||
logdet_L = 2.*np.sum(np.log(np.diag(LL)))
|
||||
b = dtrtrs(LL,dtrtrs(Lm,psi1Y_full.T)[0])[0]
|
||||
bbt = np.square(b).sum()
|
||||
v = dtrtrs(Lm,dtrtrs(LL,b,trans=1)[0],trans=1)[0]
|
||||
|
||||
tmp = -backsub_both_sides(LL, tdot(b)+output_dim*np.eye(input_dim), transpose='left')
|
||||
dL_dpsi2R = backsub_both_sides(Lm, tmp+output_dim*np.eye(input_dim), transpose='left')/2.
|
||||
|
||||
# Cache intermediate results
|
||||
self.midRes['dL_dpsi2R'] = dL_dpsi2R
|
||||
self.midRes['v'] = v
|
||||
|
||||
#======================================================================
|
||||
# Compute log-likelihood
|
||||
#======================================================================
|
||||
if het_noise:
|
||||
logL_R = -np.log(beta).sum()
|
||||
else:
|
||||
logL_R = -num_data*np.log(beta)
|
||||
logL = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-np.trace(LmInvPsi2LmInvT))+YRY_full-bbt)/2.-output_dim*logdet_L/2.
|
||||
|
||||
#======================================================================
|
||||
# Compute dL_dKmm
|
||||
#======================================================================
|
||||
|
||||
dL_dKmm = dL_dpsi2R - output_dim*backsub_both_sides(Lm, LmInvPsi2LmInvT, transpose='left')/2.
|
||||
|
||||
#======================================================================
|
||||
# Compute the Posterior distribution of inducing points p(u|Y)
|
||||
#======================================================================
|
||||
|
||||
if not self.Y_speedup or het_noise:
|
||||
wd_inv = backsub_both_sides(Lm, np.eye(input_dim)- backsub_both_sides(LL, np.identity(input_dim), transpose='left'), transpose='left')
|
||||
post = Posterior(woodbury_inv=wd_inv, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=Lm)
|
||||
else:
|
||||
post = None
|
||||
|
||||
#======================================================================
|
||||
# Compute dL_dthetaL for uncertain inputs and non-heteroscedastic noise
|
||||
#======================================================================
|
||||
|
||||
if not het_noise:
|
||||
dL_dthetaL = (YRY_full*beta + beta*output_dim*psi0_full - num_data*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2_full).sum() - beta*(v.T*psi1Y_full).sum()
|
||||
self.midRes['dL_dthetaL'] = dL_dthetaL
|
||||
|
||||
return logL, dL_dKmm, post
|
||||
|
||||
def inference_minibatch(self, kern, X, Z, likelihood, Y):
|
||||
"""
|
||||
The second phase of inference: Computing the derivatives over a minibatch of Y
|
||||
Compute: dL_dpsi0, dL_dpsi1, dL_dpsi2, dL_dthetaL
|
||||
return a flag showing whether it reached the end of Y (isEnd)
|
||||
"""
|
||||
|
||||
num_data, output_dim = Y.shape
|
||||
|
||||
if isinstance(X, VariationalPosterior):
|
||||
uncertain_inputs = True
|
||||
else:
|
||||
uncertain_inputs = False
|
||||
|
||||
#see whether we've got a different noise variance for each datum
|
||||
beta = 1./np.fmax(likelihood.variance, 1e-6)
|
||||
het_noise = beta.size > 1
|
||||
# VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
|
||||
#self.YYTfactor = beta*self.get_YYTfactor(Y)
|
||||
if self.Y_speedup and not het_noise:
|
||||
YYT_factor = self.get_YYTfactor(Y)
|
||||
else:
|
||||
YYT_factor = Y
|
||||
|
||||
n_start = self.batch_pos
|
||||
batchsize = num_data if self.batchsize is None else self.batchsize
|
||||
n_end = min(batchsize+n_start, num_data)
|
||||
if n_end==num_data:
|
||||
isEnd = True
|
||||
self.batch_pos = 0
|
||||
else:
|
||||
isEnd = False
|
||||
self.batch_pos = n_end
|
||||
|
||||
if batchsize==num_data:
|
||||
Y_slice = YYT_factor
|
||||
X_slice =X
|
||||
else:
|
||||
Y_slice = YYT_factor[n_start:n_end]
|
||||
X_slice = X[n_start:n_end]
|
||||
|
||||
if not uncertain_inputs:
|
||||
psi0 = kern.Kdiag(X_slice)
|
||||
psi1 = kern.K(X_slice, Z)
|
||||
psi2 = None
|
||||
betapsi1 = np.einsum('n,nm->nm',beta,psi1)
|
||||
elif het_noise:
|
||||
psi0 = kern.psi0(Z, X_slice)
|
||||
psi1 = kern.psi1(Z, X_slice)
|
||||
psi2 = kern.psi2(Z, X_slice)
|
||||
betapsi1 = np.einsum('n,nm->nm',beta,psi1)
|
||||
|
||||
if het_noise:
|
||||
beta = beta[n_start] # assuming batchsize==1
|
||||
|
||||
betaY = beta*Y_slice
|
||||
|
||||
#======================================================================
|
||||
# Load Intermediate Results
|
||||
#======================================================================
|
||||
|
||||
dL_dpsi2R = self.midRes['dL_dpsi2R']
|
||||
v = self.midRes['v']
|
||||
|
||||
#======================================================================
|
||||
# Compute dL_dpsi
|
||||
#======================================================================
|
||||
|
||||
dL_dpsi0 = -output_dim * (beta * np.ones((n_end-n_start,)))/2.
|
||||
|
||||
dL_dpsi1 = np.dot(betaY,v.T)
|
||||
|
||||
if uncertain_inputs:
|
||||
dL_dpsi2 = beta* dL_dpsi2R
|
||||
else:
|
||||
dL_dpsi1 += np.dot(betapsi1,dL_dpsi2R)*2.
|
||||
dL_dpsi2 = None
|
||||
|
||||
#======================================================================
|
||||
# Compute dL_dthetaL
|
||||
#======================================================================
|
||||
|
||||
if het_noise:
|
||||
if uncertain_inputs:
|
||||
psiR = np.einsum('mo,mo->',dL_dpsi2R,psi2)
|
||||
else:
|
||||
psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R)
|
||||
|
||||
dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1)
|
||||
else:
|
||||
if isEnd:
|
||||
dL_dthetaL = self.midRes['dL_dthetaL']
|
||||
else:
|
||||
dL_dthetaL = 0.
|
||||
|
||||
if uncertain_inputs:
|
||||
grad_dict = {'dL_dpsi0':dL_dpsi0,
|
||||
'dL_dpsi1':dL_dpsi1,
|
||||
'dL_dpsi2':dL_dpsi2,
|
||||
'dL_dthetaL':dL_dthetaL}
|
||||
else:
|
||||
grad_dict = {'dL_dKdiag':dL_dpsi0,
|
||||
'dL_dKnm':dL_dpsi1,
|
||||
'dL_dthetaL':dL_dthetaL}
|
||||
|
||||
return isEnd, (n_start,n_end), grad_dict
|
||||
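For orientation, the calling pattern that update_gradients (below) implements for this two-phase interface is: inference_likelihood once per outer iteration, then inference_minibatch repeatedly until it reports the end of Y. The toy stand-in below (DummyMinibatchInference is invented for this sketch and is not part of GPy) shows just the control flow, including how batch_pos resets.

import numpy as np

class DummyMinibatchInference(object):
    """Toy stand-in with the same two-phase interface as VarDTC_minibatch."""
    def __init__(self, num_data, batchsize):
        self.num_data, self.batchsize, self.pos = num_data, batchsize, 0
    def inference_likelihood(self):
        return 0.0   # global quantities would be computed and cached here
    def inference_minibatch(self):
        n_start = self.pos
        n_end = min(n_start + self.batchsize, self.num_data)
        isEnd = n_end == self.num_data
        self.pos = 0 if isEnd else n_end
        return isEnd, (n_start, n_end), {'dL_dpsi0': np.zeros(n_end - n_start)}

inf = DummyMinibatchInference(num_data=10, batchsize=3)
inf.inference_likelihood()
isEnd = False
while not isEnd:
    isEnd, (n_start, n_end), grad_dict = inf.inference_minibatch()
    print(n_start, n_end)   # 0 3, 3 6, 6 9, 9 10; gradients would be accumulated here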
|
||||
|
||||
def update_gradients(model, mpi_comm=None):
|
||||
if mpi_comm == None:
|
||||
Y = model.Y
|
||||
X = model.X
|
||||
else:
|
||||
Y = model.Y_local
|
||||
X = model.X[model.N_range[0]:model.N_range[1]]
|
||||
|
||||
model._log_marginal_likelihood, dL_dKmm, model.posterior = model.inference_method.inference_likelihood(model.kern, X, model.Z, model.likelihood, Y)
|
||||
|
||||
het_noise = model.likelihood.variance.size > 1
|
||||
|
||||
if het_noise:
|
||||
dL_dthetaL = np.empty((model.Y.shape[0],))
|
||||
else:
|
||||
dL_dthetaL = np.float64(0.)
|
||||
|
||||
kern_grad = model.kern.gradient.copy()
|
||||
kern_grad[:] = 0.
|
||||
model.Z.gradient = 0.
|
||||
|
||||
isEnd = False
|
||||
while not isEnd:
|
||||
isEnd, n_range, grad_dict = model.inference_method.inference_minibatch(model.kern, X, model.Z, model.likelihood, Y)
|
||||
if isinstance(model.X, VariationalPosterior):
|
||||
if (n_range[1]-n_range[0])==X.shape[0]:
|
||||
X_slice = X
|
||||
elif mpi_comm ==None:
|
||||
X_slice = model.X[n_range[0]:n_range[1]]
|
||||
else:
|
||||
X_slice = model.X[model.N_range[0]+n_range[0]:model.N_range[0]+n_range[1]]
|
||||
|
||||
#gradients w.r.t. kernel
|
||||
model.kern.update_gradients_expectations(variational_posterior=X_slice, Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'])
|
||||
kern_grad += model.kern.gradient
|
||||
|
||||
#gradients w.r.t. Z
|
||||
model.Z.gradient += model.kern.gradients_Z_expectations(
|
||||
dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=X_slice)
|
||||
|
||||
#gradients w.r.t. posterior parameters of X
|
||||
X_grad = model.kern.gradients_qX_expectations(variational_posterior=X_slice, Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'])
|
||||
model.set_X_gradients(X_slice, X_grad)
|
||||
|
||||
if het_noise:
|
||||
dL_dthetaL[n_range[0]:n_range[1]] = grad_dict['dL_dthetaL']
|
||||
else:
|
||||
dL_dthetaL += grad_dict['dL_dthetaL']
|
||||
|
||||
# Gather the gradients from multiple MPI nodes
|
||||
if mpi_comm != None:
|
||||
if het_noise:
|
||||
raise "het_noise not implemented!"
|
||||
kern_grad_all = kern_grad.copy()
|
||||
Z_grad_all = model.Z.gradient.copy()
|
||||
mpi_comm.Allreduce([kern_grad, MPI.DOUBLE], [kern_grad_all, MPI.DOUBLE])
|
||||
mpi_comm.Allreduce([model.Z.gradient, MPI.DOUBLE], [Z_grad_all, MPI.DOUBLE])
|
||||
kern_grad = kern_grad_all
|
||||
model.Z.gradient = Z_grad_all
|
||||
|
||||
#gradients w.r.t. kernel
|
||||
model.kern.update_gradients_full(dL_dKmm, model.Z, None)
|
||||
model.kern.gradient += kern_grad
|
||||
|
||||
#gradients w.r.t. Z
|
||||
model.Z.gradient += model.kern.gradients_X(dL_dKmm, model.Z)
|
||||
|
||||
# Update Log-likelihood
|
||||
KL_div = model.variational_prior.KL_divergence(X)
|
||||
# update for the KL divergence
|
||||
model.variational_prior.update_gradients_KL(X)
|
||||
|
||||
if mpi_comm != None:
|
||||
KL_div_all = np.array(KL_div)
|
||||
mpi_comm.Allreduce([np.float64(KL_div), MPI.DOUBLE], [KL_div_all, MPI.DOUBLE])
|
||||
KL_div = KL_div_all
|
||||
[mpi_comm.Allgatherv([pp.copy(), MPI.DOUBLE], [pa, (model.N_list*pa.shape[-1], None), MPI.DOUBLE]) for pp,pa in zip(model.get_X_gradients(X),model.get_X_gradients(model.X))]
|
||||
# from ...models import SSGPLVM
|
||||
# if isinstance(model, SSGPLVM):
|
||||
# grad_pi = np.array(model.variational_prior.pi.gradient)
|
||||
# mpi_comm.Allreduce([grad_pi.copy(), MPI.DOUBLE], [model.variational_prior.pi.gradient, MPI.DOUBLE])
|
||||
model._log_marginal_likelihood -= KL_div
|
||||
|
||||
# dL_dthetaL
|
||||
model.likelihood.update_gradients(dL_dthetaL)
|
||||
|
||||
def update_gradients_sparsegp(model, mpi_comm=None):
|
||||
if mpi_comm == None:
|
||||
Y = model.Y
|
||||
X = model.X
|
||||
else:
|
||||
Y = model.Y_local
|
||||
X = model.X[model.N_range[0]:model.N_range[1]]
|
||||
|
||||
model._log_marginal_likelihood, dL_dKmm, model.posterior = model.inference_method.inference_likelihood(model.kern, X, model.Z, model.likelihood, Y)
|
||||
|
||||
het_noise = model.likelihood.variance.size > 1
|
||||
|
||||
if het_noise:
|
||||
dL_dthetaL = np.empty((model.Y.shape[0],))
|
||||
else:
|
||||
dL_dthetaL = np.float64(0.)
|
||||
|
||||
kern_grad = model.kern.gradient.copy()
|
||||
kern_grad[:] = 0.
|
||||
model.Z.gradient = 0.
|
||||
|
||||
isEnd = False
|
||||
while not isEnd:
|
||||
isEnd, n_range, grad_dict = model.inference_method.inference_minibatch(model.kern, X, model.Z, model.likelihood, Y)
|
||||
|
||||
if (n_range[1]-n_range[0])==X.shape[0]:
|
||||
X_slice = X
|
||||
elif mpi_comm ==None:
|
||||
X_slice = model.X[n_range[0]:n_range[1]]
|
||||
else:
|
||||
X_slice = model.X[model.N_range[0]+n_range[0]:model.N_range[0]+n_range[1]]
|
||||
|
||||
model.kern.update_gradients_diag(grad_dict['dL_dKdiag'], X_slice)
|
||||
kern_grad += model.kern.gradient
|
||||
model.kern.update_gradients_full(grad_dict['dL_dKnm'], X_slice, model.Z)
|
||||
kern_grad += model.kern.gradient
|
||||
|
||||
model.Z.gradient += model.kern.gradients_X(grad_dict['dL_dKnm'].T, model.Z, X_slice)
|
||||
|
||||
if het_noise:
|
||||
dL_dthetaL[n_range[0]:n_range[1]] = grad_dict['dL_dthetaL']
|
||||
else:
|
||||
dL_dthetaL += grad_dict['dL_dthetaL']
|
||||
|
||||
# Gather the gradients from multiple MPI nodes
|
||||
if mpi_comm != None:
|
||||
if het_noise:
|
||||
raise "het_noise not implemented!"
|
||||
kern_grad_all = kern_grad.copy()
|
||||
Z_grad_all = model.Z.gradient.copy()
|
||||
mpi_comm.Allreduce([kern_grad, MPI.DOUBLE], [kern_grad_all, MPI.DOUBLE])
|
||||
mpi_comm.Allreduce([model.Z.gradient, MPI.DOUBLE], [Z_grad_all, MPI.DOUBLE])
|
||||
kern_grad = kern_grad_all
|
||||
model.Z.gradient = Z_grad_all
|
||||
|
||||
model.kern.update_gradients_full(dL_dKmm, model.Z, None)
|
||||
model.kern.gradient += kern_grad
|
||||
|
||||
model.Z.gradient += model.kern.gradients_X(dL_dKmm, model.Z)
|
||||
|
||||
# dL_dthetaL
|
||||
model.likelihood.update_gradients(dL_dthetaL)
|
||||
1
GPy/inference/mcmc/__init__.py
Normal file
@ -0,0 +1 @@
from hmc import HMC
|
||||
174
GPy/inference/mcmc/hmc.py
Normal file
@ -0,0 +1,174 @@
# ## Copyright (c) 2014, Zhenwen Dai
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
class HMC:
|
||||
"""
|
||||
An implementation of Hybrid Monte Carlo (HMC) for GPy models
|
||||
|
||||
Initialize an object for HMC sampling. Note that the status of the model (model parameters) will be changed during sampling.
|
||||
|
||||
:param model: the GPy model that will be sampled
|
||||
:type model: GPy.core.Model
|
||||
:param M: the mass matrix (an identity matrix by default)
|
||||
:type M: numpy.ndarray
|
||||
:param stepsize: the step size for HMC sampling
|
||||
:type stepsize: float
|
||||
"""
|
||||
def __init__(self, model, M=None,stepsize=1e-1):
|
||||
self.model = model
|
||||
self.stepsize = stepsize
|
||||
self.p = np.empty_like(model.optimizer_array.copy())
|
||||
if M is None:
|
||||
self.M = np.eye(self.p.size)
|
||||
else:
|
||||
self.M = M
|
||||
self.Minv = np.linalg.inv(self.M)
|
||||
|
||||
def sample(self, num_samples=1000, hmc_iters=20):
|
||||
"""
|
||||
Sample the (unfixed) model parameters.
|
||||
|
||||
:param num_samples: the number of samples to draw (1000 by default)
|
||||
:type num_samples: int
|
||||
:param hmc_iters: the number of leap-frog iterations (20 by default)
|
||||
:type hmc_iters: int
|
||||
:return: the list of parameters samples with the size N x P (N - the number of samples, P - the number of parameters to sample)
|
||||
:rtype: numpy.ndarray
|
||||
"""
|
||||
params = np.empty((num_samples,self.p.size))
|
||||
for i in xrange(num_samples):
|
||||
self.p[:] = np.random.multivariate_normal(np.zeros(self.p.size),self.M)
|
||||
H_old = self._computeH()
|
||||
theta_old = self.model.optimizer_array.copy()
|
||||
params[i] = self.model.unfixed_param_array
|
||||
#Metropolis accept/reject step
|
||||
self._update(hmc_iters)
|
||||
H_new = self._computeH()
|
||||
|
||||
if H_old>H_new:
|
||||
k = 1.
|
||||
else:
|
||||
k = np.exp(H_old-H_new)
|
||||
if np.random.rand()<k:
|
||||
params[i] = self.model.unfixed_param_array
|
||||
else:
|
||||
self.model.optimizer_array = theta_old
|
||||
return params
|
||||
|
||||
def _update(self, hmc_iters):
|
||||
for i in xrange(hmc_iters):
|
||||
self.p[:] += -self.stepsize/2.*self.model._transform_gradients(self.model.objective_function_gradients())
|
||||
self.model.optimizer_array = self.model.optimizer_array + self.stepsize*np.dot(self.Minv, self.p)
|
||||
self.p[:] += -self.stepsize/2.*self.model._transform_gradients(self.model.objective_function_gradients())
|
||||
|
||||
def _computeH(self,):
|
||||
return self.model.objective_function()+self.p.size*np.log(2*np.pi)/2.+np.log(np.linalg.det(self.M))/2.+np.dot(self.p, np.dot(self.Minv,self.p[:,None]))/2.
|
||||
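A typical use of the HMC class above, assuming GPy.inference.mcmc is importable in your checkout and that the model has (log-)priors set on the parameters you want to sample; this is an illustrative sketch, and the exact prior-setting API may differ slightly between GPy versions.

import numpy as np
import GPy

X = np.random.rand(20, 1)
Y = np.sin(6*X) + 0.1*np.random.randn(20, 1)
m = GPy.models.GPRegression(X, Y)
m.kern.set_prior(GPy.priors.Gamma(1., 1.))    # illustrative prior choice

hmc = GPy.inference.mcmc.HMC(m, stepsize=2e-2)
samples = hmc.sample(num_samples=300, hmc_iters=20)   # one row per parameter sample
print(samples.shape)
# note: the model's parameters are modified in place during sampling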
|
||||
class HMC_shortcut:
|
||||
def __init__(self,model,M=None,stepsize_range=[1e-6, 1e-1],groupsize=5, Hstd_th=[1e-5, 3.]):
|
||||
self.model = model
|
||||
self.stepsize_range = np.log(stepsize_range)
|
||||
self.p = np.empty_like(model.optimizer_array.copy())
|
||||
self.groupsize = groupsize
|
||||
self.Hstd_th = Hstd_th
|
||||
if M is None:
|
||||
self.M = np.eye(self.p.size)
|
||||
else:
|
||||
self.M = M
|
||||
self.Minv = np.linalg.inv(self.M)
|
||||
|
||||
def sample(self, m_iters=1000, hmc_iters=20):
|
||||
params = np.empty((m_iters,self.p.size))
|
||||
for i in xrange(m_iters):
|
||||
# sample a stepsize from the uniform distribution
|
||||
stepsize = np.exp(np.random.rand()*(self.stepsize_range[1]-self.stepsize_range[0])+self.stepsize_range[0])
|
||||
self.p[:] = np.random.multivariate_normal(np.zeros(self.p.size),self.M)
|
||||
H_old = self._computeH()
|
||||
params[i] = self.model.unfixed_param_array
|
||||
theta_old = self.model.optimizer_array.copy()
|
||||
#Metropolis accept/reject step
|
||||
self._update(hmc_iters, stepsize)
|
||||
H_new = self._computeH()
|
||||
|
||||
if H_old>H_new:
|
||||
k = 1.
|
||||
else:
|
||||
k = np.exp(H_old-H_new)
|
||||
if np.random.rand()<k:
|
||||
params[i] = self.model.unfixed_param_array
|
||||
else:
|
||||
self.model.optimizer_array = theta_old
|
||||
return params
|
||||
|
||||
def _update(self, hmc_iters, stepsize):
|
||||
theta_buf = np.empty((2*hmc_iters+1,self.model.optimizer_array.size))
|
||||
p_buf = np.empty((2*hmc_iters+1,self.p.size))
|
||||
H_buf = np.empty((2*hmc_iters+1,))
|
||||
# Set initial position
|
||||
theta_buf[hmc_iters] = self.model.optimizer_array
|
||||
p_buf[hmc_iters] = self.p
|
||||
H_buf[hmc_iters] = self._computeH()
|
||||
|
||||
reversal = []
|
||||
pos = 1
|
||||
i=0
|
||||
while i<hmc_iters:
|
||||
self.p[:] += -stepsize/2.*self.model._transform_gradients(self.model.objective_function_gradients())
|
||||
self.model.optimizer_array = self.model.optimizer_array + stepsize*np.dot(self.Minv, self.p)
|
||||
self.p[:] += -stepsize/2.*self.model._transform_gradients(self.model.objective_function_gradients())
|
||||
|
||||
theta_buf[hmc_iters+pos] = self.model.optimizer_array
|
||||
p_buf[hmc_iters+pos] = self.p
|
||||
H_buf[hmc_iters+pos] = self._computeH()
|
||||
i+=1
|
||||
|
||||
if i<self.groupsize:
|
||||
pos += 1
|
||||
continue
|
||||
else:
|
||||
if len(reversal)==0:
|
||||
Hlist = range(hmc_iters+pos,hmc_iters+pos-self.groupsize,-1)
|
||||
if self._testH(H_buf[Hlist]):
|
||||
pos += 1
|
||||
else:
|
||||
# Reverse the trajectory for the 1st time
|
||||
reversal.append(pos)
|
||||
if hmc_iters-i>pos:
|
||||
pos = -1
|
||||
i += pos
|
||||
self.model.optimizer_array = theta_buf[hmc_iters]
|
||||
self.p[:] = -p_buf[hmc_iters]
|
||||
else:
|
||||
pos_new = pos-hmc_iters+i
|
||||
self.model.optimizer_array = theta_buf[hmc_iters+pos_new]
|
||||
self.p[:] = -p_buf[hmc_iters+pos_new]
|
||||
break
|
||||
else:
|
||||
Hlist = range(hmc_iters+pos,hmc_iters+pos+self.groupsize)
|
||||
|
||||
if self._testH(H_buf[Hlist]):
|
||||
pos += -1
|
||||
else:
|
||||
# Reverse the trajectory for the 2nd time
|
||||
r = (hmc_iters - i)%((reversal[0]-pos)*2)
|
||||
if r>(reversal[0]-pos):
|
||||
pos_new = 2*reversal[0] - r - pos
|
||||
else:
|
||||
pos_new = pos + r
|
||||
self.model.optimizer_array = theta_buf[hmc_iters+pos_new]
|
||||
self.p[:] = p_buf[hmc_iters+pos_new] # the sign of momentum might be wrong!
|
||||
break
|
||||
|
||||
def _testH(self, Hlist):
|
||||
Hstd = np.std(Hlist)
|
||||
if Hstd<self.Hstd_th[0] or Hstd>self.Hstd_th[1]:
|
||||
return False
|
||||
else:
|
||||
return True
|
||||
|
||||
def _computeH(self,):
|
||||
return self.model.objective_function()+self.p.size*np.log(2*np.pi)/2.+np.log(np.linalg.det(self.M))/2.+np.dot(self.p, np.dot(self.Minv,self.p[:,None]))/2.
|
||||
|
||||
|
|
@ -1,10 +1,9 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# ## Copyright (c) 2014, Zhenwen Dai
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
from scipy import linalg, optimize
|
||||
import pylab as pb
|
||||
import Tango
|
||||
import sys
|
||||
import re
|
||||
|
|
@ -80,6 +79,3 @@ class Metropolis_Hastings:
|
|||
fs.append(function(*args))
|
||||
self.model._set_params(param)# reset model to starting state
|
||||
return fs
|
||||
|
||||
|
||||
|
||||
2
GPy/inference/optimization/__init__.py
Normal file
@ -0,0 +1,2 @@
from scg import SCG
|
||||
from optimization import *
|
||||
|
|
@ -1,9 +1,7 @@
|
|||
'''
|
||||
Created on 24 Apr 2013
|
||||
# Copyright (c) 2012-2014, Max Zwiessele
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
@author: maxz
|
||||
'''
|
||||
from GPy.inference.gradient_descent_update_rules import FletcherReeves, \
|
||||
from gradient_descent_update_rules import FletcherReeves, \
|
||||
PolakRibiere
|
||||
from Queue import Empty
|
||||
from multiprocessing import Value
|
||||
|
|
@ -1,8 +1,6 @@
|
|||
'''
|
||||
Created on 24 Apr 2013
|
||||
# Copyright (c) 2012-2014, Max Zwiessele
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
@author: maxz
|
||||
'''
|
||||
import numpy
|
||||
|
||||
class GDUpdateRule():
|
||||
|
|
@ -1,7 +1,6 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import pylab as pb
|
||||
import datetime as dt
|
||||
from scipy import optimize
|
||||
from warnings import warn
|
||||
|
|
@ -57,13 +56,14 @@ class Optimizer():
|
|||
raise NotImplementedError, "this needs to be implemented to use the optimizer class"
|
||||
|
||||
def plot(self):
|
||||
if self.trace == None:
|
||||
print "No trace present so I can't plot it. Please check that the optimizer actually supplies a trace."
|
||||
else:
|
||||
pb.figure()
|
||||
pb.plot(self.trace)
|
||||
pb.xlabel('Iteration')
|
||||
pb.ylabel('f(x)')
|
||||
"""
|
||||
See GPy.plotting.matplot_dep.inference_plots
|
||||
"""
|
||||
import sys
|
||||
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
|
||||
from ...plotting.matplot_dep import inference_plots
|
||||
inference_plots.plot_optimizer(self)
|
||||
|
||||
|
||||
def __str__(self):
|
||||
diagnostics = "Optimizer: \t\t\t\t %s\n" % self.opt_name
|
||||
|
|
@ -118,7 +118,7 @@ class opt_lbfgsb(Optimizer):
|
|||
assert f_fp != None, "BFGS requires f_fp"
|
||||
|
||||
if self.messages:
|
||||
iprint = 0
|
||||
iprint = 1
|
||||
else:
|
||||
iprint = -1
|
||||
|
||||
|
|
@ -126,29 +126,18 @@ class opt_lbfgsb(Optimizer):
|
|||
if self.xtol is not None:
|
||||
print "WARNING: l-bfgs-b doesn't have an xtol arg, so I'm going to ignore it"
|
||||
if self.ftol is not None:
|
||||
opt_dict['ftol'] = self.ftol
|
||||
# print "WARNING: l-bfgs-b doesn't have an ftol arg, so I'm going to ignore it"
|
||||
print "WARNING: l-bfgs-b doesn't have an ftol arg, so I'm going to ignore it"
|
||||
if self.gtol is not None:
|
||||
opt_dict['gtol'] = self.gtol
|
||||
opt_dict['pgtol'] = self.gtol
|
||||
if self.bfgs_factor is not None:
|
||||
opt_dict['factr'] = self.bfgs_factor
|
||||
opt_dict['iprint'] = iprint
|
||||
opt_dict['maxiter'] = self.max_iters
|
||||
opt_dict['disp'] = self.messages
|
||||
#dict(maxiter=self.max_iters, disp=self.messages, iprint=iprint, ftol=self.ftol, gtol=self.gtol)
|
||||
|
||||
opt_result = optimize.minimize(f_fp, self.x_init, method='L-BFGS-B', jac=True, options=opt_dict)
|
||||
#opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint=iprint,
|
||||
# maxfun=self.max_iters, **opt_dict)
|
||||
#self.x_opt = opt_result[0]
|
||||
#self.f_opt = f_fp(self.x_opt)[0]
|
||||
#self.funct_eval = opt_result[2]['funcalls']
|
||||
#self.status = rcstrings[opt_result[2]['warnflag']]
|
||||
self.x_opt = opt_result.x
|
||||
self.status = opt_result.success
|
||||
self.funct_eval = opt_result.nfev
|
||||
self.f_opt = opt_result.fun
|
||||
self.opt_result = opt_result
|
||||
opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint=iprint,
|
||||
maxfun=self.max_iters, **opt_dict)
|
||||
self.x_opt = opt_result[0]
|
||||
self.f_opt = f_fp(self.x_opt)[0]
|
||||
self.funct_eval = opt_result[2]['funcalls']
|
||||
self.status = rcstrings[opt_result[2]['warnflag']]
|
||||
|
||||
class opt_simplex(Optimizer):
|
||||
def __init__(self, *args, **kwargs):
|
||||
|
|
@ -236,13 +225,11 @@ class opt_SCG(Optimizer):
|
|||
self.status = opt_result[3]
|
||||
|
||||
def get_optimizer(f_min):
|
||||
from sgd import opt_SGD
|
||||
|
||||
optimizers = {'fmin_tnc': opt_tnc,
|
||||
'simplex': opt_simplex,
|
||||
'lbfgsb': opt_lbfgsb,
|
||||
'scg': opt_SCG,
|
||||
'sgd': opt_SGD}
|
||||
'scg': opt_SCG}
|
||||
|
||||
if rasm_available:
|
||||
optimizers['rasmussen'] = opt_rasm
|
||||
|
|
@ -28,11 +28,11 @@ import sys
|
|||
|
||||
def print_out(len_maxiters, fnow, current_grad, beta, iteration):
|
||||
print '\r',
|
||||
print '{0:>0{mi}g} {1:> 12e} {2:> 12e} {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r',
|
||||
print '{0:>0{mi}g} {1:> 12e} {2:< 12.6e} {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r',
|
||||
sys.stdout.flush()
|
||||
|
||||
def exponents(fnow, current_grad):
|
||||
exps = [np.abs(fnow), current_grad]
|
||||
exps = [np.abs(np.float(fnow)), current_grad]
|
||||
return np.sign(exps) * np.log10(exps).astype(int)
|
||||
|
||||
def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True, xtol=None, ftol=None, gtol=None):
|
||||
|
|
@ -56,13 +56,13 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
|
|||
if gtol is None:
|
||||
gtol = 1e-5
|
||||
|
||||
sigma0 = 1.0e-8
|
||||
sigma0 = 1.0e-7
|
||||
fold = f(x, *optargs) # Initial function value.
|
||||
function_eval = 1
|
||||
fnow = fold
|
||||
gradnew = gradf(x, *optargs) # Initial gradient.
|
||||
if any(np.isnan(gradnew)):
|
||||
raise UnexpectedInfOrNan, "Gradient contribution resulted in a NaN value"
|
||||
#if any(np.isnan(gradnew)):
|
||||
# raise UnexpectedInfOrNan, "Gradient contribution resulted in a NaN value"
|
||||
current_grad = np.dot(gradnew, gradnew)
|
||||
gradold = gradnew.copy()
|
||||
d = -gradnew # Initial search direction.
|
||||
|
|
@ -168,13 +168,13 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
|
|||
if Delta < 0.25:
|
||||
beta = min(4.0 * beta, betamax)
|
||||
if Delta > 0.75:
|
||||
beta = max(0.5 * beta, betamin)
|
||||
beta = max(0.25 * beta, betamin)
|
||||
|
||||
# Update search direction using Polak-Ribiere formula, or re-start
|
||||
# in direction of negative gradient after nparams steps.
|
||||
if nsuccess == x.size:
|
||||
d = -gradnew
|
||||
# beta = 1. # TODO: betareset!!
|
||||
beta = 1. # This is not in the original paper
|
||||
nsuccess = 0
|
||||
elif success:
|
||||
Gamma = np.dot(gradold - gradnew, gradnew) / (mu)
|
||||
56
GPy/inference/optimization/stochastics.py
Normal file
@ -0,0 +1,56 @@
# Copyright (c) 2012-2014, Max Zwiessele
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
class StochasticStorage(object):
|
||||
'''
|
||||
This is a container for holding the stochastic parameters,
|
||||
such as subset indices or step length and so on.
|
||||
'''
|
||||
def __init__(self, model):
|
||||
"""
|
||||
Initialize this stochastic container using the given model
|
||||
"""
|
||||
|
||||
def do_stochastics(self):
|
||||
"""
|
||||
Update the internal state to the next batch of the stochastic
|
||||
descent algorithm.
|
||||
"""
|
||||
pass
|
||||
|
||||
def reset(self):
|
||||
"""
|
||||
Reset the state of this stochastics generator.
|
||||
"""
|
||||
|
||||
class SparseGPMissing(StochasticStorage):
|
||||
def __init__(self, model, batchsize=1):
|
||||
"""
|
||||
Here we want to loop over all dimensions every time.
|
||||
Thus, we can just make sure the loop goes over self.d every
|
||||
time.
|
||||
"""
|
||||
self.d = xrange(model.Y_normalized.shape[1])
|
||||
|
||||
class SparseGPStochastics(StochasticStorage):
|
||||
"""
|
||||
For the sparse gp we need to store the dimension we are in,
|
||||
and the indices corresponding to those
|
||||
"""
|
||||
def __init__(self, model, batchsize=1):
|
||||
self.batchsize = batchsize
|
||||
self.output_dim = model.Y.shape[1]
|
||||
self.reset()
|
||||
self.do_stochastics()
|
||||
|
||||
def do_stochastics(self):
|
||||
if self.batchsize == 1:
|
||||
self.current_dim = (self.current_dim+1)%self.output_dim
|
||||
self.d = [self.current_dim]
|
||||
else:
|
||||
import numpy as np
|
||||
self.d = np.random.choice(self.output_dim, size=self.batchsize, replace=False)
|
||||
|
||||
def reset(self):
|
||||
self.current_dim = -1
|
||||
self.d = None
|
||||
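The cycling behaviour of SparseGPStochastics above (batchsize 1 walks through the output dimensions one at a time; larger batches sample without replacement) can be seen with a small stand-in object; ToyStochastics is invented for this sketch and is not the GPy class itself.

import numpy as np

class ToyStochastics(object):
    # mimics SparseGPStochastics for batchsize == 1
    def __init__(self, output_dim):
        self.output_dim = output_dim
        self.current_dim = -1
    def do_stochastics(self):
        self.current_dim = (self.current_dim + 1) % self.output_dim
        self.d = [self.current_dim]

s = ToyStochastics(output_dim=3)
for _ in range(5):
    s.do_stochastics()
    print(s.d)      # [0], [1], [2], [0], [1]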
|
|
@ -1,355 +0,0 @@
|
|||
import numpy as np
|
||||
import scipy as sp
|
||||
import scipy.sparse
|
||||
from optimization import Optimizer
|
||||
from scipy import linalg, optimize
|
||||
import pylab as plt
|
||||
import copy, sys, pickle
|
||||
|
||||
class opt_SGD(Optimizer):
|
||||
"""
|
||||
Optimize using stochastic gradient descent.
|
||||
|
||||
:param Model: reference to the Model object
|
||||
:param iterations: number of iterations
|
||||
:param learning_rate: learning rate
|
||||
:param momentum: momentum
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, start, iterations = 10, learning_rate = 1e-4, momentum = 0.9, model = None, messages = False, batch_size = 1, self_paced = False, center = True, iteration_file = None, learning_rate_adaptation=None, actual_iter=None, schedule=None, **kwargs):
|
||||
self.opt_name = "Stochastic Gradient Descent"
|
||||
|
||||
self.Model = model
|
||||
self.iterations = iterations
|
||||
self.momentum = momentum
|
||||
self.learning_rate = learning_rate
|
||||
self.x_opt = None
|
||||
self.f_opt = None
|
||||
self.messages = messages
|
||||
self.batch_size = batch_size
|
||||
self.self_paced = self_paced
|
||||
self.center = center
|
||||
self.param_traces = [('noise',[])]
|
||||
self.iteration_file = iteration_file
|
||||
self.learning_rate_adaptation = learning_rate_adaptation
|
||||
self.actual_iter = actual_iter
|
||||
if self.learning_rate_adaptation != None:
|
||||
if self.learning_rate_adaptation == 'annealing':
|
||||
self.learning_rate_0 = self.learning_rate
|
||||
else:
|
||||
self.learning_rate_0 = self.learning_rate.mean()
|
||||
|
||||
self.schedule = schedule
|
||||
# if len([p for p in self.model.kern.parts if p.name == 'bias']) == 1:
|
||||
# self.param_traces.append(('bias',[]))
|
||||
# if len([p for p in self.model.kern.parts if p.name == 'linear']) == 1:
|
||||
# self.param_traces.append(('linear',[]))
|
||||
# if len([p for p in self.model.kern.parts if p.name == 'rbf']) == 1:
|
||||
# self.param_traces.append(('rbf_var',[]))
|
||||
|
||||
self.param_traces = dict(self.param_traces)
|
||||
self.fopt_trace = []
|
||||
|
||||
num_params = len(self.Model._get_params())
|
||||
if isinstance(self.learning_rate, float):
|
||||
self.learning_rate = np.ones((num_params,)) * self.learning_rate
|
||||
|
||||
assert (len(self.learning_rate) == num_params), "there must be one learning rate per parameter"
|
||||
|
||||
def __str__(self):
|
||||
status = "\nOptimizer: \t\t\t %s\n" % self.opt_name
|
||||
status += "f(x_opt): \t\t\t %.4f\n" % self.f_opt
|
||||
status += "Number of iterations: \t\t %d\n" % self.iterations
|
||||
status += "Learning rate: \t\t\t max %.3f, min %.3f\n" % (self.learning_rate.max(), self.learning_rate.min())
|
||||
status += "Momentum: \t\t\t %.3f\n" % self.momentum
|
||||
status += "Batch size: \t\t\t %d\n" % self.batch_size
|
||||
status += "Time elapsed: \t\t\t %s\n" % self.time
|
||||
return status
|
||||
|
||||
def plot_traces(self):
|
||||
plt.figure()
|
||||
plt.subplot(211)
|
||||
plt.title('Parameters')
|
||||
for k in self.param_traces.keys():
|
||||
plt.plot(self.param_traces[k], label=k)
|
||||
plt.legend(loc=0)
|
||||
plt.subplot(212)
|
||||
plt.title('Objective function')
|
||||
plt.plot(self.fopt_trace)
|
||||
|
||||
|
||||
def non_null_samples(self, data):
|
||||
return (np.isnan(data).sum(axis=1) == 0)
|
||||
|
||||
def check_for_missing(self, data):
|
||||
if sp.sparse.issparse(self.Model.likelihood.Y):
|
||||
return True
|
||||
else:
|
||||
return np.isnan(data).sum() > 0
|
||||
|
||||
def subset_parameter_vector(self, x, samples, param_shapes):
|
||||
subset = np.array([], dtype = int)
|
||||
x = np.arange(0, len(x))
|
||||
i = 0
|
||||
|
||||
for s in param_shapes:
|
||||
N, input_dim = s
|
||||
X = x[i:i+N*input_dim].reshape(N, input_dim)
|
||||
X = X[samples]
|
||||
subset = np.append(subset, X.flatten())
|
||||
i += N*input_dim
|
||||
|
||||
subset = np.append(subset, x[i:])
|
||||
|
||||
return subset
|
||||
|
||||
def shift_constraints(self, j):
|
||||
|
||||
constrained_indices = copy.deepcopy(self.Model.constrained_indices)
|
||||
|
||||
for c, constraint in enumerate(constrained_indices):
|
||||
mask = (np.ones_like(constrained_indices[c]) == 1)
|
||||
for i in range(len(constrained_indices[c])):
|
||||
pos = np.where(j == constrained_indices[c][i])[0]
|
||||
if len(pos) == 1:
|
||||
self.Model.constrained_indices[c][i] = pos
|
||||
else:
|
||||
mask[i] = False
|
||||
|
||||
self.Model.constrained_indices[c] = self.Model.constrained_indices[c][mask]
|
||||
return constrained_indices
|
||||
# back them up
|
||||
# bounded_i = copy.deepcopy(self.Model.constrained_bounded_indices)
|
||||
# bounded_l = copy.deepcopy(self.Model.constrained_bounded_lowers)
|
||||
# bounded_u = copy.deepcopy(self.Model.constrained_bounded_uppers)
|
||||
|
||||
# for b in range(len(bounded_i)): # for each group of constraints
|
||||
# for bc in range(len(bounded_i[b])):
|
||||
# pos = np.where(j == bounded_i[b][bc])[0]
|
||||
# if len(pos) == 1:
|
||||
# pos2 = np.where(self.Model.constrained_bounded_indices[b] == bounded_i[b][bc])[0][0]
|
||||
# self.Model.constrained_bounded_indices[b][pos2] = pos[0]
|
||||
# else:
|
||||
# if len(self.Model.constrained_bounded_indices[b]) == 1:
|
||||
# # if it's the last index to be removed
|
||||
# # the logic here is just a mess. If we remove the last one, then all the
|
||||
# # b-indices change and we have to iterate through everything to find our
|
||||
# # current index. Can't deal with this right now.
|
||||
# raise NotImplementedError
|
||||
|
||||
# else: # just remove it from the indices
|
||||
# mask = self.Model.constrained_bounded_indices[b] != bc
|
||||
# self.Model.constrained_bounded_indices[b] = self.Model.constrained_bounded_indices[b][mask]
|
||||
|
||||
|
||||
# # here we shift the positive constraints. We cycle through each positive
|
||||
# # constraint
|
||||
# positive = self.Model.constrained_positive_indices.copy()
|
||||
# mask = (np.ones_like(positive) == 1)
|
||||
# for p in range(len(positive)):
|
||||
# # we now check whether the constrained index appears in the j vector
|
||||
# # (the vector of the "active" indices)
|
||||
# pos = np.where(j == self.Model.constrained_positive_indices[p])[0]
|
||||
# if len(pos) == 1:
|
||||
# self.Model.constrained_positive_indices[p] = pos
|
||||
# else:
|
||||
# mask[p] = False
|
||||
# self.Model.constrained_positive_indices = self.Model.constrained_positive_indices[mask]
|
||||
|
||||
# return (bounded_i, bounded_l, bounded_u), positive
|
||||
|
||||
def restore_constraints(self, c):#b, p):
|
||||
# self.Model.constrained_bounded_indices = b[0]
|
||||
# self.Model.constrained_bounded_lowers = b[1]
|
||||
# self.Model.constrained_bounded_uppers = b[2]
|
||||
# self.Model.constrained_positive_indices = p
|
||||
self.Model.constrained_indices = c
|
||||
|
||||
def get_param_shapes(self, N = None, input_dim = None):
|
||||
model_name = self.Model.__class__.__name__
|
||||
if model_name == 'GPLVM':
|
||||
return [(N, input_dim)]
|
||||
if model_name == 'Bayesian_GPLVM':
|
||||
return [(N, input_dim), (N, input_dim)]
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
def step_with_missing_data(self, f_fp, X, step, shapes):
|
||||
N, input_dim = X.shape
|
||||
|
||||
if not sp.sparse.issparse(self.Model.likelihood.Y):
|
||||
Y = self.Model.likelihood.Y
|
||||
samples = self.non_null_samples(self.Model.likelihood.Y)
|
||||
self.Model.N = samples.sum()
|
||||
Y = Y[samples]
|
||||
else:
|
||||
samples = self.Model.likelihood.Y.nonzero()[0]
|
||||
self.Model.N = len(samples)
|
||||
Y = np.asarray(self.Model.likelihood.Y[samples].todense(), dtype = np.float64)
|
||||
|
||||
if self.Model.N == 0 or Y.std() == 0.0:
|
||||
return 0, step, self.Model.N
|
||||
|
||||
self.Model.likelihood._offset = Y.mean()
|
||||
self.Model.likelihood._scale = Y.std()
|
||||
self.Model.likelihood.set_data(Y)
|
||||
# self.Model.likelihood.V = self.Model.likelihood.Y*self.Model.likelihood.precision
|
||||
|
||||
sigma = self.Model.likelihood._variance
|
||||
self.Model.likelihood._variance = None # invalidate cache
|
||||
self.Model.likelihood._set_params(sigma)
|
||||
|
||||
|
||||
j = self.subset_parameter_vector(self.x_opt, samples, shapes)
|
||||
self.Model.X = X[samples]
|
||||
|
||||
model_name = self.Model.__class__.__name__
|
||||
|
||||
if model_name == 'Bayesian_GPLVM':
|
||||
self.Model.likelihood.YYT = np.dot(self.Model.likelihood.Y, self.Model.likelihood.Y.T)
|
||||
self.Model.likelihood.trYYT = np.trace(self.Model.likelihood.YYT)
|
||||
|
||||
ci = self.shift_constraints(j)
|
||||
f, fp = f_fp(self.x_opt[j])
|
||||
|
||||
step[j] = self.momentum * step[j] + self.learning_rate[j] * fp
|
||||
self.x_opt[j] -= step[j]
|
||||
self.restore_constraints(ci)
|
||||
|
||||
self.Model.grads[j] = fp
|
||||
# restore likelihood _offset and _scale, otherwise when we call set_data(y) on
|
||||
# the next feature, it will get normalized with the mean and std of this one.
|
||||
self.Model.likelihood._offset = 0
|
||||
self.Model.likelihood._scale = 1
|
||||
|
||||
return f, step, self.Model.N
|
||||
|
||||
def adapt_learning_rate(self, t, D):
|
||||
if self.learning_rate_adaptation == 'adagrad':
|
||||
if t > 0:
|
||||
g_k = self.Model.grads
|
||||
self.s_k += np.square(g_k)
|
||||
t0 = 100.0
|
||||
self.learning_rate = 0.1/(t0 + np.sqrt(self.s_k))
|
||||
|
||||
import pdb; pdb.set_trace()
|
||||
else:
|
||||
self.learning_rate = np.zeros_like(self.learning_rate)
|
||||
self.s_k = np.zeros_like(self.x_opt)
|
||||
|
||||
elif self.learning_rate_adaptation == 'annealing':
|
||||
#self.learning_rate = self.learning_rate_0/(1+float(t+1)/10)
|
||||
self.learning_rate = np.ones_like(self.learning_rate) * self.schedule[t]
|
||||
|
||||
|
||||
elif self.learning_rate_adaptation == 'semi_pesky':
|
||||
if self.Model.__class__.__name__ == 'Bayesian_GPLVM':
|
||||
g_t = self.Model.grads
|
||||
if t == 0:
|
||||
self.hbar_t = 0.0
|
||||
self.tau_t = 100.0
|
||||
self.gbar_t = 0.0
|
||||
|
||||
self.gbar_t = (1-1/self.tau_t)*self.gbar_t + 1/self.tau_t * g_t
|
||||
self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t.T, g_t)
|
||||
self.learning_rate = np.ones_like(self.learning_rate)*(np.dot(self.gbar_t.T, self.gbar_t) / self.hbar_t)
|
||||
tau_t = self.tau_t*(1-self.learning_rate) + 1
|
||||
|
||||
|
||||
def opt(self, f_fp=None, f=None, fp=None):
|
||||
self.x_opt = self.Model._get_params_transformed()
|
||||
self.grads = []
|
||||
|
||||
X, Y = self.Model.X.copy(), self.Model.likelihood.Y.copy()
|
||||
|
||||
self.Model.likelihood.YYT = 0
|
||||
self.Model.likelihood.trYYT = 0
|
||||
self.Model.likelihood._offset = 0.0
|
||||
self.Model.likelihood._scale = 1.0
|
||||
|
||||
N, input_dim = self.Model.X.shape
|
||||
D = self.Model.likelihood.Y.shape[1]
|
||||
num_params = self.Model._get_params()
|
||||
self.trace = []
|
||||
missing_data = self.check_for_missing(self.Model.likelihood.Y)
|
||||
|
||||
step = np.zeros_like(num_params)
|
||||
for it in range(self.iterations):
|
||||
if self.actual_iter != None:
|
||||
it = self.actual_iter
|
||||
|
||||
self.Model.grads = np.zeros_like(self.x_opt) # TODO this is ugly
|
||||
|
||||
if it == 0 or self.self_paced is False:
|
||||
features = np.random.permutation(Y.shape[1])
|
||||
else:
|
||||
features = np.argsort(NLL)
|
||||
|
||||
b = len(features)/self.batch_size
|
||||
features = [features[i::b] for i in range(b)]
|
||||
NLL = []
|
||||
import pylab as plt
|
||||
for count, j in enumerate(features):
|
||||
self.Model.input_dim = len(j)
|
||||
self.Model.likelihood.input_dim = len(j)
|
||||
self.Model.likelihood.set_data(Y[:, j])
|
||||
# self.Model.likelihood.V = self.Model.likelihood.Y*self.Model.likelihood.precision
|
||||
|
||||
sigma = self.Model.likelihood._variance
|
||||
self.Model.likelihood._variance = None # invalidate cache
|
||||
self.Model.likelihood._set_params(sigma)
|
||||
|
||||
if missing_data:
|
||||
shapes = self.get_param_shapes(N, input_dim)
|
||||
f, step, Nj = self.step_with_missing_data(f_fp, X, step, shapes)
|
||||
else:
|
||||
self.Model.likelihood.YYT = np.dot(self.Model.likelihood.Y, self.Model.likelihood.Y.T)
|
||||
self.Model.likelihood.trYYT = np.trace(self.Model.likelihood.YYT)
|
||||
Nj = N
|
||||
f, fp = f_fp(self.x_opt)
|
||||
self.Model.grads = fp.copy()
|
||||
step = self.momentum * step + self.learning_rate * fp
|
||||
self.x_opt -= step
|
||||
|
||||
if self.messages == 2:
|
||||
noise = self.Model.likelihood._variance
|
||||
status = "evaluating {feature: 5d}/{tot: 5d} \t f: {f: 2.3f} \t non-missing: {nm: 4d}\t noise: {noise: 2.4f}\r".format(feature = count, tot = len(features), f = f, nm = Nj, noise = noise)
|
||||
sys.stdout.write(status)
|
||||
sys.stdout.flush()
|
||||
self.param_traces['noise'].append(noise)
|
||||
|
||||
self.adapt_learning_rate(it+count, D)
|
||||
NLL.append(f)
|
||||
self.fopt_trace.append(NLL[-1])
|
||||
# fig = plt.figure('traces')
|
||||
# plt.clf()
|
||||
# plt.plot(self.param_traces['noise'])
|
||||
|
||||
# for k in self.param_traces.keys():
|
||||
# self.param_traces[k].append(self.Model.get(k)[0])
|
||||
self.grads.append(self.Model.grads.tolist())
|
||||
# should really be a sum(), but earlier samples in the iteration will have a very crappy ll
|
||||
self.f_opt = np.mean(NLL)
|
||||
self.Model.N = N
|
||||
self.Model.X = X
|
||||
self.Model.input_dim = D
|
||||
self.Model.likelihood.N = N
|
||||
self.Model.likelihood.input_dim = D
|
||||
self.Model.likelihood.Y = Y
|
||||
sigma = self.Model.likelihood._variance
|
||||
self.Model.likelihood._variance = None # invalidate cache
|
||||
self.Model.likelihood._set_params(sigma)
|
||||
|
||||
self.trace.append(self.f_opt)
|
||||
if self.iteration_file is not None:
|
||||
f = open(self.iteration_file + "iteration%d.pickle" % it, 'w')
|
||||
data = [self.x_opt, self.fopt_trace, self.param_traces]
|
||||
pickle.dump(data, f)
|
||||
f.close()
|
||||
|
||||
if self.messages != 0:
|
||||
sys.stdout.write('\r' + ' '*len(status)*2 + ' \r')
|
||||
status = "SGD Iteration: {0: 3d}/{1: 3d} f: {2: 2.3f} max eta: {3: 1.5f}\n".format(it+1, self.iterations, self.f_opt, self.learning_rate.max())
|
||||
sys.stdout.write(status)
|
||||
sys.stdout.flush()
|
||||
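The removed optimizer's core update, isolated as a sketch for clarity (the function name is illustrative; x, grad and step are flat numpy arrays, learning_rate a scalar or per-parameter array):

import numpy as np

def sgd_momentum_step(x, grad, step, learning_rate, momentum=0.9):
    # step carries the exponentially-weighted history of past gradients.
    step = momentum * step + learning_rate * grad
    return x - step, step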
2
GPy/installation.cfg
Normal file

@ -0,0 +1,2 @@
# This is the local installation configuration file for GPy
@ -1,9 +1,19 @@
# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from _src.kern import Kern
from _src.rbf import RBF
from _src.linear import Linear, LinearFull
from _src.static import Bias, White
from _src.brownian import Brownian
from _src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine
from _src.mlp import MLP
from _src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52
from _src.independent_outputs import IndependentOutputs, Hierarchical
from _src.coregionalize import Coregionalize
from _src.ODE_UY import ODE_UY
from _src.ODE_UYC import ODE_UYC
from _src.ODE_st import ODE_st
from _src.ODE_t import ODE_t
from _src.poly import Poly

from _src.trunclinear import TruncLinear,TruncLinear_inf
from _src.splitKern import SplitKern,DiffGenomeKern

from constructors import *
try:
    from constructors import rbf_sympy, sympykern # these depend on sympy
except:
    pass
from kern import *
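With the imports above, the kernels are exposed directly on GPy.kern and combine by addition and multiplication; for example (illustrative, not part of the diff):

import GPy
k = GPy.kern.RBF(input_dim=1, variance=1., lengthscale=2.) + GPy.kern.Bias(1)
print k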
282
GPy/kern/_src/ODE_UY.py
Normal file
@ -0,0 +1,282 @@
|
|||
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from kern import Kern
|
||||
from ...core.parameterization import Param
|
||||
from ...core.parameterization.transformations import Logexp
|
||||
import numpy as np
|
||||
from independent_outputs import index_to_slices
|
||||
|
||||
class ODE_UY(Kern):
|
||||
def __init__(self, input_dim, variance_U=3., variance_Y=1., lengthscale_U=1., lengthscale_Y=1., active_dims=None, name='ode_uy'):
|
||||
assert input_dim ==2, "only defined for 2 input dims"
|
||||
super(ODE_UY, self).__init__(input_dim, active_dims, name)
|
||||
|
||||
self.variance_Y = Param('variance_Y', variance_Y, Logexp())
|
||||
self.variance_U = Param('variance_U', variance_U, Logexp())
|
||||
self.lengthscale_Y = Param('lengthscale_Y', lengthscale_Y, Logexp())
|
||||
self.lengthscale_U = Param('lengthscale_U', lengthscale_U, Logexp())
|
||||
|
||||
self.link_parameters(self.variance_Y, self.variance_U, self.lengthscale_Y, self.lengthscale_U)
|
||||
|
||||
def K(self, X, X2=None):
|
||||
# model : a * dy/dt + b * y = U
|
||||
#lu=sqrt(3)/theta1 ly=1/theta2 theta2= a/b :thetay sigma2=1/(2ab) :sigmay
|
||||
|
||||
X,slices = X[:,:-1],index_to_slices(X[:,-1])
|
||||
if X2 is None:
|
||||
X2,slices2 = X,slices
|
||||
K = np.zeros((X.shape[0], X.shape[0]))
|
||||
else:
|
||||
X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
|
||||
K = np.zeros((X.shape[0], X2.shape[0]))
|
||||
|
||||
|
||||
#rdist = X[:,0][:,None] - X2[:,0][:,None].T
|
||||
rdist = X - X2.T
|
||||
ly=1/self.lengthscale_Y
|
||||
lu=np.sqrt(3)/self.lengthscale_U
|
||||
#iu=self.input_lengthU #dimension of U
|
||||
Vu=self.variance_U
|
||||
Vy=self.variance_Y
|
||||
#Vy=ly/2
|
||||
#stop
|
||||
|
||||
|
||||
# kernel for kuu matern3/2
|
||||
kuu = lambda dist:Vu * (1 + lu* np.abs(dist)) * np.exp(-lu * np.abs(dist))
|
||||
|
||||
# kernel for kyy
|
||||
k1 = lambda dist:np.exp(-ly*np.abs(dist))*(2*lu+ly)/(lu+ly)**2
|
||||
k2 = lambda dist:(np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
|
||||
k3 = lambda dist:np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
|
||||
kyy = lambda dist:Vu*Vy*(k1(dist) + k2(dist) + k3(dist))
|
||||
|
||||
|
||||
# cross covariance function
|
||||
kyu3 = lambda dist:np.exp(-lu*dist)/(lu+ly)*(1+lu*(dist+1/(lu+ly)))
|
||||
#kyu3 = lambda dist: 0
|
||||
|
||||
k1cros = lambda dist:np.exp(ly*dist)/(lu-ly) * ( 1- np.exp( (lu-ly)*dist) + lu* ( dist*np.exp( (lu-ly)*dist ) + (1- np.exp( (lu-ly)*dist ) ) /(lu-ly) ) )
|
||||
#k1cros = lambda dist:0
|
||||
|
||||
k2cros = lambda dist:np.exp(ly*dist)*( 1/(lu+ly) + lu/(lu+ly)**2 )
|
||||
#k2cros = lambda dist:0
|
||||
|
||||
Vyu=np.sqrt(Vy*ly*2)
|
||||
|
||||
# cross covariance kuy
|
||||
kuyp = lambda dist:Vu*Vyu*(kyu3(dist)) #t>0 kuy
|
||||
kuyn = lambda dist:Vu*Vyu*(k1cros(dist)+k2cros(dist)) #t<0 kuy
|
||||
# cross covariance kyu
|
||||
kyup = lambda dist:Vu*Vyu*(k1cros(-dist)+k2cros(-dist)) #t>0 kyu
|
||||
kyun = lambda dist:Vu*Vyu*(kyu3(-dist)) #t<0 kyu
|
||||
|
||||
|
||||
for i, s1 in enumerate(slices):
|
||||
for j, s2 in enumerate(slices2):
|
||||
for ss1 in s1:
|
||||
for ss2 in s2:
|
||||
if i==0 and j==0:
|
||||
K[ss1,ss2] = kuu(np.abs(rdist[ss1,ss2]))
|
||||
elif i==0 and j==1:
|
||||
#K[ss1,ss2]= np.where( rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[ss1,ss2]) ) )
|
||||
K[ss1,ss2]= np.where( rdist[ss1,ss2]>0 , kuyp(rdist[ss1,ss2]), kuyn(rdist[ss1,ss2] ) )
|
||||
elif i==1 and j==1:
|
||||
K[ss1,ss2] = kyy(np.abs(rdist[ss1,ss2]))
|
||||
else:
|
||||
#K[ss1,ss2]= 0
|
||||
#K[ss1,ss2]= np.where( rdist[ss1,ss2]>0 , kyup(np.abs(rdist[ss1,ss2])), kyun(np.abs(rdist[ss1,ss2]) ) )
|
||||
K[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , kyup(rdist[ss1,ss2]), kyun(rdist[ss1,ss2] ) )
|
||||
return K
|
||||
|
||||
|
||||
|
||||
def Kdiag(self, X):
|
||||
"""Compute the diagonal of the covariance matrix associated to X."""
|
||||
Kdiag = np.zeros(X.shape[0])
|
||||
ly=1/self.lengthscale_Y
|
||||
lu=np.sqrt(3)/self.lengthscale_U
|
||||
|
||||
Vu = self.variance_U
|
||||
Vy=self.variance_Y
|
||||
|
||||
k1 = (2*lu+ly)/(lu+ly)**2
|
||||
k2 = (ly-2*lu + 2*lu-ly ) / (ly-lu)**2
|
||||
k3 = 1/(lu+ly) + (lu)/(lu+ly)**2
|
||||
|
||||
slices = index_to_slices(X[:,-1])
|
||||
|
||||
for i, ss1 in enumerate(slices):
|
||||
for s1 in ss1:
|
||||
if i==0:
|
||||
Kdiag[s1]+= self.variance_U
|
||||
elif i==1:
|
||||
Kdiag[s1]+= Vu*Vy*(k1+k2+k3)
|
||||
else:
|
||||
raise ValueError, "invalid input/output index"
|
||||
#Kdiag[slices[0][0]]+= self.variance_U #matern32 diag
|
||||
#Kdiag[slices[1][0]]+= self.variance_U*self.variance_Y*(k1+k2+k3) # diag
|
||||
return Kdiag
|
||||
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
"""derivative of the covariance matrix with respect to the parameters."""
|
||||
X,slices = X[:,:-1],index_to_slices(X[:,-1])
|
||||
if X2 is None:
|
||||
X2,slices2 = X,slices
|
||||
else:
|
||||
X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
|
||||
#rdist = X[:,0][:,None] - X2[:,0][:,None].T
|
||||
|
||||
rdist = X - X2.T
|
||||
ly=1/self.lengthscale_Y
|
||||
lu=np.sqrt(3)/self.lengthscale_U
|
||||
|
||||
Vu=self.variance_U
|
||||
Vy=self.variance_Y
|
||||
Vyu = np.sqrt(Vy*ly*2)
|
||||
dVdly = 0.5/np.sqrt(ly)*np.sqrt(2*Vy)
|
||||
dVdVy = 0.5/np.sqrt(Vy)*np.sqrt(2*ly)
|
||||
|
||||
rd=rdist.shape
|
||||
dktheta1 = np.zeros(rd)
|
||||
dktheta2 = np.zeros(rd)
|
||||
dkUdvar = np.zeros(rd)
|
||||
dkYdvar = np.zeros(rd)
|
||||
|
||||
# dk dtheta for UU
|
||||
UUdtheta1 = lambda dist: np.exp(-lu* dist)*dist + (-dist)*np.exp(-lu* dist)*(1+lu*dist)
|
||||
UUdtheta2 = lambda dist: 0
|
||||
#UUdvar = lambda dist: (1 + lu*dist)*np.exp(-lu*dist)
|
||||
UUdvar = lambda dist: (1 + lu* np.abs(dist)) * np.exp(-lu * np.abs(dist))
|
||||
|
||||
# dk dtheta for YY
|
||||
|
||||
dk1theta1 = lambda dist: np.exp(-ly*dist)*2*(-lu)/(lu+ly)**3
|
||||
|
||||
dk2theta1 = lambda dist: (1.0)*(
|
||||
np.exp(-lu*dist)*dist*(-ly+2*lu-lu*ly*dist+dist*lu**2)*(ly-lu)**(-2) + np.exp(-lu*dist)*(-2+ly*dist-2*dist*lu)*(ly-lu)**(-2)
|
||||
+np.exp(-dist*lu)*(ly-2*lu+ly*lu*dist-dist*lu**2)*2*(ly-lu)**(-3)
|
||||
+np.exp(-dist*ly)*2*(ly-lu)**(-2)
|
||||
+np.exp(-dist*ly)*2*(2*lu-ly)*(ly-lu)**(-3)
|
||||
)
|
||||
|
||||
dk3theta1 = lambda dist: np.exp(-dist*lu)*(lu+ly)**(-2)*((2*lu+ly+dist*lu**2+lu*ly*dist)*(-dist-2/(lu+ly))+2+2*lu*dist+ly*dist)
|
||||
|
||||
#dktheta1 = lambda dist: self.variance_U*self.variance_Y*(dk1theta1+dk2theta1+dk3theta1)
|
||||
|
||||
|
||||
|
||||
|
||||
dk1theta2 = lambda dist: np.exp(-ly*dist) * ((lu+ly)**(-2)) * ( (-dist)*(2*lu+ly) + 1 + (-2)*(2*lu+ly)/(lu+ly) )
|
||||
|
||||
dk2theta2 =lambda dist: 1*(
|
||||
np.exp(-dist*lu)*(ly-lu)**(-2) * ( 1+lu*dist+(-2)*(ly-2*lu+lu*ly*dist-dist*lu**2)*(ly-lu)**(-1) )
|
||||
+np.exp(-dist*ly)*(ly-lu)**(-2) * ( (-dist)*(2*lu-ly) -1+(2*lu-ly)*(-2)*(ly-lu)**(-1) )
|
||||
)
|
||||
|
||||
dk3theta2 = lambda dist: np.exp(-dist*lu) * (-3*lu-ly-dist*lu**2-lu*ly*dist)/(lu+ly)**3
|
||||
|
||||
#dktheta2 = lambda dist: self.variance_U*self.variance_Y*(dk1theta2 + dk2theta2 +dk3theta2)
|
||||
|
||||
# kyy kernel
|
||||
|
||||
k1 = lambda dist: np.exp(-ly*dist)*(2*lu+ly)/(lu+ly)**2
|
||||
k2 = lambda dist: (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
|
||||
k3 = lambda dist: np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
|
||||
#dkdvar = k1+k2+k3
|
||||
|
||||
|
||||
|
||||
# cross covariance function
|
||||
kyu3 = lambda dist:np.exp(-lu*dist)/(lu+ly)*(1+lu*(dist+1/(lu+ly)))
|
||||
|
||||
k1cros = lambda dist:np.exp(ly*dist)/(lu-ly) * ( 1- np.exp( (lu-ly)*dist) + lu* ( dist*np.exp( (lu-ly)*dist ) + (1- np.exp( (lu-ly)*dist ) ) /(lu-ly) ) )
|
||||
|
||||
k2cros = lambda dist:np.exp(ly*dist)*( 1/(lu+ly) + lu/(lu+ly)**2 )
|
||||
# cross covariance kuy
|
||||
kuyp = lambda dist:(kyu3(dist)) #t>0 kuy
|
||||
kuyn = lambda dist:(k1cros(dist)+k2cros(dist)) #t<0 kuy
|
||||
# cross covariance kyu
|
||||
kyup = lambda dist:(k1cros(-dist)+k2cros(-dist)) #t>0 kyu
|
||||
kyun = lambda dist:(kyu3(-dist)) #t<0 kyu
|
||||
|
||||
# dk dtheta for UY
|
||||
|
||||
|
||||
dkyu3dtheta2 = lambda dist: np.exp(-lu*dist) * ( (-1)*(lu+ly)**(-2)*(1+lu*dist+lu*(lu+ly)**(-1)) + (lu+ly)**(-1)*(-lu)*(lu+ly)**(-2) )
|
||||
dkyu3dtheta1 = lambda dist: np.exp(-lu*dist)*(lu+ly)**(-1)* ( (-dist)*(1+dist*lu+lu*(lu+ly)**(-1)) -\
|
||||
(lu+ly)**(-1)*(1+dist*lu+lu*(lu+ly)**(-1)) +dist+(lu+ly)**(-1)-lu*(lu+ly)**(-2) )
|
||||
|
||||
dkcros2dtheta1 = lambda dist: np.exp(ly*dist)* ( -(ly+lu)**(-2) + (ly+lu)**(-2) + (-2)*lu*(lu+ly)**(-3) )
|
||||
dkcros2dtheta2 = lambda dist: np.exp(ly*dist)*dist* ( (ly+lu)**(-1) + lu*(lu+ly)**(-2) ) + \
|
||||
np.exp(ly*dist)*( -(lu+ly)**(-2) + lu*(-2)*(lu+ly)**(-3) )
|
||||
|
||||
dkcros1dtheta1 = lambda dist: np.exp(ly*dist)*( -(lu-ly)**(-2)*( 1-np.exp((lu-ly)*dist) + lu*dist*np.exp((lu-ly)*dist)+ \
|
||||
lu*(1-np.exp((lu-ly)*dist))/(lu-ly) ) + (lu-ly)**(-1)*( -np.exp( (lu-ly)*dist )*dist + dist*np.exp( (lu-ly)*dist)+\
|
||||
lu*dist**2*np.exp((lu-ly)*dist)+(1-np.exp((lu-ly)*dist))/(lu-ly) - lu*np.exp((lu-ly)*dist)*dist/(lu-ly) -\
|
||||
lu*(1-np.exp((lu-ly)*dist))/(lu-ly)**2 ) )
|
||||
|
||||
dkcros1dtheta2 = lambda t: np.exp(ly*t)*t/(lu-ly)*( 1-np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t)+\
|
||||
lu*(1-np.exp((lu-ly)*t))/(lu-ly) )+\
|
||||
np.exp(ly*t)/(lu-ly)**2* ( 1-np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t) + lu*( 1-np.exp((lu-ly)*t) )/(lu-ly) )+\
|
||||
np.exp(ly*t)/(lu-ly)*( np.exp((lu-ly)*t)*t -lu*t*t*np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t)/(lu-ly)+\
|
||||
lu*( 1-np.exp((lu-ly)*t) )/(lu-ly)**2 )
|
||||
|
||||
dkuypdtheta1 = lambda dist:(dkyu3dtheta1(dist)) #t>0 kuy
|
||||
dkuyndtheta1 = lambda dist:(dkcros1dtheta1(dist)+dkcros2dtheta1(dist)) #t<0 kuy
|
||||
# cross covariance kyu
|
||||
dkyupdtheta1 = lambda dist:(dkcros1dtheta1(-dist)+dkcros2dtheta1(-dist)) #t>0 kyu
|
||||
dkyundtheta1 = lambda dist:(dkyu3dtheta1(-dist)) #t<0 kyu
|
||||
|
||||
dkuypdtheta2 = lambda dist:(dkyu3dtheta2(dist)) #t>0 kuy
|
||||
dkuyndtheta2 = lambda dist:(dkcros1dtheta2(dist)+dkcros2dtheta2(dist)) #t<0 kuy
|
||||
# cross covariance kyu
|
||||
dkyupdtheta2 = lambda dist:(dkcros1dtheta2(-dist)+dkcros2dtheta2(-dist)) #t>0 kyu
|
||||
dkyundtheta2 = lambda dist:(dkyu3dtheta2(-dist)) #t<0 kyu
|
||||
|
||||
|
||||
for i, s1 in enumerate(slices):
|
||||
for j, s2 in enumerate(slices2):
|
||||
for ss1 in s1:
|
||||
for ss2 in s2:
|
||||
if i==0 and j==0:
|
||||
#target[ss1,ss2] = kuu(np.abs(rdist[ss1,ss2]))
|
||||
dktheta1[ss1,ss2] = Vu*UUdtheta1(np.abs(rdist[ss1,ss2]))
|
||||
dktheta2[ss1,ss2] = 0
|
||||
dkUdvar[ss1,ss2] = UUdvar(np.abs(rdist[ss1,ss2]))
|
||||
dkYdvar[ss1,ss2] = 0
|
||||
elif i==0 and j==1:
|
||||
########target[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[s1[0],s2[0]]) ) )
|
||||
#np.where( rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[s1[0],s2[0]]) ) )
|
||||
#dktheta1[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , self.variance_U*self.variance_Y*dkcrtheta1(np.abs(rdist[ss1,ss2])) ,self.variance_U*self.variance_Y*(dk1theta1(np.abs(rdist[ss1,ss2]))+dk2theta1(np.abs(rdist[ss1,ss2]))) )
|
||||
#dktheta2[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , self.variance_U*self.variance_Y*dkcrtheta2(np.abs(rdist[ss1,ss2])) ,self.variance_U*self.variance_Y*(dk1theta2(np.abs(rdist[ss1,ss2]))+dk2theta2(np.abs(rdist[ss1,ss2]))) )
|
||||
dktheta1[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*Vyu*dkuypdtheta1(rdist[ss1,ss2]),Vu*Vyu*dkuyndtheta1(rdist[ss1,ss2]) )
|
||||
dkUdvar[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vyu*kuyp(rdist[ss1,ss2]), Vyu* kuyn(rdist[ss1,ss2]) )
|
||||
dktheta2[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*Vyu*dkuypdtheta2(rdist[ss1,ss2])+Vu*dVdly*kuyp(rdist[ss1,ss2]),Vu*Vyu*dkuyndtheta2(rdist[ss1,ss2])+Vu*dVdly*kuyn(rdist[ss1,ss2]) )
|
||||
dkYdvar[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*dVdVy*kuyp(rdist[ss1,ss2]), Vu*dVdVy* kuyn(rdist[ss1,ss2]) )
|
||||
elif i==1 and j==1:
|
||||
#target[ss1,ss2] = kyy(np.abs(rdist[ss1,ss2]))
|
||||
dktheta1[ss1,ss2] = self.variance_U*self.variance_Y*(dk1theta1(np.abs(rdist[ss1,ss2]))+dk2theta1(np.abs(rdist[ss1,ss2]))+dk3theta1(np.abs(rdist[ss1,ss2])))
|
||||
dktheta2[ss1,ss2] = self.variance_U*self.variance_Y*(dk1theta2(np.abs(rdist[ss1,ss2])) + dk2theta2(np.abs(rdist[ss1,ss2])) +dk3theta2(np.abs(rdist[ss1,ss2])))
|
||||
dkUdvar[ss1,ss2] = self.variance_Y*(k1(np.abs(rdist[ss1,ss2]))+k2(np.abs(rdist[ss1,ss2]))+k3(np.abs(rdist[ss1,ss2])) )
|
||||
dkYdvar[ss1,ss2] = self.variance_U*(k1(np.abs(rdist[ss1,ss2]))+k2(np.abs(rdist[ss1,ss2]))+k3(np.abs(rdist[ss1,ss2])) )
|
||||
else:
|
||||
#######target[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , kyup(np.abs(rdist[ss1,ss2])), kyun(np.abs(rdist[s1[0],s2[0]]) ) )
|
||||
#dktheta1[ss1,ss2] = np.where( rdist[ss1,ss2]>0 ,self.variance_U*self.variance_Y*(dk1theta1(np.abs(rdist[ss1,ss2]))+dk2theta1(np.abs(rdist[ss1,ss2]))) , self.variance_U*self.variance_Y*dkcrtheta1(np.abs(rdist[ss1,ss2])) )
|
||||
#dktheta2[ss1,ss2] = np.where( rdist[ss1,ss2]>0 ,self.variance_U*self.variance_Y*(dk1theta2(np.abs(rdist[ss1,ss2]))+dk2theta2(np.abs(rdist[ss1,ss2]))) , self.variance_U*self.variance_Y*dkcrtheta2(np.abs(rdist[ss1,ss2])) )
|
||||
dktheta1[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*Vyu*dkyupdtheta1(rdist[ss1,ss2]),Vu*Vyu*dkyundtheta1(rdist[ss1,ss2]) )
|
||||
dkUdvar[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vyu*kyup(rdist[ss1,ss2]),Vyu*kyun(rdist[ss1,ss2]))
|
||||
dktheta2[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*Vyu*dkyupdtheta2(rdist[ss1,ss2])+Vu*dVdly*kyup(rdist[ss1,ss2]),Vu*Vyu*dkyundtheta2(rdist[ss1,ss2])+Vu*dVdly*kyun(rdist[ss1,ss2]) )
|
||||
dkYdvar[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*dVdVy*kyup(rdist[ss1,ss2]), Vu*dVdVy*kyun(rdist[ss1,ss2]))
|
||||
|
||||
#stop
|
||||
self.variance_U.gradient = np.sum(dkUdvar * dL_dK) # Vu
|
||||
|
||||
self.variance_Y.gradient = np.sum(dkYdvar * dL_dK) # Vy
|
||||
|
||||
self.lengthscale_U.gradient = np.sum(dktheta1*(-np.sqrt(3)*self.lengthscale_U**(-2))* dL_dK) #lu
|
||||
|
||||
self.lengthscale_Y.gradient = np.sum(dktheta2*(-self.lengthscale_Y**(-2)) * dL_dK) #ly
|
||||
|
||||
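Assumed usage of the new kernel (not part of the diff): the last input column is an output index, 0 for the driving process U and 1 for the output Y, which index_to_slices turns into the block structure used above.

import numpy as np
import GPy

t = np.linspace(0., 5., 10)[:, None]
X = np.vstack([np.hstack([t, np.zeros_like(t)]),   # rows of the driving process U
               np.hstack([t, np.ones_like(t)])])   # rows of the output Y
k = GPy.kern.ODE_UY(input_dim=2)
print k.K(X).shape    # (20, 20)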
290
GPy/kern/_src/ODE_UYC.py
Normal file
@ -0,0 +1,290 @@
|
|||
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from kern import Kern
|
||||
from ...core.parameterization import Param
|
||||
from ...core.parameterization.transformations import Logexp
|
||||
import numpy as np
|
||||
from independent_outputs import index_to_slices
|
||||
|
||||
class ODE_UYC(Kern):
|
||||
def __init__(self, input_dim, variance_U=3., variance_Y=1., lengthscale_U=1., lengthscale_Y=1., ubias =1. ,active_dims=None, name='ode_uyc'):
|
||||
assert input_dim ==2, "only defined for 2 input dims"
|
||||
super(ODE_UYC, self).__init__(input_dim, active_dims, name)
|
||||
|
||||
self.variance_Y = Param('variance_Y', variance_Y, Logexp())
|
||||
self.variance_U = Param('variance_U', variance_U, Logexp())
|
||||
self.lengthscale_Y = Param('lengthscale_Y', lengthscale_Y, Logexp())
|
||||
self.lengthscale_U = Param('lengthscale_U', lengthscale_U, Logexp())
|
||||
self.ubias = Param('ubias', ubias, Logexp())
|
||||
|
||||
self.link_parameters(self.variance_Y, self.variance_U, self.lengthscale_Y, self.lengthscale_U, self.ubias)
|
||||
|
||||
def K(self, X, X2=None):
|
||||
# model : a * dy/dt + b * y = U
|
||||
#lu=sqrt(3)/theta1 ly=1/theta2 theta2= a/b :thetay sigma2=1/(2ab) :sigmay
|
||||
|
||||
X,slices = X[:,:-1],index_to_slices(X[:,-1])
|
||||
if X2 is None:
|
||||
X2,slices2 = X,slices
|
||||
K = np.zeros((X.shape[0], X.shape[0]))
|
||||
else:
|
||||
X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
|
||||
K = np.zeros((X.shape[0], X2.shape[0]))
|
||||
|
||||
#stop
|
||||
#rdist = X[:,0][:,None] - X2[:,0][:,None].T
|
||||
rdist = X - X2.T
|
||||
ly=1/self.lengthscale_Y
|
||||
lu=np.sqrt(3)/self.lengthscale_U
|
||||
#iu=self.input_lengthU #dimension of U
|
||||
Vu=self.variance_U
|
||||
Vy=self.variance_Y
|
||||
#Vy=ly/2
|
||||
#stop
|
||||
|
||||
|
||||
# kernel for kuu matern3/2
|
||||
kuu = lambda dist:Vu * (1 + lu* np.abs(dist)) * np.exp(-lu * np.abs(dist)) +self.ubias
|
||||
|
||||
# kernel for kyy
|
||||
k1 = lambda dist:np.exp(-ly*np.abs(dist))*(2*lu+ly)/(lu+ly)**2
|
||||
k2 = lambda dist:(np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
|
||||
k3 = lambda dist:np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
|
||||
kyy = lambda dist:Vu*Vy*(k1(dist) + k2(dist) + k3(dist))
|
||||
|
||||
|
||||
# cross covariance function
|
||||
kyu3 = lambda dist:np.exp(-lu*dist)/(lu+ly)*(1+lu*(dist+1/(lu+ly)))
|
||||
#kyu3 = lambda dist: 0
|
||||
|
||||
k1cros = lambda dist:np.exp(ly*dist)/(lu-ly) * ( 1- np.exp( (lu-ly)*dist) + lu* ( dist*np.exp( (lu-ly)*dist ) + (1- np.exp( (lu-ly)*dist ) ) /(lu-ly) ) )
|
||||
#k1cros = lambda dist:0
|
||||
|
||||
k2cros = lambda dist:np.exp(ly*dist)*( 1/(lu+ly) + lu/(lu+ly)**2 )
|
||||
#k2cros = lambda dist:0
|
||||
|
||||
Vyu=np.sqrt(Vy*ly*2)
|
||||
|
||||
# cross covariance kuy
|
||||
kuyp = lambda dist:Vu*Vyu*(kyu3(dist)) #t>0 kuy
|
||||
kuyn = lambda dist:Vu*Vyu*(k1cros(dist)+k2cros(dist)) #t<0 kuy
|
||||
# cross covariance kyu
|
||||
kyup = lambda dist:Vu*Vyu*(k1cros(-dist)+k2cros(-dist)) #t>0 kyu
|
||||
kyun = lambda dist:Vu*Vyu*(kyu3(-dist)) #t<0 kyu
|
||||
|
||||
|
||||
for i, s1 in enumerate(slices):
|
||||
for j, s2 in enumerate(slices2):
|
||||
for ss1 in s1:
|
||||
for ss2 in s2:
|
||||
if i==0 and j==0:
|
||||
K[ss1,ss2] = kuu(np.abs(rdist[ss1,ss2]))
|
||||
elif i==0 and j==1:
|
||||
#K[ss1,ss2]= np.where( rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[ss1,ss2]) ) )
|
||||
K[ss1,ss2]= np.where( rdist[ss1,ss2]>0 , kuyp(rdist[ss1,ss2]), kuyn(rdist[ss1,ss2] ) )
|
||||
elif i==1 and j==1:
|
||||
K[ss1,ss2] = kyy(np.abs(rdist[ss1,ss2]))
|
||||
else:
|
||||
#K[ss1,ss2]= 0
|
||||
#K[ss1,ss2]= np.where( rdist[ss1,ss2]>0 , kyup(np.abs(rdist[ss1,ss2])), kyun(np.abs(rdist[ss1,ss2]) ) )
|
||||
K[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , kyup(rdist[ss1,ss2]), kyun(rdist[ss1,ss2] ) )
|
||||
return K
|
||||
|
||||
|
||||
|
||||
def Kdiag(self, X):
|
||||
"""Compute the diagonal of the covariance matrix associated to X."""
|
||||
Kdiag = np.zeros(X.shape[0])
|
||||
ly=1/self.lengthscale_Y
|
||||
lu=np.sqrt(3)/self.lengthscale_U
|
||||
|
||||
Vu = self.variance_U
|
||||
Vy=self.variance_Y
|
||||
|
||||
k1 = (2*lu+ly)/(lu+ly)**2
|
||||
k2 = (ly-2*lu + 2*lu-ly ) / (ly-lu)**2
|
||||
k3 = 1/(lu+ly) + (lu)/(lu+ly)**2
|
||||
|
||||
slices = index_to_slices(X[:,-1])
|
||||
|
||||
for i, ss1 in enumerate(slices):
|
||||
for s1 in ss1:
|
||||
if i==0:
|
||||
Kdiag[s1]+= self.variance_U + self.ubias
|
||||
elif i==1:
|
||||
Kdiag[s1]+= Vu*Vy*(k1+k2+k3)
|
||||
else:
|
||||
raise ValueError, "invalid input/output index"
|
||||
#Kdiag[slices[0][0]]+= self.variance_U #matern32 diag
|
||||
#Kdiag[slices[1][0]]+= self.variance_U*self.variance_Y*(k1+k2+k3) # diag
|
||||
return Kdiag
|
||||
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
"""derivative of the covariance matrix with respect to the parameters."""
|
||||
X,slices = X[:,:-1],index_to_slices(X[:,-1])
|
||||
if X2 is None:
|
||||
X2,slices2 = X,slices
|
||||
else:
|
||||
X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
|
||||
#rdist = X[:,0][:,None] - X2[:,0][:,None].T
|
||||
|
||||
rdist = X - X2.T
|
||||
ly=1/self.lengthscale_Y
|
||||
lu=np.sqrt(3)/self.lengthscale_U
|
||||
|
||||
Vu=self.variance_U
|
||||
Vy=self.variance_Y
|
||||
Vyu = np.sqrt(Vy*ly*2)
|
||||
dVdly = 0.5/np.sqrt(ly)*np.sqrt(2*Vy)
|
||||
dVdVy = 0.5/np.sqrt(Vy)*np.sqrt(2*ly)
|
||||
|
||||
rd=rdist.shape[0]
|
||||
dktheta1 = np.zeros([rd,rd])
|
||||
dktheta2 = np.zeros([rd,rd])
|
||||
dkUdvar = np.zeros([rd,rd])
|
||||
dkYdvar = np.zeros([rd,rd])
|
||||
|
||||
dkdubias = np.zeros([rd,rd])
|
||||
|
||||
# dk dtheta for UU
|
||||
UUdtheta1 = lambda dist: np.exp(-lu* dist)*dist + (-dist)*np.exp(-lu* dist)*(1+lu*dist)
|
||||
UUdtheta2 = lambda dist: 0
|
||||
#UUdvar = lambda dist: (1 + lu*dist)*np.exp(-lu*dist)
|
||||
UUdvar = lambda dist: (1 + lu* np.abs(dist)) * np.exp(-lu * np.abs(dist))
|
||||
|
||||
# dk dtheta for YY
|
||||
|
||||
dk1theta1 = lambda dist: np.exp(-ly*dist)*2*(-lu)/(lu+ly)**3
|
||||
|
||||
dk2theta1 = lambda dist: (1.0)*(
|
||||
np.exp(-lu*dist)*dist*(-ly+2*lu-lu*ly*dist+dist*lu**2)*(ly-lu)**(-2) + np.exp(-lu*dist)*(-2+ly*dist-2*dist*lu)*(ly-lu)**(-2)
|
||||
+np.exp(-dist*lu)*(ly-2*lu+ly*lu*dist-dist*lu**2)*2*(ly-lu)**(-3)
|
||||
+np.exp(-dist*ly)*2*(ly-lu)**(-2)
|
||||
+np.exp(-dist*ly)*2*(2*lu-ly)*(ly-lu)**(-3)
|
||||
)
|
||||
|
||||
dk3theta1 = lambda dist: np.exp(-dist*lu)*(lu+ly)**(-2)*((2*lu+ly+dist*lu**2+lu*ly*dist)*(-dist-2/(lu+ly))+2+2*lu*dist+ly*dist)
|
||||
|
||||
#dktheta1 = lambda dist: self.variance_U*self.variance_Y*(dk1theta1+dk2theta1+dk3theta1)
|
||||
|
||||
|
||||
|
||||
|
||||
dk1theta2 = lambda dist: np.exp(-ly*dist) * ((lu+ly)**(-2)) * ( (-dist)*(2*lu+ly) + 1 + (-2)*(2*lu+ly)/(lu+ly) )
|
||||
|
||||
dk2theta2 =lambda dist: 1*(
|
||||
np.exp(-dist*lu)*(ly-lu)**(-2) * ( 1+lu*dist+(-2)*(ly-2*lu+lu*ly*dist-dist*lu**2)*(ly-lu)**(-1) )
|
||||
+np.exp(-dist*ly)*(ly-lu)**(-2) * ( (-dist)*(2*lu-ly) -1+(2*lu-ly)*(-2)*(ly-lu)**(-1) )
|
||||
)
|
||||
|
||||
dk3theta2 = lambda dist: np.exp(-dist*lu) * (-3*lu-ly-dist*lu**2-lu*ly*dist)/(lu+ly)**3
|
||||
|
||||
#dktheta2 = lambda dist: self.variance_U*self.variance_Y*(dk1theta2 + dk2theta2 +dk3theta2)
|
||||
|
||||
# kyy kernel
|
||||
|
||||
k1 = lambda dist: np.exp(-ly*dist)*(2*lu+ly)/(lu+ly)**2
|
||||
k2 = lambda dist: (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
|
||||
k3 = lambda dist: np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
|
||||
#dkdvar = k1+k2+k3
|
||||
|
||||
|
||||
|
||||
# cross covariance function
|
||||
kyu3 = lambda dist:np.exp(-lu*dist)/(lu+ly)*(1+lu*(dist+1/(lu+ly)))
|
||||
|
||||
k1cros = lambda dist:np.exp(ly*dist)/(lu-ly) * ( 1- np.exp( (lu-ly)*dist) + lu* ( dist*np.exp( (lu-ly)*dist ) + (1- np.exp( (lu-ly)*dist ) ) /(lu-ly) ) )
|
||||
|
||||
k2cros = lambda dist:np.exp(ly*dist)*( 1/(lu+ly) + lu/(lu+ly)**2 )
|
||||
# cross covariance kuy
|
||||
kuyp = lambda dist:(kyu3(dist)) #t>0 kuy
|
||||
kuyn = lambda dist:(k1cros(dist)+k2cros(dist)) #t<0 kuy
|
||||
# cross covariance kyu
|
||||
kyup = lambda dist:(k1cros(-dist)+k2cros(-dist)) #t>0 kyu
|
||||
kyun = lambda dist:(kyu3(-dist)) #t<0 kyu
|
||||
|
||||
# dk dtheta for UY
|
||||
|
||||
|
||||
dkyu3dtheta2 = lambda dist: np.exp(-lu*dist) * ( (-1)*(lu+ly)**(-2)*(1+lu*dist+lu*(lu+ly)**(-1)) + (lu+ly)**(-1)*(-lu)*(lu+ly)**(-2) )
|
||||
dkyu3dtheta1 = lambda dist: np.exp(-lu*dist)*(lu+ly)**(-1)* ( (-dist)*(1+dist*lu+lu*(lu+ly)**(-1)) -\
|
||||
(lu+ly)**(-1)*(1+dist*lu+lu*(lu+ly)**(-1)) +dist+(lu+ly)**(-1)-lu*(lu+ly)**(-2) )
|
||||
|
||||
dkcros2dtheta1 = lambda dist: np.exp(ly*dist)* ( -(ly+lu)**(-2) + (ly+lu)**(-2) + (-2)*lu*(lu+ly)**(-3) )
|
||||
dkcros2dtheta2 = lambda dist: np.exp(ly*dist)*dist* ( (ly+lu)**(-1) + lu*(lu+ly)**(-2) ) + \
|
||||
np.exp(ly*dist)*( -(lu+ly)**(-2) + lu*(-2)*(lu+ly)**(-3) )
|
||||
|
||||
dkcros1dtheta1 = lambda dist: np.exp(ly*dist)*( -(lu-ly)**(-2)*( 1-np.exp((lu-ly)*dist) + lu*dist*np.exp((lu-ly)*dist)+ \
|
||||
lu*(1-np.exp((lu-ly)*dist))/(lu-ly) ) + (lu-ly)**(-1)*( -np.exp( (lu-ly)*dist )*dist + dist*np.exp( (lu-ly)*dist)+\
|
||||
lu*dist**2*np.exp((lu-ly)*dist)+(1-np.exp((lu-ly)*dist))/(lu-ly) - lu*np.exp((lu-ly)*dist)*dist/(lu-ly) -\
|
||||
lu*(1-np.exp((lu-ly)*dist))/(lu-ly)**2 ) )
|
||||
|
||||
dkcros1dtheta2 = lambda t: np.exp(ly*t)*t/(lu-ly)*( 1-np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t)+\
|
||||
lu*(1-np.exp((lu-ly)*t))/(lu-ly) )+\
|
||||
np.exp(ly*t)/(lu-ly)**2* ( 1-np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t) + lu*( 1-np.exp((lu-ly)*t) )/(lu-ly) )+\
|
||||
np.exp(ly*t)/(lu-ly)*( np.exp((lu-ly)*t)*t -lu*t*t*np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t)/(lu-ly)+\
|
||||
lu*( 1-np.exp((lu-ly)*t) )/(lu-ly)**2 )
|
||||
|
||||
dkuypdtheta1 = lambda dist:(dkyu3dtheta1(dist)) #t>0 kuy
|
||||
dkuyndtheta1 = lambda dist:(dkcros1dtheta1(dist)+dkcros2dtheta1(dist)) #t<0 kuy
|
||||
# cross covariance kyu
|
||||
dkyupdtheta1 = lambda dist:(dkcros1dtheta1(-dist)+dkcros2dtheta1(-dist)) #t>0 kyu
|
||||
dkyundtheta1 = lambda dist:(dkyu3dtheta1(-dist)) #t<0 kyu
|
||||
|
||||
dkuypdtheta2 = lambda dist:(dkyu3dtheta2(dist)) #t>0 kuy
|
||||
dkuyndtheta2 = lambda dist:(dkcros1dtheta2(dist)+dkcros2dtheta2(dist)) #t<0 kuy
|
||||
# cross covariance kyu
|
||||
dkyupdtheta2 = lambda dist:(dkcros1dtheta2(-dist)+dkcros2dtheta2(-dist)) #t>0 kyu
|
||||
dkyundtheta2 = lambda dist:(dkyu3dtheta2(-dist)) #t<0 kyu
|
||||
|
||||
|
||||
for i, s1 in enumerate(slices):
|
||||
for j, s2 in enumerate(slices2):
|
||||
for ss1 in s1:
|
||||
for ss2 in s2:
|
||||
if i==0 and j==0:
|
||||
#target[ss1,ss2] = kuu(np.abs(rdist[ss1,ss2]))
|
||||
dktheta1[ss1,ss2] = Vu*UUdtheta1(np.abs(rdist[ss1,ss2]))
|
||||
dktheta2[ss1,ss2] = 0
|
||||
dkUdvar[ss1,ss2] = UUdvar(np.abs(rdist[ss1,ss2]))
|
||||
dkYdvar[ss1,ss2] = 0
|
||||
dkdubias[ss1,ss2] = 1
|
||||
elif i==0 and j==1:
|
||||
########target[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[s1[0],s2[0]]) ) )
|
||||
#np.where( rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[s1[0],s2[0]]) ) )
|
||||
#dktheta1[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , self.variance_U*self.variance_Y*dkcrtheta1(np.abs(rdist[ss1,ss2])) ,self.variance_U*self.variance_Y*(dk1theta1(np.abs(rdist[ss1,ss2]))+dk2theta1(np.abs(rdist[ss1,ss2]))) )
|
||||
#dktheta2[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , self.variance_U*self.variance_Y*dkcrtheta2(np.abs(rdist[ss1,ss2])) ,self.variance_U*self.variance_Y*(dk1theta2(np.abs(rdist[ss1,ss2]))+dk2theta2(np.abs(rdist[ss1,ss2]))) )
|
||||
dktheta1[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*Vyu*dkuypdtheta1(rdist[ss1,ss2]),Vu*Vyu*dkuyndtheta1(rdist[ss1,ss2]) )
|
||||
dkUdvar[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vyu*kuyp(rdist[ss1,ss2]), Vyu* kuyn(rdist[ss1,ss2]) )
|
||||
dktheta2[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*Vyu*dkuypdtheta2(rdist[ss1,ss2])+Vu*dVdly*kuyp(rdist[ss1,ss2]),Vu*Vyu*dkuyndtheta2(rdist[ss1,ss2])+Vu*dVdly*kuyn(rdist[ss1,ss2]) )
|
||||
dkYdvar[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*dVdVy*kuyp(rdist[ss1,ss2]), Vu*dVdVy* kuyn(rdist[ss1,ss2]) )
|
||||
dkdubias[ss1,ss2] = 0
|
||||
elif i==1 and j==1:
|
||||
#target[ss1,ss2] = kyy(np.abs(rdist[ss1,ss2]))
|
||||
dktheta1[ss1,ss2] = self.variance_U*self.variance_Y*(dk1theta1(np.abs(rdist[ss1,ss2]))+dk2theta1(np.abs(rdist[ss1,ss2]))+dk3theta1(np.abs(rdist[ss1,ss2])))
|
||||
dktheta2[ss1,ss2] = self.variance_U*self.variance_Y*(dk1theta2(np.abs(rdist[ss1,ss2])) + dk2theta2(np.abs(rdist[ss1,ss2])) +dk3theta2(np.abs(rdist[ss1,ss2])))
|
||||
dkUdvar[ss1,ss2] = self.variance_Y*(k1(np.abs(rdist[ss1,ss2]))+k2(np.abs(rdist[ss1,ss2]))+k3(np.abs(rdist[ss1,ss2])) )
|
||||
dkYdvar[ss1,ss2] = self.variance_U*(k1(np.abs(rdist[ss1,ss2]))+k2(np.abs(rdist[ss1,ss2]))+k3(np.abs(rdist[ss1,ss2])) )
|
||||
dkdubias[ss1,ss2] = 0
|
||||
else:
|
||||
#######target[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , kyup(np.abs(rdist[ss1,ss2])), kyun(np.abs(rdist[s1[0],s2[0]]) ) )
|
||||
#dktheta1[ss1,ss2] = np.where( rdist[ss1,ss2]>0 ,self.variance_U*self.variance_Y*(dk1theta1(np.abs(rdist[ss1,ss2]))+dk2theta1(np.abs(rdist[ss1,ss2]))) , self.variance_U*self.variance_Y*dkcrtheta1(np.abs(rdist[ss1,ss2])) )
|
||||
#dktheta2[ss1,ss2] = np.where( rdist[ss1,ss2]>0 ,self.variance_U*self.variance_Y*(dk1theta2(np.abs(rdist[ss1,ss2]))+dk2theta2(np.abs(rdist[ss1,ss2]))) , self.variance_U*self.variance_Y*dkcrtheta2(np.abs(rdist[ss1,ss2])) )
|
||||
dktheta1[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*Vyu*dkyupdtheta1(rdist[ss1,ss2]),Vu*Vyu*dkyundtheta1(rdist[ss1,ss2]) )
|
||||
dkUdvar[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vyu*kyup(rdist[ss1,ss2]),Vyu*kyun(rdist[ss1,ss2]))
|
||||
dktheta2[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*Vyu*dkyupdtheta2(rdist[ss1,ss2])+Vu*dVdly*kyup(rdist[ss1,ss2]),Vu*Vyu*dkyundtheta2(rdist[ss1,ss2])+Vu*dVdly*kyun(rdist[ss1,ss2]) )
|
||||
dkYdvar[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*dVdVy*kyup(rdist[ss1,ss2]), Vu*dVdVy*kyun(rdist[ss1,ss2]))
|
||||
dkdubias[ss1,ss2] = 0
|
||||
#stop
|
||||
self.variance_U.gradient = np.sum(dkUdvar * dL_dK) # Vu
|
||||
|
||||
self.variance_Y.gradient = np.sum(dkYdvar * dL_dK) # Vy
|
||||
|
||||
self.lengthscale_U.gradient = np.sum(dktheta1*(-np.sqrt(3)*self.lengthscale_U**(-2))* dL_dK) #lu
|
||||
|
||||
self.lengthscale_Y.gradient = np.sum(dktheta2*(-self.lengthscale_Y**(-2)) * dL_dK) #ly
|
||||
|
||||
self.ubias.gradient = np.sum(dkdubias * dL_dK)
|
||||
|
||||
267
GPy/kern/_src/ODE_st.py
Normal file
@ -0,0 +1,267 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
from kern import Kern
|
||||
from ...core.parameterization import Param
|
||||
from ...core.parameterization.transformations import Logexp
|
||||
import numpy as np
|
||||
from independent_outputs import index_to_slices
|
||||
|
||||
|
||||
class ODE_st(Kern):
|
||||
"""
|
||||
kernel resulting from a first order ODE with an OU driving GP
|
||||
|
||||
:param input_dim: the number of input dimensions, has to be equal to three
|
||||
:type input_dim: int
|
||||
:param varianceU: variance of the driving GP
|
||||
:type varianceU: float
|
||||
:param lengthscaleU: lengthscale of the driving GP (sqrt(3)/lengthscaleU)
|
||||
:type lengthscaleU: float
|
||||
:param varianceY: 'variance' of the transfer function
|
||||
:type varianceY: float
|
||||
:param lengthscaleY: 'lengthscale' of the transfer function (1/lengthscaleY)
|
||||
:type lengthscaleY: float
|
||||
:rtype: kernel object
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, input_dim, a=1.,b=1., c=1.,variance_Yx=3.,variance_Yt=1.5, lengthscale_Yx=1.5, lengthscale_Yt=1.5, active_dims=None, name='ode_st'):
|
||||
assert input_dim ==3, "only defined for 3 input dims"
|
||||
super(ODE_st, self).__init__(input_dim, active_dims, name)
|
||||
|
||||
self.variance_Yt = Param('variance_Yt', variance_Yt, Logexp())
|
||||
self.variance_Yx = Param('variance_Yx', variance_Yx, Logexp())
|
||||
self.lengthscale_Yt = Param('lengthscale_Yt', lengthscale_Yt, Logexp())
|
||||
self.lengthscale_Yx = Param('lengthscale_Yx', lengthscale_Yx, Logexp())
|
||||
|
||||
self.a= Param('a', a, Logexp())
|
||||
self.b = Param('b', b, Logexp())
|
||||
self.c = Param('c', c, Logexp())
|
||||
|
||||
self.link_parameters(self.a, self.b, self.c, self.variance_Yt, self.variance_Yx, self.lengthscale_Yt, self.lengthscale_Yx)
|
||||
|
||||
|
||||
def K(self, X, X2=None):
|
||||
# model : -a d^2y/dx^2 + b dy/dt + c * y = U
|
||||
# kernel Kyy rbf spatio-temporal
|
||||
# vyt Y temporal variance vyx Y spatial variance lyt Y temporal lengthscale lyx Y spatial lengthscale
|
||||
# kernel Kuu doper( doper(Kyy))
|
||||
# a b c lyt lyx vyx*vyt
|
||||
"""Compute the covariance matrix between X and X2."""
|
||||
X,slices = X[:,:-1],index_to_slices(X[:,-1])
|
||||
if X2 is None:
|
||||
X2,slices2 = X,slices
|
||||
K = np.zeros((X.shape[0], X.shape[0]))
|
||||
else:
|
||||
X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
|
||||
K = np.zeros((X.shape[0], X2.shape[0]))
|
||||
|
||||
|
||||
tdist = (X[:,0][:,None] - X2[:,0][None,:])**2
|
||||
xdist = (X[:,1][:,None] - X2[:,1][None,:])**2
|
||||
|
||||
ttdist = (X[:,0][:,None] - X2[:,0][None,:])
|
||||
#rdist = [tdist,xdist]
|
||||
#dist = np.abs(X - X2.T)
|
||||
vyt = self.variance_Yt
|
||||
vyx = self.variance_Yx
|
||||
|
||||
lyt=1/(2*self.lengthscale_Yt)
|
||||
lyx=1/(2*self.lengthscale_Yx)
|
||||
|
||||
a = self.a ## -a is used in the model, negative diffusion
|
||||
b = self.b
|
||||
c = self.c
|
||||
|
||||
kyy = lambda tdist,xdist: np.exp(-lyt*(tdist) -lyx*(xdist))
|
||||
|
||||
k1 = lambda tdist: (2*lyt - 4*lyt**2 * (tdist) )
|
||||
|
||||
k2 = lambda xdist: ( 4*lyx**2 * (xdist) - 2*lyx )
|
||||
|
||||
k3 = lambda xdist: ( 3*4*lyx**2 - 6*8*xdist*lyx**3 + 16*xdist**2*lyx**4 )
|
||||
|
||||
k4 = lambda ttdist: 2*lyt*(ttdist)
|
||||
|
||||
for i, s1 in enumerate(slices):
|
||||
for j, s2 in enumerate(slices2):
|
||||
for ss1 in s1:
|
||||
for ss2 in s2:
|
||||
if i==0 and j==0:
|
||||
K[ss1,ss2] = vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
elif i==0 and j==1:
|
||||
K[ss1,ss2] = (-a*k2(xdist[ss1,ss2]) + b*k4(ttdist[ss1,ss2]) + c)*vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
#K[ss1,ss2]= np.where( rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[ss1,ss2]) ) )
|
||||
#K[ss1,ss2]= np.where( rdist[ss1,ss2]>0 , kuyp(rdist[ss1,ss2]), kuyn(rdist[ss1,ss2] ) )
|
||||
elif i==1 and j==1:
|
||||
K[ss1,ss2] = ( b**2*k1(tdist[ss1,ss2]) - 2*a*c*k2(xdist[ss1,ss2]) + a**2*k3(xdist[ss1,ss2]) + c**2 )* vyt*vyx* kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
else:
|
||||
K[ss1,ss2] = (-a*k2(xdist[ss1,ss2]) - b*k4(ttdist[ss1,ss2]) + c)*vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
#K[ss1,ss2]= np.where( rdist[ss1,ss2]>0 , kyup(np.abs(rdist[ss1,ss2])), kyun(np.abs(rdist[ss1,ss2]) ) )
|
||||
#K[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , kyup(rdist[ss1,ss2]), kyun(rdist[ss1,ss2] ) )
|
||||
|
||||
#stop
|
||||
return K
|
||||
|
||||
def Kdiag(self, X):
|
||||
"""Compute the diagonal of the covariance matrix associated to X."""
|
||||
vyt = self.variance_Yt
|
||||
vyx = self.variance_Yx
|
||||
|
||||
lyt = 1./(2*self.lengthscale_Yt)
|
||||
lyx = 1./(2*self.lengthscale_Yx)
|
||||
|
||||
a = self.a
|
||||
b = self.b
|
||||
c = self.c
|
||||
|
||||
## dk^2/dtdt'
|
||||
k1 = (2*lyt )*vyt*vyx
|
||||
## dk^2/dx^2
|
||||
k2 = ( - 2*lyx )*vyt*vyx
|
||||
## dk^4/dx^2dx'^2
|
||||
k3 = ( 4*3*lyx**2 )*vyt*vyx
|
||||
|
||||
|
||||
Kdiag = np.zeros(X.shape[0])
|
||||
slices = index_to_slices(X[:,-1])
|
||||
|
||||
for i, ss1 in enumerate(slices):
|
||||
for s1 in ss1:
|
||||
if i==0:
|
||||
Kdiag[s1]+= vyt*vyx
|
||||
elif i==1:
|
||||
#i=1
|
||||
Kdiag[s1]+= b**2*k1 - 2*a*c*k2 + a**2*k3 + c**2*vyt*vyx
|
||||
#Kdiag[s1]+= Vu*Vy*(k1+k2+k3)
|
||||
else:
|
||||
raise ValueError, "invalid input/output index"
|
||||
|
||||
return Kdiag
|
||||
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
#def dK_dtheta(self, dL_dK, X, X2, target):
|
||||
"""derivative of the covariance matrix with respect to the parameters."""
|
||||
X,slices = X[:,:-1],index_to_slices(X[:,-1])
|
||||
if X2 is None:
|
||||
X2,slices2 = X,slices
|
||||
K = np.zeros((X.shape[0], X.shape[0]))
|
||||
else:
|
||||
X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
|
||||
|
||||
vyt = self.variance_Yt
|
||||
vyx = self.variance_Yx
|
||||
|
||||
lyt = 1./(2*self.lengthscale_Yt)
|
||||
lyx = 1./(2*self.lengthscale_Yx)
|
||||
|
||||
a = self.a
|
||||
b = self.b
|
||||
c = self.c
|
||||
|
||||
tdist = (X[:,0][:,None] - X2[:,0][None,:])**2
|
||||
xdist = (X[:,1][:,None] - X2[:,1][None,:])**2
|
||||
#rdist = [tdist,xdist]
|
||||
ttdist = (X[:,0][:,None] - X2[:,0][None,:])
|
||||
|
||||
rd=tdist.shape[0]
|
||||
|
||||
dka = np.zeros([rd,rd])
|
||||
dkb = np.zeros([rd,rd])
|
||||
dkc = np.zeros([rd,rd])
|
||||
dkYdvart = np.zeros([rd,rd])
|
||||
dkYdvarx = np.zeros([rd,rd])
|
||||
dkYdlent = np.zeros([rd,rd])
|
||||
dkYdlenx = np.zeros([rd,rd])
|
||||
|
||||
|
||||
kyy = lambda tdist,xdist: np.exp(-lyt*(tdist) -lyx*(xdist))
|
||||
#k1 = lambda tdist: (lyt - lyt**2 * (tdist) )
|
||||
#k2 = lambda xdist: ( lyx**2 * (xdist) - lyx )
|
||||
#k3 = lambda xdist: ( 3*lyx**2 - 6*xdist*lyx**3 + xdist**2*lyx**4 )
|
||||
#k4 = lambda tdist: -lyt*np.sqrt(tdist)
|
||||
|
||||
k1 = lambda tdist: (2*lyt - 4*lyt**2 * (tdist) )
|
||||
|
||||
k2 = lambda xdist: ( 4*lyx**2 * (xdist) - 2*lyx )
|
||||
|
||||
k3 = lambda xdist: ( 3*4*lyx**2 - 6*8*xdist*lyx**3 + 16*xdist**2*lyx**4 )
|
||||
|
||||
k4 = lambda ttdist: 2*lyt*(ttdist)
|
||||
|
||||
dkyydlyx = lambda tdist,xdist: kyy(tdist,xdist)*(-xdist)
|
||||
dkyydlyt = lambda tdist,xdist: kyy(tdist,xdist)*(-tdist)
|
||||
|
||||
dk1dlyt = lambda tdist: 2. - 4*2.*lyt*tdist
|
||||
dk2dlyx = lambda xdist: (4.*2.*lyx*xdist -2.)
|
||||
dk3dlyx = lambda xdist: (6.*4.*lyx - 18.*8*xdist*lyx**2 + 4*16*xdist**2*lyx**3)
|
||||
|
||||
dk4dlyt = lambda ttdist: 2*(ttdist)
|
||||
|
||||
for i, s1 in enumerate(slices):
|
||||
for j, s2 in enumerate(slices2):
|
||||
for ss1 in s1:
|
||||
for ss2 in s2:
|
||||
if i==0 and j==0:
|
||||
dka[ss1,ss2] = 0
|
||||
dkb[ss1,ss2] = 0
|
||||
dkc[ss1,ss2] = 0
|
||||
dkYdvart[ss1,ss2] = vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
dkYdvarx[ss1,ss2] = vyt*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
dkYdlenx[ss1,ss2] = vyt*vyx*dkyydlyx(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
dkYdlent[ss1,ss2] = vyt*vyx*dkyydlyt(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
elif i==0 and j==1:
|
||||
dka[ss1,ss2] = -k2(xdist[ss1,ss2])*vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
dkb[ss1,ss2] = k4(ttdist[ss1,ss2])*vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
dkc[ss1,ss2] = vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
#dkYdvart[ss1,ss2] = 0
|
||||
#dkYdvarx[ss1,ss2] = 0
|
||||
#dkYdlent[ss1,ss2] = 0
|
||||
#dkYdlenx[ss1,ss2] = 0
|
||||
dkYdvart[ss1,ss2] = (-a*k2(xdist[ss1,ss2])+b*k4(ttdist[ss1,ss2])+c)*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
dkYdvarx[ss1,ss2] = (-a*k2(xdist[ss1,ss2])+b*k4(ttdist[ss1,ss2])+c)*vyt*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
dkYdlent[ss1,ss2] = vyt*vyx*dkyydlyt(tdist[ss1,ss2],xdist[ss1,ss2])* (-a*k2(xdist[ss1,ss2])+b*k4(ttdist[ss1,ss2])+c)+\
|
||||
vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])*b*dk4dlyt(ttdist[ss1,ss2])
|
||||
dkYdlenx[ss1,ss2] = vyt*vyx*dkyydlyx(tdist[ss1,ss2],xdist[ss1,ss2])*(-a*k2(xdist[ss1,ss2])+b*k4(ttdist[ss1,ss2])+c)+\
|
||||
vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])*(-a*dk2dlyx(xdist[ss1,ss2]))
|
||||
elif i==1 and j==1:
|
||||
dka[ss1,ss2] = (2*a*k3(xdist[ss1,ss2]) - 2*c*k2(xdist[ss1,ss2]))*vyt*vyx* kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
dkb[ss1,ss2] = 2*b*k1(tdist[ss1,ss2])*vyt*vyx* kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
dkc[ss1,ss2] = (-2*a*k2(xdist[ss1,ss2]) + 2*c )*vyt*vyx* kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
dkYdvart[ss1,ss2] = ( b**2*k1(tdist[ss1,ss2]) - 2*a*c*k2(xdist[ss1,ss2]) + a**2*k3(xdist[ss1,ss2]) + c**2 )*vyx* kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
dkYdvarx[ss1,ss2] = ( b**2*k1(tdist[ss1,ss2]) - 2*a*c*k2(xdist[ss1,ss2]) + a**2*k3(xdist[ss1,ss2]) + c**2 )*vyt* kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
dkYdlent[ss1,ss2] = vyt*vyx*dkyydlyt(tdist[ss1,ss2],xdist[ss1,ss2])*( b**2*k1(tdist[ss1,ss2]) - 2*a*c*k2(xdist[ss1,ss2]) + a**2*k3(xdist[ss1,ss2]) + c**2 ) +\
|
||||
vyx*vyt*kyy(tdist[ss1,ss2],xdist[ss1,ss2])*b**2*dk1dlyt(tdist[ss1,ss2])
|
||||
dkYdlenx[ss1,ss2] = vyt*vyx*dkyydlyx(tdist[ss1,ss2],xdist[ss1,ss2])*( b**2*k1(tdist[ss1,ss2]) - 2*a*c*k2(xdist[ss1,ss2]) + a**2*k3(xdist[ss1,ss2]) + c**2 ) +\
|
||||
vyx*vyt*kyy(tdist[ss1,ss2],xdist[ss1,ss2])* (-2*a*c*dk2dlyx(xdist[ss1,ss2]) + a**2*dk3dlyx(xdist[ss1,ss2]) )
|
||||
else:
|
||||
dka[ss1,ss2] = -k2(xdist[ss1,ss2])*vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
dkb[ss1,ss2] = -k4(ttdist[ss1,ss2])*vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
dkc[ss1,ss2] = vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
#dkYdvart[ss1,ss2] = 0
|
||||
#dkYdvarx[ss1,ss2] = 0
|
||||
#dkYdlent[ss1,ss2] = 0
|
||||
#dkYdlenx[ss1,ss2] = 0
|
||||
dkYdvart[ss1,ss2] = (-a*k2(xdist[ss1,ss2])-b*k4(ttdist[ss1,ss2])+c)*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
dkYdvarx[ss1,ss2] = (-a*k2(xdist[ss1,ss2])-b*k4(ttdist[ss1,ss2])+c)*vyt*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
|
||||
dkYdlent[ss1,ss2] = vyt*vyx*dkyydlyt(tdist[ss1,ss2],xdist[ss1,ss2])* (-a*k2(xdist[ss1,ss2])-b*k4(ttdist[ss1,ss2])+c)+\
|
||||
vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])*(-1)*b*dk4dlyt(ttdist[ss1,ss2])
|
||||
dkYdlenx[ss1,ss2] = vyt*vyx*dkyydlyx(tdist[ss1,ss2],xdist[ss1,ss2])*(-a*k2(xdist[ss1,ss2])-b*k4(ttdist[ss1,ss2])+c)+\
|
||||
vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])*(-a*dk2dlyx(xdist[ss1,ss2]))
|
||||
|
||||
self.a.gradient = np.sum(dka * dL_dK)
|
||||
|
||||
self.b.gradient = np.sum(dkb * dL_dK)
|
||||
|
||||
self.c.gradient = np.sum(dkc * dL_dK)
|
||||
|
||||
|
||||
self.variance_Yt.gradient = np.sum(dkYdvart * dL_dK) # Vy
|
||||
|
||||
self.variance_Yx.gradient = np.sum(dkYdvarx * dL_dK)
|
||||
|
||||
self.lengthscale_Yt.gradient = np.sum(dkYdlent*(-0.5*self.lengthscale_Yt**(-2)) * dL_dK) #ly np.sum(dktheta2*(-self.lengthscale_Y**(-2)) * dL_dK)
|
||||
|
||||
self.lengthscale_Yx.gradient = np.sum(dkYdlenx*(-0.5*self.lengthscale_Yx**(-2)) * dL_dK)
|
||||
|
||||
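Assumed usage (not part of the diff): ODE_st expects rows of the form [t, x, output_index], the index again selecting the latent force (0) or the observed field (1).

import numpy as np
import GPy

t = np.linspace(0., 1., 5)
x = np.linspace(0., 2., 5)
base = np.column_stack([t, x])
X = np.vstack([np.column_stack([base, np.zeros(5)]),
               np.column_stack([base, np.ones(5)])])
k = GPy.kern.ODE_st(input_dim=3)
print k.K(X).shape    # (10, 10)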
165
GPy/kern/_src/ODE_t.py
Normal file
@ -0,0 +1,165 @@
|
|||
from kern import Kern
|
||||
from ...core.parameterization import Param
|
||||
from ...core.parameterization.transformations import Logexp
|
||||
import numpy as np
|
||||
from independent_outputs import index_to_slices
|
||||
|
||||
|
||||
class ODE_t(Kern):
|
||||
|
||||
def __init__(self, input_dim, a=1., c=1., variance_Yt=3., lengthscale_Yt=1.5, ubias=1., active_dims=None, name='ode_t'):
|
||||
assert input_dim ==2, "only defined for 2 input dims"
|
||||
super(ODE_t, self).__init__(input_dim, active_dims, name)
|
||||
|
||||
self.variance_Yt = Param('variance_Yt', variance_Yt, Logexp())
|
||||
self.lengthscale_Yt = Param('lengthscale_Yt', lengthscale_Yt, Logexp())
|
||||
|
||||
self.a= Param('a', a, Logexp())
|
||||
self.c = Param('c', c, Logexp())
|
||||
self.ubias = Param('ubias', ubias, Logexp())
|
||||
self.link_parameters(self.a, self.c, self.variance_Yt, self.lengthscale_Yt, self.ubias)
|
||||
|
||||
def K(self, X, X2=None):
|
||||
"""Compute the covariance matrix between X and X2."""
|
||||
X,slices = X[:,:-1],index_to_slices(X[:,-1])
|
||||
if X2 is None:
|
||||
X2,slices2 = X,slices
|
||||
K = np.zeros((X.shape[0], X.shape[0]))
|
||||
else:
|
||||
X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
|
||||
K = np.zeros((X.shape[0], X2.shape[0]))
|
||||
|
||||
tdist = (X[:,0][:,None] - X2[:,0][None,:])**2
|
||||
ttdist = (X[:,0][:,None] - X2[:,0][None,:])
|
||||
|
||||
vyt = self.variance_Yt
|
||||
|
||||
lyt=1/(2*self.lengthscale_Yt)
|
||||
|
||||
a = -self.a
|
||||
c = self.c
|
||||
|
||||
kyy = lambda tdist: np.exp(-lyt*(tdist))
|
||||
|
||||
k1 = lambda tdist: (2*lyt - 4*lyt**2 *(tdist) )
|
||||
|
||||
k4 = lambda tdist: 2*lyt*(tdist)
|
||||
|
||||
for i, s1 in enumerate(slices):
|
||||
for j, s2 in enumerate(slices2):
|
||||
for ss1 in s1:
|
||||
for ss2 in s2:
|
||||
if i==0 and j==0:
|
||||
K[ss1,ss2] = vyt*kyy(tdist[ss1,ss2])
|
||||
elif i==0 and j==1:
|
||||
K[ss1,ss2] = (k4(ttdist[ss1,ss2])+1)*vyt*kyy(tdist[ss1,ss2])
|
||||
#K[ss1,ss2] = (2*lyt*(ttdist[ss1,ss2])+1)*vyt*kyy(tdist[ss1,ss2])
|
||||
elif i==1 and j==1:
|
||||
K[ss1,ss2] = ( k1(tdist[ss1,ss2]) + 1. )*vyt* kyy(tdist[ss1,ss2])+self.ubias
|
||||
else:
|
||||
K[ss1,ss2] = (-k4(ttdist[ss1,ss2])+1)*vyt*kyy(tdist[ss1,ss2])
|
||||
#K[ss1,ss2] = (-2*lyt*(ttdist[ss1,ss2])+1)*vyt*kyy(tdist[ss1,ss2])
|
||||
#stop
|
||||
return K
|
||||
|
||||
|
||||
def Kdiag(self, X):
|
||||
|
||||
vyt = self.variance_Yt
|
||||
lyt = 1./(2*self.lengthscale_Yt)
|
||||
|
||||
a = -self.a
|
||||
c = self.c
|
||||
|
||||
k1 = (2*lyt )*vyt
|
||||
|
||||
Kdiag = np.zeros(X.shape[0])
|
||||
slices = index_to_slices(X[:,-1])
|
||||
|
||||
for i, ss1 in enumerate(slices):
|
||||
for s1 in ss1:
|
||||
if i==0:
|
||||
Kdiag[s1]+= vyt
|
||||
elif i==1:
|
||||
#i=1
|
||||
Kdiag[s1]+= k1 + vyt+self.ubias
|
||||
#Kdiag[s1]+= Vu*Vy*(k1+k2+k3)
|
||||
else:
|
||||
raise ValueError, "invalid input/output index"
|
||||
|
||||
return Kdiag
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
"""derivative of the covariance matrix with respect to the parameters."""
|
||||
X,slices = X[:,:-1],index_to_slices(X[:,-1])
|
||||
if X2 is None:
|
||||
X2,slices2 = X,slices
|
||||
K = np.zeros((X.shape[0], X.shape[0]))
|
||||
else:
|
||||
X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
|
||||
|
||||
|
||||
vyt = self.variance_Yt
|
||||
|
||||
lyt = 1./(2*self.lengthscale_Yt)
|
||||
|
||||
tdist = (X[:,0][:,None] - X2[:,0][None,:])**2
|
||||
ttdist = (X[:,0][:,None] - X2[:,0][None,:])
|
||||
#rdist = [tdist,xdist]
|
||||
|
||||
rd=tdist.shape[0]
|
||||
|
||||
dka = np.zeros([rd,rd])
|
||||
dkc = np.zeros([rd,rd])
|
||||
dkYdvart = np.zeros([rd,rd])
|
||||
dkYdlent = np.zeros([rd,rd])
|
||||
|
||||
dkdubias = np.zeros([rd,rd])
|
||||
|
||||
kyy = lambda tdist: np.exp(-lyt*(tdist))
|
||||
dkyydlyt = lambda tdist: kyy(tdist)*(-tdist)
|
||||
|
||||
k1 = lambda tdist: (2*lyt - 4*lyt**2 * (tdist) )
|
||||
|
||||
k4 = lambda ttdist: 2*lyt*(ttdist)
|
||||
|
||||
dk1dlyt = lambda tdist: 2. - 4*2.*lyt*tdist
|
||||
|
||||
dk4dlyt = lambda ttdist: 2*(ttdist)
|
||||
|
||||
for i, s1 in enumerate(slices):
|
||||
for j, s2 in enumerate(slices2):
|
||||
for ss1 in s1:
|
||||
for ss2 in s2:
|
||||
if i==0 and j==0:
|
||||
dkYdvart[ss1,ss2] = kyy(tdist[ss1,ss2])
|
||||
dkYdlent[ss1,ss2] = vyt*dkyydlyt(tdist[ss1,ss2])
|
||||
dkdubias[ss1,ss2] = 0
|
||||
elif i==0 and j==1:
|
||||
dkYdvart[ss1,ss2] = (k4(ttdist[ss1,ss2])+1)*kyy(tdist[ss1,ss2])
|
||||
#dkYdvart[ss1,ss2] = ((2*lyt*ttdist[ss1,ss2])+1)*kyy(tdist[ss1,ss2])
|
||||
dkYdlent[ss1,ss2] = vyt*dkyydlyt(tdist[ss1,ss2])* (k4(ttdist[ss1,ss2])+1.)+\
|
||||
vyt*kyy(tdist[ss1,ss2])*(dk4dlyt(ttdist[ss1,ss2]))
|
||||
#dkYdlent[ss1,ss2] = vyt*dkyydlyt(tdist[ss1,ss2])* (2*lyt*(ttdist[ss1,ss2])+1.)+\
|
||||
#vyt*kyy(tdist[ss1,ss2])*(2*ttdist[ss1,ss2])
|
||||
dkdubias[ss1,ss2] = 0
|
||||
elif i==1 and j==1:
|
||||
dkYdvart[ss1,ss2] = (k1(tdist[ss1,ss2]) + 1. )* kyy(tdist[ss1,ss2])
|
||||
dkYdlent[ss1,ss2] = vyt*dkyydlyt(tdist[ss1,ss2])*( k1(tdist[ss1,ss2]) + 1. ) +\
|
||||
vyt*kyy(tdist[ss1,ss2])*dk1dlyt(tdist[ss1,ss2])
|
||||
dkdubias[ss1,ss2] = 1
|
||||
else:
|
||||
dkYdvart[ss1,ss2] = (-k4(ttdist[ss1,ss2])+1)*kyy(tdist[ss1,ss2])
|
||||
#dkYdvart[ss1,ss2] = (-2*lyt*(ttdist[ss1,ss2])+1)*kyy(tdist[ss1,ss2])
|
||||
dkYdlent[ss1,ss2] = vyt*dkyydlyt(tdist[ss1,ss2])* (-k4(ttdist[ss1,ss2])+1.)+\
|
||||
vyt*kyy(tdist[ss1,ss2])*(-dk4dlyt(ttdist[ss1,ss2]) )
|
||||
dkdubias[ss1,ss2] = 0
|
||||
#dkYdlent[ss1,ss2] = vyt*dkyydlyt(tdist[ss1,ss2])* (-2*lyt*(ttdist[ss1,ss2])+1.)+\
|
||||
#vyt*kyy(tdist[ss1,ss2])*(-2)*(ttdist[ss1,ss2])
|
||||
|
||||
|
||||
self.variance_Yt.gradient = np.sum(dkYdvart * dL_dK)
|
||||
|
||||
self.lengthscale_Yt.gradient = np.sum(dkYdlent*(-0.5*self.lengthscale_Yt**(-2)) * dL_dK)
|
||||
|
||||
self.ubias.gradient = np.sum(dkdubias * dL_dK)
|
||||
0
GPy/kern/_src/__init__.py
Normal file
188
GPy/kern/_src/add.py
Normal file
|
|
@ -0,0 +1,188 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
import itertools
|
||||
from ...util.caching import Cache_this
|
||||
from kern import CombinationKernel
|
||||
|
||||
class Add(CombinationKernel):
|
||||
"""
|
||||
Add given list of kernels together.
|
||||
Propagates gradients through.
|
||||
|
||||
This kernel will take over the active dims of its subkernels passed in.
|
||||
"""
|
||||
def __init__(self, subkerns, name='add'):
|
||||
for i, kern in enumerate(subkerns[:]):
|
||||
if isinstance(kern, Add):
|
||||
del subkerns[i]
|
||||
for part in kern.parts[::-1]:
|
||||
kern.unlink_parameter(part)
|
||||
subkerns.insert(i, part)
|
||||
|
||||
super(Add, self).__init__(subkerns, name)
|
||||
|
||||
@Cache_this(limit=2, force_kwargs=['which_parts'])
|
||||
def K(self, X, X2=None, which_parts=None):
|
||||
"""
|
||||
Add all kernels together.
|
||||
If a list of parts (of this kernel!) `which_parts` is given, only
|
||||
the parts of the list are taken to compute the covariance.
|
||||
"""
|
||||
if which_parts is None:
|
||||
which_parts = self.parts
|
||||
elif not isinstance(which_parts, (list, tuple)):
|
||||
# if only one part is given
|
||||
which_parts = [which_parts]
|
||||
return reduce(np.add, (p.K(X, X2) for p in which_parts))
|
||||
|
||||
@Cache_this(limit=2, force_kwargs=['which_parts'])
|
||||
def Kdiag(self, X, which_parts=None):
|
||||
if which_parts is None:
|
||||
which_parts = self.parts
|
||||
elif not isinstance(which_parts, (list, tuple)):
|
||||
# if only one part is given
|
||||
which_parts = [which_parts]
|
||||
return reduce(np.add, (p.Kdiag(X) for p in which_parts))
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
[p.update_gradients_full(dL_dK, X, X2) for p in self.parts if not p.is_fixed]
|
||||
|
||||
def update_gradients_diag(self, dL_dK, X):
|
||||
[p.update_gradients_diag(dL_dK, X) for p in self.parts]
|
||||
|
||||
def gradients_X(self, dL_dK, X, X2=None):
|
||||
"""Compute the gradient of the objective function with respect to X.
|
||||
|
||||
:param dL_dK: An array of gradients of the objective function with respect to the covariance function.
|
||||
:type dL_dK: np.ndarray (num_samples x num_inducing)
|
||||
:param X: Observed data inputs
|
||||
:type X: np.ndarray (num_samples x input_dim)
|
||||
:param X2: Observed data inputs (optional, defaults to X)
|
||||
:type X2: np.ndarray (num_inducing x input_dim)"""
|
||||
|
||||
target = np.zeros(X.shape)
|
||||
[target.__iadd__(p.gradients_X(dL_dK, X, X2)) for p in self.parts]
|
||||
return target
|
||||
|
||||
def gradients_X_diag(self, dL_dKdiag, X):
|
||||
target = np.zeros(X.shape)
|
||||
[target.__iadd__(p.gradients_X_diag(dL_dKdiag, X)) for p in self.parts]
|
||||
return target
|
||||
|
||||
@Cache_this(limit=2, force_kwargs=['which_parts'])
|
||||
def psi0(self, Z, variational_posterior):
|
||||
return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts))
|
||||
|
||||
@Cache_this(limit=2, force_kwargs=['which_parts'])
|
||||
def psi1(self, Z, variational_posterior):
|
||||
return reduce(np.add, (p.psi1(Z, variational_posterior) for p in self.parts))
|
||||
|
||||
@Cache_this(limit=2, force_kwargs=['which_parts'])
|
||||
def psi2(self, Z, variational_posterior):
|
||||
psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts))
|
||||
#return psi2
|
||||
# compute the "cross" terms
|
||||
from static import White, Bias
|
||||
from rbf import RBF
|
||||
#from rbf_inv import RBFInv
|
||||
from linear import Linear
|
||||
#ffrom fixed import Fixed
|
||||
|
||||
for p1, p2 in itertools.combinations(self.parts, 2):
|
||||
# i1, i2 = p1.active_dims, p2.active_dims
|
||||
# white doesn't combine with anything
|
||||
if isinstance(p1, White) or isinstance(p2, White):
|
||||
pass
|
||||
# rbf X bias
|
||||
#elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)):
|
||||
elif isinstance(p1, Bias) and isinstance(p2, (RBF, Linear)):
|
||||
tmp = p2.psi1(Z, variational_posterior).sum(axis=0)
|
||||
psi2 += p1.variance * (tmp[:,None]+tmp[None,:]) #(tmp[:, :, None] + tmp[:, None, :])
|
||||
#elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
|
||||
elif isinstance(p2, Bias) and isinstance(p1, (RBF, Linear)):
|
||||
tmp = p1.psi1(Z, variational_posterior).sum(axis=0)
|
||||
psi2 += p2.variance * (tmp[:,None]+tmp[None,:]) #(tmp[:, :, None] + tmp[:, None, :])
|
||||
elif isinstance(p2, (RBF, Linear)) and isinstance(p1, (RBF, Linear)):
|
||||
assert np.intersect1d(p1.active_dims, p2.active_dims).size == 0, "only non overlapping kernel dimensions allowed so far"
|
||||
tmp1 = p1.psi1(Z, variational_posterior)
|
||||
tmp2 = p2.psi1(Z, variational_posterior)
|
||||
psi2 += np.einsum('nm,no->mo',tmp1,tmp2)+np.einsum('nm,no->mo',tmp2,tmp1)
|
||||
#(tmp1[:, :, None] * tmp2[:, None, :]) + (tmp2[:, :, None] * tmp1[:, None, :])
|
||||
else:
|
||||
raise NotImplementedError, "psi2 cannot be computed for this kernel"
|
||||
return psi2
|
||||
|
||||
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
from static import White, Bias
|
||||
for p1 in self.parts:
|
||||
#compute the effective dL_dpsi1. Extra terms appear because of the cross terms in psi2!
|
||||
eff_dL_dpsi1 = dL_dpsi1.copy()
|
||||
for p2 in self.parts:
|
||||
if p2 is p1:
|
||||
continue
|
||||
if isinstance(p2, White):
|
||||
continue
|
||||
elif isinstance(p2, Bias):
|
||||
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2.
|
||||
else:# np.setdiff1d(p1.active_dims, ar2, assume_unique): # TODO: Careful, not correct for overlapping active_dims
|
||||
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2.
|
||||
p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
|
||||
|
||||
def gradients_Z_expectations(self, dL_psi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
from static import White, Bias
|
||||
target = np.zeros(Z.shape)
|
||||
for p1 in self.parts:
|
||||
#compute the effective dL_dpsi1. Extra terms appear because of the cross terms in psi2!
|
||||
eff_dL_dpsi1 = dL_dpsi1.copy()
|
||||
for p2 in self.parts:
|
||||
if p2 is p1:
|
||||
continue
|
||||
if isinstance(p2, White):
|
||||
continue
|
||||
elif isinstance(p2, Bias):
|
||||
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2.
|
||||
else:
|
||||
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2.
|
||||
target += p1.gradients_Z_expectations(dL_psi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
|
||||
return target
|
||||
|
||||
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
from static import White, Bias
|
||||
target_grads = [np.zeros(v.shape) for v in variational_posterior.parameters]
|
||||
for p1 in self.parameters:
|
||||
#compute the effective dL_dpsi1. Extra terms appear because of the cross terms in psi2!
|
||||
eff_dL_dpsi1 = dL_dpsi1.copy()
|
||||
for p2 in self.parameters:
|
||||
if p2 is p1:
|
||||
continue
|
||||
if isinstance(p2, White):
|
||||
continue
|
||||
elif isinstance(p2, Bias):
|
||||
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2.
|
||||
else:
|
||||
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2.
|
||||
grads = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
|
||||
[np.add(target_grads[i],grads[i],target_grads[i]) for i in xrange(len(grads))]
|
||||
return target_grads
|
||||
|
||||
def add(self, other):
|
||||
if isinstance(other, Add):
|
||||
other_params = other.parameters[:]
|
||||
for p in other_params:
|
||||
other.unlink_parameter(p)
|
||||
self.link_parameters(*other_params)
|
||||
else:
|
||||
self.link_parameter(other)
|
||||
self.input_dim, self.active_dims = self.get_input_dim_active_dims(self.parts)
|
||||
return self
|
||||
|
||||
def input_sensitivity(self, summarize=True):
|
||||
if summarize:
|
||||
return reduce(np.add, [k.input_sensitivity(summarize) for k in self.parts])
|
||||
else:
|
||||
i_s = np.zeros((len(self.parts), self.input_dim))
|
||||
from operator import setitem
|
||||
[setitem(i_s, (i, Ellipsis), k.input_sensitivity(summarize)) for i, k in enumerate(self.parts)]
|
||||
return i_s
|
||||
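A minimal usage sketch for the Add kernel (assuming this GPy tree is importable and exposes RBF and Bias under GPy.kern; the data here are made up):

import numpy as np
import GPy

X = np.random.randn(10, 1)
k1 = GPy.kern.RBF(1)    # assumed constructor names
k2 = GPy.kern.Bias(1)
k = k1 + k2             # builds an Add kernel via Kern.__add__

# the summed covariance is the elementwise sum of the parts
assert np.allclose(k.K(X), k1.K(X) + k2.K(X))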
50
GPy/kern/_src/brownian.py
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from kern import Kern
|
||||
from ...core.parameterization import Param
|
||||
from ...core.parameterization.transformations import Logexp
|
||||
import numpy as np
|
||||
|
||||
class Brownian(Kern):
|
||||
"""
|
||||
Brownian motion in 1D only.
|
||||
|
||||
Negative times are treated as a separate (backwards!) Brownian motion.
|
||||
|
||||
:param input_dim: the number of input dimensions
|
||||
:type input_dim: int
|
||||
:param variance:
|
||||
:type variance: float
|
||||
"""
|
||||
def __init__(self, input_dim=1, variance=1., active_dims=None, name='Brownian'):
|
||||
assert input_dim==1, "Brownian motion in 1D only"
|
||||
super(Brownian, self).__init__(input_dim, active_dims, name)
|
||||
|
||||
self.variance = Param('variance', variance, Logexp())
|
||||
self.link_parameters(self.variance)
|
||||
|
||||
def K(self,X,X2=None):
|
||||
if X2 is None:
|
||||
X2 = X
|
||||
return self.variance*np.where(np.sign(X)==np.sign(X2.T),np.fmin(np.abs(X),np.abs(X2.T)), 0.)
|
||||
|
||||
def Kdiag(self,X):
|
||||
return self.variance*np.abs(X.flatten())
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
if X2 is None:
|
||||
X2 = X
|
||||
self.variance.gradient = np.sum(dL_dK * np.where(np.sign(X)==np.sign(X2.T),np.fmin(np.abs(X),np.abs(X2.T)), 0.))
|
||||
|
||||
#def update_gradients_diag(self, dL_dKdiag, X):
|
||||
#self.variance.gradient = np.dot(np.abs(X.flatten()), dL_dKdiag)
|
||||
|
||||
#def gradients_X(self, dL_dK, X, X2=None):
|
||||
#if X2 is None:
|
||||
#return np.sum(self.variance*dL_dK*np.abs(X),1)[:,None]
|
||||
#else:
|
||||
#return np.sum(np.where(np.logical_and(np.abs(X)<np.abs(X2.T), np.sign(X)==np.sign(X2)), self.variance*dL_dK,0.),1)[:,None]
|
||||
|
||||
|
||||
|
||||
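A numpy-only sketch of the covariance rule implemented above; the variance and inputs are made-up values:

import numpy as np

variance = 2.0
X = np.array([[-1.5], [0.5], [2.0]])

# same rule as Brownian.K: min(|x|, |x'|) where the signs agree, zero otherwise
K = variance * np.where(np.sign(X) == np.sign(X.T),
                        np.fmin(np.abs(X), np.abs(X.T)), 0.)

# the diagonal agrees with Kdiag: variance * |x|
assert np.allclose(np.diag(K), variance * np.abs(X.flatten()))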
174
GPy/kern/_src/coregionalize.py
Normal file
|
|
@ -0,0 +1,174 @@
|
|||
# Copyright (c) 2012, James Hensman and Ricardo Andrade
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from kern import Kern
|
||||
import numpy as np
|
||||
from scipy import weave
|
||||
from ...core.parameterization import Param
|
||||
from ...core.parameterization.transformations import Logexp
|
||||
from ...util.config import config # for assesing whether to use weave
|
||||
|
||||
class Coregionalize(Kern):
|
||||
"""
|
||||
Covariance function for intrinsic/linear coregionalization models
|
||||
|
||||
This covariance has the form:
|
||||
.. math::
|
||||
\mathbf{B} = \mathbf{W}\mathbf{W}^\top + \text{diag}(kappa)
|
||||
|
||||
An intrinsic/linear coregionalization covariance function of the form:
|
||||
.. math::
|
||||
|
||||
k_2(x, y)=\mathbf{B} k(x, y)
|
||||
|
||||
it is obtained as the tensor product between a covariance function
|
||||
k(x, y) and B.
|
||||
|
||||
:param output_dim: number of outputs to coregionalize
|
||||
:type output_dim: int
|
||||
:param rank: number of columns of the W matrix (this parameter is ignored if parameter W is not None)
|
||||
:type rank: int
|
||||
:param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalization matrix B
|
||||
:type W: numpy array of dimensionality (output_dim, rank)
|
||||
:param kappa: a vector which allows the outputs to behave independently
|
||||
:type kappa: numpy array of dimensionality (output_dim, )
|
||||
|
||||
.. note:: see coregionalization examples in GPy.examples.regression for some usage.
|
||||
"""
|
||||
def __init__(self, input_dim, output_dim, rank=1, W=None, kappa=None, active_dims=None, name='coregion'):
|
||||
super(Coregionalize, self).__init__(input_dim, active_dims, name=name)
|
||||
self.output_dim = output_dim
|
||||
self.rank = rank
|
||||
if self.rank>output_dim:
|
||||
print("Warning: Unusual choice of rank, it should normally be less than the output_dim.")
|
||||
if W is None:
|
||||
W = 0.5*np.random.randn(self.output_dim, self.rank)/np.sqrt(self.rank)
|
||||
else:
|
||||
assert W.shape==(self.output_dim, self.rank)
|
||||
self.W = Param('W', W)
|
||||
if kappa is None:
|
||||
kappa = 0.5*np.ones(self.output_dim)
|
||||
else:
|
||||
assert kappa.shape==(self.output_dim, )
|
||||
self.kappa = Param('kappa', kappa, Logexp())
|
||||
self.link_parameters(self.W, self.kappa)
|
||||
|
||||
def parameters_changed(self):
|
||||
self.B = np.dot(self.W, self.W.T) + np.diag(self.kappa)
|
||||
|
||||
def K(self, X, X2=None):
|
||||
if config.getboolean('weave', 'working'):
|
||||
try:
|
||||
return self._K_weave(X, X2)
|
||||
except:
|
||||
print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
|
||||
config.set('weave', 'working', 'False')
|
||||
return self._K_numpy(X, X2)
|
||||
else:
|
||||
return self._K_numpy(X, X2)
|
||||
|
||||
|
||||
def _K_numpy(self, X, X2=None):
|
||||
index = np.asarray(X, dtype=np.int)
|
||||
if X2 is None:
|
||||
return self.B[index,index.T]
|
||||
else:
|
||||
index2 = np.asarray(X2, dtype=np.int)
|
||||
return self.B[index,index2.T]
|
||||
|
||||
def _K_weave(self, X, X2=None):
|
||||
"""compute the kernel function using scipy.weave"""
|
||||
index = np.asarray(X, dtype=np.int)
|
||||
|
||||
if X2 is None:
|
||||
target = np.empty((X.shape[0], X.shape[0]), dtype=np.float64)
|
||||
code="""
|
||||
for(int i=0;i<N; i++){
|
||||
target[i+i*N] = B[index[i]+output_dim*index[i]];
|
||||
for(int j=0; j<i; j++){
|
||||
target[j+i*N] = B[index[i]+output_dim*index[j]];
|
||||
target[i+j*N] = target[j+i*N];
|
||||
}
|
||||
}
|
||||
"""
|
||||
N, B, output_dim = index.size, self.B, self.output_dim
|
||||
weave.inline(code, ['target', 'index', 'N', 'B', 'output_dim'])
|
||||
else:
|
||||
index2 = np.asarray(X2, dtype=np.int)
|
||||
target = np.empty((X.shape[0], X2.shape[0]), dtype=np.float64)
|
||||
code="""
|
||||
for(int i=0;i<num_inducing; i++){
|
||||
for(int j=0; j<N; j++){
|
||||
target[i+j*num_inducing] = B[output_dim*index[j]+index2[i]];
|
||||
}
|
||||
}
|
||||
"""
|
||||
N, num_inducing, B, output_dim = index.size, index2.size, self.B, self.output_dim
|
||||
weave.inline(code, ['target', 'index', 'index2', 'N', 'num_inducing', 'B', 'output_dim'])
|
||||
return target
|
||||
|
||||
|
||||
def Kdiag(self, X):
|
||||
return np.diag(self.B)[np.asarray(X, dtype=np.int).flatten()]
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
index = np.asarray(X, dtype=np.int)
|
||||
if X2 is None:
|
||||
index2 = index
|
||||
else:
|
||||
index2 = np.asarray(X2, dtype=np.int)
|
||||
|
||||
#attempt to use weave for a nasty double indexing loop: fall back to numpy
|
||||
if config.getboolean('weave', 'working'):
|
||||
try:
|
||||
dL_dK_small = self._gradient_reduce_weave(dL_dK, index, index2)
|
||||
except:
|
||||
print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
|
||||
config.set('weave', 'working', 'False')
|
||||
dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)
|
||||
else:
|
||||
dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)
|
||||
|
||||
|
||||
|
||||
dkappa = np.diag(dL_dK_small)
|
||||
dL_dK_small += dL_dK_small.T
|
||||
dW = (self.W[:, None, :]*dL_dK_small[:, :, None]).sum(0)
|
||||
|
||||
self.W.gradient = dW
|
||||
self.kappa.gradient = dkappa
|
||||
|
||||
def _gradient_reduce_weave(self, dL_dK, index, index2):
|
||||
dL_dK_small = np.zeros_like(self.B)
|
||||
code="""
|
||||
for(int i=0; i<num_inducing; i++){
|
||||
for(int j=0; j<N; j++){
|
||||
dL_dK_small[index[j] + output_dim*index2[i]] += dL_dK[i+j*num_inducing];
|
||||
}
|
||||
}
|
||||
"""
|
||||
N, num_inducing, output_dim = index.size, index2.size, self.output_dim
|
||||
weave.inline(code, ['N', 'num_inducing', 'output_dim', 'dL_dK', 'dL_dK_small', 'index', 'index2'])
|
||||
return dL_dK_small
|
||||
|
||||
def _gradient_reduce_numpy(self, dL_dK, index, index2):
|
||||
index, index2 = index[:,0], index2[:,0]
|
||||
dL_dK_small = np.zeros_like(self.B)
|
||||
for i in range(self.output_dim):
|
||||
tmp1 = dL_dK[index==i]
|
||||
for j in range(self.output_dim):
|
||||
dL_dK_small[j,i] = tmp1[:,index2==j].sum()
|
||||
return dL_dK_small
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
index = np.asarray(X, dtype=np.int).flatten()
|
||||
dL_dKdiag_small = np.array([dL_dKdiag[index==i].sum() for i in xrange(self.output_dim)])
|
||||
self.W.gradient = 2.*self.W*dL_dKdiag_small[:, None]
|
||||
self.kappa.gradient = dL_dKdiag_small
|
||||
|
||||
def gradients_X(self, dL_dK, X, X2=None):
|
||||
return np.zeros(X.shape)
|
||||
|
||||
def gradients_X_diag(self, dL_dKdiag, X):
|
||||
return np.zeros(X.shape)
|
||||
|
||||
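A small numpy sketch of how the coregionalization matrix B is assembled and then indexed by the integer-valued inputs, mirroring parameters_changed and _K_numpy above (W and kappa are made-up values):

import numpy as np

output_dim, rank = 3, 1
W = 0.5 * np.random.randn(output_dim, rank)
kappa = 0.5 * np.ones(output_dim)
B = np.dot(W, W.T) + np.diag(kappa)            # as in parameters_changed

X = np.array([[0], [0], [1], [2]])             # inputs are output indices
index = np.asarray(X, dtype=int)
K = B[index, index.T]                          # as in _K_numpy with X2 None

assert K.shape == (4, 4)
assert np.allclose(np.diag(K), np.diag(B)[index.flatten()])   # matches Kdiag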
202
GPy/kern/_src/independent_outputs.py
Normal file
|
|
@ -0,0 +1,202 @@
|
|||
# Copyright (c) 2012, James Hensman
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
from kern import Kern, CombinationKernel
|
||||
import numpy as np
|
||||
import itertools
|
||||
|
||||
def index_to_slices(index):
|
||||
"""
|
||||
take a numpy array of integers (index) and return a nested list of slices such that the slices describe the start, stop points for each integer in the index.
|
||||
|
||||
e.g.
|
||||
>>> index = np.asarray([0,0,0,1,1,1,2,2,2])
|
||||
returns
|
||||
>>> [[slice(0,3,None)],[slice(3,6,None)],[slice(6,9,None)]]
|
||||
|
||||
or, a more complicated example
|
||||
>>> index = np.asarray([0,0,1,1,0,2,2,2,1,1])
|
||||
returns
|
||||
>>> [[slice(0,2,None),slice(4,5,None)],[slice(2,4,None),slice(8,10,None)],[slice(5,8,None)]]
|
||||
"""
|
||||
if len(index)==0:
|
||||
return []
|
||||
|
||||
#construct the return structure
|
||||
ind = np.asarray(index,dtype=np.int)
|
||||
ret = [[] for i in range(ind.max()+1)]
|
||||
|
||||
#find the switchpoints
|
||||
ind_ = np.hstack((ind,ind[0]+ind[-1]+1))
|
||||
switchpoints = np.nonzero(ind_ - np.roll(ind_,+1))[0]
|
||||
|
||||
[ret[ind_i].append(slice(*indexes_i)) for ind_i,indexes_i in zip(ind[switchpoints[:-1]],zip(switchpoints,switchpoints[1:]))]
|
||||
return ret
|
||||
|
||||
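A short check of the behaviour documented above (assuming this module is importable as GPy.kern._src.independent_outputs):

import numpy as np
from GPy.kern._src.independent_outputs import index_to_slices

index = np.asarray([0, 0, 1, 1, 0, 2, 2, 2, 1, 1])
slices = index_to_slices(index)
# each output index gets the list of contiguous runs it occupies
assert slices[0] == [slice(0, 2), slice(4, 5)]
assert slices[1] == [slice(2, 4), slice(8, 10)]
assert slices[2] == [slice(5, 8)]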
class IndependentOutputs(CombinationKernel):
|
||||
"""
|
||||
A kernel which can represent several independent functions. This kernel
|
||||
'switches off' parts of the matrix where the output indexes are different.
|
||||
|
||||
The index of the functions is given by the last column in the input X; the
|
||||
rest of the columns of X are passed to the underlying kernel for
|
||||
computation (in blocks).
|
||||
|
||||
:param kernels: either a kernel, or list of kernels to work with. If it is
|
||||
a list of kernels, the indices in the index_dim column index the kernels you gave.
|
||||
"""
|
||||
def __init__(self, kernels, index_dim=-1, name='independ'):
|
||||
assert isinstance(index_dim, int), "IndependentOutputs kernel is only defined with one input dimension being the index"
|
||||
if not isinstance(kernels, list):
|
||||
self.single_kern = True
|
||||
self.kern = kernels
|
||||
kernels = [kernels]
|
||||
else:
|
||||
self.single_kern = False
|
||||
self.kern = kernels
|
||||
super(IndependentOutputs, self).__init__(kernels=kernels, extra_dims=[index_dim], name=name)
|
||||
self.index_dim = index_dim
|
||||
|
||||
def K(self,X ,X2=None):
|
||||
slices = index_to_slices(X[:,self.index_dim])
|
||||
kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
|
||||
if X2 is None:
|
||||
target = np.zeros((X.shape[0], X.shape[0]))
|
||||
[[target.__setitem__((s,ss), kern.K(X[s,:], X[ss,:])) for s,ss in itertools.product(slices_i, slices_i)] for kern, slices_i in zip(kerns, slices)]
|
||||
else:
|
||||
slices2 = index_to_slices(X2[:,self.index_dim])
|
||||
target = np.zeros((X.shape[0], X2.shape[0]))
|
||||
[[target.__setitem__((s,s2), kern.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices_i, slices_j)] for kern, slices_i,slices_j in zip(kerns, slices,slices2)]
|
||||
return target
|
||||
|
||||
def Kdiag(self,X):
|
||||
slices = index_to_slices(X[:,self.index_dim])
|
||||
kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
|
||||
target = np.zeros(X.shape[0])
|
||||
[[np.copyto(target[s], kern.Kdiag(X[s])) for s in slices_i] for kern, slices_i in zip(kerns, slices)]
|
||||
return target
|
||||
|
||||
def update_gradients_full(self,dL_dK,X,X2=None):
|
||||
slices = index_to_slices(X[:,self.index_dim])
|
||||
if self.single_kern:
|
||||
target = np.zeros(self.kern.size)
|
||||
kerns = itertools.repeat(self.kern)
|
||||
else:
|
||||
kerns = self.kern
|
||||
target = [np.zeros(kern.size) for kern, _ in zip(kerns, slices)]
|
||||
def collate_grads(kern, i, dL, X, X2):
|
||||
kern.update_gradients_full(dL,X,X2)
|
||||
if self.single_kern: target[:] += kern.gradient
|
||||
else: target[i][:] += kern.gradient
|
||||
if X2 is None:
|
||||
[[collate_grads(kern, i, dL_dK[s,ss], X[s], X[ss]) for s,ss in itertools.product(slices_i, slices_i)] for i,(kern,slices_i) in enumerate(zip(kerns,slices))]
|
||||
else:
|
||||
slices2 = index_to_slices(X2[:,self.index_dim])
|
||||
[[[collate_grads(kern, i, dL_dK[s,s2],X[s],X2[s2]) for s in slices_i] for s2 in slices_j] for i,(kern,slices_i,slices_j) in enumerate(zip(kerns,slices,slices2))]
|
||||
if self.single_kern: kern.gradient = target
|
||||
else:[kern.gradient.__setitem__(Ellipsis, target[i]) for i, [kern, _] in enumerate(zip(kerns, slices))]
|
||||
|
||||
def gradients_X(self,dL_dK, X, X2=None):
|
||||
target = np.zeros(X.shape)
|
||||
kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
|
||||
if X2 is None:
|
||||
# TODO: make use of index_to_slices
|
||||
values = np.unique(X[:,self.index_dim])
|
||||
slices = [X[:,self.index_dim]==i for i in values]
|
||||
[target.__setitem__(s, kern.gradients_X(dL_dK[s,s],X[s],None))
|
||||
for kern, s in zip(kerns, slices)]
|
||||
#slices = index_to_slices(X[:,self.index_dim])
|
||||
#[[np.add(target[s], kern.gradients_X(dL_dK[s,s], X[s]), out=target[s])
|
||||
# for s in slices_i] for kern, slices_i in zip(kerns, slices)]
|
||||
#import ipdb;ipdb.set_trace()
|
||||
#[[(np.add(target[s ], kern.gradients_X(dL_dK[s ,ss],X[s ], X[ss]), out=target[s ]),
|
||||
# np.add(target[ss], kern.gradients_X(dL_dK[ss,s ],X[ss], X[s ]), out=target[ss]))
|
||||
# for s, ss in itertools.combinations(slices_i, 2)] for kern, slices_i in zip(kerns, slices)]
|
||||
else:
|
||||
values = np.unique(X[:,self.index_dim])
|
||||
slices = [X[:,self.index_dim]==i for i in values]
|
||||
slices2 = [X2[:,self.index_dim]==i for i in values]
|
||||
[target.__setitem__(s, kern.gradients_X(dL_dK[s, :][:, s2],X[s],X2[s2]))
|
||||
for kern, s, s2 in zip(kerns, slices, slices2)]
|
||||
# TODO: make work with index_to_slices
|
||||
#slices = index_to_slices(X[:,self.index_dim])
|
||||
#slices2 = index_to_slices(X2[:,self.index_dim])
|
||||
#[[target.__setitem__(s, target[s] + kern.gradients_X(dL_dK[s,s2], X[s], X2[s2])) for s, s2 in itertools.product(slices_i, slices_j)] for kern, slices_i,slices_j in zip(kerns, slices,slices2)]
|
||||
return target
|
||||
|
||||
def gradients_X_diag(self, dL_dKdiag, X):
|
||||
slices = index_to_slices(X[:,self.index_dim])
|
||||
kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
|
||||
target = np.zeros(X.shape)
|
||||
[[target.__setitem__(s, kern.gradients_X_diag(dL_dKdiag[s],X[s])) for s in slices_i] for kern, slices_i in zip(kerns, slices)]
|
||||
return target
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
slices = index_to_slices(X[:,self.index_dim])
|
||||
kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
|
||||
if self.single_kern: target = np.zeros(self.kern.size)
|
||||
else: target = [np.zeros(kern.size) for kern, _ in zip(kerns, slices)]
|
||||
def collate_grads(kern, i, dL, X):
|
||||
kern.update_gradients_diag(dL,X)
|
||||
if self.single_kern: target[:] += kern.gradient
|
||||
else: target[i][:] += kern.gradient
|
||||
[[collate_grads(kern, i, dL_dKdiag[s], X[s,:]) for s in slices_i] for i, (kern, slices_i) in enumerate(zip(kerns, slices))]
|
||||
if self.single_kern: kern.gradient = target
|
||||
else:[kern.gradient.__setitem__(Ellipsis, target[i]) for i, [kern, _] in enumerate(zip(kerns, slices))]
|
||||
|
||||
class Hierarchical(CombinationKernel):
|
||||
"""
|
||||
A kernel which can represent a simple hierarchical model.
|
||||
|
||||
See Hensman et al 2013, "Hierarchical Bayesian modelling of gene expression time
|
||||
series across irregularly sampled replicates and clusters"
|
||||
http://www.biomedcentral.com/1471-2105/14/252
|
||||
|
||||
To construct this kernel, you must pass a list of kernels. The first kernel
|
||||
will be assumed to be the 'base' kernel, and will be computed everywhere.
|
||||
For every additional kernel, we assume another layer in the hierarchy, with
|
||||
a corresponding column of the input matrix which indexes which function the
|
||||
data are in at that level.
|
||||
|
||||
For more, see the ipython notebook documentation on Hierarchical
|
||||
covariances.
|
||||
"""
|
||||
def __init__(self, kernels, name='hierarchy'):
|
||||
assert all([k.input_dim==kernels[0].input_dim for k in kernels])
|
||||
assert len(kernels) > 1
|
||||
self.levels = len(kernels) -1
|
||||
input_max = max([k.input_dim for k in kernels])
|
||||
super(Hierarchical, self).__init__(kernels=kernels, extra_dims = range(input_max, input_max + len(kernels)-1), name=name)
|
||||
|
||||
def K(self,X ,X2=None):
|
||||
K = self.parts[0].K(X, X2) # compute 'base' kern everywhere
|
||||
slices = [index_to_slices(X[:,i]) for i in self.extra_dims]
|
||||
if X2 is None:
|
||||
[[[np.add(K[s,s], k.K(X[s], None), K[s, s]) for s in slices_i] for slices_i in slices_k] for k, slices_k in zip(self.parts[1:], slices)]
|
||||
else:
|
||||
slices2 = [index_to_slices(X2[:,i]) for i in self.extra_dims]
|
||||
[[[np.add(K[s,ss], k.K(X[s], X2[ss]), K[s, ss]) for s,ss in zip(slices_i, slices_j)] for slices_i, slices_j in zip(slices_k1, slices_k2)] for k, slices_k1, slices_k2 in zip(self.parts[1:], slices, slices2)]
|
||||
return K
|
||||
|
||||
def Kdiag(self,X):
|
||||
return np.diag(self.K(X))
|
||||
|
||||
def gradients_X(self, dL_dK, X, X2=None):
|
||||
raise NotImplementedError
|
||||
|
||||
def update_gradients_full(self,dL_dK,X,X2=None):
|
||||
slices = [index_to_slices(X[:,i]) for i in self.extra_dims]
|
||||
if X2 is None:
|
||||
self.parts[0].update_gradients_full(dL_dK, X, None)
|
||||
for k, slices_k in zip(self.parts[1:], slices):
|
||||
target = np.zeros(k.size)
|
||||
def collate_grads(dL, X, X2, target):
|
||||
k.update_gradients_full(dL,X,X2)
|
||||
target += k.gradient
|
||||
[[collate_grads(dL_dK[s,s], X[s], None, target) for s in slices_i] for slices_i in slices_k]
|
||||
k.gradient[:] = target
|
||||
else:
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
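A sketch of the input layout these combination kernels expect: the index column (by default the last column of X) carries the integer output index, and the remaining columns are handed to the wrapped kernel block by block (kernel names are assumed to be exposed under GPy.kern):

import numpy as np
import GPy

# two outputs observed at their own inputs; last column is the output index
t0 = np.linspace(0, 1, 5)[:, None]
t1 = np.linspace(0, 1, 4)[:, None]
X = np.vstack([np.hstack([t0, np.zeros_like(t0)]),
               np.hstack([t1, np.ones_like(t1)])])

k = GPy.kern.IndependentOutputs(GPy.kern.RBF(1))
K = k.K(X)
# cross-covariances between different output indices are switched off
assert np.allclose(K[:5, 5:], 0.)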
280
GPy/kern/_src/kern.py
Normal file
|
|
@ -0,0 +1,280 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import sys
|
||||
import numpy as np
|
||||
from ...core.parameterization.parameterized import Parameterized
|
||||
from kernel_slice_operations import KernCallsViaSlicerMeta
|
||||
from ...util.caching import Cache_this
|
||||
from GPy.core.parameterization.observable_array import ObsAr
|
||||
|
||||
|
||||
|
||||
class Kern(Parameterized):
|
||||
#===========================================================================
|
||||
# This adds input slice support. The rather ugly code for slicing can be
|
||||
# found in kernel_slice_operations
|
||||
__metaclass__ = KernCallsViaSlicerMeta
|
||||
#===========================================================================
|
||||
_support_GPU=False
|
||||
def __init__(self, input_dim, active_dims, name, useGPU=False, *a, **kw):
|
||||
"""
|
||||
The base class for a kernel: a positive definite function
|
||||
which forms a covariance function (kernel).
|
||||
|
||||
input_dim:
|
||||
|
||||
is the number of dimensions to work on. Make sure to give the
|
||||
tight dimensionality of inputs.
|
||||
You most likely want this to be the integer telling the number of
|
||||
input dimensions of the kernel.
|
||||
If this is not an integer (!) we will work on the whole input matrix X,
|
||||
and not check whether dimensions match or not (!).
|
||||
|
||||
active_dims:
|
||||
|
||||
is the active_dimensions of inputs X we will work on.
|
||||
All kernels will get sliced Xes as inputs, if active_dims is not None
|
||||
Only positive integers are allowed in active_dims!
|
||||
if active_dims is None, slicing is switched off and all X will be passed through as given.
|
||||
|
||||
:param int input_dim: the number of input dimensions to the function
|
||||
:param array-like|None active_dims: list of indices on which dimensions this kernel works on, or none if no slicing
|
||||
|
||||
Do not instantiate.
|
||||
"""
|
||||
super(Kern, self).__init__(name=name, *a, **kw)
|
||||
self.input_dim = int(input_dim)
|
||||
|
||||
if active_dims is None:
|
||||
active_dims = np.arange(input_dim)
|
||||
|
||||
self.active_dims = np.atleast_1d(active_dims).astype(int)
|
||||
|
||||
assert self.active_dims.size == self.input_dim, "input_dim={} does not match len(active_dim)={}, active_dims={}".format(self.input_dim, self.active_dims.size, self.active_dims)
|
||||
|
||||
self._sliced_X = 0
|
||||
self.useGPU = self._support_GPU and useGPU
|
||||
self._return_psi2_n_flag = ObsAr(np.zeros(1)).astype(bool)
|
||||
|
||||
@property
|
||||
def return_psi2_n(self):
|
||||
"""
|
||||
Flag whether to pass back psi2 as NxMxM or MxM, by summing out N.
|
||||
"""
|
||||
return self._return_psi2_n_flag[0]
|
||||
@return_psi2_n.setter
|
||||
def return_psi2_n(self, val):
|
||||
def visit(self):
|
||||
if isinstance(self, Kern):
|
||||
self._return_psi2_n_flag[0]=val
|
||||
self.traverse(visit)
|
||||
|
||||
@Cache_this(limit=20)
|
||||
def _slice_X(self, X):
|
||||
return X[:, self.active_dims]
|
||||
|
||||
def K(self, X, X2):
|
||||
"""
|
||||
Compute the kernel function.
|
||||
|
||||
:param X: the first set of inputs to the kernel
|
||||
:param X2: (optional) the second set of arguments to the kernel. If X2
|
||||
is None, this is passed through to the 'part' object, which
|
||||
handles this as X2 == X.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
def Kdiag(self, X):
|
||||
raise NotImplementedError
|
||||
def psi0(self, Z, variational_posterior):
|
||||
raise NotImplementedError
|
||||
def psi1(self, Z, variational_posterior):
|
||||
raise NotImplementedError
|
||||
def psi2(self, Z, variational_posterior):
|
||||
raise NotImplementedError
|
||||
def gradients_X(self, dL_dK, X, X2):
|
||||
raise NotImplementedError
|
||||
def gradients_X_diag(self, dL_dKdiag, X):
|
||||
raise NotImplementedError
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
""" update the gradients of all parameters when using only the diagonal elements of the covariance matrix"""
|
||||
raise NotImplementedError
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2):
|
||||
"""Set the gradients of all parameters when doing full (N) inference."""
|
||||
raise NotImplementedError
|
||||
|
||||
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
"""
|
||||
Set the gradients of all parameters when doing inference with
|
||||
uncertain inputs, using expectations of the kernel.
|
||||
|
||||
The essential maths is
|
||||
|
||||
dL_d{theta_i} = dL_dpsi0 * dpsi0_d{theta_i} +
|
||||
dL_dpsi1 * dpsi1_d{theta_i} +
|
||||
dL_dpsi2 * dpsi2_d{theta_i}
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
"""
|
||||
Returns the derivative of the objective wrt Z, using the chain rule
|
||||
through the expectation variables.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
"""
|
||||
Compute the gradients wrt the parameters of the variational
|
||||
distribution q(X), chain-ruling via the expectations of the kernel
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def plot(self, x=None, fignum=None, ax=None, title=None, plot_limits=None, resolution=None, **mpl_kwargs):
|
||||
"""
|
||||
plot this kernel.
|
||||
:param x: the value to use for the other kernel argument (kernels are a function of two variables!)
|
||||
:param fignum: figure number of the plot
|
||||
:param ax: matplotlib axis to plot on
|
||||
:param title: the matplotlib title
|
||||
:param plot_limits: the range over which to plot the kernel
|
||||
:param resolution: the resolution of the lines used in plotting
|
||||
:param mpl_kwargs: valid keyword arguments to pass through to matplotlib (e.g. lw=7)
|
||||
"""
|
||||
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
|
||||
from ...plotting.matplot_dep import kernel_plots
|
||||
kernel_plots.plot(self, x, fignum, ax, title, plot_limits, resolution, **mpl_kwargs)
|
||||
|
||||
def plot_ARD(self, *args, **kw):
|
||||
"""
|
||||
See :class:`~GPy.plotting.matplot_dep.kernel_plots`
|
||||
"""
|
||||
import sys
|
||||
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
|
||||
from ...plotting.matplot_dep import kernel_plots
|
||||
return kernel_plots.plot_ARD(self,*args,**kw)
|
||||
|
||||
def input_sensitivity(self, summarize=True):
|
||||
"""
|
||||
Returns the sensitivity for each dimension of this kernel.
|
||||
"""
|
||||
return np.zeros(self.input_dim)
|
||||
|
||||
def __add__(self, other):
|
||||
""" Overloading of the '+' operator. for more control, see self.add """
|
||||
return self.add(other)
|
||||
|
||||
def __iadd__(self, other):
|
||||
return self.add(other)
|
||||
|
||||
def add(self, other, name='add'):
|
||||
"""
|
||||
Add another kernel to this one.
|
||||
|
||||
:param other: the other kernel to be added
|
||||
:type other: GPy.kern
|
||||
|
||||
"""
|
||||
assert isinstance(other, Kern), "only kernels can be added to kernels..."
|
||||
from add import Add
|
||||
return Add([self, other], name=name)
|
||||
|
||||
def __mul__(self, other):
|
||||
""" Here we overload the '*' operator. See self.prod for more information"""
|
||||
return self.prod(other)
|
||||
|
||||
def __imul__(self, other):
|
||||
""" Here we overload the '*' operator. See self.prod for more information"""
|
||||
return self.prod(other)
|
||||
|
||||
def __pow__(self, other):
|
||||
"""
|
||||
Shortcut for tensor `prod`.
|
||||
"""
|
||||
assert np.all(self.active_dims == range(self.input_dim)), "Can only use kernels, which have their input_dims defined from 0"
|
||||
assert np.all(other.active_dims == range(other.input_dim)), "Can only use kernels, which have their input_dims defined from 0"
|
||||
other.active_dims += self.input_dim
|
||||
return self.prod(other)
|
||||
|
||||
def prod(self, other, name='mul'):
|
||||
"""
|
||||
Multiply two kernels (either on the same space, or on the tensor
|
||||
product of the input space).
|
||||
|
||||
:param other: the other kernel to be multiplied
|
||||
:type other: GPy.kern
|
||||
:param tensor: whether or not to use the tensor space (default is false).
|
||||
:type tensor: bool
|
||||
|
||||
"""
|
||||
assert isinstance(other, Kern), "only kernels can be multiplied to kernels..."
|
||||
from prod import Prod
|
||||
#kernels = []
|
||||
#if isinstance(self, Prod): kernels.extend(self.parameters)
|
||||
#else: kernels.append(self)
|
||||
#if isinstance(other, Prod): kernels.extend(other.parameters)
|
||||
#else: kernels.append(other)
|
||||
return Prod([self, other], name)
|
||||
|
||||
def _check_input_dim(self, X):
|
||||
assert X.shape[1] == self.input_dim, "{} did not specify active_dims and X has wrong shape: X_dim={}, whereas input_dim={}".format(self.name, X.shape[1], self.input_dim)
|
||||
|
||||
def _check_active_dims(self, X):
|
||||
assert X.shape[1] >= len(self.active_dims), "At least {} dimensional X needed, X.shape={!s}".format(len(self.active_dims), X.shape)
|
||||
|
||||
|
||||
class CombinationKernel(Kern):
|
||||
"""
|
||||
Abstract super class for combination kernels.
|
||||
A combination kernel combines (a list of) kernels and works on those.
|
||||
Examples are the HierarchicalKernel or Add and Prod kernels.
|
||||
"""
|
||||
def __init__(self, kernels, name, extra_dims=[]):
|
||||
"""
|
||||
Abstract super class for combination kernels.
|
||||
A combination kernel combines (a list of) kernels and works on those.
|
||||
Examples are the HierarchicalKernel or Add and Prod kernels.
|
||||
|
||||
:param list kernels: List of kernels to combine (can be only one element)
|
||||
:param str name: name of the combination kernel
|
||||
:param array-like extra_dims: if needed extra dimensions for the combination kernel to work on
|
||||
"""
|
||||
assert all([isinstance(k, Kern) for k in kernels])
|
||||
extra_dims = np.array(extra_dims, dtype=int)
|
||||
input_dim, active_dims = self.get_input_dim_active_dims(kernels, extra_dims)
|
||||
# initialize the kernel with the full input_dim
|
||||
super(CombinationKernel, self).__init__(input_dim, active_dims, name)
|
||||
self.extra_dims = extra_dims
|
||||
self.link_parameters(*kernels)
|
||||
|
||||
@property
|
||||
def parts(self):
|
||||
return self.parameters
|
||||
|
||||
def get_input_dim_active_dims(self, kernels, extra_dims = None):
|
||||
#active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int))
|
||||
#active_dims = np.array(np.concatenate((active_dims, extra_dims if extra_dims is not None else [])), dtype=int)
|
||||
input_dim = reduce(max, (k.active_dims.max() for k in kernels)) + 1
|
||||
|
||||
if extra_dims is not None:
|
||||
input_dim += extra_dims.size
|
||||
|
||||
active_dims = np.arange(input_dim)
|
||||
return input_dim, active_dims
|
||||
|
||||
def input_sensitivity(self, summarize=True):
|
||||
"""
|
||||
If summarize is true, we want to get the summarized view of the sensitivities,
|
||||
otherwise put everything into an array with shape (#kernels, input_dim)
|
||||
in the order of appearance of the kernels in the parameterized object.
|
||||
"""
|
||||
raise NotImplementedError("Choose the kernel you want to get the sensitivity for. You need to override the default behaviour for getting the input sensitivity to be able to get the input sensitivity. For sum kernel it is the sum of all sensitivities, TODO: product kernel? Other kernels?, also TODO: shall we return all the sensitivities here in the combination kernel? So we can combine them however we want? This could lead to just plot all the sensitivities here...")
|
||||
|
||||
def _check_active_dims(self, X):
|
||||
return
|
||||
|
||||
def _check_input_dim(self, X):
|
||||
# As combination kernels cannot always know, what their inner kernels have as input dims, the check will be done inside them, respectively
|
||||
return
|
||||
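The operator overloads above can be exercised as follows; a sketch assuming RBF, Bias and Linear are available under GPy.kern (`**` shifts the second kernel's active dimensions, so the result lives on the tensor-product input space):

import GPy

k_add  = GPy.kern.RBF(1) + GPy.kern.Bias(1)      # Add kernel on the same 1-d input
k_prod = GPy.kern.RBF(1) * GPy.kern.Linear(1)    # Prod kernel on the same 1-d input
k_tens = GPy.kern.RBF(1) ** GPy.kern.Linear(1)   # Prod kernel over a 2-d input space

print(k_tens.parts[1].active_dims)               # the second factor now acts on column 1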
143
GPy/kern/_src/kernel_slice_operations.py
Normal file
|
|
@ -0,0 +1,143 @@
|
|||
'''
|
||||
Created on 11 Mar 2014
|
||||
|
||||
@author: maxz
|
||||
'''
|
||||
from ...core.parameterization.parameterized import ParametersChangedMeta
|
||||
import numpy as np
|
||||
from functools import wraps
|
||||
|
||||
def put_clean(dct, name, func):
|
||||
if name in dct:
|
||||
dct['_clean_{}'.format(name)] = dct[name]
|
||||
dct[name] = func(dct[name])
|
||||
|
||||
class KernCallsViaSlicerMeta(ParametersChangedMeta):
|
||||
def __new__(cls, name, bases, dct):
|
||||
put_clean(dct, 'K', _slice_K)
|
||||
put_clean(dct, 'Kdiag', _slice_Kdiag)
|
||||
put_clean(dct, 'update_gradients_full', _slice_update_gradients_full)
|
||||
put_clean(dct, 'update_gradients_diag', _slice_update_gradients_diag)
|
||||
put_clean(dct, 'gradients_X', _slice_gradients_X)
|
||||
put_clean(dct, 'gradients_X_diag', _slice_gradients_X_diag)
|
||||
|
||||
put_clean(dct, 'psi0', _slice_psi)
|
||||
put_clean(dct, 'psi1', _slice_psi)
|
||||
put_clean(dct, 'psi2', _slice_psi)
|
||||
put_clean(dct, 'update_gradients_expectations', _slice_update_gradients_expectations)
|
||||
put_clean(dct, 'gradients_Z_expectations', _slice_gradients_Z_expectations)
|
||||
put_clean(dct, 'gradients_qX_expectations', _slice_gradients_qX_expectations)
|
||||
return super(KernCallsViaSlicerMeta, cls).__new__(cls, name, bases, dct)
|
||||
|
||||
class _Slice_wrap(object):
|
||||
def __init__(self, k, X, X2=None):
|
||||
self.k = k
|
||||
self.shape = X.shape
|
||||
assert X.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X.shape={!s}".format(X.shape)
|
||||
if X2 is not None:
|
||||
assert X2.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X2.shape={!s}".format(X2.shape)
|
||||
if (self.k.active_dims is not None) and (self.k._sliced_X == 0):
|
||||
self.k._check_active_dims(X)
|
||||
self.X = self.k._slice_X(X)
|
||||
self.X2 = self.k._slice_X(X2) if X2 is not None else X2
|
||||
self.ret = True
|
||||
else:
|
||||
self.k._check_input_dim(X)
|
||||
self.X = X
|
||||
self.X2 = X2
|
||||
self.ret = False
|
||||
def __enter__(self):
|
||||
self.k._sliced_X += 1
|
||||
return self
|
||||
def __exit__(self, *a):
|
||||
self.k._sliced_X -= 1
|
||||
def handle_return_array(self, return_val):
|
||||
if self.ret:
|
||||
ret = np.zeros(self.shape)
|
||||
ret[:, self.k.active_dims] = return_val
|
||||
return ret
|
||||
return return_val
|
||||
|
||||
def _slice_K(f):
|
||||
@wraps(f)
|
||||
def wrap(self, X, X2 = None, *a, **kw):
|
||||
with _Slice_wrap(self, X, X2) as s:
|
||||
ret = f(self, s.X, s.X2, *a, **kw)
|
||||
return ret
|
||||
return wrap
|
||||
|
||||
def _slice_Kdiag(f):
|
||||
@wraps(f)
|
||||
def wrap(self, X, *a, **kw):
|
||||
with _Slice_wrap(self, X, None) as s:
|
||||
ret = f(self, s.X, *a, **kw)
|
||||
return ret
|
||||
return wrap
|
||||
|
||||
def _slice_update_gradients_full(f):
|
||||
@wraps(f)
|
||||
def wrap(self, dL_dK, X, X2=None):
|
||||
with _Slice_wrap(self, X, X2) as s:
|
||||
ret = f(self, dL_dK, s.X, s.X2)
|
||||
return ret
|
||||
return wrap
|
||||
|
||||
def _slice_update_gradients_diag(f):
|
||||
@wraps(f)
|
||||
def wrap(self, dL_dKdiag, X):
|
||||
with _Slice_wrap(self, X, None) as s:
|
||||
ret = f(self, dL_dKdiag, s.X)
|
||||
return ret
|
||||
return wrap
|
||||
|
||||
def _slice_gradients_X(f):
|
||||
@wraps(f)
|
||||
def wrap(self, dL_dK, X, X2=None):
|
||||
with _Slice_wrap(self, X, X2) as s:
|
||||
ret = s.handle_return_array(f(self, dL_dK, s.X, s.X2))
|
||||
return ret
|
||||
return wrap
|
||||
|
||||
def _slice_gradients_X_diag(f):
|
||||
@wraps(f)
|
||||
def wrap(self, dL_dKdiag, X):
|
||||
with _Slice_wrap(self, X, None) as s:
|
||||
ret = s.handle_return_array(f(self, dL_dKdiag, s.X))
|
||||
return ret
|
||||
return wrap
|
||||
|
||||
def _slice_psi(f):
|
||||
@wraps(f)
|
||||
def wrap(self, Z, variational_posterior):
|
||||
with _Slice_wrap(self, Z, variational_posterior) as s:
|
||||
ret = f(self, s.X, s.X2)
|
||||
return ret
|
||||
return wrap
|
||||
|
||||
def _slice_update_gradients_expectations(f):
|
||||
@wraps(f)
|
||||
def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
with _Slice_wrap(self, Z, variational_posterior) as s:
|
||||
ret = f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2)
|
||||
return ret
|
||||
return wrap
|
||||
|
||||
def _slice_gradients_Z_expectations(f):
|
||||
@wraps(f)
|
||||
def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
with _Slice_wrap(self, Z, variational_posterior) as s:
|
||||
ret = s.handle_return_array(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2))
|
||||
return ret
|
||||
return wrap
|
||||
|
||||
def _slice_gradients_qX_expectations(f):
|
||||
@wraps(f)
|
||||
def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
with _Slice_wrap(self, variational_posterior, Z) as s:
|
||||
ret = list(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X2, s.X))
|
||||
r2 = ret[:2]
|
||||
ret[0] = s.handle_return_array(r2[0])
|
||||
ret[1] = s.handle_return_array(r2[1])
|
||||
del r2
|
||||
return ret
|
||||
return wrap
|
||||
177
GPy/kern/_src/linear.py
Normal file
|
|
@ -0,0 +1,177 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
from kern import Kern
|
||||
from ...util.linalg import tdot
|
||||
from ...core.parameterization import Param
|
||||
from ...core.parameterization.transformations import Logexp
|
||||
from ...util.caching import Cache_this
|
||||
from ...util.config import *
|
||||
from .psi_comp import PSICOMP_Linear
|
||||
|
||||
class Linear(Kern):
|
||||
"""
|
||||
Linear kernel
|
||||
|
||||
.. math::
|
||||
|
||||
k(x,y) = \sum_{i=1}^{\mathrm{input\_dim}} \sigma^2_i x_i y_i
|
||||
|
||||
:param input_dim: the number of input dimensions
|
||||
:type input_dim: int
|
||||
:param variances: the vector of variances :math:`\sigma^2_i`
|
||||
:type variances: array or list of the appropriate size (or float if there
|
||||
is only one variance parameter)
|
||||
:param ARD: Auto Relevance Determination. If False, the kernel has only one
|
||||
variance parameter \sigma^2, otherwise there is one variance
|
||||
parameter per dimension.
|
||||
:type ARD: Boolean
|
||||
:rtype: kernel object
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, input_dim, variances=None, ARD=False, active_dims=None, name='linear'):
|
||||
super(Linear, self).__init__(input_dim, active_dims, name)
|
||||
self.ARD = ARD
|
||||
if not ARD:
|
||||
if variances is not None:
|
||||
variances = np.asarray(variances)
|
||||
assert variances.size == 1, "Only one variance needed for non-ARD kernel"
|
||||
else:
|
||||
variances = np.ones(1)
|
||||
else:
|
||||
if variances is not None:
|
||||
variances = np.asarray(variances)
|
||||
assert variances.size == self.input_dim, "bad number of variances, need one ARD variance per input_dim"
|
||||
else:
|
||||
variances = np.ones(self.input_dim)
|
||||
|
||||
self.variances = Param('variances', variances, Logexp())
|
||||
self.link_parameter(self.variances)
|
||||
self.psicomp = PSICOMP_Linear()
|
||||
|
||||
@Cache_this(limit=2)
|
||||
def K(self, X, X2=None):
|
||||
if self.ARD:
|
||||
if X2 is None:
|
||||
return tdot(X*np.sqrt(self.variances))
|
||||
else:
|
||||
rv = np.sqrt(self.variances)
|
||||
return np.dot(X*rv, (X2*rv).T)
|
||||
else:
|
||||
return self._dot_product(X, X2) * self.variances
|
||||
|
||||
@Cache_this(limit=1, ignore_args=(0,))
|
||||
def _dot_product(self, X, X2=None):
|
||||
if X2 is None:
|
||||
return tdot(X)
|
||||
else:
|
||||
return np.dot(X, X2.T)
|
||||
|
||||
def Kdiag(self, X):
|
||||
return np.sum(self.variances * np.square(X), -1)
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
if self.ARD:
|
||||
if X2 is None:
|
||||
#self.variances.gradient = np.array([np.sum(dL_dK * tdot(X[:, i:i + 1])) for i in range(self.input_dim)])
|
||||
self.variances.gradient = np.einsum('ij,iq,jq->q', dL_dK, X, X)
|
||||
else:
|
||||
#product = X[:, None, :] * X2[None, :, :]
|
||||
#self.variances.gradient = (dL_dK[:, :, None] * product).sum(0).sum(0)
|
||||
self.variances.gradient = np.einsum('ij,iq,jq->q', dL_dK, X, X2)
|
||||
else:
|
||||
self.variances.gradient = np.sum(self._dot_product(X, X2) * dL_dK)
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
tmp = dL_dKdiag[:, None] * X ** 2
|
||||
if self.ARD:
|
||||
self.variances.gradient = tmp.sum(0)
|
||||
else:
|
||||
self.variances.gradient = np.atleast_1d(tmp.sum())
|
||||
|
||||
|
||||
def gradients_X(self, dL_dK, X, X2=None):
|
||||
if X2 is None:
|
||||
return np.einsum('jq,q,ij->iq', X, 2*self.variances, dL_dK)
|
||||
else:
|
||||
#return (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
|
||||
return np.einsum('jq,q,ij->iq', X2, self.variances, dL_dK)
|
||||
|
||||
def gradients_X_diag(self, dL_dKdiag, X):
|
||||
return 2.*self.variances*dL_dKdiag[:,None]*X
|
||||
|
||||
def input_sensitivity(self, summarize=True):
|
||||
return np.ones(self.input_dim) * self.variances
|
||||
|
||||
#---------------------------------------#
|
||||
# PSI statistics #
|
||||
#---------------------------------------#
|
||||
|
||||
def psi0(self, Z, variational_posterior):
|
||||
return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[0]
|
||||
|
||||
def psi1(self, Z, variational_posterior):
|
||||
return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[1]
|
||||
|
||||
def psi2(self, Z, variational_posterior):
|
||||
return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[2]
|
||||
|
||||
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
dL_dvar = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[0]
|
||||
if self.ARD:
|
||||
self.variances.gradient = dL_dvar
|
||||
else:
|
||||
self.variances.gradient = dL_dvar.sum()
|
||||
|
||||
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[1]
|
||||
|
||||
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[2:]
|
||||
|
||||
class LinearFull(Kern):
|
||||
def __init__(self, input_dim, rank, W=None, kappa=None, active_dims=None, name='linear_full'):
|
||||
super(LinearFull, self).__init__(input_dim, active_dims, name)
|
||||
if W is None:
|
||||
W = np.ones((input_dim, rank))
|
||||
if kappa is None:
|
||||
kappa = np.ones(input_dim)
|
||||
assert W.shape == (input_dim, rank)
|
||||
assert kappa.shape == (input_dim,)
|
||||
|
||||
self.W = Param('W', W)
|
||||
self.kappa = Param('kappa', kappa, Logexp())
|
||||
self.link_parameters(self.W, self.kappa)
|
||||
|
||||
def K(self, X, X2=None):
|
||||
P = np.dot(self.W, self.W.T) + np.diag(self.kappa)
|
||||
return np.einsum('ij,jk,lk->il', X, P, X if X2 is None else X2)
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
self.kappa.gradient = np.einsum('ij,ik,kj->j', X, dL_dK, X if X2 is None else X2)
|
||||
self.W.gradient = np.einsum('ij,kl,ik,lm->jm', X, X if X2 is None else X2, dL_dK, self.W)
|
||||
self.W.gradient += np.einsum('ij,kl,ik,jm->lm', X, X if X2 is None else X2, dL_dK, self.W)
|
||||
|
||||
def Kdiag(self, X):
|
||||
P = np.dot(self.W, self.W.T) + np.diag(self.kappa)
|
||||
return np.einsum('ij,jk,ik->i', X, P, X)
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
self.kappa.gradient = np.einsum('ij,i->j', np.square(X), dL_dKdiag)
|
||||
self.W.gradient = 2.*np.einsum('ij,ik,jl,i->kl', X, X, self.W, dL_dKdiag)
|
||||
|
||||
def gradients_X(self, dL_dK, X, X2=None):
|
||||
P = np.dot(self.W, self.W.T) + np.diag(self.kappa)
|
||||
if X2 is None:
|
||||
return 2.*np.einsum('ij,jk,kl->il', dL_dK, X, P)
|
||||
else:
|
||||
return np.einsum('ij,jk,kl->il', dL_dK, X2, P)
|
||||
|
||||
def gradients_X_diag(self, dL_dKdiag, X):
|
||||
P = np.dot(self.W, self.W.T) + np.diag(self.kappa)
|
||||
return 2.*np.einsum('jk,i,ij->ik', P, dL_dKdiag, X)
|
||||
|
||||
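# Illustrative sketch (added note, not part of the committed file): LinearFull
# implements k(x, x') = x^T P x' with P = W W^T + diag(kappa), a low-rank plus
# diagonal parameterisation that keeps P positive semi-definite. A tiny check
# that the einsum used in K() agrees with plain matrix products (hypothetical
# helper, not GPy API):
import numpy as np
def _linearfull_K_sketch(X, W, kappa, X2=None):
    P = np.dot(W, W.T) + np.diag(kappa)
    X2 = X if X2 is None else X2
    K_einsum = np.einsum('ij,jk,lk->il', X, P, X2)   # as in LinearFull.K
    assert np.allclose(K_einsum, X.dot(P).dot(X2.T))
    return K_einsum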
|
||||
129
GPy/kern/_src/mlp.py
Normal file
@@ -0,0 +1,129 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from kern import Kern
|
||||
from ...core.parameterization import Param
|
||||
from ...core.parameterization.transformations import Logexp
|
||||
import numpy as np
|
||||
four_over_tau = 2./np.pi
|
||||
|
||||
class MLP(Kern):
|
||||
"""
|
||||
|
||||
Multi-layer perceptron kernel (also known as the arcsine or neural network kernel)
|
||||
|
||||
.. math::
|
||||
|
||||
k(x,y) = \\sigma^{2}\\frac{2}{\\pi}\\text{asin}\\left(\\frac{\\sigma_w^2 x^\\top y + \\sigma_b^2}{\\sqrt{\\sigma_w^2 x^\\top x + \\sigma_b^2 + 1}\\sqrt{\\sigma_w^2 y^\\top y + \\sigma_b^2 + 1}}\\right)
|
||||
|
||||
|
||||
:param input_dim: the number of input dimensions
|
||||
:type input_dim: int
|
||||
:param variance: the variance :math:`\sigma^2`
|
||||
:type variance: float
|
||||
:param weight_variance: the vector of the variances of the prior over input weights in the neural network :math:`\sigma^2_w`
|
||||
:type weight_variance: array or list of the appropriate size (or float if there is only one weight variance parameter)
|
||||
:param bias_variance: the variance of the prior over bias parameters :math:`\sigma^2_b`
|
||||
:param ARD: Automatic Relevance Determination. If set to False, the kernel is isotropic (i.e. a single weight variance parameter \sigma^2_w); otherwise there is one weight variance parameter per input dimension.
|
||||
:type ARD: Boolean
|
||||
:rtype: Kernpart object
|
||||
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, input_dim, variance=1., weight_variance=1., bias_variance=100., active_dims=None, name='mlp'):
|
||||
super(MLP, self).__init__(input_dim, active_dims, name)
|
||||
self.variance = Param('variance', variance, Logexp())
|
||||
self.weight_variance = Param('weight_variance', weight_variance, Logexp())
|
||||
self.bias_variance = Param('bias_variance', bias_variance, Logexp())
|
||||
self.link_parameters(self.variance, self.weight_variance, self.bias_variance)
|
||||
|
||||
|
||||
def K(self, X, X2=None):
|
||||
self._K_computations(X, X2)
|
||||
return self.variance*self._K_dvar
|
||||
|
||||
def Kdiag(self, X):
|
||||
"""Compute the diagonal of the covariance matrix for X."""
|
||||
self._K_diag_computations(X)
|
||||
return self.variance*self._K_diag_dvar
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
"""Derivative of the covariance with respect to the parameters."""
|
||||
self._K_computations(X, X2)
|
||||
self.variance.gradient = np.sum(self._K_dvar*dL_dK)
|
||||
|
||||
denom3 = self._K_denom**3
|
||||
base = four_over_tau*self.variance/np.sqrt(1-self._K_asin_arg*self._K_asin_arg)
|
||||
base_cov_grad = base*dL_dK
|
||||
|
||||
if X2 is None:
|
||||
vec = np.diag(self._K_inner_prod)
|
||||
self.weight_variance.gradient = ((self._K_inner_prod/self._K_denom
|
||||
-.5*self._K_numer/denom3
|
||||
*(np.outer((self.weight_variance*vec+self.bias_variance+1.), vec)
|
||||
+np.outer(vec,(self.weight_variance*vec+self.bias_variance+1.))))*base_cov_grad).sum()
|
||||
self.bias_variance.gradient = ((1./self._K_denom
|
||||
-.5*self._K_numer/denom3
|
||||
*((vec[None, :]+vec[:, None])*self.weight_variance
|
||||
+2.*self.bias_variance + 2.))*base_cov_grad).sum()
|
||||
else:
|
||||
vec1 = (X*X).sum(1)
|
||||
vec2 = (X2*X2).sum(1)
|
||||
self.weight_variance.gradient = ((self._K_inner_prod/self._K_denom
|
||||
-.5*self._K_numer/denom3
|
||||
*(np.outer((self.weight_variance*vec1+self.bias_variance+1.), vec2) + np.outer(vec1, self.weight_variance*vec2 + self.bias_variance+1.)))*base_cov_grad).sum()
|
||||
self.bias_variance.gradient = ((1./self._K_denom
|
||||
-.5*self._K_numer/denom3
|
||||
*((vec1[:, None]+vec2[None, :])*self.weight_variance
|
||||
+ 2*self.bias_variance + 2.))*base_cov_grad).sum()
|
||||
|
||||
def update_gradients_diag(self, X):
|
||||
raise NotImplementedError, "TODO"
|
||||
|
||||
|
||||
def gradients_X(self, dL_dK, X, X2):
|
||||
"""Derivative of the covariance matrix with respect to X"""
|
||||
self._K_computations(X, X2)
|
||||
arg = self._K_asin_arg
|
||||
numer = self._K_numer
|
||||
denom = self._K_denom
|
||||
denom3 = denom*denom*denom
|
||||
if X2 is not None:
|
||||
vec2 = (X2*X2).sum(1)*self.weight_variance+self.bias_variance + 1.
|
||||
return four_over_tau*self.weight_variance*self.variance*((X2[None, :, :]/denom[:, :, None] - vec2[None, :, None]*X[:, None, :]*(numer/denom3)[:, :, None])*(dL_dK/np.sqrt(1-arg*arg))[:, :, None]).sum(1)
|
||||
else:
|
||||
vec = (X*X).sum(1)*self.weight_variance+self.bias_variance + 1.
|
||||
return 2*four_over_tau*self.weight_variance*self.variance*((X[None, :, :]/denom[:, :, None] - vec[None, :, None]*X[:, None, :]*(numer/denom3)[:, :, None])*(dL_dK/np.sqrt(1-arg*arg))[:, :, None]).sum(1)
|
||||
|
||||
def gradients_X_diag(self, dL_dKdiag, X):
|
||||
"""Gradient of diagonal of covariance with respect to X"""
|
||||
self._K_diag_computations(X)
|
||||
arg = self._K_diag_asin_arg
|
||||
denom = self._K_diag_denom
|
||||
#numer = self._K_diag_numer
|
||||
return four_over_tau*2.*self.weight_variance*self.variance*X*(1./denom*(1. - arg)*dL_dKdiag/(np.sqrt(1-arg*arg)))[:, None]
|
||||
|
||||
|
||||
def _K_computations(self, X, X2):
|
||||
"""Pre-computations for the covariance matrix (used for computing the covariance and its gradients."""
|
||||
if X2 is None:
|
||||
self._K_inner_prod = np.dot(X,X.T)
|
||||
self._K_numer = self._K_inner_prod*self.weight_variance + self.bias_variance
|
||||
vec = np.diag(self._K_numer) + 1.
|
||||
self._K_denom = np.sqrt(np.outer(vec,vec))
|
||||
else:
|
||||
self._K_inner_prod = np.dot(X,X2.T)
|
||||
self._K_numer = self._K_inner_prod*self.weight_variance + self.bias_variance
|
||||
vec1 = (X*X).sum(1)*self.weight_variance + self.bias_variance + 1.
|
||||
vec2 = (X2*X2).sum(1)*self.weight_variance + self.bias_variance + 1.
|
||||
self._K_denom = np.sqrt(np.outer(vec1,vec2))
|
||||
self._K_asin_arg = self._K_numer/self._K_denom
|
||||
self._K_dvar = four_over_tau*np.arcsin(self._K_asin_arg)
|
||||
|
||||
def _K_diag_computations(self, X):
|
||||
"""Pre-computations concerning the diagonal terms (used for computation of diagonal and its gradients)."""
|
||||
self._K_diag_numer = (X*X).sum(1)*self.weight_variance + self.bias_variance
|
||||
self._K_diag_denom = self._K_diag_numer+1.
|
||||
self._K_diag_asin_arg = self._K_diag_numer/self._K_diag_denom
|
||||
self._K_diag_dvar = four_over_tau*np.arcsin(self._K_diag_asin_arg)
|
||||
|
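# Illustrative sketch (added note, not part of the committed file): a slow,
# single-pair evaluation of the MLP/arcsine covariance defined above, useful as
# a sanity check against MLP.K(); sigma2, sw2 and sb2 stand for variance,
# weight_variance and bias_variance (hypothetical helper, not GPy API).
import numpy as np
def _mlp_cov_pair_sketch(x, y, sigma2=1., sw2=1., sb2=100.):
    num = sw2 * np.dot(x, y) + sb2
    den = np.sqrt((sw2 * np.dot(x, x) + sb2 + 1.) * (sw2 * np.dot(y, y) + sb2 + 1.))
    return sigma2 * (2. / np.pi) * np.arcsin(num / den)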
|
@@ -2,12 +2,288 @@
|
|||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
from kernpart import Kernpart
|
||||
import numpy as np
|
||||
from GPy.util.linalg import mdot
|
||||
from GPy.util.decorators import silence_errors
|
||||
from kern import Kern
|
||||
from ...util.linalg import mdot
|
||||
from ...util.decorators import silence_errors
|
||||
from ...core.parameterization.param import Param
|
||||
from ...core.parameterization.transformations import Logexp
|
||||
|
||||
class PeriodicMatern52(Kernpart):
|
||||
class Periodic(Kern):
|
||||
def __init__(self, input_dim, variance, lengthscale, period, n_freq, lower, upper, active_dims, name):
|
||||
"""
|
||||
:param input_dim: the number of input dimensions
:type input_dim: int
|
||||
:param variance: the variance of the Matern kernel
|
||||
:type variance: float
|
||||
:param lengthscale: the lengthscale of the Matern kernel
|
||||
:type lengthscale: np.ndarray of size (input_dim,)
|
||||
:param period: the period
|
||||
:type period: float
|
||||
:param n_freq: the number of frequencies considered for the periodic subspace
|
||||
:type n_freq: int
|
||||
:rtype: kernel object
|
||||
"""
|
||||
|
||||
assert input_dim==1, "Periodic kernels are only defined for input_dim=1"
|
||||
super(Periodic, self).__init__(input_dim, active_dims, name)
|
||||
self.input_dim = input_dim
|
||||
self.lower,self.upper = lower, upper
|
||||
self.n_freq = n_freq
|
||||
self.n_basis = 2*n_freq
|
||||
self.variance = Param('variance', np.float64(variance), Logexp())
|
||||
self.lengthscale = Param('lengthscale', np.float64(lengthscale), Logexp())
|
||||
self.period = Param('period', np.float64(period), Logexp())
|
||||
self.link_parameters(self.variance, self.lengthscale, self.period)
|
||||
|
||||
def _cos(self, alpha, omega, phase):
|
||||
def f(x):
|
||||
return alpha*np.cos(omega*x + phase)
|
||||
return f
|
||||
|
||||
@silence_errors
|
||||
def _cos_factorization(self, alpha, omega, phase):
|
||||
r1 = np.sum(alpha*np.cos(phase),axis=1)[:,None]
|
||||
r2 = np.sum(alpha*np.sin(phase),axis=1)[:,None]
|
||||
r = np.sqrt(r1**2 + r2**2)
|
||||
psi = np.where(r1 != 0, (np.arctan(r2/r1) + (r1<0.)*np.pi),np.arcsin(r2))
|
||||
return r,omega[:,0:1], psi
|
||||
|
||||
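# Illustrative sketch (added note, not part of the committed file):
# _cos_factorization uses the identity
#     sum_j alpha_j cos(omega*x + phi_j) = r cos(omega*x + psi),
# where r*exp(1j*psi) = sum_j alpha_j exp(1j*phi_j); this is valid because every
# column of a row shares the same frequency omega. A minimal numerical check:
def _cos_factorization_check():
    import numpy as np
    alpha, phi, omega = np.array([1., .5, 2.]), np.array([.3, -1.2, 2.5]), 3.
    z = np.sum(alpha * np.exp(1j * phi))
    r, psi = np.abs(z), np.angle(z)
    x = np.linspace(0., 1., 7)
    lhs = sum(a * np.cos(omega * x + p) for a, p in zip(alpha, phi))
    assert np.allclose(lhs, r * np.cos(omega * x + psi))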
@silence_errors
|
||||
def _int_computation(self,r1,omega1,phi1,r2,omega2,phi2):
|
||||
Gint1 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + 1./(omega1-omega2.T)*( np.sin((omega1-omega2.T)*self.upper+phi1-phi2.T) - np.sin((omega1-omega2.T)*self.lower+phi1-phi2.T) )
|
||||
Gint2 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + np.cos(phi1-phi2.T)*(self.upper-self.lower)
|
||||
Gint = np.dot(r1,r2.T)/2 * np.where(np.isnan(Gint1),Gint2,Gint1)
|
||||
return Gint
|
||||
|
||||
def K(self, X, X2=None):
|
||||
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
|
||||
if X2 is None:
|
||||
FX2 = FX
|
||||
else:
|
||||
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
|
||||
return mdot(FX,self.Gi,FX2.T)
|
||||
|
||||
def Kdiag(self,X):
|
||||
return np.diag(self.K(X))
|
||||
|
||||
|
||||
|
||||
|
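# Illustrative sketch (added note, not part of the committed file): K() above is
# a finite-rank (degenerate) covariance, k(x, x') = phi(x)^T G^{-1} phi(x'),
# where phi stacks the 2*n_freq cosine basis functions and G is the Gram matrix
# of the RKHS inner product built by each subclass in parameters_changed().
# A generic sketch of that construction (hypothetical helper, not GPy API):
def _degenerate_K_sketch(phi, Gi, X, X2=None):
    FX = phi(X)                          # (n, n_basis) feature matrix
    FX2 = FX if X2 is None else phi(X2)
    return FX.dot(Gi).dot(FX2.T)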
||||
class PeriodicExponential(Periodic):
|
||||
"""
|
||||
Kernel of the periodic subspace (up to a given frequency) of an exponential
|
||||
(Matern 1/2) RKHS.
|
||||
|
||||
Only defined for input_dim=1.
|
||||
"""
|
||||
|
||||
def __init__(self, input_dim=1, variance=1., lengthscale=1., period=2.*np.pi, n_freq=10, lower=0., upper=4*np.pi, active_dims=None, name='periodic_exponential'):
|
||||
super(PeriodicExponential, self).__init__(input_dim, variance, lengthscale, period, n_freq, lower, upper, active_dims, name)
|
||||
|
||||
def parameters_changed(self):
|
||||
self.a = [1./self.lengthscale, 1.]
|
||||
self.b = [1]
|
||||
|
||||
self.basis_alpha = np.ones((self.n_basis,))
|
||||
self.basis_omega = (2*np.pi*np.arange(1,self.n_freq+1)/self.period).repeat(2)
|
||||
self.basis_phi = np.zeros(self.n_freq * 2)
|
||||
self.basis_phi[::2] = -np.pi/2
|
||||
|
||||
self.G = self.Gram_matrix()
|
||||
self.Gi = np.linalg.inv(self.G)
|
||||
|
||||
def Gram_matrix(self):
|
||||
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega))
|
||||
Lo = np.column_stack((self.basis_omega,self.basis_omega))
|
||||
Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2))
|
||||
r,omega,phi = self._cos_factorization(La,Lo,Lp)
|
||||
Gint = self._int_computation( r,omega,phi, r,omega,phi)
|
||||
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
|
||||
return(self.lengthscale/(2*self.variance) * Gint + 1./self.variance*np.dot(Flower,Flower.T))
|
||||
|
||||
@silence_errors
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
"""derivative of the covariance matrix with respect to the parameters (shape is N x num_inducing x num_params)"""
|
||||
if X2 is None: X2 = X
|
||||
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
|
||||
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
|
||||
|
||||
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega))
|
||||
Lo = np.column_stack((self.basis_omega,self.basis_omega))
|
||||
Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2))
|
||||
r,omega,phi = self._cos_factorization(La,Lo,Lp)
|
||||
Gint = self._int_computation( r,omega,phi, r,omega,phi)
|
||||
|
||||
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
|
||||
|
||||
#dK_dvar
|
||||
dK_dvar = 1./self.variance*mdot(FX,self.Gi,FX2.T)
|
||||
|
||||
#dK_dlen
|
||||
da_dlen = [-1./self.lengthscale**2,0.]
|
||||
dLa_dlen = np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)),da_dlen[1]*self.basis_omega))
|
||||
r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
|
||||
dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
|
||||
dGint_dlen = dGint_dlen + dGint_dlen.T
|
||||
dG_dlen = 1./2*Gint + self.lengthscale/2*dGint_dlen
|
||||
dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX2.T)
|
||||
|
||||
#dK_dper
|
||||
dFX_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
|
||||
dFX2_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X2,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X2)
|
||||
|
||||
dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period))
|
||||
dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi))
|
||||
r1,omega1,phi1 = self._cos_factorization(dLa_dper,Lo,dLp_dper)
|
||||
|
||||
IPPprim1 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
|
||||
IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
|
||||
# SIMPLIFY!!! IPPprim1 = (self.upper - self.lower)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
|
||||
IPPprim2 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + self.upper*np.cos(phi-phi1.T))
|
||||
IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + self.lower*np.cos(phi-phi1.T))
|
||||
IPPprim = np.where(np.logical_or(np.isnan(IPPprim1), np.isinf(IPPprim1)), IPPprim2, IPPprim1)
|
||||
|
||||
|
||||
IPPint1 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
|
||||
IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
|
||||
IPPint2 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./2*self.upper**2*np.cos(phi-phi1.T)
|
||||
IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./2*self.lower**2*np.cos(phi-phi1.T)
|
||||
#IPPint2[0,0] = (self.upper**2 - self.lower**2)*np.cos(phi[0,0])*np.cos(phi1[0,0])
|
||||
IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
|
||||
|
||||
dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period))
|
||||
dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2))
|
||||
r2,omega2,phi2 = dLa_dper2.T,Lo[:,0:1],dLp_dper2.T
|
||||
|
||||
dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) + self._int_computation(r2,omega2,phi2, r,omega,phi)
|
||||
dGint_dper = dGint_dper + dGint_dper.T
|
||||
|
||||
dFlower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
|
||||
|
||||
dG_dper = 1./self.variance*(self.lengthscale/2*dGint_dper + self.b[0]*(np.dot(dFlower_dper,Flower.T)+np.dot(Flower,dFlower_dper.T)))
|
||||
|
||||
dK_dper = mdot(dFX_dper,self.Gi,FX2.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX2.T) + mdot(FX,self.Gi,dFX2_dper.T)
|
||||
|
||||
self.variance.gradient = np.sum(dK_dvar*dL_dK)
|
||||
self.lengthscale.gradient = np.sum(dK_dlen*dL_dK)
|
||||
self.period.gradient = np.sum(dK_dper*dL_dK)
|
||||
|
||||
|
||||
|
||||
class PeriodicMatern32(Periodic):
|
||||
"""
|
||||
Kernel of the periodic subspace (up to a given frequency) of a Matern 3/2 RKHS. Only defined for input_dim=1.
|
||||
|
||||
:param input_dim: the number of input dimensions
|
||||
:type input_dim: int
|
||||
:param variance: the variance of the Matern kernel
|
||||
:type variance: float
|
||||
:param lengthscale: the lengthscale of the Matern kernel
|
||||
:type lengthscale: np.ndarray of size (input_dim,)
|
||||
:param period: the period
|
||||
:type period: float
|
||||
:param n_freq: the number of frequencies considered for the periodic subspace
|
||||
:type n_freq: int
|
||||
:rtype: kernel object
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, input_dim=1, variance=1., lengthscale=1., period=2.*np.pi, n_freq=10, lower=0., upper=4*np.pi, active_dims=None, name='periodic_Matern32'):
|
||||
super(PeriodicMatern32, self).__init__(input_dim, variance, lengthscale, period, n_freq, lower, upper, active_dims, name)
|
||||
def parameters_changed(self):
|
||||
self.a = [3./self.lengthscale**2, 2*np.sqrt(3)/self.lengthscale, 1.]
|
||||
self.b = [1,self.lengthscale**2/3]
|
||||
|
||||
self.basis_alpha = np.ones((self.n_basis,))
|
||||
self.basis_omega = (2*np.pi*np.arange(1,self.n_freq+1)/self.period).repeat(2)
|
||||
self.basis_phi = np.zeros(self.n_freq * 2)
|
||||
self.basis_phi[::2] = -np.pi/2
|
||||
|
||||
self.G = self.Gram_matrix()
|
||||
self.Gi = np.linalg.inv(self.G)
|
||||
|
||||
def Gram_matrix(self):
|
||||
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega,self.a[2]*self.basis_omega**2))
|
||||
Lo = np.column_stack((self.basis_omega,self.basis_omega,self.basis_omega))
|
||||
Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2,self.basis_phi+np.pi))
|
||||
r,omega,phi = self._cos_factorization(La,Lo,Lp)
|
||||
Gint = self._int_computation( r,omega,phi, r,omega,phi)
|
||||
|
||||
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
|
||||
F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
|
||||
return(self.lengthscale**3/(12*np.sqrt(3)*self.variance) * Gint + 1./self.variance*np.dot(Flower,Flower.T) + self.lengthscale**2/(3.*self.variance)*np.dot(F1lower,F1lower.T))
|
||||
|
||||
|
||||
@silence_errors
|
||||
def update_gradients_full(self,dL_dK,X,X2):
|
||||
"""derivative of the covariance matrix with respect to the parameters (shape is num_data x num_inducing x num_params)"""
|
||||
if X2 is None: X2 = X
|
||||
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
|
||||
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
|
||||
|
||||
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega,self.a[2]*self.basis_omega**2))
|
||||
Lo = np.column_stack((self.basis_omega,self.basis_omega,self.basis_omega))
|
||||
Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2,self.basis_phi+np.pi))
|
||||
r,omega,phi = self._cos_factorization(La,Lo,Lp)
|
||||
Gint = self._int_computation( r,omega,phi, r,omega,phi)
|
||||
|
||||
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
|
||||
F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
|
||||
|
||||
#dK_dvar
|
||||
dK_dvar = 1./self.variance*mdot(FX,self.Gi,FX2.T)
|
||||
|
||||
#dK_dlen
|
||||
da_dlen = [-6/self.lengthscale**3,-2*np.sqrt(3)/self.lengthscale**2,0.]
|
||||
db_dlen = [0.,2*self.lengthscale/3.]
|
||||
dLa_dlen = np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)),da_dlen[1]*self.basis_omega,da_dlen[2]*self.basis_omega**2))
|
||||
r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
|
||||
dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
|
||||
dGint_dlen = dGint_dlen + dGint_dlen.T
|
||||
dG_dlen = self.lengthscale**2/(4*np.sqrt(3))*Gint + self.lengthscale**3/(12*np.sqrt(3))*dGint_dlen + db_dlen[0]*np.dot(Flower,Flower.T) + db_dlen[1]*np.dot(F1lower,F1lower.T)
|
||||
dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX2.T)
|
||||
|
||||
#dK_dper
|
||||
dFX_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
|
||||
dFX2_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X2,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X2)
|
||||
|
||||
dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period, -self.a[2]*self.basis_omega**3/self.period))
|
||||
dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi,self.basis_phi+np.pi*3/2))
|
||||
r1,omega1,phi1 = self._cos_factorization(dLa_dper,Lo,dLp_dper)
|
||||
|
||||
IPPprim1 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
|
||||
IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
|
||||
IPPprim2 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + self.upper*np.cos(phi-phi1.T))
|
||||
IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + self.lower*np.cos(phi-phi1.T))
|
||||
IPPprim = np.where(np.isnan(IPPprim1),IPPprim2,IPPprim1)
|
||||
|
||||
IPPint1 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
|
||||
IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
|
||||
IPPint2 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./2*self.upper**2*np.cos(phi-phi1.T)
|
||||
IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./2*self.lower**2*np.cos(phi-phi1.T)
|
||||
IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
|
||||
|
||||
dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period, -2*self.a[2]*self.basis_omega**2/self.period))
|
||||
dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi))
|
||||
r2,omega2,phi2 = self._cos_factorization(dLa_dper2,Lo[:,0:2],dLp_dper2)
|
||||
|
||||
dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) + self._int_computation(r2,omega2,phi2, r,omega,phi)
|
||||
dGint_dper = dGint_dper + dGint_dper.T
|
||||
|
||||
dFlower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
|
||||
dF1lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower)+self._cos(-self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
|
||||
|
||||
dG_dper = 1./self.variance*(self.lengthscale**3/(12*np.sqrt(3))*dGint_dper + self.b[0]*(np.dot(dFlower_dper,Flower.T)+np.dot(Flower,dFlower_dper.T)) + self.b[1]*(np.dot(dF1lower_dper,F1lower.T)+np.dot(F1lower,dF1lower_dper.T)))
|
||||
|
||||
dK_dper = mdot(dFX_dper,self.Gi,FX2.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX2.T) + mdot(FX,self.Gi,dFX2_dper.T)
|
||||
|
||||
self.variance.gradient = np.sum(dK_dvar*dL_dK)
|
||||
self.lengthscale.gradient = np.sum(dK_dlen*dL_dK)
|
||||
self.period.gradient = np.sum(dK_dper*dL_dK)
|
||||
|
||||
|
||||
|
||||
class PeriodicMatern52(Periodic):
|
||||
"""
|
||||
Kernel of the periodic subspace (up to a given frequency) of a Matern 5/2 RKHS. Only defined for input_dim=1.
|
||||
|
||||
|
|
@@ -25,67 +301,21 @@ class PeriodicMatern52(Kernpart):
|
|||
|
||||
"""
|
||||
|
||||
def __init__(self,input_dim=1,variance=1.,lengthscale=None,period=2*np.pi,n_freq=10,lower=0.,upper=4*np.pi):
|
||||
assert input_dim==1, "Periodic kernels are only defined for input_dim=1"
|
||||
self.name = 'periodic_Mat52'
|
||||
self.input_dim = input_dim
|
||||
if lengthscale is not None:
|
||||
lengthscale = np.asarray(lengthscale)
|
||||
assert lengthscale.size == 1, "Wrong size: only one lengthscale needed"
|
||||
else:
|
||||
lengthscale = np.ones(1)
|
||||
self.lower,self.upper = lower, upper
|
||||
self.num_params = 3
|
||||
self.n_freq = n_freq
|
||||
self.n_basis = 2*n_freq
|
||||
self._set_params(np.hstack((variance,lengthscale,period)))
|
||||
|
||||
def _cos(self,alpha,omega,phase):
|
||||
def f(x):
|
||||
return alpha*np.cos(omega*x+phase)
|
||||
return f
|
||||
|
||||
@silence_errors
|
||||
def _cos_factorization(self,alpha,omega,phase):
|
||||
r1 = np.sum(alpha*np.cos(phase),axis=1)[:,None]
|
||||
r2 = np.sum(alpha*np.sin(phase),axis=1)[:,None]
|
||||
r = np.sqrt(r1**2 + r2**2)
|
||||
psi = np.where(r1 != 0, (np.arctan(r2/r1) + (r1<0.)*np.pi),np.arcsin(r2))
|
||||
return r,omega[:,0:1], psi
|
||||
|
||||
@silence_errors
|
||||
def _int_computation(self,r1,omega1,phi1,r2,omega2,phi2):
|
||||
Gint1 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + 1./(omega1-omega2.T)*( np.sin((omega1-omega2.T)*self.upper+phi1-phi2.T) - np.sin((omega1-omega2.T)*self.lower+phi1-phi2.T) )
|
||||
Gint2 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + np.cos(phi1-phi2.T)*(self.upper-self.lower)
|
||||
#Gint2[0,0] = 2.*(self.upper-self.lower)*np.cos(phi1[0,0])*np.cos(phi2[0,0])
|
||||
Gint = np.dot(r1,r2.T)/2 * np.where(np.isnan(Gint1),Gint2,Gint1)
|
||||
return Gint
|
||||
|
||||
def _get_params(self):
|
||||
"""return the value of the parameters."""
|
||||
return np.hstack((self.variance,self.lengthscale,self.period))
|
||||
|
||||
def _set_params(self,x):
|
||||
"""set the value of the parameters."""
|
||||
assert x.size==3
|
||||
self.variance = x[0]
|
||||
self.lengthscale = x[1]
|
||||
self.period = x[2]
|
||||
def __init__(self, input_dim=1, variance=1., lengthscale=1., period=2.*np.pi, n_freq=10, lower=0., upper=4*np.pi, active_dims=None, name='periodic_Matern52'):
|
||||
super(PeriodicMatern52, self).__init__(input_dim, variance, lengthscale, period, n_freq, lower, upper, active_dims, name)
|
||||
|
||||
def parameters_changed(self):
|
||||
self.a = [5*np.sqrt(5)/self.lengthscale**3, 15./self.lengthscale**2,3*np.sqrt(5)/self.lengthscale, 1.]
|
||||
self.b = [9./8, 9*self.lengthscale**4/200., 3*self.lengthscale**2/5., 3*self.lengthscale**2/(5*8.), 3*self.lengthscale**2/(5*8.)]
|
||||
|
||||
self.basis_alpha = np.ones((2*self.n_freq,))
|
||||
self.basis_omega = np.array(sum([[i*2*np.pi/self.period]*2 for i in range(1,self.n_freq+1)],[]))
|
||||
self.basis_phi = np.array(sum([[-np.pi/2, 0.] for i in range(1,self.n_freq+1)],[]))
|
||||
self.basis_omega = (2*np.pi*np.arange(1,self.n_freq+1)/self.period).repeat(2)
|
||||
self.basis_phi = np.zeros(self.n_freq * 2)
|
||||
self.basis_phi[::2] = -np.pi/2
|
||||
|
||||
self.G = self.Gram_matrix()
|
||||
self.Gi = np.linalg.inv(self.G)
|
||||
|
||||
def _get_param_names(self):
|
||||
"""return parameter names."""
|
||||
return ['variance','lengthscale','period']
|
||||
|
||||
def Gram_matrix(self):
|
||||
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)), self.a[1]*self.basis_omega, self.a[2]*self.basis_omega**2, self.a[3]*self.basis_omega**3))
|
||||
Lo = np.column_stack((self.basis_omega, self.basis_omega, self.basis_omega, self.basis_omega))
|
||||
|
|
@@ -99,23 +329,8 @@ class PeriodicMatern52(Kernpart):
|
|||
lower_terms = self.b[0]*np.dot(Flower,Flower.T) + self.b[1]*np.dot(F2lower,F2lower.T) + self.b[2]*np.dot(F1lower,F1lower.T) + self.b[3]*np.dot(F2lower,Flower.T) + self.b[4]*np.dot(Flower,F2lower.T)
|
||||
return(3*self.lengthscale**5/(400*np.sqrt(5)*self.variance) * Gint + 1./self.variance*lower_terms)
|
||||
|
||||
def K(self,X,X2,target):
|
||||
"""Compute the covariance matrix between X and X2."""
|
||||
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
|
||||
if X2 is None:
|
||||
FX2 = FX
|
||||
else:
|
||||
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
|
||||
np.add(mdot(FX,self.Gi,FX2.T), target,target)
|
||||
|
||||
def Kdiag(self,X,target):
|
||||
"""Compute the diagonal of the covariance matrix associated to X."""
|
||||
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
|
||||
np.add(target,np.diag(mdot(FX,self.Gi,FX.T)),target)
|
||||
|
||||
@silence_errors
|
||||
def dK_dtheta(self,dL_dK,X,X2,target):
|
||||
"""derivative of the covariance matrix with respect to the parameters (shape is num_data x num_inducing x num_params)"""
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
if X2 is None: X2 = X
|
||||
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
|
||||
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
|
||||
|
|
@@ -156,14 +371,12 @@ class PeriodicMatern52(Kernpart):
|
|||
IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
|
||||
IPPprim2 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + self.upper*np.cos(phi-phi1.T))
|
||||
IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + self.lower*np.cos(phi-phi1.T))
|
||||
#IPPprim2[0,0] = 2*(self.upper**2 - self.lower**2)*np.cos(phi[0,0])*np.cos(phi1[0,0])
|
||||
IPPprim = np.where(np.isnan(IPPprim1),IPPprim2,IPPprim1)
|
||||
|
||||
IPPint1 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
|
||||
IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
|
||||
IPPint2 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./2*self.upper**2*np.cos(phi-phi1.T)
|
||||
IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./2*self.lower**2*np.cos(phi-phi1.T)
|
||||
#IPPint2[0,0] = (self.upper**2 - self.lower**2)*np.cos(phi[0,0])*np.cos(phi1[0,0])
|
||||
IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
|
||||
|
||||
dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period, -2*self.a[2]*self.basis_omega**2/self.period, -3*self.a[3]*self.basis_omega**3/self.period))
|
||||
|
|
@@ -186,81 +399,7 @@ class PeriodicMatern52(Kernpart):
|
|||
dG_dper = 1./self.variance*(3*self.lengthscale**5/(400*np.sqrt(5))*dGint_dper + 0.5*dlower_terms_dper)
|
||||
dK_dper = mdot(dFX_dper,self.Gi,FX2.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX2.T) + mdot(FX,self.Gi,dFX2_dper.T)
|
||||
|
||||
# np.add(target[:,:,0],dK_dvar, target[:,:,0])
|
||||
target[0] += np.sum(dK_dvar*dL_dK)
|
||||
#np.add(target[:,:,1],dK_dlen, target[:,:,1])
|
||||
target[1] += np.sum(dK_dlen*dL_dK)
|
||||
#np.add(target[:,:,2],dK_dper, target[:,:,2])
|
||||
target[2] += np.sum(dK_dper*dL_dK)
|
||||
self.variance.gradient = np.sum(dK_dvar*dL_dK)
|
||||
self.lengthscale.gradient = np.sum(dK_dlen*dL_dK)
|
||||
self.period.gradient = np.sum(dK_dper*dL_dK)
|
||||
|
||||
@silence_errors
|
||||
def dKdiag_dtheta(self,dL_dKdiag,X,target):
|
||||
"""derivative of the diagonal of the covariance matrix with respect to the parameters"""
|
||||
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
|
||||
|
||||
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)), self.a[1]*self.basis_omega, self.a[2]*self.basis_omega**2, self.a[3]*self.basis_omega**3))
|
||||
Lo = np.column_stack((self.basis_omega, self.basis_omega, self.basis_omega, self.basis_omega))
|
||||
Lp = np.column_stack((self.basis_phi, self.basis_phi+np.pi/2, self.basis_phi+np.pi, self.basis_phi+np.pi*3/2))
|
||||
r,omega,phi = self._cos_factorization(La,Lo,Lp)
|
||||
Gint = self._int_computation( r,omega,phi, r,omega,phi)
|
||||
|
||||
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
|
||||
F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
|
||||
F2lower = np.array(self._cos(self.basis_alpha*self.basis_omega**2,self.basis_omega,self.basis_phi+np.pi)(self.lower))[:,None]
|
||||
|
||||
#dK_dvar
|
||||
dK_dvar = 1. / self.variance * mdot(FX, self.Gi, FX.T)
|
||||
|
||||
#dK_dlen
|
||||
da_dlen = [-3*self.a[0]/self.lengthscale, -2*self.a[1]/self.lengthscale, -self.a[2]/self.lengthscale, 0.]
|
||||
db_dlen = [0., 4*self.b[1]/self.lengthscale, 2*self.b[2]/self.lengthscale, 2*self.b[3]/self.lengthscale, 2*self.b[4]/self.lengthscale]
|
||||
dLa_dlen = np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)), da_dlen[1]*self.basis_omega, da_dlen[2]*self.basis_omega**2, da_dlen[3]*self.basis_omega**3))
|
||||
r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
|
||||
dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
|
||||
dGint_dlen = dGint_dlen + dGint_dlen.T
|
||||
dlower_terms_dlen = db_dlen[0]*np.dot(Flower,Flower.T) + db_dlen[1]*np.dot(F2lower,F2lower.T) + db_dlen[2]*np.dot(F1lower,F1lower.T) + db_dlen[3]*np.dot(F2lower,Flower.T) + db_dlen[4]*np.dot(Flower,F2lower.T)
|
||||
dG_dlen = 15*self.lengthscale**4/(400*np.sqrt(5))*Gint + 3*self.lengthscale**5/(400*np.sqrt(5))*dGint_dlen + dlower_terms_dlen
|
||||
dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX.T)
|
||||
|
||||
#dK_dper
|
||||
dFX_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
|
||||
|
||||
dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period, -self.a[2]*self.basis_omega**3/self.period, -self.a[3]*self.basis_omega**4/self.period))
|
||||
dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi,self.basis_phi+np.pi*3/2,self.basis_phi))
|
||||
r1,omega1,phi1 = self._cos_factorization(dLa_dper,Lo,dLp_dper)
|
||||
|
||||
IPPprim1 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
|
||||
IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
|
||||
IPPprim2 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + self.upper*np.cos(phi-phi1.T))
|
||||
IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + self.lower*np.cos(phi-phi1.T))
|
||||
IPPprim = np.where(np.isnan(IPPprim1),IPPprim2,IPPprim1)
|
||||
|
||||
IPPint1 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
|
||||
IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
|
||||
IPPint2 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + .5*self.upper**2*np.cos(phi-phi1.T)
|
||||
IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + .5*self.lower**2*np.cos(phi-phi1.T)
|
||||
IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
|
||||
|
||||
dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period, -2*self.a[2]*self.basis_omega**2/self.period, -3*self.a[3]*self.basis_omega**3/self.period))
|
||||
dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2, self.basis_phi+np.pi, self.basis_phi+np.pi*3/2))
|
||||
r2,omega2,phi2 = self._cos_factorization(dLa_dper2,Lo[:,0:2],dLp_dper2)
|
||||
|
||||
dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) + self._int_computation(r2,omega2,phi2, r,omega,phi)
|
||||
dGint_dper = dGint_dper + dGint_dper.T
|
||||
|
||||
dFlower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
|
||||
dF1lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower)+self._cos(-self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
|
||||
dF2lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**3/self.period,self.basis_omega,self.basis_phi+np.pi*3/2)(self.lower) + self._cos(-2*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower))[:,None]
|
||||
|
||||
dlower_terms_dper = self.b[0] * (np.dot(dFlower_dper,Flower.T) + np.dot(Flower.T,dFlower_dper))
|
||||
dlower_terms_dper += self.b[1] * (np.dot(dF2lower_dper,F2lower.T) + np.dot(F2lower,dF2lower_dper.T)) - 4*self.b[1]/self.period*np.dot(F2lower,F2lower.T)
|
||||
dlower_terms_dper += self.b[2] * (np.dot(dF1lower_dper,F1lower.T) + np.dot(F1lower,dF1lower_dper.T)) - 2*self.b[2]/self.period*np.dot(F1lower,F1lower.T)
|
||||
dlower_terms_dper += self.b[3] * (np.dot(dF2lower_dper,Flower.T) + np.dot(F2lower,dFlower_dper.T)) - 2*self.b[3]/self.period*np.dot(F2lower,Flower.T)
|
||||
dlower_terms_dper += self.b[4] * (np.dot(dFlower_dper,F2lower.T) + np.dot(Flower,dF2lower_dper.T)) - 2*self.b[4]/self.period*np.dot(Flower,F2lower.T)
|
||||
|
||||
dG_dper = 1./self.variance*(3*self.lengthscale**5/(400*np.sqrt(5))*dGint_dper + 0.5*dlower_terms_dper)
|
||||
dK_dper = 2*mdot(dFX_dper,self.Gi,FX.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX.T)
|
||||
|
||||
target[0] += np.sum(np.diag(dK_dvar)*dL_dKdiag)
|
||||
target[1] += np.sum(np.diag(dK_dlen)*dL_dKdiag)
|
||||
target[2] += np.sum(np.diag(dK_dper)*dL_dKdiag)
|
||||
41
GPy/kern/_src/poly.py
Normal file
@@ -0,0 +1,41 @@
# Copyright (c) 2014, James Hensman
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from kern import Kern
|
||||
from ...core.parameterization import Param
|
||||
from ...core.parameterization.transformations import Logexp
|
||||
class Poly(Kern):
|
||||
"""
|
||||
Polynomial kernel
|
||||
"""
|
||||
|
||||
def __init__(self, input_dim, variance=1., order=3., active_dims=None, name='poly'):
|
||||
super(Poly, self).__init__(input_dim, active_dims, name)
|
||||
self.variance = Param('variance', variance, Logexp())
|
||||
self.link_parameter(self.variance)
|
||||
self.order=order
|
||||
|
||||
def K(self, X, X2=None):
|
||||
return (self._dot_product(X, X2) + 1.)**self.order * self.variance
|
||||
|
||||
def _dot_product(self, X, X2=None):
|
||||
if X2 is None:
|
||||
return np.dot(X, X.T)
|
||||
else:
|
||||
return np.dot(X, X2.T)
|
||||
|
||||
def Kdiag(self, X):
|
||||
return self.variance*(np.square(X).sum(1) + 1.)**self.order
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
self.variance.gradient = np.sum(dL_dK * (self._dot_product(X, X2) + 1.)**self.order)
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
raise NotImplementedError
|
||||
|
||||
def gradients_X(self, dL_dK, X, X2=None):
|
||||
raise NotImplementedError
|
||||
|
||||
def gradients_X_diag(self, dL_dKdiag, X):
|
||||
raise NotImplementedError
|
||||
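# Illustrative sketch (added note, not part of the committed file): for the
# polynomial kernel above K = variance * (X X2^T + 1)^order, so
# dK/dvariance = (X X2^T + 1)^order, which is what update_gradients_full sums
# against dL_dK. A finite-difference check of that gradient (hypothetical helper):
def _poly_variance_grad_check(X, dL_dK, variance=2., order=3., eps=1e-6):
    import numpy as np
    K = lambda v: v * (np.dot(X, X.T) + 1.) ** order
    analytic = np.sum(dL_dK * (np.dot(X, X.T) + 1.) ** order)
    numeric = np.sum(dL_dK * (K(variance + eps) - K(variance - eps))) / (2. * eps)
    assert np.allclose(analytic, numeric)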
66
GPy/kern/_src/prod.py
Normal file
@@ -0,0 +1,66 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
import numpy as np
|
||||
from kern import CombinationKernel
|
||||
from ...util.caching import Cache_this
|
||||
import itertools
|
||||
|
||||
class Prod(CombinationKernel):
|
||||
"""
|
||||
Computes the product of two or more kernels
|
||||
|
||||
:param k1, k2: the kernels to multiply
|
||||
:type k1, k2: Kern
|
||||
:param tensor: the kernels are either multiplied as functions defined on the same input space (default) or on the product of the input spaces
|
||||
:type tensor: Boolean
|
||||
:rtype: kernel object
|
||||
|
||||
"""
|
||||
def __init__(self, kernels, name='mul'):
|
||||
for i, kern in enumerate(kernels[:]):
|
||||
if isinstance(kern, Prod):
|
||||
del kernels[i]
|
||||
for part in kern.parts[::-1]:
|
||||
kern.unlink_parameter(part)
|
||||
kernels.insert(i, part)
|
||||
super(Prod, self).__init__(kernels, name)
|
||||
|
||||
@Cache_this(limit=2, force_kwargs=['which_parts'])
|
||||
def K(self, X, X2=None, which_parts=None):
|
||||
if which_parts is None:
|
||||
which_parts = self.parts
|
||||
elif not isinstance(which_parts, (list, tuple)):
|
||||
# if only one part is given
|
||||
which_parts = [which_parts]
|
||||
return reduce(np.multiply, (p.K(X, X2) for p in which_parts))
|
||||
|
||||
@Cache_this(limit=2, force_kwargs=['which_parts'])
|
||||
def Kdiag(self, X, which_parts=None):
|
||||
if which_parts is None:
|
||||
which_parts = self.parts
|
||||
return reduce(np.multiply, (p.Kdiag(X) for p in which_parts))
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
k = self.K(X,X2)*dL_dK
|
||||
for p in self.parts:
|
||||
p.update_gradients_full(k/p.K(X,X2),X,X2)
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
k = self.Kdiag(X)*dL_dKdiag
|
||||
for p in self.parts:
|
||||
p.update_gradients_diag(k/p.Kdiag(X),X)
|
||||
|
||||
def gradients_X(self, dL_dK, X, X2=None):
|
||||
target = np.zeros(X.shape)
|
||||
k = self.K(X,X2)*dL_dK
|
||||
for p in self.parts:
|
||||
target += p.gradients_X(k/p.K(X,X2),X,X2)
|
||||
return target
|
||||
|
||||
def gradients_X_diag(self, dL_dKdiag, X):
|
||||
target = np.zeros(X.shape)
|
||||
k = self.Kdiag(X)*dL_dKdiag
|
||||
for p in self.parts:
|
||||
target += p.gradients_X_diag(k/p.Kdiag(X),X)
|
||||
return target
|
||||
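# Added note (not part of the committed file): the gradient updates above rely
# on the product rule: with K = prod_p K_p, dK/dtheta_p = (K / K_p) * dK_p/dtheta_p,
# so each part receives dL_dK * K / K_p as its effective dL_dK (this assumes no
# K_p contains exact zeros). For two factors the identity is simply
#     (K1 * K2) / K1 == K2,
# which is the array handed to the first part in update_gradients_full.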
55
GPy/kern/_src/psi_comp/__init__.py
Normal file
@@ -0,0 +1,55 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from ....core.parameterization.parameter_core import Pickleable
|
||||
from GPy.util.caching import Cache_this
|
||||
from ....core.parameterization import variational
|
||||
import rbf_psi_comp
|
||||
import ssrbf_psi_comp
|
||||
import sslinear_psi_comp
|
||||
import linear_psi_comp
|
||||
|
||||
class PSICOMP_RBF(Pickleable):
|
||||
@Cache_this(limit=2, ignore_args=(0,))
|
||||
def psicomputations(self, variance, lengthscale, Z, variational_posterior):
|
||||
if isinstance(variational_posterior, variational.NormalPosterior):
|
||||
return rbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior)
|
||||
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
|
||||
return ssrbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior)
|
||||
else:
|
||||
raise ValueError, "unknown distribution received for psi-statistics"
|
||||
|
||||
@Cache_this(limit=2, ignore_args=(0,1,2,3))
|
||||
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
|
||||
if isinstance(variational_posterior, variational.NormalPosterior):
|
||||
return rbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior)
|
||||
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
|
||||
return ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior)
|
||||
else:
|
||||
raise ValueError, "unknown distribution received for psi-statistics"
|
||||
|
||||
def _setup_observers(self):
|
||||
pass
|
||||
|
||||
class PSICOMP_Linear(Pickleable):
|
||||
|
||||
@Cache_this(limit=2, ignore_args=(0,))
|
||||
def psicomputations(self, variance, Z, variational_posterior):
|
||||
if isinstance(variational_posterior, variational.NormalPosterior):
|
||||
return linear_psi_comp.psicomputations(variance, Z, variational_posterior)
|
||||
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
|
||||
return sslinear_psi_comp.psicomputations(variance, Z, variational_posterior)
|
||||
else:
|
||||
raise ValueError, "unknown distribution received for psi-statistics"
|
||||
|
||||
@Cache_this(limit=2, ignore_args=(0,1,2,3))
|
||||
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior):
|
||||
if isinstance(variational_posterior, variational.NormalPosterior):
|
||||
return linear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior)
|
||||
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
|
||||
return sslinear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior)
|
||||
else:
|
||||
raise ValueError, "unknown distribution received for psi-statistics"
|
||||
|
||||
def _setup_observers(self):
|
||||
pass
|
||||
77
GPy/kern/_src/psi_comp/linear_psi_comp.py
Normal file
@@ -0,0 +1,77 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
"""
|
||||
The package for the Psi statistics computation of the linear kernel for Bayesian GPLVM
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from ....util.linalg import tdot
|
||||
|
||||
def psicomputations(variance, Z, variational_posterior):
|
||||
"""
|
||||
Compute psi-statistics for the linear kernel
|
||||
"""
|
||||
# here are the "statistics" for psi0, psi1 and psi2
|
||||
# Produced intermediate results:
|
||||
# psi0 N
|
||||
# psi1 NxM
|
||||
# psi2 MxM
|
||||
mu = variational_posterior.mean
|
||||
S = variational_posterior.variance
|
||||
|
||||
psi0 = (variance*(np.square(mu)+S)).sum(axis=1)
|
||||
psi1 = np.dot(mu,(variance*Z).T)
|
||||
psi2 = np.dot(S.sum(axis=0)*np.square(variance)*Z,Z.T)+ tdot(psi1.T)
|
||||
|
||||
return psi0, psi1, psi2
|
||||
|
||||
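# Added note (not part of the committed file): with q(X) = N(mu, S) and the
# linear kernel k(x, z) = sum_q variance_q x_q z_q, the statistics above are
#   psi0[n]   = sum_q variance_q * (mu[n,q]**2 + S[n,q])             # E[k(x_n, x_n)]
#   psi1[n,m] = sum_q mu[n,q] * variance_q * Z[m,q]                  # E[k(x_n, z_m)]
#   psi2      = Z diag(variance**2 * S.sum(0)) Z^T + psi1^T psi1     # sum_n E[k(Z, x_n) k(x_n, Z)]
# with shapes psi0: (N,), psi1: (N, M), psi2: (M, M) -- exactly the numpy
# expressions used in psicomputations above.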
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior):
|
||||
mu = variational_posterior.mean
|
||||
S = variational_posterior.variance
|
||||
|
||||
dL_dvar, dL_dmu, dL_dS, dL_dZ = _psi2computations(dL_dpsi2, variance, Z, mu, S)
|
||||
|
||||
# Compute for psi0 and psi1
|
||||
mu2S = np.square(mu)+S
|
||||
dL_dpsi0_var = dL_dpsi0[:,None]*variance[None,:]
|
||||
dL_dpsi1_mu = np.dot(dL_dpsi1.T,mu)
|
||||
dL_dvar += (dL_dpsi0[:,None]*mu2S).sum(axis=0)+ (dL_dpsi1_mu*Z).sum(axis=0)
|
||||
dL_dmu += 2.*dL_dpsi0_var*mu+np.dot(dL_dpsi1,Z)*variance
|
||||
dL_dS += dL_dpsi0_var
|
||||
dL_dZ += dL_dpsi1_mu*variance
|
||||
|
||||
return dL_dvar, dL_dZ, dL_dmu, dL_dS
|
||||
|
||||
def _psi2computations(dL_dpsi2, variance, Z, mu, S):
|
||||
"""
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
gamma - NxQ
|
||||
"""
|
||||
# here are the "statistics" for psi1 and psi2
|
||||
# Produced intermediate results:
|
||||
# _psi2_dvariance Q
|
||||
# _psi2_dZ MxQ
|
||||
# _psi2_dmu NxQ
|
||||
# _psi2_dS NxQ
|
||||
|
||||
variance2 = np.square(variance)
|
||||
common_sum = np.dot(mu,(variance*Z).T)
|
||||
Z_expect = (np.dot(dL_dpsi2,Z)*Z).sum(axis=0)
|
||||
dL_dpsi2T = dL_dpsi2+dL_dpsi2.T
|
||||
common_expect = np.dot(common_sum,np.dot(dL_dpsi2T,Z))
|
||||
Z2_expect = np.inner(common_sum,dL_dpsi2T)
|
||||
Z1_expect = np.dot(dL_dpsi2T,Z)
|
||||
|
||||
dL_dvar = 2.*S.sum(axis=0)*variance*Z_expect+(common_expect*mu).sum(axis=0)
|
||||
|
||||
dL_dmu = common_expect*variance
|
||||
|
||||
dL_dS = np.empty(S.shape)
|
||||
dL_dS[:] = Z_expect*variance2
|
||||
|
||||
dL_dZ = variance2*S.sum(axis=0)*Z1_expect+np.dot(Z2_expect.T,variance*mu)
|
||||
|
||||
return dL_dvar, dL_dmu, dL_dS, dL_dZ
|
||||
161
GPy/kern/_src/psi_comp/rbf_psi_comp.py
Normal file
@@ -0,0 +1,161 @@
"""
|
||||
The module for psi-statistics for RBF kernel
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from GPy.util.caching import Cacher
|
||||
|
||||
def psicomputations(variance, lengthscale, Z, variational_posterior):
|
||||
"""
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
gamma - NxQ
|
||||
"""
|
||||
# here are the "statistics" for psi0, psi1 and psi2
|
||||
# Produced intermediate results:
|
||||
# _psi1 NxM
|
||||
mu = variational_posterior.mean
|
||||
S = variational_posterior.variance
|
||||
|
||||
psi0 = np.empty(mu.shape[0])
|
||||
psi0[:] = variance
|
||||
psi1 = _psi1computations(variance, lengthscale, Z, mu, S)
|
||||
psi2 = _psi2computations(variance, lengthscale, Z, mu, S).sum(axis=0)
|
||||
return psi0, psi1, psi2
|
||||
|
||||
def __psi1computations(variance, lengthscale, Z, mu, S):
|
||||
"""
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
gamma - NxQ
|
||||
"""
|
||||
# here are the "statistics" for psi1
|
||||
# Produced intermediate results:
|
||||
# _psi1 NxM
|
||||
|
||||
lengthscale2 = np.square(lengthscale)
|
||||
|
||||
# psi1
|
||||
_psi1_logdenom = np.log(S/lengthscale2+1.).sum(axis=-1) # N
|
||||
_psi1_log = (_psi1_logdenom[:,None]+np.einsum('nmq,nq->nm',np.square(mu[:,None,:]-Z[None,:,:]),1./(S+lengthscale2)))/(-2.)
|
||||
_psi1 = variance*np.exp(_psi1_log)
|
||||
|
||||
return _psi1
|
||||
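# Added note (not part of the committed file): the log-space expression above is
# the standard closed form of the RBF psi1 statistic under q(x_n) = N(mu_n, S_n):
#   psi1[n,m] = variance * prod_q ( (S[n,q]/l_q**2 + 1)**-0.5
#                                   * exp(-0.5*(mu[n,q]-Z[m,q])**2/(S[n,q]+l_q**2)) )
# computed as variance * exp(-0.5*(logdenom + squared-distance term)) for
# numerical stability.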
|
||||
def __psi2computations(variance, lengthscale, Z, mu, S):
|
||||
"""
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
gamma - NxQ
|
||||
"""
|
||||
# here are the "statistics" for psi2
|
||||
# Produced intermediate results:
|
||||
# _psi2 MxM
|
||||
|
||||
lengthscale2 = np.square(lengthscale)
|
||||
|
||||
_psi2_logdenom = np.log(2.*S/lengthscale2+1.).sum(axis=-1)/(-2.) # N
|
||||
_psi2_exp1 = (np.square(Z[:,None,:]-Z[None,:,:])/lengthscale2).sum(axis=-1)/(-4.) #MxM
|
||||
Z_hat = (Z[:,None,:]+Z[None,:,:])/2. #MxMxQ
|
||||
denom = 1./(2.*S+lengthscale2)
|
||||
_psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+2.*np.einsum('nq,moq,nq->nmo',mu,Z_hat,denom)-np.einsum('moq,nq->nmo',np.square(Z_hat),denom)
|
||||
_psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2)
|
||||
|
||||
|
||||
return _psi2
|
||||
|
||||
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
|
||||
ARD = (len(lengthscale)!=1)
|
||||
|
||||
dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance)
|
||||
dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance)
|
||||
|
||||
dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
|
||||
|
||||
dL_dlengscale = dl_psi1 + dl_psi2
|
||||
if not ARD:
|
||||
dL_dlengscale = dL_dlengscale.sum()
|
||||
|
||||
dL_dmu = dmu_psi1 + dmu_psi2
|
||||
dL_dS = dS_psi1 + dS_psi2
|
||||
dL_dZ = dZ_psi1 + dZ_psi2
|
||||
|
||||
return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS
|
||||
|
||||
def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S):
|
||||
"""
|
||||
dL_dpsi1 - NxM
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
gamma - NxQ
|
||||
"""
|
||||
# here are the "statistics" for psi1
|
||||
# Produced intermediate results: dL_dparams w.r.t. psi1
|
||||
# _dL_dvariance 1
|
||||
# _dL_dlengthscale Q
|
||||
# _dL_dZ MxQ
|
||||
# _dL_dgamma NxQ
|
||||
# _dL_dmu NxQ
|
||||
# _dL_dS NxQ
|
||||
|
||||
lengthscale2 = np.square(lengthscale)
|
||||
|
||||
_psi1 = _psi1computations(variance, lengthscale, Z, mu, S)
|
||||
Lpsi1 = dL_dpsi1*_psi1
|
||||
Zmu = Z[None,:,:]-mu[:,None,:] # NxMxQ
|
||||
denom = 1./(S+lengthscale2)
|
||||
Zmu2_denom = np.square(Zmu)*denom[:,None,:] #NxMxQ
|
||||
_dL_dvar = Lpsi1.sum()/variance
|
||||
_dL_dmu = np.einsum('nm,nmq,nq->nq',Lpsi1,Zmu,denom)
|
||||
_dL_dS = np.einsum('nm,nmq,nq->nq',Lpsi1,(Zmu2_denom-1.),denom)/2.
|
||||
_dL_dZ = -np.einsum('nm,nmq,nq->mq',Lpsi1,Zmu,denom)
|
||||
_dL_dl = np.einsum('nm,nmq,nq->q',Lpsi1,(Zmu2_denom+(S/lengthscale2)[:,None,:]),denom*lengthscale)
|
||||
|
||||
return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
|
||||
|
||||
def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
|
||||
"""
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
gamma - NxQ
|
||||
dL_dpsi2 - MxM
|
||||
"""
|
||||
# here are the "statistics" for psi2
|
||||
# Produced the derivatives w.r.t. psi2:
|
||||
# _dL_dvariance 1
|
||||
# _dL_dlengthscale Q
|
||||
# _dL_dZ MxQ
|
||||
# _dL_dgamma NxQ
|
||||
# _dL_dmu NxQ
|
||||
# _dL_dS NxQ
|
||||
|
||||
lengthscale2 = np.square(lengthscale)
|
||||
denom = 1./(2*S+lengthscale2)
|
||||
denom2 = np.square(denom)
|
||||
|
||||
_psi2 = _psi2computations(variance, lengthscale, Z, mu, S) # NxMxM
|
||||
Lpsi2 = dL_dpsi2*_psi2 # dL_dpsi2 is MxM, using broadcast to multiply N out
|
||||
Lpsi2sum = np.einsum('nmo->n',Lpsi2) #N
|
||||
Lpsi2Z = np.einsum('nmo,oq->nq',Lpsi2,Z) #NxQ
|
||||
Lpsi2Z2 = np.einsum('nmo,oq,oq->nq',Lpsi2,Z,Z) #NxQ
|
||||
Lpsi2Z2p = np.einsum('nmo,mq,oq->nq',Lpsi2,Z,Z) #NxQ
|
||||
Lpsi2Zhat = Lpsi2Z
|
||||
Lpsi2Zhat2 = (Lpsi2Z2+Lpsi2Z2p)/2
|
||||
|
||||
_dL_dvar = Lpsi2sum.sum()*2/variance
|
||||
_dL_dmu = (-2*denom) * (mu*Lpsi2sum[:,None]-Lpsi2Zhat)
|
||||
_dL_dS = (2*np.square(denom))*(np.square(mu)*Lpsi2sum[:,None]-2*mu*Lpsi2Zhat+Lpsi2Zhat2) - denom*Lpsi2sum[:,None]
|
||||
_dL_dZ = -np.einsum('nmo,oq->oq',Lpsi2,Z)/lengthscale2+np.einsum('nmo,oq->mq',Lpsi2,Z)/lengthscale2+ \
|
||||
2*np.einsum('nmo,nq,nq->mq',Lpsi2,mu,denom) - np.einsum('nmo,nq,mq->mq',Lpsi2,denom,Z) - np.einsum('nmo,oq,nq->mq',Lpsi2,Z,denom)
|
||||
_dL_dl = 2*lengthscale* ((S/lengthscale2*denom+np.square(mu*denom))*Lpsi2sum[:,None]+(Lpsi2Z2-Lpsi2Z2p)/(2*np.square(lengthscale2))-
|
||||
(2*mu*denom2)*Lpsi2Zhat+denom2*Lpsi2Zhat2).sum(axis=0)
|
||||
|
||||
return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
|
||||
|
||||
_psi1computations = Cacher(__psi1computations, limit=1)
|
||||
_psi2computations = Cacher(__psi2computations, limit=1)
|
||||
411
GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py
Normal file
@@ -0,0 +1,411 @@
"""
|
||||
The module for psi-statistics for RBF kernel
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from ....util.caching import Cache_this
|
||||
from . import PSICOMP_RBF
|
||||
from ....util import gpu_init
|
||||
|
||||
try:
|
||||
import pycuda.gpuarray as gpuarray
|
||||
from pycuda.compiler import SourceModule
|
||||
from ....util.linalg_gpu import sum_axis
|
||||
except:
|
||||
pass
|
||||
|
||||
gpu_code = """
|
||||
// define THREADNUM
|
||||
|
||||
#define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
|
||||
#define IDX_NMM(n,m1,m2) ((m2*M+m1)*N+n)
|
||||
#define IDX_NQ(n,q) (q*N+n)
|
||||
#define IDX_NM(n,m) (m*N+n)
|
||||
#define IDX_MQ(m,q) (q*M+m)
|
||||
#define IDX_MM(m1,m2) (m2*M+m1)
|
||||
#define IDX_NQB(n,q,b) ((b*Q+q)*N+n)
|
||||
#define IDX_QB(q,b) (b*Q+q)
|
||||
|
||||
// Divide data evenly
|
||||
__device__ void divide_data(int total_data, int psize, int pidx, int *start, int *end) {
|
||||
int residue = (total_data)%psize;
|
||||
if(pidx<residue) {
|
||||
int size = total_data/psize+1;
|
||||
*start = size*pidx;
|
||||
*end = *start+size;
|
||||
} else {
|
||||
int size = total_data/psize;
|
||||
*start = size*pidx+residue;
|
||||
*end = *start+size;
|
||||
}
|
||||
}
|
||||
|
||||
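// Added note: block-level parallel reduction. Elements beyond blockDim.x are
// first folded into the leading blockDim.x slots, then a tree reduction (which
// also handles non-power-of-two sizes) accumulates the total into array[0].
// All threads of the block must call this together because of __syncthreads().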
__device__ void reduce_sum(double* array, int array_size) {
|
||||
int s;
|
||||
if(array_size >= blockDim.x) {
|
||||
for(int i=blockDim.x+threadIdx.x; i<array_size; i+= blockDim.x) {
|
||||
array[threadIdx.x] += array[i];
|
||||
}
|
||||
array_size = blockDim.x;
|
||||
}
|
||||
__syncthreads();
|
||||
for(int i=1; i<=array_size;i*=2) {s=i;}
|
||||
if(threadIdx.x < array_size-s) {array[threadIdx.x] += array[s+threadIdx.x];}
|
||||
__syncthreads();
|
||||
for(s=s/2;s>=1;s=s/2) {
|
||||
if(threadIdx.x < s) {array[threadIdx.x] += array[s+threadIdx.x];}
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void compDenom(double *log_denom1, double *log_denom2, double *l, double *S, int N, int Q)
|
||||
{
|
||||
int n_start, n_end;
|
||||
divide_data(N, gridDim.x, blockIdx.x, &n_start, &n_end);
|
||||
|
||||
for(int i=n_start*Q+threadIdx.x; i<n_end*Q; i+=blockDim.x) {
|
||||
int n=i/Q;
|
||||
int q=i%Q;
|
||||
|
||||
double Snq = S[IDX_NQ(n,q)];
|
||||
double lq = l[q]*l[q];
|
||||
log_denom1[IDX_NQ(n,q)] = log(Snq/lq+1.);
|
||||
log_denom2[IDX_NQ(n,q)] = log(2.*Snq/lq+1.);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void psi1computations(double *psi1, double *log_denom1, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
|
||||
{
|
||||
int m_start, m_end;
|
||||
divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
|
||||
|
||||
for(int m=m_start; m<m_end; m++) {
|
||||
for(int n=threadIdx.x; n<N; n+= blockDim.x) {
|
||||
double log_psi1 = 0;
|
||||
for(int q=0;q<Q;q++) {
|
||||
double muZ = mu[IDX_NQ(n,q)]-Z[IDX_MQ(m,q)];
|
||||
double Snq = S[IDX_NQ(n,q)];
|
||||
double lq = l[q]*l[q];
|
||||
log_psi1 += (muZ*muZ/(Snq+lq)+log_denom1[IDX_NQ(n,q)])/(-2.);
|
||||
}
|
||||
psi1[IDX_NM(n,m)] = var*exp(log_psi1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void psi2computations(double *psi2, double *psi2n, double *log_denom2, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
|
||||
{
|
||||
int psi2_idx_start, psi2_idx_end;
|
||||
__shared__ double psi2_local[THREADNUM];
|
||||
divide_data((M+1)*M/2, gridDim.x, blockIdx.x, &psi2_idx_start, &psi2_idx_end);
|
||||
|
||||
for(int psi2_idx=psi2_idx_start; psi2_idx<psi2_idx_end; psi2_idx++) {
|
||||
int m1 = int((sqrt(8.*psi2_idx+1.)-1.)/2.);
|
||||
int m2 = psi2_idx - (m1+1)*m1/2;
|
||||
|
||||
psi2_local[threadIdx.x] = 0;
|
||||
for(int n=threadIdx.x;n<N;n+=blockDim.x) {
|
||||
double log_psi2_n = 0;
|
||||
for(int q=0;q<Q;q++) {
|
||||
double dZ = Z[IDX_MQ(m1,q)] - Z[IDX_MQ(m2,q)];
|
||||
double muZhat = mu[IDX_NQ(n,q)]- (Z[IDX_MQ(m1,q)]+Z[IDX_MQ(m2,q)])/2.;
|
||||
double Snq = S[IDX_NQ(n,q)];
|
||||
double lq = l[q]*l[q];
|
||||
log_psi2_n += dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) + log_denom2[IDX_NQ(n,q)]/(-2.);
|
||||
}
|
||||
double exp_psi2_n = exp(log_psi2_n);
|
||||
psi2n[IDX_NMM(n,m1,m2)] = var*var*exp_psi2_n;
|
||||
if(m1!=m2) { psi2n[IDX_NMM(n,m2,m1)] = var*var*exp_psi2_n;}
|
||||
psi2_local[threadIdx.x] += exp_psi2_n;
|
||||
}
|
||||
__syncthreads();
|
||||
reduce_sum(psi2_local, THREADNUM);
|
||||
if(threadIdx.x==0) {
|
||||
psi2[IDX_MM(m1,m2)] = var*var*psi2_local[0];
|
||||
if(m1!=m2) { psi2[IDX_MM(m2,m1)] = var*var*psi2_local[0]; }
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void psi1compDer(double *dvar, double *dl, double *dZ, double *dmu, double *dS, double *dL_dpsi1, double *psi1, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
|
||||
{
|
||||
int m_start, m_end;
|
||||
__shared__ double g_local[THREADNUM];
|
||||
divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
|
||||
int P = int(ceil(double(N)/THREADNUM));
|
||||
|
||||
double dvar_local = 0;
|
||||
for(int q=0;q<Q;q++) {
|
||||
double lq_sqrt = l[q];
|
||||
double lq = lq_sqrt*lq_sqrt;
|
||||
double dl_local = 0;
|
||||
for(int p=0;p<P;p++) {
|
||||
int n = p*THREADNUM + threadIdx.x;
|
||||
double dmu_local = 0;
|
||||
double dS_local = 0;
|
||||
double Snq,mu_nq;
|
||||
if(n<N) {Snq = S[IDX_NQ(n,q)]; mu_nq=mu[IDX_NQ(n,q)];}
|
||||
for(int m=m_start; m<m_end; m++) {
|
||||
if(n<N) {
|
||||
double lpsi1 = psi1[IDX_NM(n,m)]*dL_dpsi1[IDX_NM(n,m)];
|
||||
if(q==0) {dvar_local += lpsi1;}
|
||||
|
||||
double Zmu = Z[IDX_MQ(m,q)] - mu_nq;
|
||||
double denom = Snq+lq;
|
||||
double Zmu2_denom = Zmu*Zmu/denom;
|
||||
|
||||
dmu_local += lpsi1*Zmu/denom;
|
||||
dS_local += lpsi1*(Zmu2_denom-1.)/denom;
|
||||
dl_local += lpsi1*(Zmu2_denom+Snq/lq)/denom;
|
||||
g_local[threadIdx.x] = -lpsi1*Zmu/denom;
|
||||
}
|
||||
__syncthreads();
|
||||
reduce_sum(g_local, p<P-1?THREADNUM:N-(P-1)*THREADNUM);
|
||||
if(threadIdx.x==0) {dZ[IDX_MQ(m,q)] += g_local[0];}
|
||||
}
|
||||
if(n<N) {
|
||||
dmu[IDX_NQB(n,q,blockIdx.x)] += dmu_local;
|
||||
dS[IDX_NQB(n,q,blockIdx.x)] += dS_local/2.;
|
||||
}
|
||||
__threadfence_block();
|
||||
}
|
||||
g_local[threadIdx.x] = dl_local*lq_sqrt;
|
||||
__syncthreads();
|
||||
reduce_sum(g_local, THREADNUM);
|
||||
if(threadIdx.x==0) {dl[IDX_QB(q,blockIdx.x)] += g_local[0];}
|
||||
}
|
||||
g_local[threadIdx.x] = dvar_local;
|
||||
__syncthreads();
|
||||
reduce_sum(g_local, THREADNUM);
|
||||
if(threadIdx.x==0) {dvar[blockIdx.x] += g_local[0]/var;}
|
||||
}
|
||||
|
||||
__global__ void psi2compDer(double *dvar, double *dl, double *dZ, double *dmu, double *dS, double *dL_dpsi2, double *psi2n, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
|
||||
{
|
||||
int m_start, m_end;
|
||||
__shared__ double g_local[THREADNUM];
|
||||
divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
|
||||
int P = int(ceil(double(N)/THREADNUM));
|
||||
|
||||
double dvar_local = 0;
|
||||
for(int q=0;q<Q;q++) {
|
||||
double lq_sqrt = l[q];
|
||||
double lq = lq_sqrt*lq_sqrt;
|
||||
double dl_local = 0;
|
||||
for(int p=0;p<P;p++) {
|
||||
int n = p*THREADNUM + threadIdx.x;
|
||||
double dmu_local = 0;
|
||||
double dS_local = 0;
|
||||
double Snq,mu_nq;
|
||||
if(n<N) {Snq = S[IDX_NQ(n,q)]; mu_nq=mu[IDX_NQ(n,q)];}
|
||||
for(int m1=m_start; m1<m_end; m1++) {
|
||||
g_local[threadIdx.x] = 0;
|
||||
for(int m2=0;m2<M;m2++) {
|
||||
if(n<N) {
|
||||
double lpsi2 = psi2n[IDX_NMM(n,m1,m2)]*dL_dpsi2[IDX_MM(m1,m2)];
|
||||
if(q==0) {dvar_local += lpsi2;}
|
||||
|
||||
double dZ = Z[IDX_MQ(m1,q)] - Z[IDX_MQ(m2,q)];
|
||||
double muZhat = mu_nq - (Z[IDX_MQ(m1,q)] + Z[IDX_MQ(m2,q)])/2.;
|
||||
double denom = 2.*Snq+lq;
|
||||
double muZhat2_denom = muZhat*muZhat/denom;
|
||||
|
||||
dmu_local += lpsi2*muZhat/denom;
|
||||
dS_local += lpsi2*(2.*muZhat2_denom-1.)/denom;
|
||||
dl_local += lpsi2*((Snq/lq+muZhat2_denom)/denom+dZ*dZ/(4.*lq*lq));
|
||||
g_local[threadIdx.x] += 2.*lpsi2*(muZhat/denom-dZ/(2*lq));
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
reduce_sum(g_local, p<P-1?THREADNUM:N-(P-1)*THREADNUM);
|
||||
if(threadIdx.x==0) {dZ[IDX_MQ(m1,q)] += g_local[0];}
|
||||
}
|
||||
if(n<N) {
|
||||
dmu[IDX_NQB(n,q,blockIdx.x)] += -2.*dmu_local;
|
||||
dS[IDX_NQB(n,q,blockIdx.x)] += dS_local;
|
||||
}
|
||||
__threadfence_block();
|
||||
}
|
||||
g_local[threadIdx.x] = dl_local*2.*lq_sqrt;
|
||||
__syncthreads();
|
||||
reduce_sum(g_local, THREADNUM);
|
||||
if(threadIdx.x==0) {dl[IDX_QB(q,blockIdx.x)] += g_local[0];}
|
||||
}
|
||||
g_local[threadIdx.x] = dvar_local;
|
||||
__syncthreads();
|
||||
reduce_sum(g_local, THREADNUM);
|
||||
if(threadIdx.x==0) {dvar[blockIdx.x] += g_local[0]*2/var;}
|
||||
}
|
||||
"""
|
||||
|
||||
class PSICOMP_RBF_GPU(PSICOMP_RBF):
|
||||
|
||||
def __init__(self, threadnum=128, blocknum=15, GPU_direct=False):
|
||||
self.GPU_direct = GPU_direct
|
||||
self.gpuCache = None
|
||||
|
||||
self.threadnum = threadnum
|
||||
self.blocknum = blocknum
|
||||
module = SourceModule("#define THREADNUM "+str(self.threadnum)+"\n"+gpu_code)
|
||||
self.g_psi1computations = module.get_function('psi1computations')
|
||||
self.g_psi1computations.prepare('PPdPPPPiii')
|
||||
self.g_psi2computations = module.get_function('psi2computations')
|
||||
self.g_psi2computations.prepare('PPPdPPPPiii')
|
||||
self.g_psi1compDer = module.get_function('psi1compDer')
|
||||
self.g_psi1compDer.prepare('PPPPPPPdPPPPiii')
|
||||
self.g_psi2compDer = module.get_function('psi2compDer')
|
||||
self.g_psi2compDer.prepare('PPPPPPPdPPPPiii')
|
||||
self.g_compDenom = module.get_function('compDenom')
|
||||
self.g_compDenom.prepare('PPPPii')
|
||||
|
||||
def __deepcopy__(self, memo):
|
||||
s = PSICOMP_RBF_GPU(threadnum=self.threadnum, blocknum=self.blocknum, GPU_direct=self.GPU_direct)
|
||||
memo[id(self)] = s
|
||||
return s
|
||||
|
||||
def _initGPUCache(self, N, M, Q):
|
||||
if self.gpuCache == None:
|
||||
self.gpuCache = {
|
||||
'l_gpu' :gpuarray.empty((Q,),np.float64,order='F'),
|
||||
'Z_gpu' :gpuarray.empty((M,Q),np.float64,order='F'),
|
||||
'mu_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
||||
'S_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
||||
'psi1_gpu' :gpuarray.empty((N,M),np.float64,order='F'),
|
||||
'psi2_gpu' :gpuarray.empty((M,M),np.float64,order='F'),
|
||||
'psi2n_gpu' :gpuarray.empty((N,M,M),np.float64,order='F'),
|
||||
'dL_dpsi1_gpu' :gpuarray.empty((N,M),np.float64,order='F'),
|
||||
'dL_dpsi2_gpu' :gpuarray.empty((M,M),np.float64,order='F'),
|
||||
'log_denom1_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
||||
'log_denom2_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
||||
# derivatives
|
||||
'dvar_gpu' :gpuarray.empty((self.blocknum,),np.float64, order='F'),
|
||||
'dl_gpu' :gpuarray.empty((Q,self.blocknum),np.float64, order='F'),
|
||||
'dZ_gpu' :gpuarray.empty((M,Q),np.float64, order='F'),
|
||||
'dmu_gpu' :gpuarray.empty((N,Q,self.blocknum),np.float64, order='F'),
|
||||
'dS_gpu' :gpuarray.empty((N,Q,self.blocknum),np.float64, order='F'),
|
||||
# grad
|
||||
'grad_l_gpu' :gpuarray.empty((Q,),np.float64, order='F'),
|
||||
'grad_mu_gpu' :gpuarray.empty((N,Q,),np.float64, order='F'),
|
||||
'grad_S_gpu' :gpuarray.empty((N,Q,),np.float64, order='F'),
|
||||
}
|
||||
else:
|
||||
assert N==self.gpuCache['mu_gpu'].shape[0]
|
||||
assert M==self.gpuCache['Z_gpu'].shape[0]
|
||||
assert Q==self.gpuCache['l_gpu'].shape[0]
|
||||
|
||||
def sync_params(self, lengthscale, Z, mu, S):
|
||||
if len(lengthscale)==1:
|
||||
self.gpuCache['l_gpu'].fill(lengthscale)
|
||||
else:
|
||||
self.gpuCache['l_gpu'].set(np.asfortranarray(lengthscale))
|
||||
self.gpuCache['Z_gpu'].set(np.asfortranarray(Z))
|
||||
self.gpuCache['mu_gpu'].set(np.asfortranarray(mu))
|
||||
self.gpuCache['S_gpu'].set(np.asfortranarray(S))
|
||||
N,Q = self.gpuCache['S_gpu'].shape
|
||||
# t=self.g_compDenom(self.gpuCache['log_denom1_gpu'],self.gpuCache['log_denom2_gpu'],self.gpuCache['l_gpu'],self.gpuCache['S_gpu'], np.int32(N), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
|
||||
# print 'g_compDenom '+str(t)
|
||||
self.g_compDenom.prepared_call((self.blocknum,1),(self.threadnum,1,1), self.gpuCache['log_denom1_gpu'].gpudata,self.gpuCache['log_denom2_gpu'].gpudata,self.gpuCache['l_gpu'].gpudata,self.gpuCache['S_gpu'].gpudata, np.int32(N), np.int32(Q))
|
||||
|
||||
def reset_derivative(self):
|
||||
self.gpuCache['dvar_gpu'].fill(0.)
|
||||
self.gpuCache['dl_gpu'].fill(0.)
|
||||
self.gpuCache['dZ_gpu'].fill(0.)
|
||||
self.gpuCache['dmu_gpu'].fill(0.)
|
||||
self.gpuCache['dS_gpu'].fill(0.)
|
||||
self.gpuCache['grad_l_gpu'].fill(0.)
|
||||
self.gpuCache['grad_mu_gpu'].fill(0.)
|
||||
self.gpuCache['grad_S_gpu'].fill(0.)
|
||||
|
||||
def get_dimensions(self, Z, variational_posterior):
|
||||
return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1]
|
||||
|
||||
@Cache_this(limit=1, ignore_args=(0,))
|
||||
def psicomputations(self, variance, lengthscale, Z, variational_posterior):
|
||||
"""
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
"""
|
||||
N,M,Q = self.get_dimensions(Z, variational_posterior)
|
||||
self._initGPUCache(N,M,Q)
|
||||
self.sync_params(lengthscale, Z, variational_posterior.mean, variational_posterior.variance)
|
||||
|
||||
psi1_gpu = self.gpuCache['psi1_gpu']
|
||||
psi2_gpu = self.gpuCache['psi2_gpu']
|
||||
psi2n_gpu = self.gpuCache['psi2n_gpu']
|
||||
l_gpu = self.gpuCache['l_gpu']
|
||||
Z_gpu = self.gpuCache['Z_gpu']
|
||||
mu_gpu = self.gpuCache['mu_gpu']
|
||||
S_gpu = self.gpuCache['S_gpu']
|
||||
log_denom1_gpu = self.gpuCache['log_denom1_gpu']
|
||||
log_denom2_gpu = self.gpuCache['log_denom2_gpu']
|
||||
|
||||
psi0 = np.empty((N,))
|
||||
psi0[:] = variance
|
||||
self.g_psi1computations.prepared_call((self.blocknum,1),(self.threadnum,1,1),psi1_gpu.gpudata, log_denom1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
|
||||
self.g_psi2computations.prepared_call((self.blocknum,1),(self.threadnum,1,1),psi2_gpu.gpudata, psi2n_gpu.gpudata, log_denom2_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
|
||||
# t = self.g_psi1computations(psi1_gpu, log_denom1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
|
||||
# print 'g_psi1computations '+str(t)
|
||||
# t = self.g_psi2computations(psi2_gpu, psi2n_gpu, log_denom2_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
|
||||
# print 'g_psi2computations '+str(t)
|
||||
|
||||
if self.GPU_direct:
|
||||
return psi0, psi1_gpu, psi2_gpu
|
||||
else:
|
||||
return psi0, psi1_gpu.get(), psi2_gpu.get()
|
||||
|
||||
@Cache_this(limit=1, ignore_args=(0,1,2,3))
|
||||
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
|
||||
ARD = (len(lengthscale)!=1)
|
||||
|
||||
N,M,Q = self.get_dimensions(Z, variational_posterior)
|
||||
psi1_gpu = self.gpuCache['psi1_gpu']
|
||||
psi2n_gpu = self.gpuCache['psi2n_gpu']
|
||||
l_gpu = self.gpuCache['l_gpu']
|
||||
Z_gpu = self.gpuCache['Z_gpu']
|
||||
mu_gpu = self.gpuCache['mu_gpu']
|
||||
S_gpu = self.gpuCache['S_gpu']
|
||||
dvar_gpu = self.gpuCache['dvar_gpu']
|
||||
dl_gpu = self.gpuCache['dl_gpu']
|
||||
dZ_gpu = self.gpuCache['dZ_gpu']
|
||||
dmu_gpu = self.gpuCache['dmu_gpu']
|
||||
dS_gpu = self.gpuCache['dS_gpu']
|
||||
grad_l_gpu = self.gpuCache['grad_l_gpu']
|
||||
grad_mu_gpu = self.gpuCache['grad_mu_gpu']
|
||||
grad_S_gpu = self.gpuCache['grad_S_gpu']
|
||||
|
||||
if self.GPU_direct:
|
||||
dL_dpsi1_gpu = dL_dpsi1
|
||||
dL_dpsi2_gpu = dL_dpsi2
|
||||
dL_dpsi0_sum = dL_dpsi0.get().sum() #gpuarray.sum(dL_dpsi0).get()
|
||||
else:
|
||||
dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu']
|
||||
dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu']
|
||||
dL_dpsi1_gpu.set(np.asfortranarray(dL_dpsi1))
|
||||
dL_dpsi2_gpu.set(np.asfortranarray(dL_dpsi2))
|
||||
dL_dpsi0_sum = dL_dpsi0.sum()
|
||||
|
||||
self.reset_derivative()
|
||||
# t=self.g_psi1compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi1_gpu,psi1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
|
||||
# print 'g_psi1compDer '+str(t)
|
||||
# t=self.g_psi2compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi2_gpu,psi2n_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
|
||||
# print 'g_psi2compDer '+str(t)
|
||||
self.g_psi1compDer.prepared_call((self.blocknum,1),(self.threadnum,1,1),dvar_gpu.gpudata,dl_gpu.gpudata,dZ_gpu.gpudata,dmu_gpu.gpudata,dS_gpu.gpudata,dL_dpsi1_gpu.gpudata,psi1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
|
||||
self.g_psi2compDer.prepared_call((self.blocknum,1),(self.threadnum,1,1),dvar_gpu.gpudata,dl_gpu.gpudata,dZ_gpu.gpudata,dmu_gpu.gpudata,dS_gpu.gpudata,dL_dpsi2_gpu.gpudata,psi2n_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
|
||||
|
||||
dL_dvar = dL_dpsi0_sum + dvar_gpu.get().sum()#gpuarray.sum(dvar_gpu).get()
|
||||
sum_axis(grad_mu_gpu,dmu_gpu,N*Q,self.blocknum)
|
||||
dL_dmu = grad_mu_gpu.get()
|
||||
sum_axis(grad_S_gpu,dS_gpu,N*Q,self.blocknum)
|
||||
dL_dS = grad_S_gpu.get()
|
||||
dL_dZ = dZ_gpu.get()
|
||||
if ARD:
|
||||
sum_axis(grad_l_gpu,dl_gpu,Q,self.blocknum)
|
||||
dL_dlengscale = grad_l_gpu.get()
|
||||
else:
|
||||
dL_dlengscale = dl_gpu.get().sum() #gpuarray.sum(dl_gpu).get()
|
||||
|
||||
return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS
|
||||
|
||||
|
||||
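For reference, the quantities the psi1computations and psi2computations kernels above accumulate in log space can be written out; this is a direct transcription of the loop bodies (with \ell_q = l[q] and \sigma^2 = var), not separate documentation:

\psi_1^{(nm)} = \sigma^2 \exp\left(-\frac{1}{2}\sum_{q=1}^{Q}\left[\frac{(\mu_{nq}-z_{mq})^2}{S_{nq}+\ell_q^2} + \log\left(\frac{S_{nq}}{\ell_q^2}+1\right)\right]\right)

\psi_2^{(n)}{}_{m_1 m_2} = \sigma^4 \exp\left(\sum_{q=1}^{Q}\left[-\frac{(z_{m_1 q}-z_{m_2 q})^2}{4\ell_q^2} - \frac{\big(\mu_{nq}-\tfrac{1}{2}(z_{m_1 q}+z_{m_2 q})\big)^2}{2 S_{nq}+\ell_q^2} - \frac{1}{2}\log\left(\frac{2 S_{nq}}{\ell_q^2}+1\right)\right]\right)

with \psi_2{}_{m_1 m_2} = \sum_{n=1}^{N} \psi_2^{(n)}{}_{m_1 m_2}, which is what the reduce_sum over n computes.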
92 GPy/kern/_src/psi_comp/sslinear_psi_comp.py Normal file
@@ -0,0 +1,92 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
"""
|
||||
The package for the Psi statistics computation of the linear kernel for SSGPLVM
|
||||
"""
|
||||
|
||||
from ....util.linalg import tdot
|
||||
|
||||
import numpy as np
|
||||
|
||||
def psicomputations(variance, Z, variational_posterior):
|
||||
"""
|
||||
Compute psi-statistics for ss-linear kernel
|
||||
"""
|
||||
# here are the "statistics" for psi0, psi1 and psi2
|
||||
# Produced intermediate results:
|
||||
# psi0 N
|
||||
# psi1 NxM
|
||||
# psi2 MxM
|
||||
mu = variational_posterior.mean
|
||||
S = variational_posterior.variance
|
||||
gamma = variational_posterior.binary_prob
|
||||
|
||||
psi0 = (gamma*(np.square(mu)+S)*variance).sum(axis=-1)
|
||||
psi1 = np.inner(variance*gamma*mu,Z)
|
||||
psi2 = np.inner(np.square(variance)*(gamma*((1-gamma)*np.square(mu)+S)).sum(axis=0)*Z,Z)+tdot(psi1.T)
|
||||
|
||||
return psi0, psi1, psi2
|
||||
|
||||
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior):
|
||||
mu = variational_posterior.mean
|
||||
S = variational_posterior.variance
|
||||
gamma = variational_posterior.binary_prob
|
||||
|
||||
dL_dvar, dL_dgamma, dL_dmu, dL_dS, dL_dZ = _psi2computations(dL_dpsi2, variance, Z, mu, S, gamma)
|
||||
|
||||
# Compute for psi0 and psi1
|
||||
mu2S = np.square(mu)+S
|
||||
dL_dvar += np.einsum('n,nq,nq->q',dL_dpsi0,gamma,mu2S) + np.einsum('nm,nq,mq,nq->q',dL_dpsi1,gamma,Z,mu)
|
||||
dL_dgamma += np.einsum('n,q,nq->nq',dL_dpsi0,variance,mu2S) + np.einsum('nm,q,mq,nq->nq',dL_dpsi1,variance,Z,mu)
|
||||
dL_dmu += np.einsum('n,nq,q,nq->nq',dL_dpsi0,gamma,2.*variance,mu) + np.einsum('nm,nq,q,mq->nq',dL_dpsi1,gamma,variance,Z)
|
||||
dL_dS += np.einsum('n,nq,q->nq',dL_dpsi0,gamma,variance)
|
||||
dL_dZ += np.einsum('nm,nq,q,nq->mq',dL_dpsi1,gamma, variance,mu)
|
||||
|
||||
return dL_dvar, dL_dZ, dL_dmu, dL_dS, dL_dgamma
|
||||
|
||||
def _psi2computations(dL_dpsi2, variance, Z, mu, S, gamma):
|
||||
"""
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
gamma - NxQ
|
||||
"""
|
||||
# here are the "statistics" for psi1 and psi2
|
||||
# Produced intermediate results:
|
||||
# _psi2_dvariance Q
|
||||
# _psi2_dZ MxQ
|
||||
# _psi2_dgamma NxQ
|
||||
# _psi2_dmu NxQ
|
||||
# _psi2_dS NxQ
|
||||
|
||||
mu2 = np.square(mu)
|
||||
gamma2 = np.square(gamma)
|
||||
variance2 = np.square(variance)
|
||||
mu2S = mu2+S # NxQ
|
||||
gvm = np.einsum('nq,nq,q->nq',gamma,mu,variance)
|
||||
common_sum = np.einsum('nq,mq->nm',gvm,Z)
|
||||
# common_sum = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu) # NxM
|
||||
Z_expect = np.einsum('mo,mq,oq->q',dL_dpsi2,Z,Z)
|
||||
dL_dpsi2T = dL_dpsi2+dL_dpsi2.T
|
||||
tmp = np.einsum('mo,oq->mq',dL_dpsi2T,Z)
|
||||
common_expect = np.einsum('mq,nm->nq',tmp,common_sum)
|
||||
# common_expect = np.einsum('mo,mq,no->nq',dL_dpsi2+dL_dpsi2.T,Z,common_sum)
|
||||
Z2_expect = np.einsum('om,nm->no',dL_dpsi2T,common_sum)
|
||||
Z1_expect = np.einsum('om,mq->oq',dL_dpsi2T,Z)
|
||||
|
||||
dL_dvar = np.einsum('nq,q,q->q',2.*(gamma*mu2S-gamma2*mu2),variance,Z_expect)+\
|
||||
np.einsum('nq,nq,nq->q',common_expect,gamma,mu)
|
||||
|
||||
dL_dgamma = np.einsum('q,q,nq->nq',Z_expect,variance2,(mu2S-2.*gamma*mu2))+\
|
||||
np.einsum('nq,q,nq->nq',common_expect,variance,mu)
|
||||
|
||||
dL_dmu = np.einsum('q,q,nq,nq->nq',Z_expect,variance2,mu,2.*(gamma-gamma2))+\
|
||||
np.einsum('nq,nq,q->nq',common_expect,gamma,variance)
|
||||
|
||||
dL_dS = np.einsum('q,nq,q->nq',Z_expect,gamma,variance2)
|
||||
|
||||
# dL_dZ = 2.*(np.einsum('om,nq,q,mq,nq->oq',dL_dpsi2,gamma,variance2,Z,(mu2S-gamma*mu2))+np.einsum('om,nq,q,nq,nm->oq',dL_dpsi2,gamma,variance,mu,common_sum))
|
||||
dL_dZ = Z1_expect*np.einsum('nq,q,nq->q',gamma,variance2,(mu2S-gamma*mu2))+np.einsum('nq,q,nq,nm->mq',gamma,variance,mu,Z2_expect)
|
||||
|
||||
return dL_dvar, dL_dgamma, dL_dmu, dL_dS, dL_dZ
|
||||
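Written out, the statistics returned by psicomputations above are (a direct transcription of the NumPy expressions, with \sigma_q^2 the, possibly per-dimension, variance of the linear kernel):

\psi_0^{(n)} = \sum_{q} \sigma_q^2\, \gamma_{nq}\,(\mu_{nq}^2 + S_{nq}), \qquad
\psi_1^{(nm)} = \sum_{q} \sigma_q^2\, \gamma_{nq}\, \mu_{nq}\, z_{mq},

\psi_2{}_{m m'} = \sum_{q} \sigma_q^4 \left[\sum_{n} \gamma_{nq}\big((1-\gamma_{nq})\mu_{nq}^2 + S_{nq}\big)\right] z_{mq}\, z_{m'q} \;+\; \big(\psi_1^\top \psi_1\big)_{m m'}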
394 GPy/kern/_src/psi_comp/ssrbf_psi_comp.py Normal file
@@ -0,0 +1,394 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
"""
|
||||
The package for the psi statistics computation
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
|
||||
try:
|
||||
from scipy import weave
|
||||
|
||||
def _psicomputations(variance, lengthscale, Z, variational_posterior):
|
||||
"""
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
gamma - NxQ
|
||||
"""
|
||||
# here are the "statistics" for psi0, psi1 and psi2
|
||||
# Produced intermediate results:
|
||||
# _psi1 NxM
|
||||
mu = variational_posterior.mean
|
||||
S = variational_posterior.variance
|
||||
|
||||
N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
|
||||
l2 = np.square(lengthscale)
|
||||
log_denom1 = np.log(S/l2+1)
|
||||
log_denom2 = np.log(2*S/l2+1)
|
||||
log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
|
||||
variance = float(variance)
|
||||
psi0 = np.empty(N)
|
||||
psi0[:] = variance
|
||||
psi1 = np.empty((N,M))
|
||||
psi2n = np.empty((N,M,M))
|
||||
|
||||
from ....util.misc import param_to_array
|
||||
S = param_to_array(S)
|
||||
mu = param_to_array(mu)
|
||||
Z = param_to_array(Z)
|
||||
|
||||
support_code = """
|
||||
#include <math.h>
|
||||
"""
|
||||
code = """
|
||||
for(int n=0; n<N; n++) {
|
||||
for(int m1=0;m1<M;m1++) {
|
||||
double log_psi1=0;
|
||||
for(int m2=0;m2<=m1;m2++) {
|
||||
double log_psi2_n=0;
|
||||
for(int q=0;q<Q;q++) {
|
||||
double Snq = S(n,q);
|
||||
double lq = l2(q);
|
||||
double Zm1q = Z(m1,q);
|
||||
double Zm2q = Z(m2,q);
|
||||
|
||||
if(m2==0) {
|
||||
// Compute Psi_1
|
||||
double muZ = mu(n,q)-Z(m1,q);
|
||||
|
||||
double psi1_exp1 = log_gamma(n,q) - (muZ*muZ/(Snq+lq) +log_denom1(n,q))/2.;
|
||||
double psi1_exp2 = log_gamma1(n,q) -Zm1q*Zm1q/(2.*lq);
|
||||
log_psi1 += (psi1_exp1>psi1_exp2)?psi1_exp1+log1p(exp(psi1_exp2-psi1_exp1)):psi1_exp2+log1p(exp(psi1_exp1-psi1_exp2));
|
||||
}
|
||||
// Compute Psi_2
|
||||
double muZhat = mu(n,q) - (Zm1q+Zm2q)/2.;
|
||||
double Z2 = Zm1q*Zm1q+ Zm2q*Zm2q;
|
||||
double dZ = Zm1q - Zm2q;
|
||||
|
||||
double psi2_exp1 = dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2(n,q)/2. + log_gamma(n,q);
|
||||
double psi2_exp2 = log_gamma1(n,q) - Z2/(2.*lq);
|
||||
log_psi2_n += (psi2_exp1>psi2_exp2)?psi2_exp1+log1p(exp(psi2_exp2-psi2_exp1)):psi2_exp2+log1p(exp(psi2_exp1-psi2_exp2));
|
||||
}
|
||||
double exp_psi2_n = exp(log_psi2_n);
|
||||
psi2n(n,m1,m2) = variance*variance*exp_psi2_n;
|
||||
if(m1!=m2) { psi2n(n,m2,m1) = variance*variance*exp_psi2_n;}
|
||||
}
|
||||
psi1(n,m1) = variance*exp(log_psi1);
|
||||
}
|
||||
}
|
||||
"""
|
||||
weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz)
|
||||
|
||||
psi2 = psi2n.sum(axis=0)
|
||||
return psi0,psi1,psi2,psi2n
|
||||
|
||||
from GPy.util.caching import Cacher
|
||||
psicomputations = Cacher(_psicomputations, limit=1)
|
||||
|
||||
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
|
||||
ARD = (len(lengthscale)!=1)
|
||||
|
||||
_,psi1,_,psi2n = psicomputations(variance, lengthscale, Z, variational_posterior)
|
||||
|
||||
mu = variational_posterior.mean
|
||||
S = variational_posterior.variance
|
||||
N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
|
||||
l2 = np.square(lengthscale)
|
||||
log_denom1 = np.log(S/l2+1)
|
||||
log_denom2 = np.log(2*S/l2+1)
|
||||
log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
|
||||
gamma, gamma1 = variational_posterior.gamma_probabilities()
|
||||
variance = float(variance)
|
||||
|
||||
dvar = np.zeros(1)
|
||||
dmu = np.zeros((N,Q))
|
||||
dS = np.zeros((N,Q))
|
||||
dgamma = np.zeros((N,Q))
|
||||
dl = np.zeros(Q)
|
||||
dZ = np.zeros((M,Q))
|
||||
dvar += np.sum(dL_dpsi0)
|
||||
|
||||
from ....util.misc import param_to_array
|
||||
S = param_to_array(S)
|
||||
mu = param_to_array(mu)
|
||||
Z = param_to_array(Z)
|
||||
|
||||
support_code = """
|
||||
#include <math.h>
|
||||
"""
|
||||
code = """
|
||||
for(int n=0; n<N; n++) {
|
||||
for(int m1=0;m1<M;m1++) {
|
||||
double log_psi1=0;
|
||||
for(int m2=0;m2<M;m2++) {
|
||||
double log_psi2_n=0;
|
||||
for(int q=0;q<Q;q++) {
|
||||
double Snq = S(n,q);
|
||||
double lq = l2(q);
|
||||
double Zm1q = Z(m1,q);
|
||||
double Zm2q = Z(m2,q);
|
||||
double gnq = gamma(n,q);
|
||||
double g1nq = gamma1(n,q);
|
||||
double mu_nq = mu(n,q);
|
||||
|
||||
if(m2==0) {
|
||||
// Compute Psi_1
|
||||
double lpsi1 = psi1(n,m1)*dL_dpsi1(n,m1);
|
||||
if(q==0) {dvar(0) += lpsi1/variance;}
|
||||
|
||||
double Zmu = Zm1q - mu_nq;
|
||||
double denom = Snq+lq;
|
||||
double Zmu2_denom = Zmu*Zmu/denom;
|
||||
|
||||
double exp1 = log_gamma(n,q)-(Zmu*Zmu/(Snq+lq)+log_denom1(n,q))/(2.);
|
||||
double exp2 = log_gamma1(n,q)-Zm1q*Zm1q/(2.*lq);
|
||||
double d_exp1,d_exp2;
|
||||
if(exp1>exp2) {
|
||||
d_exp1 = 1.;
|
||||
d_exp2 = exp(exp2-exp1);
|
||||
} else {
|
||||
d_exp1 = exp(exp1-exp2);
|
||||
d_exp2 = 1.;
|
||||
}
|
||||
double exp_sum = d_exp1+d_exp2;
|
||||
|
||||
dmu(n,q) += lpsi1*Zmu*d_exp1/(denom*exp_sum);
|
||||
dS(n,q) += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum)/2.;
|
||||
dgamma(n,q) += lpsi1*(d_exp1*g1nq-d_exp2*gnq)/exp_sum;
|
||||
dl(q) += lpsi1*((Zmu2_denom+Snq/lq)/denom*d_exp1+Zm1q*Zm1q/(lq*lq)*d_exp2)/(2.*exp_sum);
|
||||
dZ(m1,q) += lpsi1*(-Zmu/denom*d_exp1-Zm1q/lq*d_exp2)/exp_sum;
|
||||
}
|
||||
// Compute Psi_2
|
||||
double lpsi2 = psi2n(n,m1,m2)*dL_dpsi2(m1,m2);
|
||||
if(q==0) {dvar(0) += lpsi2*2/variance;}
|
||||
|
||||
double dZm1m2 = Zm1q - Zm2q;
|
||||
double Z2 = Zm1q*Zm1q+Zm2q*Zm2q;
|
||||
double muZhat = mu_nq - (Zm1q + Zm2q)/2.;
|
||||
double denom = 2.*Snq+lq;
|
||||
double muZhat2_denom = muZhat*muZhat/denom;
|
||||
|
||||
double exp1 = dZm1m2*dZm1m2/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2(n,q)/2. + log_gamma(n,q);
|
||||
double exp2 = log_gamma1(n,q) - Z2/(2.*lq);
|
||||
double d_exp1,d_exp2;
|
||||
if(exp1>exp2) {
|
||||
d_exp1 = 1.;
|
||||
d_exp2 = exp(exp2-exp1);
|
||||
} else {
|
||||
d_exp1 = exp(exp1-exp2);
|
||||
d_exp2 = 1.;
|
||||
}
|
||||
double exp_sum = d_exp1+d_exp2;
|
||||
|
||||
dmu(n,q) += -2.*lpsi2*muZhat/denom*d_exp1/exp_sum;
|
||||
dS(n,q) += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum;
|
||||
dgamma(n,q) += lpsi2*(d_exp1*g1nq-d_exp2*gnq)/exp_sum;
|
||||
dl(q) += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZm1m2*dZm1m2/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum;
|
||||
dZ(m1,q) += 2.*lpsi2*((muZhat/denom-dZm1m2/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','gamma1','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz)
|
||||
|
||||
dl *= 2.*lengthscale
|
||||
if not ARD:
|
||||
dl = dl.sum()
|
||||
|
||||
return dvar, dl, dZ, dmu, dS, dgamma
|
||||
|
||||
except:
|
||||
|
||||
def psicomputations(variance, lengthscale, Z, variational_posterior):
|
||||
"""
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
gamma - NxQ
|
||||
"""
|
||||
# here are the "statistics" for psi0, psi1 and psi2
|
||||
# Produced intermediate results:
|
||||
# _psi1 NxM
|
||||
mu = variational_posterior.mean
|
||||
S = variational_posterior.variance
|
||||
gamma = variational_posterior.binary_prob
|
||||
|
||||
psi0 = np.empty(mu.shape[0])
|
||||
psi0[:] = variance
|
||||
psi1 = _psi1computations(variance, lengthscale, Z, mu, S, gamma)
|
||||
psi2 = _psi2computations(variance, lengthscale, Z, mu, S, gamma)
|
||||
return psi0, psi1, psi2
|
||||
|
||||
def _psi1computations(variance, lengthscale, Z, mu, S, gamma):
|
||||
"""
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
gamma - NxQ
|
||||
"""
|
||||
# here are the "statistics" for psi1
|
||||
# Produced intermediate results:
|
||||
# _psi1 NxM
|
||||
|
||||
lengthscale2 = np.square(lengthscale)
|
||||
|
||||
# psi1
|
||||
_psi1_denom = S[:, None, :] / lengthscale2 + 1. # Nx1xQ
|
||||
_psi1_denom_sqrt = np.sqrt(_psi1_denom) #Nx1xQ
|
||||
_psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ
|
||||
_psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom) # NxMxQ
|
||||
_psi1_common = gamma[:,None,:] / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #Nx1xQ
|
||||
_psi1_exponent1 = np.log(gamma[:,None,:]) - (_psi1_dist_sq + np.log(_psi1_denom))/2. # NxMxQ
|
||||
_psi1_exponent2 = np.log(1.-gamma[:,None,:]) - (np.square(Z[None,:,:])/lengthscale2)/2. # NxMxQ
|
||||
_psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2)
|
||||
_psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ
|
||||
_psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM
|
||||
_psi1 = variance * np.exp(_psi1_exp_sum) # NxM
|
||||
|
||||
return _psi1
|
||||
|
||||
def _psi2computations(variance, lengthscale, Z, mu, S, gamma):
|
||||
"""
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
gamma - NxQ
|
||||
"""
|
||||
# here are the "statistics" for psi2
|
||||
# Produced intermediate results:
|
||||
# _psi2 MxM
|
||||
|
||||
lengthscale2 = np.square(lengthscale)
|
||||
|
||||
_psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
|
||||
_psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
|
||||
_psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q
|
||||
_psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ
|
||||
|
||||
# psi2
|
||||
_psi2_denom = 2.*S[:, None, None, :] / lengthscale2 + 1. # Nx1x1xQ
|
||||
_psi2_denom_sqrt = np.sqrt(_psi2_denom)
|
||||
_psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q
|
||||
_psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom)
|
||||
_psi2_common = gamma[:,None,None,:]/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # Nx1x1xQ
|
||||
_psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom)+np.log(gamma[:,None,None,:]) #N,M,M,Q
|
||||
_psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ
|
||||
_psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2)
|
||||
_psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max))
|
||||
_psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM
|
||||
_psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM
|
||||
|
||||
return _psi2
|
||||
|
||||
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
|
||||
ARD = (len(lengthscale)!=1)
|
||||
|
||||
dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1, dgamma_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
|
||||
dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2, dgamma_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
|
||||
|
||||
dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
|
||||
|
||||
dL_dlengscale = dl_psi1 + dl_psi2
|
||||
if not ARD:
|
||||
dL_dlengscale = dL_dlengscale.sum()
|
||||
|
||||
dL_dgamma = dgamma_psi1 + dgamma_psi2
|
||||
dL_dmu = dmu_psi1 + dmu_psi2
|
||||
dL_dS = dS_psi1 + dS_psi2
|
||||
dL_dZ = dZ_psi1 + dZ_psi2
|
||||
|
||||
return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma
|
||||
|
||||
def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, gamma):
|
||||
"""
|
||||
dL_dpsi1 - NxM
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
gamma - NxQ
|
||||
"""
|
||||
# here are the "statistics" for psi1
|
||||
# Produced intermediate results: dL_dparams w.r.t. psi1
|
||||
# _dL_dvariance 1
|
||||
# _dL_dlengthscale Q
|
||||
# _dL_dZ MxQ
|
||||
# _dL_dgamma NxQ
|
||||
# _dL_dmu NxQ
|
||||
# _dL_dS NxQ
|
||||
|
||||
lengthscale2 = np.square(lengthscale)
|
||||
|
||||
# psi1
|
||||
_psi1_denom = S / lengthscale2 + 1. # NxQ
|
||||
_psi1_denom_sqrt = np.sqrt(_psi1_denom) #NxQ
|
||||
_psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ
|
||||
_psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom[:,None,:]) # NxMxQ
|
||||
_psi1_common = gamma / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #NxQ
|
||||
_psi1_exponent1 = np.log(gamma[:,None,:]) -0.5 * (_psi1_dist_sq + np.log(_psi1_denom[:, None,:])) # NxMxQ
|
||||
_psi1_exponent2 = np.log(1.-gamma[:,None,:]) -0.5 * (np.square(Z[None,:,:])/lengthscale2) # NxMxQ
|
||||
_psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2)
|
||||
_psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ
|
||||
_psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM
|
||||
_psi1_exp_dist_sq = np.exp(-0.5*_psi1_dist_sq) # NxMxQ
|
||||
_psi1_exp_Z = np.exp(-0.5*np.square(Z[None,:,:])/lengthscale2) # 1xMxQ
|
||||
_psi1_q = variance * np.exp(_psi1_exp_sum[:,:,None] - _psi1_exponent) # NxMxQ
|
||||
_psi1 = variance * np.exp(_psi1_exp_sum) # NxM
|
||||
_dL_dvariance = np.einsum('nm,nm->',dL_dpsi1, _psi1)/variance # 1
|
||||
_dL_dgamma = np.einsum('nm,nmq,nmq->nq',dL_dpsi1, _psi1_q, (_psi1_exp_dist_sq/_psi1_denom_sqrt[:,None,:]-_psi1_exp_Z)) # NxQ
|
||||
_dL_dmu = np.einsum('nm, nmq, nmq, nmq, nq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_dist,_psi1_common) # NxQ
|
||||
_dL_dS = np.einsum('nm,nmq,nmq,nq,nmq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_common,(_psi1_dist_sq-1.))/2. # NxQ
|
||||
_dL_dZ = np.einsum('nm,nmq,nmq->mq',dL_dpsi1,_psi1_q, (- _psi1_common[:,None,:] * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z))
|
||||
_dL_dlengthscale = lengthscale* np.einsum('nm,nmq,nmq->q',dL_dpsi1,_psi1_q,(_psi1_common[:,None,:]*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + (1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z))
|
||||
|
||||
return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma
|
||||
|
||||
def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, gamma):
|
||||
"""
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
gamma - NxQ
|
||||
dL_dpsi2 - MxM
|
||||
"""
|
||||
# here are the "statistics" for psi2
|
||||
# Produced the derivatives w.r.t. psi2:
|
||||
# _dL_dvariance 1
|
||||
# _dL_dlengthscale Q
|
||||
# _dL_dZ MxQ
|
||||
# _dL_dgamma NxQ
|
||||
# _dL_dmu NxQ
|
||||
# _dL_dS NxQ
|
||||
|
||||
lengthscale2 = np.square(lengthscale)
|
||||
|
||||
_psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
|
||||
_psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
|
||||
_psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q
|
||||
_psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ
|
||||
|
||||
# psi2
|
||||
_psi2_denom = 2.*S / lengthscale2 + 1. # NxQ
|
||||
_psi2_denom_sqrt = np.sqrt(_psi2_denom)
|
||||
_psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q
|
||||
_psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom[:,None,None,:])
|
||||
_psi2_common = gamma/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # NxQ
|
||||
_psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom[:,None,None,:])+np.log(gamma[:,None,None,:]) #N,M,M,Q
|
||||
_psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ
|
||||
_psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2)
|
||||
_psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max))
|
||||
_psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM
|
||||
_psi2_q = variance*variance * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ
|
||||
_psi2_exp_dist_sq = np.exp(-_psi2_Zdist_sq -_psi2_mudist_sq) # NxMxMxQ
|
||||
_psi2_exp_Z = np.exp(-0.5*_psi2_Z_sq_sum) # MxMxQ
|
||||
_psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM
|
||||
_dL_dvariance = np.einsum('mo,mo->',dL_dpsi2,_psi2)*2./variance
|
||||
_dL_dgamma = np.einsum('mo,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,(_psi2_exp_dist_sq/_psi2_denom_sqrt[:,None,None,:] - _psi2_exp_Z))
|
||||
_dL_dmu = -2.*np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,_psi2_common,_psi2_mudist,_psi2_exp_dist_sq)
|
||||
_dL_dS = np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q, _psi2_common, (2.*_psi2_mudist_sq-1.), _psi2_exp_dist_sq)
|
||||
_dL_dZ = 2.*np.einsum('mo,nmoq,nmoq->mq',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(-_psi2_Zdist*_psi2_denom[:,None,None,:]+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z))
|
||||
_dL_dlengthscale = 2.*lengthscale* np.einsum('mo,nmoq,nmoq->q',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z))
|
||||
|
||||
return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma
|
||||
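Both branches of this module (the weave code and the pure-NumPy fallback) combine a "slab" exponent and a "spike" exponent through the same two-term log-sum-exp pattern (exponent_max + log(exp(...) + exp(...))). A minimal standalone sketch of that pattern, for illustration only:

import numpy as np

def logsumexp2(exp1, exp2):
    # Numerically stable log(exp(exp1) + exp(exp2)), the pattern used above
    # for the spike-and-slab psi statistics.
    m = np.maximum(exp1, exp2)
    return m + np.log(np.exp(exp1 - m) + np.exp(exp2 - m))

exp1 = np.array([-1000.0, 0.5])
exp2 = np.array([-1001.0, -0.5])
print(logsumexp2(exp1, exp2))  # stays finite where naive log(exp+exp) underflows to -inf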
474 GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py Normal file
@@ -0,0 +1,474 @@
|
||||
"""
|
||||
The module for psi-statistics for RBF kernel for Spike-and-Slab GPLVM
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from ....util.caching import Cache_this
|
||||
from . import PSICOMP_RBF
|
||||
from ....util import gpu_init
|
||||
|
||||
try:
|
||||
import pycuda.gpuarray as gpuarray
|
||||
from pycuda.compiler import SourceModule
|
||||
from ....util.linalg_gpu import sum_axis
|
||||
except:
|
||||
pass
|
||||
|
||||
gpu_code = """
|
||||
// define THREADNUM
|
||||
|
||||
#define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
|
||||
#define IDX_NMM(n,m1,m2) ((m2*M+m1)*N+n)
|
||||
#define IDX_NQ(n,q) (q*N+n)
|
||||
#define IDX_NM(n,m) (m*N+n)
|
||||
#define IDX_MQ(m,q) (q*M+m)
|
||||
#define IDX_MM(m1,m2) (m2*M+m1)
|
||||
#define IDX_NQB(n,q,b) ((b*Q+q)*N+n)
|
||||
#define IDX_QB(q,b) (b*Q+q)
|
||||
|
||||
// Divide data evenly
|
||||
__device__ void divide_data(int total_data, int psize, int pidx, int *start, int *end) {
|
||||
int residue = (total_data)%psize;
|
||||
if(pidx<residue) {
|
||||
int size = total_data/psize+1;
|
||||
*start = size*pidx;
|
||||
*end = *start+size;
|
||||
} else {
|
||||
int size = total_data/psize;
|
||||
*start = size*pidx+residue;
|
||||
*end = *start+size;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void reduce_sum(double* array, int array_size) {
|
||||
int s;
|
||||
if(array_size >= blockDim.x) {
|
||||
for(int i=blockDim.x+threadIdx.x; i<array_size; i+= blockDim.x) {
|
||||
array[threadIdx.x] += array[i];
|
||||
}
|
||||
array_size = blockDim.x;
|
||||
}
|
||||
__syncthreads();
|
||||
for(int i=1; i<=array_size;i*=2) {s=i;}
|
||||
if(threadIdx.x < array_size-s) {array[threadIdx.x] += array[s+threadIdx.x];}
|
||||
__syncthreads();
|
||||
for(s=s/2;s>=1;s=s/2) {
|
||||
if(threadIdx.x < s) {array[threadIdx.x] += array[s+threadIdx.x];}
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void compDenom(double *log_denom1, double *log_denom2, double *log_gamma, double*log_gamma1, double *gamma, double *l, double *S, int N, int Q)
|
||||
{
|
||||
int n_start, n_end;
|
||||
divide_data(N, gridDim.x, blockIdx.x, &n_start, &n_end);
|
||||
|
||||
for(int i=n_start*Q+threadIdx.x; i<n_end*Q; i+=blockDim.x) {
|
||||
int n=i/Q;
|
||||
int q=i%Q;
|
||||
|
||||
double Snq = S[IDX_NQ(n,q)];
|
||||
double lq = l[q]*l[q];
|
||||
double gnq = gamma[IDX_NQ(n,q)];
|
||||
log_denom1[IDX_NQ(n,q)] = log(Snq/lq+1.);
|
||||
log_denom2[IDX_NQ(n,q)] = log(2.*Snq/lq+1.);
|
||||
log_gamma[IDX_NQ(n,q)] = log(gnq);
|
||||
log_gamma1[IDX_NQ(n,q)] = log(1.-gnq);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void psi1computations(double *psi1, double *log_denom1, double *log_gamma, double*log_gamma1, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
|
||||
{
|
||||
int m_start, m_end;
|
||||
divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
|
||||
|
||||
for(int m=m_start; m<m_end; m++) {
|
||||
for(int n=threadIdx.x; n<N; n+= blockDim.x) {
|
||||
double log_psi1 = 0;
|
||||
for(int q=0;q<Q;q++) {
|
||||
double Zmq = Z[IDX_MQ(m,q)];
|
||||
double muZ = mu[IDX_NQ(n,q)]-Zmq;
|
||||
double Snq = S[IDX_NQ(n,q)];
|
||||
double lq = l[q]*l[q];
|
||||
double exp1 = log_gamma[IDX_NQ(n,q)]-(muZ*muZ/(Snq+lq)+log_denom1[IDX_NQ(n,q)])/(2.);
|
||||
double exp2 = log_gamma1[IDX_NQ(n,q)]-Zmq*Zmq/(2.*lq);
|
||||
log_psi1 += (exp1>exp2)?exp1+log1p(exp(exp2-exp1)):exp2+log1p(exp(exp1-exp2));
|
||||
}
|
||||
psi1[IDX_NM(n,m)] = var*exp(log_psi1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void psi2computations(double *psi2, double *psi2n, double *log_denom2, double *log_gamma, double*log_gamma1, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
|
||||
{
|
||||
int psi2_idx_start, psi2_idx_end;
|
||||
__shared__ double psi2_local[THREADNUM];
|
||||
divide_data((M+1)*M/2, gridDim.x, blockIdx.x, &psi2_idx_start, &psi2_idx_end);
|
||||
|
||||
for(int psi2_idx=psi2_idx_start; psi2_idx<psi2_idx_end; psi2_idx++) {
|
||||
int m1 = int((sqrt(8.*psi2_idx+1.)-1.)/2.);
|
||||
int m2 = psi2_idx - (m1+1)*m1/2;
|
||||
|
||||
psi2_local[threadIdx.x] = 0;
|
||||
for(int n=threadIdx.x;n<N;n+=blockDim.x) {
|
||||
double log_psi2_n = 0;
|
||||
for(int q=0;q<Q;q++) {
|
||||
double Zm1q = Z[IDX_MQ(m1,q)];
|
||||
double Zm2q = Z[IDX_MQ(m2,q)];
|
||||
double dZ = Zm1q - Zm2q;
|
||||
double muZhat = mu[IDX_NQ(n,q)]- (Zm1q+Zm2q)/2.;
|
||||
double Z2 = Zm1q*Zm1q+Zm2q*Zm2q;
|
||||
double Snq = S[IDX_NQ(n,q)];
|
||||
double lq = l[q]*l[q];
|
||||
double exp1 = dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2[IDX_NQ(n,q)]/2. + log_gamma[IDX_NQ(n,q)];
|
||||
double exp2 = log_gamma1[IDX_NQ(n,q)] - Z2/(2.*lq);
|
||||
log_psi2_n += (exp1>exp2)?exp1+log1p(exp(exp2-exp1)):exp2+log1p(exp(exp1-exp2));
|
||||
}
|
||||
double exp_psi2_n = exp(log_psi2_n);
|
||||
psi2n[IDX_NMM(n,m1,m2)] = var*var*exp_psi2_n;
|
||||
if(m1!=m2) { psi2n[IDX_NMM(n,m2,m1)] = var*var*exp_psi2_n;}
|
||||
psi2_local[threadIdx.x] += exp_psi2_n;
|
||||
}
|
||||
__syncthreads();
|
||||
reduce_sum(psi2_local, THREADNUM);
|
||||
if(threadIdx.x==0) {
|
||||
psi2[IDX_MM(m1,m2)] = var*var*psi2_local[0];
|
||||
if(m1!=m2) { psi2[IDX_MM(m2,m1)] = var*var*psi2_local[0]; }
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void psi1compDer(double *dvar, double *dl, double *dZ, double *dmu, double *dS, double *dgamma, double *dL_dpsi1, double *psi1, double *log_denom1, double *log_gamma, double*log_gamma1, double var, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q)
|
||||
{
|
||||
int m_start, m_end;
|
||||
__shared__ double g_local[THREADNUM];
|
||||
divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
|
||||
int P = int(ceil(double(N)/THREADNUM));
|
||||
|
||||
double dvar_local = 0;
|
||||
for(int q=0;q<Q;q++) {
|
||||
double lq_sqrt = l[q];
|
||||
double lq = lq_sqrt*lq_sqrt;
|
||||
double dl_local = 0;
|
||||
for(int p=0;p<P;p++) {
|
||||
int n = p*THREADNUM + threadIdx.x;
|
||||
double dmu_local = 0;
|
||||
double dS_local = 0;
|
||||
double dgamma_local = 0;
|
||||
double Snq,mu_nq,gnq,log_gnq,log_gnq1,log_de;
|
||||
if(n<N) {Snq = S[IDX_NQ(n,q)]; mu_nq=mu[IDX_NQ(n,q)]; gnq = gamma[IDX_NQ(n,q)];
|
||||
log_gnq = log_gamma[IDX_NQ(n,q)]; log_gnq1 = log_gamma1[IDX_NQ(n,q)];
|
||||
log_de = log_denom1[IDX_NQ(n,q)];}
|
||||
for(int m=m_start; m<m_end; m++) {
|
||||
if(n<N) {
|
||||
double lpsi1 = psi1[IDX_NM(n,m)]*dL_dpsi1[IDX_NM(n,m)];
|
||||
if(q==0) {dvar_local += lpsi1;}
|
||||
|
||||
double Zmq = Z[IDX_MQ(m,q)];
|
||||
double Zmu = Zmq - mu_nq;
|
||||
double denom = Snq+lq;
|
||||
double Zmu2_denom = Zmu*Zmu/denom;
|
||||
|
||||
double exp1 = log_gnq-(Zmu*Zmu/(Snq+lq)+log_de)/(2.);
|
||||
double exp2 = log_gnq1-Zmq*Zmq/(2.*lq);
|
||||
double d_exp1,d_exp2;
|
||||
if(exp1>exp2) {
|
||||
d_exp1 = 1.;
|
||||
d_exp2 = exp(exp2-exp1);
|
||||
} else {
|
||||
d_exp1 = exp(exp1-exp2);
|
||||
d_exp2 = 1.;
|
||||
}
|
||||
double exp_sum = d_exp1+d_exp2;
|
||||
|
||||
dmu_local += lpsi1*Zmu*d_exp1/(denom*exp_sum);
|
||||
dS_local += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum);
|
||||
dgamma_local += lpsi1*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
|
||||
dl_local += lpsi1*((Zmu2_denom+Snq/lq)/denom*d_exp1+Zmq*Zmq/(lq*lq)*d_exp2)/(2.*exp_sum);
|
||||
g_local[threadIdx.x] = lpsi1*(-Zmu/denom*d_exp1-Zmq/lq*d_exp2)/exp_sum;
|
||||
}
|
||||
__syncthreads();
|
||||
reduce_sum(g_local, p<P-1?THREADNUM:N-(P-1)*THREADNUM);
|
||||
if(threadIdx.x==0) {dZ[IDX_MQ(m,q)] += g_local[0];}
|
||||
}
|
||||
if(n<N) {
|
||||
dmu[IDX_NQB(n,q,blockIdx.x)] += dmu_local;
|
||||
dS[IDX_NQB(n,q,blockIdx.x)] += dS_local/2.;
|
||||
dgamma[IDX_NQB(n,q,blockIdx.x)] += dgamma_local;
|
||||
}
|
||||
__threadfence_block();
|
||||
}
|
||||
g_local[threadIdx.x] = dl_local*2.*lq_sqrt;
|
||||
__syncthreads();
|
||||
reduce_sum(g_local, THREADNUM);
|
||||
if(threadIdx.x==0) {dl[IDX_QB(q,blockIdx.x)] += g_local[0];}
|
||||
}
|
||||
g_local[threadIdx.x] = dvar_local;
|
||||
__syncthreads();
|
||||
reduce_sum(g_local, THREADNUM);
|
||||
if(threadIdx.x==0) {dvar[blockIdx.x] += g_local[0]/var;}
|
||||
}
|
||||
|
||||
__global__ void psi2compDer(double *dvar, double *dl, double *dZ, double *dmu, double *dS, double *dgamma, double *dL_dpsi2, double *psi2n, double *log_denom2, double *log_gamma, double*log_gamma1, double var, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q)
|
||||
{
|
||||
int m_start, m_end;
|
||||
__shared__ double g_local[THREADNUM];
|
||||
divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
|
||||
int P = int(ceil(double(N)/THREADNUM));
|
||||
|
||||
double dvar_local = 0;
|
||||
for(int q=0;q<Q;q++) {
|
||||
double lq_sqrt = l[q];
|
||||
double lq = lq_sqrt*lq_sqrt;
|
||||
double dl_local = 0;
|
||||
for(int p=0;p<P;p++) {
|
||||
int n = p*THREADNUM + threadIdx.x;
|
||||
double dmu_local = 0;
|
||||
double dS_local = 0;
|
||||
double dgamma_local = 0;
|
||||
double Snq,mu_nq,gnq,log_gnq,log_gnq1,log_de;
|
||||
if(n<N) {Snq = S[IDX_NQ(n,q)]; mu_nq=mu[IDX_NQ(n,q)]; gnq = gamma[IDX_NQ(n,q)];
|
||||
log_gnq = log_gamma[IDX_NQ(n,q)]; log_gnq1 = log_gamma1[IDX_NQ(n,q)];
|
||||
log_de = log_denom2[IDX_NQ(n,q)];}
|
||||
for(int m1=m_start; m1<m_end; m1++) {
|
||||
g_local[threadIdx.x] = 0;
|
||||
for(int m2=0;m2<M;m2++) {
|
||||
if(n<N) {
|
||||
double lpsi2 = psi2n[IDX_NMM(n,m1,m2)]*dL_dpsi2[IDX_MM(m1,m2)];
|
||||
if(q==0) {dvar_local += lpsi2;}
|
||||
|
||||
double Zm1q = Z[IDX_MQ(m1,q)];
|
||||
double Zm2q = Z[IDX_MQ(m2,q)];
|
||||
double dZ = Zm1q - Zm2q;
|
||||
double Z2 = Zm1q*Zm1q+Zm2q*Zm2q;
|
||||
double muZhat = mu_nq - (Zm1q + Zm2q)/2.;
|
||||
double denom = 2.*Snq+lq;
|
||||
double muZhat2_denom = muZhat*muZhat/denom;
|
||||
|
||||
double exp1 = dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_de/2. + log_gnq;
|
||||
double exp2 = log_gnq1 - Z2/(2.*lq);
|
||||
double d_exp1,d_exp2;
|
||||
if(exp1>exp2) {
|
||||
d_exp1 = 1.;
|
||||
d_exp2 = exp(exp2-exp1);
|
||||
} else {
|
||||
d_exp1 = exp(exp1-exp2);
|
||||
d_exp2 = 1.;
|
||||
}
|
||||
double exp_sum = d_exp1+d_exp2;
|
||||
|
||||
dmu_local += lpsi2*muZhat/denom*d_exp1/exp_sum;
|
||||
dS_local += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum;
|
||||
dgamma_local += lpsi2*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
|
||||
dl_local += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZ*dZ/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum;
|
||||
g_local[threadIdx.x] += 2.*lpsi2*((muZhat/denom-dZ/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
reduce_sum(g_local, p<P-1?THREADNUM:N-(P-1)*THREADNUM);
|
||||
if(threadIdx.x==0) {dZ[IDX_MQ(m1,q)] += g_local[0];}
|
||||
}
|
||||
if(n<N) {
|
||||
dmu[IDX_NQB(n,q,blockIdx.x)] += -2.*dmu_local;
|
||||
dS[IDX_NQB(n,q,blockIdx.x)] += dS_local;
|
||||
dgamma[IDX_NQB(n,q,blockIdx.x)] += dgamma_local;
|
||||
}
|
||||
__threadfence_block();
|
||||
}
|
||||
g_local[threadIdx.x] = dl_local*2.*lq_sqrt;
|
||||
__syncthreads();
|
||||
reduce_sum(g_local, THREADNUM);
|
||||
if(threadIdx.x==0) {dl[IDX_QB(q,blockIdx.x)] += g_local[0];}
|
||||
}
|
||||
g_local[threadIdx.x] = dvar_local;
|
||||
__syncthreads();
|
||||
reduce_sum(g_local, THREADNUM);
|
||||
if(threadIdx.x==0) {dvar[blockIdx.x] += g_local[0]*2/var;}
|
||||
}
|
||||
"""
|
||||
|
||||
class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
|
||||
|
||||
def __init__(self, threadnum=128, blocknum=15, GPU_direct=False):
|
||||
self.GPU_direct = GPU_direct
|
||||
self.gpuCache = None
|
||||
|
||||
self.threadnum = threadnum
|
||||
self.blocknum = blocknum
|
||||
module = SourceModule("#define THREADNUM "+str(self.threadnum)+"\n"+gpu_code)
|
||||
self.g_psi1computations = module.get_function('psi1computations')
|
||||
self.g_psi1computations.prepare('PPPPdPPPPiii')
|
||||
self.g_psi2computations = module.get_function('psi2computations')
|
||||
self.g_psi2computations.prepare('PPPPPdPPPPiii')
|
||||
self.g_psi1compDer = module.get_function('psi1compDer')
|
||||
self.g_psi1compDer.prepare('PPPPPPPPPPPdPPPPPiii')
|
||||
self.g_psi2compDer = module.get_function('psi2compDer')
|
||||
self.g_psi2compDer.prepare('PPPPPPPPPPPdPPPPPiii')
|
||||
self.g_compDenom = module.get_function('compDenom')
|
||||
self.g_compDenom.prepare('PPPPPPPii')
|
||||
|
||||
def __deepcopy__(self, memo):
|
||||
s = PSICOMP_SSRBF_GPU(threadnum=self.threadnum, blocknum=self.blocknum, GPU_direct=self.GPU_direct)
|
||||
memo[id(self)] = s
|
||||
return s
|
||||
|
||||
def _initGPUCache(self, N, M, Q):
|
||||
if self.gpuCache == None:
|
||||
self.gpuCache = {
|
||||
'l_gpu' :gpuarray.empty((Q,),np.float64,order='F'),
|
||||
'Z_gpu' :gpuarray.empty((M,Q),np.float64,order='F'),
|
||||
'mu_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
||||
'S_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
||||
'gamma_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
||||
'psi1_gpu' :gpuarray.empty((N,M),np.float64,order='F'),
|
||||
'psi2_gpu' :gpuarray.empty((M,M),np.float64,order='F'),
|
||||
'psi2n_gpu' :gpuarray.empty((N,M,M),np.float64,order='F'),
|
||||
'dL_dpsi1_gpu' :gpuarray.empty((N,M),np.float64,order='F'),
|
||||
'dL_dpsi2_gpu' :gpuarray.empty((M,M),np.float64,order='F'),
|
||||
'log_denom1_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
||||
'log_denom2_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
||||
'log_gamma_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
||||
'log_gamma1_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
||||
# derivatives
|
||||
'dvar_gpu' :gpuarray.empty((self.blocknum,),np.float64, order='F'),
|
||||
'dl_gpu' :gpuarray.empty((Q,self.blocknum),np.float64, order='F'),
|
||||
'dZ_gpu' :gpuarray.empty((M,Q),np.float64, order='F'),
|
||||
'dmu_gpu' :gpuarray.empty((N,Q,self.blocknum),np.float64, order='F'),
|
||||
'dS_gpu' :gpuarray.empty((N,Q,self.blocknum),np.float64, order='F'),
|
||||
'dgamma_gpu' :gpuarray.empty((N,Q,self.blocknum),np.float64, order='F'),
|
||||
# grad
|
||||
'grad_l_gpu' :gpuarray.empty((Q,),np.float64, order='F'),
|
||||
'grad_mu_gpu' :gpuarray.empty((N,Q,),np.float64, order='F'),
|
||||
'grad_S_gpu' :gpuarray.empty((N,Q,),np.float64, order='F'),
|
||||
'grad_gamma_gpu' :gpuarray.empty((N,Q,),np.float64, order='F'),
|
||||
}
|
||||
else:
|
||||
assert N==self.gpuCache['mu_gpu'].shape[0]
|
||||
assert M==self.gpuCache['Z_gpu'].shape[0]
|
||||
assert Q==self.gpuCache['l_gpu'].shape[0]
|
||||
|
||||
def sync_params(self, lengthscale, Z, mu, S, gamma):
|
||||
if len(lengthscale)==1:
|
||||
self.gpuCache['l_gpu'].fill(lengthscale)
|
||||
else:
|
||||
self.gpuCache['l_gpu'].set(np.asfortranarray(lengthscale))
|
||||
self.gpuCache['Z_gpu'].set(np.asfortranarray(Z))
|
||||
self.gpuCache['mu_gpu'].set(np.asfortranarray(mu))
|
||||
self.gpuCache['S_gpu'].set(np.asfortranarray(S))
|
||||
self.gpuCache['gamma_gpu'].set(np.asfortranarray(gamma))
|
||||
N,Q = self.gpuCache['S_gpu'].shape
|
||||
self.g_compDenom.prepared_call((self.blocknum,1),(self.threadnum,1,1), self.gpuCache['log_denom1_gpu'].gpudata,self.gpuCache['log_denom2_gpu'].gpudata,self.gpuCache['log_gamma_gpu'].gpudata,self.gpuCache['log_gamma1_gpu'].gpudata,self.gpuCache['gamma_gpu'].gpudata,self.gpuCache['l_gpu'].gpudata,self.gpuCache['S_gpu'].gpudata, np.int32(N), np.int32(Q))
|
||||
|
||||
def reset_derivative(self):
|
||||
self.gpuCache['dvar_gpu'].fill(0.)
|
||||
self.gpuCache['dl_gpu'].fill(0.)
|
||||
self.gpuCache['dZ_gpu'].fill(0.)
|
||||
self.gpuCache['dmu_gpu'].fill(0.)
|
||||
self.gpuCache['dS_gpu'].fill(0.)
|
||||
self.gpuCache['dgamma_gpu'].fill(0.)
|
||||
self.gpuCache['grad_l_gpu'].fill(0.)
|
||||
self.gpuCache['grad_mu_gpu'].fill(0.)
|
||||
self.gpuCache['grad_S_gpu'].fill(0.)
|
||||
self.gpuCache['grad_gamma_gpu'].fill(0.)
|
||||
|
||||
def get_dimensions(self, Z, variational_posterior):
|
||||
return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1]
|
||||
|
||||
@Cache_this(limit=1, ignore_args=(0,))
|
||||
def psicomputations(self, variance, lengthscale, Z, variational_posterior):
|
||||
"""
|
||||
Z - MxQ
|
||||
mu - NxQ
|
||||
S - NxQ
|
||||
"""
|
||||
N,M,Q = self.get_dimensions(Z, variational_posterior)
|
||||
self._initGPUCache(N,M,Q)
|
||||
self.sync_params(lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
|
||||
|
||||
psi1_gpu = self.gpuCache['psi1_gpu']
|
||||
psi2_gpu = self.gpuCache['psi2_gpu']
|
||||
psi2n_gpu = self.gpuCache['psi2n_gpu']
|
||||
l_gpu = self.gpuCache['l_gpu']
|
||||
Z_gpu = self.gpuCache['Z_gpu']
|
||||
mu_gpu = self.gpuCache['mu_gpu']
|
||||
S_gpu = self.gpuCache['S_gpu']
|
||||
log_denom1_gpu = self.gpuCache['log_denom1_gpu']
|
||||
log_denom2_gpu = self.gpuCache['log_denom2_gpu']
|
||||
log_gamma_gpu = self.gpuCache['log_gamma_gpu']
|
||||
log_gamma1_gpu = self.gpuCache['log_gamma1_gpu']
|
||||
|
||||
psi0 = np.empty((N,))
|
||||
psi0[:] = variance
|
||||
self.g_psi1computations.prepared_call((self.blocknum,1),(self.threadnum,1,1),psi1_gpu.gpudata, log_denom1_gpu.gpudata, log_gamma_gpu.gpudata, log_gamma1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
|
||||
self.g_psi2computations.prepared_call((self.blocknum,1),(self.threadnum,1,1),psi2_gpu.gpudata, psi2n_gpu.gpudata, log_denom2_gpu.gpudata, log_gamma_gpu.gpudata, log_gamma1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
|
||||
|
||||
if self.GPU_direct:
|
||||
return psi0, psi1_gpu, psi2_gpu
|
||||
else:
|
||||
return psi0, psi1_gpu.get(), psi2_gpu.get()
|
||||
|
||||
@Cache_this(limit=1, ignore_args=(0,1,2,3))
|
||||
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
|
||||
ARD = (len(lengthscale)!=1)
|
||||
|
||||
N,M,Q = self.get_dimensions(Z, variational_posterior)
|
||||
psi1_gpu = self.gpuCache['psi1_gpu']
|
||||
psi2n_gpu = self.gpuCache['psi2n_gpu']
|
||||
l_gpu = self.gpuCache['l_gpu']
|
||||
Z_gpu = self.gpuCache['Z_gpu']
|
||||
mu_gpu = self.gpuCache['mu_gpu']
|
||||
S_gpu = self.gpuCache['S_gpu']
|
||||
gamma_gpu = self.gpuCache['gamma_gpu']
|
||||
dvar_gpu = self.gpuCache['dvar_gpu']
|
||||
dl_gpu = self.gpuCache['dl_gpu']
|
||||
dZ_gpu = self.gpuCache['dZ_gpu']
|
||||
dmu_gpu = self.gpuCache['dmu_gpu']
|
||||
dS_gpu = self.gpuCache['dS_gpu']
|
||||
dgamma_gpu = self.gpuCache['dgamma_gpu']
|
||||
grad_l_gpu = self.gpuCache['grad_l_gpu']
|
||||
grad_mu_gpu = self.gpuCache['grad_mu_gpu']
|
||||
grad_S_gpu = self.gpuCache['grad_S_gpu']
|
||||
grad_gamma_gpu = self.gpuCache['grad_gamma_gpu']
|
||||
log_denom1_gpu = self.gpuCache['log_denom1_gpu']
|
||||
log_denom2_gpu = self.gpuCache['log_denom2_gpu']
|
||||
log_gamma_gpu = self.gpuCache['log_gamma_gpu']
|
||||
log_gamma1_gpu = self.gpuCache['log_gamma1_gpu']
|
||||
|
||||
if self.GPU_direct:
|
||||
dL_dpsi1_gpu = dL_dpsi1
|
||||
dL_dpsi2_gpu = dL_dpsi2
|
||||
dL_dpsi0_sum = gpuarray.sum(dL_dpsi0).get()
|
||||
else:
|
||||
dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu']
|
||||
dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu']
|
||||
dL_dpsi1_gpu.set(np.asfortranarray(dL_dpsi1))
|
||||
dL_dpsi2_gpu.set(np.asfortranarray(dL_dpsi2))
|
||||
dL_dpsi0_sum = dL_dpsi0.sum()
|
||||
|
||||
self.reset_derivative()
|
||||
# t=self.g_psi1compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi1_gpu,psi1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
|
||||
# print 'g_psi1compDer '+str(t)
|
||||
# t=self.g_psi2compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi2_gpu,psi2n_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
|
||||
# print 'g_psi2compDer '+str(t)
|
||||
self.g_psi1compDer.prepared_call((self.blocknum,1),(self.threadnum,1,1),dvar_gpu.gpudata,dl_gpu.gpudata,dZ_gpu.gpudata,dmu_gpu.gpudata,dS_gpu.gpudata,dgamma_gpu.gpudata,dL_dpsi1_gpu.gpudata,psi1_gpu.gpudata, log_denom1_gpu.gpudata, log_gamma_gpu.gpudata, log_gamma1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata,gamma_gpu.gpudata,np.int32(N), np.int32(M), np.int32(Q))
|
||||
self.g_psi2compDer.prepared_call((self.blocknum,1),(self.threadnum,1,1),dvar_gpu.gpudata,dl_gpu.gpudata,dZ_gpu.gpudata,dmu_gpu.gpudata,dS_gpu.gpudata,dgamma_gpu.gpudata,dL_dpsi2_gpu.gpudata,psi2n_gpu.gpudata, log_denom2_gpu.gpudata, log_gamma_gpu.gpudata, log_gamma1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata,gamma_gpu.gpudata,np.int32(N), np.int32(M), np.int32(Q))
|
||||
|
||||
dL_dvar = dL_dpsi0_sum + gpuarray.sum(dvar_gpu).get()
|
||||
sum_axis(grad_mu_gpu,dmu_gpu,N*Q,self.blocknum)
|
||||
dL_dmu = grad_mu_gpu.get()
|
||||
sum_axis(grad_S_gpu,dS_gpu,N*Q,self.blocknum)
|
||||
dL_dS = grad_S_gpu.get()
|
||||
sum_axis(grad_gamma_gpu,dgamma_gpu,N*Q,self.blocknum)
|
||||
dL_dgamma = grad_gamma_gpu.get()
|
||||
dL_dZ = dZ_gpu.get()
|
||||
if ARD:
|
||||
sum_axis(grad_l_gpu,dl_gpu,Q,self.blocknum)
|
||||
dL_dlengscale = grad_l_gpu.get()
|
||||
else:
|
||||
dL_dlengscale = gpuarray.sum(dl_gpu).get()
|
||||
|
||||
return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma
|
||||
|
||||
|
||||
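As a sanity check for the GPU kernels above, a plain NumPy version of the psi1 statistic for the RBF kernel is sketched below. This is a hedged reference only: it ignores the spike-and-slab term carried by binary_prob (i.e. it assumes gamma == 1 everywhere), and the name psi1_reference is illustrative, not part of GPy.

# Hedged NumPy reference for psi1 of an RBF kernel under a Gaussian q(X);
# it ignores the binary_prob (gamma) term used by the GPU code above.
import numpy as np

def psi1_reference(variance, lengthscale, Z, mu, S):
    # Z: MxQ inducing inputs, mu/S: NxQ variational means/variances
    l2 = np.asarray(lengthscale) ** 2                      # (Q,)
    denom = S[:, None, :] + l2                             # N x 1 x Q (broadcasts)
    dist = (mu[:, None, :] - Z[None, :, :]) ** 2           # N x M x Q
    expo = -0.5 * np.sum(dist / denom, axis=-1)            # N x M
    scale = np.prod(np.sqrt(l2 / denom), axis=-1)          # N x 1
    return variance * scale * np.exp(expo)                 # N x M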
71  GPy/kern/_src/rbf.py  Normal file
@@ -0,0 +1,71 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
import numpy as np
|
||||
from stationary import Stationary
|
||||
from psi_comp import PSICOMP_RBF
|
||||
from psi_comp.rbf_psi_gpucomp import PSICOMP_RBF_GPU
|
||||
from ...util.config import *
|
||||
|
||||
class RBF(Stationary):
|
||||
"""
|
||||
Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel:
|
||||
|
||||
.. math::
|
||||
|
||||
k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg)
|
||||
|
||||
"""
|
||||
_support_GPU = True
|
||||
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='rbf', useGPU=False):
|
||||
super(RBF, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name, useGPU=useGPU)
|
||||
if self.useGPU:
|
||||
self.psicomp = PSICOMP_RBF_GPU()
|
||||
else:
|
||||
self.psicomp = PSICOMP_RBF()
|
||||
|
||||
def K_of_r(self, r):
|
||||
return self.variance * np.exp(-0.5 * r**2)
|
||||
|
||||
def dK_dr(self, r):
|
||||
return -r*self.K_of_r(r)
|
||||
|
||||
def __getstate__(self):
|
||||
dc = super(RBF, self).__getstate__()
|
||||
if self.useGPU:
|
||||
dc['psicomp'] = PSICOMP_RBF()
|
||||
return dc
|
||||
|
||||
def __setstate__(self, state):
|
||||
return super(RBF, self).__setstate__(state)
|
||||
|
||||
def spectrum(self, omega):
|
||||
assert self.input_dim == 1 #TODO: higher dim spectra?
|
||||
return self.variance*np.sqrt(2*np.pi)*self.lengthscale*np.exp(-self.lengthscale**2*omega**2/2)
|
||||
|
||||
#---------------------------------------#
|
||||
# PSI statistics #
|
||||
#---------------------------------------#
|
||||
|
||||
def psi0(self, Z, variational_posterior):
|
||||
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[0]
|
||||
|
||||
def psi1(self, Z, variational_posterior):
|
||||
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[1]
|
||||
|
||||
def psi2(self, Z, variational_posterior):
|
||||
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[2]
|
||||
|
||||
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
dL_dvar, dL_dlengscale = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[:2]
|
||||
self.variance.gradient = dL_dvar
|
||||
self.lengthscale.gradient = dL_dlengscale
|
||||
|
||||
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[2]
|
||||
|
||||
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[3:]
|
||||
|
||||
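A minimal usage sketch for the new RBF class follows, assuming it is exposed as GPy.kern.RBF as elsewhere in the library; the useGPU flag simply switches the psi-statistics backend chosen in __init__ above.

# Illustrative only; assumes the standard GPy.kern.RBF entry point.
import numpy as np
import GPy

X = np.random.randn(10, 2)
k = GPy.kern.RBF(input_dim=2, variance=1.0, lengthscale=0.5, ARD=False)
K = k.K(X)            # 10x10 covariance matrix: K_of_r applied to scaled distances
Kdiag = k.Kdiag(X)    # equals k.variance on the diagonal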
204  GPy/kern/_src/splitKern.py  Normal file
@@ -0,0 +1,204 @@
|
|||
"""
|
||||
A new kernel
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from kern import Kern,CombinationKernel
|
||||
from .independent_outputs import index_to_slices
|
||||
import itertools
|
||||
|
||||
class DiffGenomeKern(Kern):
|
||||
|
||||
def __init__(self, kernel, idx_p, Xp, index_dim=-1, name='DiffGenomeKern'):
|
||||
self.idx_p = idx_p
|
||||
self.index_dim=index_dim
|
||||
self.kern = SplitKern(kernel,Xp, index_dim=index_dim)
|
||||
super(DiffGenomeKern, self).__init__(input_dim=kernel.input_dim+1, active_dims=None, name=name)
|
||||
self.add_parameter(self.kern)
|
||||
|
||||
def K(self, X, X2=None):
|
||||
assert X2==None
|
||||
K = self.kern.K(X,X2)
|
||||
|
||||
if self.idx_p<=0 or self.idx_p>X.shape[0]/2:
|
||||
return K
|
||||
|
||||
slices = index_to_slices(X[:,self.index_dim])
|
||||
idx_start = slices[1][0].start
|
||||
idx_end = idx_start+self.idx_p
|
||||
K_c = K[idx_start:idx_end,idx_start:idx_end].copy()
|
||||
K[idx_start:idx_end,:] = K[:self.idx_p,:]
|
||||
K[:,idx_start:idx_end] = K[:,:self.idx_p]
|
||||
K[idx_start:idx_end,idx_start:idx_end] = K_c
|
||||
|
||||
return K
|
||||
|
||||
def Kdiag(self,X):
|
||||
Kdiag = self.kern.Kdiag(X)
|
||||
|
||||
if self.idx_p<=0 or self.idx_p>X.shape[0]/2:
|
||||
return Kdiag
|
||||
|
||||
slices = index_to_slices(X[:,self.index_dim])
|
||||
idx_start = slices[1][0].start
|
||||
idx_end = idx_start+self.idx_p
|
||||
Kdiag[idx_start:idx_end] = Kdiag[:self.idx_p]
|
||||
|
||||
return Kdiag
|
||||
|
||||
def update_gradients_full(self,dL_dK,X,X2=None):
|
||||
assert X2==None
|
||||
if self.idx_p<=0 or self.idx_p>X.shape[0]/2:
|
||||
self.kern.update_gradients_full(dL_dK, X)
|
||||
return
|
||||
|
||||
slices = index_to_slices(X[:,self.index_dim])
|
||||
idx_start = slices[1][0].start
|
||||
idx_end = idx_start+self.idx_p
|
||||
|
||||
self.kern.update_gradients_full(dL_dK[idx_start:idx_end,:], X[:self.idx_p],X)
|
||||
grad_p1 = self.kern.gradient.copy()
|
||||
self.kern.update_gradients_full(dL_dK[:,idx_start:idx_end], X, X[:self.idx_p])
|
||||
grad_p2 = self.kern.gradient.copy()
|
||||
self.kern.update_gradients_full(dL_dK[idx_start:idx_end,idx_start:idx_end], X[:self.idx_p],X[idx_start:idx_end])
|
||||
grad_p3 = self.kern.gradient.copy()
|
||||
self.kern.update_gradients_full(dL_dK[idx_start:idx_end,idx_start:idx_end], X[idx_start:idx_end], X[:self.idx_p])
|
||||
grad_p4 = self.kern.gradient.copy()
|
||||
|
||||
self.kern.update_gradients_full(dL_dK[idx_start:idx_end,:], X[idx_start:idx_end],X)
|
||||
grad_n1 = self.kern.gradient.copy()
|
||||
self.kern.update_gradients_full(dL_dK[:,idx_start:idx_end], X, X[idx_start:idx_end])
|
||||
grad_n2 = self.kern.gradient.copy()
|
||||
self.kern.update_gradients_full(dL_dK[idx_start:idx_end,idx_start:idx_end], X[idx_start:idx_end], X[idx_start:idx_end])
|
||||
grad_n3 = self.kern.gradient.copy()
|
||||
|
||||
self.kern.update_gradients_full(dL_dK, X)
|
||||
self.kern.gradient += grad_p1+grad_p2-grad_p3-grad_p4-grad_n1-grad_n2+2*grad_n3
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
pass
|
||||
|
||||
class SplitKern(CombinationKernel):
|
||||
|
||||
def __init__(self, kernel, Xp, index_dim=-1, name='SplitKern'):
|
||||
assert isinstance(index_dim, int), "The index dimension must be an integer!"
|
||||
self.kern = kernel
|
||||
self.kern_cross = SplitKern_cross(kernel,Xp)
|
||||
super(SplitKern, self).__init__(kernels=[self.kern, self.kern_cross], extra_dims=[index_dim], name=name)
|
||||
self.index_dim = index_dim
|
||||
|
||||
def K(self,X ,X2=None):
|
||||
slices = index_to_slices(X[:,self.index_dim])
|
||||
assert len(slices)<=2, 'The Split kernel only supports two different indices'
|
||||
if X2 is None:
|
||||
target = np.zeros((X.shape[0], X.shape[0]))
|
||||
# diagonal blocks
|
||||
[[target.__setitem__((s,ss), self.kern.K(X[s,:], X[ss,:])) for s,ss in itertools.product(slices_i, slices_i)] for slices_i in slices]
|
||||
if len(slices)>1:
|
||||
# cross blocks
|
||||
[target.__setitem__((s,ss), self.kern_cross.K(X[s,:], X[ss,:])) for s,ss in itertools.product(slices[0], slices[1])]
|
||||
# cross blocks
|
||||
[target.__setitem__((s,ss), self.kern_cross.K(X[s,:], X[ss,:])) for s,ss in itertools.product(slices[1], slices[0])]
|
||||
else:
|
||||
slices2 = index_to_slices(X2[:,self.index_dim])
|
||||
assert len(slices2)<=2, 'The Split kernel only supports two different indices'
|
||||
target = np.zeros((X.shape[0], X2.shape[0]))
|
||||
# diagonal blocks
|
||||
[[target.__setitem__((s,s2), self.kern.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices[i], slices2[i])] for i in xrange(min(len(slices),len(slices2)))]
|
||||
if len(slices)>1:
|
||||
[target.__setitem__((s,s2), self.kern_cross.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices[1], slices2[0])]
|
||||
if len(slices2)>1:
|
||||
[target.__setitem__((s,s2), self.kern_cross.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices[0], slices2[1])]
|
||||
return target
|
||||
|
||||
def Kdiag(self,X):
|
||||
return self.kern.Kdiag(X)
|
||||
|
||||
def update_gradients_full(self,dL_dK,X,X2=None):
|
||||
slices = index_to_slices(X[:,self.index_dim])
|
||||
target = np.zeros(self.kern.size)
|
||||
|
||||
def collate_grads(dL, X, X2, cross=False):
|
||||
if cross:
|
||||
self.kern_cross.update_gradients_full(dL,X,X2)
|
||||
target[:] += self.kern_cross.kern.gradient
|
||||
else:
|
||||
self.kern.update_gradients_full(dL,X,X2)
|
||||
target[:] += self.kern.gradient
|
||||
|
||||
if X2 is None:
|
||||
assert dL_dK.shape==(X.shape[0],X.shape[0])
|
||||
[[collate_grads(dL_dK[s,ss], X[s], X[ss]) for s,ss in itertools.product(slices_i, slices_i)] for slices_i in slices]
|
||||
if len(slices)>1:
|
||||
[collate_grads(dL_dK[s,ss], X[s], X[ss], True) for s,ss in itertools.product(slices[0], slices[1])]
|
||||
[collate_grads(dL_dK[s,ss], X[s], X[ss], True) for s,ss in itertools.product(slices[1], slices[0])]
|
||||
else:
|
||||
assert dL_dK.shape==(X.shape[0],X2.shape[0])
|
||||
slices2 = index_to_slices(X2[:,self.index_dim])
|
||||
[[collate_grads(dL_dK[s,s2],X[s],X2[s2]) for s,s2 in itertools.product(slices[i], slices2[i])] for i in xrange(min(len(slices),len(slices2)))]
|
||||
if len(slices)>1:
|
||||
[collate_grads(dL_dK[s,s2], X[s], X2[s2], True) for s,s2 in itertools.product(slices[1], slices2[0])]
|
||||
if len(slices2)>1:
|
||||
[collate_grads(dL_dK[s,s2], X[s], X2[s2], True) for s,s2 in itertools.product(slices[0], slices2[1])]
|
||||
self.kern.gradient = target
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
self.kern.update_gradients_diag(dL_dKdiag, X)
|
||||
|
||||
class SplitKern_cross(Kern):
|
||||
|
||||
def __init__(self, kernel, Xp, name='SplitKern_cross'):
|
||||
assert isinstance(kernel, Kern)
|
||||
self.kern = kernel
|
||||
if not isinstance(Xp,np.ndarray):
|
||||
Xp = np.array([[Xp]])
|
||||
self.Xp = Xp
|
||||
super(SplitKern_cross, self).__init__(input_dim=kernel.input_dim, active_dims=None, name=name)
|
||||
|
||||
def K(self, X, X2=None):
|
||||
if X2 is None:
|
||||
return np.dot(self.kern.K(X,self.Xp),self.kern.K(self.Xp,X))/self.kern.K(self.Xp,self.Xp)
|
||||
else:
|
||||
return np.dot(self.kern.K(X,self.Xp),self.kern.K(self.Xp,X2))/self.kern.K(self.Xp,self.Xp)
|
||||
|
||||
def Kdiag(self, X):
|
||||
return (self.kern.K(X,self.Xp)[:,0]*self.kern.K(self.Xp,X)[0,:])/self.kern.K(self.Xp,self.Xp)[0,0]
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
if X2 is None:
|
||||
X2 = X
|
||||
|
||||
k1 = self.kern.K(X,self.Xp)
|
||||
k2 = self.kern.K(self.Xp,X2)
|
||||
k3 = self.kern.K(self.Xp,self.Xp)
|
||||
dL_dk1 = np.einsum('ij,j->i',dL_dK,k2[0])/k3[0,0]
|
||||
dL_dk2 = np.einsum('ij,i->j',dL_dK,k1[:,0])/k3[0,0]
|
||||
dL_dk3 = np.einsum('ij,ij->',dL_dK,-np.dot(k1,k2)/(k3[0,0]*k3[0,0]))
|
||||
|
||||
self.kern.update_gradients_full(dL_dk1[:,None],X,self.Xp)
|
||||
grad = self.kern.gradient.copy()
|
||||
self.kern.update_gradients_full(dL_dk2[None,:],self.Xp,X2)
|
||||
grad += self.kern.gradient.copy()
|
||||
self.kern.update_gradients_full(np.array([[dL_dk3]]),self.Xp,self.Xp)
|
||||
grad += self.kern.gradient.copy()
|
||||
|
||||
self.kern.gradient = grad
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
k1 = self.kern.K(X,self.Xp)
|
||||
k2 = self.kern.K(self.Xp,X)
|
||||
k3 = self.kern.K(self.Xp,self.Xp)
|
||||
dL_dk1 = dL_dKdiag*k2[0]/k3
|
||||
dL_dk2 = dL_dKdiag*k1[:,0]/k3
|
||||
dL_dk3 = -dL_dKdiag*(k1[:,0]*k2[0]).sum()/(k3*k3)
|
||||
|
||||
self.kern.update_gradients_full(dL_dk1[:,None],X,self.Xp)
|
||||
grad1 = self.kern.gradient.copy()
|
||||
self.kern.update_gradients_full(dL_dk2[None,:],self.Xp,X)
|
||||
grad2 = self.kern.gradient.copy()
|
||||
self.kern.update_gradients_full(np.array([[dL_dk3]]),self.Xp,self.Xp)
|
||||
grad3 = self.kern.gradient.copy()
|
||||
|
||||
self.kern.gradient = grad1+grad2+grad3
|
||||
|
||||
|
||||
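To make the construction above concrete, the sketch below checks the cross-covariance identity k(X, Xp) k(Xp, X2) / k(Xp, Xp) directly against SplitKern_cross; the import path is inferred from the file location and is for illustration only.

# Hedged sketch: the cross term is a rank-one 'bridge' through the split point Xp.
import numpy as np
import GPy
from GPy.kern._src.splitKern import SplitKern_cross

base = GPy.kern.RBF(1)
Xp = np.array([[0.5]])
kc = SplitKern_cross(base, Xp)
X, X2 = np.random.randn(4, 1), np.random.randn(3, 1)
expected = np.dot(base.K(X, Xp), base.K(Xp, X2)) / base.K(Xp, Xp)
assert np.allclose(kc.K(X, X2), expected)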
122  GPy/kern/_src/static.py  Normal file
@@ -0,0 +1,122 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
from kern import Kern
|
||||
import numpy as np
|
||||
from ...core.parameterization import Param
|
||||
from ...core.parameterization.transformations import Logexp
|
||||
|
||||
class Static(Kern):
|
||||
def __init__(self, input_dim, variance, active_dims, name):
|
||||
super(Static, self).__init__(input_dim, active_dims, name)
|
||||
self.variance = Param('variance', variance, Logexp())
|
||||
self.link_parameters(self.variance)
|
||||
|
||||
def Kdiag(self, X):
|
||||
ret = np.empty((X.shape[0],), dtype=np.float64)
|
||||
ret[:] = self.variance
|
||||
return ret
|
||||
|
||||
def gradients_X(self, dL_dK, X, X2=None):
|
||||
return np.zeros(X.shape)
|
||||
|
||||
def gradients_X_diag(self, dL_dKdiag, X):
|
||||
return np.zeros(X.shape)
|
||||
|
||||
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
return np.zeros(Z.shape)
|
||||
|
||||
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
return np.zeros(variational_posterior.shape), np.zeros(variational_posterior.shape)
|
||||
|
||||
def psi0(self, Z, variational_posterior):
|
||||
return self.Kdiag(variational_posterior.mean)
|
||||
|
||||
def psi1(self, Z, variational_posterior):
|
||||
return self.K(variational_posterior.mean, Z)
|
||||
|
||||
def psi2(self, Z, variational_posterior):
|
||||
K = self.K(variational_posterior.mean, Z)
|
||||
return np.einsum('ij,ik->jk',K,K) #K[:,:,None]*K[:,None,:] # NB. more efficient implementations on inheriting classes
|
||||
|
||||
def input_sensitivity(self, summarize=True):
|
||||
if summarize:
|
||||
return super(Static, self).input_sensitivity(summarize=summarize)
|
||||
else:
|
||||
return np.ones(self.input_dim) * self.variance
|
||||
|
||||
class White(Static):
|
||||
def __init__(self, input_dim, variance=1., active_dims=None, name='white'):
|
||||
super(White, self).__init__(input_dim, variance, active_dims, name)
|
||||
|
||||
def K(self, X, X2=None):
|
||||
if X2 is None:
|
||||
return np.eye(X.shape[0])*self.variance
|
||||
else:
|
||||
return np.zeros((X.shape[0], X2.shape[0]))
|
||||
|
||||
def psi2(self, Z, variational_posterior):
|
||||
return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
self.variance.gradient = np.trace(dL_dK)
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
self.variance.gradient = dL_dKdiag.sum()
|
||||
|
||||
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
self.variance.gradient = dL_dpsi0.sum()
|
||||
|
||||
class Bias(Static):
|
||||
def __init__(self, input_dim, variance=1., active_dims=None, name='bias'):
|
||||
super(Bias, self).__init__(input_dim, variance, active_dims, name)
|
||||
|
||||
def K(self, X, X2=None):
|
||||
shape = (X.shape[0], X.shape[0] if X2 is None else X2.shape[0])
|
||||
ret = np.empty(shape, dtype=np.float64)
|
||||
ret[:] = self.variance
|
||||
return ret
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
self.variance.gradient = dL_dK.sum()
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
self.variance.gradient = dL_dKdiag.sum()
|
||||
|
||||
def psi2(self, Z, variational_posterior):
|
||||
ret = np.empty((Z.shape[0], Z.shape[0]), dtype=np.float64)
|
||||
ret[:] = self.variance*self.variance*variational_posterior.shape[0]
|
||||
return ret
|
||||
|
||||
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
self.variance.gradient = dL_dpsi0.sum() + dL_dpsi1.sum() + 2.*self.variance*dL_dpsi2.sum()*variational_posterior.shape[0]
|
||||
|
||||
class Fixed(Static):
|
||||
def __init__(self, input_dim, covariance_matrix, variance=1., active_dims=None, name='fixed'):
|
||||
"""
|
||||
:param input_dim: the number of input dimensions
|
||||
:type input_dim: int
|
||||
:param variance: the variance of the kernel
|
||||
:type variance: float
|
||||
"""
|
||||
super(Fixed, self).__init__(input_dim, variance, active_dims, name)
|
||||
self.fixed_K = covariance_matrix
|
||||
def K(self, X, X2):
|
||||
return self.variance * self.fixed_K
|
||||
|
||||
def Kdiag(self, X):
|
||||
return self.variance * np.diag(self.fixed_K)
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
self.variance.gradient = np.einsum('ij,ij', dL_dK, self.fixed_K)
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
self.variance.gradient = np.einsum('i,i', dL_dKdiag, self.fixed_K)
|
||||
|
||||
def psi2(self, Z, variational_posterior):
|
||||
return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)
|
||||
|
||||
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||
self.variance.gradient = dL_dpsi0.sum()
|
||||
|
||||
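For orientation, the sketch below contrasts the two simplest static kernels defined here: White contributes variance only on the diagonal of K(X, X) and nothing between distinct inputs, while Bias fills every entry with its variance. The GPy.kern entry points are assumed.

# Illustrative behaviour of the static kernels; assumes GPy.kern.White / GPy.kern.Bias.
import numpy as np
import GPy

X = np.random.randn(5, 2)
white = GPy.kern.White(2, variance=0.1)
bias = GPy.kern.Bias(2, variance=2.0)
assert np.allclose(white.K(X), 0.1 * np.eye(5))         # sigma^2 I when X2 is None
assert np.allclose(white.K(X, X + 1.0), 0.0)            # zero cross-covariance
assert np.allclose(bias.K(X), 2.0 * np.ones((5, 5)))    # constant sigma^2 everywhere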
484  GPy/kern/_src/stationary.py  Normal file
@@ -0,0 +1,484 @@
|
|||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
|
||||
from kern import Kern
|
||||
from ...core.parameterization import Param
|
||||
from ...core.parameterization.transformations import Logexp
|
||||
from ...util.linalg import tdot
|
||||
from ... import util
|
||||
import numpy as np
|
||||
from scipy import integrate, weave
|
||||
from ...util.config import config # for assesing whether to use weave
|
||||
from ...util.caching import Cache_this
|
||||
|
||||
class Stationary(Kern):
|
||||
"""
|
||||
Stationary kernels (covariance functions).
|
||||
|
||||
Stationary covariance fucntion depend only on r, where r is defined as
|
||||
|
||||
r = \sqrt{ \sum_{q=1}^Q (x_q - x'_q)^2 }
|
||||
|
||||
The covariance function k(x, x' can then be written k(r).
|
||||
|
||||
In this implementation, r is scaled by the lengthscales parameter(s):
|
||||
|
||||
r = \sqrt{ \sum_{q=1}^Q \frac{(x_q - x'_q)^2}{\ell_q^2} }.
|
||||
|
||||
By default, there's only one lengthscale: seaprate lengthscales for each
|
||||
dimension can be enables by setting ARD=True.
|
||||
|
||||
To implement a stationary covariance function using this class, one need
|
||||
only define the covariance function k(r), and it derivative.
|
||||
|
||||
...
|
||||
def K_of_r(self, r):
|
||||
return foo
|
||||
def dK_dr(self, r):
|
||||
return bar
|
||||
|
||||
The lengthscale(s) and variance parameters are added to the structure automatically.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, input_dim, variance, lengthscale, ARD, active_dims, name, useGPU=False):
|
||||
super(Stationary, self).__init__(input_dim, active_dims, name,useGPU=useGPU)
|
||||
self.ARD = ARD
|
||||
if not ARD:
|
||||
if lengthscale is None:
|
||||
lengthscale = np.ones(1)
|
||||
else:
|
||||
lengthscale = np.asarray(lengthscale)
|
||||
assert lengthscale.size == 1, "Only 1 lengthscale needed for non-ARD kernel"
|
||||
else:
|
||||
if lengthscale is not None:
|
||||
lengthscale = np.asarray(lengthscale)
|
||||
assert lengthscale.size in [1, input_dim], "Bad number of lengthscales"
|
||||
if lengthscale.size != input_dim:
|
||||
lengthscale = np.ones(input_dim)*lengthscale
|
||||
else:
|
||||
lengthscale = np.ones(self.input_dim)
|
||||
self.lengthscale = Param('lengthscale', lengthscale, Logexp())
|
||||
self.variance = Param('variance', variance, Logexp())
|
||||
assert self.variance.size==1
|
||||
self.link_parameters(self.variance, self.lengthscale)
|
||||
|
||||
def K_of_r(self, r):
|
||||
raise NotImplementedError, "implement the covariance function as a fn of r to use this class"
|
||||
|
||||
def dK_dr(self, r):
|
||||
raise NotImplementedError, "implement derivative of the covariance function wrt r to use this class"
|
||||
|
||||
@Cache_this(limit=5, ignore_args=())
|
||||
def K(self, X, X2=None):
|
||||
"""
|
||||
Kernel function applied on inputs X and X2.
|
||||
In the stationary case there is an inner function depending on the
|
||||
distances from X to X2, called r.
|
||||
|
||||
K(X, X2) = K_of_r((X-X2)**2)
|
||||
"""
|
||||
r = self._scaled_dist(X, X2)
|
||||
return self.K_of_r(r)
|
||||
|
||||
@Cache_this(limit=3, ignore_args=())
|
||||
def dK_dr_via_X(self, X, X2):
|
||||
#a convenience function, so we can cache dK_dr
|
||||
return self.dK_dr(self._scaled_dist(X, X2))
|
||||
|
||||
def _unscaled_dist(self, X, X2=None):
|
||||
"""
|
||||
Compute the Euclidean distance between each row of X and X2, or between
|
||||
each pair of rows of X if X2 is None.
|
||||
"""
|
||||
#X, = self._slice_X(X)
|
||||
if X2 is None:
|
||||
Xsq = np.sum(np.square(X),1)
|
||||
r2 = -2.*tdot(X) + (Xsq[:,None] + Xsq[None,:])
|
||||
util.diag.view(r2)[:,]= 0. # force diagonal to be zero: sometimes numerically a little negative
|
||||
r2 = np.clip(r2, 0, np.inf)
|
||||
return np.sqrt(r2)
|
||||
else:
|
||||
#X2, = self._slice_X(X2)
|
||||
X1sq = np.sum(np.square(X),1)
|
||||
X2sq = np.sum(np.square(X2),1)
|
||||
r2 = -2.*np.dot(X, X2.T) + X1sq[:,None] + X2sq[None,:]
|
||||
r2 = np.clip(r2, 0, np.inf)
|
||||
return np.sqrt(r2)
|
||||
|
||||
@Cache_this(limit=5, ignore_args=())
|
||||
def _scaled_dist(self, X, X2=None):
|
||||
"""
|
||||
Efficiently compute the scaled distance, r.
|
||||
|
||||
r = \sqrt{ \sum_{q=1}^Q (x_q - x'_q)^2 / \ell_q^2 }
|
||||
|
||||
Note that if there is only one lengthscale, l comes outside the sum. In
this case we compute the unscaled distance first (in a separate
function for caching) and divide by lengthscale afterwards.
|
||||
|
||||
"""
|
||||
if self.ARD:
|
||||
if X2 is not None:
|
||||
X2 = X2 / self.lengthscale
|
||||
return self._unscaled_dist(X/self.lengthscale, X2)
|
||||
else:
|
||||
return self._unscaled_dist(X, X2)/self.lengthscale
|
||||
|
||||
def Kdiag(self, X):
|
||||
ret = np.empty(X.shape[0])
|
||||
ret[:] = self.variance
|
||||
return ret
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
"""
|
||||
Given the derivative of the objective with respect to the diagonal of
|
||||
the covariance matrix, compute the derivative wrt the parameters of
|
||||
this kernel and store it in the <parameter>.gradient field.
|
||||
|
||||
See also update_gradients_full
|
||||
"""
|
||||
self.variance.gradient = np.sum(dL_dKdiag)
|
||||
self.lengthscale.gradient = 0.
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
"""
|
||||
Given the derivative of the objective wrt the covariance matrix
|
||||
(dL_dK), compute the gradient wrt the parameters of this kernel,
|
||||
and store in the parameters object as e.g. self.variance.gradient
|
||||
"""
|
||||
self.variance.gradient = np.einsum('ij,ij,i', self.K(X, X2), dL_dK, 1./self.variance)
|
||||
|
||||
#now the lengthscale gradient(s)
|
||||
dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
|
||||
if self.ARD:
|
||||
#rinv = self._inv_dist(X, X2) # this is rather high memory? Should we loop instead?
|
||||
#d = X[:, None, :] - X2[None, :, :]
|
||||
#x_xl3 = np.square(d)
|
||||
#self.lengthscale.gradient = -((dL_dr*rinv)[:,:,None]*x_xl3).sum(0).sum(0)/self.lengthscale**3
|
||||
tmp = dL_dr*self._inv_dist(X, X2)
|
||||
if X2 is None: X2 = X
|
||||
|
||||
|
||||
if config.getboolean('weave', 'working'):
|
||||
try:
|
||||
self.lengthscale.gradient = self.weave_lengthscale_grads(tmp, X, X2)
|
||||
except:
|
||||
print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
|
||||
config.set('weave', 'working', 'False')
|
||||
self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in xrange(self.input_dim)])
|
||||
else:
|
||||
self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in xrange(self.input_dim)])
|
||||
else:
|
||||
r = self._scaled_dist(X, X2)
|
||||
self.lengthscale.gradient = -np.sum(dL_dr*r)/self.lengthscale
|
||||
|
||||
|
||||
def _inv_dist(self, X, X2=None):
|
||||
"""
|
||||
Compute the elementwise inverse of the distance matrix, except on the
|
||||
diagonal, where we return zero (the distance on the diagonal is zero).
|
||||
This term appears in derivatives.
|
||||
"""
|
||||
dist = self._scaled_dist(X, X2).copy()
|
||||
return 1./np.where(dist != 0., dist, np.inf)
|
||||
|
||||
def weave_lengthscale_grads(self, tmp, X, X2):
|
||||
"""Use scipy.weave to compute derivatives wrt the lengthscales"""
|
||||
N,M = tmp.shape
|
||||
Q = X.shape[1]
|
||||
if hasattr(X, 'values'):X = X.values
|
||||
if hasattr(X2, 'values'):X2 = X2.values
|
||||
grads = np.zeros(self.input_dim)
|
||||
code = """
|
||||
double gradq;
|
||||
for(int q=0; q<Q; q++){
|
||||
gradq = 0;
|
||||
for(int n=0; n<N; n++){
|
||||
for(int m=0; m<M; m++){
|
||||
gradq += tmp(n,m)*(X(n,q)-X2(m,q))*(X(n,q)-X2(m,q));
|
||||
}
|
||||
}
|
||||
grads(q) = gradq;
|
||||
}
|
||||
"""
|
||||
weave.inline(code, ['tmp', 'X', 'X2', 'grads', 'N', 'M', 'Q'], type_converters=weave.converters.blitz, support_code="#include <math.h>")
|
||||
return -grads/self.lengthscale**3
|
||||
|
||||
def gradients_X(self, dL_dK, X, X2=None):
|
||||
"""
|
||||
Given the derivative of the objective wrt K (dL_dK), compute the derivative wrt X
|
||||
"""
|
||||
if config.getboolean('weave', 'working'):
|
||||
try:
|
||||
return self.gradients_X_weave(dL_dK, X, X2)
|
||||
except:
|
||||
print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
|
||||
config.set('weave', 'working', 'False')
|
||||
return self.gradients_X_(dL_dK, X, X2)
|
||||
else:
|
||||
return self.gradients_X_(dL_dK, X, X2)
|
||||
|
||||
def gradients_X_(self, dL_dK, X, X2=None):
|
||||
invdist = self._inv_dist(X, X2)
|
||||
dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
|
||||
tmp = invdist*dL_dr
|
||||
if X2 is None:
|
||||
tmp = tmp + tmp.T
|
||||
X2 = X
|
||||
|
||||
#The high-memory numpy way:
|
||||
#d = X[:, None, :] - X2[None, :, :]
|
||||
#ret = np.sum(tmp[:,:,None]*d,1)/self.lengthscale**2
|
||||
|
||||
#the lower memory way with a loop
|
||||
ret = np.empty(X.shape, dtype=np.float64)
|
||||
for q in xrange(self.input_dim):
|
||||
np.sum(tmp*(X[:,q][:,None]-X2[:,q][None,:]), axis=1, out=ret[:,q])
|
||||
ret /= self.lengthscale**2
|
||||
|
||||
return ret
|
||||
|
||||
def gradients_X_weave(self, dL_dK, X, X2=None):
|
||||
invdist = self._inv_dist(X, X2)
|
||||
dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
|
||||
tmp = invdist*dL_dr
|
||||
if X2 is None:
|
||||
tmp = tmp + tmp.T
|
||||
X2 = X
|
||||
|
||||
code = """
|
||||
int n,m,d;
|
||||
double retnd;
|
||||
#pragma omp parallel for private(n,d, retnd, m)
|
||||
for(d=0;d<D;d++){
|
||||
for(n=0;n<N;n++){
|
||||
retnd = 0.0;
|
||||
for(m=0;m<M;m++){
|
||||
retnd += tmp(n,m)*(X(n,d)-X2(m,d));
|
||||
}
|
||||
ret(n,d) = retnd;
|
||||
}
|
||||
}
|
||||
|
||||
"""
|
||||
if hasattr(X, 'values'):X = X.values #remove the GPy wrapping to make passing into weave safe
|
||||
if hasattr(X2, 'values'):X2 = X2.values
|
||||
ret = np.zeros(X.shape)
|
||||
N,D = X.shape
|
||||
N,M = tmp.shape
|
||||
from scipy import weave
|
||||
support_code = """
|
||||
#include <omp.h>
|
||||
#include <stdio.h>
|
||||
"""
|
||||
weave_options = {'headers' : ['<omp.h>'],
|
||||
'extra_compile_args': ['-fopenmp -O3'], # -march=native'],
|
||||
'extra_link_args' : ['-lgomp']}
|
||||
weave.inline(code, ['ret', 'N', 'D', 'M', 'tmp', 'X', 'X2'], type_converters=weave.converters.blitz, support_code=support_code, **weave_options)
|
||||
return ret/self.lengthscale**2
|
||||
|
||||
def gradients_X_diag(self, dL_dKdiag, X):
|
||||
return np.zeros(X.shape)
|
||||
|
||||
def input_sensitivity(self, summarize=True):
|
||||
return np.ones(self.input_dim)/self.lengthscale**2
|
||||
|
||||
class Exponential(Stationary):
|
||||
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Exponential'):
|
||||
super(Exponential, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
|
||||
|
||||
def K_of_r(self, r):
|
||||
return self.variance * np.exp(-0.5 * r)
|
||||
|
||||
def dK_dr(self, r):
|
||||
return -0.5*self.K_of_r(r)
|
||||
|
||||
|
||||
class OU(Stationary):
|
||||
"""
|
||||
OU kernel:
|
||||
|
||||
.. math::
|
||||
|
||||
k(r) = \\sigma^2 \exp(- r) \\ \\ \\ \\ \\text{ where } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} }
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='OU'):
|
||||
super(OU, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
|
||||
|
||||
def K_of_r(self, r):
|
||||
return self.variance * np.exp(-r)
|
||||
|
||||
def dK_dr(self,r):
|
||||
return -1.*self.variance*np.exp(-r)
|
||||
|
||||
|
||||
class Matern32(Stationary):
|
||||
"""
|
||||
Matern 3/2 kernel:
|
||||
|
||||
.. math::
|
||||
|
||||
k(r) = \\sigma^2 (1 + \\sqrt{3} r) \exp(- \sqrt{3} r) \\ \\ \\ \\ \\text{ where } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} }
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Mat32'):
|
||||
super(Matern32, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
|
||||
|
||||
def K_of_r(self, r):
|
||||
return self.variance * (1. + np.sqrt(3.) * r) * np.exp(-np.sqrt(3.) * r)
|
||||
|
||||
def dK_dr(self,r):
|
||||
return -3.*self.variance*r*np.exp(-np.sqrt(3.)*r)
|
||||
|
||||
def Gram_matrix(self, F, F1, F2, lower, upper):
|
||||
"""
|
||||
Return the Gram matrix of the vector of functions F with respect to the
|
||||
RKHS norm. The use of this function is limited to input_dim=1.
|
||||
|
||||
:param F: vector of functions
|
||||
:type F: np.array
|
||||
:param F1: vector of derivatives of F
|
||||
:type F1: np.array
|
||||
:param F2: vector of second derivatives of F
|
||||
:type F2: np.array
|
||||
:param lower,upper: boundaries of the input domain
|
||||
:type lower,upper: floats
|
||||
"""
|
||||
assert self.input_dim == 1
|
||||
def L(x, i):
|
||||
return(3. / self.lengthscale ** 2 * F[i](x) + 2 * np.sqrt(3) / self.lengthscale * F1[i](x) + F2[i](x))
|
||||
n = F.shape[0]
|
||||
G = np.zeros((n, n))
|
||||
for i in range(n):
|
||||
for j in range(i, n):
|
||||
G[i, j] = G[j, i] = integrate.quad(lambda x : L(x, i) * L(x, j), lower, upper)[0]
|
||||
Flower = np.array([f(lower) for f in F])[:, None]
|
||||
F1lower = np.array([f(lower) for f in F1])[:, None]
|
||||
return(self.lengthscale ** 3 / (12.*np.sqrt(3) * self.variance) * G + 1. / self.variance * np.dot(Flower, Flower.T) + self.lengthscale ** 2 / (3.*self.variance) * np.dot(F1lower, F1lower.T))
|
||||
|
||||
|
||||
class Matern52(Stationary):
|
||||
"""
|
||||
Matern 5/2 kernel:
|
||||
|
||||
.. math::
|
||||
|
||||
k(r) = \sigma^2 (1 + \sqrt{5} r + \\frac53 r^2) \exp(- \sqrt{5} r)
|
||||
"""
|
||||
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Mat52'):
|
||||
super(Matern52, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
|
||||
|
||||
def K_of_r(self, r):
|
||||
return self.variance*(1+np.sqrt(5.)*r+5./3*r**2)*np.exp(-np.sqrt(5.)*r)
|
||||
|
||||
def dK_dr(self, r):
|
||||
return self.variance*(10./3*r -5.*r -5.*np.sqrt(5.)/3*r**2)*np.exp(-np.sqrt(5.)*r)
|
||||
|
||||
def Gram_matrix(self, F, F1, F2, F3, lower, upper):
|
||||
"""
|
||||
Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to input_dim=1.
|
||||
|
||||
:param F: vector of functions
|
||||
:type F: np.array
|
||||
:param F1: vector of derivatives of F
|
||||
:type F1: np.array
|
||||
:param F2: vector of second derivatives of F
|
||||
:type F2: np.array
|
||||
:param F3: vector of third derivatives of F
|
||||
:type F3: np.array
|
||||
:param lower,upper: boundaries of the input domain
|
||||
:type lower,upper: floats
|
||||
"""
|
||||
assert self.input_dim == 1
|
||||
def L(x,i):
|
||||
return(5*np.sqrt(5)/self.lengthscale**3*F[i](x) + 15./self.lengthscale**2*F1[i](x)+ 3*np.sqrt(5)/self.lengthscale*F2[i](x) + F3[i](x))
|
||||
n = F.shape[0]
|
||||
G = np.zeros((n,n))
|
||||
for i in range(n):
|
||||
for j in range(i,n):
|
||||
G[i,j] = G[j,i] = integrate.quad(lambda x : L(x,i)*L(x,j),lower,upper)[0]
|
||||
G_coef = 3.*self.lengthscale**5/(400*np.sqrt(5))
|
||||
Flower = np.array([f(lower) for f in F])[:,None]
|
||||
F1lower = np.array([f(lower) for f in F1])[:,None]
|
||||
F2lower = np.array([f(lower) for f in F2])[:,None]
|
||||
orig = 9./8*np.dot(Flower,Flower.T) + 9.*self.lengthscale**4/200*np.dot(F2lower,F2lower.T)
|
||||
orig2 = 3./5*self.lengthscale**2 * ( np.dot(F1lower,F1lower.T) + 1./8*np.dot(Flower,F2lower.T) + 1./8*np.dot(F2lower,Flower.T))
|
||||
return(1./self.variance* (G_coef*G + orig + orig2))
|
||||
|
||||
|
||||
class ExpQuad(Stationary):
|
||||
"""
|
||||
The Exponentiated quadratic covariance function.
|
||||
|
||||
.. math::
|
||||
|
||||
k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg)
|
||||
|
||||
notes::
|
||||
- Yes, this is exactly the same as the RBF covariance function, but the
|
||||
RBF implementation also has some features for doing variational kernels
|
||||
(the psi-statistics).
|
||||
|
||||
"""
|
||||
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='ExpQuad'):
|
||||
super(ExpQuad, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
|
||||
|
||||
def K_of_r(self, r):
|
||||
return self.variance * np.exp(-0.5 * r**2)
|
||||
|
||||
def dK_dr(self, r):
|
||||
return -r*self.K_of_r(r)
|
||||
|
||||
class Cosine(Stationary):
|
||||
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Cosine'):
|
||||
super(Cosine, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
|
||||
|
||||
def K_of_r(self, r):
|
||||
return self.variance * np.cos(r)
|
||||
|
||||
def dK_dr(self, r):
|
||||
return -self.variance * np.sin(r)
|
||||
|
||||
|
||||
class RatQuad(Stationary):
|
||||
"""
|
||||
Rational Quadratic Kernel
|
||||
|
||||
.. math::
|
||||
|
||||
k(r) = \sigma^2 \\bigg( 1 + \\frac{r^2}{2} \\bigg)^{- \\alpha}
|
||||
|
||||
"""
|
||||
|
||||
|
||||
def __init__(self, input_dim, variance=1., lengthscale=None, power=2., ARD=False, active_dims=None, name='RatQuad'):
|
||||
super(RatQuad, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
|
||||
self.power = Param('power', power, Logexp())
|
||||
self.link_parameters(self.power)
|
||||
|
||||
def K_of_r(self, r):
|
||||
r2 = np.power(r, 2.)
|
||||
return self.variance*np.power(1. + r2/2., -self.power)
|
||||
|
||||
def dK_dr(self, r):
|
||||
r2 = np.power(r, 2.)
|
||||
return -self.variance*self.power*r*np.power(1. + r2/2., - self.power - 1.)
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
super(RatQuad, self).update_gradients_full(dL_dK, X, X2)
|
||||
r = self._scaled_dist(X, X2)
|
||||
r2 = np.power(r, 2.)
|
||||
dK_dpow = -self.variance * np.power(2., self.power) * np.power(r2 + 2., -self.power) * np.log(0.5*(r2+2.))
|
||||
grad = np.sum(dL_dK*dK_dpow)
|
||||
self.power.gradient = grad
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
super(RatQuad, self).update_gradients_diag(dL_dKdiag, X)
|
||||
self.power.gradient = 0.
|
||||
|
||||
|
||||
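Since every subclass above only supplies K_of_r and dK_dr, a quick finite-difference check of that pair is a useful habit; the sketch below does this for Matern32 and is purely illustrative.

# Hedged check that dK_dr matches a numerical derivative of K_of_r.
import numpy as np
import GPy

k = GPy.kern.Matern32(1, variance=1.3, lengthscale=0.7)
r = np.linspace(0.1, 3.0, 50)
eps = 1e-6
numeric = (k.K_of_r(r + eps) - k.K_of_r(r - eps)) / (2 * eps)
assert np.allclose(k.dK_dr(r), numeric, atol=1e-5)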
75  GPy/kern/_src/symbolic.py  Normal file
@@ -0,0 +1,75 @@
|
|||
# Check Matthew Rocklin's blog post.
|
||||
import sympy as sym
|
||||
import numpy as np
|
||||
from kern import Kern
|
||||
from ...core.symbolic import Symbolic_core
|
||||
|
||||
|
||||
class Symbolic(Kern, Symbolic_core):
|
||||
"""
|
||||
"""
|
||||
def __init__(self, input_dim, k=None, output_dim=1, name='symbolic', parameters=None, active_dims=None, operators=None, func_modules=[]):
|
||||
|
||||
if k is None:
|
||||
raise ValueError, "You must provide an argument for the covariance function."
|
||||
|
||||
Kern.__init__(self, input_dim, active_dims, name=name)
|
||||
kdiag = k
|
||||
self.cacheable = ['X', 'Z']
|
||||
Symbolic_core.__init__(self, {'k':k,'kdiag':kdiag}, cacheable=self.cacheable, derivatives = ['X', 'theta'], parameters=parameters, func_modules=func_modules)
|
||||
self.output_dim = output_dim
|
||||
|
||||
def __add__(self,other):
|
||||
return spkern(self._sym_k+other._sym_k)
|
||||
|
||||
def _set_expressions(self, expressions):
|
||||
"""This method is overwritten because we need to modify kdiag by substituting z for x. We do this by calling the parent expression method to extract variables from expressions, then subsitute the z variables that are present with x."""
|
||||
Symbolic_core._set_expressions(self, expressions)
|
||||
Symbolic_core._set_variables(self, self.cacheable)
|
||||
# Substitute z with x to obtain kdiag.
|
||||
for x, z in zip(self.variables['X'], self.variables['Z']):
|
||||
expressions['kdiag'] = expressions['kdiag'].subs(z, x)
|
||||
Symbolic_core._set_expressions(self, expressions)
|
||||
|
||||
|
||||
def K(self,X,X2=None):
|
||||
if X2 is None:
|
||||
return self.eval_function('k', X=X, Z=X)
|
||||
else:
|
||||
return self.eval_function('k', X=X, Z=X2)
|
||||
|
||||
|
||||
def Kdiag(self,X):
|
||||
d = self.eval_function('kdiag', X=X)
|
||||
if not d.shape[0] == X.shape[0]:
|
||||
d = np.tile(d, (X.shape[0], 1))
|
||||
return d
|
||||
|
||||
|
||||
def gradients_X(self, dL_dK, X, X2=None):
|
||||
#if self._X is None or X.base is not self._X.base or X2 is not None:
|
||||
g = self.eval_gradients_X('k', dL_dK, X=X, Z=X2)
|
||||
if X2 is None:
|
||||
g *= 2
|
||||
return g
|
||||
|
||||
def gradients_X_diag(self, dL_dK, X):
|
||||
return self.eval_gradients_X('kdiag', dL_dK, X=X)
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||
# Need to extract parameters to local variables first
|
||||
if X2 is None:
|
||||
# need to double this inside ...
|
||||
gradients = self.eval_update_gradients('k', dL_dK, X=X)
|
||||
else:
|
||||
gradients = self.eval_update_gradients('k', dL_dK, X=X, Z=X2)
|
||||
|
||||
for name, val in gradients:
|
||||
setattr(getattr(self, name), 'gradient', val)
|
||||
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
|
||||
gradients = self.eval_update_gradients('kdiag', dL_dKdiag, X)
|
||||
for name, val in gradients:
|
||||
setattr(getattr(self, name), 'gradient', val)
|
||||
|
||||
61  GPy/kern/_src/sympy_helpers.cpp  Normal file
@@ -0,0 +1,61 @@
|
|||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
double DiracDelta(double x){
|
||||
// TODO: this doesn't seem to be a dirac delta ... should return infinity. Neil
|
||||
if((x<0.000001) & (x>-0.000001))//go on, laugh at my c++ skills
|
||||
return 1.0;
|
||||
else
|
||||
return 0.0;
|
||||
};
|
||||
double DiracDelta(double x,int foo){
|
||||
return 0.0;
|
||||
};
|
||||
|
||||
double sinc(double x){
|
||||
if (x==0)
|
||||
return 1.0;
|
||||
else
|
||||
return sin(x)/x;
|
||||
}
|
||||
|
||||
double sinc_grad(double x){
|
||||
if (x==0)
|
||||
return 0.0;
|
||||
else
|
||||
return (x*cos(x) - sin(x))/(x*x);
|
||||
}
|
||||
|
||||
double erfcx(double x){
|
||||
double xneg=-sqrt(log(DBL_MAX/2));
|
||||
double xmax = 1/(sqrt(M_PI)*DBL_MIN);
|
||||
xmax = DBL_MAX<xmax ? DBL_MAX : xmax;
|
||||
// Find values where erfcx can be evaluated
|
||||
double t = 3.97886080735226 / (fabs(x) + 3.97886080735226);
|
||||
double u = t-0.5;
|
||||
double y = (((((((((u * 0.00127109764952614092 + 1.19314022838340944e-4) * u
|
||||
- 0.003963850973605135) * u - 8.70779635317295828e-4) * u
|
||||
+ 0.00773672528313526668) * u + 0.00383335126264887303) * u
|
||||
- 0.0127223813782122755) * u - 0.0133823644533460069) * u
|
||||
+ 0.0161315329733252248) * u + 0.0390976845588484035) * u + 0.00249367200053503304;
|
||||
if (x<xneg)
|
||||
return -INFINITY;
|
||||
else if (x<0)
|
||||
return 2*exp(x*x)-y;
|
||||
else if (x>xmax)
|
||||
return 0.0;
|
||||
else
|
||||
return y;
|
||||
}
|
||||
|
||||
double ln_diff_erf(double x0, double x1){
|
||||
if (x0==x1)
|
||||
return INFINITY;
|
||||
else if(x0<0 && x1>0 || x0>0 && x1<0)
|
||||
return log(erf(x0)-erf(x1));
|
||||
else if(x1>0)
|
||||
return log(erfcx(x1)-erfcx(x0)*exp(x1*x1 - x0*x0))-x1*x1;
|
||||
else
|
||||
return log(erfcx(-x0)-erfcx(-x1)*exp(x0*x0 - x1*x1))-x0*x0;
|
||||
}
|
||||
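The log-domain tricks in ln_diff_erf are easy to get wrong, so a small SciPy cross-check is sketched below; it mirrors the positive same-sign branch (erfcx-based, as corrected above) against the naive formula in a range where the naive version is still accurate. This is an illustration, not part of the library.

# Hedged SciPy cross-check of ln_diff_erf(x0, x1) = log(erf(x0) - erf(x1)) for x0 > x1 > 0.
import numpy as np
from scipy.special import erf, erfcx

def ln_diff_erf_ref(x0, x1):
    # same-sign positive branch, written as in the C++ helper above
    return np.log(erfcx(x1) - erfcx(x0) * np.exp(x1**2 - x0**2)) - x1**2

x0, x1 = 2.0, 1.0
assert np.isclose(ln_diff_erf_ref(x0, x1), np.log(erf(x0) - erf(x1)))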
9  GPy/kern/_src/sympy_helpers.h  Normal file
@@ -0,0 +1,9 @@
|
|||
#include <math.h>
|
||||
double DiracDelta(double x);
|
||||
double DiracDelta(double x, int foo);
|
||||
|
||||
double sinc(double x);
|
||||
double sinc_grad(double x);
|
||||
|
||||
double erfcx(double x);
|
||||
double ln_diff_erf(double x0, double x1);
|
||||
|
|
@@ -90,7 +90,7 @@ class ODE_1(Kernpart):
|
|||
|
||||
np.add(self.varianceU*self.varianceY*(k1+k2+k3), target, target)
|
||||
|
||||
def dK_dtheta(self, dL_dK, X, X2, target):
|
||||
def _param_grad_helper(self, dL_dK, X, X2, target):
|
||||
"""derivative of the covariance matrix with respect to the parameters."""
|
||||
if X2 is None: X2 = X
|
||||
dist = np.abs(X - X2.T)
|
||||
|
|
@@ -138,10 +138,6 @@ class ODE_1(Kernpart):
|
|||
k3 = np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
|
||||
dkdvar = k1+k2+k3
|
||||
|
||||
#target[0] dk dvarU
|
||||
#target[1] dk dvarY
|
||||
#target[2] dk d theta1
|
||||
#target[3] dk d theta2
|
||||
target[0] += np.sum(self.varianceY*dkdvar * dL_dK)
|
||||
target[1] += np.sum(self.varianceU*dkdvar * dL_dK)
|
||||
target[2] += np.sum(dktheta1*(-np.sqrt(3)*self.lengthscaleU**(-2)) * dL_dK)
|
||||
|
|
@@ -124,7 +124,7 @@ class Eq_ode1(Kernpart):
|
|||
#target += np.diag(self.B)[np.asarray(index,dtype=np.int).flatten()]
|
||||
pass
|
||||
|
||||
def dK_dtheta(self,dL_dK,X,X2,target):
|
||||
def _param_grad_helper(self,dL_dK,X,X2,target):
|
||||
|
||||
# First extract times and indices.
|
||||
self._extract_t_indices(X, X2, dL_dK=dL_dK)
|
||||
|
|
@@ -193,7 +193,7 @@ class Eq_ode1(Kernpart):
|
|||
def dKdiag_dtheta(self,dL_dKdiag,index,target):
|
||||
pass
|
||||
|
||||
def dK_dX(self,dL_dK,X,X2,target):
|
||||
def gradients_X(self,dL_dK,X,X2,target):
|
||||
pass
|
||||
|
||||
def _extract_t_indices(self, X, X2=None, dL_dK=None):
|
||||
|
|
@@ -50,7 +50,7 @@ class FiniteDimensional(Kernpart):
|
|||
def Kdiag(self,X,target):
|
||||
product = np.diag(self.K(X, X))
|
||||
np.add(target,product,target)
|
||||
def dK_dtheta(self,X,X2,target):
|
||||
def _param_grad_helper(self,X,X2,target):
|
||||
"""Return shape is NxMx(Ntheta)"""
|
||||
if X2 is None: X2 = X
|
||||
FX = np.column_stack([f(X) for f in self.F])
|
||||
|
|
@@ -31,10 +31,10 @@ class Fixed(Kernpart):
|
|||
def K(self, X, X2, target):
|
||||
target += self.variance * self.fixed_K
|
||||
|
||||
def dK_dtheta(self, partial, X, X2, target):
|
||||
def _param_grad_helper(self, partial, X, X2, target):
|
||||
target += (partial * self.fixed_K).sum()
|
||||
|
||||
def dK_dX(self, partial, X, X2, target):
|
||||
def gradients_X(self, partial, X, X2, target):
|
||||
pass
|
||||
|
||||
def dKdiag_dX(self, partial, X, target):
|
||||
Some files were not shown because too many files have changed in this diff.