Huge merge correcting upstream master

Alan Saul 2014-11-21 16:49:33 +00:00
commit 34932f8746
319 changed files with 26201 additions and 26660 deletions

.gitignore vendored

@@ -45,4 +45,4 @@ iterate.dat
# git merge files #
###################
*.orig

.travis.yml

@@ -2,14 +2,14 @@ language: python
python:
  - "2.7"
-#Set virtual env with system-site-packages to true
-virtualenv:
-  system_site_packages: true
# command to install dependencies, e.g. pip install -r requirements.txt --use-mirrors
before_install:
-  - sudo apt-get install -qq python-scipy python-pip
-  - sudo apt-get install -qq python-matplotlib
+  #Install a mini version of anaconda such that we can easily install our dependencies
+  - wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
+  - chmod +x miniconda.sh
+  - ./miniconda.sh -b
+  - export PATH=/home/travis/miniconda/bin:$PATH
+  - conda update --yes conda
# Workaround for a permissions issue with Travis virtual machine images
# that breaks Python's multiprocessing:
# https://github.com/travis-ci/travis-cookbooks/issues/155
@@ -17,11 +17,10 @@ before_install:
  - sudo ln -s /run/shm /dev/shm
install:
-  - pip install --upgrade numpy==1.7.1
-  - pip install sphinx
-  - pip install nose
-  - pip install . --use-mirrors
+  - conda install --yes python=$TRAVIS_PYTHON_VERSION atlas numpy=1.7 scipy=0.12 matplotlib nose sphinx pip nose
+  - pip install .
+  #--use-mirrors
+  #
# command to run tests, e.g. python setup.py test
script:
  - nosetests GPy/testing
+  #- yes | nosetests GPy/testing


@@ -1,8 +0,0 @@
Frequently Asked Questions
--------------------------
Unit tests are run through Travis-Ci. They can be run locally through entering the GPy route diretory and writing
nosetests testing/
Documentation is handled by Sphinx. To build the documentation:

GPy/__init__.py

@@ -2,15 +2,10 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
-import os
-def read(fname):
-    with open(os.path.join(os.path.dirname(__file__), fname)) as f:
-        return f.read()
-__version__ = read('version')
import core
+from core.parameterization import transformations, priors
+constraints = transformations
import models
import mappings
import inference
@@ -19,27 +14,36 @@ import examples
import likelihoods
import testing
from numpy.testing import Tester
-from nose.tools import nottest
import kern
-from core import priors
-@nottest
-def tests():
-    Tester(testing).test(verbose=10)
-
-if os.name == 'nt':
-    """
-    Fortran seems to like to intercept keyboard interrupts on windows.
-    This means that when a model is optimizing and the user presses Ctrl-C,
-    the program will crash. Since it's kind of nice to be able to stop
-    the optimization at any time, we define our own handler below.
-    """
-    import win32api
-    import thread
-
-    def handler(sig, hook=thread.interrupt_main):
-        hook()
-        return 1
-
-    win32api.SetConsoleCtrlHandler(handler, 1)
+import plotting
+
+# Direct imports for convenience:
+from core import Model
+from core.parameterization import Param, Parameterized, ObsAr
+
+#@nottest
+try:
+    #Get rid of nose dependency by only ignoring if you have nose installed
+    from nose.tools import nottest
+    @nottest
+    def tests():
+        Tester(testing).test(verbose=10)
+except:
+    def tests():
+        Tester(testing).test(verbose=10)
+
+def load(file_path):
+    """
+    Load a previously pickled model, using `m.pickle('path/to/file.pickle)'
+
+    :param file_name: path/to/file.pickle
+    """
+    import cPickle as pickle
+    try:
+        with open(file_path, 'rb') as f:
+            m = pickle.load(f)
+    except:
+        import pickle as pickle
+        with open(file_path, 'rb') as f:
+            m = pickle.load(f)
+    return m
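For reference, the new top-level load() pairs with the pickling support on models; a minimal usage sketch (the toy model and file name below are illustrative, not part of this commit):

import numpy as np
import GPy

X = np.random.rand(20, 1)
Y = np.sin(6 * X) + 0.05 * np.random.randn(20, 1)
m = GPy.models.GPRegression(X, Y)    # any pickleable GPy model would do

m.pickle('toy_model.pickle')         # save, as referenced in the load() docstring
m2 = GPy.load('toy_model.pickle')    # restore with the new helper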


@@ -1,10 +0,0 @@
In this text document we will describe coding conventions to be used in GPy to keep things consistent.
All arrays containing data are two dimensional. The first dimension is the number of data, the second dimension is number of features. This keeps things consistent with the idea of a design matrix.
Input matrices are either X or t, output matrices are Y.
Input dimensionality is input_dim, output dimensionality is output_dim, number of data is num_data.
Data sets are preprocessed in the datasets.py file. This file also records where the data set was obtained from in the dictionary stored in the file. Long term we should move this dictionary to sqlite or similar.

GPy/core/__init__.py

@@ -1,11 +1,11 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from model import *
-from parameterized import *
-import priors
+from parameterization.parameterized import adjust_name_for_printing, Parameterizable
+from parameterization.param import Param, ParamConcatenation
+from parameterization.observable_array import ObsAr
from gp import GP
from sparse_gp import SparseGP
-from fitc import FITC
-from svigp import SVIGP
from mapping import *
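With the re-exports above, the commonly used classes become importable from the package root; a short sketch (names taken from the two __init__ files in this commit):

from GPy import Model, Param, Parameterized, ObsAr   # re-exported in GPy/__init__.py
from GPy.core import GP, SparseGP                     # re-exported in GPy/core/__init__.py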

GPy/core/domains.py

@@ -1,26 +0,0 @@
'''
Created on 4 Jun 2013
@author: maxz
(Hyper-)Parameter domains defined for :py:mod:`~GPy.core.priors` and :py:mod:`~GPy.kern`.
These domains specify the legitimate realm of the parameters to live in.
:const:`~GPy.core.domains.REAL` :
real domain, all values in the real numbers are allowed
:const:`~GPy.core.domains.POSITIVE`:
positive domain, only positive real values are allowed
:const:`~GPy.core.domains.NEGATIVE`:
same as :const:`~GPy.core.domains.POSITIVE`, but only negative values are allowed
:const:`~GPy.core.domains.BOUNDED`:
only values within the bounded range are allowed,
the bounds are specified withing the object with the bounded range
'''
REAL = 'real'
POSITIVE = "positive"
NEGATIVE = 'negative'
BOUNDED = 'bounded'

GPy/core/fitc.py

@@ -1,248 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from ..util.linalg import mdot, jitchol, chol_inv, tdot, symmetrify, pdinv, dtrtrs
from ..util.plot import gpplot
from .. import kern
from scipy import stats
from sparse_gp import SparseGP
class FITC(SparseGP):
"""
Sparse FITC approximation
:param X: inputs
:type X: np.ndarray (num_data x Q)
:param likelihood: a likelihood instance, containing the observed data
:type likelihood: GPy.likelihood.(Gaussian | EP)
:param kernel: the kernel (covariance function). See link kernels
:type kernel: a GPy.kern.kern instance
:param Z: inducing inputs (optional, see note)
:type Z: np.ndarray (M x Q) | None
:param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales)
:type normalize_(X|Y): bool
"""
def __init__(self, X, likelihood, kernel, Z, normalize_X=False):
SparseGP.__init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False)
assert self.output_dim == 1, "FITC model is not defined for handling multiple outputs"
def update_likelihood_approximation(self, **kwargs):
"""
Approximates a non-Gaussian likelihood using Expectation Propagation
For a Gaussian likelihood, no iteration is required:
this function does nothing
"""
self.likelihood.restart()
self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0, **kwargs)
self._set_params(self._get_params())
def _compute_kernel_matrices(self):
# kernel computations, using BGPLVM notation
self.Kmm = self.kern.K(self.Z)
self.psi0 = self.kern.Kdiag(self.X)
self.psi1 = self.kern.K(self.Z, self.X)
self.psi2 = None
def _computations(self):
#factor Kmm
self.Lm = jitchol(self.Kmm)
self.Lmi,info = dtrtrs(self.Lm,np.eye(self.num_inducing),lower=1)
Lmipsi1 = np.dot(self.Lmi,self.psi1)
self.Qnn = np.dot(Lmipsi1.T,Lmipsi1).copy()
self.Diag0 = self.psi0 - np.diag(self.Qnn)
self.beta_star = self.likelihood.precision/(1. + self.likelihood.precision*self.Diag0[:,None]) #NOTE: beta_star contains Diag0 and the precision
self.V_star = self.beta_star * self.likelihood.Y
# The rather complex computations of self.A
tmp = self.psi1 * (np.sqrt(self.beta_star.flatten().reshape(1, self.num_data)))
tmp, _ = dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1)
self.A = tdot(tmp)
# factor B
self.B = np.eye(self.num_inducing) + self.A
self.LB = jitchol(self.B)
self.LBi = chol_inv(self.LB)
self.psi1V = np.dot(self.psi1, self.V_star)
Lmi_psi1V, info = dtrtrs(self.Lm, np.asfortranarray(self.psi1V), lower=1, trans=0)
self._LBi_Lmi_psi1V, _ = dtrtrs(self.LB, np.asfortranarray(Lmi_psi1V), lower=1, trans=0)
Kmmipsi1 = np.dot(self.Lmi.T,Lmipsi1)
b_psi1_Ki = self.beta_star * Kmmipsi1.T
Ki_pbp_Ki = np.dot(Kmmipsi1,b_psi1_Ki)
Kmmi = np.dot(self.Lmi.T,self.Lmi)
LBiLmi = np.dot(self.LBi,self.Lmi)
LBL_inv = np.dot(LBiLmi.T,LBiLmi)
VVT = np.outer(self.V_star,self.V_star)
VV_p_Ki = np.dot(VVT,Kmmipsi1.T)
Ki_pVVp_Ki = np.dot(Kmmipsi1,VV_p_Ki)
psi1beta = self.psi1*self.beta_star.T
H = self.Kmm + mdot(self.psi1,psi1beta.T)
LH = jitchol(H)
LHi = chol_inv(LH)
Hi = np.dot(LHi.T,LHi)
betapsi1TLmiLBi = np.dot(psi1beta.T,LBiLmi.T)
alpha = np.array([np.dot(a.T,a) for a in betapsi1TLmiLBi])[:,None]
gamma_1 = mdot(VVT,self.psi1.T,Hi)
pHip = mdot(self.psi1.T,Hi,self.psi1)
gamma_2 = mdot(self.beta_star*pHip,self.V_star)
gamma_3 = self.V_star * gamma_2
self._dL_dpsi0 = -0.5 * self.beta_star#dA_dpsi0: logdet(self.beta_star)
self._dL_dpsi0 += .5 * self.V_star**2 #dA_psi0: yT*beta_star*y
self._dL_dpsi0 += .5 *alpha #dC_dpsi0
self._dL_dpsi0 += 0.5*mdot(self.beta_star*pHip,self.V_star)**2 - self.V_star * mdot(self.V_star.T,pHip*self.beta_star).T #dD_dpsi0
self._dL_dpsi1 = b_psi1_Ki.copy() #dA_dpsi1: logdet(self.beta_star)
self._dL_dpsi1 += -np.dot(psi1beta.T,LBL_inv) #dC_dpsi1
self._dL_dpsi1 += gamma_1 - mdot(psi1beta.T,Hi,self.psi1,gamma_1) #dD_dpsi1
self._dL_dKmm = -0.5 * np.dot(Kmmipsi1,b_psi1_Ki) #dA_dKmm: logdet(self.beta_star)
self._dL_dKmm += .5*(LBL_inv - Kmmi) + mdot(LBL_inv,psi1beta,Kmmipsi1.T) #dC_dKmm
self._dL_dKmm += -.5 * mdot(Hi,self.psi1,gamma_1) #dD_dKmm
self._dpsi1_dtheta = 0
self._dpsi1_dX = 0
self._dKmm_dtheta = 0
self._dKmm_dX = 0
self._dpsi1_dX_jkj = 0
self._dpsi1_dtheta_jkj = 0
for i,V_n,alpha_n,gamma_n,gamma_k in zip(range(self.num_data),self.V_star,alpha,gamma_2,gamma_3):
K_pp_K = np.dot(Kmmipsi1[:,i:(i+1)],Kmmipsi1[:,i:(i+1)].T)
_dpsi1 = (-V_n**2 - alpha_n + 2.*gamma_k - gamma_n**2) * Kmmipsi1.T[i:(i+1),:]
_dKmm = .5*(V_n**2 + alpha_n + gamma_n**2 - 2.*gamma_k) * K_pp_K #Diag_dD_dKmm
self._dpsi1_dtheta += self.kern.dK_dtheta(_dpsi1,self.X[i:i+1,:],self.Z)
self._dKmm_dtheta += self.kern.dK_dtheta(_dKmm,self.Z)
self._dKmm_dX += self.kern.dK_dX(_dKmm ,self.Z)
self._dpsi1_dX += self.kern.dK_dX(_dpsi1.T,self.Z,self.X[i:i+1,:])
# the partial derivative vector for the likelihood
if self.likelihood.num_params == 0:
# save computation here.
self.partial_for_likelihood = None
elif self.likelihood.is_heteroscedastic:
raise NotImplementedError, "heteroscedatic derivates not implemented."
else:
# likelihood is not heterscedatic
dbstar_dnoise = self.likelihood.precision * (self.beta_star**2 * self.Diag0[:,None] - self.beta_star)
Lmi_psi1 = mdot(self.Lmi,self.psi1)
LBiLmipsi1 = np.dot(self.LBi,Lmi_psi1)
aux_0 = np.dot(self._LBi_Lmi_psi1V.T,LBiLmipsi1)
aux_1 = self.likelihood.Y.T * np.dot(self._LBi_Lmi_psi1V.T,LBiLmipsi1)
aux_2 = np.dot(LBiLmipsi1.T,self._LBi_Lmi_psi1V)
dA_dnoise = 0.5 * self.input_dim * (dbstar_dnoise/self.beta_star).sum() - 0.5 * self.input_dim * np.sum(self.likelihood.Y**2 * dbstar_dnoise)
dC_dnoise = -0.5 * np.sum(mdot(self.LBi.T,self.LBi,Lmi_psi1) * Lmi_psi1 * dbstar_dnoise.T)
dD_dnoise_1 = mdot(self.V_star*LBiLmipsi1.T,LBiLmipsi1*dbstar_dnoise.T*self.likelihood.Y.T)
alpha = mdot(LBiLmipsi1,self.V_star)
alpha_ = mdot(LBiLmipsi1.T,alpha)
dD_dnoise_2 = -0.5 * self.input_dim * np.sum(alpha_**2 * dbstar_dnoise )
dD_dnoise_1 = mdot(self.V_star.T,self.psi1.T,self.Lmi.T,self.LBi.T,self.LBi,self.Lmi,self.psi1,dbstar_dnoise*self.likelihood.Y)
dD_dnoise_2 = 0.5*mdot(self.V_star.T,self.psi1.T,Hi,self.psi1,dbstar_dnoise*self.psi1.T,Hi,self.psi1,self.V_star)
dD_dnoise = dD_dnoise_1 + dD_dnoise_2
self.partial_for_likelihood = dA_dnoise + dC_dnoise + dD_dnoise
def log_likelihood(self):
""" Compute the (lower bound on the) log marginal likelihood """
A = -0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.beta_star)) - 0.5 * np.sum(self.V_star * self.likelihood.Y)
C = -self.output_dim * (np.sum(np.log(np.diag(self.LB))))
D = 0.5 * np.sum(np.square(self._LBi_Lmi_psi1V))
return A + C + D + self.likelihood.Z
def _log_likelihood_gradients(self):
pass
return np.hstack((self.dL_dZ().flatten(), self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))
def dL_dtheta(self):
dL_dtheta = self.kern.dKdiag_dtheta(self._dL_dpsi0,self.X)
dL_dtheta += self.kern.dK_dtheta(self._dL_dpsi1,self.X,self.Z)
dL_dtheta += self.kern.dK_dtheta(self._dL_dKmm,X=self.Z)
dL_dtheta += self._dKmm_dtheta
dL_dtheta += self._dpsi1_dtheta
return dL_dtheta
def dL_dZ(self):
dL_dZ = self.kern.dK_dX(self._dL_dpsi1.T,self.Z,self.X)
dL_dZ += self.kern.dK_dX(self._dL_dKmm,X=self.Z)
dL_dZ += self._dpsi1_dX
dL_dZ += self._dKmm_dX
return dL_dZ
def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
assert X_variance_new is None, "FITC model is not defined for handling uncertain inputs."
if self.likelihood.is_heteroscedastic:
Iplus_Dprod_i = 1./(1.+ self.Diag0 * self.likelihood.precision.flatten())
self.Diag = self.Diag0 * Iplus_Dprod_i
self.P = Iplus_Dprod_i[:,None] * self.psi1.T
self.RPT0 = np.dot(self.Lmi,self.psi1)
self.L = np.linalg.cholesky(np.eye(self.num_inducing) + np.dot(self.RPT0,((1. - Iplus_Dprod_i)/self.Diag0)[:,None]*self.RPT0.T))
self.R,info = dtrtrs(self.L,self.Lmi,lower=1)
self.RPT = np.dot(self.R,self.P.T)
self.Sigma = np.diag(self.Diag) + np.dot(self.RPT.T,self.RPT)
self.w = self.Diag * self.likelihood.v_tilde
self.Gamma = np.dot(self.R.T, np.dot(self.RPT,self.likelihood.v_tilde))
self.mu = self.w + np.dot(self.P,self.Gamma)
"""
Make a prediction for the generalized FITC model
Arguments
---------
X : Input prediction data - Nx1 numpy array (floats)
"""
# q(u|f) = N(u| R0i*mu_u*f, R0i*C*R0i.T)
# Ci = I + (RPT0)Di(RPT0).T
# C = I - [RPT0] * (input_dim+[RPT0].T*[RPT0])^-1*[RPT0].T
# = I - [RPT0] * (input_dim + self.Qnn)^-1 * [RPT0].T
# = I - [RPT0] * (U*U.T)^-1 * [RPT0].T
# = I - V.T * V
U = np.linalg.cholesky(np.diag(self.Diag0) + self.Qnn)
V,info = dtrtrs(U,self.RPT0.T,lower=1)
C = np.eye(self.num_inducing) - np.dot(V.T,V)
mu_u = np.dot(C,self.RPT0)*(1./self.Diag0[None,:])
#self.C = C
#self.RPT0 = np.dot(self.R0,self.Knm.T) P0.T
#self.mu_u = mu_u
#self.U = U
# q(u|y) = N(u| R0i*mu_H,R0i*Sigma_H*R0i.T)
mu_H = np.dot(mu_u,self.mu)
self.mu_H = mu_H
Sigma_H = C + np.dot(mu_u,np.dot(self.Sigma,mu_u.T))
# q(f_star|y) = N(f_star|mu_star,sigma2_star)
Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
KR0T = np.dot(Kx.T,self.Lmi.T)
mu_star = np.dot(KR0T,mu_H)
if full_cov:
Kxx = self.kern.K(Xnew,which_parts=which_parts)
var = Kxx + np.dot(KR0T,np.dot(Sigma_H - np.eye(self.num_inducing),KR0T.T))
else:
Kxx = self.kern.Kdiag(Xnew,which_parts=which_parts)
var = (Kxx + np.sum(KR0T.T*np.dot(Sigma_H - np.eye(self.num_inducing),KR0T.T),0))[:,None]
return mu_star[:,None],var
else:
raise NotImplementedError, "Heteroscedastic case not implemented."
"""
Kx = self.kern.K(self.Z, Xnew)
mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V)
if full_cov:
Kxx = self.kern.K(Xnew)
var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) #NOTE this won't work for plotting
else:
Kxx = self.kern.Kdiag(Xnew)
var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0)
return mu,var[:,None]
"""

GPy/core/gp.py

@@ -1,204 +1,459 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
-import pylab as pb
+import sys
from .. import kern
-from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs
-from ..likelihoods import EP, Laplace
-from gp_base import GPBase
+from model import Model
+from parameterization import ObsAr
+from .. import likelihoods
+from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
+from parameterization.variational import VariationalPosterior
+import logging
+from GPy.util.normalizer import MeanNorm
+logger = logging.getLogger("GP")

-class GP(GPBase):
+class GP(Model):
    """
-    Gaussian Process model for regression and EP
+    General purpose Gaussian process model

    :param X: input observations
+    :param Y: output observations
    :param kernel: a GPy kernel, defaults to rbf+white
    :param likelihood: a GPy likelihood
-    :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
-    :type normalize_X: False|True
+    :param inference_method: The :class:`~GPy.inference.latent_function_inference.LatentFunctionInference` inference method to use for this GP
    :rtype: model object
+    :param Norm normalizer:
+        normalize the outputs Y.
+        Prediction will be un-normalized using this normalizer.
+        If normalizer is None, we will normalize using MeanNorm.
+        If normalizer is False, no normalization will be done.

    .. Note:: Multiple independent outputs are allowed using columns of Y
    """
-    def __init__(self, X, likelihood, kernel, normalize_X=False):
-        GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
-        self.update_likelihood_approximation()
-
-    def _set_params(self, p):
-        new_kern_params = p[:self.kern.num_params_transformed()]
-        new_likelihood_params = p[self.kern.num_params_transformed():]
-        old_likelihood_params = self.likelihood._get_params()
-        self.kern._set_params_transformed(new_kern_params)
-        self.likelihood._set_params_transformed(new_likelihood_params)
-        self.K = self.kern.K(self.X)
-
-        #Re fit likelihood approximation (if it is an approx), as parameters have changed
-        if isinstance(self.likelihood, Laplace):
-            self.likelihood.fit_full(self.K)
-        self.K += self.likelihood.covariance_matrix
-        self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
-
-        # the gradient of the likelihood wrt the covariance matrix
-        if self.likelihood.YYT is None:
-            # alpha = np.dot(self.Ki, self.likelihood.Y)
-            alpha, _ = dpotrs(self.L, self.likelihood.Y, lower=1)
-            self.dL_dK = 0.5 * (tdot(alpha) - self.output_dim * self.Ki)
-        else:
-            # tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki)
-            tmp, _ = dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1)
-            tmp, _ = dpotrs(self.L, np.asfortranarray(tmp.T), lower=1)
-            self.dL_dK = 0.5 * (tmp - self.output_dim * self.Ki)
-        #Adding dZ_dK (0 for a non-approximate likelihood, compensates for
-        #additional gradients of K when log-likelihood has non-zero Z term)
-        self.dL_dK += self.likelihood.dZ_dK
-
-    def _get_params(self):
-        return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params()))
-
-    def _get_param_names(self):
-        return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
-
-    def update_likelihood_approximation(self, **kwargs):
-        """
-        Approximates a non-gaussian likelihood using Expectation Propagation
-
-        For a Gaussian likelihood, no iteration is required:
-        this function does nothing
-        """
-        self.likelihood.restart()
-        self.likelihood.fit_full(self.kern.K(self.X), **kwargs)
-        self._set_params(self._get_params()) # update the GP
-
-    def _model_fit_term(self):
-        """
-        Computes the model fit using YYT if it's available
-        """
-        if self.likelihood.YYT is None:
-            tmp, _ = dtrtrs(self.L, np.asfortranarray(self.likelihood.Y), lower=1)
-            return -0.5 * np.sum(np.square(tmp))
-            # return -0.5 * np.sum(np.square(np.dot(self.Li, self.likelihood.Y)))
-        else:
-            return -0.5 * np.sum(np.multiply(self.Ki, self.likelihood.YYT))
+    def __init__(self, X, Y, kernel, likelihood, inference_method=None, name='gp', Y_metadata=None, normalizer=False):
+        super(GP, self).__init__(name)
+
+        assert X.ndim == 2
+        if isinstance(X, (ObsAr, VariationalPosterior)):
+            self.X = X.copy()
+        else: self.X = ObsAr(X)
+        self.num_data, self.input_dim = self.X.shape
+
+        assert Y.ndim == 2
+        logger.info("initializing Y")
+        if normalizer is True:
+            self.normalizer = MeanNorm()
+        elif normalizer is False:
+            self.normalizer = None
+        else:
+            self.normalizer = normalizer
+
+        if self.normalizer is not None:
+            self.normalizer.scale_by(Y)
+            self.Y_normalized = ObsAr(self.normalizer.normalize(Y))
+            self.Y = Y
+        else:
+            self.Y = ObsAr(Y)
+            self.Y_normalized = self.Y
+        assert Y.shape[0] == self.num_data
+        _, self.output_dim = self.Y.shape
+
+        #TODO: check the type of this is okay?
+        self.Y_metadata = Y_metadata
+
+        assert isinstance(kernel, kern.Kern)
+        #assert self.input_dim == kernel.input_dim
+        self.kern = kernel
+
+        assert isinstance(likelihood, likelihoods.Likelihood)
+        self.likelihood = likelihood
+
+        #find a sensible inference method
+        logger.info("initializing inference method")
+        if inference_method is None:
+            if isinstance(likelihood, likelihoods.Gaussian) or isinstance(likelihood, likelihoods.MixedNoise):
+                inference_method = exact_gaussian_inference.ExactGaussianInference()
+            else:
+                inference_method = expectation_propagation.EP()
+            print "defaulting to ", inference_method, "for latent function inference"
+        self.inference_method = inference_method
+
+        logger.info("adding kernel and likelihood as parameters")
+        self.link_parameter(self.kern)
+        self.link_parameter(self.likelihood)
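A brief sketch of constructing a GP with the new signature (toy data; the RBF kernel and Gaussian likelihood constructors are assumed from the GPy API of this period and are not shown in this hunk):

import numpy as np
import GPy

X = np.random.rand(30, 1)
Y = np.sin(6 * X) + 0.1 * np.random.randn(30, 1)

kernel = GPy.kern.RBF(1)                   # assumed kernel class
likelihood = GPy.likelihoods.Gaussian()    # Gaussian likelihood -> ExactGaussianInference by default
m = GPy.core.GP(X, Y, kernel, likelihood)  # new argument order: X, Y, kernel, likelihood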
def set_XY(self, X=None, Y=None):
"""
Set the input / output data of the model
This is useful if we wish to change our existing data but maintain the same model
:param X: input observations
:type X: np.ndarray
:param Y: output observations
:type Y: np.ndarray
"""
self.update_model(False)
if Y is not None:
if self.normalizer is not None:
self.normalizer.scale_by(Y)
self.Y_normalized = ObsAr(self.normalizer.normalize(Y))
self.Y = Y
else:
self.Y = ObsAr(Y)
self.Y_normalized = self.Y
if X is not None:
if self.X in self.parameters:
# LVM models
if isinstance(self.X, VariationalPosterior):
assert isinstance(X, type(self.X)), "The given X must have the same type as the X in the model!"
self.unlink_parameter(self.X)
self.X = X
self.link_parameters(self.X)
else:
self.unlink_parameter(self.X)
from ..core import Param
self.X = Param('latent mean',X)
self.link_parameters(self.X)
else:
self.X = ObsAr(X)
self.update_model(True)
def set_X(self,X):
"""
Set the input data of the model
:param X: input observations
:type X: np.ndarray
"""
self.set_XY(X=X)
def set_Y(self,Y):
"""
Set the output data of the model
:param X: output observations
:type X: np.ndarray
"""
self.set_XY(Y=Y)
def parameters_changed(self):
"""
Method that is called upon any changes to :class:`~GPy.core.parameterization.param.Param` variables within the model.
In particular in the GP class this method reperforms inference, recalculating the posterior and log marginal likelihood and gradients of the model
.. warning::
This method is not designed to be called manually, the framework is set up to automatically call this method upon changes to parameters, if you call
this method yourself, there may be unexpected consequences.
"""
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.Y_metadata)
self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
self.kern.update_gradients_full(self.grad_dict['dL_dK'], self.X)
    def log_likelihood(self):
        """
-        The log marginal likelihood of the GP.
-
-        For an EP model, can be written as the log likelihood of a regression
-        model for a new variable Y* = v_tilde/tau_tilde, with a covariance
-        matrix K* = K + diag(1./tau_tilde) plus a normalization term.
+        The log marginal likelihood of the model, :math:`p(\mathbf{y})`, this is the objective function of the model being optimised
        """
-        return (-0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) -
-                0.5 * self.output_dim * self.K_logdet + self._model_fit_term() + self.likelihood.Z)
-
-    def _log_likelihood_gradients(self):
-        """
-        The gradient of all parameters.
-
-        Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
-        """
-        return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
+        return self._log_marginal_likelihood

-    def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False):
+    def _raw_predict(self, _Xnew, full_cov=False, kern=None):
        """
-        Internal helper function for making predictions, does not account
-        for normalization or likelihood
-        """
-        Kx = self.kern.K(_Xnew, self.X, which_parts=which_parts).T
-        # KiKx = np.dot(self.Ki, Kx)
-        KiKx, _ = dpotrs(self.L, np.asfortranarray(Kx), lower=1)
-        mu = np.dot(KiKx.T, self.likelihood.Y)
+        For making predictions, does not account for normalization or likelihood
+
+        full_cov is a boolean which defines whether the full covariance matrix
+        of the prediction is computed. If full_cov is False (default), only the
+        diagonal of the covariance is returned.
+
+        .. math::
+            p(f*|X*, X, Y) = \int^{\inf}_{\inf} p(f*|f,X*)p(f|X,Y) df
+                           = N(f*| K_{x*x}(K_{xx} + \Sigma)^{-1}Y, K_{x*x*} - K_{xx*}(K_{xx} + \Sigma)^{-1}K_{xx*}
+            \Sigma := \texttt{Likelihood.variance / Approximate likelihood covariance}
+        """
+        if kern is None:
+            kern = self.kern
+        Kx = kern.K(_Xnew, self.X).T
+        WiKx = np.dot(self.posterior.woodbury_inv, Kx)
+        mu = np.dot(Kx.T, self.posterior.woodbury_vector)
        if full_cov:
-            Kxx = self.kern.K(_Xnew, which_parts=which_parts)
-            var = Kxx - np.dot(KiKx.T, Kx)
+            Kxx = kern.K(_Xnew)
+            var = Kxx - np.dot(Kx.T, WiKx)
        else:
-            Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
-            var = Kxx - np.sum(np.multiply(KiKx, Kx), 0)
-            var = var[:, None]
-        if stop:
-            debug_this # @UndefinedVariable
+            Kxx = kern.Kdiag(_Xnew)
+            var = Kxx - np.sum(WiKx*Kx, 0)
+            var = var.reshape(-1, 1)
+
+        #force mu to be a column vector
+        if len(mu.shape)==1: mu = mu[:,None]
        return mu, var

-    def predict(self, Xnew, which_parts='all', full_cov=False, **likelihood_args):
+    def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None):
        """
        Predict the function(s) at the new point(s) Xnew.

        :param Xnew: The points at which to make a prediction
-        :type Xnew: np.ndarray, Nnew x self.input_dim
-        :param which_parts: specifies which outputs kernel(s) to use in prediction
-        :type which_parts: ('all', list of bools)
-        :param full_cov: whether to return the full covariance matrix, or just the diagonal
+        :type Xnew: np.ndarray (Nnew x self.input_dim)
+        :param full_cov: whether to return the full covariance matrix, or just
+                         the diagonal
        :type full_cov: bool
-        :returns: mean: posterior mean, a Numpy array, Nnew x self.input_dim
-        :returns: var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
-        :returns: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim
+        :param Y_metadata: metadata about the predicting point to pass to the likelihood
+        :param kern: The kernel to use for prediction (defaults to the model
+                     kern). this is useful for examining e.g. subprocesses.
+        :returns: (mean, var, lower_upper):
+            mean: posterior mean, a Numpy array, Nnew x self.input_dim
+            var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
+            lower_upper: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim

        If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew.
        This is to allow for different normalizations of the output dimensions.
        """
-        # normalize X values
-        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
-        mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts)
+        #predict the latent function values
+        mu, var = self._raw_predict(Xnew, full_cov=full_cov, kern=kern)
+        if self.normalizer is not None:
+            mu, var = self.normalizer.inverse_mean(mu), self.normalizer.inverse_variance(var)

        # now push through likelihood
-        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, **likelihood_args)
-        return mean, var, _025pm, _975pm
+        mean, var = self.likelihood.predictive_values(mu, var, full_cov, Y_metadata)
+        return mean, var

-    def _raw_predict_single_output(self, _Xnew, output, which_parts='all', full_cov=False,stop=False):
-        """
-        For a specific output, calls _raw_predict() at the new point(s) _Xnew.
-        This functions calls _add_output_index(), so _Xnew should not have an index column specifying the output.
-        ---------
-        :param Xnew: The points at which to make a prediction
-        :type Xnew: np.ndarray, Nnew x self.input_dim
-        :param output: output to predict
-        :type output: integer in {0,..., output_dim-1}
-        :param which_parts: specifies which outputs kernel(s) to use in prediction
-        :type which_parts: ('all', list of bools)
-        :param full_cov: whether to return the full covariance matrix, or just the diagonal
-        .. Note:: For multiple non-independent outputs models only.
-        """
-        _Xnew = self._add_output_index(_Xnew, output)
-        return self._raw_predict(_Xnew, which_parts=which_parts,full_cov=full_cov, stop=stop)
+    def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None):
+        """
+        Get the predictive quantiles around the prediction at X
+
+        :param X: The points at which to make a prediction
+        :type X: np.ndarray (Xnew x self.input_dim)
+        :param quantiles: tuple of quantiles, default is (2.5, 97.5) which is the 95% interval
+        :type quantiles: tuple
+        :returns: list of quantiles for each X and predictive quantiles for interval combination
+        :rtype: [np.ndarray (Xnew x self.input_dim), np.ndarray (Xnew x self.input_dim)]
+        """
+        m, v = self._raw_predict(X, full_cov=False)
+        if self.normalizer is not None:
+            m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
+        return self.likelihood.predictive_quantiles(m, v, quantiles, Y_metadata)

-    def predict_single_output(self, Xnew,output=0, which_parts='all', full_cov=False, likelihood_args=dict()):
-        """
-        For a specific output, calls predict() at the new point(s) Xnew.
-        This functions calls _add_output_index(), so Xnew should not have an index column specifying the output.
-
-        :param Xnew: The points at which to make a prediction
-        :type Xnew: np.ndarray, Nnew x self.input_dim
-        :param which_parts: specifies which outputs kernel(s) to use in prediction
-        :type which_parts: ('all', list of bools)
-        :param full_cov: whether to return the full covariance matrix, or just the diagonal
-        :type full_cov: bool
-        :returns: mean: posterior mean, a Numpy array, Nnew x self.input_dim
-        :returns: var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
-        :returns: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim
-        .. Note:: For multiple non-independent outputs models only.
-        """
-        Xnew = self._add_output_index(Xnew, output)
-        return self.predict(Xnew, which_parts=which_parts, full_cov=full_cov, likelihood_args=likelihood_args)
+    def predictive_gradients(self, Xnew):
+        """
+        Compute the derivatives of the latent function with respect to X*
+
+        Given a set of points at which to predict X* (size [N*,Q]), compute the
+        derivatives of the mean and variance. Resulting arrays are sized:
+            dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP (usually one).
+
+            dv_dX* -- [N*, Q], (since all outputs have the same variance)
+
+        :param X: The points at which to get the predictive gradients
+        :type X: np.ndarray (Xnew x self.input_dim)
+        :returns: dmu_dX, dv_dX
+        :rtype: [np.ndarray (N*, Q ,D), np.ndarray (N*,Q) ]
+        """
+        dmu_dX = np.empty((Xnew.shape[0],Xnew.shape[1],self.output_dim))
+        for i in range(self.output_dim):
+            dmu_dX[:,:,i] = self.kern.gradients_X(self.posterior.woodbury_vector[:,i:i+1].T, Xnew, self.X)
+
+        # gradients wrt the diagonal part k_{xx}
+        dv_dX = self.kern.gradients_X(np.eye(Xnew.shape[0]), Xnew)
+        #grads wrt 'Schur' part K_{xf}K_{ff}^{-1}K_{fx}
+        alpha = -2.*np.dot(self.kern.K(Xnew, self.X),self.posterior.woodbury_inv)
+        dv_dX += self.kern.gradients_X(alpha, Xnew, self.X)
+        return dmu_dX, dv_dX

-    def getstate(self):
-        return GPBase.getstate(self)
-
-    def setstate(self, state):
-        GPBase.setstate(self, state)
-        self._set_params(self._get_params())
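Continuing that sketch, the reworked prediction API returns a (mean, variance) pair, with quantiles and input gradients available from the new methods:

Xtest = np.linspace(0, 1, 5)[:, None]
mean, var = m.predict(Xtest)                    # pushed through the likelihood
lower, upper = m.predict_quantiles(Xtest)       # default quantiles (2.5, 97.5)
dmu_dX, dv_dX = m.predictive_gradients(Xtest)   # shapes (5, 1, 1) and (5, 1)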
def posterior_samples_f(self,X,size=10, full_cov=True):
"""
Samples the posterior GP at the points X.
:param X: The points at which to take the samples.
:type X: np.ndarray (Nnew x self.input_dim)
:param size: the number of a posteriori samples.
:type size: int.
:param full_cov: whether to return the full covariance matrix, or just the diagonal.
:type full_cov: bool.
:returns: Ysim: set of simulations
:rtype: np.ndarray (N x samples)
"""
m, v = self._raw_predict(X, full_cov=full_cov)
if self.normalizer is not None:
m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
v = v.reshape(m.size,-1) if len(v.shape)==3 else v
if not full_cov:
Ysim = np.random.multivariate_normal(m.flatten(), np.diag(v.flatten()), size).T
else:
Ysim = np.random.multivariate_normal(m.flatten(), v, size).T
return Ysim
def posterior_samples(self, X, size=10, full_cov=False, Y_metadata=None):
"""
Samples the posterior GP at the points X.
:param X: the points at which to take the samples.
:type X: np.ndarray (Nnew x self.input_dim.)
:param size: the number of a posteriori samples.
:type size: int.
:param full_cov: whether to return the full covariance matrix, or just the diagonal.
:type full_cov: bool.
:param noise_model: for mixed noise likelihood, the noise model to use in the samples.
:type noise_model: integer.
:returns: Ysim: set of simulations, a Numpy array (N x samples).
"""
Ysim = self.posterior_samples_f(X, size, full_cov=full_cov)
Ysim = self.likelihood.samples(Ysim, Y_metadata)
return Ysim
def plot_f(self, plot_limits=None, which_data_rows='all',
which_data_ycols='all', fixed_inputs=[],
levels=20, samples=0, fignum=None, ax=None, resolution=None,
plot_raw=True,
linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx'):
"""
Plot the GP's view of the world, where the data is normalized and before applying a likelihood.
This is a call to plot with plot_raw=True.
Data will not be plotted in this, as the GP's view of the world
may live in another space, or units then the data.
Can plot only part of the data and part of the posterior functions
using which_data_rowsm which_data_ycols.
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
:type plot_limits: np.array
:param which_data_rows: which of the training data to plot (default all)
:type which_data_rows: 'all' or a slice object to slice model.X, model.Y
:param which_data_ycols: when the data has several columns (independant outputs), only plot these
:type which_data_ycols: 'all' or a list of integers
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
:type fixed_inputs: a list of tuples
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
:type resolution: int
:param levels: number of levels to plot in a contour plot.
:param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
:type levels: int
:param samples: the number of a posteriori samples to plot
:type samples: int
:param fignum: figure to plot on.
:type fignum: figure number
:param ax: axes to plot on.
:type ax: axes handle
:param linecol: color of line to plot [Tango.colorsHex['darkBlue']]
:type linecol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib
:param fillcol: color of fill [Tango.colorsHex['lightBlue']]
:type fillcol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib
:param Y_metadata: additional data associated with Y which may be needed
:type Y_metadata: dict
:param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx')
:type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib.
"""
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ..plotting.matplot_dep import models_plots
kw = {}
if linecol is not None:
kw['linecol'] = linecol
if fillcol is not None:
kw['fillcol'] = fillcol
return models_plots.plot_fit(self, plot_limits, which_data_rows,
which_data_ycols, fixed_inputs,
levels, samples, fignum, ax, resolution,
plot_raw=plot_raw, Y_metadata=Y_metadata,
data_symbol=data_symbol, **kw)
def plot(self, plot_limits=None, which_data_rows='all',
which_data_ycols='all', fixed_inputs=[],
levels=20, samples=0, fignum=None, ax=None, resolution=None,
plot_raw=False,
linecol=None,fillcol=None, Y_metadata=None, data_symbol='kx'):
"""
Plot the posterior of the GP.
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
- In two dimsensions, a contour-plot shows the mean predicted function
- In higher dimensions, use fixed_inputs to plot the GP with some of the inputs fixed.
Can plot only part of the data and part of the posterior functions
using which_data_rowsm which_data_ycols.
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
:type plot_limits: np.array
:param which_data_rows: which of the training data to plot (default all)
:type which_data_rows: 'all' or a slice object to slice model.X, model.Y
:param which_data_ycols: when the data has several columns (independant outputs), only plot these
:type which_data_ycols: 'all' or a list of integers
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
:type fixed_inputs: a list of tuples
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
:type resolution: int
:param levels: number of levels to plot in a contour plot.
:param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
:type levels: int
:param samples: the number of a posteriori samples to plot
:type samples: int
:param fignum: figure to plot on.
:type fignum: figure number
:param ax: axes to plot on.
:type ax: axes handle
:param linecol: color of line to plot [Tango.colorsHex['darkBlue']]
:type linecol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib
:param fillcol: color of fill [Tango.colorsHex['lightBlue']]
:type fillcol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib
:param Y_metadata: additional data associated with Y which may be needed
:type Y_metadata: dict
:param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx')
:type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib.
"""
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ..plotting.matplot_dep import models_plots
kw = {}
if linecol is not None:
kw['linecol'] = linecol
if fillcol is not None:
kw['fillcol'] = fillcol
return models_plots.plot_fit(self, plot_limits, which_data_rows,
which_data_ycols, fixed_inputs,
levels, samples, fignum, ax, resolution,
plot_raw=plot_raw, Y_metadata=Y_metadata,
data_symbol=data_symbol, **kw)
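Both plotting entry points above delegate to models_plots.plot_fit; a usage sketch for the toy model from the earlier example (requires matplotlib):

m.plot()     # posterior prediction with the data overlaid
m.plot_f()   # the GP's latent view, before the likelihood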
def input_sensitivity(self, summarize=True):
"""
Returns the sensitivity for each dimension of this model
"""
return self.kern.input_sensitivity(summarize=summarize)
def optimize(self, optimizer=None, start=None, **kwargs):
"""
Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
kwargs are passed to the optimizer. They can be:
:param max_f_eval: maximum number of function evaluations
:type max_f_eval: int
:messages: whether to display during optimisation
:type messages: bool
:param optimizer: which optimizer to use (defaults to self.preferred optimizer), a range of optimisers can be found in :module:`~GPy.inference.optimization`, they include 'scg', 'lbfgs', 'tnc'.
:type optimizer: string
"""
self.inference_method.on_optimization_start()
try:
super(GP, self).optimize(optimizer, start, **kwargs)
except KeyboardInterrupt:
print "KeyboardInterrupt caught, calling on_optimization_end() to round things up"
self.inference_method.on_optimization_end()
raise
def infer_newX(self, Y_new, optimize=True, ):
"""
Infer the distribution of X for the new observed data *Y_new*.
:param Y_new: the new observed data for inference
:type Y_new: numpy.ndarray
:param optimize: whether to optimize the location of new X (True by default)
:type optimize: boolean
:return: a tuple containing the posterior estimation of X and the model that optimize X
:rtype: (:class:`~GPy.core.parameterization.variational.VariationalPosterior` or numpy.ndarray, :class:`~GPy.core.model.Model`)
"""
from ..inference.latent_function_inference.inferenceX import infer_newX
return infer_newX(self, Y_new, optimize=optimize)
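The sampling and optimisation methods defined above can be exercised as follows (sample sizes are illustrative; the optimizer defaults to the model's preferred_optimizer, 'bfgs' in the Model class further down):

m.optimize()
F_samples = m.posterior_samples_f(Xtest, size=3)   # draws of the latent function
Y_samples = m.posterior_samples(Xtest, size=3)     # draws pushed through the likelihood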

GPy/core/gp_base.py

@@ -1,276 +0,0 @@
import numpy as np
from .. import kern
from ..util.plot import gpplot, Tango, x_frame1D, x_frame2D
import pylab as pb
from GPy.core.model import Model
import warnings
from ..likelihoods import Gaussian, Gaussian_Mixed_Noise
class GPBase(Model):
"""
Gaussian process base model for holding shared behaviour between
sparse_GP and GP models, and potentially other models in the future.
Here we define some functions that are use
"""
def __init__(self, X, likelihood, kernel, normalize_X=False):
if len(X.shape)==1:
X = X.reshape(-1,1)
warnings.warn("One dimension output (N,) being reshaped to (N,1)")
self.X = X
assert len(self.X.shape) == 2, "too many dimensions for X input"
self.num_data, self.input_dim = self.X.shape
assert isinstance(kernel, kern.kern)
self.kern = kernel
self.likelihood = likelihood
assert self.X.shape[0] == self.likelihood.data.shape[0]
self.num_data, self.output_dim = self.likelihood.data.shape
if normalize_X:
self._Xoffset = X.mean(0)[None, :]
self._Xscale = X.std(0)[None, :]
self._Xscale[np.where(self._Xscale==0)] = 1
self.X = (X.copy() - self._Xoffset) / self._Xscale
else:
self._Xoffset = np.zeros((1, self.input_dim))
self._Xscale = np.ones((1, self.input_dim))
super(GPBase, self).__init__()
# Model.__init__(self)
# All leaf nodes should call self._set_params(self._get_params()) at
# the end
def posterior_samples_f(self,X,size=10,which_parts='all'):
"""
Samples the posterior GP at the points X.
:param X: The points at which to take the samples.
:type X: np.ndarray, Nnew x self.input_dim.
:param size: the number of a posteriori samples to plot.
:type size: int.
:param which_parts: which of the kernel functions to plot (additively).
:type which_parts: 'all', or list of bools.
:param full_cov: whether to return the full covariance matrix, or just the diagonal.
:type full_cov: bool.
:returns: Ysim: set of simulations, a Numpy array (N x samples).
"""
m, v = self._raw_predict(X, which_parts=which_parts, full_cov=True)
v = v.reshape(m.size,-1) if len(v.shape)==3 else v
Ysim = np.random.multivariate_normal(m.flatten(), v, size).T
return Ysim
def posterior_samples(self,X,size=10,which_parts='all',noise_model=None):
"""
Samples the posterior GP at the points X.
:param X: the points at which to take the samples.
:type X: np.ndarray, Nnew x self.input_dim.
:param size: the number of a posteriori samples to plot.
:type size: int.
:param which_parts: which of the kernel functions to plot (additively).
:type which_parts: 'all', or list of bools.
:param full_cov: whether to return the full covariance matrix, or just the diagonal.
:type full_cov: bool.
:param noise_model: for mixed noise likelihood, the noise model to use in the samples.
:type noise_model: integer.
:returns: Ysim: set of simulations, a Numpy array (N x samples).
"""
Ysim = self.posterior_samples_f(X, size, which_parts=which_parts)
if isinstance(self.likelihood,Gaussian):
noise_std = np.sqrt(self.likelihood._get_params())
Ysim += np.random.normal(0,noise_std,Ysim.shape)
elif isinstance(self.likelihood,Gaussian_Mixed_Noise):
assert noise_model is not None, "A noise model must be specified."
noise_std = np.sqrt(self.likelihood._get_params()[noise_model])
Ysim += np.random.normal(0,noise_std,Ysim.shape)
else:
Ysim = self.likelihood.noise_model.samples(Ysim)
return Ysim
def plot_f(self, *args, **kwargs):
"""
Plot the GP's view of the world, where the data is normalized and before applying a likelihood.
This is a convenience function: we simply call self.plot with the
argument use_raw_predict set True. All args and kwargs are passed on to
plot.
see also: gp_base.plot
"""
kwargs['plot_raw'] = True
self.plot(*args, **kwargs)
def plot(self, plot_limits=None, which_data_rows='all',
which_data_ycols='all', which_parts='all', fixed_inputs=[],
levels=20, samples=0, fignum=None, ax=None, resolution=None,
plot_raw=False,
linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
"""
Plot the posterior of the GP.
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
- In two dimsensions, a contour-plot shows the mean predicted function
- In higher dimensions, use fixed_inputs to plot the GP with some of the inputs fixed.
Can plot only part of the data and part of the posterior functions
using which_data_rowsm which_data_ycols and which_parts
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
:type plot_limits: np.array
:param which_data_rows: which of the training data to plot (default all)
:type which_data_rows: 'all' or a slice object to slice self.X, self.Y
:param which_data_ycols: when the data has several columns (independant outputs), only plot these
:type which_data_rows: 'all' or a list of integers
:param which_parts: which of the kernel functions to plot (additively)
:type which_parts: 'all', or list of bools
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
:type fixed_inputs: a list of tuples
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
:type resolution: int
:param levels: number of levels to plot in a contour plot.
:type levels: int
:param samples: the number of a posteriori samples to plot
:type samples: int
:param fignum: figure to plot on.
:type fignum: figure number
:param ax: axes to plot on.
:type ax: axes handle
:type output: integer (first output is 0)
:param linecol: color of line to plot.
:type linecol:
:param fillcol: color of fill
:param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
"""
#deal with optional arguments
if which_data_rows == 'all':
which_data_rows = slice(None)
if which_data_ycols == 'all':
which_data_ycols = np.arange(self.output_dim)
if len(which_data_ycols)==0:
raise ValueError('No data selected for plotting')
if ax is None:
fig = pb.figure(num=fignum)
ax = fig.add_subplot(111)
#work out what the inputs are for plotting (1D or 2D)
fixed_dims = np.array([i for i,v in fixed_inputs])
free_dims = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
#one dimensional plotting
if len(free_dims) == 1:
#define the frame on which to plot
resolution = resolution or 200
Xu = self.X * self._Xscale + self._Xoffset #NOTE self.X are the normalized values now
Xnew, xmin, xmax = x_frame1D(Xu[:,free_dims], plot_limits=plot_limits)
Xgrid = np.empty((Xnew.shape[0],self.input_dim))
Xgrid[:,free_dims] = Xnew
for i,v in fixed_inputs:
Xgrid[:,i] = v
#make a prediction on the frame and plot it
if plot_raw:
m, v = self._raw_predict(Xgrid, which_parts=which_parts)
lower = m - 2*np.sqrt(v)
upper = m + 2*np.sqrt(v)
Y = self.likelihood.Y
else:
m, v, lower, upper = self.predict(Xgrid, which_parts=which_parts, sampling=False) #Compute the exact mean
m_, v_, lower, upper = self.predict(Xgrid, which_parts=which_parts, sampling=True, num_samples=15000) #Apporximate the percentiles
Y = self.likelihood.data
for d in which_data_ycols:
gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol)
ax.plot(Xu[which_data_rows,free_dims], Y[which_data_rows, d], 'kx', mew=1.5)
#optionally plot some samples
if samples: #NOTE not tested with fixed_inputs
Ysim = self.posterior_samples(Xgrid, samples, which_parts=which_parts)
for yi in Ysim.T:
ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
#ax.plot(Xnew, yi[:,None], marker='x', linestyle='--',color=Tango.colorsHex['darkBlue']) #TODO apply this line for discrete outputs.
#set the limits of the plot to some sensible values
ymin, ymax = min(np.append(Y[which_data_rows, which_data_ycols].flatten(), lower)), max(np.append(Y[which_data_rows, which_data_ycols].flatten(), upper))
ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
#2D plotting
elif len(free_dims) == 2:
#define the frame for plotting on
resolution = resolution or 50
Xu = self.X * self._Xscale + self._Xoffset #NOTE self.X are the normalized values now
Xnew, _, _, xmin, xmax = x_frame2D(Xu[:,free_dims], plot_limits, resolution)
Xgrid = np.empty((Xnew.shape[0],self.input_dim))
Xgrid[:,free_dims] = Xnew
for i,v in fixed_inputs:
Xgrid[:,i] = v
x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
#predict on the frame and plot
if plot_raw:
m, _ = self._raw_predict(Xgrid, which_parts=which_parts)
Y = self.likelihood.Y
else:
m, _, _, _ = self.predict(Xgrid, which_parts=which_parts,sampling=False)
Y = self.likelihood.data
for d in which_data_ycols:
m_d = m[:,d].reshape(resolution, resolution).T
contour = ax.contour(x, y, m_d, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
scatter = ax.scatter(self.X[which_data_rows, free_dims[0]], self.X[which_data_rows, free_dims[1]], 40, Y[which_data_rows, d], cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
#set the limits of the plot to some sensible values
ax.set_xlim(xmin[0], xmax[0])
ax.set_ylim(xmin[1], xmax[1])
if samples:
warnings.warn("Samples are rather difficult to plot for 2D inputs...")
return contour, scatter
else:
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
def getstate(self):
"""
Get the curent state of the class. This is only used to efficiently
pickle the model. See also self.setstate
"""
return Model.getstate(self) + [self.X,
self.num_data,
self.input_dim,
self.kern,
self.likelihood,
self.output_dim,
self._Xoffset,
self._Xscale]
def setstate(self, state):
"""
Set the state of the model. Used for efficient pickling
"""
self._Xscale = state.pop()
self._Xoffset = state.pop()
self.output_dim = state.pop()
self.likelihood = state.pop()
self.kern = state.pop()
self.input_dim = state.pop()
self.num_data = state.pop()
self.X = state.pop()
Model.setstate(self, state)
def log_predictive_density(self, x_test, y_test):
"""
Calculation of the log predictive density
.. math:
p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
:param x_test: test observations (x_{*})
:type x_test: (Nx1) array
:param y_test: test observations (y_{*})
:type y_test: (Nx1) array
"""
mu_star, var_star = self._raw_predict(x_test)
return self.likelihood.log_predictive_density(y_test, mu_star, var_star)

GPy/core/mapping.py

@@ -1,24 +1,19 @@
-# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2013,2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-from ..util.plot import Tango, x_frame1D, x_frame2D
-from parameterized import Parameterized
+import sys
+from parameterization import Parameterized
import numpy as np
-import pylab as pb

class Mapping(Parameterized):
    """
    Base model for shared behavior between models that can act like a mapping.
    """
-    def __init__(self, input_dim, output_dim):
+    def __init__(self, input_dim, output_dim, name='mapping'):
        self.input_dim = input_dim
        self.output_dim = output_dim
-        super(Mapping, self).__init__()
-        # Model.__init__(self)
-        # All leaf nodes should call self._set_params(self._get_params()) at
-        # the end
+        super(Mapping, self).__init__(name=name)

    def f(self, X):
        raise NotImplementedError
@@ -35,7 +30,8 @@ class Mapping(Parameterized):
        raise NotImplementedError

    def df_dtheta(self, dL_df, X):
-        """The gradient of the outputs of the multi-layer perceptron with respect to each of the parameters.
+        """The gradient of the outputs of the mapping with respect to each of the parameters.

        :param dL_df: gradient of the objective with respect to the function.
        :type dL_df: ndarray (num_data x output_dim)
        :param X: input locations where the function is evaluated.
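A minimal, hypothetical subclass illustrating the f / df_dtheta interface of the Mapping base class above (the class and import path are assumptions for illustration only):

import numpy as np
from GPy.core.mapping import Mapping   # module path assumed

class IdentityMapping(Mapping):
    """Maps inputs to themselves; it has no parameters."""
    def __init__(self, dim, name='identity_mapping'):
        super(IdentityMapping, self).__init__(input_dim=dim, output_dim=dim, name=name)

    def f(self, X):
        return X

    def df_dtheta(self, dL_df, X):
        return np.zeros(0)   # no parameters, so an empty gradient vector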
@@ -43,85 +39,42 @@ class Mapping(Parameterized):
        :returns: Matrix containing gradients with respect to parameters of each output for each input data.
        :rtype: ndarray (num_params length)
        """
        raise NotImplementedError

-    def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue']):
+    def plot(self, *args):
        """
+        Plot the mapping.
+
        Plots the mapping associated with the model.
          - In one dimension, the function is plotted.
-          - In two dimsensions, a contour-plot shows the function
+          - In two dimensions, a contour-plot shows the function
          - In higher dimensions, we've not implemented this yet !TODO!

        Can plot only part of the data and part of the posterior functions
        using which_data and which_functions

-        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
-        :type plot_limits: np.array
+        This is a convenience function: arguments are passed to
+        GPy.plotting.matplot_dep.models_plots.plot_mapping
:param which_data: which if the training data to plot (default all)
:type which_data: 'all' or a slice object to slice self.X, self.Y
:param which_parts: which of the kernel functions to plot (additively)
:type which_parts: 'all', or list of bools
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
:type resolution: int
:param levels: number of levels to plot in a contour plot.
:type levels: int
:param samples: the number of a posteriori samples to plot
:type samples: int
:param fignum: figure to plot on.
:type fignum: figure number
:param ax: axes to plot on.
:type ax: axes handle
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
:type fixed_inputs: a list of tuples
:param linecol: color of line to plot.
:type linecol:
:param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
""" """
# TODO include samples
if which_data == 'all':
which_data = slice(None)
if ax is None:
fig = pb.figure(num=fignum)
ax = fig.add_subplot(111)
plotdims = self.input_dim - len(fixed_inputs)
if plotdims == 1:
Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
fixed_dims = np.array([i for i,v in fixed_inputs])
freedim = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
Xnew, xmin, xmax = x_frame1D(Xu[:,freedim], plot_limits=plot_limits)
Xgrid = np.empty((Xnew.shape[0],self.input_dim))
Xgrid[:,freedim] = Xnew
for i,v in fixed_inputs:
Xgrid[:,i] = v
f = self.predict(Xgrid, which_parts=which_parts)
for d in range(y.shape[1]):
ax.plot(Xnew, f[:, d], edgecol=linecol)
elif self.X.shape[1] == 2:
resolution = resolution or 50
Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
f = self.predict(Xnew, which_parts=which_parts)
m = m.reshape(resolution, resolution).T
ax.contour(x, y, f, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable
ax.set_xlim(xmin[0], xmax[0])
ax.set_ylim(xmin[1], xmax[1])
if "matplotlib" in sys.modules:
from ..plotting.matplot_dep import models_plots
models_plots.plot_mapping(self, *args)
else: else:
raise NotImplementedError, "Cannot define a frame with more than two input dimensions" raise NameError, "matplotlib package has not been imported."
from GPy.core.model import Model class Bijective_mapping(Mapping):
"""
This is a mapping that is bijective, i.e. you can go from X to f and
also back from f to X. The inverse mapping is called g().
"""
def __init__(self, input_dim, output_dim, name='bijective_mapping'):
super(Bijective_mapping, self).__init__(name=name)
def g(self, f):
"""Inverse mapping from output domain of the function to the inputs."""
raise NotImplementedError
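# --- Illustrative sketch (not part of GPy): a concrete bijective mapping.
# It only shows what the interface above is meant to express, namely that
# g() inverts f(). The ExpMapping name and the constructor arguments are
# hypothetical and assume Mapping accepts (input_dim, output_dim, name).
import numpy as np

class ExpMapping(Bijective_mapping):
    def __init__(self, input_dim=1, output_dim=1):
        super(ExpMapping, self).__init__(input_dim, output_dim, name='exp_mapping')

    def f(self, X):
        # forward map: inputs X -> outputs f
        return np.exp(X)

    def g(self, f):
        # inverse map: outputs f -> inputs X, so g(f(X)) recovers X
        return np.log(f)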
from model import Model
class Mapping_check_model(Model): class Mapping_check_model(Model):
""" """
@ -1,182 +1,35 @@
# Copyright (c) 2012, 2013, GPy authors (see AUTHORS.txt). # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
from .. import likelihoods from .. import likelihoods
from ..inference import optimization from ..inference import optimization
from ..util.linalg import jitchol from ..util.misc import opt_wrapper
from GPy.util.misc import opt_wrapper from parameterization import Parameterized
from parameterized import Parameterized
import multiprocessing as mp import multiprocessing as mp
import numpy as np import numpy as np
from GPy.core.domains import POSITIVE, REAL
from numpy.linalg.linalg import LinAlgError from numpy.linalg.linalg import LinAlgError
import itertools
# import numdifftools as ndt # import numdifftools as ndt
class Model(Parameterized): class Model(Parameterized):
_fail_count = 0 # Count of failed optimization steps (see objective) _fail_count = 0 # Count of failed optimization steps (see objective)
_allowed_failures = 10 # number of allowed failures _allowed_failures = 10 # number of allowed failures
def __init__(self):
Parameterized.__init__(self) def __init__(self, name):
self.priors = None super(Model, self).__init__(name) # Parameterized.__init__(self)
self.optimization_runs = [] self.optimization_runs = []
self.sampling_runs = [] self.sampling_runs = []
self.preferred_optimizer = 'scg' self.preferred_optimizer = 'bfgs'
# self._set_params(self._get_params()) has been taken out as it should only be called on leaf nodes from .parameterization.ties_and_remappings import Tie
self.tie = Tie()
self.link_parameter(self.tie, -1)
self.add_observer(self.tie, self.tie._parameters_changed_notification, priority=-500)
def log_likelihood(self): def log_likelihood(self):
raise NotImplementedError, "this needs to be implemented to use the model class" raise NotImplementedError, "this needs to be implemented to use the model class"
def _log_likelihood_gradients(self): def _log_likelihood_gradients(self):
raise NotImplementedError, "this needs to be implemented to use the model class" return self.gradient
def getstate(self):
"""
Get the current state of the class.
Inherited from Parameterized, so add those parameters to the state
:return: list of states from the model.
"""
return Parameterized.getstate(self) + \
[self.priors, self.optimization_runs,
self.sampling_runs, self.preferred_optimizer]
def setstate(self, state):
"""
set state from previous call to getstate
call Parameterized with the rest of the state
:param state: the state of the model.
:type state: list as returned from getstate.
"""
self.preferred_optimizer = state.pop()
self.sampling_runs = state.pop()
self.optimization_runs = state.pop()
self.priors = state.pop()
Parameterized.setstate(self, state)
def set_prior(self, regexp, what):
"""
Sets priors on the model parameters.
**Notes**
Asserts that the prior is suitable for the constraint. If the
wrong constraint is in place, an error is raised. If no
constraint is in place, one is added (warning printed).
For tied parameters, the prior will only be "counted" once, thus
a prior object is only inserted on the first tied index
:param regexp: regular expression of parameters on which priors need to be set.
:type param: string, regexp, or integer array
:param what: prior to set on parameter.
:type what: GPy.core.Prior type
"""
if self.priors is None:
self.priors = [None for i in range(self._get_params().size)]
which = self.grep_param_names(regexp)
# check tied situation
tie_partial_matches = [tie for tie in self.tied_indices if (not set(tie).isdisjoint(set(which))) & (not set(tie) == set(which))]
if len(tie_partial_matches):
raise ValueError, "cannot place prior across partial ties"
tie_matches = [tie for tie in self.tied_indices if set(which) == set(tie) ]
if len(tie_matches) > 1:
raise ValueError, "cannot place prior across multiple ties"
elif len(tie_matches) == 1:
which = which[:1] # just place a prior object on the first parameter
# check constraints are okay
if what.domain is POSITIVE:
constrained_positive_indices = [i for i, t in zip(self.constrained_indices, self.constraints) if t.domain is POSITIVE]
if len(constrained_positive_indices):
constrained_positive_indices = np.hstack(constrained_positive_indices)
else:
constrained_positive_indices = np.zeros(shape=(0,))
bad_constraints = np.setdiff1d(self.all_constrained_indices(), constrained_positive_indices)
assert not np.any(which[:, None] == bad_constraints), "constraint and prior incompatible"
unconst = np.setdiff1d(which, constrained_positive_indices)
if len(unconst):
print "Warning: constraining parameters to be positive:"
print '\n'.join([n for i, n in enumerate(self._get_param_names()) if i in unconst])
print '\n'
self.constrain_positive(unconst)
elif what.domain is REAL:
assert not np.any(which[:, None] == self.all_constrained_indices()), "constraint and prior incompatible"
else:
raise ValueError, "prior not recognised"
# store the prior in a local list
for w in which:
self.priors[w] = what
def get_gradient(self, name, return_names=False):
"""
Get model gradient(s) by name. The name is applied as a regular expression and all parameters that match that regular expression are returned.
:param name: the name of parameters required (as a regular expression).
:type name: regular expression
:param return_names: whether or not to return the names matched (default False)
:type return_names: bool
"""
matches = self.grep_param_names(name)
if len(matches):
if return_names:
return self._log_likelihood_gradients()[matches], np.asarray(self._get_param_names())[matches].tolist()
else:
return self._log_likelihood_gradients()[matches]
else:
raise AttributeError, "no parameter matches %s" % name
def log_prior(self):
"""evaluate the prior"""
if self.priors is not None:
return np.sum([p.lnpdf(x) for p, x in zip(self.priors, self._get_params()) if p is not None])
else:
return 0.
def _log_prior_gradients(self):
"""evaluate the gradients of the priors"""
if self.priors is None:
return 0.
x = self._get_params()
ret = np.zeros(x.size)
[np.put(ret, i, p.lnpdf_grad(xx)) for i, (p, xx) in enumerate(zip(self.priors, x)) if not p is None]
return ret
def _transform_gradients(self, g):
x = self._get_params()
for index, constraint in zip(self.constrained_indices, self.constraints):
g[index] = g[index] * constraint.gradfactor(x[index])
[np.put(g, i, v) for i, v in [(t[0], np.sum(g[t])) for t in self.tied_indices]]
if len(self.tied_indices) or len(self.fixed_indices):
to_remove = np.hstack((self.fixed_indices + [t[1:] for t in self.tied_indices]))
return np.delete(g, to_remove)
else:
return g
def randomize(self):
"""
Randomize the model.
Make this draw from the prior if one exists, else draw from N(0,1)
"""
# first take care of all parameters (from N(0,1))
x = self._get_params_transformed()
x = np.random.randn(x.size)
self._set_params_transformed(x)
# now draw from prior where possible
x = self._get_params()
if self.priors is not None:
[np.put(x, i, p.rvs(1)) for i, p in enumerate(self.priors) if not p is None]
self._set_params(x)
self._set_params_transformed(self._get_params_transformed()) # makes sure all of the tied parameters get the same init (since there's only one prior object...)
def optimize_restarts(self, num_restarts=10, robust=False, verbose=True, parallel=False, num_processes=None, **kwargs): def optimize_restarts(self, num_restarts=10, robust=False, verbose=True, parallel=False, num_processes=None, **kwargs):
""" """
@ -207,10 +60,12 @@ class Model(Parameterized):
:param messages: whether to display during optimisation :param messages: whether to display during optimisation
:type messages: bool :type messages: bool
.. note:: If num_processes is None, the number of workers in the
multiprocessing pool is automatically set to the number of processors
on the current machine.
""" """
initial_parameters = self._get_params_transformed() initial_parameters = self.optimizer_array.copy()
if parallel: if parallel:
try: try:
@ -221,8 +76,8 @@ class Model(Parameterized):
job = pool.apply_async(opt_wrapper, args=(self,), kwds=kwargs) job = pool.apply_async(opt_wrapper, args=(self,), kwds=kwargs)
jobs.append(job) jobs.append(job)
pool.close() # signal that no more data coming in pool.close() # signal that no more data coming in
pool.join() # wait for all the tasks to complete pool.join() # wait for all the tasks to complete
except KeyboardInterrupt: except KeyboardInterrupt:
print "Ctrl+c received, terminating and joining pool." print "Ctrl+c received, terminating and joining pool."
pool.terminate() pool.terminate()
@ -246,11 +101,11 @@ class Model(Parameterized):
if len(self.optimization_runs): if len(self.optimization_runs):
i = np.argmin([o.f_opt for o in self.optimization_runs]) i = np.argmin([o.f_opt for o in self.optimization_runs])
self._set_params_transformed(self.optimization_runs[i].x_opt) self.optimizer_array = self.optimization_runs[i].x_opt
else: else:
self._set_params_transformed(initial_parameters) self.optimizer_array = initial_parameters
def ensure_default_constraints(self): def ensure_default_constraints(self, warning=True):
""" """
Ensure that any variables which should clearly be positive Ensure that any variables which should clearly be positive
have been constrained somehow. The method performs a regular have been constrained somehow. The method performs a regular
@ -258,183 +113,164 @@ class Model(Parameterized):
'variance', 'lengthscale', 'precision' and 'kappa'. If any of 'variance', 'lengthscale', 'precision' and 'kappa'. If any of
these terms are present in the name the parameter is these terms are present in the name the parameter is
constrained positive. constrained positive.
"""
positive_strings = ['variance', 'lengthscale', 'precision', 'decay', 'kappa']
# param_names = self._get_param_names()
currently_constrained = self.all_constrained_indices()
to_make_positive = []
for s in positive_strings:
for i in self.grep_param_names(".*" + s):
if not (i in currently_constrained):
to_make_positive.append(i)
if len(to_make_positive):
self.constrain_positive(np.asarray(to_make_positive))
def objective_function(self, x): DEPRECATED.
"""
raise DeprecationWarning, 'parameters now have default constraints'
def objective_function(self):
"""
The objective function for the given algorithm.
This function is the true objective, which wants to be minimized.
Note that all parameters are already set and in place, so you just need
to return the objective function here.
For probabilistic models this is the negative log_likelihood
(including the MAP prior), so we return it here. If your model is not
probabilistic, just return your objective to minimize here!
"""
return -float(self.log_likelihood()) - self.log_prior()
def objective_function_gradients(self):
"""
The gradients for the objective function for the given algorithm.
The gradients are w.r.t. the *negative* objective function, as
this framework works with *negative* log-likelihoods as a default.
You can find the gradient for the parameters in self.gradient at all times.
This is the place, where gradients get stored for parameters.
This function is the true objective, which wants to be minimized.
Note that all parameters are already set and in place, so you just need
to return the gradient here.
For probabilistic models this is the gradient of the negative log_likelihood
(including the MAP prior), so we return it here. If your model is not
probabilistic, just return your *negative* gradient here!
"""
return -(self._log_likelihood_gradients() + self._log_prior_gradients())
def _grads(self, x):
"""
Gets the gradients from the likelihood and the priors.
Failures are handled robustly. The algorithm will try several times to
return the gradients, and will raise the original exception if
the objective cannot be computed.
:param x: the parameters of the model.
:type x: np.array
"""
try:
# self._set_params_transformed(x)
self.optimizer_array = x
obj_grads = self._transform_gradients(self.objective_function_gradients())
self._fail_count = 0
except (LinAlgError, ZeroDivisionError, ValueError):
if self._fail_count >= self._allowed_failures:
raise
self._fail_count += 1
obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e100, 1e100)
return obj_grads
def _objective(self, x):
""" """
The objective function passed to the optimizer. It combines The objective function passed to the optimizer. It combines
the likelihood and the priors. the likelihood and the priors.
Failures are handled robustly. The algorithm will try several times to Failures are handled robustly. The algorithm will try several times to
return the objective, and will raise the original exception if it return the objective, and will raise the original exception if
the objective cannot be computed. the objective cannot be computed.
:param x: the parameters of the model. :param x: the parameters of the model.
:parameter type: np.array :parameter type: np.array
""" """
try: try:
self._set_params_transformed(x) self.optimizer_array = x
obj = self.objective_function()
self._fail_count = 0 self._fail_count = 0
except (LinAlgError, ZeroDivisionError, ValueError) as e: except (LinAlgError, ZeroDivisionError, ValueError):
if self._fail_count >= self._allowed_failures: if self._fail_count >= self._allowed_failures:
raise e raise
self._fail_count += 1 self._fail_count += 1
return np.inf return np.inf
return -self.log_likelihood() - self.log_prior() return obj
def objective_function_gradients(self, x): def _objective_grads(self, x):
"""
Gets the gradients from the likelihood and the priors.
Failures are handled robustly. The algorithm will try several times to
return the gradients, and will raise the original exception if it
the objective cannot be computed.
:param x: the parameters of the model.
:parameter type: np.array
"""
try: try:
self._set_params_transformed(x) self.optimizer_array = x
obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()) obj_f, obj_grads = self.objective_function(), self._transform_gradients(self.objective_function_gradients())
self._fail_count = 0 self._fail_count = 0
except (LinAlgError, ZeroDivisionError, ValueError) as e: except (LinAlgError, ZeroDivisionError, ValueError):
if self._fail_count >= self._allowed_failures: if self._fail_count >= self._allowed_failures:
raise e raise
self._fail_count += 1
obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100)
return obj_grads
def objective_and_gradients(self, x):
"""
Compute the objective function of the model and the gradient of the model at the point given by x.
:param x: the point at which gradients are to be computed.
:type np.array:
"""
try:
self._set_params_transformed(x)
obj_f = -self.log_likelihood() - self.log_prior()
self._fail_count = 0
obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
except (LinAlgError, ZeroDivisionError, ValueError) as e:
if self._fail_count >= self._allowed_failures:
raise e
self._fail_count += 1 self._fail_count += 1
obj_f = np.inf obj_f = np.inf
obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100) obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e100, 1e100)
return obj_f, obj_grads return obj_f, obj_grads
def optimize(self, optimizer=None, start=None, **kwargs): def optimize(self, optimizer=None, start=None, **kwargs):
""" """
Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors. Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
kwargs are passed to the optimizer. They can be: kwargs are passed to the optimizer. They can be:
:param max_f_eval: maximum number of function evaluations :param max_f_eval: maximum number of function evaluations
:type max_f_eval: int :type max_f_eval: int
:messages: whether to display during optimisation :messages: whether to display during optimisation
:type messages: bool :type messages: bool
:param optimzer: which optimizer to use (defaults to self.preferred optimizer) :param optimizer: which optimizer to use (defaults to self.preferred optimizer)
:type optimzer: string TODO: valid strings? :type optimizer: string
Valid optimizers are:
- 'scg': scaled conjugate gradient method, recommended for stability.
See also GPy.inference.optimization.scg
- 'fmin_tnc': truncated Newton method (see scipy.optimize.fmin_tnc)
- 'simplex': the Nelder-Mead simplex method (see scipy.optimize.fmin),
- 'lbfgsb': the l-bfgs-b method (see scipy.optimize.fmin_l_bfgs_b),
- 'sgd': stochastic gradient decsent (see scipy.optimize.sgd). For experts only!
""" """
if self.is_fixed:
print 'nothing to optimize'
if self.size == 0:
print 'nothing to optimize'
if not self.update_model():
print "setting updates on again"
self.update_model(True)
if start == None:
start = self.optimizer_array
if optimizer is None: if optimizer is None:
optimizer = self.preferred_optimizer optimizer = self.preferred_optimizer
if start == None: if isinstance(optimizer, optimization.Optimizer):
start = self._get_params_transformed() opt = optimizer
opt.model = self
else:
optimizer = optimization.get_optimizer(optimizer)
opt = optimizer(start, model=self, **kwargs)
optimizer = optimization.get_optimizer(optimizer) opt.run(f_fp=self._objective_grads, f=self._objective, fp=self._grads)
opt = optimizer(start, model=self, **kwargs)
opt.run(f_fp=self.objective_and_gradients, f=self.objective_function, fp=self.objective_function_gradients)
self.optimization_runs.append(opt) self.optimization_runs.append(opt)
self._set_params_transformed(opt.x_opt) self.optimizer_array = opt.x_opt
def optimize_SGD(self, momentum=0.1, learning_rate=0.01, iterations=20, **kwargs): def optimize_SGD(self, momentum=0.1, learning_rate=0.01, iterations=20, **kwargs):
# assert self.Y.shape[1] > 1, "SGD only works with D > 1" # assert self.Y.shape[1] > 1, "SGD only works with D > 1"
sgd = SGD.StochasticGD(self, iterations, learning_rate, momentum, **kwargs) # @UndefinedVariable sgd = SGD.StochasticGD(self, iterations, learning_rate, momentum, **kwargs) # @UndefinedVariable
sgd.run() sgd.run()
self.optimization_runs.append(sgd) self.optimization_runs.append(sgd)
def Laplace_covariance(self): def _checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3, df_tolerance=1e-12):
"""return the covariance matrix of a Laplace approximation at the current (stationary) point."""
# TODO add in the prior contributions for MAP estimation
# TODO fix the hessian for tied, constrained and fixed components
if hasattr(self, 'log_likelihood_hessian'):
A = -self.log_likelihood_hessian()
else:
print "numerically calculating Hessian. please be patient!"
x = self._get_params()
def f(x):
self._set_params(x)
return self.log_likelihood()
h = ndt.Hessian(f) # @UndefinedVariable
A = -h(x)
self._set_params(x)
# check for almost zero components on the diagonal which screw up the cholesky
aa = np.nonzero((np.diag(A) < 1e-6) & (np.diag(A) > 0.))[0]
A[aa, aa] = 0.
return A
def Laplace_evidence(self):
"""Returns an estiamte of the model evidence based on the Laplace approximation.
Uses a numerical estimate of the Hessian if none is available analytically."""
A = self.Laplace_covariance()
try:
hld = np.sum(np.log(np.diag(jitchol(A)[0])))
except:
return np.nan
return 0.5 * self._get_params().size * np.log(2 * np.pi) + self.log_likelihood() - hld
def __str__(self):
s = Parameterized.__str__(self).split('\n')
#def __str__(self, names=None):
# if names is None:
# names = self._get_print_names()
#s = Parameterized.__str__(self, names=names).split('\n')
# add priors to the string
if self.priors is not None:
strs = [str(p) if p is not None else '' for p in self.priors]
else:
strs = [''] * len(self._get_params())
# strs = [''] * len(self._get_param_names())
# name_indices = self.grep_param_names("|".join(names))
# strs = np.array(strs)[name_indices]
width = np.array(max([len(p) for p in strs] + [5])) + 4
log_like = self.log_likelihood()
log_prior = self.log_prior()
obj_funct = '\nLog-likelihood: {0:.3e}'.format(log_like)
if len(''.join(strs)) != 0:
obj_funct += ', Log prior: {0:.3e}, LL+prior = {0:.3e}'.format(log_prior, log_like + log_prior)
obj_funct += '\n\n'
s[0] = obj_funct + s[0]
s[0] += "|{h:^{col}}".format(h='prior', col=width)
s[1] += '-' * (width + 1)
for p in range(2, len(strs) + 2):
s[p] += '|{prior:^{width}}'.format(prior=strs[p - 2], width=width)
return '\n'.join(s)
def checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3):
""" """
Check the gradient of the model by comparing to a numerical
estimate. If the verbose flag is passed, individual
components are tested (and printed)
:param verbose: If True, print a "full" checking of each parameter :param verbose: If True, print a "full" checking of each parameter
@ -447,37 +283,54 @@ class Model(Parameterized):
Note:- Note:-
The gradient is considered correct if the ratio of the analytical The gradient is considered correct if the ratio of the analytical
and numerical gradients is within <tolerance> of unity. and numerical gradients is within <tolerance> of unity.
"""
x = self._get_params_transformed().copy() The *dF_ratio* indicates the limit of numerical accuracy of numerical gradients.
If it is too small, e.g., smaller than 1e-12, the numerical gradients are usually
not accurate enough for the tests (shown with blue).
"""
x = self.optimizer_array.copy()
if not verbose: if not verbose:
# just check the global ratio # make sure only to test the selected parameters
if target_param is None:
#choose a random direction to find the linear approximation in transformed_index = range(len(x))
if x.size==2:
dx = step * np.ones(2) # random direction for 2 parameters can fail dure to symmetry
else: else:
dx = step * np.sign(np.random.uniform(-1, 1, x.size)) transformed_index = self._raveled_index_for(target_param)
if self._has_fixes():
indices = np.r_[:self.size]
which = (transformed_index[:, None] == indices[self._fixes_][None, :]).nonzero()
transformed_index = (indices - (~self._fixes_).cumsum())[transformed_index[which[0]]]
if transformed_index.size == 0:
print "No free parameters to check"
return
# just check the global ratio
dx = np.zeros(x.shape)
dx[transformed_index] = step * (np.sign(np.random.uniform(-1, 1, transformed_index.size)) if transformed_index.size != 2 else 1.)
# evaluate around the point x
f1, g1 = self.objective_and_gradients(x + dx) f1 = self._objective(x + dx)
f2, g2 = self.objective_and_gradients(x - dx) f2 = self._objective(x - dx)
gradient = self.objective_function_gradients(x) gradient = self._grads(x)
numerical_gradient = (f1 - f2) / (2 * dx) dx = dx[transformed_index]
global_ratio = (f1 - f2) / (2 * np.dot(dx, np.where(gradient==0, 1e-32, gradient))) gradient = gradient[transformed_index]
return (np.abs(1. - global_ratio) < tolerance) or (np.abs(gradient - numerical_gradient).mean() < tolerance) denominator = (2 * np.dot(dx, gradient))
global_ratio = (f1 - f2) / np.where(denominator == 0., 1e-32, denominator)
global_diff = np.abs(f1 - f2) < tolerance and np.allclose(gradient, 0, atol=tolerance)
if global_ratio is np.nan:
global_ratio = 0
return np.abs(1. - global_ratio) < tolerance or global_diff
else: else:
# check the gradient of each parameter individually, and do some pretty printing # check the gradient of each parameter individually, and do some pretty printing
try: try:
names = self._get_param_names_transformed() names = self._get_param_names()
except NotImplementedError: except NotImplementedError:
names = ['Variable %i' % i for i in range(len(x))] names = ['Variable %i' % i for i in range(len(x))]
# Prepare for pretty-printing # Prepare for pretty-printing
header = ['Name', 'Ratio', 'Difference', 'Analytical', 'Numerical'] header = ['Name', 'Ratio', 'Difference', 'Analytical', 'Numerical', 'dF_ratio']
max_names = max([len(names[i]) for i in range(len(names))] + [len(header[0])]) max_names = max([len(names[i]) for i in range(len(names))] + [len(header[0])])
float_len = 10 float_len = 10
cols = [max_names] cols = [max_names]
@ -487,115 +340,77 @@ class Model(Parameterized):
header_string = map(lambda x: '|'.join(x), [header_string]) header_string = map(lambda x: '|'.join(x), [header_string])
separator = '-' * len(header_string[0]) separator = '-' * len(header_string[0])
print '\n'.join([header_string[0], separator]) print '\n'.join([header_string[0], separator])
if target_param is None: if target_param is None:
param_list = range(len(x)) param_index = range(len(x))
transformed_index = param_index
else: else:
param_list = self.grep_param_names(target_param, transformed=True, search=True) param_index = self._raveled_index_for(target_param)
if not np.any(param_list): if self._has_fixes():
indices = np.r_[:self.size]
which = (param_index[:, None] == indices[self._fixes_][None, :]).nonzero()
param_index = param_index[which[0]]
transformed_index = (indices - (~self._fixes_).cumsum())[param_index]
# print param_index, transformed_index
else:
transformed_index = param_index
if param_index.size == 0:
print "No free parameters to check" print "No free parameters to check"
return return
gradient = self._grads(x).copy()
for i in param_list: np.where(gradient == 0, 1e-312, gradient)
ret = True
for nind, xind in itertools.izip(param_index, transformed_index):
xx = x.copy() xx = x.copy()
xx[i] += step xx[xind] += step
f1, g1 = self.objective_and_gradients(xx) f1 = self._objective(xx)
xx[i] -= 2.*step xx[xind] -= 2.*step
f2, g2 = self.objective_and_gradients(xx) f2 = self._objective(xx)
gradient = self.objective_function_gradients(x)[i] df_ratio = np.abs((f1-f2)/min(f1,f2))
df_unstable = df_ratio<df_tolerance
numerical_gradient = (f1 - f2) / (2 * step) numerical_gradient = (f1 - f2) / (2 * step)
ratio = (f1 - f2) / (2 * step * np.where(gradient==0, 1e-312, gradient)) if np.all(gradient[xind] == 0): ratio = (f1 - f2) == gradient[xind]
difference = np.abs((f1 - f2) / 2 / step - gradient) else: ratio = (f1 - f2) / (2 * step * gradient[xind])
difference = np.abs(numerical_gradient - gradient[xind])
if (np.abs(1. - ratio) < tolerance) or np.abs(difference) < tolerance: if (np.abs(1. - ratio) < tolerance) or np.abs(difference) < tolerance:
formatted_name = "\033[92m {0} \033[0m".format(names[i]) formatted_name = "\033[92m {0} \033[0m".format(names[nind])
ret &= True
else: else:
formatted_name = "\033[91m {0} \033[0m".format(names[i]) formatted_name = "\033[91m {0} \033[0m".format(names[nind])
ret &= False
if df_unstable:
formatted_name = "\033[94m {0} \033[0m".format(names[nind])
r = '%.6f' % float(ratio) r = '%.6f' % float(ratio)
d = '%.6f' % float(difference) d = '%.6f' % float(difference)
g = '%.6f' % gradient g = '%.6f' % gradient[xind]
ng = '%.6f' % float(numerical_gradient) ng = '%.6f' % float(numerical_gradient)
grad_string = "{0:^{c0}}|{1:^{c1}}|{2:^{c2}}|{3:^{c3}}|{4:^{c4}}".format(formatted_name, r, d, g, ng, c0=cols[0] + 9, c1=cols[1], c2=cols[2], c3=cols[3], c4=cols[4]) df = '%1.e' % float(df_ratio)
grad_string = "{0:<{c0}}|{1:^{c1}}|{2:^{c2}}|{3:^{c3}}|{4:^{c4}}|{5:^{c5}}".format(formatted_name, r, d, g, ng, df, c0=cols[0] + 9, c1=cols[1], c2=cols[2], c3=cols[3], c4=cols[4], c5=cols[5])
print grad_string print grad_string
def input_sensitivity(self): self.optimizer_array = x
""" return ret
return an array describing the sesitivity of the model to each input
NB. Right now, we're basing this on the lengthscales (or def _repr_html_(self):
variances) of the kernel. TODO: proper sensitivity analysis """Representation of the model in html for notebook display."""
where we integrate across the model inputs and evaluate the model_details = [['<b>Model</b>', self.name + '<br>'],
effect on the variance of the model output. """ ['<b>Log-likelihood</b>', '{}<br>'.format(float(self.log_likelihood()))],
["<b>Number of Parameters</b>", '{}<br>'.format(self.size)]]
from operator import itemgetter
to_print = [""] + ["{}: {}".format(name, detail) for name, detail in model_details] + ["<br><b>Parameters</b>:"]
to_print.append(super(Model, self)._repr_html_())
return "\n".join(to_print)
if not hasattr(self, 'kern'): def __str__(self):
raise ValueError, "this model has no kernel" model_details = [['Name', self.name],
['Log-likelihood', '{}'.format(float(self.log_likelihood()))],
["Number of Parameters", '{}'.format(self.size)]]
from operator import itemgetter
max_len = reduce(lambda a, b: max(len(b[0]), a), model_details, 0)
to_print = [""] + ["{0:{l}} : {1}".format(name, detail, l=max_len) for name, detail in model_details] + ["Parameters:"]
to_print.append(super(Model, self).__str__())
return "\n".join(to_print)
k = [p for p in self.kern.parts if p.name in ['rbf', 'linear', 'rbf_inv']]
if (not len(k) == 1) or (not k[0].ARD):
raise ValueError, "cannot determine sensitivity for this kernel"
k = k[0]
if k.name == 'rbf':
return 1. / k.lengthscale
elif k.name == 'rbf_inv':
return k.inv_lengthscale
elif k.name == 'linear':
return k.variances
def pseudo_EM(self, stop_crit=.1, **kwargs):
"""
EM - like algorithm for Expectation Propagation and Laplace approximation
:param stop_crit: convergence criterion
:type stop_crit: float
.. Note: kwargs are passed to update_likelihood and optimize functions.
"""
assert isinstance(self.likelihood, (likelihoods.EP, likelihoods.EP_Mixed_Noise, likelihoods.Laplace)), "pseudo_EM is only available for approximate likelihoods"
ll_change = stop_crit + 1.
iteration = 0
last_ll = -np.inf
convergence = False
alpha = 0
stop = False
#Handle **kwargs
ep_args = {}
for arg in kwargs.keys():
if arg in ('epsilon','power_ep'):
ep_args[arg] = kwargs[arg]
del kwargs[arg]
while not stop:
last_approximation = self.likelihood.copy()
last_params = self._get_params()
if len(ep_args) == 2:
self.update_likelihood_approximation(epsilon=ep_args['epsilon'],power_ep=ep_args['power_ep'])
elif len(ep_args) == 1:
if ep_args.keys()[0] == 'epsilon':
self.update_likelihood_approximation(epsilon=ep_args['epsilon'])
elif ep_args.keys()[0] == 'power_ep':
self.update_likelihood_approximation(power_ep=ep_args['power_ep'])
else:
self.update_likelihood_approximation()
new_ll = self.log_likelihood()
ll_change = new_ll - last_ll
if ll_change < 0:
self.likelihood = last_approximation # restore previous likelihood approximation
self._set_params(last_params) # restore model parameters
print "Log-likelihood decrement: %s \nLast likelihood update discarded." % ll_change
stop = True
else:
self.optimize(**kwargs)
last_ll = self.log_likelihood()
if ll_change < stop_crit:
stop = True
iteration += 1
if stop:
print "%s iterations." % iteration
self.update_likelihood_approximation()
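# --- Illustrative sketch (not taken from the GPy docs): a minimal Model
# subclass showing how the interface above fits together. The Quadratic
# class is hypothetical; it assumes Param, link_parameter and the
# parameters_changed hook behave as in the parameterization framework
# introduced by this merge, and that checkgrad() is exposed publicly as a
# wrapper around _checkgrad().
import numpy as np
from GPy.core.model import Model
from GPy.core.parameterization import Param

class Quadratic(Model):
    """Toy model whose 'log-likelihood' is -(theta - 3)^2."""
    def __init__(self):
        super(Quadratic, self).__init__(name='quadratic')
        self.theta = Param('theta', np.array([0.]))
        self.link_parameter(self.theta)

    def log_likelihood(self):
        return -float(((self.theta - 3.) ** 2).sum())

    def parameters_changed(self):
        # gradient of the log-likelihood w.r.t. theta
        self.theta.gradient = -2. * (self.theta - 3.)

m = Quadratic()
m.optimize()              # objective_function() is -log_likelihood(), so theta -> 3
print(m.checkgrad())      # compares analytic against numerical gradients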
@ -0,0 +1,5 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from param import Param, ObsAr
from parameterized import Parameterized
@ -0,0 +1,25 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
(Hyper-)Parameter domains defined for :py:mod:`~GPy.core.priors` and :py:mod:`~GPy.kern`.
These domains specify the legitimate realm of the parameters to live in.
:const:`~GPy.core.domains._REAL` :
real domain, all values in the real numbers are allowed
:const:`~GPy.core.domains._POSITIVE`:
positive domain, only positive real values are allowed
:const:`~GPy.core.domains._NEGATIVE`:
same as :const:`~GPy.core.domains._POSITIVE`, but only negative values are allowed
:const:`~GPy.core.domains._BOUNDED`:
only values within the bounded range are allowed,
the bounds are specified withing the object with the bounded range
"""
_REAL = 'real'
_POSITIVE = "positive"
_NEGATIVE = 'negative'
_BOUNDED = 'bounded'
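# --- Illustrative sketch (the helper below is hypothetical; only the
# constants above come from this module): how calling code might branch on
# a declared domain when validating a parameter value.
def _value_in_domain(value, domain, bounds=None):
    if domain == _POSITIVE:
        return value > 0
    elif domain == _NEGATIVE:
        return value < 0
    elif domain == _BOUNDED:
        lower, upper = bounds
        return lower <= value <= upper
    return domain == _REAL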
@ -0,0 +1,302 @@
# Copyright (c) 2014, Max Zwiessele
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy
from numpy.lib.function_base import vectorize
from lists_and_dicts import IntArrayDict
def extract_properties_to_index(index, props):
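# invert the per-index property lists into a dict mapping each property
# to the integer array of indices it applies to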
prop_index = dict()
for i, cl in enumerate(props):
for c in cl:
ind = prop_index.get(c, list())
ind.append(index[i])
prop_index[c] = ind
for c, i in prop_index.items():
prop_index[c] = numpy.array(i, dtype=int)
return prop_index
class ParameterIndexOperations(object):
"""
This object wraps a dictionary, whose keys are _operations_ that we'd like
to apply to a parameter array, and whose values are np integer arrays which
index the parameter array appropriately.
A model instance will contain one instance of this class for each thing
that needs indexing (i.e. constraints, ties and priors). Parameters within
the model constain instances of the ParameterIndexOperationsView class,
which can map from a 'local' index (starting 0) to this global index.
Here's an illustration:
#=======================================================================
model : 0 1 2 3 4 5 6 7 8 9
key1: 4 5
key2: 7 8
param1: 0 1 2 3 4 5
key1: 2 3
key2: 5
param2: 0 1 2 3 4
key1: 0
key2: 2 3
#=======================================================================
The views of this global index have a subset of the keys in this global
(model) index.
Adding a new key (e.g. a constraint) to a view will cause the view to pass
the new key to the global index, along with the local index and an offset.
This global index then stores the key and the appropriate global index
(which can be seen by the view).
See also:
ParameterIndexOperationsView
"""
_offset = 0
def __init__(self, constraints=None):
self._properties = IntArrayDict()
if constraints is not None:
for t, i in constraints.iteritems():
self.add(t, i)
def iteritems(self):
return self._properties.iteritems()
def items(self):
return self._properties.items()
def properties(self):
return self._properties.keys()
def iterproperties(self):
return self._properties.iterkeys()
def shift_right(self, start, size):
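# 'size' new parameters were inserted at position 'start': move every
# stored index at or after 'start' to the right by 'size'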
for ind in self.iterindices():
toshift = ind>=start
ind[toshift] += size
def shift_left(self, start, size):
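# 'size' parameters were removed starting at 'start': drop their indices
# and move every remaining index at or after 'start' to the left by 'size'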
for v, ind in self.items():
todelete = (ind>=start) * (ind<start+size)
if todelete.size != 0:
ind = ind[~todelete]
toshift = ind>=start
if toshift.size != 0:
ind[toshift] -= size
if ind.size != 0: self._properties[v] = ind
else: del self._properties[v]
def clear(self):
self._properties.clear()
@property
def size(self):
return reduce(lambda a,b: a+b.size, self.iterindices(), 0)
def iterindices(self):
return self._properties.itervalues()
def indices(self):
return self._properties.values()
def properties_for(self, index):
"""
Returns a list of properties, such that each entry in the list corresponds
to the element of the index given.
Example:
let properties: 'one':[1,2,3,4], 'two':[3,5,6]
>>> properties_for([2,3,5])
[['one'], ['one', 'two'], ['two']]
"""
return vectorize(lambda i: [prop for prop in self.iterproperties() if i in self[prop]], otypes=[list])(index)
def properties_to_index_dict(self, index):
"""
Return a dictionary containing properties as keys and index arrays as values.
Thus, the indices for each property present in the given index will be
collected into one dictionary.
Example:
let properties: 'one':[1,2,3,4], 'two':[3,5,6]
>>> properties_to_index_dict([2,3,5])
{'one':[2,3], 'two':[3,5]}
"""
props = self.properties_for(index)
prop_index = extract_properties_to_index(index, props)
return prop_index
def add(self, prop, indices):
self._properties[prop] = combine_indices(self._properties[prop], indices)
def remove(self, prop, indices):
if prop in self._properties:
diff = remove_indices(self[prop], indices)
removed = numpy.intersect1d(self[prop], indices, True)
if not index_empty(diff):
self._properties[prop] = diff
else:
del self._properties[prop]
return removed.astype(int)
return numpy.array([]).astype(int)
def update(self, parameter_index_view, offset=0):
for i, v in parameter_index_view.iteritems():
self.add(i, v+offset)
def copy(self):
return self.__deepcopy__(None)
def __deepcopy__(self, memo):
return ParameterIndexOperations(dict(self.iteritems()))
def __getitem__(self, prop):
return self._properties[prop]
def __delitem__(self, prop):
del self._properties[prop]
def __str__(self, *args, **kwargs):
import pprint
return pprint.pformat(dict(self._properties))
def combine_indices(arr1, arr2):
return numpy.union1d(arr1, arr2)
def remove_indices(arr, to_remove):
return numpy.setdiff1d(arr, to_remove, True)
def index_empty(index):
return numpy.size(index) == 0
class ParameterIndexOperationsView(object):
def __init__(self, param_index_operations, offset, size):
self._param_index_ops = param_index_operations
self._offset = offset
self._size = size
def __getstate__(self):
return [self._param_index_ops, self._offset, self._size]
def __setstate__(self, state):
self._param_index_ops = state[0]
self._offset = state[1]
self._size = state[2]
def _filter_index(self, ind):
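# keep only the global indices that fall inside this view's window
# [offset, offset + size) and translate them to local (0-based) indices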
return ind[(ind >= self._offset) * (ind < (self._offset + self._size))] - self._offset
def iteritems(self):
for i, ind in self._param_index_ops.iteritems():
ind2 = self._filter_index(ind)
if ind2.size > 0:
yield i, ind2
def items(self):
return [[i,v] for i,v in self.iteritems()]
def properties(self):
return [i for i in self.iterproperties()]
def iterproperties(self):
for i, _ in self.iteritems():
yield i
def shift_right(self, start, size):
self._param_index_ops.shift_right(start+self._offset, size)
def shift_left(self, start, size):
self._param_index_ops.shift_left(start+self._offset, size)
def clear(self):
for i, ind in self.items():
self._param_index_ops.remove(i, ind+self._offset)
@property
def size(self):
return reduce(lambda a,b: a+b.size, self.iterindices(), 0)
def iterindices(self):
for _, ind in self.iteritems():
yield ind
def indices(self):
return [ind for ind in self.iterindices()]
def properties_for(self, index):
"""
Returns a list of properties, such that each entry in the list corresponds
to the element of the index given.
Example:
let properties: 'one':[1,2,3,4], 'two':[3,5,6]
>>> properties_for([2,3,5])
[['one'], ['one', 'two'], ['two']]
"""
return vectorize(lambda i: [prop for prop in self.iterproperties() if i in self[prop]], otypes=[list])(index)
def properties_to_index_dict(self, index):
"""
Return a dictionary containing properties as keys and index arrays as values.
Thus, the indices for each property present in the given index will be
collected into one dictionary.
Example:
let properties: 'one':[1,2,3,4], 'two':[3,5,6]
>>> properties_to_index_dict([2,3,5])
{'one':[2,3], 'two':[3,5]}
"""
return extract_properties_to_index(index, self.properties_for(index))
def add(self, prop, indices):
self._param_index_ops.add(prop, indices+self._offset)
def remove(self, prop, indices):
removed = self._param_index_ops.remove(prop, numpy.array(indices)+self._offset)
if removed.size > 0:
return removed-self._offset
return removed
def __getitem__(self, prop):
ind = self._filter_index(self._param_index_ops[prop])
return ind
def __delitem__(self, prop):
self.remove(prop, self[prop])
def __str__(self, *args, **kwargs):
import pprint
return pprint.pformat(dict(self.iteritems()))
def update(self, parameter_index_view, offset=0):
for i, v in parameter_index_view.iteritems():
self.add(i, v+offset)
def copy(self):
return self.__deepcopy__(None)
def __deepcopy__(self, memo):
return ParameterIndexOperations(dict(self.iteritems()))
pass
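# --- Illustrative usage sketch (follows the docstring examples above; the
# string keys stand in for arbitrary property objects such as constraints):
if __name__ == '__main__':
    pio = ParameterIndexOperations()
    pio.add('one', numpy.array([1, 2, 3, 4]))
    pio.add('two', numpy.array([3, 5, 6]))
    print(pio.properties_for(numpy.array([2, 3, 5])))
    # -> [['one'], ['one', 'two'], ['two']]  (as in the docstring example)

    # a view exposes a window of the global index, starting at an offset
    view = ParameterIndexOperationsView(pio, offset=2, size=4)
    view.add('three', numpy.array([0, 1]))
    print(pio['three'])   # -> [2 3] : local indices shifted by the offset
    print(view['three'])  # -> [0 1] : the same entries seen through the view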
@ -0,0 +1,139 @@
# Copyright (c) 2014, Max Zwiessele
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from collections import defaultdict
import weakref
def intarray_default_factory():
import numpy as np
return np.int_([])
class IntArrayDict(defaultdict):
def __init__(self, default_factory=None):
"""
Default will be self._default, if not set otherwise
"""
defaultdict.__init__(self, intarray_default_factory)
class ArrayList(list):
"""
List to store ndarray-likes in.
It checks identity ('is') instead of calling __eq__ on each element.
"""
def __contains__(self, other):
for el in self:
if el is other:
return True
return False
def index(self, item):
index = 0
for el in self:
if el is item:
return index
index += 1
raise ValueError, "{} is not in list".format(item)
pass
class ObserverList(object):
"""
A list which contains the observers.
It only holds weak references to observers, such that unbound
observers don't dangle in memory.
"""
def __init__(self):
self._poc = []
def __getitem__(self, ind):
p,o,c = self._poc[ind]
return p, o(), c
def remove(self, priority, observer, callble):
"""
Remove one observer, which had priority and callble.
"""
self.flush()
for i in range(len(self) - 1, -1, -1):
p,o,c = self[i]
if priority==p and observer==o and callble==c:
del self._poc[i]
def __repr__(self):
return self._poc.__repr__()
def add(self, priority, observer, callble):
"""
Add an observer with priority and callble
"""
if observer is not None:
ins = 0
for pr, _, _ in self:
if priority > pr:
break
ins += 1
self._poc.insert(ins, (priority, weakref.ref(observer), callble))
def __str__(self):
from . import ObsAr, Param
from parameter_core import Parameterizable
ret = []
curr_p = None
def frmt(o):
if isinstance(o, ObsAr):
return 'ObsArr <{}>'.format(hex(id(o)))
elif isinstance(o, (Param,Parameterizable)):
return '{}'.format(o.hierarchy_name())
else:
return repr(o)
for p, o, c in self:
curr = ''
if curr_p != p:
pre = "{!s}: ".format(p)
curr_pre = pre
else: curr_pre = " "*len(pre)
curr_p = p
curr += curr_pre
ret.append(curr + ", ".join([frmt(o), str(c)]))
return '\n'.join(ret)
def flush(self):
"""
Make sure all weak references which point to nothing are flushed (deleted).
"""
self._poc = [(p,o,c) for p,o,c in self._poc if o() is not None]
def __iter__(self):
self.flush()
for p, o, c in self._poc:
yield p, o(), c
def __len__(self):
self.flush()
return self._poc.__len__()
def __deepcopy__(self, memo):
s = ObserverList()
for p,o,c in self:
import copy
s.add(p, copy.deepcopy(o, memo), copy.deepcopy(c, memo))
s.flush()
return s
def __getstate__(self):
self.flush()
from ...util.caching import Cacher
obs = []
for p, o, c in self:
if (getattr(o, c.__name__, None) is not None
and not isinstance(o, Cacher)):
obs.append((p,o,c.__name__))
return obs
def __setstate__(self, state):
self._poc = []
for p, o, c in state:
self.add(p,o,getattr(o, c))
pass
@ -0,0 +1,66 @@
# Copyright (c) 2014, Max Zwiessele
# Licensed under the BSD 3-clause license (see LICENSE.txt)
class Observable(object):
"""
Observable pattern for parameterization.
This Object allows for observers to register with self and a (bound!) function
as an observer. Every time the observable changes, it sends a notification with
self as only argument to all its observers.
"""
def __init__(self, *args, **kwargs):
super(Observable, self).__init__()
from lists_and_dicts import ObserverList
self.observers = ObserverList()
def add_observer(self, observer, callble, priority=0):
"""
Add an observer `observer` with the callback `callble`
and priority `priority` to this observers list.
"""
self.observers.add(priority, observer, callble)
def remove_observer(self, observer, callble=None):
"""
Either (if callble is None) remove all callables,
which were added alongside observer,
or remove callable `callble` which was added alongside
the observer `observer`.
"""
to_remove = []
for poc in self.observers:
_, obs, clble = poc
if callble is not None:
if (obs is observer) and (callble == clble):
to_remove.append(poc)
else:
if obs is observer:
to_remove.append(poc)
for r in to_remove:
self.observers.remove(*r)
def notify_observers(self, which=None, min_priority=None):
"""
Notifies all observers. `which` is the element that kicked off this
notification loop. The first argument will be self, the second `which`.
NOTE: notifies only observers with priority p > min_priority!
^^^^^^^^^^^^^^^^
:param min_priority: only notify observers with priority > min_priority
if min_priority is None, notify all observers in order
"""
if which is None:
which = self
if min_priority is None:
[callble(self, which=which) for _, _, callble in self.observers]
else:
for p, _, callble in self.observers:
if p <= min_priority:
break
callble(self, which=which)
def change_priority(self, observer, callble, priority):
self.remove_observer(observer, callble)
self.add_observer(observer, callble, priority)
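# --- Illustrative sketch (the Thermometer/Display classes are hypothetical;
# only the Observable API above is assumed): registering an observer and
# triggering a notification.
if __name__ == '__main__':
    class Thermometer(Observable):
        def set_temperature(self, value):
            self.temperature = value
            self.notify_observers()          # tell every observer we changed

    class Display(object):
        def on_change(self, observable, which):
            print('temperature changed')

    t = Thermometer()
    d = Display()
    t.add_observer(d, d.on_change)           # d.on_change(t, which=t) on notify
    t.set_temperature(21.0)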
@ -0,0 +1,147 @@
# Copyright (c) 2014, Max Zwiessele
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from parameter_core import Pickleable
from observable import Observable
class ObsAr(np.ndarray, Pickleable, Observable):
"""
An ndarray which reports changes to its observers.
The observers can add themselves with a callable, which
will be called every time this array changes. The callable
takes exactly one argument, which is this array itself.
"""
__array_priority__ = -1 # Never give back ObsAr
def __new__(cls, input_array, *a, **kw):
# always make a copy of the input parameters, as we need them to be in C order:
if not isinstance(input_array, ObsAr):
obj = np.atleast_1d(np.require(input_array, dtype=np.float64, requirements=['W', 'C'])).view(cls)
else: obj = input_array
super(ObsAr, obj).__init__(*a, **kw)
return obj
def __array_finalize__(self, obj):
# see InfoArray.__array_finalize__ for comments
if obj is None: return
self.observers = getattr(obj, 'observers', None)
def __array_wrap__(self, out_arr, context=None):
return out_arr.view(np.ndarray)
def _setup_observers(self):
# do not setup anything, as observable arrays do not have default observers
pass
@property
def values(self):
return self.view(np.ndarray)
def copy(self):
from lists_and_dicts import ObserverList
memo = {}
memo[id(self)] = self
memo[id(self.observers)] = ObserverList()
return self.__deepcopy__(memo)
def __deepcopy__(self, memo):
s = self.__new__(self.__class__, input_array=self.view(np.ndarray).copy())
memo[id(self)] = s
import copy
Pickleable.__setstate__(s, copy.deepcopy(self.__getstate__(), memo))
return s
def __reduce__(self):
func, args, state = super(ObsAr, self).__reduce__()
return func, args, (state, Pickleable.__getstate__(self))
def __setstate__(self, state):
np.ndarray.__setstate__(self, state[0])
Pickleable.__setstate__(self, state[1])
def __setitem__(self, s, val):
super(ObsAr, self).__setitem__(s, val)
self.notify_observers()
def __getslice__(self, start, stop):
return self.__getitem__(slice(start, stop))
def __setslice__(self, start, stop, val):
return self.__setitem__(slice(start, stop), val)
def __ilshift__(self, *args, **kwargs):
r = np.ndarray.__ilshift__(self, *args, **kwargs)
self.notify_observers()
return r
def __irshift__(self, *args, **kwargs):
r = np.ndarray.__irshift__(self, *args, **kwargs)
self.notify_observers()
return r
def __ixor__(self, *args, **kwargs):
r = np.ndarray.__ixor__(self, *args, **kwargs)
self.notify_observers()
return r
def __ipow__(self, *args, **kwargs):
r = np.ndarray.__ipow__(self, *args, **kwargs)
self.notify_observers()
return r
def __ifloordiv__(self, *args, **kwargs):
r = np.ndarray.__ifloordiv__(self, *args, **kwargs)
self.notify_observers()
return r
def __isub__(self, *args, **kwargs):
r = np.ndarray.__isub__(self, *args, **kwargs)
self.notify_observers()
return r
def __ior__(self, *args, **kwargs):
r = np.ndarray.__ior__(self, *args, **kwargs)
self.notify_observers()
return r
def __itruediv__(self, *args, **kwargs):
r = np.ndarray.__itruediv__(self, *args, **kwargs)
self.notify_observers()
return r
def __idiv__(self, *args, **kwargs):
r = np.ndarray.__idiv__(self, *args, **kwargs)
self.notify_observers()
return r
def __iand__(self, *args, **kwargs):
r = np.ndarray.__iand__(self, *args, **kwargs)
self.notify_observers()
return r
def __imod__(self, *args, **kwargs):
r = np.ndarray.__imod__(self, *args, **kwargs)
self.notify_observers()
return r
def __iadd__(self, *args, **kwargs):
r = np.ndarray.__iadd__(self, *args, **kwargs)
self.notify_observers()
return r
def __imul__(self, *args, **kwargs):
r = np.ndarray.__imul__(self, *args, **kwargs)
self.notify_observers()
return r
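# --- Illustrative sketch (the Listener class is hypothetical; only the
# ObsAr/Observable API above is assumed): observers are notified on every
# in-place modification of the array.
if __name__ == '__main__':
    class Listener(object):
        def changed(self, arr, which=None):
            print('array changed')

    a = ObsAr(np.zeros(3))
    obs = Listener()
    a.add_observer(obs, obs.changed)
    a[0] = 1.0    # __setitem__ notifies the observer
    a += 2.0      # __iadd__ notifies the observer as well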
@ -0,0 +1,476 @@
# Copyright (c) 2014, Max Zwiessele
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import itertools
import numpy
np = numpy
from parameter_core import Parameterizable, adjust_name_for_printing, Pickleable
from observable_array import ObsAr
###### printing
__constraints_name__ = "Constraint"
__index_name__ = "Index"
__tie_name__ = "Tied to"
__priors_name__ = "Prior"
__precision__ = numpy.get_printoptions()['precision'] # numpy printing precision used, subclassing numpy ndarray after all
__print_threshold__ = 5
######
class Param(Parameterizable, ObsAr):
"""
Parameter object for GPy models.
:param str name: name of the parameter to be printed
:param input_array: array which this parameter handles
:type input_array: numpy.ndarray
:param default_constraint: The default constraint for this parameter
:type default_constraint:
You can add/remove constraints by calling constrain on the parameter itself, e.g:
- self[:,1].constrain_positive()
- self[0].tie_to(other)
- self.untie()
- self[:3,:].unconstrain()
- self[1].fix()
Fixing parameters will fix them to the value they are right now. If you change
the fixed value, it will be fixed to the new value!
See :py:class:`GPy.core.parameterized.Parameterized` for more details on constraining etc.
"""
__array_priority__ = -1 # Never give back Param
_fixes_ = None
parameters = []
def __new__(cls, name, input_array, default_constraint=None):
obj = numpy.atleast_1d(super(Param, cls).__new__(cls, input_array=input_array))
obj._current_slice_ = (slice(obj.shape[0]),)
obj._realshape_ = obj.shape
obj._realsize_ = obj.size
obj._realndim_ = obj.ndim
obj._original_ = obj
return obj
def __init__(self, name, input_array, default_constraint=None, *a, **kw):
self._in_init_ = True
super(Param, self).__init__(name=name, default_constraint=default_constraint, *a, **kw)
self._in_init_ = False
def build_pydot(self,G):
import pydot
node = pydot.Node(id(self), shape='trapezium', label=self.name)#, fontcolor='white', color='white')
G.add_node(node)
for _, o, _ in self.observers:
label = o.name if hasattr(o, 'name') else str(o)
observed_node = pydot.Node(id(o), label=label)
G.add_node(observed_node)
edge = pydot.Edge(str(id(self)), str(id(o)), color='darkorange2', arrowhead='vee')
G.add_edge(edge)
return node
def __array_finalize__(self, obj):
# see InfoArray.__array_finalize__ for comments
if obj is None: return
super(Param, self).__array_finalize__(obj)
self._parent_ = getattr(obj, '_parent_', None)
self._parent_index_ = getattr(obj, '_parent_index_', None)
self._default_constraint_ = getattr(obj, '_default_constraint_', None)
self._current_slice_ = getattr(obj, '_current_slice_', None)
self._realshape_ = getattr(obj, '_realshape_', None)
self._realsize_ = getattr(obj, '_realsize_', None)
self._realndim_ = getattr(obj, '_realndim_', None)
self._original_ = getattr(obj, '_original_', None)
self._name = getattr(obj, '_name', None)
self._gradient_array_ = getattr(obj, '_gradient_array_', None)
self.constraints = getattr(obj, 'constraints', None)
self.priors = getattr(obj, 'priors', None)
@property
def param_array(self):
"""
As we are a leaf, this just returns self
"""
return self
@property
def values(self):
"""
Return self as numpy array view
"""
return self.view(np.ndarray)
@property
def gradient(self):
"""
Return a view on the gradient, which is in the same shape as this parameter is.
Note: this is not the real gradient array, it is just a view on it.
To work on the real gradient array use: self.full_gradient
"""
if getattr(self, '_gradient_array_', None) is None:
self._gradient_array_ = numpy.empty(self._realshape_, dtype=numpy.float64)
return self._gradient_array_#[self._current_slice_]
@gradient.setter
def gradient(self, val):
self._gradient_array_[:] = val
#===========================================================================
# Array operations -> done
#===========================================================================
def __getitem__(self, s, *args, **kwargs):
if not isinstance(s, tuple):
s = (s,)
#if not reduce(lambda a, b: a or numpy.any(b is Ellipsis), s, False) and len(s) <= self.ndim:
# s += (Ellipsis,)
new_arr = super(Param, self).__getitem__(s, *args, **kwargs)
try:
new_arr._current_slice_ = s
new_arr._gradient_array_ = self.gradient[s]
new_arr._original_ = self._original_
except AttributeError: pass # returning 0d array or float, double etc
return new_arr
def _raveled_index(self, slice_index=None):
# return an index array on the raveled array, which is formed by the current_slice
# of this object
extended_realshape = numpy.cumprod((1,) + self._realshape_[:0:-1])[::-1]
ind = self._indices(slice_index)
if ind.ndim < 2: ind = ind[:, None]
return numpy.asarray(numpy.apply_along_axis(lambda x: numpy.sum(extended_realshape * x), 1, ind), dtype=int)
def _raveled_index_for(self, obj):
return self._raveled_index()
#===========================================================================
# Constrainable
#===========================================================================
def _ensure_fixes(self):
if not self._has_fixes(): self._fixes_ = numpy.ones(self._realsize_, dtype=bool)
#===========================================================================
# Convenience
#===========================================================================
@property
def is_fixed(self):
from transformations import __fixed__
return self.constraints[__fixed__].size == self.size
def _get_original(self, param):
return self._original_
#===========================================================================
# Pickling and copying
#===========================================================================
def copy(self):
return Parameterizable.copy(self, which=self)
def __deepcopy__(self, memo):
s = self.__new__(self.__class__, name=self.name, input_array=self.view(numpy.ndarray).copy())
memo[id(self)] = s
import copy
Pickleable.__setstate__(s, copy.deepcopy(self.__getstate__(), memo))
return s
def _setup_observers(self):
"""
Setup the default observers
1: pass through to parent, if present
"""
if self.has_parent():
self.add_observer(self._parent_, self._parent_._pass_through_notify_observers, -np.inf)
#===========================================================================
# Printing -> done
#===========================================================================
@property
def _description_str(self):
if self.size <= 1:
return [str(self.view(numpy.ndarray)[0])]
else: return [str(self.shape)]
def parameter_names(self, add_self=False, adjust_for_printing=False, recursive=True):
# this is just overwriting the parameterized calls to parameter names, in order to maintain OOP
if adjust_for_printing:
return [adjust_name_for_printing(self.name)]
return [self.name]
@property
def flattened_parameters(self):
return [self]
@property
def parameter_shapes(self):
return [self.shape]
@property
def num_params(self):
return 0
@property
def _constraints_str(self):
return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.constraints.iteritems()))]
@property
def _priors_str(self):
return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.priors.iteritems()))]
@property
def _ties_str(self):
return ['']
def _ties_for(self, ravi):
return [['N/A']]*ravi.size
def __repr__(self, *args, **kwargs):
name = "\033[1m{x:s}\033[0;0m:\n".format(
x=self.hierarchy_name())
return name + super(Param, self).__repr__(*args, **kwargs)
def _indices(self, slice_index=None):
# get a int-array containing all indices in the first axis.
if slice_index is None:
slice_index = self._current_slice_
try:
indices = np.indices(self._realshape_, dtype=int)
indices = indices[(slice(None),)+slice_index]
indices = np.rollaxis(indices, 0, indices.ndim).reshape(-1,self._realndim_)
#print indices_
#if not np.all(indices==indices__):
# import ipdb; ipdb.set_trace()
except:
indices = np.indices(self._realshape_, dtype=int)
indices = indices[(slice(None),)+slice_index]
indices = np.rollaxis(indices, 0, indices.ndim)
return indices
def _max_len_names(self, gen, header):
gen = map(lambda x: " ".join(map(str, x)), gen)
return reduce(lambda a, b:max(a, len(b)), gen, len(header))
def _max_len_values(self):
return reduce(lambda a, b:max(a, len("{x:=.{0}g}".format(__precision__, x=b))), self.flat, len(self.hierarchy_name()))
def _max_len_index(self, ind):
return reduce(lambda a, b:max(a, len(str(b))), ind, len(__index_name__))
def _short(self):
# short string to print
name = self.hierarchy_name()
if self._realsize_ < 2:
return name
ind = self._indices()
if ind.size > 4: indstr = ','.join(map(str, ind[:2])) + "..." + ','.join(map(str, ind[-2:]))
else: indstr = ','.join(map(str, ind))
return name + '[' + indstr + ']'
def _repr_html_(self, constr_matrix=None, indices=None, prirs=None, ties=None):
"""Representation of the parameter in html for notebook display."""
filter_ = self._current_slice_
vals = self.flat
if indices is None: indices = self._indices(filter_)
ravi = self._raveled_index(filter_)
if constr_matrix is None: constr_matrix = self.constraints.properties_for(ravi)
if prirs is None: prirs = self.priors.properties_for(ravi)
if ties is None: ties = self._ties_for(ravi)
ties = [' '.join(map(lambda x: x, t)) for t in ties]
header_format = """
<tr>
<td><b>{i}</b></td>
<td><b>{x}</b></td>
<td><b>{c}</b></td>
<td><b>{p}</b></td>
<td><b>{t}</b></td>
</tr>"""
header = header_format.format(x=self.hierarchy_name(), c=__constraints_name__, i=__index_name__, t=__tie_name__, p=__priors_name__) # nice header for printing
if not ties: ties = itertools.cycle([''])
return "\n".join(['<table>'] + [header] + ["<tr><td>{i}</td><td align=\"right\">{x}</td><td>{c}</td><td>{p}</td><td>{t}</td></tr>".format(x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in itertools.izip(indices, vals, constr_matrix, ties, prirs)] + ["</table>"])
def __str__(self, constr_matrix=None, indices=None, prirs=None, ties=None, lc=None, lx=None, li=None, lp=None, lt=None, only_name=False):
filter_ = self._current_slice_
vals = self.flat
if indices is None: indices = self._indices(filter_)
ravi = self._raveled_index(filter_)
if constr_matrix is None: constr_matrix = self.constraints.properties_for(ravi)
if prirs is None: prirs = self.priors.properties_for(ravi)
if ties is None: ties = self._ties_for(ravi)
ties = [' '.join(map(lambda x: x, t)) for t in ties]
if lc is None: lc = self._max_len_names(constr_matrix, __constraints_name__)
if lx is None: lx = self._max_len_values()
if li is None: li = self._max_len_index(indices)
if lt is None: lt = self._max_len_names(ties, __tie_name__)
        if lp is None: lp = self._max_len_names(prirs, __priors_name__)
sep = '-'
header_format = " {i:{5}^{2}s} | \033[1m{x:{5}^{1}s}\033[0;0m | {c:{5}^{0}s} | {p:{5}^{4}s} | {t:{5}^{3}s}"
if only_name: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.hierarchy_name(), c=sep*lc, i=sep*li, t=sep*lt, p=sep*lp) # nice header for printing
else: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.hierarchy_name(), c=__constraints_name__, i=__index_name__, t=__tie_name__, p=__priors_name__) # nice header for printing
if not ties: ties = itertools.cycle([''])
return "\n".join([header] + [" {i!s:^{3}s} | {x: >{1}.{2}g} | {c:^{0}s} | {p:^{5}s} | {t:^{4}s} ".format(lc, lx, __precision__, li, lt, lp, x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in itertools.izip(indices, vals, constr_matrix, ties, prirs)]) # return all the constraints with right indices
# except: return super(Param, self).__str__()
class ParamConcatenation(object):
def __init__(self, params):
"""
        Parameter concatenation for conveniently printing regular-expression-matched arrays.
        You can index this concatenation as if it were the flattened concatenation
        of all the parameters it contains; the same holds for setting parameters
        (broadcasting enabled).
        See :py:class:`GPy.core.parameter.Param` for more details on constraining.
"""
# self.params = params
from lists_and_dicts import ArrayList
self.params = ArrayList([])
for p in params:
for p in p.flattened_parameters:
if p not in self.params:
self.params.append(p)
self._param_sizes = [p.size for p in self.params]
startstops = numpy.cumsum([0] + self._param_sizes)
self._param_slices_ = [slice(start, stop) for start,stop in zip(startstops, startstops[1:])]
parents = dict()
for p in self.params:
if p.has_parent():
parent = p._parent_
level = 0
while parent is not None:
if parent in parents:
parents[parent] = max(level, parents[parent])
else:
parents[parent] = level
level += 1
parent = parent._parent_
import operator
self.parents = map(lambda x: x[0], sorted(parents.iteritems(), key=operator.itemgetter(1)))
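    # Editor's sketch (hypothetical model `m`): m['.*variance'] returns a
    # ParamConcatenation of every matching parameter when more than one matches;
    # assignment then broadcasts across all of them:
    #   m['.*variance'] = 1.
    #   m['.*variance'].constrain_positive()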
#===========================================================================
# Get/set items, enable broadcasting
#===========================================================================
def __getitem__(self, s):
ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True;
params = [p.param_array.flat[ind[ps]] for p,ps in zip(self.params, self._param_slices_) if numpy.any(p.param_array.flat[ind[ps]])]
if len(params)==1: return params[0]
return ParamConcatenation(params)
def __setitem__(self, s, val, update=True):
if isinstance(val, ParamConcatenation):
val = val.values()
ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True;
vals = self.values(); vals[s] = val
for p, ps in zip(self.params, self._param_slices_):
p.flat[ind[ps]] = vals[ps]
if update:
self.update_all_params()
def values(self):
return numpy.hstack([p.param_array.flat for p in self.params])
#===========================================================================
# parameter operations:
#===========================================================================
def update_all_params(self):
for par in self.parents:
par.notify_observers()
def constrain(self, constraint, warning=True):
[param.constrain(constraint, trigger_parent=False) for param in self.params]
self.update_all_params()
constrain.__doc__ = Param.constrain.__doc__
def constrain_positive(self, warning=True):
[param.constrain_positive(warning, trigger_parent=False) for param in self.params]
self.update_all_params()
constrain_positive.__doc__ = Param.constrain_positive.__doc__
def constrain_fixed(self, value=None, warning=True, trigger_parent=True):
[param.constrain_fixed(value, warning, trigger_parent) for param in self.params]
constrain_fixed.__doc__ = Param.constrain_fixed.__doc__
fix = constrain_fixed
def constrain_negative(self, warning=True):
[param.constrain_negative(warning, trigger_parent=False) for param in self.params]
self.update_all_params()
constrain_negative.__doc__ = Param.constrain_negative.__doc__
def constrain_bounded(self, lower, upper, warning=True):
[param.constrain_bounded(lower, upper, warning, trigger_parent=False) for param in self.params]
self.update_all_params()
constrain_bounded.__doc__ = Param.constrain_bounded.__doc__
def unconstrain(self, *constraints):
[param.unconstrain(*constraints) for param in self.params]
unconstrain.__doc__ = Param.unconstrain.__doc__
def unconstrain_negative(self):
[param.unconstrain_negative() for param in self.params]
unconstrain_negative.__doc__ = Param.unconstrain_negative.__doc__
def unconstrain_positive(self):
[param.unconstrain_positive() for param in self.params]
unconstrain_positive.__doc__ = Param.unconstrain_positive.__doc__
def unconstrain_fixed(self):
[param.unconstrain_fixed() for param in self.params]
unconstrain_fixed.__doc__ = Param.unconstrain_fixed.__doc__
unfix = unconstrain_fixed
def unconstrain_bounded(self, lower, upper):
[param.unconstrain_bounded(lower, upper) for param in self.params]
unconstrain_bounded.__doc__ = Param.unconstrain_bounded.__doc__
def untie(self, *ties):
[param.untie(*ties) for param in self.params]
def checkgrad(self, verbose=0, step=1e-6, tolerance=1e-3):
return self.params[0]._highest_parent_._checkgrad(self, verbose, step, tolerance)
#checkgrad.__doc__ = Gradcheckable.checkgrad.__doc__
__lt__ = lambda self, val: self.values() < val
__le__ = lambda self, val: self.values() <= val
__eq__ = lambda self, val: self.values() == val
__ne__ = lambda self, val: self.values() != val
__gt__ = lambda self, val: self.values() > val
__ge__ = lambda self, val: self.values() >= val
def __str__(self, *args, **kwargs):
def f(p):
ind = p._raveled_index()
return p.constraints.properties_for(ind), p._ties_for(ind), p.priors.properties_for(ind)
params = self.params
constr_matrices, ties_matrices, prior_matrices = zip(*map(f, params))
indices = [p._indices() for p in params]
lc = max([p._max_len_names(cm, __constraints_name__) for p, cm in itertools.izip(params, constr_matrices)])
lx = max([p._max_len_values() for p in params])
li = max([p._max_len_index(i) for p, i in itertools.izip(params, indices)])
lt = max([p._max_len_names(tm, __tie_name__) for p, tm in itertools.izip(params, ties_matrices)])
        lp = max([p._max_len_names(pm, __priors_name__) for p, pm in itertools.izip(params, prior_matrices)])
strings = []
start = True
for p, cm, i, tm, pm in itertools.izip(params,constr_matrices,indices,ties_matrices,prior_matrices):
strings.append(p.__str__(constr_matrix=cm, indices=i, prirs=pm, ties=tm, lc=lc, lx=lx, li=li, lp=lp, lt=lt, only_name=(1-start)))
start = False
return "\n".join(strings)
def __repr__(self):
return "\n".join(map(repr,self.params))
def __ilshift__(self, *args, **kwargs):
self[:] = np.ndarray.__ilshift__(self.values(), *args, **kwargs)
def __irshift__(self, *args, **kwargs):
self[:] = np.ndarray.__irshift__(self.values(), *args, **kwargs)
def __ixor__(self, *args, **kwargs):
self[:] = np.ndarray.__ixor__(self.values(), *args, **kwargs)
def __ipow__(self, *args, **kwargs):
self[:] = np.ndarray.__ipow__(self.values(), *args, **kwargs)
def __ifloordiv__(self, *args, **kwargs):
self[:] = np.ndarray.__ifloordiv__(self.values(), *args, **kwargs)
def __isub__(self, *args, **kwargs):
self[:] = np.ndarray.__isub__(self.values(), *args, **kwargs)
def __ior__(self, *args, **kwargs):
self[:] = np.ndarray.__ior__(self.values(), *args, **kwargs)
def __itruediv__(self, *args, **kwargs):
self[:] = np.ndarray.__itruediv__(self.values(), *args, **kwargs)
def __idiv__(self, *args, **kwargs):
self[:] = np.ndarray.__idiv__(self.values(), *args, **kwargs)
def __iand__(self, *args, **kwargs):
self[:] = np.ndarray.__iand__(self.values(), *args, **kwargs)
def __imod__(self, *args, **kwargs):
self[:] = np.ndarray.__imod__(self.values(), *args, **kwargs)
def __iadd__(self, *args, **kwargs):
self[:] = np.ndarray.__iadd__(self.values(), *args, **kwargs)
def __imul__(self, *args, **kwargs):
self[:] = np.ndarray.__imul__(self.values(), *args, **kwargs)

File diff suppressed because it is too large


@@ -0,0 +1,418 @@
# Copyright (c) 2014, Max Zwiessele, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy; np = numpy
import itertools
from re import compile, _pattern_type
from param import ParamConcatenation
from parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing
import logging
from GPy.core.parameterization.index_operations import ParameterIndexOperationsView
logger = logging.getLogger("parameters changed meta")
class ParametersChangedMeta(type):
def __call__(self, *args, **kw):
self._in_init_ = True
#import ipdb;ipdb.set_trace()
self = super(ParametersChangedMeta, self).__call__(*args, **kw)
logger.debug("finished init")
self._in_init_ = False
logger.debug("connecting parameters")
self._highest_parent_._connect_parameters()
#self._highest_parent_._notify_parent_change()
self._highest_parent_._connect_fixes()
logger.debug("calling parameters changed")
self.parameters_changed()
return self
class Parameterized(Parameterizable):
"""
Parameterized class
Say m is a handle to a parameterized class.
Printing parameters:
- print m: prints a nice summary over all parameters
- print m.name: prints details for param with name 'name'
- print m[regexp]: prints details for all the parameters
which match (!) regexp
- print m['']: prints details for all parameters
Fields:
Name: The name of the param, can be renamed!
Value: Shape or value, if one-valued
Constrain: constraint of the param, curly "{c}" brackets indicate
some parameters are constrained by c. See detailed print
to get exact constraints.
        Tied_to: which parameter it is tied to.
Getting and setting parameters:
Set all values in param to one:
m.name.to.param = 1
    Handling of constraining, fixing and tying parameters:
You can constrain parameters by calling the constrain on the param itself, e.g:
- m.name[:,1].constrain_positive()
- m.name[0].tie_to(m.name[1])
    Fixing parameters will fix them to the value they are right now. If you change
    the parameter's value, the param will be fixed to the new value!
    If you want to operate on all parameters use m[''] to wildcard select all parameters
and concatenate them. Printing m[''] will result in printing of all parameters in detail.
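
    Example (editor's sketch; `m` is a hypothetical GPy model with an 'rbf' kernel)::

        print m                          # summary of all parameters
        print m['.*lengthscale']         # details of every matching parameter
        m.rbf.lengthscale = 2.           # broadcast-assign a value
        m.rbf.variance.constrain_positive()
        m.rbf.variance.fix()             # fix at the current value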
"""
#===========================================================================
# Metaclass for parameters changed after init.
    # This makes sure that parameters_changed() will always be called after __init__
# **Never** call parameters_changed() yourself
__metaclass__ = ParametersChangedMeta
#===========================================================================
def __init__(self, name=None, parameters=[], *a, **kw):
super(Parameterized, self).__init__(name=name, *a, **kw)
self.size = sum(p.size for p in self.parameters)
self.add_observer(self, self._parameters_changed_notification, -100)
if not self._has_fixes():
self._fixes_ = None
self._param_slices_ = []
#self._connect_parameters()
self.link_parameters(*parameters)
def build_pydot(self, G=None):
import pydot # @UnresolvedImport
iamroot = False
if G is None:
G = pydot.Dot(graph_type='digraph', bgcolor=None)
iamroot=True
node = pydot.Node(id(self), shape='box', label=self.name)#, color='white')
G.add_node(node)
for child in self.parameters:
child_node = child.build_pydot(G)
G.add_edge(pydot.Edge(node, child_node))#, color='white'))
for _, o, _ in self.observers:
label = o.name if hasattr(o, 'name') else str(o)
observed_node = pydot.Node(id(o), label=label)
G.add_node(observed_node)
edge = pydot.Edge(str(id(self)), str(id(o)), color='darkorange2', arrowhead='vee')
G.add_edge(edge)
if iamroot:
return G
return node
#===========================================================================
# Add remove parameters:
#===========================================================================
def link_parameter(self, param, index=None, _ignore_added_names=False):
"""
        :param param: the parameter to add
        :type param: :py:class:`GPy.core.param.Param`
        :param [index]: index of where to put the parameter
        :param bool _ignore_added_names: whether the name of the parameter overrides a possibly existing field

        Add the parameter to this parameterized object; you can insert it
        at any given index using the :func:`list.insert` syntax.
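
        Example (editor's sketch, a hypothetical kernel linking its own parameters)::

            self.variance = Param('variance', 1.)
            self.link_parameter(self.variance)             # appended at the end
            self.link_parameter(self.lengthscale, index=0) # or inserted at the front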
"""
if param in self.parameters and index is not None:
self.unlink_parameter(param)
self.link_parameter(param, index)
# elif param.has_parent():
# raise HierarchyError, "parameter {} already in another model ({}), create new object (or copy) for adding".format(param._short(), param._highest_parent_._short())
elif param not in self.parameters:
if param.has_parent():
def visit(parent, self):
if parent is self:
raise HierarchyError, "You cannot add a parameter twice into the hierarchy"
param.traverse_parents(visit, self)
param._parent_.unlink_parameter(param)
# make sure the size is set
if index is None:
start = sum(p.size for p in self.parameters)
self.constraints.shift_right(start, param.size)
self.priors.shift_right(start, param.size)
self.constraints.update(param.constraints, self.size)
self.priors.update(param.priors, self.size)
param._parent_ = self
param._parent_index_ = len(self.parameters)
self.parameters.append(param)
else:
start = sum(p.size for p in self.parameters[:index])
self.constraints.shift_right(start, param.size)
self.priors.shift_right(start, param.size)
self.constraints.update(param.constraints, start)
self.priors.update(param.priors, start)
param._parent_ = self
param._parent_index_ = index if index>=0 else len(self.parameters[:index])
for p in self.parameters[index:]:
p._parent_index_ += 1
self.parameters.insert(index, param)
param.add_observer(self, self._pass_through_notify_observers, -np.inf)
parent = self
while parent is not None:
parent.size += param.size
parent = parent._parent_
self._notify_parent_change()
if not self._in_init_:
#self._connect_parameters()
#self._notify_parent_change()
self._highest_parent_._connect_parameters(ignore_added_names=_ignore_added_names)
self._highest_parent_._notify_parent_change()
self._highest_parent_._connect_fixes()
else:
raise HierarchyError, """Parameter exists already, try making a copy"""
def link_parameters(self, *parameters):
"""
convenience method for adding several
parameters without gradient specification
"""
[self.link_parameter(p) for p in parameters]
def unlink_parameter(self, param):
"""
:param param: param object to remove from being a parameter of this parameterized object.
"""
if not param in self.parameters:
try:
raise RuntimeError, "{} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name)
except AttributeError:
raise RuntimeError, "{} does not seem to be a parameter, remove parameters directly from their respective parents".format(str(param))
start = sum([p.size for p in self.parameters[:param._parent_index_]])
self._remove_parameter_name(param)
self.size -= param.size
del self.parameters[param._parent_index_]
param._disconnect_parent()
param.remove_observer(self, self._pass_through_notify_observers)
self.constraints.shift_left(start, param.size)
self._connect_parameters()
self._notify_parent_change()
parent = self._parent_
while parent is not None:
parent.size -= param.size
parent = parent._parent_
self._highest_parent_._connect_parameters()
self._highest_parent_._connect_fixes()
self._highest_parent_._notify_parent_change()
def add_parameter(self, *args, **kwargs):
raise DeprecationWarning, "add_parameter was renamed to link_parameter to avoid confusion of setting variables"
def remove_parameter(self, *args, **kwargs):
        raise DeprecationWarning, "remove_parameter was renamed to unlink_parameter to avoid confusion of setting variables"
def _connect_parameters(self, ignore_added_names=False):
# connect parameterlist to this parameterized object
# This just sets up the right connection for the params objects
# to be used as parameters
# it also sets the constraints for each parameter to the constraints
# of their respective parents
if not hasattr(self, "parameters") or len(self.parameters) < 1:
# no parameters for this class
return
if self.param_array.size != self.size:
self._param_array_ = np.empty(self.size, dtype=np.float64)
if self.gradient.size != self.size:
self._gradient_array_ = np.empty(self.size, dtype=np.float64)
old_size = 0
self._param_slices_ = []
for i, p in enumerate(self.parameters):
if not p.param_array.flags['C_CONTIGUOUS']:
raise ValueError, "This should not happen! Please write an email to the developers with the code, which reproduces this error. All parameter arrays must be C_CONTIGUOUS"
p._parent_ = self
p._parent_index_ = i
pslice = slice(old_size, old_size + p.size)
# first connect all children
p._propagate_param_grad(self.param_array[pslice], self.gradient_full[pslice])
# then connect children to self
self.param_array[pslice] = p.param_array.flat # , requirements=['C', 'W']).ravel(order='C')
self.gradient_full[pslice] = p.gradient_full.flat # , requirements=['C', 'W']).ravel(order='C')
p.param_array.data = self.param_array[pslice].data
p.gradient_full.data = self.gradient_full[pslice].data
self._param_slices_.append(pslice)
self._add_parameter_name(p, ignore_added_names=ignore_added_names)
old_size += p.size
#===========================================================================
# Get/set parameters:
#===========================================================================
def grep_param_names(self, regexp):
"""
create a list of parameters, matching regular expression regexp
"""
if not isinstance(regexp, _pattern_type): regexp = compile(regexp)
found_params = []
for n, p in itertools.izip(self.parameter_names(False, False, True), self.flattened_parameters):
if regexp.match(n) is not None:
found_params.append(p)
return found_params
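    # Editor's note: e.g. self.grep_param_names('.*lengthscale') returns every
    # parameter whose (recursive) name matches the regular expression; this is
    # what powers the m['...'] regexp indexing implemented below.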
def __getitem__(self, name, paramlist=None):
if isinstance(name, (int, slice, tuple, np.ndarray)):
return self.param_array[name]
else:
if paramlist is None:
paramlist = self.grep_param_names(name)
if len(paramlist) < 1: raise AttributeError, name
if len(paramlist) == 1:
if isinstance(paramlist[-1], Parameterized):
paramlist = paramlist[-1].flattened_parameters
if len(paramlist) != 1:
return ParamConcatenation(paramlist)
return paramlist[-1]
return ParamConcatenation(paramlist)
def __setitem__(self, name, value, paramlist=None):
if value is None:
return # nothing to do here
if isinstance(name, (slice, tuple, np.ndarray)):
try:
self.param_array[name] = value
except:
raise ValueError, "Setting by slice or index only allowed with array-like"
self._trigger_params_changed()
else:
try: param = self.__getitem__(name, paramlist)
except: raise
param[:] = value
def __setattr__(self, name, val):
        # override the default behaviour: if setting a param, broadcasting can be used
if hasattr(self, "parameters"):
try:
pnames = self.parameter_names(False, adjust_for_printing=True, recursive=False)
if name in pnames:
param = self.parameters[pnames.index(name)]
param[:] = val; return
except AttributeError:
pass
object.__setattr__(self, name, val);
#===========================================================================
# Pickling
#===========================================================================
def __setstate__(self, state):
super(Parameterized, self).__setstate__(state)
try:
self._connect_parameters()
self._connect_fixes()
self._notify_parent_change()
self.parameters_changed()
except Exception as e:
print "WARNING: caught exception {!s}, trying to continue".format(e)
def copy(self, memo=None):
if memo is None:
memo = {}
memo[id(self.optimizer_array)] = None # and param_array
memo[id(self.param_array)] = None # and param_array
copy = super(Parameterized, self).copy(memo)
copy._connect_parameters()
copy._connect_fixes()
copy._notify_parent_change()
return copy
#===========================================================================
# Printing:
#===========================================================================
def _short(self):
return self.hierarchy_name()
@property
def flattened_parameters(self):
return [xi for x in self.parameters for xi in x.flattened_parameters]
@property
def _parameter_sizes_(self):
return [x.size for x in self.parameters]
@property
def parameter_shapes(self):
return [xi for x in self.parameters for xi in x.parameter_shapes]
@property
def _constraints_str(self):
return [cs for p in self.parameters for cs in p._constraints_str]
@property
def _priors_str(self):
return [cs for p in self.parameters for cs in p._priors_str]
@property
def _description_str(self):
return [xi for x in self.parameters for xi in x._description_str]
@property
def _ties_str(self):
return [','.join(x._ties_str) for x in self.flattened_parameters]
def _repr_html_(self, header=True):
"""Representation of the parameters in html for notebook display."""
name = adjust_name_for_printing(self.name) + "."
constrs = self._constraints_str;
ts = self._ties_str
prirs = self._priors_str
desc = self._description_str; names = self.parameter_names()
nl = max([len(str(x)) for x in names + [name]])
sl = max([len(str(x)) for x in desc + ["Value"]])
cl = max([len(str(x)) if x else 0 for x in constrs + ["Constraint"]])
tl = max([len(str(x)) if x else 0 for x in ts + ["Tied to"]])
pl = max([len(str(x)) if x else 0 for x in prirs + ["Prior"]])
format_spec = "<tr><td>{{name:<{0}s}}</td><td align=\"right\">{{desc:>{1}s}}</td><td>{{const:^{2}s}}</td><td>{{pri:^{3}s}}</td><td>{{t:^{4}s}}</td></tr>".format(nl, sl, cl, pl, tl)
to_print = []
for n, d, c, t, p in itertools.izip(names, desc, constrs, ts, prirs):
to_print.append(format_spec.format(name=n, desc=d, const=c, t=t, pri=p))
        sep = '-' * (nl + sl + cl + pl + tl + 8 * 2 + 3)
if header:
header = """
<tr>
<td><b>{name}</b>
<td><b>Value</b></td>
<td><b>Constraint</b></td>
<td><b>Prior</b></td>
<td><b>Tied to</b></td>""".format(name=name)
to_print.insert(0, header)
return '<table>' + '\n'.format(sep).join(to_print) + '\n</table>'
def __str__(self, header=True):
name = adjust_name_for_printing(self.name) + "."
constrs = self._constraints_str;
ts = self._ties_str
prirs = self._priors_str
desc = self._description_str; names = self.parameter_names()
nl = max([len(str(x)) for x in names + [name]])
sl = max([len(str(x)) for x in desc + ["Value"]])
cl = max([len(str(x)) if x else 0 for x in constrs + ["Constraint"]])
tl = max([len(str(x)) if x else 0 for x in ts + ["Tied to"]])
pl = max([len(str(x)) if x else 0 for x in prirs + ["Prior"]])
format_spec = " \033[1m{{name:<{0}s}}\033[0;0m | {{desc:>{1}s}} | {{const:^{2}s}} | {{pri:^{3}s}} | {{t:^{4}s}}".format(nl, sl, cl, pl, tl)
to_print = []
for n, d, c, t, p in itertools.izip(names, desc, constrs, ts, prirs):
to_print.append(format_spec.format(name=n, desc=d, const=c, t=t, pri=p))
        sep = '-' * (nl + sl + cl + pl + tl + 8 * 2 + 3)
if header:
header = " {{0:<{0}s}} | {{1:^{1}s}} | {{2:^{2}s}} | {{3:^{3}s}} | {{4:^{4}s}}".format(nl, sl, cl, pl, tl).format(name, "Value", "Constraint", "Prior", "Tied to")
to_print.insert(0, header)
return '\n'.format(sep).join(to_print)
pass


@@ -0,0 +1,771 @@
# Copyright (c) 2012 - 2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from scipy.special import gammaln, digamma
from ...util.linalg import pdinv
from domains import _REAL, _POSITIVE
import warnings
import weakref
class Prior(object):
domain = None
_instance = None
def __new__(cls, *args, **kwargs):
if not cls._instance or cls._instance.__class__ is not cls:
cls._instance = super(Prior, cls).__new__(cls, *args, **kwargs)
return cls._instance
def pdf(self, x):
return np.exp(self.lnpdf(x))
def plot(self):
import sys
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ...plotting.matplot_dep import priors_plots
priors_plots.univariate_plot(self)
def __repr__(self, *args, **kwargs):
return self.__str__()
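# Editor's note: the concrete priors below cache instances per parameter setting
# through weakrefs in __new__, so e.g. two calls to Gaussian(0, 1) return the
# same object.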
class Gaussian(Prior):
"""
Implementation of the univariate Gaussian probability function, coupled with random variables.
:param mu: mean
:param sigma: standard deviation
.. Note:: Bishop 2006 notation is used throughout the code
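
    Example (editor's sketch; attaching this prior to a hypothetical model parameter)::

        m.rbf.variance.set_prior(Gaussian(mu=0., sigma=2.))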
"""
domain = _REAL
_instances = []
def __new__(cls, mu=0, sigma=1): # Singleton:
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:
if instance().mu == mu and instance().sigma == sigma:
return instance()
o = super(Prior, cls).__new__(cls, mu, sigma)
cls._instances.append(weakref.ref(o))
return cls._instances[-1]()
def __init__(self, mu, sigma):
self.mu = float(mu)
self.sigma = float(sigma)
self.sigma2 = np.square(self.sigma)
self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)
def __str__(self):
return "N({:.2g}, {:.2g})".format(self.mu, self.sigma)
def lnpdf(self, x):
return self.constant - 0.5 * np.square(x - self.mu) / self.sigma2
def lnpdf_grad(self, x):
return -(x - self.mu) / self.sigma2
def rvs(self, n):
return np.random.randn(n) * self.sigma + self.mu
# def __getstate__(self):
# return self.mu, self.sigma
#
# def __setstate__(self, state):
# self.mu = state[0]
# self.sigma = state[1]
# self.sigma2 = np.square(self.sigma)
# self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)
class Uniform(Prior):
domain = _REAL
_instances = []
def __new__(cls, lower=0, upper=1): # Singleton:
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:
if instance().lower == lower and instance().upper == upper:
return instance()
o = super(Prior, cls).__new__(cls, lower, upper)
cls._instances.append(weakref.ref(o))
return cls._instances[-1]()
def __init__(self, lower, upper):
self.lower = float(lower)
self.upper = float(upper)
def __str__(self):
return "[{:.2g}, {:.2g}]".format(self.lower, self.upper)
def lnpdf(self, x):
region = (x >= self.lower) * (x <= self.upper)
return region
def lnpdf_grad(self, x):
return np.zeros(x.shape)
def rvs(self, n):
return np.random.uniform(self.lower, self.upper, size=n)
# def __getstate__(self):
# return self.lower, self.upper
#
# def __setstate__(self, state):
# self.lower = state[0]
# self.upper = state[1]
class LogGaussian(Gaussian):
"""
Implementation of the univariate *log*-Gaussian probability function, coupled with random variables.
:param mu: mean
:param sigma: standard deviation
.. Note:: Bishop 2006 notation is used throughout the code
"""
domain = _POSITIVE
_instances = []
def __new__(cls, mu=0, sigma=1): # Singleton:
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:
if instance().mu == mu and instance().sigma == sigma:
return instance()
o = super(Prior, cls).__new__(cls, mu, sigma)
cls._instances.append(weakref.ref(o))
return cls._instances[-1]()
def __init__(self, mu, sigma):
self.mu = float(mu)
self.sigma = float(sigma)
self.sigma2 = np.square(self.sigma)
self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)
def __str__(self):
return "lnN({:.2g}, {:.2g})".format(self.mu, self.sigma)
def lnpdf(self, x):
return self.constant - 0.5 * np.square(np.log(x) - self.mu) / self.sigma2 - np.log(x)
def lnpdf_grad(self, x):
return -((np.log(x) - self.mu) / self.sigma2 + 1.) / x
def rvs(self, n):
return np.exp(np.random.randn(n) * self.sigma + self.mu)
class MultivariateGaussian(Prior):
"""
Implementation of the multivariate Gaussian probability function, coupled with random variables.
:param mu: mean (N-dimensional array)
:param var: covariance matrix (NxN)
.. Note:: Bishop 2006 notation is used throughout the code
"""
domain = _REAL
_instances = []
def __new__(cls, mu=0, var=1): # Singleton:
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:
if np.all(instance().mu == mu) and np.all(instance().var == var):
return instance()
o = super(Prior, cls).__new__(cls, mu, var)
cls._instances.append(weakref.ref(o))
return cls._instances[-1]()
def __init__(self, mu, var):
self.mu = np.array(mu).flatten()
self.var = np.array(var)
assert len(self.var.shape) == 2
assert self.var.shape[0] == self.var.shape[1]
assert self.var.shape[0] == self.mu.size
self.input_dim = self.mu.size
self.inv, self.hld = pdinv(self.var)
self.constant = -0.5 * self.input_dim * np.log(2 * np.pi) - self.hld
def summary(self):
raise NotImplementedError
def pdf(self, x):
return np.exp(self.lnpdf(x))
def lnpdf(self, x):
d = x - self.mu
return self.constant - 0.5 * np.sum(d * np.dot(d, self.inv), 1)
def lnpdf_grad(self, x):
d = x - self.mu
return -np.dot(self.inv, d)
def rvs(self, n):
return np.random.multivariate_normal(self.mu, self.var, n)
def plot(self):
import sys
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ..plotting.matplot_dep import priors_plots
priors_plots.multivariate_plot(self)
def __getstate__(self):
return self.mu, self.var
def __setstate__(self, state):
self.mu = state[0]
self.var = state[1]
assert len(self.var.shape) == 2
assert self.var.shape[0] == self.var.shape[1]
assert self.var.shape[0] == self.mu.size
self.input_dim = self.mu.size
self.inv, self.hld = pdinv(self.var)
self.constant = -0.5 * self.input_dim * np.log(2 * np.pi) - self.hld
def gamma_from_EV(E, V):
warnings.warn("use Gamma.from_EV to create Gamma Prior", FutureWarning)
return Gamma.from_EV(E, V)
class Gamma(Prior):
"""
Implementation of the Gamma probability function, coupled with random variables.
:param a: shape parameter
:param b: rate parameter (warning: it's the *inverse* of the scale)
.. Note:: Bishop 2006 notation is used throughout the code
"""
domain = _POSITIVE
_instances = []
def __new__(cls, a=1, b=.5): # Singleton:
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:
if instance().a == a and instance().b == b:
return instance()
o = super(Prior, cls).__new__(cls, a, b)
cls._instances.append(weakref.ref(o))
return cls._instances[-1]()
def __init__(self, a, b):
self.a = float(a)
self.b = float(b)
self.constant = -gammaln(self.a) + a * np.log(b)
def __str__(self):
return "Ga({:.2g}, {:.2g})".format(self.a, self.b)
def summary(self):
ret = {"E[x]": self.a / self.b, \
"E[ln x]": digamma(self.a) - np.log(self.b), \
"var[x]": self.a / self.b / self.b, \
"Entropy": gammaln(self.a) - (self.a - 1.) * digamma(self.a) - np.log(self.b) + self.a}
if self.a > 1:
ret['Mode'] = (self.a - 1.) / self.b
else:
            ret['Mode'] = np.nan
return ret
def lnpdf(self, x):
return self.constant + (self.a - 1) * np.log(x) - self.b * x
def lnpdf_grad(self, x):
return (self.a - 1.) / x - self.b
def rvs(self, n):
return np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
@staticmethod
def from_EV(E, V):
"""
Creates an instance of a Gamma Prior by specifying the Expected value(s)
and Variance(s) of the distribution.
:param E: expected value
:param V: variance
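
        Worked example (editor's note): Gamma(a, b) has E[x] = a/b and
        Var[x] = a/b**2, so moment matching gives a = E**2/V and b = E/V;
        e.g. from_EV(1., 0.5) returns Ga(2, 2).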
"""
a = np.square(E) / V
b = E / V
return Gamma(a, b)
def __getstate__(self):
return self.a, self.b
def __setstate__(self, state):
self.a = state[0]
self.b = state[1]
self.constant = -gammaln(self.a) + self.a * np.log(self.b)
class InverseGamma(Gamma):
"""
Implementation of the inverse-Gamma probability function, coupled with random variables.
:param a: shape parameter
:param b: rate parameter (warning: it's the *inverse* of the scale)
.. Note:: Bishop 2006 notation is used throughout the code
"""
domain = _POSITIVE
_instances = []
def __new__(cls, a=1, b=.5): # Singleton:
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:
if instance().a == a and instance().b == b:
return instance()
o = super(Prior, cls).__new__(cls, a, b)
cls._instances.append(weakref.ref(o))
return cls._instances[-1]()
def __init__(self, a, b):
self.a = float(a)
self.b = float(b)
self.constant = -gammaln(self.a) + a * np.log(b)
def __str__(self):
return "iGa({:.2g}, {:.2g})".format(self.a, self.b)
def lnpdf(self, x):
return self.constant - (self.a + 1) * np.log(x) - self.b / x
def lnpdf_grad(self, x):
return -(self.a + 1.) / x + self.b / x ** 2
def rvs(self, n):
return 1. / np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
class DGPLVM_KFDA(Prior):
"""
    Implementation of the Discriminative Gaussian Process Latent Variable prior using
    Kernel Fisher Discriminant Analysis (Seung-Jean Kim), as used in the face
    verification paper by Chaochao Lu.
:param lambdaa: constant
:param sigma2: constant
    .. Note:: DGPLVM implementation from the 'Surpassing Human-Level Face Verification' paper
"""
domain = _REAL
# _instances = []
# def __new__(cls, lambdaa, sigma2): # Singleton:
# if cls._instances:
# cls._instances[:] = [instance for instance in cls._instances if instance()]
# for instance in cls._instances:
# if instance().mu == mu and instance().sigma == sigma:
# return instance()
# o = super(Prior, cls).__new__(cls, mu, sigma)
# cls._instances.append(weakref.ref(o))
# return cls._instances[-1]()
def __init__(self, lambdaa, sigma2, lbl, kern, x_shape):
"""A description for init"""
self.datanum = lbl.shape[0]
self.classnum = lbl.shape[1]
self.lambdaa = lambdaa
self.sigma2 = sigma2
self.lbl = lbl
self.kern = kern
lst_ni = self.compute_lst_ni()
self.a = self.compute_a(lst_ni)
self.A = self.compute_A(lst_ni)
self.x_shape = x_shape
def get_class_label(self, y):
for idx, v in enumerate(y):
if v == 1:
return idx
return -1
# This function assigns each data point to its own class
# and returns the dictionary which contains the class name and parameters.
def compute_cls(self, x):
cls = {}
# Appending each data point to its proper class
for j in xrange(self.datanum):
class_label = self.get_class_label(self.lbl[j])
if class_label not in cls:
cls[class_label] = []
cls[class_label].append(x[j])
if len(cls) > 2:
for i in range(2, self.classnum):
del cls[i]
return cls
def x_reduced(self, cls):
x1 = cls[0]
x2 = cls[1]
x = np.concatenate((x1, x2), axis=0)
return x
def compute_lst_ni(self):
lst_ni = []
lst_ni1 = []
lst_ni2 = []
f1 = (np.where(self.lbl[:, 0] == 1)[0])
f2 = (np.where(self.lbl[:, 1] == 1)[0])
for idx in f1:
lst_ni1.append(idx)
for idx in f2:
lst_ni2.append(idx)
lst_ni.append(len(lst_ni1))
lst_ni.append(len(lst_ni2))
return lst_ni
def compute_a(self, lst_ni):
a = np.ones((self.datanum, 1))
count = 0
for N_i in lst_ni:
if N_i == lst_ni[0]:
a[count:count + N_i] = (float(1) / N_i) * a[count]
count += N_i
else:
if N_i == lst_ni[1]:
a[count: count + N_i] = -(float(1) / N_i) * a[count]
count += N_i
return a
def compute_A(self, lst_ni):
A = np.zeros((self.datanum, self.datanum))
idx = 0
for N_i in lst_ni:
B = float(1) / np.sqrt(N_i) * (np.eye(N_i) - ((float(1) / N_i) * np.ones((N_i, N_i))))
A[idx:idx + N_i, idx:idx + N_i] = B
idx += N_i
return A
# Here log function
def lnpdf(self, x):
x = x.reshape(self.x_shape)
K = self.kern.K(x)
a_trans = np.transpose(self.a)
paran = self.lambdaa * np.eye(x.shape[0]) + self.A.dot(K).dot(self.A)
inv_part = pdinv(paran)[0]
J = a_trans.dot(K).dot(self.a) - a_trans.dot(K).dot(self.A).dot(inv_part).dot(self.A).dot(K).dot(self.a)
J_star = (1. / self.lambdaa) * J
return (-1. / self.sigma2) * J_star
# Here gradient function
def lnpdf_grad(self, x):
x = x.reshape(self.x_shape)
K = self.kern.K(x)
paran = self.lambdaa * np.eye(x.shape[0]) + self.A.dot(K).dot(self.A)
inv_part = pdinv(paran)[0]
b = self.A.dot(inv_part).dot(self.A).dot(K).dot(self.a)
a_Minus_b = self.a - b
a_b_trans = np.transpose(a_Minus_b)
DJ_star_DK = (1. / self.lambdaa) * (a_Minus_b.dot(a_b_trans))
DJ_star_DX = self.kern.gradients_X(DJ_star_DK, x)
return (-1. / self.sigma2) * DJ_star_DX
def rvs(self, n):
return np.random.rand(n) # A WRONG implementation
def __str__(self):
return 'DGPLVM_prior'
def __getstate___(self):
return self.lbl, self.lambdaa, self.sigma2, self.kern, self.x_shape
def __setstate__(self, state):
lbl, lambdaa, sigma2, kern, a, A, x_shape = state
self.datanum = lbl.shape[0]
self.classnum = lbl.shape[1]
self.lambdaa = lambdaa
self.sigma2 = sigma2
self.lbl = lbl
self.kern = kern
lst_ni = self.compute_lst_ni()
self.a = self.compute_a(lst_ni)
self.A = self.compute_A(lst_ni)
self.x_shape = x_shape
class DGPLVM(Prior):
"""
    Implementation of the Discriminative Gaussian Process Latent Variable (DGPLVM) model prior, following the paper by Raquel Urtasun.
:param sigma2: constant
.. Note:: DGPLVM for Classification paper implementation
"""
domain = _REAL
# _instances = []
# def __new__(cls, mu, sigma): # Singleton:
# if cls._instances:
# cls._instances[:] = [instance for instance in cls._instances if instance()]
# for instance in cls._instances:
# if instance().mu == mu and instance().sigma == sigma:
# return instance()
# o = super(Prior, cls).__new__(cls, mu, sigma)
# cls._instances.append(weakref.ref(o))
# return cls._instances[-1]()
def __init__(self, sigma2, lbl, x_shape):
self.sigma2 = sigma2
# self.x = x
self.lbl = lbl
self.classnum = lbl.shape[1]
self.datanum = lbl.shape[0]
self.x_shape = x_shape
self.dim = x_shape[1]
def get_class_label(self, y):
for idx, v in enumerate(y):
if v == 1:
return idx
return -1
# This function assigns each data point to its own class
# and returns the dictionary which contains the class name and parameters.
def compute_cls(self, x):
cls = {}
# Appending each data point to its proper class
for j in xrange(self.datanum):
class_label = self.get_class_label(self.lbl[j])
if class_label not in cls:
cls[class_label] = []
cls[class_label].append(x[j])
return cls
# This function computes mean of each class. The mean is calculated through each dimension
def compute_Mi(self, cls):
M_i = np.zeros((self.classnum, self.dim))
for i in cls:
# Mean of each class
M_i[i] = np.mean(cls[i], axis=0)
return M_i
# Adding data points as tuple to the dictionary so that we can access indices
def compute_indices(self, x):
data_idx = {}
for j in xrange(self.datanum):
class_label = self.get_class_label(self.lbl[j])
if class_label not in data_idx:
data_idx[class_label] = []
t = (j, x[j])
data_idx[class_label].append(t)
return data_idx
# Adding indices to the list so we can access whole the indices
def compute_listIndices(self, data_idx):
lst_idx = []
lst_idx_all = []
for i in data_idx:
if len(lst_idx) == 0:
pass
                # Do nothing: this is the first time through, so the list is still empty
else:
lst_idx = []
# Here we put indices of each class in to the list called lst_idx_all
for m in xrange(len(data_idx[i])):
lst_idx.append(data_idx[i][m][0])
lst_idx_all.append(lst_idx)
return lst_idx_all
# This function calculates between classes variances
def compute_Sb(self, cls, M_i, M_0):
Sb = np.zeros((self.dim, self.dim))
for i in cls:
B = (M_i[i] - M_0).reshape(self.dim, 1)
B_trans = B.transpose()
Sb += (float(len(cls[i])) / self.datanum) * B.dot(B_trans)
return Sb
# This function calculates within classes variances
def compute_Sw(self, cls, M_i):
Sw = np.zeros((self.dim, self.dim))
for i in cls:
N_i = float(len(cls[i]))
W_WT = np.zeros((self.dim, self.dim))
for xk in cls[i]:
W = (xk - M_i[i])
W_WT += np.outer(W, W)
Sw += (N_i / self.datanum) * ((1. / N_i) * W_WT)
return Sw
# Calculating beta and Bi for Sb
def compute_sig_beta_Bi(self, data_idx, M_i, M_0, lst_idx_all):
# import pdb
# pdb.set_trace()
B_i = np.zeros((self.classnum, self.dim))
Sig_beta_B_i_all = np.zeros((self.datanum, self.dim))
for i in data_idx:
# pdb.set_trace()
# Calculating Bi
B_i[i] = (M_i[i] - M_0).reshape(1, self.dim)
for k in xrange(self.datanum):
for i in data_idx:
N_i = float(len(data_idx[i]))
if k in lst_idx_all[i]:
beta = (float(1) / N_i) - (float(1) / self.datanum)
Sig_beta_B_i_all[k] += float(N_i) / self.datanum * (beta * B_i[i])
else:
beta = -(float(1) / self.datanum)
Sig_beta_B_i_all[k] += float(N_i) / self.datanum * (beta * B_i[i])
Sig_beta_B_i_all = Sig_beta_B_i_all.transpose()
return Sig_beta_B_i_all
# Calculating W_j s separately so we can access all the W_j s anytime
def compute_wj(self, data_idx, M_i):
W_i = np.zeros((self.datanum, self.dim))
for i in data_idx:
N_i = float(len(data_idx[i]))
for tpl in data_idx[i]:
xj = tpl[1]
j = tpl[0]
W_i[j] = (xj - M_i[i])
return W_i
# Calculating alpha and Wj for Sw
def compute_sig_alpha_W(self, data_idx, lst_idx_all, W_i):
Sig_alpha_W_i = np.zeros((self.datanum, self.dim))
for i in data_idx:
N_i = float(len(data_idx[i]))
for tpl in data_idx[i]:
k = tpl[0]
for j in lst_idx_all[i]:
if k == j:
alpha = 1 - (float(1) / N_i)
Sig_alpha_W_i[k] += (alpha * W_i[j])
else:
alpha = 0 - (float(1) / N_i)
Sig_alpha_W_i[k] += (alpha * W_i[j])
Sig_alpha_W_i = (1. / self.datanum) * np.transpose(Sig_alpha_W_i)
return Sig_alpha_W_i
# This function calculates log of our prior
def lnpdf(self, x):
x = x.reshape(self.x_shape)
cls = self.compute_cls(x)
M_0 = np.mean(x, axis=0)
M_i = self.compute_Mi(cls)
Sb = self.compute_Sb(cls, M_i, M_0)
Sw = self.compute_Sw(cls, M_i)
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
# This function calculates derivative of the log of prior function
def lnpdf_grad(self, x):
x = x.reshape(self.x_shape)
cls = self.compute_cls(x)
M_0 = np.mean(x, axis=0)
M_i = self.compute_Mi(cls)
Sb = self.compute_Sb(cls, M_i, M_0)
Sw = self.compute_Sw(cls, M_i)
data_idx = self.compute_indices(x)
lst_idx_all = self.compute_listIndices(data_idx)
Sig_beta_B_i_all = self.compute_sig_beta_Bi(data_idx, M_i, M_0, lst_idx_all)
W_i = self.compute_wj(data_idx, M_i)
Sig_alpha_W_i = self.compute_sig_alpha_W(data_idx, lst_idx_all, W_i)
# Calculating inverse of Sb and its transpose and minus
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
# Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
Sb_inv_N_trans = np.transpose(Sb_inv_N)
Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
Sw_trans = np.transpose(Sw)
# Calculating DJ/DXk
DJ_Dxk = 2 * (
Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all) + Sb_inv_N_trans.dot(
Sig_alpha_W_i))
# Calculating derivative of the log of the prior
DPx_Dx = ((-1 / self.sigma2) * DJ_Dxk)
return DPx_Dx.T
# def frb(self, x):
# from functools import partial
# from GPy.models import GradientChecker
# f = partial(self.lnpdf)
# df = partial(self.lnpdf_grad)
# grad = GradientChecker(f, df, x, 'X')
# grad.checkgrad(verbose=1)
def rvs(self, n):
return np.random.rand(n) # A WRONG implementation
def __str__(self):
return 'DGPLVM_prior'
class HalfT(Prior):
"""
Implementation of the half student t probability function, coupled with random variables.
:param A: scale parameter
:param nu: degrees of freedom
"""
domain = _POSITIVE
_instances = []
def __new__(cls, A, nu): # Singleton:
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:
if instance().A == A and instance().nu == nu:
return instance()
o = super(Prior, cls).__new__(cls, A, nu)
cls._instances.append(weakref.ref(o))
return cls._instances[-1]()
def __init__(self, A, nu):
self.A = float(A)
self.nu = float(nu)
self.constant = gammaln(.5*(self.nu+1.)) - gammaln(.5*self.nu) - .5*np.log(np.pi*self.A*self.nu)
def __str__(self):
return "hT({:.2g}, {:.2g})".format(self.A, self.nu)
def lnpdf(self,theta):
return (theta>0) * ( self.constant -.5*(self.nu+1) * np.log( 1.+ (1./self.nu) * (theta/self.A)**2 ) )
#theta = theta if isinstance(theta,np.ndarray) else np.array([theta])
#lnpdfs = np.zeros_like(theta)
#theta = np.array([theta])
#above_zero = theta.flatten()>1e-6
#v = self.nu
#sigma2=self.A
#stop
#lnpdfs[above_zero] = (+ gammaln((v + 1) * 0.5)
# - gammaln(v * 0.5)
# - 0.5*np.log(sigma2 * v * np.pi)
# - 0.5*(v + 1)*np.log(1 + (1/np.float(v))*((theta[above_zero][0]**2)/sigma2))
#)
#return lnpdfs
def lnpdf_grad(self,theta):
theta = theta if isinstance(theta,np.ndarray) else np.array([theta])
grad = np.zeros_like(theta)
above_zero = theta>1e-6
v = self.nu
sigma2=self.A
grad[above_zero] = -0.5*(v+1)*(2*theta[above_zero])/(v*sigma2 + theta[above_zero][0]**2)
return grad
def rvs(self, n):
#return np.random.randn(n) * self.sigma + self.mu
from scipy.stats import t
#[np.abs(x) for x in t.rvs(df=4,loc=0,scale=50, size=10000)])
ret = t.rvs(self.nu,loc=0,scale=self.A, size=n)
ret[ret<0] = 0
return ret


@@ -0,0 +1,225 @@
# Copyright (c) 2014, James Hensman, Max Zwiessele
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from parameterized import Parameterized
from param import Param
class Remapping(Parameterized):
def mapping(self):
"""
The return value of this function gives the values which the re-mapped
parameters should take. Implement in sub-classes.
"""
raise NotImplementedError
def callback(self):
raise NotImplementedError
def __str__(self):
return self.name
def parameters_changed(self):
#ensure all out parameters have the correct value, as specified by our mapping
index = self._highest_parent_.constraints[self]
self._highest_parent_.param_array[index] = self.mapping()
[p.notify_observers(which=self) for p in self.tied_parameters]
class Fix(Remapping):
pass
class Tie(Parameterized):
"""
The new parameter tie framework. (under development)
All the parameters tied together get a new parameter inside the *Tie* object.
Its value should always be equal to all the tied parameters, and its gradient
is the sum of all the tied parameters.
=====Implementation Details=====
The *Tie* object should only exist on the top of param tree (the highest parent).
self.label_buf:
It uses a label buffer that has the same length as all the parameters (self._highest_parent_.param_array).
        The buffer keeps track of all the tied parameters. All the tied parameters have a label (an integer) higher
than 0, and the parameters that have the same label are tied together.
self.buf_index:
An auxiliary index list for the global index of the tie parameter inside the *Tie* object.
================================
TODO:
* EVERYTHING
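
    For instance (editor's note): entries of label_buf that share the same
    positive label are tied together, e.g. label_buf = [1, 0, 1] ties the
    first and third entries while the second (label 0) stays untied.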
"""
def __init__(self, name='tie'):
super(Tie, self).__init__(name)
self.tied_param = None
# The buffer keeps track of tie status
self.label_buf = None
# The global indices of the 'tied' param
self.buf_idx = None
# A boolean array indicating non-tied parameters
self._tie_ = None
def getTieFlag(self, p=None):
if self.tied_param is None:
if self._tie_ is None or self._tie_.size != self._highest_parent_.param_array.size:
self._tie_ = np.ones((self._highest_parent_.param_array.size,),dtype=np.bool)
if p is not None:
return self._tie_[p._highest_parent_._raveled_index_for(p)]
return self._tie_
def _init_labelBuf(self):
if self.label_buf is None:
self.label_buf = np.zeros(self._highest_parent_.param_array.shape, dtype=np.int)
if self._tie_ is None or self._tie_.size != self._highest_parent_.param_array.size:
self._tie_ = np.ones((self._highest_parent_.param_array.size,),dtype=np.bool)
def _updateTieFlag(self):
if self._tie_.size != self.label_buf.size:
self._tie_ = np.ones((self._highest_parent_.param_array.size,),dtype=np.bool)
self._tie_[self.label_buf>0] = False
self._tie_[self.buf_idx] = True
def add_tied_parameter(self, p, p2=None):
"""
Tie the list of parameters p together (p2==None) or
Tie the list of parameters p with the list of parameters p2 (p2!=None)
"""
self._init_labelBuf()
if p2 is None:
idx = self._highest_parent_._raveled_index_for(p)
val = self._sync_val_group(idx)
if np.all(self.label_buf[idx]==0):
# None of p has been tied before.
tie_idx = self._expandTieParam(1)
print tie_idx
tie_id = self.label_buf.max()+1
self.label_buf[tie_idx] = tie_id
else:
b = self.label_buf[idx]
ids = np.unique(b[b>0])
tie_id, tie_idx = self._merge_tie_param(ids)
self._highest_parent_.param_array[tie_idx] = val
idx = self._highest_parent_._raveled_index_for(p)
self.label_buf[idx] = tie_id
else:
pass
self._updateTieFlag()
def _merge_tie_param(self, ids):
"""Merge the tie parameters with ids in the list."""
if len(ids)==1:
id_final_idx = self.buf_idx[self.label_buf[self.buf_idx]==ids[0]][0]
return ids[0],id_final_idx
id_final = ids[0]
ids_rm = ids[1:]
label_buf_param = self.label_buf[self.buf_idx]
idx_param = [np.where(label_buf_param==i)[0][0] for i in ids_rm]
self._removeTieParam(idx_param)
[np.put(self.label_buf, np.where(self.label_buf==i), id_final) for i in ids_rm]
id_final_idx = self.buf_idx[self.label_buf[self.buf_idx]==id_final][0]
return id_final, id_final_idx
def _sync_val_group(self, idx):
self._highest_parent_.param_array[idx] = self._highest_parent_.param_array[idx].mean()
return self._highest_parent_.param_array[idx][0]
def _expandTieParam(self, num):
"""Expand the tie param with the number of *num* parameters"""
if self.tied_param is None:
new_buf = np.empty((num,))
else:
new_buf = np.empty((self.tied_param.size+num,))
new_buf[:self.tied_param.size] = self.tied_param.param_array.copy()
            self.unlink_parameter(self.tied_param)
self.tied_param = Param('tied',new_buf)
        self.link_parameter(self.tied_param)
buf_idx_new = self._highest_parent_._raveled_index_for(self.tied_param)
self._expand_label_buf(self.buf_idx, buf_idx_new)
self.buf_idx = buf_idx_new
return self.buf_idx[-num:]
def _removeTieParam(self, idx):
"""idx within tied_param"""
new_buf = np.empty((self.tied_param.size-len(idx),))
bool_list = np.ones((self.tied_param.size,),dtype=np.bool)
bool_list[idx] = False
new_buf[:] = self.tied_param.param_array[bool_list]
        self.unlink_parameter(self.tied_param)
self.tied_param = Param('tied',new_buf)
        self.link_parameter(self.tied_param)
buf_idx_new = self._highest_parent_._raveled_index_for(self.tied_param)
self._shrink_label_buf(self.buf_idx, buf_idx_new, bool_list)
self.buf_idx = buf_idx_new
def _expand_label_buf(self, idx_old, idx_new):
"""Expand label buffer accordingly"""
if idx_old is None:
self.label_buf = np.zeros(self._highest_parent_.param_array.shape, dtype=np.int)
else:
bool_old = np.zeros((self.label_buf.size,),dtype=np.bool)
bool_old[idx_old] = True
bool_new = np.zeros((self._highest_parent_.param_array.size,),dtype=np.bool)
bool_new[idx_new] = True
label_buf_new = np.zeros(self._highest_parent_.param_array.shape, dtype=np.int)
label_buf_new[np.logical_not(bool_new)] = self.label_buf[np.logical_not(bool_old)]
label_buf_new[idx_new[:len(idx_old)]] = self.label_buf[idx_old]
self.label_buf = label_buf_new
def _shrink_label_buf(self, idx_old, idx_new, bool_list):
bool_old = np.zeros((self.label_buf.size,),dtype=np.bool)
bool_old[idx_old] = True
bool_new = np.zeros((self._highest_parent_.param_array.size,),dtype=np.bool)
bool_new[idx_new] = True
label_buf_new = np.empty(self._highest_parent_.param_array.shape, dtype=np.int)
label_buf_new[np.logical_not(bool_new)] = self.label_buf[np.logical_not(bool_old)]
label_buf_new[idx_new] = self.label_buf[idx_old[bool_list]]
self.label_buf = label_buf_new
def _check_change(self):
changed = False
if self.tied_param is not None:
for i in xrange(self.tied_param.size):
b0 = self.label_buf==self.label_buf[self.buf_idx[i]]
b = self._highest_parent_.param_array[b0]!=self.tied_param[i]
if b.sum()==0:
print 'XXX'
continue
elif b.sum()==1:
print '!!!'
val = self._highest_parent_.param_array[b0][b][0]
self._highest_parent_.param_array[b0] = val
else:
print '@@@'
self._highest_parent_.param_array[b0] = self.tied_param[i]
changed = True
return changed
def parameters_changed(self):
#ensure all out parameters have the correct value, as specified by our mapping
changed = self._check_change()
if changed:
self._highest_parent_._trigger_params_changed()
self.collate_gradient()
def collate_gradient(self):
if self.tied_param is not None:
self.tied_param.gradient = 0.
[np.put(self.tied_param.gradient, i, self._highest_parent_.gradient[self.label_buf==self.label_buf[self.buf_idx[i]]].sum())
for i in xrange(self.tied_param.size)]
def propagate_val(self):
if self.tied_param is not None:
for i in xrange(self.tied_param.size):
self._highest_parent_.param_array[self.label_buf==self.label_buf[self.buf_idx[i]]] = self.tied_param[i]


@@ -0,0 +1,433 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from domains import _POSITIVE,_NEGATIVE, _BOUNDED
import weakref
import sys
_exp_lim_val = np.finfo(np.float64).max
_lim_val = 36.0
epsilon = np.finfo(np.float64).resolution
#===============================================================================
# Fixing constants
__fixed__ = "fixed"
FIXED = False
UNFIXED = True
#===============================================================================
class Transformation(object):
domain = None
_instance = None
def __new__(cls, *args, **kwargs):
if not cls._instance or cls._instance.__class__ is not cls:
cls._instance = super(Transformation, cls).__new__(cls, *args, **kwargs)
return cls._instance
def f(self, opt_param):
raise NotImplementedError
def finv(self, model_param):
raise NotImplementedError
def gradfactor(self, model_param, dL_dmodel_param):
""" df(opt_param)_dopt_param evaluated at self.f(opt_param)=model_param, times the gradient dL_dmodel_param,
i.e.:
define
.. math::
\frac{\partial L}{\partial f}\left.\frac{\partial f(x)}{\partial x}\right|_{x=f^{-1}(f)}
"""
raise NotImplementedError
def initialize(self, f):
""" produce a sensible initial value for f(x)"""
raise NotImplementedError
def plot(self, xlabel=r'transformed $\theta$', ylabel=r'$\theta$', axes=None, *args,**kw):
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
import matplotlib.pyplot as plt
from ...plotting.matplot_dep import base_plots
x = np.linspace(-8,8)
base_plots.meanplot(x, self.f(x), axes=axes, *args, **kw)
axes = plt.gca()
axes.set_xlabel(xlabel)
axes.set_ylabel(ylabel)
def __str__(self):
raise NotImplementedError
def __repr__(self):
return self.__class__.__name__
class Logexp(Transformation):
domain = _POSITIVE
def f(self, x):
return np.where(x>_lim_val, x, np.log1p(np.exp(np.clip(x, -_lim_val, _lim_val)))) + epsilon
#raises overflow warning: return np.where(x>_lim_val, x, np.log(1. + np.exp(x)))
def finv(self, f):
return np.where(f>_lim_val, f, np.log(np.exp(f+1e-20) - 1.))
def gradfactor(self, f, df):
return np.einsum('i,i->i', df, np.where(f>_lim_val, 1., 1. - np.exp(-f)))
def initialize(self, f):
if np.any(f < 0.):
print "Warning: changing parameters to satisfy constraints"
return np.abs(f)
def __str__(self):
return '+ve'
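A quick sanity check of the softplus transform above, written as a hedged plain-numpy sketch rather than through the Logexp class (the clipping and epsilon terms are ignored): f and finv should invert each other, and the factor 1 - exp(-f) used in gradfactor should equal the sigmoid of the untransformed value.
import numpy as np

x = np.array([-3.0, 0.0, 4.0])             # unconstrained optimiser values
f = np.log1p(np.exp(x))                    # softplus, as in Logexp.f
x_back = np.log(np.expm1(f))               # inverse, as in Logexp.finv
grad_factor = 1. - np.exp(-f)              # d softplus / dx, written in terms of f
print(np.allclose(x, x_back))                            # True
print(np.allclose(grad_factor, 1. / (1. + np.exp(-x))))  # True: equals sigmoid(x)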
class NormalTheta(Transformation):
_instances = []
def __new__(cls, mu_indices, var_indices):
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:
if np.all(instance().mu_indices==mu_indices, keepdims=False) and np.all(instance().var_indices==var_indices, keepdims=False):
return instance()
o = super(Transformation, cls).__new__(cls, mu_indices, var_indices)
cls._instances.append(weakref.ref(o))
return cls._instances[-1]()
def __init__(self, mu_indices, var_indices):
self.mu_indices = mu_indices
self.var_indices = var_indices
def f(self, theta):
# In here abs is only a trick to make sure the numerics are ok.
# The variance will never go below zero, but at initialization we need to make sure
# that the values are ok
# Before:
theta[self.var_indices] = np.abs(-.5/theta[self.var_indices])
theta[self.mu_indices] *= theta[self.var_indices]
return theta # which is now {mu, var}
def finv(self, muvar):
# before:
varp = muvar[self.var_indices]
muvar[self.mu_indices] /= varp
muvar[self.var_indices] = -.5/varp
return muvar # which is now {theta1, theta2}
def gradfactor(self, muvar, dmuvar):
mu = muvar[self.mu_indices]
var = muvar[self.var_indices]
#=======================================================================
# theta gradients
# This works and the gradient checks!
dmuvar[self.mu_indices] *= var
dmuvar[self.var_indices] *= 2*(var)**2
dmuvar[self.var_indices] += 2*dmuvar[self.mu_indices]*mu
#=======================================================================
return dmuvar # which is now the gradient multiplicator for {theta1, theta2}
def initialize(self, f):
if np.any(f[self.var_indices] < 0.):
print "Warning: changing parameters to satisfy constraints"
f[self.var_indices] = np.abs(f[self.var_indices])
return f
def __str__(self):
return "theta"
def __getstate__(self):
return [self.mu_indices, self.var_indices]
def __setstate__(self, state):
self.mu_indices = state[0]
self.var_indices = state[1]
class NormalNaturalAntti(NormalTheta):
_instances = []
_logexp = Logexp()
def __new__(cls, mu_indices, var_indices):
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:
if np.all(instance().mu_indices==mu_indices, keepdims=False) and np.all(instance().var_indices==var_indices, keepdims=False):
return instance()
o = super(Transformation, cls).__new__(cls, mu_indices, var_indices)
cls._instances.append(weakref.ref(o))
return cls._instances[-1]()
def __init__(self, mu_indices, var_indices):
self.mu_indices = mu_indices
self.var_indices = var_indices
def gradfactor(self, muvar, dmuvar):
mu = muvar[self.mu_indices]
var = muvar[self.var_indices]
#=======================================================================
# theta gradients
# This works and the gradient checks!
dmuvar[self.mu_indices] *= var
dmuvar[self.var_indices] *= 2*var**2#np.einsum('i,i,i,i->i', dmuvar[self.var_indices], [2], var, var)
#=======================================================================
return dmuvar # which is now the gradient multiplicator
def initialize(self, f):
if np.any(f[self.var_indices] < 0.):
print "Warning: changing parameters to satisfy constraints"
f[self.var_indices] = np.abs(f[self.var_indices])
return f
def __str__(self):
return "natantti"
class NormalEta(Transformation):
_instances = []
def __new__(cls, mu_indices, var_indices):
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:
if np.all(instance().mu_indices==mu_indices, keepdims=False) and np.all(instance().var_indices==var_indices, keepdims=False):
return instance()
o = super(Transformation, cls).__new__(cls, mu_indices, var_indices)
cls._instances.append(weakref.ref(o))
return cls._instances[-1]()
def __init__(self, mu_indices, var_indices):
self.mu_indices = mu_indices
self.var_indices = var_indices
def f(self, theta):
theta[self.var_indices] = np.abs(theta[self.var_indices] - theta[self.mu_indices]**2)
return theta # which is now {mu, var}
def finv(self, muvar):
muvar[self.var_indices] += muvar[self.mu_indices]**2
return muvar # which is now {eta1, eta2}
def gradfactor(self, muvar, dmuvar):
mu = muvar[self.mu_indices]
#=======================================================================
# Lets try natural gradients instead: Not working with bfgs... try stochastic!
dmuvar[self.mu_indices] -= 2*mu*dmuvar[self.var_indices]
#=======================================================================
return dmuvar # which is now the gradient multiplicator
def initialize(self, f):
if np.any(f[self.var_indices] < 0.):
print "Warning: changing parameters to satisfy constraints"
f[self.var_indices] = np.abs(f[self.var_indices])
return f
def __str__(self):
return "eta"
class NormalNaturalThroughTheta(NormalTheta):
_instances = []
def __new__(cls, mu_indices, var_indices):
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:
if np.all(instance().mu_indices==mu_indices, keepdims=False) and np.all(instance().var_indices==var_indices, keepdims=False):
return instance()
o = super(Transformation, cls).__new__(cls, mu_indices, var_indices)
cls._instances.append(weakref.ref(o))
return cls._instances[-1]()
def __init__(self, mu_indices, var_indices):
self.mu_indices = mu_indices
self.var_indices = var_indices
def gradfactor(self, muvar, dmuvar):
mu = muvar[self.mu_indices]
var = muvar[self.var_indices]
#=======================================================================
# This is just eta direction:
dmuvar[self.mu_indices] -= 2*mu*dmuvar[self.var_indices]
#=======================================================================
#=======================================================================
# This is by going through theta fully and then going into eta direction:
#dmu = dmuvar[self.mu_indices]
#dmuvar[self.var_indices] += dmu*mu*(var + 4/var)
#=======================================================================
return dmuvar # which is now the gradient multiplicator
def __str__(self):
return "natgrad"
class NormalNaturalThroughEta(NormalEta):
_instances = []
def __new__(cls, mu_indices, var_indices):
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:
if np.all(instance().mu_indices==mu_indices, keepdims=False) and np.all(instance().var_indices==var_indices, keepdims=False):
return instance()
o = super(Transformation, cls).__new__(cls, mu_indices, var_indices)
cls._instances.append(weakref.ref(o))
return cls._instances[-1]()
def __init__(self, mu_indices, var_indices):
self.mu_indices = mu_indices
self.var_indices = var_indices
def gradfactor(self, muvar, dmuvar):
mu = muvar[self.mu_indices]
var = muvar[self.var_indices]
#=======================================================================
# theta gradients
# This works and the gradient checks!
dmuvar[self.mu_indices] *= var
dmuvar[self.var_indices] *= 2*(var)**2
dmuvar[self.var_indices] += 2*dmuvar[self.mu_indices]*mu
#=======================================================================
return dmuvar
def __str__(self):
return "natgrad"
class LogexpNeg(Transformation):
domain = _POSITIVE
def f(self, x):
return np.where(x>_lim_val, -x, -np.log(1. + np.exp(np.clip(x, -np.inf, _lim_val))))
#raises overflow warning: return np.where(x>_lim_val, x, np.log(1. + np.exp(x)))
def finv(self, f):
return np.where(f>_lim_val, 0, np.log(np.exp(-f) - 1.))
def gradfactor(self, f, df):
return np.einsum('i,i->i', df, np.where(f>_lim_val, -1, -1 + np.exp(-f)))
def initialize(self, f):
if np.any(f < 0.):
print "Warning: changing parameters to satisfy constraints"
return np.abs(f)
def __str__(self):
return '+ve'
class NegativeLogexp(Transformation):
domain = _NEGATIVE
logexp = Logexp()
def f(self, x):
return -self.logexp.f(x) # np.log(1. + np.exp(x))
def finv(self, f):
return self.logexp.finv(-f) # np.log(np.exp(-f) - 1.)
def gradfactor(self, f, df):
return np.einsum('i,i->i', df, -self.logexp.gradfactor(-f))
def initialize(self, f):
return -self.logexp.initialize(f) # np.abs(f)
def __str__(self):
return '-ve'
class LogexpClipped(Logexp):
max_bound = 1e100
min_bound = 1e-10
log_max_bound = np.log(max_bound)
log_min_bound = np.log(min_bound)
domain = _POSITIVE
_instances = []
def __new__(cls, lower=1e-6, *args, **kwargs):
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:
if instance().lower == lower:
return instance()
o = super(Transformation, cls).__new__(cls, lower, *args, **kwargs)
cls._instances.append(weakref.ref(o))
return cls._instances[-1]()
def __init__(self, lower=1e-6):
self.lower = lower
def f(self, x):
exp = np.exp(np.clip(x, self.log_min_bound, self.log_max_bound))
f = np.log(1. + exp)
# if np.isnan(f).any():
# import ipdb;ipdb.set_trace()
return np.clip(f, self.min_bound, self.max_bound)
def finv(self, f):
return np.log(np.exp(f) - 1.)
def gradfactor(self, f, df):
ef = np.exp(f) # np.clip(f, self.min_bound, self.max_bound))
gf = (ef - 1.) / ef
return np.einsum('i,i->i', df, gf) # np.where(f < self.lower, 0, gf)
def initialize(self, f):
if np.any(f < 0.):
print "Warning: changing parameters to satisfy constraints"
return np.abs(f)
def __str__(self):
return '+ve_c'
class Exponent(Transformation):
# TODO: can't allow this to go to zero, need to set a lower bound. Similar with negative Exponent below. See old MATLAB code.
domain = _POSITIVE
def f(self, x):
return np.where(x<_lim_val, np.where(x>-_lim_val, np.exp(x), np.exp(-_lim_val)), np.exp(_lim_val))
def finv(self, x):
return np.log(x)
def gradfactor(self, f, df):
return np.einsum('i,i->i', df, f)
def initialize(self, f):
if np.any(f < 0.):
print "Warning: changing parameters to satisfy constraints"
return np.abs(f)
def __str__(self):
return '+ve'
class NegativeExponent(Exponent):
domain = _NEGATIVE
def f(self, x):
return -Exponent.f(self, x)
def finv(self, f):
return Exponent.finv(self, -f)
def gradfactor(self, f, df):
return np.einsum('i,i->i', df, f)
def initialize(self, f):
return -Exponent.initialize(self, f) #np.abs(f)
def __str__(self):
return '-ve'
class Square(Transformation):
domain = _POSITIVE
def f(self, x):
return x ** 2
def finv(self, x):
return np.sqrt(x)
def gradfactor(self, f, df):
return np.einsum('i,i->i', df, 2 * np.sqrt(f))
def initialize(self, f):
return np.abs(f)
def __str__(self):
return '+sq'
class Logistic(Transformation):
domain = _BOUNDED
_instances = []
def __new__(cls, lower=1e-6, upper=1e-6, *args, **kwargs):
if cls._instances:
cls._instances[:] = [instance for instance in cls._instances if instance()]
for instance in cls._instances:
if instance().lower == lower and instance().upper == upper:
return instance()
o = super(Transformation, cls).__new__(cls, lower, upper, *args, **kwargs)
cls._instances.append(weakref.ref(o))
return cls._instances[-1]()
def __init__(self, lower, upper):
assert lower < upper
self.lower, self.upper = float(lower), float(upper)
self.difference = self.upper - self.lower
def f(self, x):
if (x<-300.).any():
x = x.copy()
x[x<-300.] = -300.
return self.lower + self.difference / (1. + np.exp(-x))
def finv(self, f):
return np.log(np.clip(f - self.lower, 1e-10, np.inf) / np.clip(self.upper - f, 1e-10, np.inf))
def gradfactor(self, f, df):
return np.einsum('i,i->i', df, (f - self.lower) * (self.upper - f) / self.difference)
def initialize(self, f):
if np.any(np.logical_or(f < self.lower, f > self.upper)):
print "Warning: changing parameters to satisfy constraints"
#return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(f * 0.), f)
#FIXME: Max, zeros_like right?
return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(np.zeros_like(f)), f)
def __str__(self):
return '{},{}'.format(self.lower, self.upper)
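A short usage sketch for the bounded transform above, again in plain numpy with assumed bounds 0 and 10 (illustrative values only): the unconstrained value is squashed into (lower, upper) and finv recovers it.
import numpy as np

lower, upper = 0., 10.
x = np.array([-2.0, 0.0, 3.0])                    # unconstrained optimiser values
f = lower + (upper - lower) / (1. + np.exp(-x))   # as in Logistic.f
x_back = np.log((f - lower) / (upper - f))        # as in Logistic.finv
print((f > lower).all() and (f < upper).all())    # True
print(np.allclose(x, x_back))                     # True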

View file

@@ -0,0 +1,63 @@
'''
Created on 11 Nov 2014
@author: maxz
'''
from observable import Observable
class Updateable(Observable):
"""
A model can be updated or not.
Make sure updates can be switched on and off.
"""
_updates = True
def __init__(self, *args, **kwargs):
super(Updateable, self).__init__(*args, **kwargs)
@property
def updates(self):
raise DeprecationWarning("updates is now a function, see update(True|False|None)")
@updates.setter
def updates(self, ups):
raise DeprecationWarning("updates is now a function, see update(True|False|None)")
def update_model(self, updates=None):
"""
Get or set whether automatic updates are performed. When updates are
off, the model might be in a non-working state. To make the model work
turn updates on again.
:param bool|None updates:
bool: whether to do updates
None: get the current update state
"""
if updates is None:
p = getattr(self, '_highest_parent_', None)
if p is not None:
self._updates = p._updates
return self._updates
assert isinstance(updates, bool), "updates are either on (True) or off (False)"
p = getattr(self, '_highest_parent_', None)
if p is not None:
p._updates = updates
self._updates = updates
self.trigger_update()
def toggle_update(self):
self.update_model(not self.update_model())
def trigger_update(self, trigger_parent=True):
"""
Update the model from the current state.
Make sure that updates are on, otherwise this
method will do nothing
:param bool trigger_parent: Whether to trigger the parent, after self has updated
"""
if not self.update_model() or (hasattr(self, "_in_init_") and self._in_init_):
#print "Warning: updates are off, updating the model will do nothing"
return
self._trigger_params_changed(trigger_parent)
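A hedged sketch of the intended on/off protocol, using a made-up Toy class instead of the real Updateable mixin (which needs the Observable machinery): switching updates off lets many parameters be edited cheaply, and switching them back on triggers a single recomputation.
class Toy(object):
    # illustrative stand-in, not a GPy class
    _updates = True
    def update_model(self, updates=None):
        if updates is None:
            return self._updates            # query the current state
        self._updates = bool(updates)
        self.trigger_update()
    def trigger_update(self):
        if not self.update_model():
            return                          # updates are off: do nothing
        print("recomputing expensive quantities")

m = Toy()
m.update_model(False)   # edit many parameters without triggering recomputation
m.update_model(True)    # switch back on; one recomputation happens here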

View file

@@ -0,0 +1,220 @@
'''
Created on 6 Nov 2013
@author: maxz
'''
import numpy as np
from parameterized import Parameterized
from param import Param
from transformations import Logexp, Logistic,__fixed__
from GPy.util.misc import param_to_array
from GPy.util.caching import Cache_this
class VariationalPrior(Parameterized):
def __init__(self, name='latent space', **kw):
super(VariationalPrior, self).__init__(name=name, **kw)
def KL_divergence(self, variational_posterior):
raise NotImplementedError, "override this for variational inference of latent space"
def update_gradients_KL(self, variational_posterior):
"""
updates the gradients for mean and variance **in place**
"""
raise NotImplementedError, "override this for variational inference of latent space"
class NormalPrior(VariationalPrior):
def KL_divergence(self, variational_posterior):
var_mean = np.square(variational_posterior.mean).sum()
var_S = (variational_posterior.variance - np.log(variational_posterior.variance)).sum()
return 0.5 * (var_mean + var_S) - 0.5 * variational_posterior.input_dim * variational_posterior.num_data
def update_gradients_KL(self, variational_posterior):
# dL:
variational_posterior.mean.gradient -= variational_posterior.mean
variational_posterior.variance.gradient -= (1. - (1. / (variational_posterior.variance))) * 0.5
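The two gradient lines above are the derivatives of the KL term with respect to the means and variances. For reference, the KL being computed is the closed form KL(N(mu, S) || N(0, I)) summed over all entries; a quick numerical sketch of the equivalent expression (toy arrays, not Param objects):
import numpy as np

mu = np.array([[0.3, -1.2], [0.8, 0.1]])     # means, shape (num_data, input_dim)
S = np.array([[0.5, 2.0], [1.3, 0.7]])       # per-entry variances
kl_as_in_code = 0.5 * (np.square(mu).sum() + (S - np.log(S)).sum()) - 0.5 * mu.size
kl_closed_form = 0.5 * np.sum(np.square(mu) + S - np.log(S) - 1.)
print(np.allclose(kl_as_in_code, kl_closed_form))   # True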
class SpikeAndSlabPrior(VariationalPrior):
def __init__(self, pi=None, learnPi=False, variance = 1.0, name='SpikeAndSlabPrior', **kw):
super(SpikeAndSlabPrior, self).__init__(name=name, **kw)
self.variance = Param('variance',variance)
self.learnPi = learnPi
if learnPi:
self.pi = Param('Pi', pi, Logistic(1e-10,1.-1e-10))
else:
self.pi = Param('Pi', pi, __fixed__)
self.link_parameter(self.pi)
def KL_divergence(self, variational_posterior):
mu = variational_posterior.mean
S = variational_posterior.variance
gamma,gamma1 = variational_posterior.gamma_probabilities()
log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
if len(self.pi.shape)==2:
idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
pi = self.pi[idx]
else:
pi = self.pi
var_mean = np.square(mu)/self.variance
var_S = (S/self.variance - np.log(S))
var_gamma = (gamma*(log_gamma-np.log(pi))).sum()+(gamma1*(log_gamma1-np.log(1-pi))).sum()
return var_gamma+ (gamma* (np.log(self.variance)-1. +var_mean + var_S)).sum()/2.
def update_gradients_KL(self, variational_posterior):
mu = variational_posterior.mean
S = variational_posterior.variance
gamma,gamma1 = variational_posterior.gamma_probabilities()
log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
if len(self.pi.shape)==2:
idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
pi = self.pi[idx]
else:
pi = self.pi
variational_posterior.binary_prob.gradient -= (np.log((1-pi)/pi)+log_gamma-log_gamma1+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.)*gamma*gamma1
mu.gradient -= gamma*mu/self.variance
S.gradient -= (1./self.variance - 1./S) * gamma /2.
if self.learnPi:
if len(self.pi)==1:
self.pi.gradient = (gamma/self.pi - (1.-gamma)/(1.-self.pi)).sum()
elif len(self.pi.shape)==1:
self.pi.gradient = (gamma/self.pi - (1.-gamma)/(1.-self.pi)).sum(axis=0)
else:
self.pi[idx].gradient = (gamma/self.pi[idx] - (1.-gamma)/(1.-self.pi[idx]))
class VariationalPosterior(Parameterized):
def __init__(self, means=None, variances=None, name='latent space', *a, **kw):
super(VariationalPosterior, self).__init__(name=name, *a, **kw)
self.mean = Param("mean", means)
self.variance = Param("variance", variances, Logexp())
self.ndim = self.mean.ndim
self.shape = self.mean.shape
self.num_data, self.input_dim = self.mean.shape
self.link_parameters(self.mean, self.variance)
self.num_data, self.input_dim = self.mean.shape
if self.has_uncertain_inputs():
assert self.variance.shape == self.mean.shape, "need one variance per sample and dimension"
def set_gradients(self, grad):
self.mean.gradient, self.variance.gradient = grad
def _raveled_index(self):
index = np.empty(dtype=int, shape=0)
size = 0
for p in self.parameters:
index = np.hstack((index, p._raveled_index()+size))
size += p._realsize_ if hasattr(p, '_realsize_') else p.size
return index
def has_uncertain_inputs(self):
return self.variance is not None
def __getitem__(self, s):
if isinstance(s, (int, slice, tuple, list, np.ndarray)):
import copy
n = self.__new__(self.__class__, self.name)
dc = self.__dict__.copy()
dc['mean'] = self.mean[s]
dc['variance'] = self.variance[s]
dc['parameters'] = copy.copy(self.parameters)
n.__dict__.update(dc)
n.parameters[dc['mean']._parent_index_] = dc['mean']
n.parameters[dc['variance']._parent_index_] = dc['variance']
n._gradient_array_ = None
oversize = self.size - self.mean.size - self.variance.size
n.size = n.mean.size + n.variance.size + oversize
n.ndim = n.mean.ndim
n.shape = n.mean.shape
n.num_data = n.mean.shape[0]
n.input_dim = n.mean.shape[1] if n.ndim != 1 else 1
return n
else:
return super(VariationalPosterior, self).__getitem__(s)
class NormalPosterior(VariationalPosterior):
'''
NormalPosterior distribution for variational approximations.
holds the means and variances for a factorizing multivariate normal distribution
'''
def plot(self, *args):
"""
Plot latent space X in 1D:
See GPy.plotting.matplot_dep.variational_plots
"""
import sys
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ...plotting.matplot_dep import variational_plots
import matplotlib
return variational_plots.plot(self,*args)
class SpikeAndSlabPosterior(VariationalPosterior):
'''
The SpikeAndSlab distribution for variational approximations.
'''
def __init__(self, means, variances, binary_prob, name='latent space'):
"""
binary_prob : the probability of the distribution on the slab part.
"""
super(SpikeAndSlabPosterior, self).__init__(means, variances, name)
self.gamma = Param("binary_prob",binary_prob)
self.link_parameter(self.gamma)
@Cache_this(limit=5)
def gamma_probabilities(self):
prob = np.zeros_like(param_to_array(self.gamma))
prob[self.gamma>-710] = 1./(1.+np.exp(-self.gamma[self.gamma>-710]))
prob1 = -np.zeros_like(param_to_array(self.gamma))
prob1[self.gamma<710] = 1./(1.+np.exp(self.gamma[self.gamma<710]))
return prob, prob1
@Cache_this(limit=5)
def gamma_log_prob(self):
loggamma = param_to_array(self.gamma).copy()
loggamma[loggamma>-40] = -np.log1p(np.exp(-loggamma[loggamma>-40]))
loggamma1 = -param_to_array(self.gamma).copy()
loggamma1[loggamma1>-40] = -np.log1p(np.exp(-loggamma1[loggamma1>-40]))
return loggamma,loggamma1
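Both cached methods above are guarded evaluations of the logistic function and its log: the thresholds (-710 and -40, chosen so that np.exp and np.log1p stay in range for float64) avoid overflow for extreme logits. A plain-numpy sketch of the same guard on toy logits:
import numpy as np

logit = np.array([-800., -5., 0., 5., 800.])
prob = np.zeros_like(logit)
mask = logit > -710
prob[mask] = 1. / (1. + np.exp(-logit[mask]))        # sigmoid, no overflow for very negative logits
log_prob = logit.copy()
mask = log_prob > -40
log_prob[mask] = -np.log1p(np.exp(-log_prob[mask]))  # log sigmoid; ~logit itself below the threshold
print(prob)      # approximately [0. 0.0067 0.5 0.9933 1.]
print(log_prob)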
def set_gradients(self, grad):
self.mean.gradient, self.variance.gradient, self.gamma.gradient = grad
def __getitem__(self, s):
if isinstance(s, (int, slice, tuple, list, np.ndarray)):
import copy
n = self.__new__(self.__class__, self.name)
dc = self.__dict__.copy()
dc['mean'] = self.mean[s]
dc['variance'] = self.variance[s]
dc['binary_prob'] = self.binary_prob[s]
dc['parameters'] = copy.copy(self.parameters)
n.__dict__.update(dc)
n.parameters[dc['mean']._parent_index_] = dc['mean']
n.parameters[dc['variance']._parent_index_] = dc['variance']
n.parameters[dc['binary_prob']._parent_index_] = dc['binary_prob']
n._gradient_array_ = None
oversize = self.size - self.mean.size - self.variance.size
n.size = n.mean.size + n.variance.size + oversize
n.ndim = n.mean.ndim
n.shape = n.mean.shape
n.num_data = n.mean.shape[0]
n.input_dim = n.mean.shape[1] if n.ndim != 1 else 1
return n
else:
return super(SpikeAndSlabPosterior, self).__getitem__(s)
def plot(self, *args, **kwargs):
"""
Plot latent space X in 1D:
See GPy.plotting.matplot_dep.variational_plots
"""
import sys
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ...plotting.matplot_dep import variational_plots
return variational_plots.plot_SpikeSlab(self,*args, **kwargs)

View file

@@ -1,465 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import re
import copy
import cPickle
import warnings
import transformations
class Parameterized(object):
def __init__(self):
"""
This is the base class for model and kernel. Mostly just handles tying and constraining of parameters
"""
self.tied_indices = []
self.fixed_indices = []
self.fixed_values = []
self.constrained_indices = []
self.constraints = []
def _get_params(self):
raise NotImplementedError, "this needs to be implemented to use the Parameterized class"
def _set_params(self, x):
raise NotImplementedError, "this needs to be implemented to use the Parameterized class"
def _get_param_names(self):
raise NotImplementedError, "this needs to be implemented to use the Parameterized class"
#def _get_print_names(self):
# """ Override for which names to print out, when using print m """
# return self._get_param_names()
def pickle(self, filename, protocol=-1):
with open(filename, 'wb') as f:
cPickle.dump(self, f, protocol=protocol)
def copy(self):
"""Returns a (deep) copy of the current model """
return copy.deepcopy(self)
def __getstate__(self):
if self._has_get_set_state():
return self.getstate()
return self.__dict__
def __setstate__(self, state):
if self._has_get_set_state():
self.setstate(state) # set state
self._set_params(self._get_params()) # restore all values
return
self.__dict__ = state
def _has_get_set_state(self):
return 'getstate' in vars(self.__class__) and 'setstate' in vars(self.__class__)
def getstate(self):
"""
Get the current state of the class,
here just all the indices, rest can get recomputed
For inheriting from Parameterized:
Always append the state of the inherited object
and call down to the inherited object in setstate!!
"""
return [self.tied_indices,
self.fixed_indices,
self.fixed_values,
self.constrained_indices,
self.constraints]
def setstate(self, state):
self.constraints = state.pop()
self.constrained_indices = state.pop()
self.fixed_values = state.pop()
self.fixed_indices = state.pop()
self.tied_indices = state.pop()
def __getitem__(self, regexp, return_names=False):
"""
Get a model parameter by name. The name is applied as a regular
expression and all parameters that match that regular expression are
returned.
"""
matches = self.grep_param_names(regexp)
if len(matches):
if return_names:
return self._get_params()[matches], np.asarray(self._get_param_names())[matches].tolist()
else:
return self._get_params()[matches]
else:
raise AttributeError, "no parameter matches %s" % regexp
def __setitem__(self, name, val):
"""
Set model parameter(s) by name. The name is provided as a regular
expression. All parameters matching that regular expression are set to
the given value.
"""
matches = self.grep_param_names(name)
if len(matches):
val = np.array(val)
assert (val.size == 1) or val.size == len(matches), "Shape mismatch: {}:({},)".format(val.size, len(matches))
x = self._get_params()
x[matches] = val
self._set_params(x)
else:
raise AttributeError, "no parameter matches %s" % name
def tie_params(self, regexp):
"""
Tie (all!) parameters matching the regular expression `regexp`.
"""
matches = self.grep_param_names(regexp)
assert matches.size > 0, "need at least something to tie together"
if len(self.tied_indices):
assert not np.any(matches[:, None] == np.hstack(self.tied_indices)), "Some indices are already tied!"
self.tied_indices.append(matches)
# TODO only one of the priors will be evaluated. Give a warning message if the priors are not identical
if hasattr(self, 'prior'):
pass
self._set_params_transformed(self._get_params_transformed()) # sets tied parameters to single value
def untie_everything(self):
"""Unties all parameters by setting tied_indices to an empty list."""
self.tied_indices = []
def grep_param_names(self, regexp, transformed=False, search=False):
"""
:param regexp: regular expression to select parameter names
:type regexp: re | str | int
:rtype: the indices of self._get_param_names which match the regular expression.
Note:-
Other objects are passed through - i.e. integers which weren't meant for grepping
"""
if transformed:
names = self._get_param_names_transformed()
else:
names = self._get_param_names()
if type(regexp) in [str, np.string_, np.str]:
regexp = re.compile(regexp)
elif type(regexp) is re._pattern_type:
pass
else:
return regexp
if search:
return np.nonzero([regexp.search(name) for name in names])[0]
else:
return np.nonzero([regexp.match(name) for name in names])[0]
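The regexp lookup above is what the old string-based indexing (e.g. m['rbf']) was built on: re.match anchors at the start of each parameter name, while search=True matches anywhere in it. A standalone sketch with hypothetical parameter names:
import re
import numpy as np

names = ['rbf_variance', 'rbf_lengthscale', 'noise_variance']
regexp = re.compile('variance')
print(np.nonzero([regexp.match(n) for n in names])[0])    # [] : no name *starts* with 'variance'
print(np.nonzero([regexp.search(n) for n in names])[0])   # [0 2]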
def num_params_transformed(self):
removed = 0
for tie in self.tied_indices:
removed += tie.size - 1
for fix in self.fixed_indices:
removed += fix.size
return len(self._get_params()) - removed
def unconstrain(self, regexp):
"""Unconstrain matching parameters. Does not untie parameters"""
matches = self.grep_param_names(regexp)
# transformed constraints:
for match in matches:
self.constrained_indices = [i[i <> match] for i in self.constrained_indices]
# remove empty constraints
tmp = zip(*[(i, t) for i, t in zip(self.constrained_indices, self.constraints) if len(i)])
if tmp:
self.constrained_indices, self.constraints = zip(*[(i, t) for i, t in zip(self.constrained_indices, self.constraints) if len(i)])
self.constrained_indices, self.constraints = list(self.constrained_indices), list(self.constraints)
# fixed:
self.fixed_values = [np.delete(values, np.nonzero(np.sum(indices[:, None] == matches[None, :], 1))[0]) for indices, values in zip(self.fixed_indices, self.fixed_values)]
self.fixed_indices = [np.delete(indices, np.nonzero(np.sum(indices[:, None] == matches[None, :], 1))[0]) for indices in self.fixed_indices]
# remove empty elements
tmp = [(i, v) for i, v in zip(self.fixed_indices, self.fixed_values) if len(i)]
if tmp:
self.fixed_indices, self.fixed_values = zip(*tmp)
self.fixed_indices, self.fixed_values = list(self.fixed_indices), list(self.fixed_values)
else:
self.fixed_indices, self.fixed_values = [], []
def constrain_negative(self, regexp, warning=True):
""" Set negative constraints. """
self.constrain(regexp, transformations.negative_logexp(), warning=warning)
def constrain_positive(self, regexp, warning=True):
""" Set positive constraints. """
self.constrain(regexp, transformations.logexp(), warning=warning)
def constrain_bounded(self, regexp, lower, upper, warning=True):
""" Set bounded constraints. """
self.constrain(regexp, transformations.logistic(lower, upper), warning=warning)
def all_constrained_indices(self):
if len(self.constrained_indices) or len(self.fixed_indices):
return np.hstack(self.constrained_indices + self.fixed_indices)
else:
return np.empty(shape=(0,))
def constrain(self, regexp, transform, warning=True):
assert isinstance(transform, transformations.transformation)
matches = self.grep_param_names(regexp)
if warning:
overlap = set(matches).intersection(set(self.all_constrained_indices()))
if overlap:
self.unconstrain(np.asarray(list(overlap)))
print 'Warning: re-constraining these parameters'
pn = self._get_param_names()
for i in overlap:
print pn[i]
self.constrained_indices.append(matches)
self.constraints.append(transform)
x = self._get_params()
x[matches] = transform.initialize(x[matches])
self._set_params(x)
def constrain_fixed(self, regexp, value=None, warning=True):
"""
:param regexp: which parameters need to be fixed.
:type regexp: ndarray(dtype=int) or regular expression object or string
:param value: the value to fix the parameters to. If the value is not specified,
the parameter is fixed to the current value
:type value: float
**Notes**
Fixing a parameter which is tied to another, or constrained in some way will result in an error.
To fix multiple parameters to the same value, simply pass a regular expression which matches both parameter names, or pass both of the indexes.
"""
matches = self.grep_param_names(regexp)
if warning:
overlap = set(matches).intersection(set(self.all_constrained_indices()))
if overlap:
self.unconstrain(np.asarray(list(overlap)))
print 'Warning: re-constraining these parameters'
pn = self._get_param_names()
for i in overlap:
print pn[i]
self.fixed_indices.append(matches)
if value != None:
self.fixed_values.append(value)
else:
self.fixed_values.append(self._get_params()[self.fixed_indices[-1]])
# self.fixed_values.append(value)
self._set_params_transformed(self._get_params_transformed())
def _get_params_transformed(self):
"""use self._get_params to get the 'true' parameters of the model, which are then tied, constrained and fixed"""
x = self._get_params()
[np.put(x, i, t.finv(x[i])) for i, t in zip(self.constrained_indices, self.constraints)]
to_remove = self.fixed_indices + [t[1:] for t in self.tied_indices]
if len(to_remove):
return np.delete(x, np.hstack(to_remove))
else:
return x
def _set_params_transformed(self, x):
""" takes the vector x, which is then modified (by untying, reparameterising or inserting fixed values), and then call self._set_params"""
self._set_params(self._untransform_params(x))
def _untransform_params(self, x):
"""
The transformation required for _set_params_transformed.
This moves the vector x seen by the optimiser (unconstrained) to the
valid parameter vector seen by the model
Note:
- This function is separate from _set_params_transformed for downstream flexibility
"""
# work out how many places are fixed, and where they are. tricky logic!
fix_places = self.fixed_indices + [t[1:] for t in self.tied_indices]
if len(fix_places):
fix_places = np.hstack(fix_places)
Nfix_places = fix_places.size
else:
Nfix_places = 0
free_places = np.setdiff1d(np.arange(Nfix_places + x.size, dtype=np.int), fix_places)
# put the models values in the vector xx
xx = np.zeros(Nfix_places + free_places.size, dtype=np.float64)
xx[free_places] = x
[np.put(xx, i, v) for i, v in zip(self.fixed_indices, self.fixed_values)]
[np.put(xx, i, v) for i, v in [(t[1:], xx[t[0]]) for t in self.tied_indices] ]
[np.put(xx, i, t.f(xx[i])) for i, t in zip(self.constrained_indices, self.constraints)]
if hasattr(self, 'debug'):
stop # @UndefinedVariable
return xx
def _get_param_names_transformed(self):
"""
Returns the parameter names as propagated after constraining,
tying or fixing, i.e. a list of the same length as _get_params_transformed()
"""
n = self._get_param_names()
# remove/concatenate the tied parameter names
if len(self.tied_indices):
for t in self.tied_indices:
n[t[0]] = "<tie>".join([n[tt] for tt in t])
remove = np.hstack([t[1:] for t in self.tied_indices])
else:
remove = np.empty(shape=(0,), dtype=np.int)
# also remove the fixed params
if len(self.fixed_indices):
remove = np.hstack((remove, np.hstack(self.fixed_indices)))
# add markers to show that some variables are constrained
for i, t in zip(self.constrained_indices, self.constraints):
for ii in i:
n[ii] = n[ii] + t.__str__()
n = [nn for i, nn in enumerate(n) if not i in remove]
return n
#@property
#def all(self):
# return self.__str__(self._get_param_names())
#def __str__(self, names=None, nw=30):
def __str__(self, nw=30):
"""
Return a string describing the parameter names and their ties and constraints
"""
names = self._get_param_names()
#if names is None:
# names = self._get_print_names()
#name_indices = self.grep_param_names("|".join(names))
N = len(names)
if not N:
return "This object has no free parameters."
header = ['Name', 'Value', 'Constraints', 'Ties']
values = self._get_params() # map(str,self._get_params())
#values = self._get_params()[name_indices] # map(str,self._get_params())
# sort out the constraints
constraints = [''] * len(names)
#constraints = [''] * len(self._get_param_names())
for i, t in zip(self.constrained_indices, self.constraints):
for ii in i:
constraints[ii] = t.__str__()
for i in self.fixed_indices:
for ii in i:
constraints[ii] = 'Fixed'
# sort out the ties
ties = [''] * len(names)
for i, tie in enumerate(self.tied_indices):
for j in tie:
ties[j] = '(' + str(i) + ')'
if values.size == 1:
values = ['%.4f' %float(values)]
else:
values = ['%.4f' % float(v) for v in values]
max_names = max([len(names[i]) for i in range(len(names))] + [len(header[0])])
max_values = max([len(values[i]) for i in range(len(values))] + [len(header[1])])
max_constraint = max([len(constraints[i]) for i in range(len(constraints))] + [len(header[2])])
max_ties = max([len(ties[i]) for i in range(len(ties))] + [len(header[3])])
cols = np.array([max_names, max_values, max_constraint, max_ties]) + 4
# columns = cols.sum()
header_string = ["{h:^{col}}".format(h=header[i], col=cols[i]) for i in range(len(cols))]
header_string = map(lambda x: '|'.join(x), [header_string])
separator = '-' * len(header_string[0])
param_string = ["{n:^{c0}}|{v:^{c1}}|{c:^{c2}}|{t:^{c3}}".format(n=names[i], v=values[i], c=constraints[i], t=ties[i], c0=cols[0], c1=cols[1], c2=cols[2], c3=cols[3]) for i in range(len(values))]
return ('\n'.join([header_string[0], separator] + param_string)) + '\n'
def grep_model(self,regexp):
regexp_indices = self.grep_param_names(regexp)
all_names = self._get_param_names()
names = [all_names[pj] for pj in regexp_indices]
N = len(names)
if not N:
return "Match not found."
header = ['Name', 'Value', 'Constraints', 'Ties']
all_values = self._get_params()
values = np.array([all_values[pj] for pj in regexp_indices])
constraints = [''] * len(names)
_constrained_indices,aux = self._pick_elements(regexp_indices,self.constrained_indices)
_constraints = [self.constraints[pj] for pj in aux]
for i, t in zip(_constrained_indices, _constraints):
for ii in i:
iii = regexp_indices.tolist().index(ii)
constraints[iii] = t.__str__()
_fixed_indices,aux = self._pick_elements(regexp_indices,self.fixed_indices)
for i in _fixed_indices:
for ii in i:
iii = regexp_indices.tolist().index(ii)
constraints[iii] = 'Fixed'
_tied_indices,aux = self._pick_elements(regexp_indices,self.tied_indices)
ties = [''] * len(names)
for i,ti in zip(_tied_indices,aux):
for ii in i:
iii = regexp_indices.tolist().index(ii)
ties[iii] = '(' + str(ti) + ')'
if values.size == 1:
values = ['%.4f' %float(values)]
else:
values = ['%.4f' % float(v) for v in values]
max_names = max([len(names[i]) for i in range(len(names))] + [len(header[0])])
max_values = max([len(values[i]) for i in range(len(values))] + [len(header[1])])
max_constraint = max([len(constraints[i]) for i in range(len(constraints))] + [len(header[2])])
max_ties = max([len(ties[i]) for i in range(len(ties))] + [len(header[3])])
cols = np.array([max_names, max_values, max_constraint, max_ties]) + 4
header_string = ["{h:^{col}}".format(h=header[i], col=cols[i]) for i in range(len(cols))]
header_string = map(lambda x: '|'.join(x), [header_string])
separator = '-' * len(header_string[0])
param_string = ["{n:^{c0}}|{v:^{c1}}|{c:^{c2}}|{t:^{c3}}".format(n=names[i], v=values[i], c=constraints[i], t=ties[i], c0=cols[0], c1=cols[1], c2=cols[2], c3=cols[3]) for i in range(len(values))]
print header_string[0]
print separator
for string in param_string:
print string
def _pick_elements(self,regexp_ind,array_list):
"""Removes from array_list the elements different from regexp_ind"""
new_array_list = [] #New list with elements matching regexp_ind
array_indices = [] #Indices that matches the arrays in new_array_list and array_list
array_index = 0
for array in array_list:
_new = []
for ai in array:
if ai in regexp_ind:
_new.append(ai)
if len(_new):
new_array_list.append(np.array(_new))
array_indices.append(array_index)
array_index += 1
return new_array_list, array_indices

View file

@@ -1,217 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from scipy.special import gammaln, digamma
from ..util.linalg import pdinv
from GPy.core.domains import REAL, POSITIVE
import warnings
class Prior:
domain = None
def pdf(self, x):
return np.exp(self.lnpdf(x))
def plot(self):
rvs = self.rvs(1000)
pb.hist(rvs, 100, normed=True)
xmin, xmax = pb.xlim()
xx = np.linspace(xmin, xmax, 1000)
pb.plot(xx, self.pdf(xx), 'r', linewidth=2)
class Gaussian(Prior):
"""
Implementation of the univariate Gaussian probability function, coupled with random variables.
:param mu: mean
:param sigma: standard deviation
.. Note:: Bishop 2006 notation is used throughout the code
"""
domain = REAL
def __init__(self, mu, sigma):
self.mu = float(mu)
self.sigma = float(sigma)
self.sigma2 = np.square(self.sigma)
self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)
def __str__(self):
return "N(" + str(np.round(self.mu)) + ', ' + str(np.round(self.sigma2)) + ')'
def lnpdf(self, x):
return self.constant - 0.5 * np.square(x - self.mu) / self.sigma2
def lnpdf_grad(self, x):
return -(x - self.mu) / self.sigma2
def rvs(self, n):
return np.random.randn(n) * self.sigma + self.mu
class LogGaussian(Prior):
"""
Implementation of the univariate *log*-Gaussian probability function, coupled with random variables.
:param mu: mean
:param sigma: standard deviation
.. Note:: Bishop 2006 notation is used throughout the code
"""
domain = POSITIVE
def __init__(self, mu, sigma):
self.mu = float(mu)
self.sigma = float(sigma)
self.sigma2 = np.square(self.sigma)
self.constant = -0.5 * np.log(2 * np.pi * self.sigma2)
def __str__(self):
return "lnN(" + str(np.round(self.mu)) + ', ' + str(np.round(self.sigma2)) + ')'
def lnpdf(self, x):
return self.constant - 0.5 * np.square(np.log(x) - self.mu) / self.sigma2 - np.log(x)
def lnpdf_grad(self, x):
return -((np.log(x) - self.mu) / self.sigma2 + 1.) / x
def rvs(self, n):
return np.exp(np.random.randn(n) * self.sigma + self.mu)
class MultivariateGaussian:
"""
Implementation of the multivariate Gaussian probability function, coupled with random variables.
:param mu: mean (N-dimensional array)
:param var: covariance matrix (NxN)
.. Note:: Bishop 2006 notation is used throughout the code
"""
domain = REAL
def __init__(self, mu, var):
self.mu = np.array(mu).flatten()
self.var = np.array(var)
assert len(self.var.shape) == 2
assert self.var.shape[0] == self.var.shape[1]
assert self.var.shape[0] == self.mu.size
self.input_dim = self.mu.size
self.inv, self.hld = pdinv(self.var)
self.constant = -0.5 * self.input_dim * np.log(2 * np.pi) - self.hld
def summary(self):
raise NotImplementedError
def pdf(self, x):
return np.exp(self.lnpdf(x))
def lnpdf(self, x):
d = x - self.mu
return self.constant - 0.5 * np.sum(d * np.dot(d, self.inv), 1)
def lnpdf_grad(self, x):
d = x - self.mu
return -np.dot(self.inv, d)
def rvs(self, n):
return np.random.multivariate_normal(self.mu, self.var, n)
def plot(self):
if self.input_dim == 2:
rvs = self.rvs(200)
pb.plot(rvs[:, 0], rvs[:, 1], 'kx', mew=1.5)
xmin, xmax = pb.xlim()
ymin, ymax = pb.ylim()
xx, yy = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
xflat = np.vstack((xx.flatten(), yy.flatten())).T
zz = self.pdf(xflat).reshape(100, 100)
pb.contour(xx, yy, zz, linewidths=2)
def gamma_from_EV(E, V):
warnings.warn("use Gamma.from_EV to create Gamma Prior", FutureWarning)
return Gamma.from_EV(E, V)
class Gamma(Prior):
"""
Implementation of the Gamma probability function, coupled with random variables.
:param a: shape parameter
:param b: rate parameter (warning: it's the *inverse* of the scale)
.. Note:: Bishop 2006 notation is used throughout the code
"""
domain = POSITIVE
def __init__(self, a, b):
self.a = float(a)
self.b = float(b)
self.constant = -gammaln(self.a) + a * np.log(b)
def __str__(self):
return "Ga(" + str(np.round(self.a)) + ', ' + str(np.round(self.b)) + ')'
def summary(self):
ret = {"E[x]": self.a / self.b, \
"E[ln x]": digamma(self.a) - np.log(self.b), \
"var[x]": self.a / self.b / self.b, \
"Entropy": gammaln(self.a) - (self.a - 1.) * digamma(self.a) - np.log(self.b) + self.a}
if self.a > 1:
ret['Mode'] = (self.a - 1.) / self.b
else:
ret['mode'] = np.nan
return ret
def lnpdf(self, x):
return self.constant + (self.a - 1) * np.log(x) - self.b * x
def lnpdf_grad(self, x):
return (self.a - 1.) / x - self.b
def rvs(self, n):
return np.random.gamma(scale=1. / self.b, shape=self.a, size=n)
@staticmethod
def from_EV(E, V):
"""
Creates an instance of a Gamma Prior by specifying the Expected value(s)
and Variance(s) of the distribution.
:param E: expected value
:param V: variance
"""
a = np.square(E) / V
b = E / V
return Gamma(a, b)
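The moment matching above sets a = E^2 / V and b = E / V, so that the resulting Gamma has mean a/b = E and variance a/b^2 = V. A tiny standalone check with assumed values E = 2 and V = 0.5:
import numpy as np

E, V = 2.0, 0.5
a, b = np.square(E) / V, E / V    # shape and rate, as in Gamma.from_EV
print(a / b)                      # 2.0 (the requested mean)
print(a / b ** 2)                 # 0.5 (the requested variance)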
class inverse_gamma(Prior):
"""
Implementation of the inverse-Gamma probability function, coupled with random variables.
:param a: shape parameter
:param b: rate parameter (warning: it's the *inverse* of the scale)
.. Note:: Bishop 2006 notation is used throughout the code
"""
domain = POSITIVE
def __init__(self, a, b):
self.a = float(a)
self.b = float(b)
self.constant = -gammaln(self.a) + a * np.log(b)
def __str__(self):
return "iGa(" + str(np.round(self.a)) + ', ' + str(np.round(self.b)) + ')'
def lnpdf(self, x):
return self.constant - (self.a + 1) * np.log(x) - self.b / x
def lnpdf_grad(self, x):
return -(self.a + 1.) / x + self.b / x ** 2
def rvs(self, n):
return 1. / np.random.gamma(scale=1. / self.b, shape=self.a, size=n)

View file

@@ -1,16 +1,28 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np import numpy as np
import pylab as pb from gp import GP
from ..util.linalg import mdot, jitchol, tdot, symmetrify, backsub_both_sides, chol_inv, dtrtrs, dpotrs, dpotri from parameterization.param import Param
from scipy import linalg from ..inference.latent_function_inference import var_dtc
from ..likelihoods import Gaussian, EP,EP_Mixed_Noise from .. import likelihoods
from gp_base import GPBase from parameterization.variational import VariationalPosterior
class SparseGP(GPBase): import logging
from GPy.inference.latent_function_inference.posterior import Posterior
from GPy.inference.optimization.stochastics import SparseGPStochastics,\
SparseGPMissing
#no stochastics.py file added! from GPy.inference.optimization.stochastics import SparseGPStochastics,\
#SparseGPMissing
logger = logging.getLogger("sparse gp")
class SparseGP(GP):
""" """
Variational sparse GP model A general purpose Sparse GP model
This model allows (approximate) inference using variational DTC or FITC
(Gaussian likelihoods) as well as non-conjugate sparse methods based on
these.
:param X: inputs :param X: inputs
:type X: np.ndarray (num_data x input_dim) :type X: np.ndarray (num_data x input_dim)
@ -20,478 +32,101 @@ class SparseGP(GPBase):
:type kernel: a GPy.kern.kern instance :type kernel: a GPy.kern.kern instance
:param X_variance: The uncertainty in the measurements of X (Gaussian variance) :param X_variance: The uncertainty in the measurements of X (Gaussian variance)
:type X_variance: np.ndarray (num_data x input_dim) | None :type X_variance: np.ndarray (num_data x input_dim) | None
:param Z: inducing inputs (optional, see note) :param Z: inducing inputs
:type Z: np.ndarray (num_inducing x input_dim) | None :type Z: np.ndarray (num_inducing x input_dim)
:param num_inducing: Number of inducing points (optional, default 10. Ignored if Z is not None) :param num_inducing: Number of inducing points (optional, default 10. Ignored if Z is not None)
:type num_inducing: int :type num_inducing: int
:param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales)
:type normalize_(X|Y): bool
""" """
def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False): def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X) name='sparse gp', Y_metadata=None, normalizer=False):
#pick a sensible inference method
if inference_method is None:
if isinstance(likelihood, likelihoods.Gaussian):
inference_method = var_dtc.VarDTC(limit=1 if not self.missing_data else Y.shape[1])
else:
#inference_method = ??
raise NotImplementedError, "what to do what to do?"
print "defaulting to ", inference_method, "for latent function inference"
self.Z = Z self.Z = Param('inducing inputs', Z)
self.num_inducing = Z.shape[0] self.num_inducing = Z.shape[0]
self.backsub = 0
GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
if X_variance is None:
self.has_uncertain_inputs = False logger.info("Adding Z as parameter")
self.X_variance = None self.link_parameter(self.Z, index=0)
self.posterior = None
def has_uncertain_inputs(self):
return isinstance(self.X, VariationalPosterior)
def parameters_changed(self):
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata)
self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
if isinstance(self.X, VariationalPosterior):
#gradients wrt kernel
dL_dKmm = self.grad_dict['dL_dKmm']
self.kern.update_gradients_full(dL_dKmm, self.Z, None)
kerngrad = self.kern.gradient.copy()
self.kern.update_gradients_expectations(variational_posterior=self.X,
Z=self.Z,
dL_dpsi0=self.grad_dict['dL_dpsi0'],
dL_dpsi1=self.grad_dict['dL_dpsi1'],
dL_dpsi2=self.grad_dict['dL_dpsi2'])
self.kern.gradient += kerngrad
#gradients wrt Z
self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z)
self.Z.gradient += self.kern.gradients_Z_expectations(
self.grad_dict['dL_dpsi0'],
self.grad_dict['dL_dpsi1'],
self.grad_dict['dL_dpsi2'],
Z=self.Z,
variational_posterior=self.X)
else: else:
assert X_variance.shape == X.shape #gradients wrt kernel
self.has_uncertain_inputs = True self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X)
self.X_variance = X_variance kerngrad = self.kern.gradient.copy()
self.kern.update_gradients_full(self.grad_dict['dL_dKnm'], self.X, self.Z)
if normalize_X: kerngrad += self.kern.gradient
self.Z = (self.Z.copy() - self._Xoffset) / self._Xscale self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z, None)
self.kern.gradient += kerngrad
# normalize X uncertainty also #gradients wrt Z
if self.has_uncertain_inputs: self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
self.X_variance /= np.square(self._Xscale) self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X)
self._const_jitter = None
def _compute_kernel_matrices(self):
# kernel computations, using BGPLVM notation
self.Kmm = self.kern.K(self.Z)
if self.has_uncertain_inputs:
self.psi0 = self.kern.psi0(self.Z, self.X, self.X_variance)
self.psi1 = self.kern.psi1(self.Z, self.X, self.X_variance)
self.psi2 = self.kern.psi2(self.Z, self.X, self.X_variance)
else:
self.psi0 = self.kern.Kdiag(self.X)
self.psi1 = self.kern.K(self.X, self.Z)
self.psi2 = None
def _computations(self):
if self._const_jitter is None or not(self._const_jitter.shape[0] == self.num_inducing):
self._const_jitter = np.eye(self.num_inducing) * 1e-7
# factor Kmm
self._Lm = jitchol(self.Kmm + self._const_jitter)
if not self.backsub:
self._LmInv = linalg.lapack.dtrtri(self._Lm, lower=1)[0] # TODO: not needed in old version
# The rather complex computations of self._A
if self.has_uncertain_inputs:
if self.likelihood.is_heteroscedastic:
psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.num_data, 1, 1))).sum(0)
else:
psi2_beta = self.psi2.sum(0) * self.likelihood.precision
if self.backsub:
evals, evecs = linalg.eigh(psi2_beta)
clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
if not np.array_equal(evals, clipped_evals):
pass # print evals
tmp = evecs * np.sqrt(clipped_evals)
tmp = tmp.T
tmp, _ = dtrtrs(self._Lm, np.asfortranarray(tmp.T), lower=1)
self._A = tdot(tmp)
else:
self._A = np.dot(np.dot(self._LmInv,
psi2_beta),
self._LmInv.T)
else:
if self.likelihood.is_heteroscedastic:
tmp = self.psi1 * (np.sqrt(self.likelihood.precision.flatten().reshape(self.num_data, 1)))
else:
tmp = self.psi1 * (np.sqrt(self.likelihood.precision))
tmp, _ = dtrtrs(self._Lm, np.asfortranarray(tmp.T), lower=1)
self._A = tdot(tmp)
# factor B
self.B = np.eye(self.num_inducing) + self._A
self.LB = jitchol(self.B)
# VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
self.psi1Vf = np.dot(self.psi1.T, self.likelihood.VVT_factor)
if 1:#self.backsub:
# back substutue C into psi1Vf
tmp, info1 = dtrtrs(self._Lm, np.asfortranarray(self.psi1Vf), lower=1, trans=0)
self._LBi_Lmi_psi1Vf, _ = dtrtrs(self.LB, np.asfortranarray(tmp), lower=1, trans=0)
# tmp, info2 = dpotrs(self.LB, tmp, lower=1)
tmp, info2 = dtrtrs(self.LB, self._LBi_Lmi_psi1Vf, lower=1, trans=1)
self.Cpsi1Vf, info3 = dtrtrs(self._Lm, tmp, lower=1, trans=1)
else:
# slower, but more stable (?) version:
tmp = np.dot(self._LmInv, self.psi1Vf)
self._LBInv = linalg.lapack.dtrtri(self.LB, lower=True)[0]
self._LBi_Lmi_psi1Vf = np.dot(self._LBInv, tmp)
tmp = np.dot(self._LBInv.T, self._LBi_Lmi_psi1Vf)
self.Cpsi1Vf = np.dot(self._LmInv.T, tmp)
#import ipdb;ipdb.set_trace()
# Compute dL_dKmm
tmp = tdot(self._LBi_Lmi_psi1Vf)
self.data_fit = np.trace(tmp)
self.DBi_plus_BiPBi = backsub_both_sides(self.LB, self.output_dim * np.eye(self.num_inducing) + tmp)
tmp = -0.5 * self.DBi_plus_BiPBi
tmp += -0.5 * self.B * self.output_dim
tmp += self.output_dim * np.eye(self.num_inducing)
self.dL_dKmm = backsub_both_sides(self._Lm, tmp)
# Compute dL_dpsi # FIXME: this is untested for the heteroscedastic + uncertain inputs case
self.dL_dpsi0 = -0.5 * self.output_dim * (self.likelihood.precision * np.ones([self.num_data, 1])).flatten()
self.dL_dpsi1 = np.dot(self.likelihood.VVT_factor, self.Cpsi1Vf.T)
dL_dpsi2_beta = 0.5 * backsub_both_sides(self._Lm, self.output_dim * np.eye(self.num_inducing) - self.DBi_plus_BiPBi)
if self.likelihood.is_heteroscedastic:
if self.has_uncertain_inputs:
self.dL_dpsi2 = self.likelihood.precision.flatten()[:, None, None] * dL_dpsi2_beta[None, :, :]
else:
self.dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (self.psi1 * self.likelihood.precision.reshape(self.num_data, 1)).T).T
self.dL_dpsi2 = None
else:
dL_dpsi2 = self.likelihood.precision * dL_dpsi2_beta
if self.has_uncertain_inputs:
# repeat for each of the N psi_2 matrices
self.dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], self.num_data, axis=0)
else:
# subsume back into psi1 (==Kmn)
self.dL_dpsi1 += 2.*np.dot(self.psi1, dL_dpsi2)
self.dL_dpsi2 = None
# the partial derivative vector for the likelihood def _raw_predict(self, Xnew, full_cov=False, kern=None):
if self.likelihood.num_params == 0:
# save computation here.
self.partial_for_likelihood = None
elif self.likelihood.is_heteroscedastic:
if self.has_uncertain_inputs:
raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented"
else:
LBi = chol_inv(self.LB)
Lmi_psi1, nil = dtrtrs(self._Lm, np.asfortranarray(self.psi1.T), lower=1, trans=0)
_LBi_Lmi_psi1, _ = dtrtrs(self.LB, np.asfortranarray(Lmi_psi1), lower=1, trans=0)
self.partial_for_likelihood = -0.5 * self.likelihood.precision + 0.5 * self.likelihood.V**2
self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0 - np.sum(Lmi_psi1**2,0))[:,None] * self.likelihood.precision**2
self.partial_for_likelihood += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*self.likelihood.precision**2
self.partial_for_likelihood += -np.dot(self._LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * self.likelihood.Y * self.likelihood.precision**2
self.partial_for_likelihood += 0.5*np.dot(self._LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * self.likelihood.precision**2
else:
# likelihood is not heteroscedastic
self.partial_for_likelihood = -0.5 * self.num_data * self.output_dim * self.likelihood.precision + 0.5 * self.likelihood.trYYT * self.likelihood.precision ** 2
self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0.sum() * self.likelihood.precision ** 2 - np.trace(self._A) * self.likelihood.precision)
self.partial_for_likelihood += self.likelihood.precision * (0.5 * np.sum(self._A * self.DBi_plus_BiPBi) - self.data_fit)
def log_likelihood(self):
""" Compute the (lower bound on the) log marginal likelihood """
if self.likelihood.is_heteroscedastic:
A = -0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.likelihood.V * self.likelihood.Y)
B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self._A))
else:
A = -0.5 * self.num_data * self.output_dim * (np.log(2.*np.pi) - np.log(self.likelihood.precision)) - 0.5 * self.likelihood.precision * self.likelihood.trYYT
B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self._A))
C = -self.output_dim * (np.sum(np.log(np.diag(self.LB)))) # + 0.5 * self.num_inducing * np.log(sf2))
D = 0.5 * self.data_fit
self._A_part, self._B_part, self._C_part, self._D_part = A, B, C, D
return A + B + C + D + self.likelihood.Z
def _set_params(self, p):
self.Z = p[:self.num_inducing * self.input_dim].reshape(self.num_inducing, self.input_dim)
self.kern._set_params(p[self.Z.size:self.Z.size + self.kern.num_params])
self.likelihood._set_params(p[self.Z.size + self.kern.num_params:])
self._compute_kernel_matrices()
self._computations()
self.Cpsi1V = None
def _get_params(self):
return np.hstack([self.Z.flatten(), self.kern._get_params_transformed(), self.likelihood._get_params()])
def _get_param_names(self):
return sum([['iip_%i_%i' % (i, j) for j in range(self.Z.shape[1])] for i in range(self.Z.shape[0])], [])\
+ self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
#def _get_print_names(self):
# return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
def update_likelihood_approximation(self, **kwargs):
""" """
Approximates a non-gaussian likelihood using Expectation Propagation Make a prediction for the latent function values
For a Gaussian likelihood, no iteration is required:
this function does nothing
"""
if not isinstance(self.likelihood, Gaussian): # Updates not needed for Gaussian likelihood
self.likelihood.restart()
if self.has_uncertain_inputs:
Lmi = chol_inv(self._Lm)
Kmmi = tdot(Lmi.T)
diag_tr_psi2Kmmi = np.array([np.trace(psi2_Kmmi) for psi2_Kmmi in np.dot(self.psi2, Kmmi)])
self.likelihood.fit_FITC(self.Kmm, self.psi1.T, diag_tr_psi2Kmmi, **kwargs) # This uses the fit_FITC code, but does not perform a FITC-EP. #TODO solve potential confusion
# raise NotImplementedError, "EP approximation not implemented for uncertain inputs"
else:
self.likelihood.fit_DTC(self.Kmm, self.psi1.T, **kwargs)
# self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0)
self._set_params(self._get_params()) # update the GP
def _log_likelihood_gradients(self):
return np.hstack((self.dL_dZ().flatten(), self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))
def dL_dtheta(self):
"""
Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel
"""
dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm, self.Z)
if self.has_uncertain_inputs:
dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X, self.X_variance)
dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1, self.Z, self.X, self.X_variance)
dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X, self.X_variance)
else:
dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1, self.X, self.Z)
dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X)
return dL_dtheta
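In other words, dL_dtheta is the chain rule applied to the quantities the bound depends on; schematically
$\frac{\partial \mathcal{L}}{\partial \theta} \;=\; \mathrm{tr}\!\left(\frac{\partial \mathcal{L}}{\partial K_{mm}}^{\top}\frac{\partial K_{mm}}{\partial \theta}\right) \;+\; \sum_{i\in\{0,1,2\}} \mathrm{tr}\!\left(\frac{\partial \mathcal{L}}{\partial \psi_i}^{\top}\frac{\partial \psi_i}{\partial \theta}\right),$
with the $\psi_i$ terms collapsing to Kdiag and Knm contributions when the inputs are certain, exactly as in the two branches above.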
def dL_dZ(self):
"""
The derivative of the bound wrt the inducing inputs Z
"""
dL_dZ = self.kern.dK_dX(self.dL_dKmm, self.Z)
if self.has_uncertain_inputs:
dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1, self.Z, self.X, self.X_variance)
dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2, self.Z, self.X, self.X_variance)
else:
dL_dZ += self.kern.dK_dX(self.dL_dpsi1.T, self.Z, self.X)
return dL_dZ
def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
"""
Internal helper function for making predictions, does not account for
normalization or likelihood function
""" """
Bi, _ = dpotri(self.LB, lower=0) # WTH? this lower switch should be 1, but that doesn't work! if kern is None: kern = self.kern
symmetrify(Bi)
Kmmi_LmiBLmi = backsub_both_sides(self._Lm, np.eye(self.num_inducing) - Bi)
if self.Cpsi1V is None: if not isinstance(Xnew, VariationalPosterior):
psi1V = np.dot(self.psi1.T, self.likelihood.V) Kx = kern.K(self.Z, Xnew)
tmp, _ = dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0) mu = np.dot(Kx.T, self.posterior.woodbury_vector)
tmp, _ = dpotrs(self.LB, tmp, lower=1)
self.Cpsi1V, _ = dtrtrs(self._Lm, tmp, lower=1, trans=1)
if X_variance_new is None:
Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts)
mu = np.dot(Kx.T, self.Cpsi1V)
if full_cov: if full_cov:
Kxx = self.kern.K(Xnew, which_parts=which_parts) Kxx = kern.K(Xnew)
var = Kxx - mdot(Kx.T, Kmmi_LmiBLmi, Kx) # NOTE this won't work for plotting if self.posterior.woodbury_inv.ndim == 2:
var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
elif self.posterior.woodbury_inv.ndim == 3:
var = Kxx[:,:,None] - np.tensordot(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx).T, Kx, [1,0]).swapaxes(1,2)
var = var
else: else:
Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts) Kxx = kern.Kdiag(Xnew)
var = Kxx - np.sum(Kx * np.dot(Kmmi_LmiBLmi, Kx), 0) var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
else: else:
# assert which_parts=='all', "switching out parts of variational kernels is not implemented" Kx = kern.psi1(self.Z, Xnew)
Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts mu = np.dot(Kx, self.posterior.woodbury_vector)
mu = np.dot(Kx, self.Cpsi1V)
if full_cov: if full_cov:
raise NotImplementedError, "TODO" raise NotImplementedError, "TODO"
else: else:
Kxx = self.kern.psi0(self.Z, Xnew, X_variance_new) Kxx = kern.psi0(self.Z, Xnew)
psi2 = self.kern.psi2(self.Z, Xnew, X_variance_new) psi2 = kern.psi2(self.Z, Xnew)
var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1) var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
return mu, var
return mu, var[:, None]
def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False, **likelihood_args):
"""
Predict the function(s) at the new point(s) Xnew.
**Arguments**
:param Xnew: The points at which to make a prediction
:type Xnew: np.ndarray, Nnew x self.input_dim
:param X_variance_new: The uncertainty in the prediction points
:type X_variance_new: np.ndarray, Nnew x self.input_dim
:param which_parts: specifies which outputs kernel(s) to use in prediction
:type which_parts: ('all', list of bools)
:param full_cov: whether to return the full covariance matrix, or just the diagonal
:type full_cov: bool
:rtype: posterior mean, a Numpy array, Nnew x self.input_dim
:rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
:rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim
If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew.
This is to allow for different normalizations of the output dimensions.
"""
# normalize X values
Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
if X_variance_new is not None:
X_variance_new = X_variance_new / self._Xscale ** 2
# here's the actual prediction by the GP model
mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts)
# now push through likelihood
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, **likelihood_args)
return mean, var, _025pm, _975pm
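A minimal usage sketch of the prediction interface documented above, written against this (pre-refactor) API; the SparseGPRegression constructor arguments are an assumption for illustration:
import numpy as np
import GPy
X = np.random.rand(100, 1)
Y = np.sin(6 * X) + 0.1 * np.random.randn(100, 1)
m = GPy.models.SparseGPRegression(X, Y, num_inducing=10)
m.optimize()
Xnew = np.linspace(0, 1, 200)[:, None]
mean, var, lower_95, upper_95 = m.predict(Xnew)  # four return values, as documented above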
def plot_f(self, samples=0, plot_limits=None, which_data_rows='all',
which_data_ycols='all', which_parts='all', resolution=None,
full_cov=False, fignum=None, ax=None):
"""
Plot the GP's view of the world, where the data is normalized and before the likelihood is applied.
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
- In two dimensions, a contour-plot shows the mean predicted function
- Not implemented in higher dimensions
:param samples: the number of a posteriori samples to plot
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
:param which_data_rows: which of the training data to plot (default all)
:type which_data_rows: 'all' or a slice object to slice self.X, self.Y
:param which_parts: which of the kernel functions to plot (additively)
:type which_parts: 'all', or list of bools
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
:type resolution: int
:param full_cov:
:type full_cov: bool
:param fignum: figure to plot on.
:type fignum: figure number
:param ax: axes to plot on.
:type ax: axes handle
:param output: which output to plot (for multiple output models only)
:type output: integer (first output is 0)
"""
if ax is None:
fig = pb.figure(num=fignum)
ax = fig.add_subplot(111)
if fignum is None and ax is None:
fignum = fig.num
if which_data_rows == 'all':
which_data_rows = slice(None)
GPBase.plot_f(self, samples=samples, plot_limits=plot_limits, which_data_rows=which_data_rows, which_data_ycols=which_data_ycols, which_parts=which_parts, resolution=resolution, fignum=fignum, ax=ax)
if self.X.shape[1] == 1:
if self.has_uncertain_inputs:
Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
ax.errorbar(Xu[which_data_rows, 0], self.likelihood.data[which_data_rows, 0],
xerr=2 * np.sqrt(self.X_variance[which_data_rows, 0]),
ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
Zu = self.Z * self._Xscale + self._Xoffset
ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
elif self.X.shape[1] == 2:
Zu = self.Z * self._Xscale + self._Xoffset
ax.plot(Zu[:, 0], Zu[:, 1], 'wo')
else:
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
def plot(self, plot_limits=None, which_data_rows='all',
which_data_ycols='all', which_parts='all', fixed_inputs=[],
plot_raw=False,
levels=20, samples=0, fignum=None, ax=None, resolution=None):
"""
Plot the posterior of the sparse GP.
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
- In two dimensions, a contour-plot shows the mean predicted function
- In higher dimensions, use fixed_inputs to plot the GP with some of the inputs fixed.
Can plot only part of the data and part of the posterior functions
using which_data_rows, which_data_ycols and which_parts
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
:type plot_limits: np.array
:param which_data_rows: which of the training data to plot (default all)
:type which_data_rows: 'all' or a slice object to slice self.X, self.Y
:param which_data_ycols: when the data has several columns (independent outputs), only plot these
:type which_data_ycols: 'all' or a list of integers
:param which_parts: which of the kernel functions to plot (additively)
:type which_parts: 'all', or list of bools
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
:type fixed_inputs: a list of tuples
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
:type resolution: int
:param levels: number of levels to plot in a contour plot.
:type levels: int
:param samples: the number of a posteriori samples to plot
:type samples: int
:param fignum: figure to plot on.
:type fignum: figure number
:param ax: axes to plot on.
:type ax: axes handle
:type output: integer (first output is 0)
:param linecol: color of line to plot.
:type linecol:
:param fillcol: color of fill
:param levels: for 2D plotting, the number of contour levels to use. If ax is None, a new figure is created.
"""
# work out which ax to plot on
#Need these because we use which_data_rows in this function not just base
if which_data_rows == 'all':
which_data_rows = slice(None)
if which_data_ycols == 'all':
which_data_ycols = np.arange(self.output_dim)
if ax is None:
fig = pb.figure(num=fignum)
ax = fig.add_subplot(111)
#work out what the inputs are for plotting (1D or 2D)
fixed_dims = np.array([i for i,v in fixed_inputs])
free_dims = np.setdiff1d(np.arange(self.input_dim),fixed_dims)
#call the base plotting
GPBase.plot(self, samples=samples, plot_limits=plot_limits,
which_data_rows=which_data_rows,
which_data_ycols=which_data_ycols, fixed_inputs=fixed_inputs,
which_parts=which_parts, resolution=resolution, levels=20,
fignum=fignum, ax=ax)
if len(free_dims) == 1:
#plot errorbars for the uncertain inputs
if self.has_uncertain_inputs:
Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
ax.errorbar(Xu[which_data_rows, 0], self.likelihood.data[which_data_rows, 0],
xerr=2 * np.sqrt(self.X_variance[which_data_rows, 0]),
ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
#plot the inducing inputs
Zu = self.Z * self._Xscale + self._Xoffset
ax.plot(Zu, np.zeros_like(Zu) + ax.get_ylim()[0], 'r|', mew=1.5, markersize=12)
elif len(free_dims) == 2:
Zu = self.Z * self._Xscale + self._Xoffset
ax.plot(Zu[:, 0], Zu[:, 1], 'wo')
else:
raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
def getstate(self):
"""
Get the current state of the class,
here just all the indices, rest can get recomputed
"""
return GPBase.getstate(self) + [self.Z,
self.num_inducing,
self.has_uncertain_inputs,
self.X_variance]
def setstate(self, state):
self.X_variance = state.pop()
self.has_uncertain_inputs = state.pop()
self.num_inducing = state.pop()
self.Z = state.pop()
GPBase.setstate(self, state)

120
GPy/core/sparse_gp_mpi.py Normal file
View file

@ -0,0 +1,120 @@
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from sparse_gp import SparseGP
from numpy.linalg.linalg import LinAlgError
from ..inference.latent_function_inference.var_dtc_parallel import update_gradients, VarDTC_minibatch
import logging
logger = logging.getLogger("sparse gp mpi")
class SparseGP_MPI(SparseGP):
"""
A general purpose Sparse GP model with MPI parallelization support
This model allows (approximate) inference using variational DTC or FITC
(Gaussian likelihoods) as well as non-conjugate sparse methods based on
these.
:param X: inputs
:type X: np.ndarray (num_data x input_dim)
:param likelihood: a likelihood instance, containing the observed data
:type likelihood: GPy.likelihood.(Gaussian | EP | Laplace)
:param kernel: the kernel (covariance function). See link kernels
:type kernel: a GPy.kern.kern instance
:param X_variance: The uncertainty in the measurements of X (Gaussian variance)
:type X_variance: np.ndarray (num_data x input_dim) | None
:param Z: inducing inputs
:type Z: np.ndarray (num_inducing x input_dim)
:param num_inducing: Number of inducing points (optional, default 10. Ignored if Z is not None)
:type num_inducing: int
:param mpi_comm: The communication group of MPI, e.g. mpi4py.MPI.COMM_WORLD
:type mpi_comm: mpi4py.MPI.Intracomm
"""
def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp mpi', Y_metadata=None, mpi_comm=None, normalizer=False):
self._IN_OPTIMIZATION_ = False
if mpi_comm != None:
if inference_method is None:
inference_method = VarDTC_minibatch(mpi_comm=mpi_comm)
else:
assert isinstance(inference_method, VarDTC_minibatch), 'inference_method has to support MPI!'
super(SparseGP_MPI, self).__init__(X, Y, Z, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
self.update_model(False)
if variational_prior is not None:
self.link_parameter(variational_prior)
self.mpi_comm = mpi_comm
# Manage the data (Y) division
if mpi_comm != None:
from ..util.parallel import divide_data
N_start, N_end, N_list = divide_data(Y.shape[0], mpi_comm.rank, mpi_comm.size)
self.N_range = (N_start, N_end)
self.N_list = np.array(N_list)
self.Y_local = self.Y[N_start:N_end]
print 'MPI RANK '+str(self.mpi_comm.rank)+' with the data range '+str(self.N_range)
mpi_comm.Bcast(self.param_array, root=0)
self.update_model(True)
def __getstate__(self):
dc = super(SparseGP_MPI, self).__getstate__()
dc['mpi_comm'] = None
if self.mpi_comm != None:
del dc['N_range']
del dc['N_list']
del dc['Y_local']
if 'normalizer' not in dc:
dc['normalizer'] = None
dc['Y_normalized'] = dc['Y']
return dc
#=====================================================
# The MPI parallelization
# - can move to model at some point
#=====================================================
@SparseGP.optimizer_array.setter
def optimizer_array(self, p):
if self.mpi_comm != None:
if self._IN_OPTIMIZATION_ and self.mpi_comm.rank==0:
self.mpi_comm.Bcast(np.int32(1),root=0)
self.mpi_comm.Bcast(p, root=0)
SparseGP.optimizer_array.fset(self,p)
def optimize(self, optimizer=None, start=None, **kwargs):
self._IN_OPTIMIZATION_ = True
if self.mpi_comm==None:
super(SparseGP_MPI, self).optimize(optimizer,start,**kwargs)
elif self.mpi_comm.rank==0:
super(SparseGP_MPI, self).optimize(optimizer,start,**kwargs)
self.mpi_comm.Bcast(np.int32(-1),root=0)
elif self.mpi_comm.rank>0:
x = self.optimizer_array.copy()
flag = np.empty(1,dtype=np.int32)
while True:
self.mpi_comm.Bcast(flag,root=0)
if flag==1:
try:
self.optimizer_array = x
self._fail_count = 0
except (LinAlgError, ZeroDivisionError, ValueError):
if self._fail_count >= self._allowed_failures:
raise
self._fail_count += 1
elif flag==-1:
break
else:
self._IN_OPTIMIZATION_ = False
raise Exception("Unrecognizable flag for synchronization!")
self._IN_OPTIMIZATION_ = False
def parameters_changed(self):
if isinstance(self.inference_method,VarDTC_minibatch):
update_gradients(self, mpi_comm=self.mpi_comm)
else:
super(SparseGP_MPI,self).parameters_changed()
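A minimal construction sketch for the SparseGP_MPI class defined above, intended to be launched under MPI (e.g. mpirun -np 4 python script.py); the import path follows the file location above, and the RBF kernel and Gaussian likelihood helpers are the ones used elsewhere in this code base:
import numpy as np
from mpi4py import MPI
import GPy
from GPy.core.sparse_gp_mpi import SparseGP_MPI
X = np.random.rand(1000, 2)
Y = np.sin(X.sum(axis=1))[:, None]
Z = X[::100].copy()  # 10 inducing inputs taken from the data
m = SparseGP_MPI(X, Y, Z, GPy.kern.RBF(2), GPy.likelihoods.Gaussian(),
                 mpi_comm=MPI.COMM_WORLD)
m.optimize()  # rank 0 drives the optimizer, the other ranks follow via Bcast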

View file

@ -1,512 +0,0 @@
# Copyright (c) 2012, James Hensman and Nicolo' Fusi
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from .. import kern
from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs, jitchol, backsub_both_sides
from ..likelihoods import EP
from gp_base import GPBase
from model import Model
import time
import sys
class SVIGP(GPBase):
"""
Stochastic Variational inference in a Gaussian Process
:param X: inputs
:type X: np.ndarray (num_data x num_inputs)
:param Y: observed data
:type Y: np.ndarray of observations (num_data x output_dim)
:param batchsize: the size of a minibatch
:param q_u: canonical parameters of the distribution squashed into a 1D array
:type q_u: np.ndarray
:param kernel: the kernel/covariance function. See link kernels
:type kernel: a GPy kernel
:param Z: inducing inputs
:type Z: np.ndarray (num_inducing x num_inputs)
"""
def __init__(self, X, likelihood, kernel, Z, q_u=None, batchsize=10, X_variance=None):
GPBase.__init__(self, X, likelihood, kernel, normalize_X=False)
self.batchsize=batchsize
self.Y = self.likelihood.Y.copy()
self.Z = Z
self.num_inducing = Z.shape[0]
self.batchcounter = 0
self.epochs = 0
self.iterations = 0
self.vb_steplength = 0.05
self.param_steplength = 1e-5
self.momentum = 0.9
if X_variance is None:
self.has_uncertain_inputs = False
else:
self.has_uncertain_inputs = True
self.X_variance = X_variance
if q_u is None:
q_u = np.hstack((np.random.randn(self.num_inducing*self.output_dim),-.5*np.eye(self.num_inducing).flatten()))
self.set_vb_param(q_u)
self._permutation = np.random.permutation(self.num_data)
self.load_batch()
self._param_trace = []
self._ll_trace = []
self._grad_trace = []
#set the adaptive steplength parameters
self.hbar_t = 0.0
self.tau_t = 100.0
self.gbar_t = 0.0
self.gbar_t1 = 0.0
self.gbar_t2 = 0.0
self.hbar_tp = 0.0
self.tau_tp = 10000.0
self.gbar_tp = 0.0
self.adapt_param_steplength = True
self.adapt_vb_steplength = True
self._param_steplength_trace = []
self._vb_steplength_trace = []
self.ensure_default_constraints()
def getstate(self):
steplength_params = [self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength]
return GPBase.getstate(self) + \
[self.get_vb_param(),
self.Z,
self.num_inducing,
self.has_uncertain_inputs,
self.X_variance,
self.X_batch,
self.X_variance_batch,
steplength_params,
self.batchcounter,
self.batchsize,
self.epochs,
self.momentum,
self.data_prop,
self._param_trace,
self._param_steplength_trace,
self._vb_steplength_trace,
self._ll_trace,
self._grad_trace,
self.Y,
self._permutation,
self.iterations
]
def setstate(self, state):
self.iterations = state.pop()
self._permutation = state.pop()
self.Y = state.pop()
self._grad_trace = state.pop()
self._ll_trace = state.pop()
self._vb_steplength_trace = state.pop()
self._param_steplength_trace = state.pop()
self._param_trace = state.pop()
self.data_prop = state.pop()
self.momentum = state.pop()
self.epochs = state.pop()
self.batchsize = state.pop()
self.batchcounter = state.pop()
steplength_params = state.pop()
(self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength) = steplength_params
self.X_variance_batch = state.pop()
self.X_batch = state.pop()
self.X_variance = state.pop()
self.has_uncertain_inputs = state.pop()
self.num_inducing = state.pop()
self.Z = state.pop()
vb_param = state.pop()
GPBase.setstate(self, state)
self.set_vb_param(vb_param)
def _compute_kernel_matrices(self):
# kernel computations, using BGPLVM notation
self.Kmm = self.kern.K(self.Z)
if self.has_uncertain_inputs:
self.psi0 = self.kern.psi0(self.Z, self.X_batch, self.X_variance_batch)
self.psi1 = self.kern.psi1(self.Z, self.X_batch, self.X_variance_batch)
self.psi2 = self.kern.psi2(self.Z, self.X_batch, self.X_variance_batch)
else:
self.psi0 = self.kern.Kdiag(self.X_batch)
self.psi1 = self.kern.K(self.X_batch, self.Z)
self.psi2 = None
def dL_dtheta(self):
dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm, self.Z)
if self.has_uncertain_inputs:
dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X_batch, self.X_variance_batch)
dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1, self.Z, self.X_batch, self.X_variance_batch)
dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X_batch, self.X_variance_batch)
else:
dL_dtheta += self.kern.dK_dtheta(self.dL_dpsi1, self.X_batch, self.Z)
dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X_batch)
return dL_dtheta
def _set_params(self, p, computations=True):
self.kern._set_params_transformed(p[:self.kern.num_params])
self.likelihood._set_params(p[self.kern.num_params:])
if computations:
self._compute_kernel_matrices()
self._computations()
def _get_params(self):
return np.hstack((self.kern._get_params_transformed() , self.likelihood._get_params()))
def _get_param_names(self):
return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
def load_batch(self):
"""
load a batch of data (set self.X_batch and self.likelihood.Y from self.X, self.Y)
"""
#if we've seen all the data, start again with them in a new random order
if self.batchcounter+self.batchsize > self.num_data:
self.batchcounter = 0
self.epochs += 1
self._permutation = np.random.permutation(self.num_data)
this_perm = self._permutation[self.batchcounter:self.batchcounter+self.batchsize]
self.X_batch = self.X[this_perm]
self.likelihood.set_data(self.Y[this_perm])
if self.has_uncertain_inputs:
self.X_variance_batch = self.X_variance[this_perm]
self.batchcounter += self.batchsize
self.data_prop = float(self.batchsize)/self.num_data
self._compute_kernel_matrices()
self._computations()
def _computations(self,do_Kmm=True, do_Kmm_grad=True):
"""
All of the computations needed. Some are optional, see kwargs.
"""
if do_Kmm:
self.Lm = jitchol(self.Kmm)
# The rather complex computations of self.A
if self.has_uncertain_inputs:
if self.likelihood.is_heteroscedastic:
psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.batchsize, 1, 1))).sum(0)
else:
psi2_beta = self.psi2.sum(0) * self.likelihood.precision
evals, evecs = np.linalg.eigh(psi2_beta)
clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
tmp = evecs * np.sqrt(clipped_evals)
else:
if self.likelihood.is_heteroscedastic:
tmp = self.psi1.T * (np.sqrt(self.likelihood.precision.flatten().reshape(1, self.batchsize)))
else:
tmp = self.psi1.T * (np.sqrt(self.likelihood.precision))
tmp, _ = dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1)
self.A = tdot(tmp)
self.V = self.likelihood.precision*self.likelihood.Y
self.VmT = np.dot(self.V,self.q_u_expectation[0].T)
self.psi1V = np.dot(self.psi1.T, self.V)
self.B = np.eye(self.num_inducing)*self.data_prop + self.A
self.Lambda = backsub_both_sides(self.Lm, self.B.T)
self.LQL = backsub_both_sides(self.Lm,self.q_u_expectation[1].T,transpose='right')
self.trace_K = self.psi0.sum() - np.trace(self.A)/self.likelihood.precision
self.Kmmi_m, _ = dpotrs(self.Lm, self.q_u_expectation[0], lower=1)
self.projected_mean = np.dot(self.psi1,self.Kmmi_m)
# Compute dL_dpsi
self.dL_dpsi0 = - 0.5 * self.output_dim * self.likelihood.precision * np.ones(self.batchsize)
self.dL_dpsi1, _ = dpotrs(self.Lm,np.asfortranarray(self.VmT.T),lower=1)
self.dL_dpsi1 = self.dL_dpsi1.T
dL_dpsi2 = -0.5 * self.likelihood.precision * backsub_both_sides(self.Lm, self.LQL - self.output_dim * np.eye(self.num_inducing))
if self.has_uncertain_inputs:
self.dL_dpsi2 = np.repeat(dL_dpsi2[None,:,:],self.batchsize,axis=0)
else:
self.dL_dpsi1 += 2.*np.dot(dL_dpsi2,self.psi1.T).T
self.dL_dpsi2 = None
# Compute dL_dKmm
if do_Kmm_grad:
tmp = np.dot(self.LQL,self.A) - backsub_both_sides(self.Lm,np.dot(self.q_u_expectation[0],self.psi1V.T),transpose='right')
tmp += tmp.T
tmp += -self.output_dim*self.B
tmp += self.data_prop*self.LQL
self.dL_dKmm = 0.5*backsub_both_sides(self.Lm,tmp)
#Compute the gradient of the log likelihood wrt noise variance
self.partial_for_likelihood = -0.5*(self.batchsize*self.output_dim - np.sum(self.A*self.LQL))*self.likelihood.precision
self.partial_for_likelihood += (0.5*self.output_dim*self.trace_K + 0.5 * self.likelihood.trYYT - np.sum(self.likelihood.Y*self.projected_mean))*self.likelihood.precision**2
def log_likelihood(self):
"""
As for uncollapsed sparse GP, but account for the proportion of data we're looking at right now.
NB. self.batchsize is the size of the batch, not the size of X_all
"""
assert not self.likelihood.is_heteroscedastic
A = -0.5*self.batchsize*self.output_dim*(np.log(2.*np.pi) - np.log(self.likelihood.precision))
B = -0.5*self.likelihood.precision*self.output_dim*self.trace_K
Kmm_logdet = 2.*np.sum(np.log(np.diag(self.Lm)))
C = -0.5*self.output_dim*self.data_prop*(Kmm_logdet-self.q_u_logdet - self.num_inducing)
C += -0.5*np.sum(self.LQL * self.B)
D = -0.5*self.likelihood.precision*self.likelihood.trYYT
E = np.sum(self.V*self.projected_mean)
return (A+B+C+D+E)/self.data_prop
def _log_likelihood_gradients(self):
return np.hstack((self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))/self.data_prop
def vb_grad_natgrad(self):
"""
Compute the gradients of the lower bound wrt the canonical and
expectation parameters of u.
Note that the natural gradient in either parameterisation is given by the gradient in the other (see Hensman et al., 2012, "Fast Variational Inference in the Conjugate Exponential Family").
"""
# Gradient for eta
dL_dmmT_S = -0.5*self.Lambda/self.data_prop + 0.5*self.q_u_prec
Kmmipsi1V,_ = dpotrs(self.Lm,self.psi1V,lower=1)
dL_dm = (Kmmipsi1V - np.dot(self.Lambda,self.q_u_mean))/self.data_prop
# Gradients for theta
S = self.q_u_cov
Si = self.q_u_prec
m = self.q_u_mean
dL_dSi = -mdot(S,dL_dmmT_S, S)
dL_dmhSi = -2*dL_dSi
dL_dSim = np.dot(dL_dSi,m) + np.dot(Si, dL_dm)
return np.hstack((dL_dm.flatten(),dL_dmmT_S.flatten())) , np.hstack((dL_dSim.flatten(), dL_dmhSi.flatten()))
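The duality the docstring refers to: for an exponential-family q(u) with natural parameters $\theta$ and expectation parameters $\eta$, the natural gradient with respect to one parameterisation is the ordinary gradient with respect to the other (Hensman et al., 2012),
$\tilde{\nabla}_{\theta}\mathcal{L} = \nabla_{\eta}\mathcal{L}, \qquad \tilde{\nabla}_{\eta}\mathcal{L} = \nabla_{\theta}\mathcal{L},$
which is why this method returns the two stacked gradient vectors as a pair.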
def optimize(self, iterations, print_interval=10, callback=lambda:None, callback_interval=5):
param_step = 0.
#Iterate!
for i in range(iterations):
#store the current configuration for plotting later
self._param_trace.append(self._get_params())
self._ll_trace.append(self.log_likelihood() + self.log_prior())
#load a batch and do the appropriate computations (kernel matrices, etc)
self.load_batch()
#compute the (stochastic) gradient
natgrads = self.vb_grad_natgrad()
grads = self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
self._grad_trace.append(grads)
#compute the steps in all parameters
vb_step = self.vb_steplength*natgrads[0]
#only move the parameters after the first epoch and only if the steplength is nonzero
if (self.epochs>=1) and (self.param_steplength > 0):
param_step = self.momentum*param_step + self.param_steplength*grads
else:
param_step = 0.
self.set_vb_param(self.get_vb_param() + vb_step)
#Note: don't recompute everything here, wait until the next iteration when we have a new batch
self._set_params(self._untransform_params(self._get_params_transformed() + param_step), computations=False)
#print messages if desired
if i and (not i%print_interval):
print i, np.mean(self._ll_trace[-print_interval:]) #, self.log_likelihood()
print np.round(np.mean(self._grad_trace[-print_interval:],0),3)
sys.stdout.flush()
#callback
if i and not i%callback_interval:
callback(self) # Change this to callback()
time.sleep(0.01)
if self.epochs > 10:
self._adapt_steplength()
self._vb_steplength_trace.append(self.vb_steplength)
self._param_steplength_trace.append(self.param_steplength)
self.iterations += 1
def _adapt_steplength(self):
if self.adapt_vb_steplength:
# self._adaptive_vb_steplength()
self._adaptive_vb_steplength_KL()
#self._vb_steplength_trace.append(self.vb_steplength)
assert self.vb_steplength >= 0
if self.adapt_param_steplength:
self._adaptive_param_steplength()
# self._adaptive_param_steplength_log()
# self._adaptive_param_steplength_from_vb()
#self._param_steplength_trace.append(self.param_steplength)
def _adaptive_param_steplength(self):
if hasattr(self, 'adapt_param_steplength_decr'):
decr_factor = self.adapt_param_steplength_decr
else:
decr_factor = 0.02
g_tp = self._transform_gradients(self._log_likelihood_gradients())
self.gbar_tp = (1-1/self.tau_tp)*self.gbar_tp + 1/self.tau_tp * g_tp
self.hbar_tp = (1-1/self.tau_tp)*self.hbar_tp + 1/self.tau_tp * np.dot(g_tp.T, g_tp)
new_param_steplength = np.dot(self.gbar_tp.T, self.gbar_tp) / self.hbar_tp
#- hack
new_param_steplength *= decr_factor
self.param_steplength = (self.param_steplength + new_param_steplength)/2
#-
self.tau_tp = self.tau_tp*(1-self.param_steplength) + 1
def _adaptive_param_steplength_log(self):
stp = np.logspace(np.log(0.0001), np.log(1e-6), base=np.e, num=18000)
self.param_steplength = stp[self.iterations]
def _adaptive_param_steplength_log2(self):
self.param_steplength = (self.iterations + 0.001)**-0.5
def _adaptive_param_steplength_from_vb(self):
self.param_steplength = self.vb_steplength * 0.01
def _adaptive_vb_steplength(self):
decr_factor = 0.1
g_t = self.vb_grad_natgrad()[0]
self.gbar_t = (1-1/self.tau_t)*self.gbar_t + 1/self.tau_t * g_t
self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t.T, g_t)
new_vb_steplength = np.dot(self.gbar_t.T, self.gbar_t) / self.hbar_t
#- hack
new_vb_steplength *= decr_factor
self.vb_steplength = (self.vb_steplength + new_vb_steplength)/2
#-
self.tau_t = self.tau_t*(1-self.vb_steplength) + 1
def _adaptive_vb_steplength_KL(self):
decr_factor = 0.1
natgrad = self.vb_grad_natgrad()
g_t1 = natgrad[0]
g_t2 = natgrad[1]
self.gbar_t1 = (1-1/self.tau_t)*self.gbar_t1 + 1/self.tau_t * g_t1
self.gbar_t2 = (1-1/self.tau_t)*self.gbar_t2 + 1/self.tau_t * g_t2
self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t1.T, g_t2)
self.vb_steplength = np.dot(self.gbar_t1.T, self.gbar_t2) / self.hbar_t
self.vb_steplength *= decr_factor
self.tau_t = self.tau_t*(1-self.vb_steplength) + 1
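In symbols, _adaptive_vb_steplength and _adaptive_param_steplength maintain running averages of the stochastic gradient and of its squared norm with a decaying memory $\tau_t$, and set the step length to their ratio:
$\bar{g}_t = (1-\tfrac{1}{\tau_t})\,\bar{g}_{t-1} + \tfrac{1}{\tau_t}\, g_t, \quad \bar{h}_t = (1-\tfrac{1}{\tau_t})\,\bar{h}_{t-1} + \tfrac{1}{\tau_t}\, g_t^{\top} g_t, \quad \alpha_t = \frac{\bar{g}_t^{\top}\bar{g}_t}{\bar{h}_t}, \quad \tau_{t+1} = \tau_t(1-\alpha_t) + 1.$
Both methods also shrink $\alpha_t$ by a fixed decrease factor and average it with the previous value; the _KL variant instead uses the inner product of the two natural-gradient blocks in place of $g_t^{\top} g_t$.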
def _raw_predict(self, X_new, X_variance_new=None, which_parts='all',full_cov=False):
"""Internal helper function for making predictions, does not account for normalization"""
#TODO: make this more efficient!
self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
tmp = self.Kmmi- mdot(self.Kmmi,self.q_u_cov,self.Kmmi)
if X_variance_new is None:
Kx = self.kern.K(X_new,self.Z)
mu = np.dot(Kx,self.Kmmi_m)
if full_cov:
Kxx = self.kern.K(X_new)
var = Kxx - mdot(Kx,tmp,Kx.T)
else:
Kxx = self.kern.Kdiag(X_new)
var = (Kxx - np.sum(Kx*np.dot(Kx,tmp),1))[:,None]
return mu, var
else:
assert X_variance_new.shape == X_new.shape
Kx = self.kern.psi1(self.Z,X_new, X_variance_new)
mu = np.dot(Kx,self.Kmmi_m)
Kxx = self.kern.psi0(self.Z,X_new,X_variance_new)
psi2 = self.kern.psi2(self.Z,X_new,X_variance_new)
diag_var = Kxx - np.sum(np.sum(psi2*tmp[None,:,:],1),1)
if full_cov:
raise NotImplementedError
else:
return mu, diag_var[:,None]
def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False, sampling=False, num_samples=15000):
# normalize X values
Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
if X_variance_new is not None:
X_variance_new = X_variance_new / self._Xscale ** 2
# here's the actual prediction by the GP model
mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts)
# now push through likelihood
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, sampling=sampling, num_samples=num_samples)
return mean, var, _025pm, _975pm
def set_vb_param(self,vb_param):
"""set the distribution q(u) from the canonical parameters"""
self.q_u_canonical_flat = vb_param.copy()
self.q_u_canonical = self.q_u_canonical_flat[:self.num_inducing*self.output_dim].reshape(self.num_inducing,self.output_dim),self.q_u_canonical_flat[self.num_inducing*self.output_dim:].reshape(self.num_inducing,self.num_inducing)
self.q_u_prec = -2.*self.q_u_canonical[1]
self.q_u_cov, q_u_Li, q_u_L, tmp = pdinv(self.q_u_prec)
self.q_u_Li = q_u_Li
self.q_u_logdet = -tmp
self.q_u_mean, _ = dpotrs(q_u_Li, np.asfortranarray(self.q_u_canonical[0]),lower=1)
self.q_u_expectation = (self.q_u_mean, np.dot(self.q_u_mean,self.q_u_mean.T)+self.q_u_cov*self.output_dim)
def get_vb_param(self):
"""
Return the canonical parameters of the distribution q(u)
"""
return self.q_u_canonical_flat
def plot(self, ax=None, fignum=None, Z_height=None, **kwargs):
if ax is None:
fig = pb.figure(num=fignum)
ax = fig.add_subplot(111)
#horrible hack here:
data = self.likelihood.data.copy()
self.likelihood.data = self.Y
GPBase.plot(self, ax=ax, **kwargs)
self.likelihood.data = data
Zu = self.Z * self._Xscale + self._Xoffset
if self.input_dim==1:
ax.plot(self.X_batch, self.likelihood.data, 'gx',mew=2)
if Z_height is None:
Z_height = ax.get_ylim()[0]
ax.plot(Zu, np.zeros_like(Zu) + Z_height, 'r|', mew=1.5, markersize=12)
if self.input_dim==2:
ax.scatter(self.X[:,0], self.X[:,1], 20., self.Y[:,0], linewidth=0, cmap=pb.cm.jet)
ax.plot(Zu[:,0], Zu[:,1], 'w^')
def plot_traces(self):
pb.figure()
t = np.array(self._param_trace)
pb.subplot(2,1,1)
for l,ti in zip(self._get_param_names(),t.T):
if not l[:3]=='iip':
pb.plot(ti,label=l)
pb.legend(loc=0)
pb.subplot(2,1,2)
pb.plot(np.asarray(self._ll_trace),label='stochastic likelihood')
pb.legend(loc=0)

420
GPy/core/symbolic.py Normal file
View file

@ -0,0 +1,420 @@
# Copyright (c) 2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import sys
import re
from ..core.parameterization import Parameterized
import numpy as np
import sympy as sym
from ..core.parameterization import Param
from sympy.utilities.lambdify import lambdastr, _imp_namespace, _get_namespace
from sympy.utilities.iterables import numbered_symbols
import scipy
import GPy
def getFromDict(dataDict, mapList):
return reduce(lambda d, k: d[k], mapList, dataDict)
def setInDict(dataDict, mapList, value):
getFromDict(dataDict, mapList[:-1])[mapList[-1]] = value
class Symbolic_core():
"""
Base model symbolic class.
"""
def __init__(self, expressions, cacheable, derivatives=None, parameters=None, func_modules=[]):
# Base class init, do some basic derivatives etc.
# Func_modules sets up the right mapping for functions.
func_modules += [{'gamma':scipy.special.gamma,
'gammaln':scipy.special.gammaln,
'erf':scipy.special.erf, 'erfc':scipy.special.erfc,
'erfcx':scipy.special.erfcx,
'polygamma':scipy.special.polygamma,
'normcdf':GPy.util.functions.normcdf,
'normcdfln':GPy.util.functions.normcdfln,
'logistic':GPy.util.functions.logistic,
'logisticln':GPy.util.functions.logisticln},
'numpy']
self._set_expressions(expressions)
self._set_variables(cacheable)
self._set_derivatives(derivatives)
self._set_parameters(parameters)
# Convert the expressions to a list for common sub expression elimination
# We should find the following type of expressions: 'function', 'derivative', 'second_derivative', 'third_derivative'.
self.update_expression_list()
# Apply any global stabilisation operations to expressions.
self.global_stabilize()
# Helper functions to get data in and out of dictionaries.
# this code from http://stackoverflow.com/questions/14692690/access-python-nested-dictionary-items-via-a-list-of-keys
self.extract_sub_expressions()
self._gen_code()
self._set_namespace(func_modules)
def _set_namespace(self, namespaces):
"""Set the name space for use when calling eval. This needs to contain all the relvant functions for mapping from symbolic python to the numerical python. It also contains variables, cached portions etc."""
self.namespace = {}
for m in namespaces[::-1]:
buf = _get_namespace(m)
self.namespace.update(buf)
self.namespace.update(self.__dict__)
def _set_expressions(self, expressions):
"""Extract expressions and variables from the user provided expressions."""
self.expressions = {}
for key, item in expressions.items():
self.expressions[key] = {'function': item}
def _set_variables(self, cacheable):
"""Pull the variable names out of the provided expressions and separate into cacheable expressions and normal parameters. Those that are only stored in the cache, the parameters are stored in this object."""
# pull the parameters and inputs out of the symbolic pdf
def extract_vars(expr):
return [e for e in expr.atoms() if e.is_Symbol and e not in vars]
self.cacheable = cacheable
self.variables = {}
vars = []
for expression in self.expressions.values():
vars += extract_vars(expression['function'])
# inputs are assumed to be those things that are
# cacheable. I.e. those things that aren't stored within the
# object except as cached. For covariance functions this is X
# and Z, for likelihoods F and for mapping functions X.
self.cacheable_vars = [] # list of everything that's cacheable
for var in cacheable:
self.variables[var] = [e for e in vars if e.name.split('_')[0]==var.lower()]
self.cacheable_vars += self.variables[var]
for var in cacheable:
if not self.variables[var]:
raise ValueError('Variable ' + var + ' was specified as cacheable but is not in expression. Expected to find symbols of the form ' + var.lower() + '_0 to represent ' + var)
# things that aren't cacheable are assumed to be parameters.
self.variables['theta'] = sorted([e for e in vars if not e in self.cacheable_vars],key=lambda e:e.name)
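A small, hypothetical illustration (not taken from the code base) of the naming convention _set_variables relies on: cacheable inputs must appear as lower-cased, column-indexed symbols such as x_0 and z_0, and every remaining symbol is collected under 'theta' as a parameter:
import sympy as sym
x_0, z_0, lengthscale, variance = sym.symbols('x_0 z_0 lengthscale variance')
rbf_expr = variance * sym.exp(-(x_0 - z_0)**2 / (2 * lengthscale**2))
expressions = {'k': rbf_expr}   # passed to the constructor as `expressions`
cacheable = ['X', 'Z']          # matched against the x_0 / z_0 symbols;
                                # lengthscale and variance end up in variables['theta']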
def _set_derivatives(self, derivatives):
# these are arguments for computing derivatives.
def extract_derivative(function, derivative_arguments):
return {theta.name : self.stabilize(sym.diff(function,theta)) for theta in derivative_arguments}
derivative_arguments = []
if derivatives is not None:
for derivative in derivatives:
derivative_arguments += self.variables[derivative]
# Do symbolic work to compute derivatives.
for key, func in self.expressions.items():
# if func['function'].is_Matrix:
# rows = func['function'].shape[0]
# cols = func['function'].shape[1]
# self.expressions[key]['derivative'] = sym.zeros(rows, cols)
# for i in xrange(rows):
# for j in xrange(cols):
# self.expressions[key]['derivative'][i, j] = extract_derivative(func['function'][i, j], derivative_arguments)
# else:
self.expressions[key]['derivative'] = extract_derivative(func['function'], derivative_arguments)
def _set_parameters(self, parameters):
"""Add parameters to the model and initialize with given values."""
for theta in self.variables['theta']:
val = 1.0
# TODO: improve approach for initializing parameters.
if parameters is not None:
if parameters.has_key(theta.name):
val = parameters[theta.name]
# Add parameter.
self.link_parameters(Param(theta.name, val, None))
#self._set_attribute(theta.name, )
def eval_parameters_changed(self):
# TODO: place checks for inf/nan in here
# do all the precomputation codes.
self.eval_update_cache()
def eval_update_cache(self, **kwargs):
# TODO: place checks for inf/nan in here
# for all provided keywords
for var, code in self.variable_sort(self.code['parameters_changed']):
self._set_attribute(var, eval(code, self.namespace))
for var, value in kwargs.items():
# update their cached values
if value is not None:
if var == 'X' or var == 'F' or var == 'M':
value = np.atleast_2d(value)
for i, theta in enumerate(self.variables[var]):
self._set_attribute(theta.name, value[:, i][:, None])
elif var == 'Y':
# Y values can be missing.
value = np.atleast_2d(value)
for i, theta in enumerate(self.variables[var]):
self._set_attribute('missing' + str(i), np.isnan(value[:, i]))
self._set_attribute(theta.name, value[:, i][:, None])
elif var == 'Z':
value = np.atleast_2d(value)
for i, theta in enumerate(self.variables[var]):
self._set_attribute(theta.name, value[:, i][None, :])
else:
value = np.atleast_1d(value)
for i, theta in enumerate(self.variables[var]):
self._set_attribute(theta.name, value[i])
for var, code in self.variable_sort(self.code['update_cache']):
self._set_attribute(var, eval(code, self.namespace))
def eval_update_gradients(self, function, partial, **kwargs):
# TODO: place checks for inf/nan in here?
self.eval_update_cache(**kwargs)
gradient = {}
for theta in self.variables['theta']:
code = self.code[function]['derivative'][theta.name]
gradient[theta.name] = (partial*eval(code, self.namespace)).sum()
return gradient
def eval_gradients_X(self, function, partial, **kwargs):
if kwargs.has_key('X'):
gradients_X = np.zeros_like(kwargs['X'])
self.eval_update_cache(**kwargs)
for i, theta in enumerate(self.variables['X']):
code = self.code[function]['derivative'][theta.name]
gradients_X[:, i:i+1] = partial*eval(code, self.namespace)
return gradients_X
def eval_function(self, function, **kwargs):
self.eval_update_cache(**kwargs)
return eval(self.code[function]['function'], self.namespace)
def code_parameters_changed(self):
# do all the precomputation codes.
lcode = ''
for variable, code in self.variable_sort(self.code['parameters_changed']):
lcode += self._print_code(variable) + ' = ' + self._print_code(code) + '\n'
return lcode
def code_update_cache(self):
lcode = ''
for var in self.cacheable:
lcode += 'if ' + var + ' is not None:\n'
if var == 'X':
reorder = '[:, None]'
elif var == 'Z':
reorder = '[None, :]'
else:
reorder = ''
for i, theta in enumerate(self.variables[var]):
lcode+= "\t" + var + '= np.atleast_2d(' + var + ')\n'
lcode+= "\t" + self._print_code(theta.name) + ' = ' + var + '[:, ' + str(i) + "]" + reorder + "\n"
for variable, code in self.variable_sort(self.code['update_cache']):
lcode+= self._print_code(variable) + ' = ' + self._print_code(code) + "\n"
return lcode
def code_update_gradients(self, function):
lcode = ''
for theta in self.variables['theta']:
code = self.code[function]['derivative'][theta.name]
lcode += self._print_code(theta.name) + '.gradient = (partial*(' + self._print_code(code) + ')).sum()\n'
return lcode
def code_gradients_cacheable(self, function, variable):
if variable not in self.cacheable:
raise RuntimeError, variable + ' must be cacheable.'
lcode = 'gradients_' + variable + ' = np.zeros_like(' + variable + ')\n'
lcode += 'self.update_cache(' + ', '.join(self.cacheable) + ')\n'
for i, theta in enumerate(self.variables[variable]):
code = self.code[function]['derivative'][theta.name]
lcode += 'gradients_' + variable + '[:, ' + str(i) + ':' + str(i) + '+1] = partial*' + self._print_code(code) + '\n'
lcode += 'return gradients_' + variable + '\n'
return lcode
def code_function(self, function):
lcode = 'self.update_cache(' + ', '.join(self.cacheable) + ')\n'
lcode += 'return ' + self._print_code(self.code[function]['function'])
return lcode
def stabilize(self, expr):
"""Stabilize the code in the model."""
# this code is applied to expressions in the model in an attempt to stabilize them.
return expr
def global_stabilize(self):
"""Stabilize all code in the model."""
pass
def _set_attribute(self, name, value):
"""Make sure namespace gets updated when setting attributes."""
setattr(self, name, value)
self.namespace.update({name: getattr(self, name)})
def update_expression_list(self):
"""Extract a list of expressions from the dictionary of expressions."""
self.expression_list = [] # code arrives in dictionary, but is passed in this list
self.expression_keys = [] # Keep track of the dictionary keys.
self.expression_order = [] # This may be unnecessary. It's to give ordering for cse
for fname, fexpressions in self.expressions.items():
for type, texpressions in fexpressions.items():
if type == 'function':
self.expression_list.append(texpressions)
self.expression_keys.append([fname, type])
self.expression_order.append(1)
elif type[-10:] == 'derivative':
for dtype, expression in texpressions.items():
self.expression_list.append(expression)
self.expression_keys.append([fname, type, dtype])
if type[:-10] == 'first_' or type[:-10] == '':
self.expression_order.append(3) #sym.count_ops(self.expressions[type][dtype]))
elif type[:-10] == 'second_':
self.expression_order.append(4) #sym.count_ops(self.expressions[type][dtype]))
elif type[:-10] == 'third_':
self.expression_order.append(5) #sym.count_ops(self.expressions[type][dtype]))
else:
self.expression_list.append(fexpressions[type])
self.expression_keys.append([fname, type])
self.expression_order.append(2)
# This step may be unnecessary.
# Not 100% sure if the sub expression elimination is order sensitive. This step orders the list with the 'function' code first and derivatives after.
self.expression_order, self.expression_list, self.expression_keys = zip(*sorted(zip(self.expression_order, self.expression_list, self.expression_keys)))
def extract_sub_expressions(self, cache_prefix='cache', sub_prefix='sub', prefix='XoXoXoX'):
# Do the common sub expression elimination.
common_sub_expressions, expression_substituted_list = sym.cse(self.expression_list, numbered_symbols(prefix=prefix))
self.variables[cache_prefix] = []
self.variables[sub_prefix] = []
# Create dictionary of new sub expressions
sub_expression_dict = {}
for var, void in common_sub_expressions:
sub_expression_dict[var.name] = var
# Sort out any expression that's dependent on something that scales with data size (these are listed in cacheable).
cacheable_list = []
params_change_list = []
# common_sub_expressions contains a list of paired tuples with the new variable and what it equals
for var, expr in common_sub_expressions:
arg_list = [e for e in expr.atoms() if e.is_Symbol]
# List any cacheable dependencies of the sub-expression
cacheable_symbols = [e for e in arg_list if e in cacheable_list or e in self.cacheable_vars]
if cacheable_symbols:
# list which ensures dependencies are cacheable.
cacheable_list.append(var)
else:
params_change_list.append(var)
replace_dict = {}
for i, expr in enumerate(cacheable_list):
sym_var = sym.var(cache_prefix + str(i))
self.variables[cache_prefix].append(sym_var)
replace_dict[expr.name] = sym_var
for i, expr in enumerate(params_change_list):
sym_var = sym.var(sub_prefix + str(i))
self.variables[sub_prefix].append(sym_var)
replace_dict[expr.name] = sym_var
for replace, void in common_sub_expressions:
for expr, keys in zip(expression_substituted_list, self.expression_keys):
setInDict(self.expressions, keys, expr.subs(replace, replace_dict[replace.name]))
for void, expr in common_sub_expressions:
expr = expr.subs(replace, replace_dict[replace.name])
# Replace original code with code including subexpressions.
for keys in self.expression_keys:
for replace, void in common_sub_expressions:
setInDict(self.expressions, keys, getFromDict(self.expressions, keys).subs(replace, replace_dict[replace.name]))
self.expressions['parameters_changed'] = {}
self.expressions['update_cache'] = {}
for var, expr in common_sub_expressions:
for replace, void in common_sub_expressions:
expr = expr.subs(replace, replace_dict[replace.name])
if var in cacheable_list:
self.expressions['update_cache'][replace_dict[var.name].name] = expr
else:
self.expressions['parameters_changed'][replace_dict[var.name].name] = expr
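A standalone illustration of the sympy common sub-expression elimination step used above; the toy expression and symbol prefix are hypothetical:
import sympy as sym
from sympy.utilities.iterables import numbered_symbols
x, y = sym.symbols('x y')
subs, reduced = sym.cse([sym.sin(x + y) + (x + y)**2], numbered_symbols('sub'))
# subs contains roughly [(sub0, x + y)] and reduced [sin(sub0) + sub0**2];
# extract_sub_expressions then sorts such sub-expressions into 'update_cache'
# (data dependent) and 'parameters_changed' (parameter only) groups.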
def _gen_code(self):
"""Generate code for the list of expressions provided using the common sub-expression eliminator to separate out portions that are computed multiple times."""
# This is the dictionary that stores all the generated code.
self.code = {}
def match_key(expr):
if type(expr) is dict:
code = {}
for key in expr.keys():
code[key] = match_key(expr[key])
else:
arg_list = [e for e in expr.atoms() if e.is_Symbol]
code = self._expr2code(arg_list, expr)
return code
self.code = match_key(self.expressions)
def _expr2code(self, arg_list, expr):
"""Convert the given symbolic expression into code."""
code = lambdastr(arg_list, expr)
function_code = code.split(':')[1].strip()
#for arg in arg_list:
# function_code = function_code.replace(arg.name, 'self.'+arg.name)
return function_code
def _print_code(self, code):
"""Prepare code for string writing."""
# This needs a rewrite --- it doesn't check for match clashes! So sub11 would be replaced by sub1 before being replaced with sub11!!
for key in self.variables.keys():
for arg in self.variables[key]:
code = code.replace(arg.name, 'self.'+arg.name)
return code
def _display_expression(self, keys, user_substitutes={}):
"""Helper function for human friendly display of the symbolic components."""
# Create some pretty maths symbols for the display.
sigma, alpha, nu, omega, l, variance = sym.var('\sigma, \alpha, \nu, \omega, \ell, \sigma^2')
substitutes = {'scale': sigma, 'shape': alpha, 'lengthscale': l, 'variance': variance}
substitutes.update(user_substitutes)
function_substitutes = {normcdfln : lambda arg : sym.log(normcdf(arg)),
logisticln : lambda arg : -sym.log(1+sym.exp(-arg)),
logistic : lambda arg : 1/(1+sym.exp(-arg)),
erfcx : lambda arg : erfc(arg)/sym.exp(arg*arg),
gammaln : lambda arg : sym.log(sym.gamma(arg))}
expr = getFromDict(self.expressions, keys)
for var_name, sub in self.variable_sort(self.expressions['update_cache'], reverse=True):
for var in self.variables['cache']:
if var_name == var.name:
expr = expr.subs(var, sub)
break
for var_name, sub in self.variable_sort(self.expressions['parameters_changed'], reverse=True):
for var in self.variables['sub']:
if var_name == var.name:
expr = expr.subs(var, sub)
break
for var_name, sub in self.variable_sort(substitutes, reverse=True):
for var in self.variables['theta']:
if var_name == var.name:
expr = expr.subs(var, sub)
break
for m, r in function_substitutes.iteritems():
expr = expr.replace(m, r)#normcdfln, lambda arg : sym.log(normcdf(arg)))
return expr.simplify()
def variable_sort(self, var_dict, reverse=False):
def sort_key(x):
digits = re.findall(r'\d+$', x[0])
if digits:
return int(digits[0])
else:
return x[0]
return sorted(var_dict.iteritems(), key=sort_key, reverse=reverse)

View file

@ -1,143 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from GPy.core.domains import POSITIVE, NEGATIVE, BOUNDED
import sys
lim_val = -np.log(sys.float_info.epsilon)
class transformation(object):
domain = None
def f(self, x):
raise NotImplementedError
def finv(self, x):
raise NotImplementedError
def gradfactor(self, f):
""" df_dx evaluated at self.f(x)=f"""
raise NotImplementedError
def initialize(self, f):
""" produce a sensible initial value for f(x)"""
raise NotImplementedError
def __str__(self):
raise NotImplementedError
class logexp(transformation):
domain = POSITIVE
def f(self, x):
return np.where(x<-lim_val, np.log(1+np.exp(-lim_val)), np.where(x>lim_val, x, np.log(1. + np.exp(x))))
def finv(self, f):
return np.where(f>lim_val, f, np.log(np.exp(f) - 1.))
def gradfactor(self, f):
return np.where(f>lim_val, 1., 1 - np.exp(-f))
def initialize(self, f):
if np.any(f < 0.):
print "Warning: changing parameters to satisfy constraints"
return np.abs(f)
def __str__(self):
return '(+ve)'
class negative_logexp(transformation):
domain = NEGATIVE
def f(self, x):
return -logexp.f(x)
def finv(self, f):
return logexp.finv(-f)
def gradfactor(self, f):
return -logexp.gradfactor(-f)
def initialize(self, f):
return -logexp.initialize(f)
def __str__(self):
return '(-ve)'
class logexp_clipped(logexp):
max_bound = 1e100
min_bound = 1e-10
log_max_bound = np.log(max_bound)
log_min_bound = np.log(min_bound)
domain = POSITIVE
def __init__(self, lower=1e-6):
self.lower = lower
def f(self, x):
exp = np.exp(np.clip(x, self.log_min_bound, self.log_max_bound))
f = np.log(1. + exp)
# if np.isnan(f).any():
# import ipdb;ipdb.set_trace()
return np.clip(f, self.min_bound, self.max_bound)
def finv(self, f):
return np.log(np.exp(f) - 1.)
def gradfactor(self, f):
ef = np.exp(f) # np.clip(f, self.min_bound, self.max_bound))
gf = (ef - 1.) / ef
return gf # np.where(f < self.lower, 0, gf)
def initialize(self, f):
if np.any(f < 0.):
print "Warning: changing parameters to satisfy constraints"
return np.abs(f)
def __str__(self):
return '(+ve_c)'
class exponent(transformation):
domain = POSITIVE
def f(self, x):
return np.where(x<lim_val, np.where(x>-lim_val, np.exp(x), np.exp(-lim_val)), np.exp(lim_val))
def finv(self, x):
return np.log(x)
def gradfactor(self, f):
return f
def initialize(self, f):
if np.any(f < 0.):
print "Warning: changing parameters to satisfy constraints"
return np.abs(f)
def __str__(self):
return '(+ve)'
class negative_exponent(exponent):
domain = NEGATIVE
def f(self, x):
return -exponent.f(x)
def finv(self, f):
return exponent.finv(-f)
def gradfactor(self, f):
return f
def initialize(self, f):
return -exponent.initialize(f) #np.abs(f)
def __str__(self):
return '(-ve)'
class square(transformation):
domain = POSITIVE
def f(self, x):
return x ** 2
def finv(self, x):
return np.sqrt(x)
def gradfactor(self, f):
return 2 * np.sqrt(f)
def initialize(self, f):
return np.abs(f)
def __str__(self):
return '(+sq)'
class logistic(transformation):
domain = BOUNDED
def __init__(self, lower, upper):
assert lower < upper
self.lower, self.upper = float(lower), float(upper)
self.difference = self.upper - self.lower
def f(self, x):
return self.lower + self.difference / (1. + np.exp(-x))
def finv(self, f):
return np.log(np.clip(f - self.lower, 1e-10, np.inf) / np.clip(self.upper - f, 1e-10, np.inf))
def gradfactor(self, f):
return (f - self.lower) * (self.upper - f) / self.difference
def initialize(self, f):
if np.any(np.logical_or(f < self.lower, f > self.upper)):
print "Warning: changing parameters to satisfy constraints"
return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(f * 0.), f)
def __str__(self):
return '({},{})'.format(self.lower, self.upper)
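In closed form, the two most used transforms above, matching f, finv and gradfactor in the code:
logexp (positive): $f(x) = \log(1+e^{x}), \quad f^{-1}(f) = \log(e^{f}-1), \quad \frac{df}{dx} = \frac{e^{x}}{1+e^{x}} = 1 - e^{-f}.$
logistic (bounded in $(a,b)$): $f(x) = a + \frac{b-a}{1+e^{-x}}, \quad f^{-1}(f) = \log\frac{f-a}{b-f}, \quad \frac{df}{dx} = \frac{(f-a)(b-f)}{b-a}.$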

27
GPy/defaults.cfg Normal file
View file

@ -0,0 +1,27 @@
# This is the default configuration file for GPy
# Do not edit this file.
# For machine specific changes (i.e. those specific to a given installation) edit GPy/installation.cfg
# For user specific changes edit $HOME/.gpy_user.cfg
[parallel]
# Enable openmp support. This speeds up some computations, depending on the number
# of cores available. Setting up a compiler with openmp support can be difficult on
# some platforms, hence by default it is off.
openmp=False
[datasets]
# location for the local data cache
dir=$HOME/tmp/GPy-datasets/
[anaconda]
# if you have an anaconda python installation please specify it here.
installed = False
location = None
# set this to true if you have the MKL optimizations installed:
MKL = False
[weave]
#if true, try to use weave, and fall back to numpy. if false, just use numpy.
working = True
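A minimal Python 2 sketch of reading this layered configuration; the precedence (defaults, then installation, then the user file) is taken from the comments above and is an assumption about how GPy itself merges them:
import os
from ConfigParser import SafeConfigParser
parser = SafeConfigParser()
# later files in the list override values from earlier ones
parser.read(['GPy/defaults.cfg',
             'GPy/installation.cfg',
             os.path.expanduser('~/.gpy_user.cfg')])
print parser.getboolean('parallel', 'openmp')
print parser.get('datasets', 'dir')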

View file

@ -1,8 +1,7 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import classification import classification
import regression import regression
import dimensionality_reduction import dimensionality_reduction
import tutorials import non_gaussian
import stochastic

View file

@ -1,11 +1,10 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
""" """
Gaussian Processes classification Gaussian Processes classification examples
""" """
import pylab as pb
import GPy import GPy
default_seed = 10000 default_seed = 10000
@ -15,7 +14,9 @@ def oil(num_inducing=50, max_iters=100, kernel=None, optimize=True, plot=True):
Run a Gaussian process classification on the three phase oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood. Run a Gaussian process classification on the three phase oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
""" """
data = GPy.util.datasets.oil() try:import pods
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
data = pods.datasets.oil()
X = data['X'] X = data['X']
Xtest = data['Xtest'] Xtest = data['Xtest']
Y = data['Y'][:, 0:1] Y = data['Y'][:, 0:1]
@ -27,13 +28,13 @@ def oil(num_inducing=50, max_iters=100, kernel=None, optimize=True, plot=True):
m = GPy.models.SparseGPClassification(X, Y, kernel=kernel, num_inducing=num_inducing) m = GPy.models.SparseGPClassification(X, Y, kernel=kernel, num_inducing=num_inducing)
# Constrain all parameters to be positive # Constrain all parameters to be positive
m.tie_params('.*len') #m.tie_params('.*len')
m['.*len'] = 10. m['.*len'] = 10.
m.update_likelihood_approximation()
# Optimize # Optimize
if optimize: if optimize:
m.optimize(max_iters=max_iters) for _ in range(5):
m.optimize(max_iters=int(max_iters/5))
print(m) print(m)
#Test #Test
@ -50,7 +51,9 @@ def toy_linear_1d_classification(seed=default_seed, optimize=True, plot=True):
""" """
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed) try:import pods
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
data = pods.datasets.toy_linear_1d_classification(seed=seed)
Y = data['Y'][:, 0:1] Y = data['Y'][:, 0:1]
Y[Y.flatten() == -1] = 0 Y[Y.flatten() == -1] = 0
@ -61,13 +64,14 @@ def toy_linear_1d_classification(seed=default_seed, optimize=True, plot=True):
if optimize: if optimize:
#m.update_likelihood_approximation() #m.update_likelihood_approximation()
# Parameters optimization: # Parameters optimization:
#m.optimize() m.optimize()
#m.update_likelihood_approximation() #m.update_likelihood_approximation()
m.pseudo_EM() #m.pseudo_EM()
# Plot # Plot
if plot: if plot:
fig, axes = pb.subplots(2, 1) from matplotlib import pyplot as plt
fig, axes = plt.subplots(2, 1)
m.plot_f(ax=axes[0]) m.plot_f(ax=axes[0])
m.plot(ax=axes[1]) m.plot(ax=axes[1])
@ -83,27 +87,30 @@ def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=
""" """
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed) try:import pods
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
data = pods.datasets.toy_linear_1d_classification(seed=seed)
Y = data['Y'][:, 0:1] Y = data['Y'][:, 0:1]
Y[Y.flatten() == -1] = 0 Y[Y.flatten() == -1] = 0
bern_noise_model = GPy.likelihoods.bernoulli() likelihood = GPy.likelihoods.Bernoulli()
laplace_likelihood = GPy.likelihoods.Laplace(Y.copy(), bern_noise_model) laplace_inf = GPy.inference.latent_function_inference.Laplace()
kernel = GPy.kern.RBF(1)
# Model definition # Model definition
m = GPy.models.GPClassification(data['X'], Y, likelihood=laplace_likelihood) m = GPy.core.GP(data['X'], Y, kernel=kernel, likelihood=likelihood, inference_method=laplace_inf)
print m
# Optimize # Optimize
if optimize: if optimize:
#m.update_likelihood_approximation() try:
# Parameters optimization: m.optimize('scg', messages=1)
m.optimize('bfgs', messages=1) except Exception as e:
#m.pseudo_EM() return m
# Plot # Plot
if plot: if plot:
fig, axes = pb.subplots(2, 1) from matplotlib import pyplot as plt
fig, axes = plt.subplots(2, 1)
m.plot_f(ax=axes[0]) m.plot_f(ax=axes[0])
m.plot(ax=axes[1]) m.plot(ax=axes[1])
@ -119,7 +126,9 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti
""" """
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed) try:import pods
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
data = pods.datasets.toy_linear_1d_classification(seed=seed)
Y = data['Y'][:, 0:1] Y = data['Y'][:, 0:1]
Y[Y.flatten() == -1] = 0 Y[Y.flatten() == -1] = 0
@ -129,21 +138,19 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti
# Optimize # Optimize
if optimize: if optimize:
#m.update_likelihood_approximation() m.optimize()
# Parameters optimization:
#m.optimize()
m.pseudo_EM()
# Plot # Plot
if plot: if plot:
fig, axes = pb.subplots(2, 1) from matplotlib import pyplot as plt
fig, axes = plt.subplots(2, 1)
m.plot_f(ax=axes[0]) m.plot_f(ax=axes[0])
m.plot(ax=axes[1]) m.plot(ax=axes[1])
print m print m
return m return m
def toy_heaviside(seed=default_seed, optimize=True, plot=True): def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True):
""" """
Simple 1D classification example using a Heaviside GP transformation Simple 1D classification example using a Heaviside GP transformation
@ -152,25 +159,30 @@ def toy_heaviside(seed=default_seed, optimize=True, plot=True):
""" """
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed) try:import pods
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
data = pods.datasets.toy_linear_1d_classification(seed=seed)
Y = data['Y'][:, 0:1] Y = data['Y'][:, 0:1]
Y[Y.flatten() == -1] = 0 Y[Y.flatten() == -1] = 0
# Model definition # Model definition
noise_model = GPy.likelihoods.bernoulli(GPy.likelihoods.noise_models.gp_transformations.Heaviside()) kernel = GPy.kern.RBF(1)
likelihood = GPy.likelihoods.EP(Y, noise_model) likelihood = GPy.likelihoods.Bernoulli(gp_link=GPy.likelihoods.link_functions.Heaviside())
m = GPy.models.GPClassification(data['X'], likelihood=likelihood) ep = GPy.inference.latent_function_inference.expectation_propagation.EP()
m = GPy.core.GP(X=data['X'], Y=Y, kernel=kernel, likelihood=likelihood, inference_method=ep, name='gp_classification_heaviside')
#m = GPy.models.GPClassification(data['X'], likelihood=likelihood)
# Optimize # Optimize
if optimize: if optimize:
m.update_likelihood_approximation()
# Parameters optimization: # Parameters optimization:
m.optimize() for _ in range(5):
#m.pseudo_EM() m.optimize(max_iters=int(max_iters/5))
print m
# Plot # Plot
if plot: if plot:
fig, axes = pb.subplots(2, 1) from matplotlib import pyplot as plt
fig, axes = plt.subplots(2, 1)
m.plot_f(ax=axes[0]) m.plot_f(ax=axes[0])
m.plot(ax=axes[1]) m.plot(ax=axes[1])
@ -189,7 +201,9 @@ def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=
:param kernel: kernel to use in the model :param kernel: kernel to use in the model
:type kernel: a GPy kernel :type kernel: a GPy kernel
""" """
data = GPy.util.datasets.crescent_data(seed=seed) try:import pods
except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets'
data = pods.datasets.crescent_data(seed=seed)
Y = data['Y'] Y = data['Y']
Y[Y.flatten()==-1] = 0 Y[Y.flatten()==-1] = 0
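The recurring change in this file is that the old likelihood wrappers (GPy.likelihoods.bernoulli with EP or Laplace objects wrapping Y) are replaced by an explicit likelihood object plus an inference method passed to GPy.core.GP. A minimal, self-contained sketch of that pattern on synthetic data (assuming the API used in the hunks above; it is not lifted verbatim from the examples):

    import numpy as np
    import GPy

    # Toy 1D binary classification data, purely illustrative.
    np.random.seed(0)
    X = np.random.rand(40, 1) * 10
    Y = (np.sin(X) > 0).astype(float)

    kernel = GPy.kern.RBF(1)
    likelihood = GPy.likelihoods.Bernoulli()
    laplace_inf = GPy.inference.latent_function_inference.Laplace()

    # Classification GP with an explicit inference method, as in the diff above.
    m = GPy.core.GP(X, Y, kernel=kernel, likelihood=likelihood,
                    inference_method=laplace_inf)
    m.optimize('bfgs', max_iters=100)
    print(m)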
View file
@ -0,0 +1,89 @@
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
try:
import pylab as pb
except:
pass
import GPy
pb.ion()
pb.close('all')
X1 = np.arange(3)[:,None]
X2 = np.arange(4)[:,None]
I1 = np.zeros_like(X1)
I2 = np.ones_like(X2)
_X = np.vstack([ X1, X2 ])
_I = np.vstack([ I1, I2 ])
X = np.hstack([ _X, _I ])
Y1 = np.sin(X1/8.)
Y2 = np.cos(X2/8.)
Bias = GPy.kern.Bias(1,active_dims=[0])
Coreg = GPy.kern.Coregionalize(1,2,active_dims=[1])
K = Bias.prod(Coreg,name='X')
#K.coregion.W = 0
#print K.coregion.W
#print Bias.K(_X,_X)
#print K.K(X,X)
#pb.matshow(K.K(X,X))
Mlist = [GPy.kern.Matern32(1,lengthscale=20.,name="Mat")]
kern = GPy.util.multioutput.LCM(input_dim=1,num_outputs=2,kernels_list=Mlist,name='H')
kern.B.W = 0
kern.B.kappa = 1.
#kern.B.W.fix()
#kern.B.kappa.fix()
#m = GPy.models.GPCoregionalizedRegression(X_list=[X1,X2], Y_list=[Y1,Y2], kernel=kern)
Z1 = np.array([1.5,2.5])[:,None]
m = GPy.models.SparseGPCoregionalizedRegression(X_list=[X1], Y_list=[Y1], Z_list = [Z1], kernel=kern)
#m.optimize()
m.checkgrad(verbose=1)
"""
fig = pb.figure()
ax0 = fig.add_subplot(211)
ax1 = fig.add_subplot(212)
slices = GPy.util.multioutput.get_slices([Y1,Y2])
m.plot(fixed_inputs=[(1,0)],which_data_rows=slices[0],ax=ax0)
#m.plot(fixed_inputs=[(1,1)],which_data_rows=slices[1],ax=ax1)
"""
"""
X1 = 100 * np.random.rand(100)[:,None]
X2 = 100 * np.random.rand(100)[:,None]
#X1.sort()
#X2.sort()
Y1 = np.sin(X1/10.) + np.random.rand(100)[:,None]
Y2 = np.cos(X2/10.) + np.random.rand(100)[:,None]
Mlist = [GPy.kern.Matern32(1,lengthscale=20.,name="Mat")]
kern = GPy.util.multioutput.LCM(input_dim=1,num_outputs=12,kernels_list=Mlist,name='H')
m = GPy.models.GPCoregionalizedRegression(X_list=[X1,X2], Y_list=[Y1,Y2], kernel=kern)
m.optimize()
fig = pb.figure()
ax0 = fig.add_subplot(211)
ax1 = fig.add_subplot(212)
slices = GPy.util.multioutput.get_slices([Y1,Y2])
m.plot(fixed_inputs=[(1,0)],which_data_rows=slices[0],ax=ax0)
m.plot(fixed_inputs=[(1,1)],which_data_rows=slices[1],ax=ax1)
"""
View file
@ -1,75 +1,80 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as _np import numpy as _np
default_seed = 123344 default_seed = 123344
def bgplvm_test_model(seed=default_seed, optimize=False, verbose=1, plot=False): # default_seed = _np.random.seed(123344)
def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan=False):
""" """
model for testing purposes. Samples from a GP with rbf kernel and learns model for testing purposes. Samples from a GP with rbf kernel and learns
the samples with a new kernel. Normally not for optimization, just model checking the samples with a new kernel. Normally not for optimization, just model checking
""" """
from GPy.likelihoods.gaussian import Gaussian
import GPy import GPy
num_inputs = 13 num_inputs = 13
num_inducing = 5 num_inducing = 5
if plot: if plot:
output_dim = 1 output_dim = 1
input_dim = 2 input_dim = 3
else: else:
input_dim = 2 input_dim = 2
output_dim = 25 output_dim = output_dim
# generate GPLVM-like data # generate GPLVM-like data
X = _np.random.rand(num_inputs, input_dim) X = _np.random.rand(num_inputs, input_dim)
lengthscales = _np.random.rand(input_dim) lengthscales = _np.random.rand(input_dim)
k = (GPy.kern.rbf(input_dim, .5, lengthscales, ARD=True) k = GPy.kern.RBF(input_dim, .5, lengthscales, ARD=True)
+ GPy.kern.white(input_dim, 0.01))
K = k.K(X) K = k.K(X)
Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, output_dim).T Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, (output_dim,)).T
lik = Gaussian(Y, normalize=True)
k = GPy.kern.rbf_inv(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim) # k = GPy.kern.RBF_inv(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim)
# k = GPy.kern.linear(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001) # k = GPy.kern.linear(input_dim)# + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
# k = GPy.kern.rbf(input_dim, ARD = False) + GPy.kern.white(input_dim, 0.00001) # k = GPy.kern.RBF(input_dim, ARD = False) + GPy.kern.white(input_dim, 0.00001)
# k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.rbf(input_dim, .3, _np.ones(input_dim) * .2, ARD=True) # k = GPy.kern.RBF(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.RBF(input_dim, .3, _np.ones(input_dim) * .2, ARD=True)
# k = GPy.kern.rbf(input_dim, .5, 2., ARD=0) + GPy.kern.rbf(input_dim, .3, .2, ARD=0) # k = GPy.kern.RBF(input_dim, .5, 2., ARD=0) + GPy.kern.RBF(input_dim, .3, .2, ARD=0)
# k = GPy.kern.rbf(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.linear(input_dim, _np.ones(input_dim) * .2, ARD=True) # k = GPy.kern.RBF(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.linear(input_dim, _np.ones(input_dim) * .2, ARD=True)
p = .3
m = GPy.models.BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
if nan:
m.inference_method = GPy.inference.latent_function_inference.var_dtc.VarDTCMissingData()
m.Y[_np.random.binomial(1, p, size=(Y.shape)).astype(bool)] = _np.nan
m.parameters_changed()
m = GPy.models.BayesianGPLVM(lik, input_dim, kernel=k, num_inducing=num_inducing)
#=========================================================================== #===========================================================================
# randomly obstruct data with percentage p # randomly obstruct data with percentage p
p = .8
Y_obstruct = Y.copy()
Y_obstruct[_np.random.uniform(size=(Y.shape)) < p] = _np.nan
#=========================================================================== #===========================================================================
m2 = GPy.models.BayesianGPLVMWithMissingData(Y_obstruct, input_dim, kernel=k, num_inducing=num_inducing) # m2 = GPy.models.BayesianGPLVMWithMissingData(Y_obstruct, input_dim, kernel=k, num_inducing=num_inducing)
m.lengthscales = lengthscales # m.lengthscales = lengthscales
if plot: if plot:
import matplotlib.pyplot as pb import matplotlib.pyplot as pb
m.plot() m.plot()
pb.title('PCA initialisation') pb.title('PCA initialisation')
m2.plot() # m2.plot()
pb.title('PCA initialisation') # pb.title('PCA initialisation')
if optimize: if optimize:
m.optimize('scg', messages=verbose) m.optimize('scg', messages=verbose)
m2.optimize('scg', messages=verbose) # m2.optimize('scg', messages=verbose)
if plot: if plot:
m.plot() m.plot()
pb.title('After optimisation') pb.title('After optimisation')
m2.plot() # m2.plot()
pb.title('After optimisation') # pb.title('After optimisation')
return m, m2 return m
def gplvm_oil_100(optimize=True, verbose=1, plot=True): def gplvm_oil_100(optimize=True, verbose=1, plot=True):
import GPy import GPy
data = GPy.util.datasets.oil_100() import pods
data = pods.datasets.oil_100()
Y = data['X'] Y = data['X']
# create simple GP model # create simple GP model
kernel = GPy.kern.rbf(6, ARD=True) + GPy.kern.bias(6) kernel = GPy.kern.RBF(6, ARD=True) + GPy.kern.Bias(6)
m = GPy.models.GPLVM(Y, 6, kernel=kernel) m = GPy.models.GPLVM(Y, 6, kernel=kernel)
m.data_labels = data['Y'].argmax(axis=1) m.data_labels = data['Y'].argmax(axis=1)
if optimize: m.optimize('scg', messages=verbose) if optimize: m.optimize('scg', messages=verbose)
@ -78,13 +83,15 @@ def gplvm_oil_100(optimize=True, verbose=1, plot=True):
def sparse_gplvm_oil(optimize=True, verbose=0, plot=True, N=100, Q=6, num_inducing=15, max_iters=50): def sparse_gplvm_oil(optimize=True, verbose=0, plot=True, N=100, Q=6, num_inducing=15, max_iters=50):
import GPy import GPy
import pods
_np.random.seed(0) _np.random.seed(0)
data = GPy.util.datasets.oil() data = pods.datasets.oil()
Y = data['X'][:N] Y = data['X'][:N]
Y = Y - Y.mean(0) Y = Y - Y.mean(0)
Y /= Y.std(0) Y /= Y.std(0)
# Create the model # Create the model
kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q) kernel = GPy.kern.RBF(Q, ARD=True) + GPy.kern.Bias(Q)
m = GPy.models.SparseGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing) m = GPy.models.SparseGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing)
m.data_labels = data['Y'][:N].argmax(axis=1) m.data_labels = data['Y'][:N].argmax(axis=1)
@ -94,9 +101,9 @@ def sparse_gplvm_oil(optimize=True, verbose=0, plot=True, N=100, Q=6, num_induci
m.kern.plot_ARD() m.kern.plot_ARD()
return m return m
def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=15, Q=4, sigma=.2): def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=25, Q=4, sigma=.2):
import GPy import GPy
from GPy.util.datasets import swiss_roll_generated from pods.datasets import swiss_roll_generated
from GPy.models import BayesianGPLVM from GPy.models import BayesianGPLVM
data = swiss_roll_generated(num_samples=N, sigma=sigma) data = swiss_roll_generated(num_samples=N, sigma=sigma)
@ -134,93 +141,103 @@ def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=15, Q=4
(1 - var))) + .001 (1 - var))) + .001
Z = _np.random.permutation(X)[:num_inducing] Z = _np.random.permutation(X)[:num_inducing]
kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2)) kernel = GPy.kern.RBF(Q, ARD=True) + GPy.kern.Bias(Q, _np.exp(-2)) + GPy.kern.White(Q, _np.exp(-2))
m = BayesianGPLVM(Y, Q, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel) m = BayesianGPLVM(Y, Q, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel)
m.data_colors = c m.data_colors = c
m.data_t = t m.data_t = t
m['noise_variance'] = Y.var() / 100.
if optimize: if optimize:
m.optimize('scg', messages=verbose, max_iters=2e3) m.optimize('bfgs', messages=verbose, max_iters=2e3)
if plot: if plot:
fig = plt.figure('fitted') fig = plt.figure('fitted')
ax = fig.add_subplot(111) ax = fig.add_subplot(111)
s = m.input_sensitivity().argsort()[::-1][:2] s = m.input_sensitivity().argsort()[::-1][:2]
ax.scatter(*m.X.T[s], c=c) ax.scatter(*m.X.mean.T[s], c=c)
return m return m
def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, max_iters=1000, **k): def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, max_iters=1000, **k):
import GPy import GPy
from GPy.likelihoods import Gaussian
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
import numpy as np
_np.random.seed(0) _np.random.seed(0)
data = GPy.util.datasets.oil() try:
import pods
data = pods.datasets.oil()
except ImportError:
data = GPy.util.datasets.oil()
kernel = GPy.kern.rbf_inv(Q, 1., [.1] * Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2))
kernel = GPy.kern.RBF(Q, 1., 1. / _np.random.uniform(0, 1, (Q,)), ARD=True) # + GPy.kern.Bias(Q, _np.exp(-2))
Y = data['X'][:N] Y = data['X'][:N]
Yn = Gaussian(Y, normalize=True) m = GPy.models.BayesianGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing, **k)
m = GPy.models.BayesianGPLVM(Yn, Q, kernel=kernel, num_inducing=num_inducing, **k)
m.data_labels = data['Y'][:N].argmax(axis=1) m.data_labels = data['Y'][:N].argmax(axis=1)
m['noise'] = Yn.Y.var() / 100.
if optimize: if optimize:
m.optimize('scg', messages=verbose, max_iters=max_iters, gtol=.05) m.optimize('bfgs', messages=verbose, max_iters=max_iters, gtol=.05)
if plot: if plot:
y = m.likelihood.Y[0, :]
fig, (latent_axes, sense_axes) = plt.subplots(1, 2) fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
m.plot_latent(ax=latent_axes) m.plot_latent(ax=latent_axes, labels=m.data_labels)
data_show = GPy.util.visualize.vector_show(y) data_show = GPy.plotting.matplot_dep.visualize.vector_show((m.Y[0, :]))
lvm_visualizer = GPy.util.visualize.lvm_dimselect(m.X[0, :], # @UnusedVariable lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean.values[0:1, :], # @UnusedVariable
m, data_show, latent_axes=latent_axes, sense_axes=sense_axes) m, data_show, latent_axes=latent_axes, sense_axes=sense_axes, labels=m.data_labels)
raw_input('Press enter to finish') raw_input('Press enter to finish')
plt.close(fig) plt.close(fig)
return m return m
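Stripped of the dataset loading and the interactive visualisation, the Bayesian GP-LVM workflow used by bgplvm_oil above and the new ssgplvm_oil below comes down to a few lines. A sketch on random data (assuming the BayesianGPLVM signature shown above), so it runs without pods:

    import numpy as np
    import GPy

    np.random.seed(0)
    Y = np.random.randn(50, 12)          # stand-in for data['X'][:N]
    Q, num_inducing = 5, 10

    kernel = GPy.kern.RBF(Q, ARD=True)
    m = GPy.models.BayesianGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing)
    m.optimize('bfgs', max_iters=200)
    # ARD lengthscales indicate which latent dimensions the model actually uses.
    print(m.kern.lengthscale)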
def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False): def ssgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, max_iters=1000, **k):
x = _np.linspace(0, 4 * _np.pi, N)[:, None] import GPy
s1 = _np.vectorize(lambda x: _np.sin(x)) from matplotlib import pyplot as plt
s2 = _np.vectorize(lambda x: _np.cos(x)) import pods
s3 = _np.vectorize(lambda x:-_np.exp(-_np.cos(2 * x)))
sS = _np.vectorize(lambda x: _np.sin(2 * x))
s1 = s1(x) _np.random.seed(0)
s2 = s2(x) data = pods.datasets.oil()
s3 = s3(x)
sS = sS(x)
S1 = _np.hstack([s1, sS]) kernel = GPy.kern.RBF(Q, 1., 1. / _np.random.uniform(0, 1, (Q,)), ARD=True) # + GPy.kern.Bias(Q, _np.exp(-2))
S2 = _np.hstack([s2, s3, sS]) Y = data['X'][:N]
S3 = _np.hstack([s3, sS]) m = GPy.models.SSGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing, **k)
m.data_labels = data['Y'][:N].argmax(axis=1)
Y1 = S1.dot(_np.random.randn(S1.shape[1], D1)) if optimize:
Y2 = S2.dot(_np.random.randn(S2.shape[1], D2)) m.optimize('bfgs', messages=verbose, max_iters=max_iters, gtol=.05)
Y3 = S3.dot(_np.random.randn(S3.shape[1], D3))
Y1 += .3 * _np.random.randn(*Y1.shape) if plot:
Y2 += .2 * _np.random.randn(*Y2.shape) fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
Y3 += .25 * _np.random.randn(*Y3.shape) m.plot_latent(ax=latent_axes, labels=m.data_labels)
data_show = GPy.plotting.matplot_dep.visualize.vector_show((m.Y[0, :]))
lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean.values[0:1, :], # @UnusedVariable
m, data_show, latent_axes=latent_axes, sense_axes=sense_axes, labels=m.data_labels)
raw_input('Press enter to finish')
plt.close(fig)
return m
Y1 -= Y1.mean(0) def _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim=False):
Y2 -= Y2.mean(0) Q_signal = 4
Y3 -= Y3.mean(0) import GPy
Y1 /= Y1.std(0) import numpy as np
Y2 /= Y2.std(0) np.random.seed(3000)
Y3 /= Y3.std(0)
k = GPy.kern.Matern32(Q_signal, 1., lengthscale=(np.random.uniform(1, 6, Q_signal)), ARD=1)
for i in range(Q_signal):
k += GPy.kern.PeriodicExponential(1, variance=1., active_dims=[i], period=3., lower=-2, upper=6)
t = np.c_[[np.linspace(-1, 5, N) for _ in range(Q_signal)]].T
K = k.K(t)
s2, s1, s3, sS = np.random.multivariate_normal(np.zeros(K.shape[0]), K, size=(4))[:, :, None]
Y1, Y2, Y3, S1, S2, S3 = _generate_high_dimensional_output(D1, D2, D3, s1, s2, s3, sS)
slist = [sS, s1, s2, s3] slist = [sS, s1, s2, s3]
slist_names = ["sS", "s1", "s2", "s3"] slist_names = ["sS", "s1", "s2", "s3"]
Ylist = [Y1, Y2, Y3] Ylist = [Y1, Y2, Y3]
if plot_sim: if plot_sim:
import pylab from matplotlib import pyplot as plt
import matplotlib.cm as cm import matplotlib.cm as cm
import itertools import itertools
fig = pylab.figure("MRD Simulation Data", figsize=(8, 6)) fig = plt.figure("MRD Simulation Data", figsize=(8, 6))
fig.clf() fig.clf()
ax = fig.add_subplot(2, 1, 1) ax = fig.add_subplot(2, 1, 1)
labls = slist_names labls = slist_names
@ -229,30 +246,75 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
ax.legend() ax.legend()
for i, Y in enumerate(Ylist): for i, Y in enumerate(Ylist):
ax = fig.add_subplot(2, len(Ylist), len(Ylist) + 1 + i) ax = fig.add_subplot(2, len(Ylist), len(Ylist) + 1 + i)
ax.imshow(Y, aspect='auto', cmap=cm.gray) # @UndefinedVariable ax.imshow(Y, aspect='auto', cmap=cm.gray) # @UndefinedVariable
ax.set_title("Y{}".format(i + 1)) ax.set_title("Y{}".format(i + 1))
pylab.draw() plt.draw()
pylab.tight_layout() plt.tight_layout()
return slist, [S1, S2, S3], Ylist return slist, [S1, S2, S3], Ylist
# def bgplvm_simulation_matlab_compare(): def _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim=False):
# from GPy.util.datasets import simulation_BGPLVM _np.random.seed(1234)
# from GPy import kern
# from GPy.models import BayesianGPLVM x = _np.linspace(0, 4 * _np.pi, N)[:, None]
# s1 = _np.vectorize(lambda x: _np.sin(x))
# sim_data = simulation_BGPLVM() s2 = _np.vectorize(lambda x: _np.cos(x) ** 2)
# Y = sim_data['Y'] s3 = _np.vectorize(lambda x:-_np.exp(-_np.cos(2 * x)))
# mu = sim_data['mu'] sS = _np.vectorize(lambda x: _np.cos(x))
# num_inducing, [_, Q] = 3, mu.shape
# s1 = s1(x)
# k = kern.linear(Q, ARD=True) + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2)) s2 = s2(x)
# m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k, s3 = s3(x)
# _debug=False) sS = sS(x)
# m.auto_scale_factor = True
# m['noise'] = Y.var() / 100. s1 -= s1.mean(); s1 /= s1.std(0)
# m['linear_variance'] = .01 s2 -= s2.mean(); s2 /= s2.std(0)
# return m s3 -= s3.mean(); s3 /= s3.std(0)
sS -= sS.mean(); sS /= sS.std(0)
Y1, Y2, Y3, S1, S2, S3 = _generate_high_dimensional_output(D1, D2, D3, s1, s2, s3, sS)
slist = [sS, s1, s2, s3]
slist_names = ["sS", "s1", "s2", "s3"]
Ylist = [Y1, Y2, Y3]
if plot_sim:
from matplotlib import pyplot as plt
import matplotlib.cm as cm
import itertools
fig = plt.figure("MRD Simulation Data", figsize=(8, 6))
fig.clf()
ax = fig.add_subplot(2, 1, 1)
labls = slist_names
for S, lab in itertools.izip(slist, labls):
ax.plot(S, label=lab)
ax.legend()
for i, Y in enumerate(Ylist):
ax = fig.add_subplot(2, len(Ylist), len(Ylist) + 1 + i)
ax.imshow(Y, aspect='auto', cmap=cm.gray) # @UndefinedVariable
ax.set_title("Y{}".format(i + 1))
plt.draw()
plt.tight_layout()
return slist, [S1, S2, S3], Ylist
def _generate_high_dimensional_output(D1, D2, D3, s1, s2, s3, sS):
S1 = _np.hstack([s1, sS])
S2 = _np.hstack([s2, s3, sS])
S3 = _np.hstack([s3, sS])
Y1 = S1.dot(_np.random.randn(S1.shape[1], D1))
Y2 = S2.dot(_np.random.randn(S2.shape[1], D2))
Y3 = S3.dot(_np.random.randn(S3.shape[1], D3))
Y1 += .3 * _np.random.randn(*Y1.shape)
Y2 += .2 * _np.random.randn(*Y2.shape)
Y3 += .25 * _np.random.randn(*Y3.shape)
Y1 -= Y1.mean(0)
Y2 -= Y2.mean(0)
Y3 -= Y3.mean(0)
Y1 /= Y1.std(0)
Y2 /= Y2.std(0)
Y3 /= Y3.std(0)
return Y1, Y2, Y3, S1, S2, S3
def bgplvm_simulation(optimize=True, verbose=1, def bgplvm_simulation(optimize=True, verbose=1,
plot=True, plot_sim=False, plot=True, plot_sim=False,
@ -261,95 +323,181 @@ def bgplvm_simulation(optimize=True, verbose=1,
from GPy import kern from GPy import kern
from GPy.models import BayesianGPLVM from GPy.models import BayesianGPLVM
D1, D2, D3, N, num_inducing, Q = 49, 30, 10, 12, 3, 10 D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 45, 3, 9
_, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
Y = Ylist[0] Y = Ylist[0]
k = kern.linear(Q, ARD=True) k = kern.Linear(Q, ARD=True) # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
# k = kern.RBF(Q, ARD=True, lengthscale=10.)
m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k) m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k)
m.X_variance = m.X_variance * .7 m.X.variance[:] = _np.random.uniform(0, .01, m.X.shape)
m['noise'] = Y.var() / 100. m.likelihood.variance = .1
if optimize:
print "Optimizing model:"
m.optimize('bfgs', messages=verbose, max_iters=max_iters,
gtol=.05)
if plot:
m.X.plot("BGPLVM Latent Space 1D")
m.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
return m
def ssgplvm_simulation(optimize=True, verbose=1,
plot=True, plot_sim=False,
max_iters=2e4, useGPU=False
):
from GPy import kern
from GPy.models import SSGPLVM
D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 45, 3, 9
_, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
Y = Ylist[0]
k = kern.Linear(Q, ARD=True) # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
# k = kern.RBF(Q, ARD=True, lengthscale=10.)
m = SSGPLVM(Y, Q, init="pca", num_inducing=num_inducing, kernel=k)
m.X.variance[:] = _np.random.uniform(0, .01, m.X.shape)
m.likelihood.variance = .1
if optimize: if optimize:
print "Optimizing model:" print "Optimizing model:"
m.optimize('scg', messages=verbose, max_iters=max_iters, m.optimize('scg', messages=verbose, max_iters=max_iters,
gtol=.05) gtol=.05)
if plot: if plot:
m.plot_X_1d("BGPLVM Latent Space 1D") m.X.plot("SSGPLVM Latent Space 1D")
m.kern.plot_ARD('SSGPLVM Simulation ARD Parameters')
return m
def bgplvm_simulation_missing_data(optimize=True, verbose=1,
plot=True, plot_sim=False,
max_iters=2e4, percent_missing=.1,
):
from GPy import kern
from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch
D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 400, 3, 4
_, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
Y = Ylist[0]
k = kern.Linear(Q, ARD=True) # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
inan = _np.random.binomial(1, percent_missing, size=Y.shape).astype(bool) # mask a fraction percent_missing of the entries
Ymissing = Y.copy()
Ymissing[inan] = _np.nan
m = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
kernel=k, missing_data=True)
m.Yreal = Y
if optimize:
print "Optimizing model:"
m.optimize('bfgs', messages=verbose, max_iters=max_iters,
gtol=.05)
if plot:
m.X.plot("BGPLVM Latent Space 1D")
m.kern.plot_ARD('BGPLVM Simulation ARD Parameters') m.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
return m return m
def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw): def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
from GPy import kern from GPy import kern
from GPy.models import MRD from GPy.models import MRD
from GPy.likelihoods import Gaussian
D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5 D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
_, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
likelihood_list = [Gaussian(x, normalize=True) for x in Ylist]
k = kern.linear(Q, ARD=True)# + kern.bias(Q, _np.exp(-2)) + kern.white(Q, _np.exp(-2)) # Ylist = [Ylist[0]]
m = MRD(likelihood_list, input_dim=Q, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw) k = kern.Linear(Q, ARD=True)
m.ensure_default_constraints() m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing, kernel=k, initx="PCA_concat", initz='permute', **kw)
m['.*noise'] = [Y.var() / 40. for Y in Ylist]
for i, bgplvm in enumerate(m.bgplvms):
m['{}_noise'.format(i)] = 1 #bgplvm.likelihood.Y.var() / 500.
bgplvm.X_variance = bgplvm.X_variance #* .1
if optimize: if optimize:
print "Optimizing Model:" print "Optimizing Model:"
m.optimize(messages=verbose, max_iters=8e3, gtol=.1) m.optimize(messages=verbose, max_iters=8e3)
if plot: if plot:
m.plot_X_1d("MRD Latent Space 1D") m.X.plot("MRD Latent Space 1D")
m.plot_scales("MRD Scales")
return m
def mrd_simulation_missing_data(optimize=True, verbose=True, plot=True, plot_sim=True, **kw):
from GPy import kern
from GPy.models import MRD
D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5
_, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
# Ylist = [Ylist[0]]
k = kern.Linear(Q, ARD=True)
inanlist = []
for Y in Ylist:
inan = _np.random.binomial(1, .6, size=Y.shape).astype(bool)
inanlist.append(inan)
Y[inan] = _np.nan
m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing,
kernel=k, inference_method=None,
initx="random", initz='permute', **kw)
if optimize:
print "Optimizing Model:"
m.optimize('bfgs', messages=verbose, max_iters=8e3, gtol=.1)
if plot:
m.X.plot("MRD Latent Space 1D")
m.plot_scales("MRD Scales") m.plot_scales("MRD Scales")
return m return m
def brendan_faces(optimize=True, verbose=True, plot=True): def brendan_faces(optimize=True, verbose=True, plot=True):
import GPy import GPy
import pods
data = GPy.util.datasets.brendan_faces() data = pods.datasets.brendan_faces()
Q = 2 Q = 2
Y = data['Y'] Y = data['Y']
Yn = Y - Y.mean() Yn = Y - Y.mean()
Yn /= Yn.std() Yn /= Yn.std()
m = GPy.models.GPLVM(Yn, Q) m = GPy.models.BayesianGPLVM(Yn, Q, num_inducing=20)
# optimize # optimize
m.constrain('rbf|noise|white', GPy.core.transformations.logexp_clipped())
if optimize: m.optimize('scg', messages=verbose, max_iters=1000) if optimize: m.optimize('bfgs', messages=verbose, max_iters=1000)
if plot: if plot:
ax = m.plot_latent(which_indices=(0, 1)) ax = m.plot_latent(which_indices=(0, 1))
y = m.likelihood.Y[0, :] y = m.Y[0, :]
data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, order='F', invert=False, scale=False) data_show = GPy.plotting.matplot_dep.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, order='F', invert=False, scale=False)
GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) lvm = GPy.plotting.matplot_dep.visualize.lvm(m.X.mean[0, :].copy(), m, data_show, ax)
raw_input('Press enter to finish') raw_input('Press enter to finish')
return m return m
def olivetti_faces(optimize=True, verbose=True, plot=True): def olivetti_faces(optimize=True, verbose=True, plot=True):
import GPy import GPy
import pods
data = GPy.util.datasets.olivetti_faces() data = pods.datasets.olivetti_faces()
Q = 2 Q = 2
Y = data['Y'] Y = data['Y']
Yn = Y - Y.mean() Yn = Y - Y.mean()
Yn /= Yn.std() Yn /= Yn.std()
m = GPy.models.GPLVM(Yn, Q) m = GPy.models.BayesianGPLVM(Yn, Q, num_inducing=20)
if optimize: m.optimize('scg', messages=verbose, max_iters=1000)
if optimize: m.optimize('bfgs', messages=verbose, max_iters=1000)
if plot: if plot:
ax = m.plot_latent(which_indices=(0, 1)) ax = m.plot_latent(which_indices=(0, 1))
y = m.likelihood.Y[0, :] y = m.Y[0, :]
data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False) data_show = GPy.plotting.matplot_dep.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False)
GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) lvm = GPy.plotting.matplot_dep.visualize.lvm(m.X.mean[0, :].copy(), m, data_show, ax)
raw_input('Press enter to finish') raw_input('Press enter to finish')
return m return m
def stick_play(range=None, frame_rate=15, optimize=False, verbose=True, plot=True): def stick_play(range=None, frame_rate=15, optimize=False, verbose=True, plot=True):
import GPy import GPy
data = GPy.util.datasets.osu_run1() import pods
data = pods.datasets.osu_run1()
# optimize # optimize
if range == None: if range == None:
Y = data['Y'].copy() Y = data['Y'].copy()
@ -357,43 +505,46 @@ def stick_play(range=None, frame_rate=15, optimize=False, verbose=True, plot=Tru
Y = data['Y'][range[0]:range[1], :].copy() Y = data['Y'][range[0]:range[1], :].copy()
if plot: if plot:
y = Y[0, :] y = Y[0, :]
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
GPy.util.visualize.data_play(Y, data_show, frame_rate) GPy.plotting.matplot_dep.visualize.data_play(Y, data_show, frame_rate)
return Y return Y
def stick(kernel=None, optimize=True, verbose=True, plot=True): def stick(kernel=None, optimize=True, verbose=True, plot=True):
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
import GPy import GPy
import pods
data = GPy.util.datasets.osu_run1() data = pods.datasets.osu_run1()
# optimize # optimize
m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel) m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel)
if optimize: m.optimize(messages=verbose, max_f_eval=10000) if optimize: m.optimize('bfgs', messages=verbose, max_f_eval=10000)
if plot and GPy.util.visualize.visual_available: if plot:
plt.clf() plt.clf()
ax = m.plot_latent() ax = m.plot_latent()
y = m.likelihood.Y[0, :] y = m.Y[0, :]
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[:1, :].copy(), m, data_show, latent_axes=ax)
raw_input('Press enter to finish') raw_input('Press enter to finish')
lvm_visualizer.close()
data_show.close()
return m return m
def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True): def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True):
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
import GPy import GPy
import pods
data = GPy.util.datasets.osu_run1() data = pods.datasets.osu_run1()
# optimize # optimize
mapping = GPy.mappings.Linear(data['Y'].shape[1], 2) mapping = GPy.mappings.Linear(data['Y'].shape[1], 2)
m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping) m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
if optimize: m.optimize(messages=verbose, max_f_eval=10000) if optimize: m.optimize(messages=verbose, max_f_eval=10000)
if plot and GPy.util.visualize.visual_available: if plot and GPy.plotting.matplot_dep.visualize.visual_available:
plt.clf() plt.clf()
ax = m.plot_latent() ax = m.plot_latent()
y = m.likelihood.Y[0, :] y = m.likelihood.Y[0, :]
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
raw_input('Press enter to finish') raw_input('Press enter to finish')
return m return m
@ -401,32 +552,33 @@ def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True):
def bcgplvm_stick(kernel=None, optimize=True, verbose=True, plot=True): def bcgplvm_stick(kernel=None, optimize=True, verbose=True, plot=True):
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
import GPy import GPy
import pods
data = GPy.util.datasets.osu_run1() data = pods.datasets.osu_run1()
# optimize # optimize
back_kernel=GPy.kern.rbf(data['Y'].shape[1], lengthscale=5.) back_kernel = GPy.kern.RBF(data['Y'].shape[1], lengthscale=5.)
mapping = GPy.mappings.Kernel(X=data['Y'], output_dim=2, kernel=back_kernel) mapping = GPy.mappings.Kernel(X=data['Y'], output_dim=2, kernel=back_kernel)
m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping) m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping)
if optimize: m.optimize(messages=verbose, max_f_eval=10000) if optimize: m.optimize(messages=verbose, max_f_eval=10000)
if plot and GPy.util.visualize.visual_available: if plot and GPy.plotting.matplot_dep.visualize.visual_available:
plt.clf() plt.clf()
ax = m.plot_latent() ax = m.plot_latent()
y = m.likelihood.Y[0, :] y = m.likelihood.Y[0, :]
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
raw_input('Press enter to finish') # raw_input('Press enter to finish')
return m return m
def robot_wireless(optimize=True, verbose=True, plot=True): def robot_wireless(optimize=True, verbose=True, plot=True):
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
import GPy import GPy
import pods
data = GPy.util.datasets.robot_wireless() data = pods.datasets.robot_wireless()
# optimize # optimize
m = GPy.models.GPLVM(data['Y'], 2) m = GPy.models.BayesianGPLVM(data['Y'], 4, num_inducing=25)
if optimize: m.optimize(messages=verbose, max_f_eval=10000) if optimize: m.optimize(messages=verbose, max_f_eval=10000)
m._set_params(m._get_params())
if plot: if plot:
m.plot_latent() m.plot_latent()
@ -435,23 +587,33 @@ def robot_wireless(optimize=True, verbose=True, plot=True):
def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True): def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
from GPy.models import BayesianGPLVM from GPy.models import BayesianGPLVM
from matplotlib import pyplot as plt from matplotlib import pyplot as plt
import numpy as np
import GPy import GPy
import pods
data = GPy.util.datasets.osu_run1() data = pods.datasets.osu_run1()
Q = 6 Q = 6
kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, _np.exp(-2)) + GPy.kern.white(Q, _np.exp(-2)) kernel = GPy.kern.RBF(Q, lengthscale=np.repeat(.5, Q), ARD=True)
m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel) m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel)
m.data = data
m.likelihood.variance = 0.001
# optimize # optimize
m.ensure_default_constraints() try:
if optimize: m.optimize('scg', messages=verbose, max_iters=200, xtol=1e-300, ftol=1e-300) if optimize: m.optimize('bfgs', messages=verbose, max_iters=5e3, bfgs_factor=10)
m._set_params(m._get_params()) except KeyboardInterrupt:
print "Keyboard interrupt, continuing to plot and return"
if plot: if plot:
plt.clf, (latent_axes, sense_axes) = plt.subplots(1, 2) fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
plt.sca(latent_axes) plt.sca(latent_axes)
m.plot_latent() m.plot_latent(ax=latent_axes)
y = m.likelihood.Y[0, :].copy() y = m.Y[:1, :].copy()
data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) data_show = GPy.plotting.matplot_dep.visualize.stick_show(y, connect=data['connect'])
GPy.util.visualize.lvm_dimselect(m.X[0, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes) dim_select = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean[:1, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes)
fig.canvas.draw()
fig.canvas.show()
raw_input('Press enter to finish') raw_input('Press enter to finish')
return m return m
@ -459,20 +621,50 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose=True, plot=True): def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose=True, plot=True):
import GPy import GPy
import pods
data = GPy.util.datasets.cmu_mocap(subject, motion) data = pods.datasets.cmu_mocap(subject, motion)
if in_place: if in_place:
# Make figure move in place. # Make figure move in place.
data['Y'][:, 0:3] = 0.0 data['Y'][:, 0:3] = 0.0
m = GPy.models.GPLVM(data['Y'], 2, normalize_Y=True) Y = data['Y']
Y_mean = Y.mean(0)
Y_std = Y.std(0)
m = GPy.models.GPLVM((Y - Y_mean) / Y_std, 2)
if optimize: m.optimize(messages=verbose, max_f_eval=10000) if optimize: m.optimize(messages=verbose, max_f_eval=10000)
if plot: if plot:
ax = m.plot_latent() ax = m.plot_latent()
y = m.likelihood.Y[0, :] y = m.Y[0, :]
data_show = GPy.util.visualize.skeleton_show(y[None, :], data['skel']) data_show = GPy.plotting.matplot_dep.visualize.skeleton_show(y[None, :], data['skel'])
lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[0].copy(), m, data_show, latent_axes=ax)
raw_input('Press enter to finish') raw_input('Press enter to finish')
lvm_visualizer.close() lvm_visualizer.close()
data_show.close()
return m return m
def ssgplvm_simulation_linear():
import numpy as np
import GPy
N, D, Q = 1000, 20, 5
pi = 0.2
def sample_X(Q, pi):
x = np.empty(Q)
dies = np.random.rand(Q)
for q in xrange(Q):
if dies[q] < pi:
x[q] = np.random.randn()
else:
x[q] = 0.
return x
Y = np.empty((N, D))
X = np.empty((N, Q))
# Generate data from random sampled weight matrices
for n in xrange(N):
X[n] = sample_X(Q, pi)
w = np.random.randn(D, Q)
Y[n] = np.dot(w, X[n])
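The MRD changes above swap the per-view likelihood list for a plain list of output matrices and a single kernel. A small runnable sketch of that call (assuming the MRD signature used in mrd_simulation above; the data here is synthetic):

    import numpy as np
    import GPy

    np.random.seed(0)
    N = 40
    t = np.linspace(0, 4 * np.pi, N)[:, None]
    Y1 = np.hstack([np.sin(t), np.cos(t)]).dot(np.random.randn(2, 8))   # view 1
    Y2 = np.cos(t).dot(np.random.randn(1, 6))                           # view 2

    Q, num_inducing = 4, 8
    k = GPy.kern.Linear(Q, ARD=True)
    m = GPy.models.MRD([Y1, Y2], input_dim=Q, num_inducing=num_inducing,
                       kernel=k, initx="PCA_concat", initz='permute')
    m.optimize(messages=0, max_iters=500)
    # m.plot_scales("MRD Scales")        # per-view ARD weights, as plotted in mrd_simulation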
View file
@ -1,7 +1,13 @@
# Copyright (c) 2014, Alan Saul
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import GPy import GPy
import numpy as np import numpy as np
import matplotlib.pyplot as plt
from GPy.util import datasets from GPy.util import datasets
try:
import matplotlib.pyplot as plt
except:
pass
def student_t_approx(optimize=True, plot=True): def student_t_approx(optimize=True, plot=True):
""" """
@ -30,47 +36,53 @@ def student_t_approx(optimize=True, plot=True):
#Yc = Yc/Yc.max() #Yc = Yc/Yc.max()
#Add student t random noise to datapoints #Add student t random noise to datapoints
deg_free = 5 deg_free = 1
print "Real noise: ", real_std print "Real noise: ", real_std
initial_var_guess = 0.5 initial_var_guess = 0.5
edited_real_sd = initial_var_guess edited_real_sd = initial_var_guess
# Kernel object # Kernel object
kernel1 = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) kernel1 = GPy.kern.RBF(X.shape[1]) + GPy.kern.White(X.shape[1])
kernel2 = kernel1.copy() kernel2 = GPy.kern.RBF(X.shape[1]) + GPy.kern.White(X.shape[1])
kernel3 = kernel1.copy() kernel3 = GPy.kern.RBF(X.shape[1]) + GPy.kern.White(X.shape[1])
kernel4 = kernel1.copy() kernel4 = GPy.kern.RBF(X.shape[1]) + GPy.kern.White(X.shape[1])
#Gaussian GP model on clean data #Gaussian GP model on clean data
m1 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel1) m1 = GPy.models.GPRegression(X, Y.copy(), kernel=kernel1)
# optimize # optimize
m1.ensure_default_constraints() m1['.*white'].constrain_fixed(1e-5)
m1.constrain_fixed('white', 1e-5)
m1.randomize() m1.randomize()
#Gaussian GP model on corrupt data #Gaussian GP model on corrupt data
m2 = GPy.models.GPRegression(X, Yc.copy(), kernel=kernel2) m2 = GPy.models.GPRegression(X, Yc.copy(), kernel=kernel2)
m2.ensure_default_constraints() m2['.*white'].constrain_fixed(1e-5)
m2.constrain_fixed('white', 1e-5)
m2.randomize() m2.randomize()
#Student t GP model on clean data #Student t GP model on clean data
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd) t_distribution = GPy.likelihoods.StudentT(deg_free=deg_free, sigma2=edited_real_sd)
stu_t_likelihood = GPy.likelihoods.Laplace(Y.copy(), t_distribution) laplace_inf = GPy.inference.latent_function_inference.Laplace()
m3 = GPy.models.GPRegression(X, Y.copy(), kernel3, likelihood=stu_t_likelihood) m3 = GPy.core.GP(X, Y.copy(), kernel3, likelihood=t_distribution, inference_method=laplace_inf)
m3.ensure_default_constraints() m3['.*t_scale2'].constrain_bounded(1e-6, 10.)
m3.constrain_bounded('t_noise', 1e-6, 10.) m3['.*white'].constrain_fixed(1e-5)
m3.constrain_fixed('white', 1e-5)
m3.randomize() m3.randomize()
#Student t GP model on corrupt data #Student t GP model on corrupt data
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=deg_free, sigma2=edited_real_sd) t_distribution = GPy.likelihoods.StudentT(deg_free=deg_free, sigma2=edited_real_sd)
corrupt_stu_t_likelihood = GPy.likelihoods.Laplace(Yc.copy(), t_distribution) laplace_inf = GPy.inference.latent_function_inference.Laplace()
m4 = GPy.models.GPRegression(X, Yc.copy(), kernel4, likelihood=corrupt_stu_t_likelihood) m4 = GPy.core.GP(X, Yc.copy(), kernel4, likelihood=t_distribution, inference_method=laplace_inf)
m4.ensure_default_constraints() m4['.*t_scale2'].constrain_bounded(1e-6, 10.)
m4.constrain_bounded('t_noise', 1e-6, 10.) m4['.*white'].constrain_fixed(1e-5)
m4.constrain_fixed('white', 1e-5)
m4.randomize() m4.randomize()
print m4
debug=True
if debug:
m4.optimize(messages=1)
import pylab as pb
pb.plot(m4.X, m4.inference_method.f_hat)
pb.plot(m4.X, m4.Y, 'rx')
m4.plot()
print m4
return m4
if optimize: if optimize:
optimizer='scg' optimizer='scg'
@ -115,6 +127,7 @@ def student_t_approx(optimize=True, plot=True):
return m1, m2, m3, m4 return m1, m2, m3, m4
def boston_example(optimize=True, plot=True): def boston_example(optimize=True, plot=True):
raise NotImplementedError("Needs updating")
import sklearn import sklearn
from sklearn.cross_validation import KFold from sklearn.cross_validation import KFold
optimizer='bfgs' optimizer='bfgs'
@ -143,8 +156,8 @@ def boston_example(optimize=True, plot=True):
noise = 1e-1 #np.exp(-2) noise = 1e-1 #np.exp(-2)
rbf_len = 0.5 rbf_len = 0.5
data_axis_plot = 4 data_axis_plot = 4
kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1]) kernelstu = GPy.kern.RBF(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
kernelgp = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1]) kernelgp = GPy.kern.RBF(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
#Baseline #Baseline
score_folds[0, n] = rmse(Y_test, np.mean(Y_train)) score_folds[0, n] = rmse(Y_test, np.mean(Y_train))
@ -152,10 +165,9 @@ def boston_example(optimize=True, plot=True):
#Gaussian GP #Gaussian GP
print "Gauss GP" print "Gauss GP"
mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp.copy()) mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp.copy())
mgp.ensure_default_constraints() mgp.constrain_fixed('.*white', 1e-5)
mgp.constrain_fixed('white', 1e-5) mgp['.*len'] = rbf_len
mgp['rbf_len'] = rbf_len mgp['.*noise'] = noise
mgp['noise'] = noise
print mgp print mgp
if optimize: if optimize:
mgp.optimize(optimizer=optimizer, messages=messages) mgp.optimize(optimizer=optimizer, messages=messages)
@ -170,9 +182,8 @@ def boston_example(optimize=True, plot=True):
g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=noise, N=N, D=D) g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=noise, N=N, D=D)
g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution) g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution)
mg = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=g_likelihood) mg = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=g_likelihood)
mg.ensure_default_constraints()
mg.constrain_positive('noise_variance') mg.constrain_positive('noise_variance')
mg.constrain_fixed('white', 1e-5) mg.constrain_fixed('.*white', 1e-5)
mg['rbf_len'] = rbf_len mg['rbf_len'] = rbf_len
mg['noise'] = noise mg['noise'] = noise
print mg print mg
@ -190,11 +201,10 @@ def boston_example(optimize=True, plot=True):
t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=df, sigma2=noise) t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=df, sigma2=noise)
stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution) stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood) mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
mstu_t.ensure_default_constraints() mstu_t.constrain_fixed('.*white', 1e-5)
mstu_t.constrain_fixed('white', 1e-5) mstu_t.constrain_bounded('.*t_scale2', 0.0001, 1000)
mstu_t.constrain_bounded('t_noise', 0.0001, 1000)
mstu_t['rbf_len'] = rbf_len mstu_t['rbf_len'] = rbf_len
mstu_t['t_noise'] = noise mstu_t['.*t_scale2'] = noise
print mstu_t print mstu_t
if optimize: if optimize:
mstu_t.optimize(optimizer=optimizer, messages=messages) mstu_t.optimize(optimizer=optimizer, messages=messages)
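For reference, the new construction used for the Student-t models above, reduced to a self-contained sketch on synthetic data with a few outliers (assuming the StudentT likelihood and Laplace inference objects shown in this diff):

    import numpy as np
    import GPy

    np.random.seed(0)
    X = np.linspace(0, 10, 50)[:, None]
    Y = np.sin(X) + 0.1 * np.random.randn(50, 1)
    Y[::10] += 2.0                       # a few outliers for the Student-t likelihood to absorb

    kernel = GPy.kern.RBF(1)
    t_distribution = GPy.likelihoods.StudentT(deg_free=5, sigma2=0.5)
    laplace_inf = GPy.inference.latent_function_inference.Laplace()

    m = GPy.core.GP(X, Y, kernel, likelihood=t_distribution, inference_method=laplace_inf)
    m['.*t_scale2'].constrain_bounded(1e-6, 10.)
    m.optimize(messages=0)
    print(m)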
View file
@ -1,22 +1,29 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
""" """
Gaussian Processes regression examples Gaussian Processes regression examples
""" """
import pylab as pb try:
import pylab as pb
except:
pass
import numpy as np import numpy as np
import GPy import GPy
def olympic_marathon_men(optimize=True, plot=True): def olympic_marathon_men(optimize=True, plot=True):
"""Run a standard Gaussian process regression on the Olympic marathon data.""" """Run a standard Gaussian process regression on the Olympic marathon data."""
data = GPy.util.datasets.olympic_marathon_men() try:import pods
except ImportError:
print 'pods unavailable, see https://github.com/sods/ods for example datasets'
return
data = pods.datasets.olympic_marathon_men()
# create simple GP Model # create simple GP Model
m = GPy.models.GPRegression(data['X'], data['Y']) m = GPy.models.GPRegression(data['X'], data['Y'])
# set the lengthscale to be something sensible (defaults to 1) # set the lengthscale to be something sensible (defaults to 1)
m['rbf_lengthscale'] = 10 m.kern.lengthscale = 10.
if optimize: if optimize:
m.optimize('bfgs', max_iters=200) m.optimize('bfgs', max_iters=200)
@ -25,79 +32,51 @@ def olympic_marathon_men(optimize=True, plot=True):
return m return m
def coregionalization_toy2(optimize=True, plot=True): def coregionalization_toy(optimize=True, plot=True):
""" """
A simple demonstration of coregionalization on two sinusoidal functions. A simple demonstration of coregionalization on two sinusoidal functions.
""" """
#build a design matrix with a column of integers indicating the output #build a design matrix with a column of integers indicating the output
X1 = np.random.rand(50, 1) * 8 X1 = np.random.rand(50, 1) * 8
X2 = np.random.rand(30, 1) * 5 X2 = np.random.rand(30, 1) * 5
index = np.vstack((np.zeros_like(X1), np.ones_like(X2)))
X = np.hstack((np.vstack((X1, X2)), index))
#build a suitable set of observed variables #build a suitable set of observed variables
Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05 Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
Y2 = np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + 2. Y2 = np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + 2.
Y = np.vstack((Y1, Y2))
#build the kernel m = GPy.models.GPCoregionalizedRegression(X_list=[X1,X2], Y_list=[Y1,Y2])
k1 = GPy.kern.rbf(1) + GPy.kern.bias(1)
k2 = GPy.kern.coregionalize(2,1)
k = k1**k2
m = GPy.models.GPRegression(X, Y, kernel=k)
m.constrain_fixed('.*rbf_var', 1.)
if optimize: if optimize:
m.optimize('bfgs', max_iters=100) m.optimize('bfgs', max_iters=100)
if plot: if plot:
m.plot(fixed_inputs=[(1,0)]) slices = GPy.util.multioutput.get_slices([X1,X2])
m.plot(fixed_inputs=[(1,1)], ax=pb.gca()) m.plot(fixed_inputs=[(1,0)],which_data_rows=slices[0],Y_metadata={'output_index':0})
m.plot(fixed_inputs=[(1,1)],which_data_rows=slices[1],Y_metadata={'output_index':1},ax=pb.gca())
return m return m
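The rewritten example hands the two outputs straight to GPCoregionalizedRegression. When predicting from such a model, the test inputs carry an extra output-index column which is mirrored in Y_metadata; the sketch below assumes that convention (it is implied by the plotting calls above but not shown explicitly in this file):

    import numpy as np
    import GPy

    X1 = np.random.rand(50, 1) * 8
    X2 = np.random.rand(30, 1) * 5
    Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
    Y2 = np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + 2.

    m = GPy.models.GPCoregionalizedRegression(X_list=[X1, X2], Y_list=[Y1, Y2])
    m.optimize('bfgs', max_iters=100)

    # Predict output 1 on a grid: the last column of Xnew is the output index,
    # repeated in Y_metadata (assumed convention for the mixed-noise likelihood).
    Xnew = np.hstack([np.linspace(0, 8, 20)[:, None], np.ones((20, 1))])
    mean, var = m.predict(Xnew, Y_metadata={'output_index': Xnew[:, 1:].astype(int)})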
#FIXME: Needs recovering once likelihoods are consolidated
#def coregionalization_toy(optimize=True, plot=True):
# """
# A simple demonstration of coregionalization on two sinusoidal functions.
# """
# X1 = np.random.rand(50, 1) * 8
# X2 = np.random.rand(30, 1) * 5
# X = np.vstack((X1, X2))
# Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
# Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
# Y = np.vstack((Y1, Y2))
#
# k1 = GPy.kern.rbf(1)
# m = GPy.models.GPMultioutputRegression(X_list=[X1,X2],Y_list=[Y1,Y2],kernel_list=[k1])
# m.constrain_fixed('.*rbf_var', 1.)
# m.optimize(max_iters=100)
#
# fig, axes = pb.subplots(2,1)
# m.plot(fixed_inputs=[(1,0)],ax=axes[0])
# m.plot(fixed_inputs=[(1,1)],ax=axes[1])
# axes[0].set_title('Output 0')
# axes[1].set_title('Output 1')
# return m
def coregionalization_sparse(optimize=True, plot=True): def coregionalization_sparse(optimize=True, plot=True):
""" """
A simple demonstration of coregionalization on two sinusoidal functions using sparse approximations. A simple demonstration of coregionalization on two sinusoidal functions using sparse approximations.
""" """
#fetch the data from the non sparse examples #build a design matrix with a column of integers indicating the output
m = coregionalization_toy2(optimize=False, plot=False) X1 = np.random.rand(50, 1) * 8
X, Y = m.X, m.likelihood.Y X2 = np.random.rand(30, 1) * 5
#construct a model #build a suitable set of observed variables
m = GPy.models.SparseGPRegression(X,Y) Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
m.constrain_fixed('iip_\d+_1') # don't optimize the inducing input indexes Y2 = np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + 2.
m = GPy.models.SparseGPCoregionalizedRegression(X_list=[X1,X2], Y_list=[Y1,Y2])
if optimize: if optimize:
m.optimize('bfgs', max_iters=100, messages=1) m.optimize('bfgs', max_iters=100)
if plot: if plot:
m.plot(fixed_inputs=[(1,0)]) slices = GPy.util.multioutput.get_slices([X1,X2])
m.plot(fixed_inputs=[(1,1)], ax=pb.gca()) m.plot(fixed_inputs=[(1,0)],which_data_rows=slices[0],Y_metadata={'output_index':0})
m.plot(fixed_inputs=[(1,1)],which_data_rows=slices[1],Y_metadata={'output_index':1},ax=pb.gca())
pb.ylim(-3,)
return m return m
@ -107,7 +86,11 @@ def epomeo_gpx(max_iters=200, optimize=True, plot=True):
from the Mount Epomeo runs. Requires gpxpy to be installed on your system from the Mount Epomeo runs. Requires gpxpy to be installed on your system
to load in the data. to load in the data.
""" """
data = GPy.util.datasets.epomeo_gpx() try:import pods
except ImportError:
print 'pods unavailable, see https://github.com/sods/ods for example datasets'
return
data = pods.datasets.epomeo_gpx()
num_data_list = [] num_data_list = []
for Xpart in data['X']: for Xpart in data['X']:
num_data_list.append(Xpart.shape[0]) num_data_list.append(Xpart.shape[0])
@ -127,14 +110,14 @@ def epomeo_gpx(max_iters=200, optimize=True, plot=True):
Z = np.hstack((np.linspace(t[:,0].min(), t[:, 0].max(), num_inducing)[:, None], Z = np.hstack((np.linspace(t[:,0].min(), t[:, 0].max(), num_inducing)[:, None],
np.random.randint(0, 4, num_inducing)[:, None])) np.random.randint(0, 4, num_inducing)[:, None]))
k1 = GPy.kern.rbf(1) k1 = GPy.kern.RBF(1)
k2 = GPy.kern.coregionalize(output_dim=5, rank=5) k2 = GPy.kern.Coregionalize(output_dim=5, rank=5)
k = k1**k2 k = k1**k2
m = GPy.models.SparseGPRegression(t, Y, kernel=k, Z=Z, normalize_Y=True) m = GPy.models.SparseGPRegression(t, Y, kernel=k, Z=Z, normalize_Y=True)
m.constrain_fixed('.*rbf_var', 1.) m.constrain_fixed('.*variance', 1.)
m.constrain_fixed('iip') m.inducing_inputs.constrain_fixed()
m.constrain_bounded('noise_variance', 1e-3, 1e-1) m.Gaussian_noise.variance.constrain_bounded(1e-3, 1e-1)
m.optimize(max_iters=max_iters,messages=True) m.optimize(max_iters=max_iters,messages=True)
return m return m
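The coregionalized sparse model above builds its kernel as a product (**) of a one-dimensional data kernel and a Coregionalize kernel over the output index. A standalone sketch of that construction (assuming the Coregionalize signature used elsewhere in this diff):

    import numpy as np
    import GPy

    k1 = GPy.kern.RBF(1)
    k2 = GPy.kern.Coregionalize(1, output_dim=5, rank=5)
    k = k1 ** k2

    # The combined kernel is evaluated on [input, output_index] pairs,
    # matching the two-column t used in epomeo_gpx above.
    X = np.hstack([np.random.rand(10, 1), np.random.randint(0, 5, (10, 1))])
    print(k.K(X, X).shape)               # (10, 10)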
@ -150,13 +133,17 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
length_scales = np.linspace(0.1, 60., resolution) length_scales = np.linspace(0.1, 60., resolution)
log_SNRs = np.linspace(-3., 4., resolution) log_SNRs = np.linspace(-3., 4., resolution)
data = GPy.util.datasets.della_gatta_TRP63_gene_expression(data_set='della_gatta',gene_number=gene_number) try:import pods
except ImportError:
print 'pods unavailable, see https://github.com/sods/ods for example datasets'
return
data = pods.datasets.della_gatta_TRP63_gene_expression(data_set='della_gatta',gene_number=gene_number)
# data['Y'] = data['Y'][0::2, :] # data['Y'] = data['Y'][0::2, :]
# data['X'] = data['X'][0::2, :] # data['X'] = data['X'][0::2, :]
data['Y'] = data['Y'] - np.mean(data['Y']) data['Y'] = data['Y'] - np.mean(data['Y'])
lls = GPy.examples.regression._contour_data(data, length_scales, log_SNRs, GPy.kern.rbf) lls = GPy.examples.regression._contour_data(data, length_scales, log_SNRs, GPy.kern.RBF)
if plot: if plot:
pb.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=pb.cm.jet) pb.contour(length_scales, log_SNRs, np.exp(lls), 20, cmap=pb.cm.jet)
ax = pb.gca() ax = pb.gca()
@ -172,20 +159,20 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
optim_point_y = np.empty(2) optim_point_y = np.empty(2)
np.random.seed(seed=seed) np.random.seed(seed=seed)
for i in range(0, model_restarts): for i in range(0, model_restarts):
# kern = GPy.kern.rbf(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.)) # kern = GPy.kern.RBF(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.))
kern = GPy.kern.rbf(1, variance=np.random.uniform(1e-3, 1), lengthscale=np.random.uniform(5, 50)) kern = GPy.kern.RBF(1, variance=np.random.uniform(1e-3, 1), lengthscale=np.random.uniform(5, 50))
m = GPy.models.GPRegression(data['X'], data['Y'], kernel=kern) m = GPy.models.GPRegression(data['X'], data['Y'], kernel=kern)
m['noise_variance'] = np.random.uniform(1e-3, 1) m.likelihood.variance = np.random.uniform(1e-3, 1)
optim_point_x[0] = m['rbf_lengthscale'] optim_point_x[0] = m.rbf.lengthscale
optim_point_y[0] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']); optim_point_y[0] = np.log10(m.rbf.variance) - np.log10(m.likelihood.variance);
# optimize # optimize
if optimize: if optimize:
m.optimize('scg', xtol=1e-6, ftol=1e-6, max_iters=max_iters) m.optimize('scg', xtol=1e-6, ftol=1e-6, max_iters=max_iters)
optim_point_x[1] = m['rbf_lengthscale'] optim_point_x[1] = m.rbf.lengthscale
optim_point_y[1] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']); optim_point_y[1] = np.log10(m.rbf.variance) - np.log10(m.likelihood.variance);
if plot: if plot:
pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1] - optim_point_x[0], optim_point_y[1] - optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k') pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1] - optim_point_x[0], optim_point_y[1] - optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k')
@ -196,7 +183,7 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
ax.set_ylim(ylim) ax.set_ylim(ylim)
return m # (models, lls) return m # (models, lls)
def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf): def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.RBF):
""" """
Evaluate the GP objective function for a given data set for a range of Evaluate the GP objective function for a given data set for a range of
signal to noise ratios and a range of lengthscales. signal to noise ratios and a range of lengthscales.
@ -216,7 +203,7 @@ def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf):
noise_var = total_var / (1. + SNR) noise_var = total_var / (1. + SNR)
signal_var = total_var - noise_var signal_var = total_var - noise_var
model.kern['.*variance'] = signal_var model.kern['.*variance'] = signal_var
model['noise_variance'] = noise_var model.likelihood.variance = noise_var
length_scale_lls = [] length_scale_lls = []
for length_scale in length_scales: for length_scale in length_scales:
@ -230,13 +217,17 @@ def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf):
def olympic_100m_men(optimize=True, plot=True): def olympic_100m_men(optimize=True, plot=True):
"""Run a standard Gaussian process regression on the Rogers and Girolami olympics data.""" """Run a standard Gaussian process regression on the Rogers and Girolami olympics data."""
data = GPy.util.datasets.olympic_100m_men() try:import pods
except ImportError:
print 'pods unavailable, see https://github.com/sods/ods for example datasets'
return
data = pods.datasets.olympic_100m_men()
# create simple GP Model # create simple GP Model
m = GPy.models.GPRegression(data['X'], data['Y']) m = GPy.models.GPRegression(data['X'], data['Y'])
# set the lengthscale to be something sensible (defaults to 1) # set the lengthscale to be something sensible (defaults to 1)
m['rbf_lengthscale'] = 10 m.rbf.lengthscale = 10
if optimize: if optimize:
m.optimize('bfgs', max_iters=200) m.optimize('bfgs', max_iters=200)
@ -247,7 +238,11 @@ def olympic_100m_men(optimize=True, plot=True):
def toy_rbf_1d(optimize=True, plot=True): def toy_rbf_1d(optimize=True, plot=True):
"""Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance.""" """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
data = GPy.util.datasets.toy_rbf_1d() try:import pods
except ImportError:
print 'pods unavailable, see https://github.com/sods/ods for example datasets'
return
data = pods.datasets.toy_rbf_1d()
# create simple GP Model # create simple GP Model
m = GPy.models.GPRegression(data['X'], data['Y']) m = GPy.models.GPRegression(data['X'], data['Y'])
@ -261,7 +256,11 @@ def toy_rbf_1d(optimize=True, plot=True):
def toy_rbf_1d_50(optimize=True, plot=True): def toy_rbf_1d_50(optimize=True, plot=True):
"""Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance.""" """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
data = GPy.util.datasets.toy_rbf_1d_50() try:import pods
except ImportError:
print 'pods unavailable, see https://github.com/sods/ods for example datasets'
return
data = pods.datasets.toy_rbf_1d_50()
# create simple GP Model # create simple GP Model
m = GPy.models.GPRegression(data['X'], data['Y']) m = GPy.models.GPRegression(data['X'], data['Y'])
@ -278,14 +277,15 @@ def toy_poisson_rbf_1d_laplace(optimize=True, plot=True):
optimizer='scg' optimizer='scg'
x_len = 30 x_len = 30
X = np.linspace(0, 10, x_len)[:, None] X = np.linspace(0, 10, x_len)[:, None]
f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.rbf(1).K(X)) f_true = np.random.multivariate_normal(np.zeros(x_len), GPy.kern.RBF(1).K(X))
Y = np.array([np.random.poisson(np.exp(f)) for f in f_true])[:,None] Y = np.array([np.random.poisson(np.exp(f)) for f in f_true])[:,None]
noise_model = GPy.likelihoods.poisson() kern = GPy.kern.RBF(1)
likelihood = GPy.likelihoods.Laplace(Y,noise_model) poisson_lik = GPy.likelihoods.Poisson()
laplace_inf = GPy.inference.latent_function_inference.Laplace()
# create simple GP Model # create simple GP Model
m = GPy.models.GPRegression(X, Y, likelihood=likelihood) m = GPy.core.GP(X, Y, kernel=kern, likelihood=poisson_lik, inference_method=laplace_inf)
if optimize: if optimize:
m.optimize(optimizer) m.optimize(optimizer)
@ -316,23 +316,22 @@ def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize
Y /= Y.std() Y /= Y.std()
if kernel_type == 'linear': if kernel_type == 'linear':
kernel = GPy.kern.linear(X.shape[1], ARD=1) kernel = GPy.kern.Linear(X.shape[1], ARD=1)
elif kernel_type == 'rbf_inv': elif kernel_type == 'rbf_inv':
kernel = GPy.kern.rbf_inv(X.shape[1], ARD=1) kernel = GPy.kern.RBF_inv(X.shape[1], ARD=1)
else: else:
kernel = GPy.kern.rbf(X.shape[1], ARD=1) kernel = GPy.kern.RBF(X.shape[1], ARD=1)
kernel += GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1]) kernel += GPy.kern.White(X.shape[1]) + GPy.kern.Bias(X.shape[1])
m = GPy.models.GPRegression(X, Y, kernel) m = GPy.models.GPRegression(X, Y, kernel)
# len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25 # len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25
# m.set_prior('.*lengthscale',len_prior) # m.set_prior('.*lengthscale',len_prior)
if optimize: if optimize:
m.optimize(optimizer='scg', max_iters=max_iters, messages=1) m.optimize(optimizer='scg', max_iters=max_iters)
if plot: if plot:
m.kern.plot_ARD() m.kern.plot_ARD()
print m
return m return m
def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize=True, plot=True): def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4, optimize=True, plot=True):
@ -355,36 +354,39 @@ def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4, o
Y /= Y.std() Y /= Y.std()
if kernel_type == 'linear': if kernel_type == 'linear':
kernel = GPy.kern.linear(X.shape[1], ARD=1) kernel = GPy.kern.Linear(X.shape[1], ARD=1)
elif kernel_type == 'rbf_inv': elif kernel_type == 'rbf_inv':
kernel = GPy.kern.rbf_inv(X.shape[1], ARD=1) kernel = GPy.kern.RBF_inv(X.shape[1], ARD=1)
else: else:
kernel = GPy.kern.rbf(X.shape[1], ARD=1) kernel = GPy.kern.RBF(X.shape[1], ARD=1)
kernel += GPy.kern.bias(X.shape[1]) #kernel += GPy.kern.Bias(X.shape[1])
X_variance = np.ones(X.shape) * 0.5 X_variance = np.ones(X.shape) * 0.5
m = GPy.models.SparseGPRegression(X, Y, kernel, X_variance=X_variance) m = GPy.models.SparseGPRegression(X, Y, kernel, X_variance=X_variance)
# len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25 # len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25
# m.set_prior('.*lengthscale',len_prior) # m.set_prior('.*lengthscale',len_prior)
if optimize: if optimize:
m.optimize(optimizer='scg', max_iters=max_iters, messages=1) m.optimize(optimizer='scg', max_iters=max_iters)
if plot: if plot:
m.kern.plot_ARD() m.kern.plot_ARD()
print m
return m return m
def robot_wireless(max_iters=100, kernel=None, optimize=True, plot=True): def robot_wireless(max_iters=100, kernel=None, optimize=True, plot=True):
"""Predict the location of a robot given wirelss signal strength readings.""" """Predict the location of a robot given wirelss signal strength readings."""
data = GPy.util.datasets.robot_wireless() try:import pods
except ImportError:
print 'pods unavailable, see https://github.com/sods/ods for example datasets'
return
data = pods.datasets.robot_wireless()
# create simple GP Model # create simple GP Model
m = GPy.models.GPRegression(data['Y'], data['X'], kernel=kernel) m = GPy.models.GPRegression(data['Y'], data['X'], kernel=kernel)
# optimize # optimize
if optimize: if optimize:
m.optimize(messages=True, max_iters=max_iters) m.optimize(max_iters=max_iters)
Xpredict = m.predict(data['Ytest'])[0] Xpredict = m.predict(data['Ytest'])[0]
if plot: if plot:
@ -396,13 +398,16 @@ def robot_wireless(max_iters=100, kernel=None, optimize=True, plot=True):
sse = ((data['Xtest'] - Xpredict)**2).sum() sse = ((data['Xtest'] - Xpredict)**2).sum()
print m
print('Sum of squares error on test data: ' + str(sse)) print('Sum of squares error on test data: ' + str(sse))
return m return m
def silhouette(max_iters=100, optimize=True, plot=True): def silhouette(max_iters=100, optimize=True, plot=True):
"""Predict the pose of a figure given a silhouette. This is a task from Agarwal and Triggs 2004 ICML paper.""" """Predict the pose of a figure given a silhouette. This is a task from Agarwal and Triggs 2004 ICML paper."""
data = GPy.util.datasets.silhouette() try:import pods
except ImportError:
print 'pods unavailable, see https://github.com/sods/ods for example datasets'
return
data = pods.datasets.silhouette()
# create simple GP Model # create simple GP Model
m = GPy.models.GPRegression(data['X'], data['Y']) m = GPy.models.GPRegression(data['X'], data['Y'])
@ -414,32 +419,38 @@ def silhouette(max_iters=100, optimize=True, plot=True):
print m print m
return m return m
def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, optimize=True, plot=True): def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, optimize=True, plot=True, checkgrad=False):
"""Run a 1D example of a sparse GP regression.""" """Run a 1D example of a sparse GP regression."""
# sample inputs and outputs # sample inputs and outputs
X = np.random.uniform(-3., 3., (num_samples, 1)) X = np.random.uniform(-3., 3., (num_samples, 1))
Y = np.sin(X) + np.random.randn(num_samples, 1) * 0.05 Y = np.sin(X) + np.random.randn(num_samples, 1) * 0.05
# construct kernel # construct kernel
rbf = GPy.kern.rbf(1) rbf = GPy.kern.RBF(1)
# create simple GP Model # create simple GP Model
m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing) m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing)
m.checkgrad(verbose=1)
if checkgrad:
m.checkgrad()
if optimize: if optimize:
m.optimize('tnc', messages=1, max_iters=max_iters) m.optimize('tnc', max_iters=max_iters)
if plot: if plot:
m.plot() m.plot()
return m return m
def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100, optimize=True, plot=True): def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100, optimize=True, plot=True, nan=False):
"""Run a 2D example of a sparse GP regression.""" """Run a 2D example of a sparse GP regression."""
np.random.seed(1234)
X = np.random.uniform(-3., 3., (num_samples, 2)) X = np.random.uniform(-3., 3., (num_samples, 2))
Y = np.sin(X[:, 0:1]) * np.sin(X[:, 1:2]) + np.random.randn(num_samples, 1) * 0.05 Y = np.sin(X[:, 0:1]) * np.sin(X[:, 1:2]) + np.random.randn(num_samples, 1) * 0.05
if nan:
inan = np.random.binomial(1,.2,size=Y.shape).astype(bool) # boolean mask so roughly 20% of entries are blanked below
Y[inan] = np.nan
# construct kernel # construct kernel
rbf = GPy.kern.rbf(2) rbf = GPy.kern.RBF(2)
# create simple GP Model # create simple GP Model
m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing) m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing)
@ -462,7 +473,7 @@ def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100, opt
def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True): def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
"""Run a 1D example of a sparse GP regression with uncertain inputs.""" """Run a 1D example of a sparse GP regression with uncertain inputs."""
fig, axes = pb.subplots(1, 2, figsize=(12, 5)) fig, axes = pb.subplots(1, 2, figsize=(12, 5), sharex=True, sharey=True)
# sample inputs and outputs # sample inputs and outputs
S = np.ones((20, 1)) S = np.ones((20, 1))
@ -471,8 +482,7 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
# likelihood = GPy.likelihoods.Gaussian(Y) # likelihood = GPy.likelihoods.Gaussian(Y)
Z = np.random.uniform(-3., 3., (7, 1)) Z = np.random.uniform(-3., 3., (7, 1))
k = GPy.kern.rbf(1) k = GPy.kern.RBF(1)
# create simple GP Model - no input uncertainty on this one # create simple GP Model - no input uncertainty on this one
m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z) m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z)
@ -485,7 +495,7 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
print m print m
# the same Model with uncertainty # the same Model with uncertainty
m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z, X_variance=S) m = GPy.models.SparseGPRegression(X, Y, kernel=GPy.kern.RBF(1), Z=Z, X_variance=S)
if optimize: if optimize:
m.optimize('scg', messages=1, max_iters=max_iters) m.optimize('scg', messages=1, max_iters=max_iters)
if plot: if plot:

View file

@ -1,37 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import pylab as pb
import numpy as np
import GPy
def toy_1d(optimize=True, plot=True):
N = 2000
M = 20
#create data
X = np.linspace(0,32,N)[:,None]
Z = np.linspace(0,32,M)[:,None]
Y = np.sin(X) + np.cos(0.3*X) + np.random.randn(*X.shape)/np.sqrt(50.)
m = GPy.models.SVIGPRegression(X,Y, batchsize=10, Z=Z)
m.constrain_bounded('noise_variance',1e-3,1e-1)
m.constrain_bounded('white_variance',1e-3,1e-1)
m.param_steplength = 1e-4
if plot:
fig = pb.figure()
ax = fig.add_subplot(111)
def cb(foo):
ax.cla()
m.plot(ax=ax,Z_height=-3)
ax.set_ylim(-3,3)
fig.canvas.draw()
if optimize:
m.optimize(500, callback=cb, callback_interval=1)
if plot:
m.plot_traces()
return m

View file

@ -1,153 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
Code of Tutorials
"""
import pylab as pb
pb.ion()
import numpy as np
import GPy
def tuto_GP_regression(optimize=True, plot=True):
"""The detailed explanations of the commands used in this file can be found in the tutorial section"""
X = np.random.uniform(-3.,3.,(20,1))
Y = np.sin(X) + np.random.randn(20,1)*0.05
kernel = GPy.kern.rbf(input_dim=1, variance=1., lengthscale=1.)
m = GPy.models.GPRegression(X, Y, kernel)
print m
if plot:
m.plot()
m.constrain_positive('')
m.unconstrain('') # may be used to remove the previous constrains
m.constrain_positive('.*rbf_variance')
m.constrain_bounded('.*lengthscale',1.,10. )
m.constrain_fixed('.*noise',0.0025)
if optimize:
m.optimize()
m.optimize_restarts(num_restarts = 10)
#######################################################
#######################################################
# sample inputs and outputs
X = np.random.uniform(-3.,3.,(50,2))
Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(50,1)*0.05
# define kernel
ker = GPy.kern.Matern52(2,ARD=True) + GPy.kern.white(2)
# create simple GP model
m = GPy.models.GPRegression(X, Y, ker)
# constrain all parameters to be positive
m.constrain_positive('')
# optimize and plot
if optimize:
m.optimize('tnc', max_f_eval = 1000)
if plot:
m.plot()
print m
return(m)
def tuto_kernel_overview(optimize=True, plot=True):
"""The detailed explanations of the commands used in this file can be found in the tutorial section"""
ker1 = GPy.kern.rbf(1) # Equivalent to ker1 = GPy.kern.rbf(input_dim=1, variance=1., lengthscale=1.)
ker2 = GPy.kern.rbf(input_dim=1, variance = .75, lengthscale=2.)
ker3 = GPy.kern.rbf(1, .5, .5)
print ker2
if plot:
ker1.plot()
ker2.plot()
ker3.plot()
k1 = GPy.kern.rbf(1,1.,2.)
k2 = GPy.kern.Matern32(1, 0.5, 0.2)
# Product of kernels
k_prod = k1.prod(k2) # By default, tensor=False
k_prodtens = k1.prod(k2,tensor=True)
# Sum of kernels
k_add = k1.add(k2) # By default, tensor=False
k_addtens = k1.add(k2,tensor=True)
k1 = GPy.kern.rbf(1,1.,2)
k2 = GPy.kern.periodic_Matern52(1,variance=1e3, lengthscale=1, period = 1.5, lower=-5., upper = 5)
k = k1 * k2 # equivalent to k = k1.prod(k2)
print k
# Simulate sample paths
X = np.linspace(-5,5,501)[:,None]
Y = np.random.multivariate_normal(np.zeros(501),k.K(X),1)
k1 = GPy.kern.rbf(1)
k2 = GPy.kern.Matern32(1)
k3 = GPy.kern.white(1)
k = k1 + k2 + k3
print k
k.constrain_positive('.*var')
k.constrain_fixed(np.array([1]),1.75)
k.tie_params('.*len')
k.unconstrain('white')
k.constrain_bounded('white',lower=1e-5,upper=.5)
print k
k_cst = GPy.kern.bias(1,variance=1.)
k_mat = GPy.kern.Matern52(1,variance=1., lengthscale=3)
Kanova = (k_cst + k_mat).prod(k_cst + k_mat,tensor=True)
print Kanova
# sample inputs and outputs
X = np.random.uniform(-3.,3.,(40,2))
Y = 0.5*X[:,:1] + 0.5*X[:,1:] + 2*np.sin(X[:,:1]) * np.sin(X[:,1:])
# Create GP regression model
m = GPy.models.GPRegression(X, Y, Kanova)
if plot:
fig = pb.figure(figsize=(5,5))
ax = fig.add_subplot(111)
m.plot(ax=ax)
pb.figure(figsize=(20,3))
pb.subplots_adjust(wspace=0.5)
axs = pb.subplot(1,5,1)
m.plot(ax=axs)
pb.subplot(1,5,2)
pb.ylabel("= ",rotation='horizontal',fontsize='30')
axs = pb.subplot(1,5,3)
m.plot(ax=axs, which_parts=[False,True,False,False])
pb.ylabel("cst +",rotation='horizontal',fontsize='30')
axs = pb.subplot(1,5,4)
m.plot(ax=axs, which_parts=[False,False,True,False])
pb.ylabel("+ ",rotation='horizontal',fontsize='30')
axs = pb.subplot(1,5,5)
pb.ylabel("+ ",rotation='horizontal',fontsize='30')
m.plot(ax=axs, which_parts=[False,False,False,True])
return(m)
def model_interaction(optimize=True, plot=True):
X = np.random.randn(20,1)
Y = np.sin(X) + np.random.randn(*X.shape)*0.01 + 5.
k = GPy.kern.rbf(1) + GPy.kern.bias(1)
m = GPy.models.GPRegression(X, Y, kernel=k)
return m

View file

@ -1,7 +0,0 @@
# This is the configuration file for GPy
[parallel]
# Enable openmp support. This speeds up some computations, depending on the number
# of cores available. Setting up a compiler with openmp support can be difficult on
# some platforms, hence this option.
openmp=False

View file

@ -0,0 +1,2 @@
import latent_function_inference
import optimization

View file

@ -0,0 +1,98 @@
# Copyright (c) 2012, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
__doc__ = """
Inference over Gaussian process latent functions
In all our GP models, the consistency property means that we have a Gaussian
prior over a finite set of points f. This prior is
.. math:: N(f | 0, K)
where K is the kernel matrix.
We also have a likelihood (see GPy.likelihoods) which defines how the data are
related to the latent function: p(y | f). If the likelihood is also a Gaussian,
the inference over f is tractable (see exact_gaussian_inference.py).
If the likelihood object is something other than Gaussian, then exact inference
is not tractable. We then resort to a Laplace approximation (laplace.py) or
expectation propagation (ep.py).
The inference methods return a
:class:`~GPy.inference.latent_function_inference.posterior.Posterior`
instance, a simple structure containing a summary of the posterior.
The model classes can then
use this posterior object for making predictions, optimizing hyper-parameters,
etc.
"""
class LatentFunctionInference(object):
def on_optimization_start(self):
"""
This function gets called just before the optimization loop starts.
"""
pass
def on_optimization_end(self):
"""
This function gets called just after the optimization loop has ended.
"""
pass
class InferenceMethodList(LatentFunctionInference, list):
def on_optimization_start(self):
for inf in self:
inf.on_optimization_start()
def on_optimization_end(self):
for inf in self:
inf.on_optimization_end()
def __getstate__(self):
state = []
for inf in self:
state.append(inf)
return state
def __setstate__(self, state):
for inf in state:
self.append(inf)
from exact_gaussian_inference import ExactGaussianInference
from laplace import Laplace
from GPy.inference.latent_function_inference.var_dtc import VarDTC
from expectation_propagation import EP
from expectation_propagation_dtc import EPDTC
from dtc import DTC
from fitc import FITC
from var_dtc_parallel import VarDTC_minibatch
# class FullLatentFunctionData(object):
#
#
# class EMLikeLatentFunctionInference(LatentFunctionInference):
# def update_approximation(self):
# """
# This function gets called when the
# """
#
# def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
# """
# Do inference on the latent functions given a covariance function `kern`,
# inputs and outputs `X` and `Y`, inducing_inputs `Z`, and a likelihood `likelihood`.
# Additional metadata for the outputs `Y` can be given in `Y_metadata`.
# """
# raise NotImplementedError, "Abstract base class for full inference"
#
# class VariationalLatentFunctionInference(LatentFunctionInference):
# def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
# """
# Do inference on the latent functions given a covariance function `kern`,
# inputs and outputs `X` and `Y`, inducing_inputs `Z`, and a likelihood `likelihood`.
# Additional metadata for the outputs `Y` can be given in `Y_metadata`.
# """
# raise NotImplementedError, "Abstract base class for full inference"

View file

@ -0,0 +1,162 @@
# Copyright (c) 2012-2014, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from posterior import Posterior
from ...util.linalg import jitchol, tdot, dtrtrs, dpotri, pdinv
import numpy as np
from . import LatentFunctionInference
log_2_pi = np.log(2*np.pi)
class DTC(LatentFunctionInference):
"""
An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
The function self.inference returns a Posterior object, which summarizes
the posterior.
NB. It's not recommended to use this function! It's here for historical purposes.
"""
def __init__(self):
self.const_jitter = 1e-6
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."
num_inducing, _ = Z.shape
num_data, output_dim = Y.shape
#make sure the noise is not hetero
beta = 1./likelihood.gaussian_variance(Y_metadata)
if beta.size > 1:
raise NotImplementedError, "no hetero noise with this implementation of DTC"
Kmm = kern.K(Z)
Knn = kern.Kdiag(X)
Knm = kern.K(X, Z)
U = Knm
Uy = np.dot(U.T,Y)
#factor Kmm
Kmmi, L, Li, _ = pdinv(Kmm)
# Compute A
LiUTbeta = np.dot(Li, U.T)*np.sqrt(beta)
A = tdot(LiUTbeta) + np.eye(num_inducing)
# factor A
LA = jitchol(A)
# back substitute to get b, P, v
tmp, _ = dtrtrs(L, Uy, lower=1)
b, _ = dtrtrs(LA, tmp*beta, lower=1)
tmp, _ = dtrtrs(LA, b, lower=1, trans=1)
v, _ = dtrtrs(L, tmp, lower=1, trans=1)
tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
P = tdot(tmp.T)
#compute log marginal
log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \
-np.sum(np.log(np.diag(LA)))*output_dim + \
0.5*num_data*output_dim*np.log(beta) + \
-0.5*beta*np.sum(np.square(Y)) + \
0.5*np.sum(np.square(b))
# Compute dL_dKmm
vvT_P = tdot(v.reshape(-1,1)) + P
dL_dK = 0.5*(Kmmi - vvT_P)
# Compute dL_dU
vY = np.dot(v.reshape(-1,1),Y.T)
dL_dU = vY - np.dot(vvT_P, U.T)
dL_dU *= beta
#compute dL_dR
Uv = np.dot(U, v)
dL_dR = 0.5*(np.sum(U*np.dot(U,P), 1) - 1./beta + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1) + np.sum(np.square(Uv), 1))*beta**2
dL_dthetaL = likelihood.exact_inference_gradients(dL_dR)
grad_dict = {'dL_dKmm': dL_dK, 'dL_dKdiag':np.zeros_like(Knn), 'dL_dKnm':dL_dU.T, 'dL_dthetaL':dL_dthetaL}
#construct a posterior object
post = Posterior(woodbury_inv=Kmmi-P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=L)
return post, log_marginal, grad_dict
class vDTC(object):
def __init__(self):
self.const_jitter = 1e-6
def inference(self, kern, X, X_variance, Z, likelihood, Y, Y_metadata):
assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."
num_inducing, _ = Z.shape
num_data, output_dim = Y.shape
#make sure the noise is not hetero
beta = 1./likelihood.gaussian_variance(Y_metadata)
if beta.size > 1:
raise NotImplementedError, "no hetero noise with this implementation of DTC"
Kmm = kern.K(Z)
Knn = kern.Kdiag(X)
Knm = kern.K(X, Z)
U = Knm
Uy = np.dot(U.T,Y)
#factor Kmm
Kmmi, L, Li, _ = pdinv(Kmm)
# Compute A
LiUTbeta = np.dot(Li, U.T)*np.sqrt(beta)
A_ = tdot(LiUTbeta)
trace_term = -0.5*(np.sum(Knn)*beta - np.trace(A_))
A = A_ + np.eye(num_inducing)
# factor A
LA = jitchol(A)
# back substitute to get b, P, v
tmp, _ = dtrtrs(L, Uy, lower=1)
b, _ = dtrtrs(LA, tmp*beta, lower=1)
tmp, _ = dtrtrs(LA, b, lower=1, trans=1)
v, _ = dtrtrs(L, tmp, lower=1, trans=1)
tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
P = tdot(tmp.T)
#compute log marginal
log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \
-np.sum(np.log(np.diag(LA)))*output_dim + \
0.5*num_data*output_dim*np.log(beta) + \
-0.5*beta*np.sum(np.square(Y)) + \
0.5*np.sum(np.square(b)) + \
trace_term
# Compute dL_dKmm
vvT_P = tdot(v.reshape(-1,1)) + P
LAL = Li.T.dot(A).dot(Li)
dL_dK = Kmmi - 0.5*(vvT_P + LAL)
# Compute dL_dU
vY = np.dot(v.reshape(-1,1),Y.T)
#dL_dU = vY - np.dot(vvT_P, U.T)
dL_dU = vY - np.dot(vvT_P - Kmmi, U.T)
dL_dU *= beta
#compute dL_dR
Uv = np.dot(U, v)
dL_dR = 0.5*(np.sum(U*np.dot(U,P), 1) - 1./beta + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1) + np.sum(np.square(Uv), 1) )*beta**2
dL_dR -=beta*trace_term/num_data
dL_dthetaL = likelihood.exact_inference_gradients(dL_dR)
grad_dict = {'dL_dKmm': dL_dK, 'dL_dKdiag':np.zeros_like(Knn) + -0.5*beta, 'dL_dKnm':dL_dU.T, 'dL_dthetaL':dL_dthetaL}
#construct a posterior object
post = Posterior(woodbury_inv=Kmmi-P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=L)
return post, log_marginal, grad_dict

View file

@ -0,0 +1,59 @@
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from posterior import Posterior
from ...util.linalg import pdinv, dpotrs, tdot
from ...util import diag
import numpy as np
from . import LatentFunctionInference
log_2_pi = np.log(2*np.pi)
class ExactGaussianInference(LatentFunctionInference):
"""
An object for inference when the likelihood is Gaussian.
The function self.inference returns a Posterior object, which summarizes
the posterior.
For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.
"""
def __init__(self):
pass#self._YYTfactor_cache = caching.cache()
def get_YYTfactor(self, Y):
"""
find a matrix L which satisfies LL^T = YY^T.
Note that L may have fewer columns than Y, else L=Y.
"""
N, D = Y.shape
if (N>D):
return Y
else:
#if Y in self.cache, return self.Cache[Y], else store Y in cache and return L.
#print "WARNING: N>D of Y, we need caching of L, such that L*L^T = Y, returning Y still!"
return Y
def inference(self, kern, X, likelihood, Y, Y_metadata=None):
"""
Returns a Posterior class containing essential quantities of the posterior
"""
YYT_factor = self.get_YYTfactor(Y)
K = kern.K(X)
Ky = K.copy()
diag.add(Ky, likelihood.gaussian_variance(Y_metadata))
Wi, LW, LWi, W_logdet = pdinv(Ky)
alpha, _ = dpotrs(LW, YYT_factor, lower=1)
log_marginal = 0.5*(-Y.size * log_2_pi - Y.shape[1] * W_logdet - np.sum(alpha * YYT_factor))
dL_dK = 0.5 * (tdot(alpha) - Y.shape[1] * Wi)
dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK),Y_metadata)
return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}

View file

@ -0,0 +1,122 @@
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ...util.linalg import pdinv,jitchol,DSYR,tdot,dtrtrs, dpotrs
from posterior import Posterior
from . import LatentFunctionInference
log_2_pi = np.log(2*np.pi)
class EP(LatentFunctionInference):
def __init__(self, epsilon=1e-6, eta=1., delta=1.):
"""
The expectation-propagation algorithm.
For nomenclature see Rasmussen & Williams 2006.
:param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
:type epsilon: float
:param eta: parameter for fractional EP updates.
:type eta: float64
:param delta: damping EP updates factor.
:type delta: float64
"""
self.epsilon, self.eta, self.delta = epsilon, eta, delta
self.reset()
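# Usage sketch (illustrative; the data and kernel names are assumptions):
# EP is typically paired with a non-Gaussian likelihood, e.g. for classification:
#
#   ep = EP(epsilon=1e-6)
#   m = GPy.core.GP(X, Y, kernel=GPy.kern.RBF(1),
#                   likelihood=GPy.likelihoods.Bernoulli(),
#                   inference_method=ep)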
def reset(self):
self.old_mutilde, self.old_vtilde = None, None
self._ep_approximation = None
def on_optimization_start(self):
self._ep_approximation = None
def on_optimization_end(self):
# TODO: update approximation in the end as well? Maybe even with a switch?
pass
def inference(self, kern, X, likelihood, Y, Y_metadata=None, Z=None):
num_data, output_dim = Y.shape
assert output_dim ==1, "ep in 1D only (for now!)"
K = kern.K(X)
if self._ep_approximation is None:
mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation = self.expectation_propagation(K, Y, likelihood, Y_metadata)
else:
mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation
Wi, LW, LWi, W_logdet = pdinv(K + np.diag(1./tau_tilde))
alpha, _ = dpotrs(LW, mu_tilde, lower=1)
log_marginal = 0.5*(-num_data * log_2_pi - W_logdet - np.sum(alpha * mu_tilde)) # TODO: add log Z_hat??
dL_dK = 0.5 * (tdot(alpha[:,None]) - Wi)
dL_dthetaL = np.zeros(likelihood.size)#TODO: derivatives of the likelihood parameters
return Posterior(woodbury_inv=Wi, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}
def expectation_propagation(self, K, Y, likelihood, Y_metadata):
num_data, data_dim = Y.shape
assert data_dim == 1, "This EP methods only works for 1D outputs"
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
mu = np.zeros(num_data)
Sigma = K.copy()
#Initial values - Marginal moments
Z_hat = np.empty(num_data,dtype=np.float64)
mu_hat = np.empty(num_data,dtype=np.float64)
sigma2_hat = np.empty(num_data,dtype=np.float64)
#initial values - Gaussian factors
if self.old_mutilde is None:
tau_tilde, mu_tilde, v_tilde = np.zeros((3, num_data))
else:
assert old_mutilde.size == num_data, "data size mis-match: did you change the data? try resetting!"
mu_tilde, v_tilde = self.old_mutilde, self.old_vtilde
tau_tilde = v_tilde/mu_tilde
#Approximation
tau_diff = self.epsilon + 1.
v_diff = self.epsilon + 1.
iterations = 0
while (tau_diff > self.epsilon) or (v_diff > self.epsilon):
update_order = np.random.permutation(num_data)
for i in update_order:
#Cavity distribution parameters
tau_cav = 1./Sigma[i,i] - self.eta*tau_tilde[i]
v_cav = mu[i]/Sigma[i,i] - self.eta*v_tilde[i]
#Marginal moments
Z_hat[i], mu_hat[i], sigma2_hat[i] = likelihood.moments_match_ep(Y[i], tau_cav, v_cav)#, Y_metadata=None)#=(None if Y_metadata is None else Y_metadata[i]))
#Site parameters update
delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i])
delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])
tau_tilde[i] += delta_tau
v_tilde[i] += delta_v
#Posterior distribution parameters update
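#DSYR applies the symmetric rank-1 update Sigma <- Sigma + alpha*s*s^T; the
#negative coefficient below down-dates the covariance for the new site precision.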
DSYR(Sigma, Sigma[:,i].copy(), -delta_tau/(1.+ delta_tau*Sigma[i,i]))
mu = np.dot(Sigma, v_tilde)
#(re)compute Sigma and mu using the full Cholesky decomposition
tau_tilde_root = np.sqrt(tau_tilde)
Sroot_tilde_K = tau_tilde_root[:,None] * K
B = np.eye(num_data) + Sroot_tilde_K * tau_tilde_root[None,:]
L = jitchol(B)
V, _ = dtrtrs(L, Sroot_tilde_K, lower=1)
Sigma = K - np.dot(V.T,V)
mu = np.dot(Sigma,v_tilde)
#monitor convergence
if iterations>0:
tau_diff = np.mean(np.square(tau_tilde-tau_tilde_old))
v_diff = np.mean(np.square(v_tilde-v_tilde_old))
tau_tilde_old = tau_tilde.copy()
v_tilde_old = v_tilde.copy()
iterations += 1
mu_tilde = v_tilde/tau_tilde
return mu, Sigma, mu_tilde, tau_tilde, Z_hat

View file

@ -0,0 +1,351 @@
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ...util import diag
from ...util.linalg import mdot, jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify, DSYR
from ...core.parameterization.variational import VariationalPosterior
from . import LatentFunctionInference
from posterior import Posterior
log_2_pi = np.log(2*np.pi)
class EPDTC(LatentFunctionInference):
const_jitter = 1e-6
def __init__(self, epsilon=1e-6, eta=1., delta=1., limit=1):
from ...util.caching import Cacher
self.limit = limit
self.get_trYYT = Cacher(self._get_trYYT, limit)
self.get_YYTfactor = Cacher(self._get_YYTfactor, limit)
self.epsilon, self.eta, self.delta = epsilon, eta, delta
self.reset()
def set_limit(self, limit):
self.get_trYYT.limit = limit
self.get_YYTfactor.limit = limit
def on_optimization_start(self):
self._ep_approximation = None
def on_optimization_end(self):
# TODO: update approximation in the end as well? Maybe even with a switch?
pass
def _get_trYYT(self, Y):
return np.sum(np.square(Y))
def __getstate__(self):
# has to be overridden, as Cacher objects cannot be pickled.
return self.limit
def __setstate__(self, state):
# has to be overridden, as Cacher objects cannot be pickled.
self.limit = state
from ...util.caching import Cacher
self.get_trYYT = Cacher(self._get_trYYT, self.limit)
self.get_YYTfactor = Cacher(self._get_YYTfactor, self.limit)
def _get_YYTfactor(self, Y):
"""
find a matrix L which satisfies LLT = YYT.
Note that L may have fewer columns than Y.
"""
N, D = Y.shape
if (N>=D):
return Y
else:
return jitchol(tdot(Y))
def get_VVTfactor(self, Y, prec):
return Y * prec # TODO: cache this, and make it efficient
def reset(self):
self.old_mutilde, self.old_vtilde = None, None
self._ep_approximation = None
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
num_data, output_dim = Y.shape
assert output_dim ==1, "ep in 1D only (for now!)"
Kmm = kern.K(Z)
Kmn = kern.K(Z,X)
if self._ep_approximation is None:
mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation = self.expectation_propagation(Kmm, Kmn, Y, likelihood, Y_metadata)
else:
mu, Sigma, mu_tilde, tau_tilde, Z_hat = self._ep_approximation
if isinstance(X, VariationalPosterior):
uncertain_inputs = True
psi0 = kern.psi0(Z, X)
psi1 = Kmn.T#kern.psi1(Z, X)
psi2 = kern.psi2(Z, X)
else:
uncertain_inputs = False
psi0 = kern.Kdiag(X)
psi1 = Kmn.T#kern.K(X, Z)
psi2 = None
#see whether we're using variational uncertain inputs
_, output_dim = Y.shape
#see whether we've got a different noise variance for each datum
#beta = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6)
beta = tau_tilde
VVT_factor = beta[:,None]*mu_tilde[:,None]
trYYT = self.get_trYYT(mu_tilde[:,None])
# do the inference:
het_noise = beta.size > 1
num_inducing = Z.shape[0]
num_data = Y.shape[0]
# kernel computations, using BGPLVM notation
Kmm = kern.K(Z).copy()
diag.add(Kmm, self.const_jitter)
Lm = jitchol(Kmm)
# The rather complex computations of A
if uncertain_inputs:
if het_noise:
psi2_beta = psi2 * (beta.flatten().reshape(num_data, 1, 1)).sum(0)
else:
psi2_beta = psi2.sum(0) * beta
LmInv = dtrtri(Lm)
A = LmInv.dot(psi2_beta.dot(LmInv.T))
else:
if het_noise:
tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
else:
tmp = psi1 * (np.sqrt(beta))
tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
A = tdot(tmp) #print A.sum()
# factor B
B = np.eye(num_inducing) + A
LB = jitchol(B)
psi1Vf = np.dot(psi1.T, VVT_factor)
# back substitute C into psi1Vf
tmp, _ = dtrtrs(Lm, psi1Vf, lower=1, trans=0)
_LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
# data fit and derivative of L w.r.t. Kmm
delit = tdot(_LBi_Lmi_psi1Vf)
data_fit = np.trace(delit)
DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
delit = -0.5 * DBi_plus_BiPBi
delit += -0.5 * B * output_dim
delit += output_dim * np.eye(num_inducing)
# Compute dL_dKmm
dL_dKmm = backsub_both_sides(Lm, delit)
# derivatives of L w.r.t. psi
dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm,
VVT_factor, Cpsi1Vf, DBi_plus_BiPBi,
psi1, het_noise, uncertain_inputs)
# log marginal likelihood
log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise,
psi0, A, LB, trYYT, data_fit, VVT_factor)
#put the gradients in the right places
dL_dR = _compute_dL_dR(likelihood,
het_noise, uncertain_inputs, LB,
_LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A,
psi0, psi1, beta,
data_fit, num_data, output_dim, trYYT, mu_tilde[:,None])
dL_dthetaL = 0#likelihood.exact_inference_gradients(dL_dR,Y_metadata)
if uncertain_inputs:
grad_dict = {'dL_dKmm': dL_dKmm,
'dL_dpsi0':dL_dpsi0,
'dL_dpsi1':dL_dpsi1,
'dL_dpsi2':dL_dpsi2,
'dL_dthetaL':dL_dthetaL}
else:
grad_dict = {'dL_dKmm': dL_dKmm,
'dL_dKdiag':dL_dpsi0,
'dL_dKnm':dL_dpsi1,
'dL_dthetaL':dL_dthetaL}
#get sufficient things for posterior prediction
#TODO: do we really want to do this in the loop?
if VVT_factor.shape[1] == Y.shape[1]:
woodbury_vector = Cpsi1Vf # == Cpsi1V
else:
print 'foobar'
psi1V = np.dot(mu_tilde[:,None].T*beta, psi1).T
tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
tmp, _ = dpotrs(LB, tmp, lower=1)
woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
Bi, _ = dpotri(LB, lower=1)
symmetrify(Bi)
Bi = -dpotri(LB, lower=1)[0]
diag.add(Bi, 1)
woodbury_inv = backsub_both_sides(Lm, Bi)
#construct a posterior object
post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
return post, log_marginal, grad_dict
def expectation_propagation(self, Kmm, Kmn, Y, likelihood, Y_metadata):
num_data, data_dim = Y.shape
assert data_dim == 1, "This EP methods only works for 1D outputs"
KmnKnm = np.dot(Kmn,Kmn.T)
Lm = jitchol(Kmm)
Lmi = dtrtrs(Lm,np.eye(Lm.shape[0]))[0] #chol_inv(Lm)
Kmmi = np.dot(Lmi.T,Lmi)
KmmiKmn = np.dot(Kmmi,Kmn)
Qnn_diag = np.sum(Kmn*KmmiKmn,-2)
LLT0 = Kmm.copy()
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
mu = np.zeros(num_data)
LLT = Kmm.copy() #Sigma = K.copy()
Sigma_diag = Qnn_diag.copy()
#Initial values - Marginal moments
Z_hat = np.empty(num_data,dtype=np.float64)
mu_hat = np.empty(num_data,dtype=np.float64)
sigma2_hat = np.empty(num_data,dtype=np.float64)
#initial values - Gaussian factors
if self.old_mutilde is None:
tau_tilde, mu_tilde, v_tilde = np.zeros((3, num_data))
else:
assert old_mutilde.size == num_data, "data size mis-match: did you change the data? try resetting!"
mu_tilde, v_tilde = self.old_mutilde, self.old_vtilde
tau_tilde = v_tilde/mu_tilde
#Approximation
tau_diff = self.epsilon + 1.
v_diff = self.epsilon + 1.
iterations = 0
while (tau_diff > self.epsilon) or (v_diff > self.epsilon):
update_order = np.random.permutation(num_data)
for i in update_order:
#Cavity distribution parameters
tau_cav = 1./Sigma_diag[i] - self.eta*tau_tilde[i]
v_cav = mu[i]/Sigma_diag[i] - self.eta*v_tilde[i]
#Marginal moments
Z_hat[i], mu_hat[i], sigma2_hat[i] = likelihood.moments_match_ep(Y[i], tau_cav, v_cav)#, Y_metadata=None)#=(None if Y_metadata is None else Y_metadata[i]))
#Site parameters update
delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i])
delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i])
tau_tilde[i] += delta_tau
v_tilde[i] += delta_v
#Posterior distribution parameters update
#DSYR(Sigma, Sigma[:,i].copy(), -delta_tau/(1.+ delta_tau*Sigma[i,i]))
DSYR(LLT,Kmn[:,i].copy(),delta_tau)
L = jitchol(LLT)
V,info = dtrtrs(L,Kmn,lower=1)
Sigma_diag = np.sum(V*V,-2)
si = np.sum(V.T*V[:,i],-1)
mu += (delta_v-delta_tau*mu[i])*si
#mu = np.dot(Sigma, v_tilde)
#(re)compute Sigma and mu using the full Cholesky decomposition
LLT = LLT0 + np.dot(Kmn*tau_tilde[None,:],Kmn.T)
L = jitchol(LLT)
V,info = dtrtrs(L,Kmn,lower=1)
V2,info = dtrtrs(L.T,V,lower=0)
#Sigma_diag = np.sum(V*V,-2)
#Knmv_tilde = np.dot(Kmn,v_tilde)
#mu = np.dot(V2.T,Knmv_tilde)
Sigma = np.dot(V2.T,V2)
mu = np.dot(Sigma,v_tilde)
#monitor convergence
if iterations>0:
tau_diff = np.mean(np.square(tau_tilde-tau_tilde_old))
v_diff = np.mean(np.square(v_tilde-v_tilde_old))
tau_tilde_old = tau_tilde.copy()
v_tilde_old = v_tilde.copy()
tau_diff = 0
v_diff = 0
iterations += 1
mu_tilde = v_tilde/tau_tilde
return mu, Sigma, mu_tilde, tau_tilde, Z_hat
def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, psi1, het_noise, uncertain_inputs):
dL_dpsi0 = -0.5 * output_dim * (beta[:,None] * np.ones([num_data, 1])).flatten()
dL_dpsi1 = np.dot(VVT_factor, Cpsi1Vf.T)
dL_dpsi2_beta = 0.5 * backsub_both_sides(Lm, output_dim * np.eye(num_inducing) - DBi_plus_BiPBi)
if het_noise:
if uncertain_inputs:
dL_dpsi2 = beta[:, None, None] * dL_dpsi2_beta[None, :, :]
else:
dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * beta.reshape(num_data, 1)).T).T
dL_dpsi2 = None
else:
dL_dpsi2 = beta * dL_dpsi2_beta
if uncertain_inputs:
# repeat for each of the N psi_2 matrices
dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], num_data, axis=0)
else:
# subsume back into psi1 (==Kmn)
dL_dpsi1 += 2.*np.dot(psi1, dL_dpsi2)
dL_dpsi2 = None
return dL_dpsi0, dL_dpsi1, dL_dpsi2
def _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, psi0, psi1, beta, data_fit, num_data, output_dim, trYYT, Y):
# the partial derivative vector for the likelihood
if likelihood.size == 0:
# save computation here.
dL_dR = None
elif het_noise:
if uncertain_inputs:
raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented"
else:
#from ...util.linalg import chol_inv
#LBi = chol_inv(LB)
LBi, _ = dtrtrs(LB,np.eye(LB.shape[0]))
Lmi_psi1, nil = dtrtrs(Lm, psi1.T, lower=1, trans=0)
_LBi_Lmi_psi1, _ = dtrtrs(LB, Lmi_psi1, lower=1, trans=0)
dL_dR = -0.5 * beta + 0.5 * (beta*Y)**2
dL_dR += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * beta**2
dL_dR += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*beta**2
dL_dR += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * Y * beta**2
dL_dR += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * beta**2
else:
# likelihood is not heteroscedastic
dL_dR = -0.5 * num_data * output_dim * beta + 0.5 * trYYT * beta ** 2
dL_dR += 0.5 * output_dim * (psi0.sum() * beta ** 2 - np.trace(A) * beta)
dL_dR += beta * (0.5 * np.sum(A * DBi_plus_BiPBi) - data_fit)
return dL_dR
def _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, psi0, A, LB, trYYT, data_fit,Y):
#compute log marginal likelihood
if het_noise:
lik_1 = -0.5 * num_data * output_dim * np.log(2. * np.pi) + 0.5 * np.sum(np.log(beta)) - 0.5 * np.sum(beta * np.square(Y).sum(axis=-1))
lik_2 = -0.5 * output_dim * (np.sum(beta.flatten() * psi0) - np.trace(A))
else:
lik_1 = -0.5 * num_data * output_dim * (np.log(2. * np.pi) - np.log(beta)) - 0.5 * beta * trYYT
lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A))
lik_3 = -output_dim * (np.sum(np.log(np.diag(LB))))
lik_4 = 0.5 * data_fit
log_marginal = lik_1 + lik_2 + lik_3 + lik_4
return log_marginal

View file

@ -0,0 +1,89 @@
# Copyright (c) 2012, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from posterior import Posterior
from ...util.linalg import jitchol, tdot, dtrtrs, dpotri, pdinv
from ...util import diag
import numpy as np
from . import LatentFunctionInference
log_2_pi = np.log(2*np.pi)
class FITC(LatentFunctionInference):
"""
An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
The function self.inference returns a Posterior object, which summarizes
the posterior.
"""
const_jitter = 1e-6
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
num_inducing, _ = Z.shape
num_data, output_dim = Y.shape
#make sure the noise is not hetero
sigma_n = likelihood.gaussian_variance(Y_metadata)
if sigma_n.size >1:
raise NotImplementedError, "no hetero noise with this implementation of FITC"
Kmm = kern.K(Z)
Knn = kern.Kdiag(X)
Knm = kern.K(X, Z)
U = Knm
#factor Kmm
diag.add(Kmm, self.const_jitter)
Kmmi, L, Li, _ = pdinv(Kmm)
#compute beta_star, the effective noise precision
LiUT = np.dot(Li, U.T)
sigma_star = Knn + sigma_n - np.sum(np.square(LiUT),0)
beta_star = 1./sigma_star
# Compute and factor A
A = tdot(LiUT*np.sqrt(beta_star)) + np.eye(num_inducing)
LA = jitchol(A)
# back substitute to get b, P, v
URiy = np.dot(U.T*beta_star,Y)
tmp, _ = dtrtrs(L, URiy, lower=1)
b, _ = dtrtrs(LA, tmp, lower=1)
tmp, _ = dtrtrs(LA, b, lower=1, trans=1)
v, _ = dtrtrs(L, tmp, lower=1, trans=1)
tmp, _ = dtrtrs(LA, Li, lower=1, trans=0)
P = tdot(tmp.T)
#compute log marginal
log_marginal = -0.5*num_data*output_dim*np.log(2*np.pi) + \
-np.sum(np.log(np.diag(LA)))*output_dim + \
0.5*output_dim*np.sum(np.log(beta_star)) + \
-0.5*np.sum(np.square(Y.T*np.sqrt(beta_star))) + \
0.5*np.sum(np.square(b))
#compute dL_dR
Uv = np.dot(U, v)
dL_dR = 0.5*(np.sum(U*np.dot(U,P), 1) - 1./beta_star + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1) + np.sum(np.square(Uv), 1))*beta_star**2
# Compute dL_dKmm
vvT_P = tdot(v.reshape(-1,1)) + P
dL_dK = 0.5*(Kmmi - vvT_P)
KiU = np.dot(Kmmi, U.T)
dL_dK += np.dot(KiU*dL_dR, KiU.T)
# Compute dL_dU
vY = np.dot(v.reshape(-1,1),Y.T)
dL_dU = vY - np.dot(vvT_P, U.T)
dL_dU *= beta_star
dL_dU -= 2.*KiU*dL_dR
dL_dthetaL = likelihood.exact_inference_gradients(dL_dR)
grad_dict = {'dL_dKmm': dL_dK, 'dL_dKdiag':dL_dR, 'dL_dKnm':dL_dU.T, 'dL_dthetaL':dL_dthetaL}
#construct a posterior object
post = Posterior(woodbury_inv=Kmmi-P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=L)
return post, log_marginal, grad_dict

View file

@ -0,0 +1,162 @@
# Copyright (c) 2014, Zhenwen Dai
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ...core import Model
from ...core.parameterization import variational
def infer_newX(model, Y_new, optimize=True, init='L2'):
"""
Infer the distribution of X for the new observed data *Y_new*.
:param model: the GPy model used in inference
:type model: GPy.core.Model
:param Y_new: the new observed data for inference
:type Y_new: numpy.ndarray
:param optimize: whether to optimize the location of new X (True by default)
:type optimize: boolean
:return: a tuple containing the estimated posterior distribution of X and the model that optimize X
:rtype: (GPy.core.parameterization.variational.VariationalPosterior, GPy.core.Model)
"""
infr_m = InferenceX(model, Y_new, init=init)
if optimize:
infr_m.optimize()
return infr_m.X, infr_m
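# Usage sketch (illustrative; `m` and `Y_star` are assumed names): given a trained
# latent variable model and newly observed outputs, recover the latent locations:
#
#   X_star, infr = infer_newX(m, Y_star, optimize=True, init='L2')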
class InferenceX(Model):
"""
The class for inference of new X with given new Y. (do_test_latent)
:param model: the GPy model used in inference
:type model: GPy.core.Model
:param Y: the new observed data for inference
:type Y: numpy.ndarray
"""
def __init__(self, model, Y, name='inferenceX', init='L2'):
if np.isnan(Y).any() or getattr(model, 'missing_data', False):
assert Y.shape[0]==1, "The current implementation of inference X only support one data point at a time with missing data!"
self.missing_data = True
self.valid_dim = np.logical_not(np.isnan(Y[0]))
self.ninan = getattr(model, 'ninan', None)
else:
self.missing_data = False
super(InferenceX, self).__init__(name)
self.likelihood = model.likelihood.copy()
self.kern = model.kern.copy()
if model.kern.useGPU:
from ...models import SSGPLVM
if isinstance(model, SSGPLVM):
self.kern.GPU_SSRBF(True)
else:
self.kern.GPU(True)
from copy import deepcopy
self.posterior = deepcopy(model.posterior)
if hasattr(model, 'variational_prior'):
self.uncertain_input = True
self.variational_prior = model.variational_prior.copy()
else:
self.uncertain_input = False
if hasattr(model, 'inducing_inputs'):
self.sparse_gp = True
self.Z = model.Z.copy()
else:
self.sparse_gp = False
self.uncertain_input = False
self.Z = model.X.copy()
self.Y = Y
self.X = self._init_X(model, Y, init=init)
self.compute_dL()
self.link_parameter(self.X)
def _init_X(self, model, Y_new, init='L2'):
# Initialize the new X by finding the nearest point in Y space.
Y = model.Y
if self.missing_data:
Y = Y[:,self.valid_dim]
Y_new = Y_new[:,self.valid_dim]
dist = -2.*Y_new.dot(Y.T) + np.square(Y_new).sum(axis=1)[:,None]+ np.square(Y).sum(axis=1)[None,:]
else:
if init=='L2':
dist = -2.*Y_new.dot(Y.T) + np.square(Y_new).sum(axis=1)[:,None]+ np.square(Y).sum(axis=1)[None,:]
elif init=='NCC':
dist = Y_new.dot(Y.T)
elif init=='rand':
dist = np.random.rand(Y_new.shape[0],Y.shape[0])
idx = dist.argmin(axis=1)
from ...models import SSGPLVM
from ...util.misc import param_to_array
if isinstance(model, SSGPLVM):
X = variational.SpikeAndSlabPosterior(param_to_array(model.X.mean[idx]), param_to_array(model.X.variance[idx]), param_to_array(model.X.gamma[idx]))
if model.group_spike:
X.gamma.fix()
else:
if self.uncertain_input and self.sparse_gp:
X = variational.NormalPosterior(param_to_array(model.X.mean[idx]), param_to_array(model.X.variance[idx]))
else:
from ...core import Param
X = Param('latent mean',param_to_array(model.X[idx]).copy())
return X
def compute_dL(self):
# Common computation
beta = 1./np.fmax(self.likelihood.variance, 1e-6)
output_dim = self.Y.shape[-1]
wv = self.posterior.woodbury_vector
if self.missing_data:
wv = wv[:,self.valid_dim]
output_dim = self.valid_dim.sum()
if self.ninan is not None:
self.dL_dpsi2 = beta/2.*(self.posterior.woodbury_inv[:,:,self.valid_dim] - np.einsum('md,od->mo',wv, wv)[:, :, None]).sum(-1)
else:
self.dL_dpsi2 = beta/2.*(output_dim*self.posterior.woodbury_inv - np.einsum('md,od->mo',wv, wv))
self.dL_dpsi1 = beta*np.dot(self.Y[:,self.valid_dim], wv.T)
self.dL_dpsi0 = - beta/2.* np.ones(self.Y.shape[0])
else:
self.dL_dpsi2 = beta*(output_dim*self.posterior.woodbury_inv - np.einsum('md,od->mo',wv, wv))/2.
self.dL_dpsi1 = beta*np.dot(self.Y, wv.T)
self.dL_dpsi0 = -beta/2.*output_dim* np.ones(self.Y.shape[0])
def parameters_changed(self):
if self.uncertain_input:
psi0 = self.kern.psi0(self.Z, self.X)
psi1 = self.kern.psi1(self.Z, self.X)
psi2 = self.kern.psi2(self.Z, self.X)
else:
psi0 = self.kern.Kdiag(self.X)
psi1 = self.kern.K(self.X, self.Z)
psi2 = np.dot(psi1.T,psi1)
self._log_marginal_likelihood = (self.dL_dpsi2*psi2).sum()+(self.dL_dpsi1*psi1).sum()+(self.dL_dpsi0*psi0).sum()
if self.uncertain_input:
X_grad = self.kern.gradients_qX_expectations(variational_posterior=self.X, Z=self.Z, dL_dpsi0=self.dL_dpsi0, dL_dpsi1=self.dL_dpsi1, dL_dpsi2=self.dL_dpsi2)
self.X.set_gradients(X_grad)
else:
dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(psi1,self.dL_dpsi2)
X_grad = self.kern.gradients_X_diag(self.dL_dpsi0, self.X)
X_grad += self.kern.gradients_X(dL_dpsi1, self.X, self.Z)
self.X.gradient = X_grad
if self.uncertain_input:
from ...core.parameterization.variational import SpikeAndSlabPrior
if isinstance(self.variational_prior, SpikeAndSlabPrior):
# Update Log-likelihood
KL_div = self.variational_prior.KL_divergence(self.X, N=self.Y.shape[0])
# update for the KL divergence
self.variational_prior.update_gradients_KL(self.X, N=self.Y.shape[0])
else:
# Update Log-likelihood
KL_div = self.variational_prior.KL_divergence(self.X)
# update for the KL divergence
self.variational_prior.update_gradients_KL(self.X)
self._log_marginal_likelihood += -KL_div
def log_likelihood(self):
return self._log_marginal_likelihood

View file

@ -0,0 +1,251 @@
# Copyright (c) 2013, 2014 Alan Saul
# Licensed under the BSD 3-clause license (see LICENSE.txt)
#
#Parts of this file were influenced by the Matlab GPML framework written by
#Carl Edward Rasmussen & Hannes Nickisch, however all bugs are our own.
#
#The GPML code is released under the FreeBSD License.
#Copyright (c) 2005-2013 Carl Edward Rasmussen & Hannes Nickisch. All rights reserved.
#
#The code and associated documentation is available from
#http://gaussianprocess.org/gpml/code.
import numpy as np
from ...util.linalg import mdot, jitchol, dpotrs, dtrtrs, dpotri, symmetrify, pdinv
from posterior import Posterior
import warnings
def warning_on_one_line(message, category, filename, lineno, file=None, line=None):
return ' %s:%s: %s:%s\n' % (filename, lineno, category.__name__, message)
warnings.formatwarning = warning_on_one_line
from scipy import optimize
from . import LatentFunctionInference
class Laplace(LatentFunctionInference):
def __init__(self):
"""
Laplace Approximation
Find the mode \hat{f} of the unnormalised posterior and the Hessian
at that point (using Newton-Raphson)
"""
self._mode_finding_tolerance = 1e-7
self._mode_finding_max_iter = 60
self.bad_fhat = False
#Store whether it is the first run of the inference so that we can choose whether we need
#to calculate things or reuse old variables
self.first_run = True
self._previous_Ki_fhat = None
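# Usage sketch (illustrative, mirroring the Poisson example in the regression
# examples touched by this change; X and Y are assumed data arrays):
#
#   m = GPy.core.GP(X, Y, kernel=GPy.kern.RBF(1),
#                   likelihood=GPy.likelihoods.Poisson(),
#                   inference_method=Laplace())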
def inference(self, kern, X, likelihood, Y, Y_metadata=None):
"""
Returns a Posterior class containing essential quantities of the posterior
"""
# Compute K
K = kern.K(X)
#Find mode
if self.bad_fhat or self.first_run:
Ki_f_init = np.zeros_like(Y)
self.first_run = False
else:
Ki_f_init = self._previous_Ki_fhat
f_hat, Ki_fhat = self.rasm_mode(K, Y, likelihood, Ki_f_init, Y_metadata=Y_metadata)
self.f_hat = f_hat
self.Ki_fhat = Ki_fhat
self.K = K.copy()
#Compute hessian and other variables at mode
log_marginal, woodbury_inv, dL_dK, dL_dthetaL = self.mode_computations(f_hat, Ki_fhat, K, Y, likelihood, kern, Y_metadata)
self._previous_Ki_fhat = Ki_fhat.copy()
return Posterior(woodbury_vector=Ki_fhat, woodbury_inv=woodbury_inv, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}
def rasm_mode(self, K, Y, likelihood, Ki_f_init, Y_metadata=None):
"""
Rasmussen's numerically stable mode finding
For nomenclature see Rasmussen & Williams 2006
Influenced by GPML (BSD) code, all errors are our own
:param K: Covariance matrix evaluated at locations X
:type K: NxD matrix
:param Y: The data
:type Y: np.ndarray
:param likelihood: the likelihood of the latent function value for the given data
:type likelihood: a GPy.likelihood object
:param Ki_f_init: the initial guess at the mode
:type Ki_f_init: np.ndarray
:param Y_metadata: information about the data, e.g. which likelihood to take from a multi-likelihood object
:type Y_metadata: np.ndarray | None
:returns: f_hat, the mode on which to make the Laplace approximation
:rtype: np.ndarray
"""
Ki_f = Ki_f_init.copy()
f = np.dot(K, Ki_f)
#define the objective function (to be maximised)
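#(the unnormalised log posterior Psi(f) = -0.5*f^T K^{-1} f + log p(y|f),
# written in terms of Ki_f = K^{-1} f for numerical stability)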
def obj(Ki_f, f):
return -0.5*np.dot(Ki_f.flatten(), f.flatten()) + np.sum(likelihood.logpdf(f, Y, Y_metadata=Y_metadata))
difference = np.inf
iteration = 0
while difference > self._mode_finding_tolerance and iteration < self._mode_finding_max_iter:
W = -likelihood.d2logpdf_df2(f, Y, Y_metadata=Y_metadata)
if np.any(np.isnan(W)):
raise ValueError('One or more element(s) of W is NaN')
grad = likelihood.dlogpdf_df(f, Y, Y_metadata=Y_metadata)
if np.any(np.isnan(grad)):
raise ValueError('One or more element(s) of grad is NaN')
W_f = W*f
b = W_f + grad # R+W p46 line 6.
W12BiW12, _, _ = self._compute_B_statistics(K, W, likelihood.log_concave)
W12BiW12Kb = np.dot(W12BiW12, np.dot(K, b))
#Work out the DIRECTION that we want to move in, but don't choose the stepsize yet
full_step_Ki_f = b - W12BiW12Kb # full_step_Ki_f = a in R&W p46 line 6.
dKi_f = full_step_Ki_f - Ki_f
#define an objective for the line search (minimize this one)
def inner_obj(step_size):
Ki_f_trial = Ki_f + step_size*dKi_f
f_trial = np.dot(K, Ki_f_trial)
return -obj(Ki_f_trial, f_trial)
#use scipy for the line search, then compute new values of f and Ki_f
step = optimize.brent(inner_obj, tol=1e-4, maxiter=12)
Ki_f_new = Ki_f + step*dKi_f
f_new = np.dot(K, Ki_f_new)
difference = np.abs(np.sum(f_new - f)) + np.abs(np.sum(Ki_f_new - Ki_f))
Ki_f = Ki_f_new
f = f_new
iteration += 1
#Warn of bad fits
if difference > self._mode_finding_tolerance:
if not self.bad_fhat:
warnings.warn("Not perfect mode found (f_hat). difference: {}, iteration: {} out of max {}".format(difference, iteration, self._mode_finding_max_iter))
self.bad_fhat = True
elif self.bad_fhat:
self.bad_fhat = False
warnings.warn("f_hat now fine again. difference: {}, iteration: {} out of max {}".format(difference, iteration, self._mode_finding_max_iter))
return f, Ki_f
def mode_computations(self, f_hat, Ki_f, K, Y, likelihood, kern, Y_metadata):
"""
At the mode, compute the hessian and effective covariance matrix.
returns: logZ : approximation to the marginal likelihood
woodbury_inv : variable required for calculating the approximation to the covariance matrix
dL_dK : derivative of the approximate marginal likelihood w.r.t. the covariance matrix
dL_dthetaL : array of derivatives (1 x num_likelihood_params)
"""
#At this point get the hessian matrix (or vector as W is diagonal)
W = -likelihood.d2logpdf_df2(f_hat, Y, Y_metadata=Y_metadata)
if np.any(np.isnan(W)):
raise ValueError('One or more element(s) of W is NaN')
K_Wi_i, L, LiW12 = self._compute_B_statistics(K, W, likelihood.log_concave)
#compute vital matrices
C = np.dot(LiW12, K)
Ki_W_i = K - C.T.dot(C)
#compute the log marginal
log_marginal = -0.5*np.dot(Ki_f.flatten(), f_hat.flatten()) + np.sum(likelihood.logpdf(f_hat, Y, Y_metadata=Y_metadata)) - np.sum(np.log(np.diag(L)))
# Compute matrices for derivatives
dW_df = -likelihood.d3logpdf_df3(f_hat, Y, Y_metadata=Y_metadata) # -d3lik_d3fhat
if np.any(np.isnan(dW_df)):
raise ValueError('One or more element(s) of dW_df is NaN')
dL_dfhat = -0.5*(np.diag(Ki_W_i)[:, None]*dW_df) # s2 in R&W p126 line 9.
#BiK, _ = dpotrs(L, K, lower=1)
#dL_dfhat = 0.5*np.diag(BiK)[:, None]*dW_df
I_KW_i = np.eye(Y.shape[0]) - np.dot(K, K_Wi_i)
####################
# compute dL_dK #
####################
if kern.size > 0 and not kern.is_fixed:
#Explicit
explicit_part = 0.5*(np.dot(Ki_f, Ki_f.T) - K_Wi_i)
#Implicit
implicit_part = np.dot(Ki_f, dL_dfhat.T).dot(I_KW_i)
dL_dK = explicit_part + implicit_part
else:
dL_dK = np.zeros(likelihood.size)
####################
#compute dL_dthetaL#
####################
if likelihood.size > 0 and not likelihood.is_fixed:
dlik_dthetaL, dlik_grad_dthetaL, dlik_hess_dthetaL = likelihood._laplace_gradients(f_hat, Y, Y_metadata=Y_metadata)
num_params = likelihood.size
# make space for one derivative for each likelihood parameter
dL_dthetaL = np.zeros(num_params)
for thetaL_i in range(num_params):
#Explicit
dL_dthetaL_exp = ( np.sum(dlik_dthetaL[thetaL_i])
# The + comes from the fact that dlik_hess_dthetaL == -dW_dthetaL
+ 0.5*np.sum(np.diag(Ki_W_i).flatten()*dlik_hess_dthetaL[:, thetaL_i].flatten())
)
#Implicit
dfhat_dthetaL = mdot(I_KW_i, K, dlik_grad_dthetaL[:, thetaL_i])
#dfhat_dthetaL = mdot(Ki_W_i, dlik_grad_dthetaL[:, thetaL_i])
dL_dthetaL_imp = np.dot(dL_dfhat.T, dfhat_dthetaL)
dL_dthetaL[thetaL_i] = dL_dthetaL_exp + dL_dthetaL_imp
else:
dL_dthetaL = np.zeros(likelihood.size)
return log_marginal, K_Wi_i, dL_dK, dL_dthetaL
def _compute_B_statistics(self, K, W, log_concave):
"""
Rasmussen suggests the use of a numerically stable positive definite matrix B,
which has positive diagonal elements and can be easily inverted
:param K: Prior Covariance matrix evaluated at locations X
:type K: NxN matrix
:param W: Negative hessian at a point (diagonal matrix)
:type W: Vector of diagonal values of Hessian (1xN)
:returns: (W12BiW12, L_B, Li_W12)
"""
if not log_concave:
#print "Under 1e-10: {}".format(np.sum(W < 1e-6))
W[W<1e-6] = 1e-6
# NOTE: setting a parameter inside parameters_changed will always lead to circular update loops!!!
#W.__setitem__(W < 1e-6, 1e-6, update=False) # FIXME-HACK: This is a hack since GPy can't handle negative variances, which can occur
# if the likelihood is non-log-concave. We want to say that there is a negative variance
# to cause the posterior to become less certain than the prior and likelihood;
# this is a property only held by non-log-concave likelihoods.
if np.any(np.isnan(W)):
raise ValueError('One or more element(s) of W is NaN')
#W is diagonal so its sqrt is just the sqrt of the diagonal elements
W_12 = np.sqrt(W)
B = np.eye(K.shape[0]) + W_12*K*W_12.T
L = jitchol(B)
LiW12, _ = dtrtrs(L, np.diagflat(W_12), lower=1, trans=0)
K_Wi_i = np.dot(LiW12.T, LiW12) # R = W12BiW12, in R&W p 126, eq 5.25
#here's a better way to compute the required matrix.
# you could do the mode finding with a backsub, instead of a dot...
#L2 = L/W_12
#K_Wi_i_2 , _= dpotri(L2)
#symmetrify(K_Wi_i_2)
return K_Wi_i, L, LiW12
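# A minimal numpy sketch of the identity behind _compute_B_statistics: with
# B = I + W^{1/2} K W^{1/2}, the returned K_Wi_i = W^{1/2} B^{-1} W^{1/2} equals (K + W^{-1})^{-1}:
#
#   import numpy as np
#   rng = np.random.RandomState(0)
#   A = rng.randn(5, 5); K = A.dot(A.T) + 5.*np.eye(5)    # a positive definite "covariance"
#   W = rng.rand(5, 1) + 0.1                               # positive diagonal of -d2logpdf_df2
#   W_12 = np.sqrt(W)
#   B = np.eye(5) + W_12*K*W_12.T
#   K_Wi_i = W_12*np.linalg.solve(B, np.diagflat(W_12))    # W^{1/2} B^{-1} W^{1/2}
#   assert np.allclose(K_Wi_i, np.linalg.inv(K + np.diagflat(1./W)))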

View file

@ -0,0 +1,186 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ...util.linalg import pdinv, dpotrs, dpotri, symmetrify, jitchol
class Posterior(object):
"""
An object to represent a Gaussian posterior over latent function values, p(f|D).
This may be computed exactly for Gaussian likelihoods, or approximated for
non-Gaussian likelihoods.
The purpose of this class is to serve as an interface between the inference
schemes and the model classes. The model class can make predictions for
the function at any new point x_* by integrating over this posterior.
"""
def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None):
"""
woodbury_chol : a lower triangular matrix L that satisfies posterior_covariance = K - K L^{-T} L^{-1} K
woodbury_vector : a matrix (or vector, as Nx1 matrix) M which satisfies posterior_mean = K M
K : the prior covariance (required for lazy computation of various quantities)
mean : the posterior mean
cov : the posterior covariance
Not all of the above need to be supplied! You *must* supply:
K (for lazy computation)
or
K_chol (for lazy computation)
You may supply either:
woodbury_chol
woodbury_vector
Or:
mean
cov
Of course, you can supply more than that, but this class will lazily
compute all other quantities on demand.
"""
#obligatory
self._K = K
if ((woodbury_chol is not None) and (woodbury_vector is not None))\
or ((woodbury_inv is not None) and (woodbury_vector is not None))\
or ((woodbury_inv is not None) and (mean is not None))\
or ((mean is not None) and (cov is not None)):
pass # we have sufficient to compute the posterior
else:
raise ValueError, "insufficient information to compute the posterior"
self._K_chol = K_chol
self._K = K
#option 1:
self._woodbury_chol = woodbury_chol
self._woodbury_vector = woodbury_vector
#option 2.
self._woodbury_inv = woodbury_inv
#and woodbury vector
#option 3:
self._mean = mean
self._covariance = cov
#compute this lazily
self._precision = None
@property
def mean(self):
"""
Posterior mean
$$
K_{xx}v
v := \texttt{Woodbury vector}
$$
"""
if self._mean is None:
self._mean = np.dot(self._K, self.woodbury_vector)
return self._mean
@property
def covariance(self):
"""
Posterior covariance
$$
K_{xx} - K_{xx}W_{xx}^{-1}K_{xx}
W_{xx}^{-1} := \texttt{Woodbury inv}
$$
"""
if self._covariance is None:
#LiK, _ = dtrtrs(self.woodbury_chol, self._K, lower=1)
self._covariance = (np.atleast_3d(self._K) - np.tensordot(np.dot(np.atleast_3d(self.woodbury_inv).T, self._K), self._K, [1,0]).T).squeeze()
#self._covariance = self._K - self._K.dot(self.woodbury_inv).dot(self._K)
return self._covariance
@property
def precision(self):
"""
Inverse of posterior covariance
"""
if self._precision is None:
cov = np.atleast_3d(self.covariance)
self._precision = np.zeros(cov.shape) # if one covariance per dimension
for p in xrange(cov.shape[-1]):
self._precision[:,:,p] = pdinv(cov[:,:,p])[0]
return self._precision
@property
def woodbury_chol(self):
"""
return $L_{W}$ where L is the lower triangular Cholesky decomposition of the Woodbury matrix
$$
L_{W}L_{W}^{\top} = W_{xx}
W_{xx}^{-1} := \texttt{Woodbury inv}
$$
"""
if self._woodbury_chol is None:
#compute woodbury chol from
if self._woodbury_inv is not None:
winv = np.atleast_3d(self._woodbury_inv)
self._woodbury_chol = np.zeros(winv.shape)
for p in xrange(winv.shape[-1]):
self._woodbury_chol[:,:,p] = pdinv(winv[:,:,p])[2]
#Li = jitchol(self._woodbury_inv)
#self._woodbury_chol, _ = dtrtri(Li)
#W, _, _, _, = pdinv(self._woodbury_inv)
#symmetrify(W)
#self._woodbury_chol = jitchol(W)
#try computing woodbury chol from cov
elif self._covariance is not None:
raise NotImplementedError, "TODO: check code here"
B = self._K - self._covariance
tmp, _ = dpotrs(self.K_chol, B)
self._woodbury_inv, _ = dpotrs(self.K_chol, tmp.T)
_, _, self._woodbury_chol, _ = pdinv(self._woodbury_inv)
else:
raise ValueError, "insufficient information to compute posterior"
return self._woodbury_chol
@property
def woodbury_inv(self):
"""
The inverse of the Woodbury matrix. In the Gaussian likelihood case it is defined as
$$
(K_{xx} + \Sigma_{xx})^{-1}
\Sigma_{xx} := \texttt{Likelihood.variance / Approximate likelihood covariance}
$$
"""
if self._woodbury_inv is None:
if self._woodbury_chol is not None:
self._woodbury_inv, _ = dpotri(self._woodbury_chol, lower=1)
#self._woodbury_inv, _ = dpotrs(self.woodbury_chol, np.eye(self.woodbury_chol.shape[0]), lower=1)
symmetrify(self._woodbury_inv)
elif self._covariance is not None:
B = self._K - self._covariance
tmp, _ = dpotrs(self.K_chol, B)
self._woodbury_inv, _ = dpotrs(self.K_chol, tmp.T)
return self._woodbury_inv
@property
def woodbury_vector(self):
"""
In the Gaussian likelihood case the Woodbury vector is defined as
$$
(K_{xx} + \Sigma)^{-1}Y
\Sigma := \texttt{Likelihood.variance / Approximate likelihood covariance}
$$
"""
if self._woodbury_vector is None:
self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean)
return self._woodbury_vector
@property
def K_chol(self):
"""
Cholesky of the prior covariance K
"""
if self._K_chol is None:
self._K_chol = jitchol(self._K)
return self._K_chol
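# A usage sketch, assuming plain numpy arrays K (NxN), Y (Nx1) and a Gaussian noise
# variance s2; the exact posterior then has woodbury_inv = (K + s2*I)^{-1} and
# woodbury_vector = (K + s2*I)^{-1} Y:
#
#   import numpy as np
#   Ky = K + s2*np.eye(K.shape[0])
#   post = Posterior(woodbury_inv=np.linalg.inv(Ky),
#                    woodbury_vector=np.linalg.solve(Ky, Y), K=K)
#   post.mean         # lazily computed as K (K + s2 I)^{-1} Y
#   post.covariance   # lazily computed as K - K (K + s2 I)^{-1} K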

View file

@ -0,0 +1,249 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from posterior import Posterior
from ...util.linalg import mdot, jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify
from ...util import diag
from ...core.parameterization.variational import VariationalPosterior
import numpy as np
from . import LatentFunctionInference
log_2_pi = np.log(2*np.pi)
import logging, itertools
logger = logging.getLogger('vardtc')
class VarDTC(LatentFunctionInference):
"""
An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
The function self.inference returns a Posterior object, which summarizes
the posterior.
For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.
"""
const_jitter = 1e-6
def __init__(self, limit=1):
#self._YYTfactor_cache = caching.cache()
from ...util.caching import Cacher
self.limit = limit
self.get_trYYT = Cacher(self._get_trYYT, limit)
self.get_YYTfactor = Cacher(self._get_YYTfactor, limit)
def set_limit(self, limit):
self.get_trYYT.limit = limit
self.get_YYTfactor.limit = limit
def _get_trYYT(self, Y):
return np.einsum("ij,ij->", Y, Y)
# faster than, but same as:
# return np.sum(np.square(Y))
def __getstate__(self):
# has to be overridden, as Cacher objects cannot be pickled.
return self.limit
def __setstate__(self, state):
# has to be overridden, as Cacher objects cannot be pickled.
self.limit = state
from ...util.caching import Cacher
self.get_trYYT = Cacher(self._get_trYYT, self.limit)
self.get_YYTfactor = Cacher(self._get_YYTfactor, self.limit)
def _get_YYTfactor(self, Y):
"""
find a matrix L which satisfies LLT = YYT.
Note that L may have fewer columns than Y.
"""
N, D = Y.shape
if (N>=D):
return Y.view(np.ndarray)
else:
return jitchol(tdot(Y))
def get_VVTfactor(self, Y, prec):
return Y * prec # TODO: cache this, and make it efficient
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None):
_, output_dim = Y.shape
uncertain_inputs = isinstance(X, VariationalPosterior)
#see whether we've got a different noise variance for each datum
beta = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6)
# VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
#self.YYTfactor = self.get_YYTfactor(Y)
#VVT_factor = self.get_VVTfactor(self.YYTfactor, beta)
het_noise = beta.size > 1
if beta.ndim == 1:
beta = beta[:, None]
VVT_factor = beta*Y
#VVT_factor = beta*Y
trYYT = self.get_trYYT(Y)
# do the inference:
num_inducing = Z.shape[0]
num_data = Y.shape[0]
# kernel computations, using BGPLVM notation
Kmm = kern.K(Z).copy()
diag.add(Kmm, self.const_jitter)
if Lm is None:
Lm = jitchol(Kmm)
# The rather complex computations of A, and the psi stats
if uncertain_inputs:
psi0 = kern.psi0(Z, X)
psi1 = kern.psi1(Z, X)
if het_noise:
psi2_beta = np.sum([kern.psi2(Z,X[i:i+1,:]) * beta_i for i,beta_i in enumerate(beta)],0)
else:
psi2_beta = kern.psi2(Z,X) * beta
LmInv = dtrtri(Lm)
A = LmInv.dot(psi2_beta.dot(LmInv.T))
else:
psi0 = kern.Kdiag(X)
psi1 = kern.K(X, Z)
if het_noise:
tmp = psi1 * (np.sqrt(beta))
else:
tmp = psi1 * (np.sqrt(beta))
tmp, _ = dtrtrs(Lm, tmp.T, lower=1)
A = tdot(tmp) #print A.sum()
# factor B
B = np.eye(num_inducing) + A
LB = jitchol(B)
psi1Vf = np.dot(psi1.T, VVT_factor)
# back substitute C into psi1Vf
tmp, _ = dtrtrs(Lm, psi1Vf, lower=1, trans=0)
_LBi_Lmi_psi1Vf, _ = dtrtrs(LB, tmp, lower=1, trans=0)
tmp, _ = dtrtrs(LB, _LBi_Lmi_psi1Vf, lower=1, trans=1)
Cpsi1Vf, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
# data fit and derivative of L w.r.t. Kmm
delit = tdot(_LBi_Lmi_psi1Vf)
data_fit = np.trace(delit)
DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
if dL_dKmm is None:
delit = -0.5 * DBi_plus_BiPBi
delit += -0.5 * B * output_dim
delit += output_dim * np.eye(num_inducing)
# Compute dL_dKmm
dL_dKmm = backsub_both_sides(Lm, delit)
# derivatives of L w.r.t. psi
dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm,
VVT_factor, Cpsi1Vf, DBi_plus_BiPBi,
psi1, het_noise, uncertain_inputs)
# log marginal likelihood
log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise,
psi0, A, LB, trYYT, data_fit, Y)
#noise derivatives
dL_dR = _compute_dL_dR(likelihood,
het_noise, uncertain_inputs, LB,
_LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A,
psi0, psi1, beta,
data_fit, num_data, output_dim, trYYT, Y, VVT_factor)
dL_dthetaL = likelihood.exact_inference_gradients(dL_dR,Y_metadata)
#put the gradients in the right places
if uncertain_inputs:
grad_dict = {'dL_dKmm': dL_dKmm,
'dL_dpsi0':dL_dpsi0,
'dL_dpsi1':dL_dpsi1,
'dL_dpsi2':dL_dpsi2,
'dL_dthetaL':dL_dthetaL}
else:
grad_dict = {'dL_dKmm': dL_dKmm,
'dL_dKdiag':dL_dpsi0,
'dL_dKnm':dL_dpsi1,
'dL_dthetaL':dL_dthetaL}
#get sufficient things for posterior prediction
#TODO: do we really want to do this in the loop?
if VVT_factor.shape[1] == Y.shape[1]:
woodbury_vector = Cpsi1Vf # == Cpsi1V
else:
print 'foobar'
import ipdb; ipdb.set_trace()
psi1V = np.dot(Y.T*beta, psi1).T
tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
tmp, _ = dpotrs(LB, tmp, lower=1)
woodbury_vector, _ = dtrtrs(Lm, tmp, lower=1, trans=1)
Bi, _ = dpotri(LB, lower=1)
symmetrify(Bi)
Bi = -dpotri(LB, lower=1)[0]
diag.add(Bi, 1)
woodbury_inv = backsub_both_sides(Lm, Bi)
#construct a posterior object
post = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, K=Kmm, mean=None, cov=None, K_chol=Lm)
return post, log_marginal, grad_dict
def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, Cpsi1Vf, DBi_plus_BiPBi, psi1, het_noise, uncertain_inputs):
dL_dpsi0 = -0.5 * output_dim * (beta* np.ones([num_data, 1])).flatten()
dL_dpsi1 = np.dot(VVT_factor, Cpsi1Vf.T)
dL_dpsi2_beta = 0.5 * backsub_both_sides(Lm, output_dim * np.eye(num_inducing) - DBi_plus_BiPBi)
if het_noise:
if uncertain_inputs:
dL_dpsi2 = beta[:, None] * dL_dpsi2_beta[None, :, :]
else:
dL_dpsi1 += 2.*np.dot(dL_dpsi2_beta, (psi1 * beta).T).T
dL_dpsi2 = None
else:
dL_dpsi2 = beta * dL_dpsi2_beta
if not uncertain_inputs:
# subsume back into psi1 (==Kmn)
dL_dpsi1 += 2.*np.dot(psi1, dL_dpsi2)
dL_dpsi2 = None
return dL_dpsi0, dL_dpsi1, dL_dpsi2
def _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, psi0, psi1, beta, data_fit, num_data, output_dim, trYYT, Y, VVT_factor=None):
# the partial derivative vector for the likelihood
if likelihood.size == 0:
# save computation here.
dL_dR = None
elif het_noise:
if uncertain_inputs:
raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented"
else:
#from ...util.linalg import chol_inv
#LBi = chol_inv(LB)
LBi, _ = dtrtrs(LB,np.eye(LB.shape[0]))
Lmi_psi1, nil = dtrtrs(Lm, psi1.T, lower=1, trans=0)
_LBi_Lmi_psi1, _ = dtrtrs(LB, Lmi_psi1, lower=1, trans=0)
dL_dR = -0.5 * beta + 0.5 * VVT_factor**2
dL_dR += 0.5 * output_dim * (psi0 - np.sum(Lmi_psi1**2,0))[:,None] * beta**2
dL_dR += 0.5*np.sum(mdot(LBi.T,LBi,Lmi_psi1)*Lmi_psi1,0)[:,None]*beta**2
dL_dR += -np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T * Y * beta**2
dL_dR += 0.5*np.dot(_LBi_Lmi_psi1Vf.T,_LBi_Lmi_psi1).T**2 * beta**2
else:
# likelihood is not heteroscedastic
dL_dR = -0.5 * num_data * output_dim * beta + 0.5 * trYYT * beta ** 2
dL_dR += 0.5 * output_dim * (psi0.sum() * beta ** 2 - np.trace(A) * beta)
dL_dR += beta * (0.5 * np.sum(A * DBi_plus_BiPBi) - data_fit)
return dL_dR
def _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, psi0, A, LB, trYYT, data_fit, Y):
#compute log marginal likelihood
if het_noise:
lik_1 = -0.5 * num_data * output_dim * np.log(2. * np.pi) + 0.5 * output_dim * np.sum(np.log(beta)) - 0.5 * np.sum(beta.ravel() * np.square(Y).sum(axis=-1))
lik_2 = -0.5 * output_dim * (np.sum(beta.flatten() * psi0) - np.trace(A))
else:
lik_1 = -0.5 * num_data * output_dim * (np.log(2. * np.pi) - np.log(beta)) - 0.5 * beta * trYYT
lik_2 = -0.5 * output_dim * (np.sum(beta * psi0) - np.trace(A))
lik_3 = -output_dim * (np.sum(np.log(np.diag(LB))))
lik_4 = 0.5 * data_fit
log_marginal = lik_1 + lik_2 + lik_3 + lik_4
return log_marginal
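# A usage sketch, assuming an RBF kernel and Gaussian likelihood from GPy's public API
# (normally this class is driven by the SparseGP model classes rather than called directly):
#
#   import numpy as np, GPy
#   X = np.random.randn(100, 1); Y = np.sin(X) + 0.1*np.random.randn(100, 1)
#   Z = np.random.randn(10, 1)                       # inducing inputs
#   post, log_marginal, grad_dict = VarDTC(limit=1).inference(
#       GPy.kern.RBF(1), X, Z, GPy.likelihoods.Gaussian(), Y)
#   grad_dict.keys()   # dL_dKmm, dL_dKdiag, dL_dKnm, dL_dthetaL for certain inputs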

View file

@ -0,0 +1,479 @@
# Copyright (c) 2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from posterior import Posterior
from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri,pdinv
from ...util import diag
from ...core.parameterization.variational import VariationalPosterior
import numpy as np
from . import LatentFunctionInference
log_2_pi = np.log(2*np.pi)
try:
from mpi4py import MPI
except:
pass
class VarDTC_minibatch(LatentFunctionInference):
"""
An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
The function self.inference returns a Posterior object, which summarizes
the posterior.
For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.
"""
const_jitter = 1e-6
def __init__(self, batchsize=None, limit=1, mpi_comm=None):
self.batchsize = batchsize
self.mpi_comm = mpi_comm
self.limit = limit
# Cache functions
from ...util.caching import Cacher
self.get_trYYT = Cacher(self._get_trYYT, limit)
self.get_YYTfactor = Cacher(self._get_YYTfactor, limit)
self.midRes = {}
self.batch_pos = 0 # the starting position of the current mini-batch
self.Y_speedup = False # Replace Y with the cholesky factor of YY.T, but the computation of posterior object will be skipped.
def __getstate__(self):
# has to be overridden, as Cacher objects cannot be pickled.
return self.batchsize, self.limit, self.Y_speedup
def __setstate__(self, state):
# has to be overridden, as Cacher objects cannot be pickled.
self.batchsize, self.limit, self.Y_speedup = state
self.mpi_comm = None
self.midRes = {}
self.batch_pos = 0
from ...util.caching import Cacher
self.get_trYYT = Cacher(self._get_trYYT, self.limit)
self.get_YYTfactor = Cacher(self._get_YYTfactor, self.limit)
def set_limit(self, limit):
self.get_trYYT.limit = limit
self.get_YYTfactor.limit = limit
def _get_trYYT(self, Y):
return np.sum(np.square(Y))
def _get_YYTfactor(self, Y):
"""
find a matrix L which satisfies LLT = YYT.
Note that L may have fewer columns than Y.
"""
N, D = Y.shape
if (N>=D):
return Y.view(np.ndarray)
else:
return jitchol(tdot(Y))
def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs):
het_noise = beta.size > 1
assert beta.size == 1
trYYT = self.get_trYYT(Y)
if self.Y_speedup and not het_noise:
Y = self.get_YYTfactor(Y)
num_inducing = Z.shape[0]
num_data, output_dim = Y.shape
batchsize = num_data if self.batchsize is None else self.batchsize
psi2_full = np.zeros((num_inducing,num_inducing)) # MxM
psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM
psi0_full = 0.
YRY_full = 0.
for n_start in xrange(0,num_data,batchsize):
n_end = min(batchsize+n_start, num_data)
if batchsize==num_data:
Y_slice = Y
X_slice = X
else:
Y_slice = Y[n_start:n_end]
X_slice = X[n_start:n_end]
if het_noise:
b = beta[n_start]
YRY_full += np.inner(Y_slice, Y_slice)*b
else:
b = beta
if uncertain_inputs:
psi0 = kern.psi0(Z, X_slice)
psi1 = kern.psi1(Z, X_slice)
psi2_full += kern.psi2(Z, X_slice)*b
else:
psi0 = kern.Kdiag(X_slice)
psi1 = kern.K(X_slice, Z)
psi2_full += np.dot(psi1.T,psi1)*b
psi0_full += psi0.sum()*b
psi1Y_full += np.dot(Y_slice.T,psi1)*b # DxM
if not het_noise:
YRY_full = trYYT*beta
if self.mpi_comm != None:
psi0_all = np.array(psi0_full)
psi1Y_all = psi1Y_full.copy()
psi2_all = psi2_full.copy()
YRY_all = np.array(YRY_full)
self.mpi_comm.Allreduce([psi0_full, MPI.DOUBLE], [psi0_all, MPI.DOUBLE])
self.mpi_comm.Allreduce([psi1Y_full, MPI.DOUBLE], [psi1Y_all, MPI.DOUBLE])
self.mpi_comm.Allreduce([psi2_full, MPI.DOUBLE], [psi2_all, MPI.DOUBLE])
self.mpi_comm.Allreduce([YRY_full, MPI.DOUBLE], [YRY_all, MPI.DOUBLE])
return psi0_all, psi1Y_all, psi2_all, YRY_all
return psi0_full, psi1Y_full, psi2_full, YRY_full
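# A reduction sketch, assuming mpi4py and that each rank holds a slice of the data: the
# Allreduce calls above simply sum the local psi statistics over all ranks in place, e.g.
#
#   local_psi2 = np.zeros((num_inducing, num_inducing))   # this rank's contribution
#   total_psi2 = np.zeros_like(local_psi2)
#   mpi_comm.Allreduce([local_psi2, MPI.DOUBLE], [total_psi2, MPI.DOUBLE])
#   # total_psi2 now holds the sum over all ranks, on every rank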
def inference_likelihood(self, kern, X, Z, likelihood, Y):
"""
The first phase of inference:
Compute: log-likelihood, dL_dKmm
Cached intermediate results: Kmm, KmmInv,
"""
num_data, output_dim = Y.shape
input_dim = Z.shape[0]
if self.mpi_comm != None:
num_data_all = np.array(num_data,dtype=np.int32)
self.mpi_comm.Allreduce([np.int32(num_data), MPI.INT], [num_data_all, MPI.INT])
num_data = num_data_all
if isinstance(X, VariationalPosterior):
uncertain_inputs = True
else:
uncertain_inputs = False
#see whether we've got a different noise variance for each datum
beta = 1./np.fmax(likelihood.variance, 1e-6)
het_noise = beta.size > 1
if het_noise:
self.batchsize = 1
psi0_full, psi1Y_full, psi2_full, YRY_full = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)
#======================================================================
# Compute Common Components
#======================================================================
Kmm = kern.K(Z).copy()
diag.add(Kmm, self.const_jitter)
Lm = jitchol(Kmm, maxtries=100)
LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right')
Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT
LL = jitchol(Lambda, maxtries=100)
logdet_L = 2.*np.sum(np.log(np.diag(LL)))
b = dtrtrs(LL,dtrtrs(Lm,psi1Y_full.T)[0])[0]
bbt = np.square(b).sum()
v = dtrtrs(Lm,dtrtrs(LL,b,trans=1)[0],trans=1)[0]
tmp = -backsub_both_sides(LL, tdot(b)+output_dim*np.eye(input_dim), transpose='left')
dL_dpsi2R = backsub_both_sides(Lm, tmp+output_dim*np.eye(input_dim), transpose='left')/2.
# Cache intermediate results
self.midRes['dL_dpsi2R'] = dL_dpsi2R
self.midRes['v'] = v
#======================================================================
# Compute log-likelihood
#======================================================================
if het_noise:
logL_R = -np.log(beta).sum()
else:
logL_R = -num_data*np.log(beta)
logL = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-np.trace(LmInvPsi2LmInvT))+YRY_full-bbt)/2.-output_dim*logdet_L/2.
#======================================================================
# Compute dL_dKmm
#======================================================================
dL_dKmm = dL_dpsi2R - output_dim*backsub_both_sides(Lm, LmInvPsi2LmInvT, transpose='left')/2.
#======================================================================
# Compute the Posterior distribution of inducing points p(u|Y)
#======================================================================
if not self.Y_speedup or het_noise:
wd_inv = backsub_both_sides(Lm, np.eye(input_dim)- backsub_both_sides(LL, np.identity(input_dim), transpose='left'), transpose='left')
post = Posterior(woodbury_inv=wd_inv, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=Lm)
else:
post = None
#======================================================================
# Compute dL_dthetaL for uncertain inputs and non-heteroscedastic noise
#======================================================================
if not het_noise:
dL_dthetaL = (YRY_full*beta + beta*output_dim*psi0_full - num_data*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2_full).sum() - beta*(v.T*psi1Y_full).sum()
self.midRes['dL_dthetaL'] = dL_dthetaL
return logL, dL_dKmm, post
def inference_minibatch(self, kern, X, Z, likelihood, Y):
"""
The second phase of inference: Computing the derivatives over a minibatch of Y
Compute: dL_dpsi0, dL_dpsi1, dL_dpsi2, dL_dthetaL
return a flag showing whether it reached the end of Y (isEnd)
"""
num_data, output_dim = Y.shape
if isinstance(X, VariationalPosterior):
uncertain_inputs = True
else:
uncertain_inputs = False
#see whether we've got a different noise variance for each datum
beta = 1./np.fmax(likelihood.variance, 1e-6)
het_noise = beta.size > 1
# VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
#self.YYTfactor = beta*self.get_YYTfactor(Y)
if self.Y_speedup and not het_noise:
YYT_factor = self.get_YYTfactor(Y)
else:
YYT_factor = Y
n_start = self.batch_pos
batchsize = num_data if self.batchsize is None else self.batchsize
n_end = min(batchsize+n_start, num_data)
if n_end==num_data:
isEnd = True
self.batch_pos = 0
else:
isEnd = False
self.batch_pos = n_end
if batchsize==num_data:
Y_slice = YYT_factor
X_slice =X
else:
Y_slice = YYT_factor[n_start:n_end]
X_slice = X[n_start:n_end]
if not uncertain_inputs:
psi0 = kern.Kdiag(X_slice)
psi1 = kern.K(X_slice, Z)
psi2 = None
betapsi1 = np.einsum('n,nm->nm',beta,psi1)
elif het_noise:
psi0 = kern.psi0(Z, X_slice)
psi1 = kern.psi1(Z, X_slice)
psi2 = kern.psi2(Z, X_slice)
betapsi1 = np.einsum('n,nm->nm',beta,psi1)
if het_noise:
beta = beta[n_start] # assuming batchsize==1
betaY = beta*Y_slice
#======================================================================
# Load Intermediate Results
#======================================================================
dL_dpsi2R = self.midRes['dL_dpsi2R']
v = self.midRes['v']
#======================================================================
# Compute dL_dpsi
#======================================================================
dL_dpsi0 = -output_dim * (beta * np.ones((n_end-n_start,)))/2.
dL_dpsi1 = np.dot(betaY,v.T)
if uncertain_inputs:
dL_dpsi2 = beta* dL_dpsi2R
else:
dL_dpsi1 += np.dot(betapsi1,dL_dpsi2R)*2.
dL_dpsi2 = None
#======================================================================
# Compute dL_dthetaL
#======================================================================
if het_noise:
if uncertain_inputs:
psiR = np.einsum('mo,mo->',dL_dpsi2R,psi2)
else:
psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R)
dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1)
else:
if isEnd:
dL_dthetaL = self.midRes['dL_dthetaL']
else:
dL_dthetaL = 0.
if uncertain_inputs:
grad_dict = {'dL_dpsi0':dL_dpsi0,
'dL_dpsi1':dL_dpsi1,
'dL_dpsi2':dL_dpsi2,
'dL_dthetaL':dL_dthetaL}
else:
grad_dict = {'dL_dKdiag':dL_dpsi0,
'dL_dKnm':dL_dpsi1,
'dL_dthetaL':dL_dthetaL}
return isEnd, (n_start,n_end), grad_dict
def update_gradients(model, mpi_comm=None):
if mpi_comm == None:
Y = model.Y
X = model.X
else:
Y = model.Y_local
X = model.X[model.N_range[0]:model.N_range[1]]
model._log_marginal_likelihood, dL_dKmm, model.posterior = model.inference_method.inference_likelihood(model.kern, X, model.Z, model.likelihood, Y)
het_noise = model.likelihood.variance.size > 1
if het_noise:
dL_dthetaL = np.empty((model.Y.shape[0],))
else:
dL_dthetaL = np.float64(0.)
kern_grad = model.kern.gradient.copy()
kern_grad[:] = 0.
model.Z.gradient = 0.
isEnd = False
while not isEnd:
isEnd, n_range, grad_dict = model.inference_method.inference_minibatch(model.kern, X, model.Z, model.likelihood, Y)
if isinstance(model.X, VariationalPosterior):
if (n_range[1]-n_range[0])==X.shape[0]:
X_slice = X
elif mpi_comm ==None:
X_slice = model.X[n_range[0]:n_range[1]]
else:
X_slice = model.X[model.N_range[0]+n_range[0]:model.N_range[0]+n_range[1]]
#gradients w.r.t. kernel
model.kern.update_gradients_expectations(variational_posterior=X_slice, Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'])
kern_grad += model.kern.gradient
#gradients w.r.t. Z
model.Z.gradient += model.kern.gradients_Z_expectations(
dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=X_slice)
#gradients w.r.t. posterior parameters of X
X_grad = model.kern.gradients_qX_expectations(variational_posterior=X_slice, Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'])
model.set_X_gradients(X_slice, X_grad)
if het_noise:
dL_dthetaL[n_range[0]:n_range[1]] = grad_dict['dL_dthetaL']
else:
dL_dthetaL += grad_dict['dL_dthetaL']
# Gather the gradients from multiple MPI nodes
if mpi_comm != None:
if het_noise:
raise "het_noise not implemented!"
kern_grad_all = kern_grad.copy()
Z_grad_all = model.Z.gradient.copy()
mpi_comm.Allreduce([kern_grad, MPI.DOUBLE], [kern_grad_all, MPI.DOUBLE])
mpi_comm.Allreduce([model.Z.gradient, MPI.DOUBLE], [Z_grad_all, MPI.DOUBLE])
kern_grad = kern_grad_all
model.Z.gradient = Z_grad_all
#gradients w.r.t. kernel
model.kern.update_gradients_full(dL_dKmm, model.Z, None)
model.kern.gradient += kern_grad
#gradients w.r.t. Z
model.Z.gradient += model.kern.gradients_X(dL_dKmm, model.Z)
# Update Log-likelihood
KL_div = model.variational_prior.KL_divergence(X)
# update for the KL divergence
model.variational_prior.update_gradients_KL(X)
if mpi_comm != None:
KL_div_all = np.array(KL_div)
mpi_comm.Allreduce([np.float64(KL_div), MPI.DOUBLE], [KL_div_all, MPI.DOUBLE])
KL_div = KL_div_all
[mpi_comm.Allgatherv([pp.copy(), MPI.DOUBLE], [pa, (model.N_list*pa.shape[-1], None), MPI.DOUBLE]) for pp,pa in zip(model.get_X_gradients(X),model.get_X_gradients(model.X))]
# from ...models import SSGPLVM
# if isinstance(model, SSGPLVM):
# grad_pi = np.array(model.variational_prior.pi.gradient)
# mpi_comm.Allreduce([grad_pi.copy(), MPI.DOUBLE], [model.variational_prior.pi.gradient, MPI.DOUBLE])
model._log_marginal_likelihood -= KL_div
# dL_dthetaL
model.likelihood.update_gradients(dL_dthetaL)
def update_gradients_sparsegp(model, mpi_comm=None):
if mpi_comm == None:
Y = model.Y
X = model.X
else:
Y = model.Y_local
X = model.X[model.N_range[0]:model.N_range[1]]
model._log_marginal_likelihood, dL_dKmm, model.posterior = model.inference_method.inference_likelihood(model.kern, X, model.Z, model.likelihood, Y)
het_noise = model.likelihood.variance.size > 1
if het_noise:
dL_dthetaL = np.empty((model.Y.shape[0],))
else:
dL_dthetaL = np.float64(0.)
kern_grad = model.kern.gradient.copy()
kern_grad[:] = 0.
model.Z.gradient = 0.
isEnd = False
while not isEnd:
isEnd, n_range, grad_dict = model.inference_method.inference_minibatch(model.kern, X, model.Z, model.likelihood, Y)
if (n_range[1]-n_range[0])==X.shape[0]:
X_slice = X
elif mpi_comm ==None:
X_slice = model.X[n_range[0]:n_range[1]]
else:
X_slice = model.X[model.N_range[0]+n_range[0]:model.N_range[0]+n_range[1]]
model.kern.update_gradients_diag(grad_dict['dL_dKdiag'], X_slice)
kern_grad += model.kern.gradient
model.kern.update_gradients_full(grad_dict['dL_dKnm'], X_slice, model.Z)
kern_grad += model.kern.gradient
model.Z.gradient += model.kern.gradients_X(grad_dict['dL_dKnm'].T, model.Z, X_slice)
if het_noise:
dL_dthetaL[n_range[0]:n_range[1]] = grad_dict['dL_dthetaL']
else:
dL_dthetaL += grad_dict['dL_dthetaL']
# Gather the gradients from multiple MPI nodes
if mpi_comm != None:
if het_noise:
raise "het_noise not implemented!"
kern_grad_all = kern_grad.copy()
Z_grad_all = model.Z.gradient.copy()
mpi_comm.Allreduce([kern_grad, MPI.DOUBLE], [kern_grad_all, MPI.DOUBLE])
mpi_comm.Allreduce([model.Z.gradient, MPI.DOUBLE], [Z_grad_all, MPI.DOUBLE])
kern_grad = kern_grad_all
model.Z.gradient = Z_grad_all
model.kern.update_gradients_full(dL_dKmm, model.Z, None)
model.kern.gradient += kern_grad
model.Z.gradient += model.kern.gradients_X(dL_dKmm, model.Z)
# dL_dthetaL
model.likelihood.update_gradients(dL_dthetaL)
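# A driver sketch of the two-phase scheme above (this mirrors update_gradients): phase one
# computes the bound and dL_dKmm once, phase two streams over minibatches of the data:
#
#   logL, dL_dKmm, post = inference_method.inference_likelihood(kern, X, Z, likelihood, Y)
#   isEnd = False
#   while not isEnd:
#       isEnd, (n_start, n_end), grad_dict = inference_method.inference_minibatch(
#           kern, X, Z, likelihood, Y)
#       # accumulate grad_dict into the kernel, Z and likelihood gradients for this slice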

View file

@ -0,0 +1 @@
from hmc import HMC

174
GPy/inference/mcmc/hmc.py Normal file
View file

@ -0,0 +1,174 @@
# ## Copyright (c) 2014, Zhenwen Dai
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
class HMC:
"""
An implementation of Hybrid Monte Carlo (HMC) for GPy models
Initialize an object for HMC sampling. Note that the status of the model (model parameters) will be changed during sampling.
:param model: the GPy model that will be sampled
:type model: GPy.core.Model
:param M: the mass matrix (an identity matrix by default)
:type M: numpy.ndarray
:param stepsize: the step size for HMC sampling
:type stepsize: float
"""
def __init__(self, model, M=None,stepsize=1e-1):
self.model = model
self.stepsize = stepsize
self.p = np.empty_like(model.optimizer_array.copy())
if M is None:
self.M = np.eye(self.p.size)
else:
self.M = M
self.Minv = np.linalg.inv(self.M)
def sample(self, num_samples=1000, hmc_iters=20):
"""
Sample the (unfixed) model parameters.
:param num_samples: the number of samples to draw (1000 by default)
:type num_samples: int
:param hmc_iters: the number of leap-frog iterations (20 by default)
:type hmc_iters: int
:return: the array of parameter samples with shape N x P (N - the number of samples, P - the number of parameters to sample)
:rtype: numpy.ndarray
"""
params = np.empty((num_samples,self.p.size))
for i in xrange(num_samples):
self.p[:] = np.random.multivariate_normal(np.zeros(self.p.size),self.M)
H_old = self._computeH()
theta_old = self.model.optimizer_array.copy()
params[i] = self.model.unfixed_param_array
#Metropolis
self._update(hmc_iters)
H_new = self._computeH()
if H_old>H_new:
k = 1.
else:
k = np.exp(H_old-H_new)
if np.random.rand()<k:
params[i] = self.model.unfixed_param_array
else:
self.model.optimizer_array = theta_old
return params
def _update(self, hmc_iters):
for i in xrange(hmc_iters):
self.p[:] += -self.stepsize/2.*self.model._transform_gradients(self.model.objective_function_gradients())
self.model.optimizer_array = self.model.optimizer_array + self.stepsize*np.dot(self.Minv, self.p)
self.p[:] += -self.stepsize/2.*self.model._transform_gradients(self.model.objective_function_gradients())
def _computeH(self,):
return self.model.objective_function()+self.p.size*np.log(2*np.pi)/2.+np.log(np.linalg.det(self.M))/2.+np.dot(self.p, np.dot(self.Minv,self.p[:,None]))/2.
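# A usage sketch, assuming `m` is any GPy model (e.g. GPy.models.GPRegression(X, Y)) with
# priors set on the parameters to be sampled; _computeH above is the Hamiltonian
# H = -log p(theta|D) + 0.5*log((2*pi)^P |M|) + 0.5*p' M^{-1} p used in the accept step:
#
#   hmc = HMC(m, stepsize=2e-2)
#   samples = hmc.sample(num_samples=500, hmc_iters=20)   # (500, number of unfixed parameters)
#   param_means = samples.mean(axis=0)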
class HMC_shortcut:
def __init__(self,model,M=None,stepsize_range=[1e-6, 1e-1],groupsize=5, Hstd_th=[1e-5, 3.]):
self.model = model
self.stepsize_range = np.log(stepsize_range)
self.p = np.empty_like(model.optimizer_array.copy())
self.groupsize = groupsize
self.Hstd_th = Hstd_th
if M is None:
self.M = np.eye(self.p.size)
else:
self.M = M
self.Minv = np.linalg.inv(self.M)
def sample(self, m_iters=1000, hmc_iters=20):
params = np.empty((m_iters,self.p.size))
for i in xrange(m_iters):
# sample a stepsize from the uniform distribution
stepsize = np.exp(np.random.rand()*(self.stepsize_range[1]-self.stepsize_range[0])+self.stepsize_range[0])
self.p[:] = np.random.multivariate_normal(np.zeros(self.p.size),self.M)
H_old = self._computeH()
params[i] = self.model.unfixed_param_array
theta_old = self.model.optimizer_array.copy()
#Metropolis
self._update(hmc_iters, stepsize)
H_new = self._computeH()
if H_old>H_new:
k = 1.
else:
k = np.exp(H_old-H_new)
if np.random.rand()<k:
params[i] = self.model.unfixed_param_array
else:
self.model.optimizer_array = theta_old
return params
def _update(self, hmc_iters, stepsize):
theta_buf = np.empty((2*hmc_iters+1,self.model.optimizer_array.size))
p_buf = np.empty((2*hmc_iters+1,self.p.size))
H_buf = np.empty((2*hmc_iters+1,))
# Set initial position
theta_buf[hmc_iters] = self.model.optimizer_array
p_buf[hmc_iters] = self.p
H_buf[hmc_iters] = self._computeH()
reversal = []
pos = 1
i=0
while i<hmc_iters:
self.p[:] += -stepsize/2.*self.model._transform_gradients(self.model.objective_function_gradients())
self.model.optimizer_array = self.model.optimizer_array + stepsize*np.dot(self.Minv, self.p)
self.p[:] += -stepsize/2.*self.model._transform_gradients(self.model.objective_function_gradients())
theta_buf[hmc_iters+pos] = self.model.optimizer_array
p_buf[hmc_iters+pos] = self.p
H_buf[hmc_iters+pos] = self._computeH()
i+=1
if i<self.groupsize:
pos += 1
continue
else:
if len(reversal)==0:
Hlist = range(hmc_iters+pos,hmc_iters+pos-self.groupsize,-1)
if self._testH(H_buf[Hlist]):
pos += 1
else:
# Reverse the trajectory for the 1st time
reversal.append(pos)
if hmc_iters-i>pos:
pos = -1
i += pos
self.model.optimizer_array = theta_buf[hmc_iters]
self.p[:] = -p_buf[hmc_iters]
else:
pos_new = pos-hmc_iters+i
self.model.optimizer_array = theta_buf[hmc_iters+pos_new]
self.p[:] = -p_buf[hmc_iters+pos_new]
break
else:
Hlist = range(hmc_iters+pos,hmc_iters+pos+self.groupsize)
if self._testH(H_buf[Hlist]):
pos += -1
else:
# Reverse the trajectory for the 2nd time
r = (hmc_iters - i)%((reversal[0]-pos)*2)
if r>(reversal[0]-pos):
pos_new = 2*reversal[0] - r - pos
else:
pos_new = pos + r
self.model.optimizer_array = theta_buf[hmc_iters+pos_new]
self.p[:] = p_buf[hmc_iters+pos_new] # the sign of momentum might be wrong!
break
def _testH(self, Hlist):
Hstd = np.std(Hlist)
if Hstd<self.Hstd_th[0] or Hstd>self.Hstd_th[1]:
return False
else:
return True
def _computeH(self,):
return self.model.objective_function()+self.p.size*np.log(2*np.pi)/2.+np.log(np.linalg.det(self.M))/2.+np.dot(self.p, np.dot(self.Minv,self.p[:,None]))/2.

View file

@ -1,10 +1,9 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# ## Copyright (c) 2014, Zhenwen Dai
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from scipy import linalg, optimize
-import pylab as pb
 import Tango
 import sys
 import re
@ -80,6 +79,3 @@ class Metropolis_Hastings:
 fs.append(function(*args))
 self.model._set_params(param)# reset model to starting state
 return fs

View file

@ -0,0 +1,2 @@
from scg import SCG
from optimization import *

View file

@ -1,9 +1,7 @@
-'''
-Created on 24 Apr 2013
-@author: maxz
-'''
-from GPy.inference.gradient_descent_update_rules import FletcherReeves, \
+# Copyright (c) 2012-2014, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+from gradient_descent_update_rules import FletcherReeves, \
 PolakRibiere
 from Queue import Empty
 from multiprocessing import Value

View file

@ -1,8 +1,6 @@
-'''
-Created on 24 Apr 2013
-@author: maxz
-'''
+# Copyright (c) 2012-2014, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy
 class GDUpdateRule():

View file

@ -1,7 +1,6 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-import pylab as pb
 import datetime as dt
 from scipy import optimize
 from warnings import warn
@ -57,13 +56,14 @@ class Optimizer():
raise NotImplementedError, "this needs to be implemented to use the optimizer class" raise NotImplementedError, "this needs to be implemented to use the optimizer class"
def plot(self): def plot(self):
if self.trace == None: """
print "No trace present so I can't plot it. Please check that the optimizer actually supplies a trace." See GPy.plotting.matplot_dep.inference_plots
else: """
pb.figure() import sys
pb.plot(self.trace) assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
pb.xlabel('Iteration') from ...plotting.matplot_dep import inference_plots
pb.ylabel('f(x)') inference_plots.plot_optimizer(self)
def __str__(self): def __str__(self):
diagnostics = "Optimizer: \t\t\t\t %s\n" % self.opt_name diagnostics = "Optimizer: \t\t\t\t %s\n" % self.opt_name
@ -118,7 +118,7 @@ class opt_lbfgsb(Optimizer):
 assert f_fp != None, "BFGS requires f_fp"
 if self.messages:
-iprint = 0
+iprint = 1
 else:
 iprint = -1
@ -126,29 +126,18 @@ class opt_lbfgsb(Optimizer):
 if self.xtol is not None:
 print "WARNING: l-bfgs-b doesn't have an xtol arg, so I'm going to ignore it"
 if self.ftol is not None:
-opt_dict['ftol'] = self.ftol
-# print "WARNING: l-bfgs-b doesn't have an ftol arg, so I'm going to ignore it"
+print "WARNING: l-bfgs-b doesn't have an ftol arg, so I'm going to ignore it"
 if self.gtol is not None:
-opt_dict['gtol'] = self.gtol
+opt_dict['pgtol'] = self.gtol
 if self.bfgs_factor is not None:
 opt_dict['factr'] = self.bfgs_factor
-opt_dict['iprint'] = iprint
-opt_dict['maxiter'] = self.max_iters
-opt_dict['disp'] = self.messages
-#dict(maxiter=self.max_iters, disp=self.messages, iprint=iprint, ftol=self.ftol, gtol=self.gtol)
-opt_result = optimize.minimize(f_fp, self.x_init, method='L-BFGS-B', jac=True, options=opt_dict)
-#opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint=iprint,
-# maxfun=self.max_iters, **opt_dict)
-#self.x_opt = opt_result[0]
-#self.f_opt = f_fp(self.x_opt)[0]
-#self.funct_eval = opt_result[2]['funcalls']
-#self.status = rcstrings[opt_result[2]['warnflag']]
-self.x_opt = opt_result.x
-self.status = opt_result.success
-self.funct_eval = opt_result.nfev
-self.f_opt = opt_result.fun
-self.opt_result = opt_result
+opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint=iprint,
+maxfun=self.max_iters, **opt_dict)
+self.x_opt = opt_result[0]
+self.f_opt = f_fp(self.x_opt)[0]
+self.funct_eval = opt_result[2]['funcalls']
+self.status = rcstrings[opt_result[2]['warnflag']]
 class opt_simplex(Optimizer):
 def __init__(self, *args, **kwargs):
@ -236,13 +225,11 @@ class opt_SCG(Optimizer):
 self.status = opt_result[3]
 def get_optimizer(f_min):
-from sgd import opt_SGD
 optimizers = {'fmin_tnc': opt_tnc,
 'simplex': opt_simplex,
 'lbfgsb': opt_lbfgsb,
-'scg': opt_SCG,
-'sgd': opt_SGD}
+'scg': opt_SCG}
 if rasm_available:
 optimizers['rasmussen'] = opt_rasm

View file

@ -28,11 +28,11 @@ import sys
 def print_out(len_maxiters, fnow, current_grad, beta, iteration):
 print '\r',
-print '{0:>0{mi}g} {1:> 12e} {2:> 12e} {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r',
+print '{0:>0{mi}g} {1:> 12e} {2:< 12.6e} {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r',
 sys.stdout.flush()
 def exponents(fnow, current_grad):
-exps = [np.abs(fnow), current_grad]
+exps = [np.abs(np.float(fnow)), current_grad]
 return np.sign(exps) * np.log10(exps).astype(int)
 def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True, xtol=None, ftol=None, gtol=None):
@ -56,13 +56,13 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
 if gtol is None:
 gtol = 1e-5
-sigma0 = 1.0e-8
+sigma0 = 1.0e-7
 fold = f(x, *optargs) # Initial function value.
 function_eval = 1
 fnow = fold
 gradnew = gradf(x, *optargs) # Initial gradient.
-if any(np.isnan(gradnew)):
-raise UnexpectedInfOrNan, "Gradient contribution resulted in a NaN value"
+#if any(np.isnan(gradnew)):
+# raise UnexpectedInfOrNan, "Gradient contribution resulted in a NaN value"
 current_grad = np.dot(gradnew, gradnew)
 gradold = gradnew.copy()
 d = -gradnew # Initial search direction.
@ -168,13 +168,13 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True,
 if Delta < 0.25:
 beta = min(4.0 * beta, betamax)
 if Delta > 0.75:
-beta = max(0.5 * beta, betamin)
+beta = max(0.25 * beta, betamin)
 # Update search direction using Polak-Ribiere formula, or re-start
 # in direction of negative gradient after nparams steps.
 if nsuccess == x.size:
 d = -gradnew
-# beta = 1. # TODO: betareset!!
+beta = 1. # This is not in the original paper
 nsuccess = 0
 elif success:
 Gamma = np.dot(gradold - gradnew, gradnew) / (mu)

View file

@ -0,0 +1,56 @@
# Copyright (c) 2012-2014, Max Zwiessele
# Licensed under the BSD 3-clause license (see LICENSE.txt)
class StochasticStorage(object):
'''
This is a container for holding the stochastic parameters,
such as subset indices or step length and so on.
'''
def __init__(self, model):
"""
Initialize this stochastic container using the given model
"""
def do_stochastics(self):
"""
Update the internal state to the next batch of the stochastic
descent algorithm.
"""
pass
def reset(self):
"""
Reset the state of this stochastics generator.
"""
class SparseGPMissing(StochasticStorage):
def __init__(self, model, batchsize=1):
"""
Here we want to loop over all dimensions every time.
Thus, we can just make sure the loop goes over self.d every
time.
"""
self.d = xrange(model.Y_normalized.shape[1])
class SparseGPStochastics(StochasticStorage):
"""
For the sparse gp we need to store the dimension we are in,
and the indices corresponding to those
"""
def __init__(self, model, batchsize=1):
self.batchsize = batchsize
self.output_dim = model.Y.shape[1]
self.reset()
self.do_stochastics()
def do_stochastics(self):
if self.batchsize == 1:
self.current_dim = (self.current_dim+1)%self.output_dim
self.d = [self.current_dim]
else:
import numpy as np
self.d = np.random.choice(self.output_dim, size=self.batchsize, replace=False)
def reset(self):
self.current_dim = -1
self.d = None
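# A behaviour sketch, assuming a model whose Y has 4 output dimensions:
#
#   s = SparseGPStochastics(model, batchsize=1)
#   s.d                  # [0] right after construction
#   s.do_stochastics()   # then cycles through [1], [2], [3], [0], ...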

View file

@ -1,355 +0,0 @@
import numpy as np
import scipy as sp
import scipy.sparse
from optimization import Optimizer
from scipy import linalg, optimize
import pylab as plt
import copy, sys, pickle
class opt_SGD(Optimizer):
"""
Optimize using stochastic gradient descent.
:param Model: reference to the Model object
:param iterations: number of iterations
:param learning_rate: learning rate
:param momentum: momentum
"""
def __init__(self, start, iterations = 10, learning_rate = 1e-4, momentum = 0.9, model = None, messages = False, batch_size = 1, self_paced = False, center = True, iteration_file = None, learning_rate_adaptation=None, actual_iter=None, schedule=None, **kwargs):
self.opt_name = "Stochastic Gradient Descent"
self.Model = model
self.iterations = iterations
self.momentum = momentum
self.learning_rate = learning_rate
self.x_opt = None
self.f_opt = None
self.messages = messages
self.batch_size = batch_size
self.self_paced = self_paced
self.center = center
self.param_traces = [('noise',[])]
self.iteration_file = iteration_file
self.learning_rate_adaptation = learning_rate_adaptation
self.actual_iter = actual_iter
if self.learning_rate_adaptation != None:
if self.learning_rate_adaptation == 'annealing':
self.learning_rate_0 = self.learning_rate
else:
self.learning_rate_0 = self.learning_rate.mean()
self.schedule = schedule
# if len([p for p in self.model.kern.parts if p.name == 'bias']) == 1:
# self.param_traces.append(('bias',[]))
# if len([p for p in self.model.kern.parts if p.name == 'linear']) == 1:
# self.param_traces.append(('linear',[]))
# if len([p for p in self.model.kern.parts if p.name == 'rbf']) == 1:
# self.param_traces.append(('rbf_var',[]))
self.param_traces = dict(self.param_traces)
self.fopt_trace = []
num_params = len(self.Model._get_params())
if isinstance(self.learning_rate, float):
self.learning_rate = np.ones((num_params,)) * self.learning_rate
assert (len(self.learning_rate) == num_params), "there must be one learning rate per parameter"
def __str__(self):
status = "\nOptimizer: \t\t\t %s\n" % self.opt_name
status += "f(x_opt): \t\t\t %.4f\n" % self.f_opt
status += "Number of iterations: \t\t %d\n" % self.iterations
status += "Learning rate: \t\t\t max %.3f, min %.3f\n" % (self.learning_rate.max(), self.learning_rate.min())
status += "Momentum: \t\t\t %.3f\n" % self.momentum
status += "Batch size: \t\t\t %d\n" % self.batch_size
status += "Time elapsed: \t\t\t %s\n" % self.time
return status
def plot_traces(self):
plt.figure()
plt.subplot(211)
plt.title('Parameters')
for k in self.param_traces.keys():
plt.plot(self.param_traces[k], label=k)
plt.legend(loc=0)
plt.subplot(212)
plt.title('Objective function')
plt.plot(self.fopt_trace)
def non_null_samples(self, data):
return (np.isnan(data).sum(axis=1) == 0)
def check_for_missing(self, data):
if sp.sparse.issparse(self.Model.likelihood.Y):
return True
else:
return np.isnan(data).sum() > 0
def subset_parameter_vector(self, x, samples, param_shapes):
subset = np.array([], dtype = int)
x = np.arange(0, len(x))
i = 0
for s in param_shapes:
N, input_dim = s
X = x[i:i+N*input_dim].reshape(N, input_dim)
X = X[samples]
subset = np.append(subset, X.flatten())
i += N*input_dim
subset = np.append(subset, x[i:])
return subset
def shift_constraints(self, j):
constrained_indices = copy.deepcopy(self.Model.constrained_indices)
for c, constraint in enumerate(constrained_indices):
mask = (np.ones_like(constrained_indices[c]) == 1)
for i in range(len(constrained_indices[c])):
pos = np.where(j == constrained_indices[c][i])[0]
if len(pos) == 1:
self.Model.constrained_indices[c][i] = pos
else:
mask[i] = False
self.Model.constrained_indices[c] = self.Model.constrained_indices[c][mask]
return constrained_indices
# back them up
# bounded_i = copy.deepcopy(self.Model.constrained_bounded_indices)
# bounded_l = copy.deepcopy(self.Model.constrained_bounded_lowers)
# bounded_u = copy.deepcopy(self.Model.constrained_bounded_uppers)
# for b in range(len(bounded_i)): # for each group of constraints
# for bc in range(len(bounded_i[b])):
# pos = np.where(j == bounded_i[b][bc])[0]
# if len(pos) == 1:
# pos2 = np.where(self.Model.constrained_bounded_indices[b] == bounded_i[b][bc])[0][0]
# self.Model.constrained_bounded_indices[b][pos2] = pos[0]
# else:
# if len(self.Model.constrained_bounded_indices[b]) == 1:
# # if it's the last index to be removed
# # the logic here is just a mess. If we remove the last one, then all the
# # b-indices change and we have to iterate through everything to find our
# # current index. Can't deal with this right now.
# raise NotImplementedError
# else: # just remove it from the indices
# mask = self.Model.constrained_bounded_indices[b] != bc
# self.Model.constrained_bounded_indices[b] = self.Model.constrained_bounded_indices[b][mask]
# # here we shif the positive constraints. We cycle through each positive
# # constraint
# positive = self.Model.constrained_positive_indices.copy()
# mask = (np.ones_like(positive) == 1)
# for p in range(len(positive)):
# # we now check whether the constrained index appears in the j vector
# # (the vector of the "active" indices)
# pos = np.where(j == self.Model.constrained_positive_indices[p])[0]
# if len(pos) == 1:
# self.Model.constrained_positive_indices[p] = pos
# else:
# mask[p] = False
# self.Model.constrained_positive_indices = self.Model.constrained_positive_indices[mask]
# return (bounded_i, bounded_l, bounded_u), positive
def restore_constraints(self, c):#b, p):
# self.Model.constrained_bounded_indices = b[0]
# self.Model.constrained_bounded_lowers = b[1]
# self.Model.constrained_bounded_uppers = b[2]
# self.Model.constrained_positive_indices = p
self.Model.constrained_indices = c
def get_param_shapes(self, N = None, input_dim = None):
model_name = self.Model.__class__.__name__
if model_name == 'GPLVM':
return [(N, input_dim)]
if model_name == 'Bayesian_GPLVM':
return [(N, input_dim), (N, input_dim)]
else:
raise NotImplementedError
def step_with_missing_data(self, f_fp, X, step, shapes):
N, input_dim = X.shape
if not sp.sparse.issparse(self.Model.likelihood.Y):
Y = self.Model.likelihood.Y
samples = self.non_null_samples(self.Model.likelihood.Y)
self.Model.N = samples.sum()
Y = Y[samples]
else:
samples = self.Model.likelihood.Y.nonzero()[0]
self.Model.N = len(samples)
Y = np.asarray(self.Model.likelihood.Y[samples].todense(), dtype = np.float64)
if self.Model.N == 0 or Y.std() == 0.0:
return 0, step, self.Model.N
self.Model.likelihood._offset = Y.mean()
self.Model.likelihood._scale = Y.std()
self.Model.likelihood.set_data(Y)
# self.Model.likelihood.V = self.Model.likelihood.Y*self.Model.likelihood.precision
sigma = self.Model.likelihood._variance
self.Model.likelihood._variance = None # invalidate cache
self.Model.likelihood._set_params(sigma)
j = self.subset_parameter_vector(self.x_opt, samples, shapes)
self.Model.X = X[samples]
model_name = self.Model.__class__.__name__
if model_name == 'Bayesian_GPLVM':
self.Model.likelihood.YYT = np.dot(self.Model.likelihood.Y, self.Model.likelihood.Y.T)
self.Model.likelihood.trYYT = np.trace(self.Model.likelihood.YYT)
ci = self.shift_constraints(j)
f, fp = f_fp(self.x_opt[j])
step[j] = self.momentum * step[j] + self.learning_rate[j] * fp
self.x_opt[j] -= step[j]
self.restore_constraints(ci)
self.Model.grads[j] = fp
# restore likelihood _offset and _scale, otherwise when we call set_data(y) on
# the next feature, it will get normalized with the mean and std of this one.
self.Model.likelihood._offset = 0
self.Model.likelihood._scale = 1
return f, step, self.Model.N
def adapt_learning_rate(self, t, D):
if self.learning_rate_adaptation == 'adagrad':
if t > 0:
g_k = self.Model.grads
self.s_k += np.square(g_k)
t0 = 100.0
self.learning_rate = 0.1/(t0 + np.sqrt(self.s_k))
import pdb; pdb.set_trace()
else:
self.learning_rate = np.zeros_like(self.learning_rate)
self.s_k = np.zeros_like(self.x_opt)
elif self.learning_rate_adaptation == 'annealing':
#self.learning_rate = self.learning_rate_0/(1+float(t+1)/10)
self.learning_rate = np.ones_like(self.learning_rate) * self.schedule[t]
elif self.learning_rate_adaptation == 'semi_pesky':
if self.Model.__class__.__name__ == 'Bayesian_GPLVM':
g_t = self.Model.grads
if t == 0:
self.hbar_t = 0.0
self.tau_t = 100.0
self.gbar_t = 0.0
self.gbar_t = (1-1/self.tau_t)*self.gbar_t + 1/self.tau_t * g_t
self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t.T, g_t)
self.learning_rate = np.ones_like(self.learning_rate)*(np.dot(self.gbar_t.T, self.gbar_t) / self.hbar_t)
tau_t = self.tau_t*(1-self.learning_rate) + 1
def opt(self, f_fp=None, f=None, fp=None):
self.x_opt = self.Model._get_params_transformed()
self.grads = []
X, Y = self.Model.X.copy(), self.Model.likelihood.Y.copy()
self.Model.likelihood.YYT = 0
self.Model.likelihood.trYYT = 0
self.Model.likelihood._offset = 0.0
self.Model.likelihood._scale = 1.0
N, input_dim = self.Model.X.shape
D = self.Model.likelihood.Y.shape[1]
num_params = self.Model._get_params()
self.trace = []
missing_data = self.check_for_missing(self.Model.likelihood.Y)
step = np.zeros_like(num_params)
for it in range(self.iterations):
if self.actual_iter != None:
it = self.actual_iter
self.Model.grads = np.zeros_like(self.x_opt) # TODO this is ugly
if it == 0 or self.self_paced is False:
features = np.random.permutation(Y.shape[1])
else:
features = np.argsort(NLL)
b = len(features)/self.batch_size
features = [features[i::b] for i in range(b)]
NLL = []
import pylab as plt
for count, j in enumerate(features):
self.Model.input_dim = len(j)
self.Model.likelihood.input_dim = len(j)
self.Model.likelihood.set_data(Y[:, j])
# self.Model.likelihood.V = self.Model.likelihood.Y*self.Model.likelihood.precision
sigma = self.Model.likelihood._variance
self.Model.likelihood._variance = None # invalidate cache
self.Model.likelihood._set_params(sigma)
if missing_data:
shapes = self.get_param_shapes(N, input_dim)
f, step, Nj = self.step_with_missing_data(f_fp, X, step, shapes)
else:
self.Model.likelihood.YYT = np.dot(self.Model.likelihood.Y, self.Model.likelihood.Y.T)
self.Model.likelihood.trYYT = np.trace(self.Model.likelihood.YYT)
Nj = N
f, fp = f_fp(self.x_opt)
self.Model.grads = fp.copy()
step = self.momentum * step + self.learning_rate * fp
self.x_opt -= step
if self.messages == 2:
noise = self.Model.likelihood._variance
status = "evaluating {feature: 5d}/{tot: 5d} \t f: {f: 2.3f} \t non-missing: {nm: 4d}\t noise: {noise: 2.4f}\r".format(feature = count, tot = len(features), f = f, nm = Nj, noise = noise)
sys.stdout.write(status)
sys.stdout.flush()
self.param_traces['noise'].append(noise)
self.adapt_learning_rate(it+count, D)
NLL.append(f)
self.fopt_trace.append(NLL[-1])
# fig = plt.figure('traces')
# plt.clf()
# plt.plot(self.param_traces['noise'])
# for k in self.param_traces.keys():
# self.param_traces[k].append(self.Model.get(k)[0])
self.grads.append(self.Model.grads.tolist())
# should really be a sum(), but earlier samples in the iteration will have a very crappy ll
self.f_opt = np.mean(NLL)
self.Model.N = N
self.Model.X = X
self.Model.input_dim = D
self.Model.likelihood.N = N
self.Model.likelihood.input_dim = D
self.Model.likelihood.Y = Y
sigma = self.Model.likelihood._variance
self.Model.likelihood._variance = None # invalidate cache
self.Model.likelihood._set_params(sigma)
self.trace.append(self.f_opt)
if self.iteration_file is not None:
f = open(self.iteration_file + "iteration%d.pickle" % it, 'w')
data = [self.x_opt, self.fopt_trace, self.param_traces]
pickle.dump(data, f)
f.close()
if self.messages != 0:
sys.stdout.write('\r' + ' '*len(status)*2 + ' \r')
status = "SGD Iteration: {0: 3d}/{1: 3d} f: {2: 2.3f} max eta: {3: 1.5f}\n".format(it+1, self.iterations, self.f_opt, self.learning_rate.max())
sys.stdout.write(status)
sys.stdout.flush()
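# For reference, the per-batch update applied in opt() is plain momentum SGD on the
# transformed parameter vector:
#     step <- momentum * step + eta * f'(x)
#     x    <- x - step
# where f, f'(x) come from f_fp and eta is adapted by adapt_learning_rate().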

2
GPy/installation.cfg Normal file

@@ -0,0 +1,2 @@
# This is the local installation configuration file for GPy


@@ -1,9 +1,19 @@
 # Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-from constructors import *
-try:
-    from constructors import rbf_sympy, sympykern # these depend on sympy
-except:
-    pass
-from kern import *
+from _src.kern import Kern
+from _src.rbf import RBF
+from _src.linear import Linear, LinearFull
+from _src.static import Bias, White
+from _src.brownian import Brownian
+from _src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine
+from _src.mlp import MLP
+from _src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52
+from _src.independent_outputs import IndependentOutputs, Hierarchical
+from _src.coregionalize import Coregionalize
+from _src.ODE_UY import ODE_UY
+from _src.ODE_UYC import ODE_UYC
+from _src.ODE_st import ODE_st
+from _src.ODE_t import ODE_t
+from _src.poly import Poly
+from _src.trunclinear import TruncLinear,TruncLinear_inf
+from _src.splitKern import SplitKern,DiffGenomeKern

282
GPy/kern/_src/ODE_UY.py Normal file

@@ -0,0 +1,282 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
import numpy as np
from independent_outputs import index_to_slices
class ODE_UY(Kern):
def __init__(self, input_dim, variance_U=3., variance_Y=1., lengthscale_U=1., lengthscale_Y=1., active_dims=None, name='ode_uy'):
assert input_dim ==2, "only defined for 2 input dims"
super(ODE_UY, self).__init__(input_dim, active_dims, name)
self.variance_Y = Param('variance_Y', variance_Y, Logexp())
self.variance_U = Param('variance_U', variance_U, Logexp())
self.lengthscale_Y = Param('lengthscale_Y', lengthscale_Y, Logexp())
self.lengthscale_U = Param('lengthscale_U', lengthscale_U, Logexp())
self.link_parameters(self.variance_Y, self.variance_U, self.lengthscale_Y, self.lengthscale_U)
def K(self, X, X2=None):
# model : a * dy/dt + b * y = U
#lu=sqrt(3)/theta1 ly=1/theta2 theta2= a/b :thetay sigma2=1/(2ab) :sigmay
X,slices = X[:,:-1],index_to_slices(X[:,-1])
if X2 is None:
X2,slices2 = X,slices
K = np.zeros((X.shape[0], X.shape[0]))
else:
X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
K = np.zeros((X.shape[0], X2.shape[0]))
#rdist = X[:,0][:,None] - X2[:,0][:,None].T
rdist = X - X2.T
ly=1/self.lengthscale_Y
lu=np.sqrt(3)/self.lengthscale_U
#iu=self.input_lengthU #dimension of U
Vu=self.variance_U
Vy=self.variance_Y
#Vy=ly/2
#stop
# kernel for kuu matern3/2
kuu = lambda dist:Vu * (1 + lu* np.abs(dist)) * np.exp(-lu * np.abs(dist))
# kernel for kyy
k1 = lambda dist:np.exp(-ly*np.abs(dist))*(2*lu+ly)/(lu+ly)**2
k2 = lambda dist:(np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
k3 = lambda dist:np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
kyy = lambda dist:Vu*Vy*(k1(dist) + k2(dist) + k3(dist))
# cross covariance function
kyu3 = lambda dist:np.exp(-lu*dist)/(lu+ly)*(1+lu*(dist+1/(lu+ly)))
#kyu3 = lambda dist: 0
k1cros = lambda dist:np.exp(ly*dist)/(lu-ly) * ( 1- np.exp( (lu-ly)*dist) + lu* ( dist*np.exp( (lu-ly)*dist ) + (1- np.exp( (lu-ly)*dist ) ) /(lu-ly) ) )
#k1cros = lambda dist:0
k2cros = lambda dist:np.exp(ly*dist)*( 1/(lu+ly) + lu/(lu+ly)**2 )
#k2cros = lambda dist:0
Vyu=np.sqrt(Vy*ly*2)
# cross covariance kuy
kuyp = lambda dist:Vu*Vyu*(kyu3(dist)) #t>0 kuy
kuyn = lambda dist:Vu*Vyu*(k1cros(dist)+k2cros(dist)) #t<0 kuy
# cross covariance kyu
kyup = lambda dist:Vu*Vyu*(k1cros(-dist)+k2cros(-dist)) #t>0 kyu
kyun = lambda dist:Vu*Vyu*(kyu3(-dist)) #t<0 kyu
for i, s1 in enumerate(slices):
for j, s2 in enumerate(slices2):
for ss1 in s1:
for ss2 in s2:
if i==0 and j==0:
K[ss1,ss2] = kuu(np.abs(rdist[ss1,ss2]))
elif i==0 and j==1:
#K[ss1,ss2]= np.where( rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[ss1,ss2]) ) )
K[ss1,ss2]= np.where( rdist[ss1,ss2]>0 , kuyp(rdist[ss1,ss2]), kuyn(rdist[ss1,ss2] ) )
elif i==1 and j==1:
K[ss1,ss2] = kyy(np.abs(rdist[ss1,ss2]))
else:
#K[ss1,ss2]= 0
#K[ss1,ss2]= np.where( rdist[ss1,ss2]>0 , kyup(np.abs(rdist[ss1,ss2])), kyun(np.abs(rdist[ss1,ss2]) ) )
K[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , kyup(rdist[ss1,ss2]), kyun(rdist[ss1,ss2] ) )
return K
def Kdiag(self, X):
"""Compute the diagonal of the covariance matrix associated to X."""
Kdiag = np.zeros(X.shape[0])
ly=1/self.lengthscale_Y
lu=np.sqrt(3)/self.lengthscale_U
Vu = self.variance_U
Vy=self.variance_Y
k1 = (2*lu+ly)/(lu+ly)**2
k2 = (ly-2*lu + 2*lu-ly ) / (ly-lu)**2
k3 = 1/(lu+ly) + (lu)/(lu+ly)**2
slices = index_to_slices(X[:,-1])
for i, ss1 in enumerate(slices):
for s1 in ss1:
if i==0:
Kdiag[s1]+= self.variance_U
elif i==1:
Kdiag[s1]+= Vu*Vy*(k1+k2+k3)
else:
raise ValueError, "invalid input/output index"
#Kdiag[slices[0][0]]+= self.variance_U #matern32 diag
#Kdiag[slices[1][0]]+= self.variance_U*self.variance_Y*(k1+k2+k3) # diag
return Kdiag
def update_gradients_full(self, dL_dK, X, X2=None):
"""derivative of the covariance matrix with respect to the parameters."""
X,slices = X[:,:-1],index_to_slices(X[:,-1])
if X2 is None:
X2,slices2 = X,slices
else:
X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
#rdist = X[:,0][:,None] - X2[:,0][:,None].T
rdist = X - X2.T
ly=1/self.lengthscale_Y
lu=np.sqrt(3)/self.lengthscale_U
Vu=self.variance_U
Vy=self.variance_Y
Vyu = np.sqrt(Vy*ly*2)
dVdly = 0.5/np.sqrt(ly)*np.sqrt(2*Vy)
dVdVy = 0.5/np.sqrt(Vy)*np.sqrt(2*ly)
rd=rdist.shape
dktheta1 = np.zeros(rd)
dktheta2 = np.zeros(rd)
dkUdvar = np.zeros(rd)
dkYdvar = np.zeros(rd)
# dk dtheta for UU
UUdtheta1 = lambda dist: np.exp(-lu* dist)*dist + (-dist)*np.exp(-lu* dist)*(1+lu*dist)
UUdtheta2 = lambda dist: 0
#UUdvar = lambda dist: (1 + lu*dist)*np.exp(-lu*dist)
UUdvar = lambda dist: (1 + lu* np.abs(dist)) * np.exp(-lu * np.abs(dist))
# dk dtheta for YY
dk1theta1 = lambda dist: np.exp(-ly*dist)*2*(-lu)/(lu+ly)**3
dk2theta1 = lambda dist: (1.0)*(
np.exp(-lu*dist)*dist*(-ly+2*lu-lu*ly*dist+dist*lu**2)*(ly-lu)**(-2) + np.exp(-lu*dist)*(-2+ly*dist-2*dist*lu)*(ly-lu)**(-2)
+np.exp(-dist*lu)*(ly-2*lu+ly*lu*dist-dist*lu**2)*2*(ly-lu)**(-3)
+np.exp(-dist*ly)*2*(ly-lu)**(-2)
+np.exp(-dist*ly)*2*(2*lu-ly)*(ly-lu)**(-3)
)
dk3theta1 = lambda dist: np.exp(-dist*lu)*(lu+ly)**(-2)*((2*lu+ly+dist*lu**2+lu*ly*dist)*(-dist-2/(lu+ly))+2+2*lu*dist+ly*dist)
#dktheta1 = lambda dist: self.variance_U*self.variance_Y*(dk1theta1+dk2theta1+dk3theta1)
dk1theta2 = lambda dist: np.exp(-ly*dist) * ((lu+ly)**(-2)) * ( (-dist)*(2*lu+ly) + 1 + (-2)*(2*lu+ly)/(lu+ly) )
dk2theta2 =lambda dist: 1*(
np.exp(-dist*lu)*(ly-lu)**(-2) * ( 1+lu*dist+(-2)*(ly-2*lu+lu*ly*dist-dist*lu**2)*(ly-lu)**(-1) )
+np.exp(-dist*ly)*(ly-lu)**(-2) * ( (-dist)*(2*lu-ly) -1+(2*lu-ly)*(-2)*(ly-lu)**(-1) )
)
dk3theta2 = lambda dist: np.exp(-dist*lu) * (-3*lu-ly-dist*lu**2-lu*ly*dist)/(lu+ly)**3
#dktheta2 = lambda dist: self.variance_U*self.variance_Y*(dk1theta2 + dk2theta2 +dk3theta2)
# kyy kernel
k1 = lambda dist: np.exp(-ly*dist)*(2*lu+ly)/(lu+ly)**2
k2 = lambda dist: (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
k3 = lambda dist: np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
#dkdvar = k1+k2+k3
# cross covariance function
kyu3 = lambda dist:np.exp(-lu*dist)/(lu+ly)*(1+lu*(dist+1/(lu+ly)))
k1cros = lambda dist:np.exp(ly*dist)/(lu-ly) * ( 1- np.exp( (lu-ly)*dist) + lu* ( dist*np.exp( (lu-ly)*dist ) + (1- np.exp( (lu-ly)*dist ) ) /(lu-ly) ) )
k2cros = lambda dist:np.exp(ly*dist)*( 1/(lu+ly) + lu/(lu+ly)**2 )
# cross covariance kuy
kuyp = lambda dist:(kyu3(dist)) #t>0 kuy
kuyn = lambda dist:(k1cros(dist)+k2cros(dist)) #t<0 kuy
# cross covariance kyu
kyup = lambda dist:(k1cros(-dist)+k2cros(-dist)) #t>0 kyu
kyun = lambda dist:(kyu3(-dist)) #t<0 kyu
# dk dtheta for UY
dkyu3dtheta2 = lambda dist: np.exp(-lu*dist) * ( (-1)*(lu+ly)**(-2)*(1+lu*dist+lu*(lu+ly)**(-1)) + (lu+ly)**(-1)*(-lu)*(lu+ly)**(-2) )
dkyu3dtheta1 = lambda dist: np.exp(-lu*dist)*(lu+ly)**(-1)* ( (-dist)*(1+dist*lu+lu*(lu+ly)**(-1)) -\
(lu+ly)**(-1)*(1+dist*lu+lu*(lu+ly)**(-1)) +dist+(lu+ly)**(-1)-lu*(lu+ly)**(-2) )
dkcros2dtheta1 = lambda dist: np.exp(ly*dist)* ( -(ly+lu)**(-2) + (ly+lu)**(-2) + (-2)*lu*(lu+ly)**(-3) )
dkcros2dtheta2 = lambda dist: np.exp(ly*dist)*dist* ( (ly+lu)**(-1) + lu*(lu+ly)**(-2) ) + \
np.exp(ly*dist)*( -(lu+ly)**(-2) + lu*(-2)*(lu+ly)**(-3) )
dkcros1dtheta1 = lambda dist: np.exp(ly*dist)*( -(lu-ly)**(-2)*( 1-np.exp((lu-ly)*dist) + lu*dist*np.exp((lu-ly)*dist)+ \
lu*(1-np.exp((lu-ly)*dist))/(lu-ly) ) + (lu-ly)**(-1)*( -np.exp( (lu-ly)*dist )*dist + dist*np.exp( (lu-ly)*dist)+\
lu*dist**2*np.exp((lu-ly)*dist)+(1-np.exp((lu-ly)*dist))/(lu-ly) - lu*np.exp((lu-ly)*dist)*dist/(lu-ly) -\
lu*(1-np.exp((lu-ly)*dist))/(lu-ly)**2 ) )
dkcros1dtheta2 = lambda t: np.exp(ly*t)*t/(lu-ly)*( 1-np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t)+\
lu*(1-np.exp((lu-ly)*t))/(lu-ly) )+\
np.exp(ly*t)/(lu-ly)**2* ( 1-np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t) + lu*( 1-np.exp((lu-ly)*t) )/(lu-ly) )+\
np.exp(ly*t)/(lu-ly)*( np.exp((lu-ly)*t)*t -lu*t*t*np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t)/(lu-ly)+\
lu*( 1-np.exp((lu-ly)*t) )/(lu-ly)**2 )
dkuypdtheta1 = lambda dist:(dkyu3dtheta1(dist)) #t>0 kuy
dkuyndtheta1 = lambda dist:(dkcros1dtheta1(dist)+dkcros2dtheta1(dist)) #t<0 kuy
# cross covariance kyu
dkyupdtheta1 = lambda dist:(dkcros1dtheta1(-dist)+dkcros2dtheta1(-dist)) #t>0 kyu
dkyundtheta1 = lambda dist:(dkyu3dtheta1(-dist)) #t<0 kyu
dkuypdtheta2 = lambda dist:(dkyu3dtheta2(dist)) #t>0 kuy
dkuyndtheta2 = lambda dist:(dkcros1dtheta2(dist)+dkcros2dtheta2(dist)) #t<0 kuy
# cross covariance kyu
dkyupdtheta2 = lambda dist:(dkcros1dtheta2(-dist)+dkcros2dtheta2(-dist)) #t>0 kyu
dkyundtheta2 = lambda dist:(dkyu3dtheta2(-dist)) #t<0 kyu
for i, s1 in enumerate(slices):
for j, s2 in enumerate(slices2):
for ss1 in s1:
for ss2 in s2:
if i==0 and j==0:
#target[ss1,ss2] = kuu(np.abs(rdist[ss1,ss2]))
dktheta1[ss1,ss2] = Vu*UUdtheta1(np.abs(rdist[ss1,ss2]))
dktheta2[ss1,ss2] = 0
dkUdvar[ss1,ss2] = UUdvar(np.abs(rdist[ss1,ss2]))
dkYdvar[ss1,ss2] = 0
elif i==0 and j==1:
########target[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[s1[0],s2[0]]) ) )
#np.where( rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[s1[0],s2[0]]) ) )
#dktheta1[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , self.variance_U*self.variance_Y*dkcrtheta1(np.abs(rdist[ss1,ss2])) ,self.variance_U*self.variance_Y*(dk1theta1(np.abs(rdist[ss1,ss2]))+dk2theta1(np.abs(rdist[ss1,ss2]))) )
#dktheta2[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , self.variance_U*self.variance_Y*dkcrtheta2(np.abs(rdist[ss1,ss2])) ,self.variance_U*self.variance_Y*(dk1theta2(np.abs(rdist[ss1,ss2]))+dk2theta2(np.abs(rdist[ss1,ss2]))) )
dktheta1[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*Vyu*dkuypdtheta1(rdist[ss1,ss2]),Vu*Vyu*dkuyndtheta1(rdist[ss1,ss2]) )
dkUdvar[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vyu*kuyp(rdist[ss1,ss2]), Vyu* kuyn(rdist[ss1,ss2]) )
dktheta2[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*Vyu*dkuypdtheta2(rdist[ss1,ss2])+Vu*dVdly*kuyp(rdist[ss1,ss2]),Vu*Vyu*dkuyndtheta2(rdist[ss1,ss2])+Vu*dVdly*kuyn(rdist[ss1,ss2]) )
dkYdvar[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*dVdVy*kuyp(rdist[ss1,ss2]), Vu*dVdVy* kuyn(rdist[ss1,ss2]) )
elif i==1 and j==1:
#target[ss1,ss2] = kyy(np.abs(rdist[ss1,ss2]))
dktheta1[ss1,ss2] = self.variance_U*self.variance_Y*(dk1theta1(np.abs(rdist[ss1,ss2]))+dk2theta1(np.abs(rdist[ss1,ss2]))+dk3theta1(np.abs(rdist[ss1,ss2])))
dktheta2[ss1,ss2] = self.variance_U*self.variance_Y*(dk1theta2(np.abs(rdist[ss1,ss2])) + dk2theta2(np.abs(rdist[ss1,ss2])) +dk3theta2(np.abs(rdist[ss1,ss2])))
dkUdvar[ss1,ss2] = self.variance_Y*(k1(np.abs(rdist[ss1,ss2]))+k2(np.abs(rdist[ss1,ss2]))+k3(np.abs(rdist[ss1,ss2])) )
dkYdvar[ss1,ss2] = self.variance_U*(k1(np.abs(rdist[ss1,ss2]))+k2(np.abs(rdist[ss1,ss2]))+k3(np.abs(rdist[ss1,ss2])) )
else:
#######target[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , kyup(np.abs(rdist[ss1,ss2])), kyun(np.abs(rdist[s1[0],s2[0]]) ) )
#dktheta1[ss1,ss2] = np.where( rdist[ss1,ss2]>0 ,self.variance_U*self.variance_Y*(dk1theta1(np.abs(rdist[ss1,ss2]))+dk2theta1(np.abs(rdist[ss1,ss2]))) , self.variance_U*self.variance_Y*dkcrtheta1(np.abs(rdist[ss1,ss2])) )
#dktheta2[ss1,ss2] = np.where( rdist[ss1,ss2]>0 ,self.variance_U*self.variance_Y*(dk1theta2(np.abs(rdist[ss1,ss2]))+dk2theta2(np.abs(rdist[ss1,ss2]))) , self.variance_U*self.variance_Y*dkcrtheta2(np.abs(rdist[ss1,ss2])) )
dktheta1[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*Vyu*dkyupdtheta1(rdist[ss1,ss2]),Vu*Vyu*dkyundtheta1(rdist[ss1,ss2]) )
dkUdvar[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vyu*kyup(rdist[ss1,ss2]),Vyu*kyun(rdist[ss1,ss2]))
dktheta2[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*Vyu*dkyupdtheta2(rdist[ss1,ss2])+Vu*dVdly*kyup(rdist[ss1,ss2]),Vu*Vyu*dkyundtheta2(rdist[ss1,ss2])+Vu*dVdly*kyun(rdist[ss1,ss2]) )
dkYdvar[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*dVdVy*kyup(rdist[ss1,ss2]), Vu*dVdVy*kyun(rdist[ss1,ss2]))
#stop
self.variance_U.gradient = np.sum(dkUdvar * dL_dK) # Vu
self.variance_Y.gradient = np.sum(dkYdvar * dL_dK) # Vy
self.lengthscale_U.gradient = np.sum(dktheta1*(-np.sqrt(3)*self.lengthscale_U**(-2))* dL_dK) #lu
self.lengthscale_Y.gradient = np.sum(dktheta2*(-self.lengthscale_Y**(-2)) * dL_dK) #ly
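# Usage sketch (hedged: assumes ODE_UY is exposed on GPy.kern, as the import list earlier in
# this commit suggests). The kernel expects a two-column X whose last column is the output
# index handled by index_to_slices: 0 for the latent force U, 1 for the driven output Y
# (see Kdiag above).
#
#   import numpy as np
#   import GPy
#   t = np.linspace(0., 10., 20)[:, None]
#   X = np.vstack([np.hstack([t, np.zeros_like(t)]),   # U inputs, index 0
#                  np.hstack([t, np.ones_like(t)])])   # Y inputs, index 1
#   k = GPy.kern.ODE_UY(input_dim=2)
#   K = k.K(X)                                          # joint covariance over [U; Y]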

290
GPy/kern/_src/ODE_UYC.py Normal file

@@ -0,0 +1,290 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
import numpy as np
from independent_outputs import index_to_slices
class ODE_UYC(Kern):
def __init__(self, input_dim, variance_U=3., variance_Y=1., lengthscale_U=1., lengthscale_Y=1., ubias =1. ,active_dims=None, name='ode_uyc'):
assert input_dim ==2, "only defined for 2 input dims"
super(ODE_UYC, self).__init__(input_dim, active_dims, name)
self.variance_Y = Param('variance_Y', variance_Y, Logexp())
self.variance_U = Param('variance_U', variance_U, Logexp())
self.lengthscale_Y = Param('lengthscale_Y', lengthscale_Y, Logexp())
self.lengthscale_U = Param('lengthscale_U', lengthscale_U, Logexp())
self.ubias = Param('ubias', ubias, Logexp())
self.add_parameters(self.variance_Y, self.variance_U, self.lengthscale_Y, self.lengthscale_U, self.ubias)
def K(self, X, X2=None):
# model : a * dy/dt + b * y = U
#lu=sqrt(3)/theta1 ly=1/theta2 theta2= a/b :thetay sigma2=1/(2ab) :sigmay
X,slices = X[:,:-1],index_to_slices(X[:,-1])
if X2 is None:
X2,slices2 = X,slices
K = np.zeros((X.shape[0], X.shape[0]))
else:
X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
K = np.zeros((X.shape[0], X2.shape[0]))
#stop
#rdist = X[:,0][:,None] - X2[:,0][:,None].T
rdist = X - X2.T
ly=1/self.lengthscale_Y
lu=np.sqrt(3)/self.lengthscale_U
#iu=self.input_lengthU #dimension of U
Vu=self.variance_U
Vy=self.variance_Y
#Vy=ly/2
#stop
# kernel for kuu matern3/2
kuu = lambda dist:Vu * (1 + lu* np.abs(dist)) * np.exp(-lu * np.abs(dist)) +self.ubias
# kernel for kyy
k1 = lambda dist:np.exp(-ly*np.abs(dist))*(2*lu+ly)/(lu+ly)**2
k2 = lambda dist:(np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
k3 = lambda dist:np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
kyy = lambda dist:Vu*Vy*(k1(dist) + k2(dist) + k3(dist))
# cross covariance function
kyu3 = lambda dist:np.exp(-lu*dist)/(lu+ly)*(1+lu*(dist+1/(lu+ly)))
#kyu3 = lambda dist: 0
k1cros = lambda dist:np.exp(ly*dist)/(lu-ly) * ( 1- np.exp( (lu-ly)*dist) + lu* ( dist*np.exp( (lu-ly)*dist ) + (1- np.exp( (lu-ly)*dist ) ) /(lu-ly) ) )
#k1cros = lambda dist:0
k2cros = lambda dist:np.exp(ly*dist)*( 1/(lu+ly) + lu/(lu+ly)**2 )
#k2cros = lambda dist:0
Vyu=np.sqrt(Vy*ly*2)
# cross covariance kuy
kuyp = lambda dist:Vu*Vyu*(kyu3(dist)) #t>0 kuy
kuyn = lambda dist:Vu*Vyu*(k1cros(dist)+k2cros(dist)) #t<0 kuy
# cross covariance kyu
kyup = lambda dist:Vu*Vyu*(k1cros(-dist)+k2cros(-dist)) #t>0 kyu
kyun = lambda dist:Vu*Vyu*(kyu3(-dist)) #t<0 kyu
for i, s1 in enumerate(slices):
for j, s2 in enumerate(slices2):
for ss1 in s1:
for ss2 in s2:
if i==0 and j==0:
K[ss1,ss2] = kuu(np.abs(rdist[ss1,ss2]))
elif i==0 and j==1:
#K[ss1,ss2]= np.where( rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[ss1,ss2]) ) )
K[ss1,ss2]= np.where( rdist[ss1,ss2]>0 , kuyp(rdist[ss1,ss2]), kuyn(rdist[ss1,ss2] ) )
elif i==1 and j==1:
K[ss1,ss2] = kyy(np.abs(rdist[ss1,ss2]))
else:
#K[ss1,ss2]= 0
#K[ss1,ss2]= np.where( rdist[ss1,ss2]>0 , kyup(np.abs(rdist[ss1,ss2])), kyun(np.abs(rdist[ss1,ss2]) ) )
K[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , kyup(rdist[ss1,ss2]), kyun(rdist[ss1,ss2] ) )
return K
def Kdiag(self, X):
"""Compute the diagonal of the covariance matrix associated to X."""
Kdiag = np.zeros(X.shape[0])
ly=1/self.lengthscale_Y
lu=np.sqrt(3)/self.lengthscale_U
Vu = self.variance_U
Vy=self.variance_Y
k1 = (2*lu+ly)/(lu+ly)**2
k2 = (ly-2*lu + 2*lu-ly ) / (ly-lu)**2
k3 = 1/(lu+ly) + (lu)/(lu+ly)**2
slices = index_to_slices(X[:,-1])
for i, ss1 in enumerate(slices):
for s1 in ss1:
if i==0:
Kdiag[s1]+= self.variance_U + self.ubias
elif i==1:
Kdiag[s1]+= Vu*Vy*(k1+k2+k3)
else:
raise ValueError, "invalid input/output index"
#Kdiag[slices[0][0]]+= self.variance_U #matern32 diag
#Kdiag[slices[1][0]]+= self.variance_U*self.variance_Y*(k1+k2+k3) # diag
return Kdiag
def update_gradients_full(self, dL_dK, X, X2=None):
"""derivative of the covariance matrix with respect to the parameters."""
X,slices = X[:,:-1],index_to_slices(X[:,-1])
if X2 is None:
X2,slices2 = X,slices
else:
X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
#rdist = X[:,0][:,None] - X2[:,0][:,None].T
rdist = X - X2.T
ly=1/self.lengthscale_Y
lu=np.sqrt(3)/self.lengthscale_U
Vu=self.variance_U
Vy=self.variance_Y
Vyu = np.sqrt(Vy*ly*2)
dVdly = 0.5/np.sqrt(ly)*np.sqrt(2*Vy)
dVdVy = 0.5/np.sqrt(Vy)*np.sqrt(2*ly)
rd=rdist.shape[0]
dktheta1 = np.zeros([rd,rd])
dktheta2 = np.zeros([rd,rd])
dkUdvar = np.zeros([rd,rd])
dkYdvar = np.zeros([rd,rd])
dkdubias = np.zeros([rd,rd])
# dk dtheta for UU
UUdtheta1 = lambda dist: np.exp(-lu* dist)*dist + (-dist)*np.exp(-lu* dist)*(1+lu*dist)
UUdtheta2 = lambda dist: 0
#UUdvar = lambda dist: (1 + lu*dist)*np.exp(-lu*dist)
UUdvar = lambda dist: (1 + lu* np.abs(dist)) * np.exp(-lu * np.abs(dist))
# dk dtheta for YY
dk1theta1 = lambda dist: np.exp(-ly*dist)*2*(-lu)/(lu+ly)**3
dk2theta1 = lambda dist: (1.0)*(
np.exp(-lu*dist)*dist*(-ly+2*lu-lu*ly*dist+dist*lu**2)*(ly-lu)**(-2) + np.exp(-lu*dist)*(-2+ly*dist-2*dist*lu)*(ly-lu)**(-2)
+np.exp(-dist*lu)*(ly-2*lu+ly*lu*dist-dist*lu**2)*2*(ly-lu)**(-3)
+np.exp(-dist*ly)*2*(ly-lu)**(-2)
+np.exp(-dist*ly)*2*(2*lu-ly)*(ly-lu)**(-3)
)
dk3theta1 = lambda dist: np.exp(-dist*lu)*(lu+ly)**(-2)*((2*lu+ly+dist*lu**2+lu*ly*dist)*(-dist-2/(lu+ly))+2+2*lu*dist+ly*dist)
#dktheta1 = lambda dist: self.variance_U*self.variance_Y*(dk1theta1+dk2theta1+dk3theta1)
dk1theta2 = lambda dist: np.exp(-ly*dist) * ((lu+ly)**(-2)) * ( (-dist)*(2*lu+ly) + 1 + (-2)*(2*lu+ly)/(lu+ly) )
dk2theta2 =lambda dist: 1*(
np.exp(-dist*lu)*(ly-lu)**(-2) * ( 1+lu*dist+(-2)*(ly-2*lu+lu*ly*dist-dist*lu**2)*(ly-lu)**(-1) )
+np.exp(-dist*ly)*(ly-lu)**(-2) * ( (-dist)*(2*lu-ly) -1+(2*lu-ly)*(-2)*(ly-lu)**(-1) )
)
dk3theta2 = lambda dist: np.exp(-dist*lu) * (-3*lu-ly-dist*lu**2-lu*ly*dist)/(lu+ly)**3
#dktheta2 = lambda dist: self.variance_U*self.variance_Y*(dk1theta2 + dk2theta2 +dk3theta2)
# kyy kernel
k1 = lambda dist: np.exp(-ly*dist)*(2*lu+ly)/(lu+ly)**2
k2 = lambda dist: (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
k3 = lambda dist: np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
#dkdvar = k1+k2+k3
# cross covariance function
kyu3 = lambda dist:np.exp(-lu*dist)/(lu+ly)*(1+lu*(dist+1/(lu+ly)))
k1cros = lambda dist:np.exp(ly*dist)/(lu-ly) * ( 1- np.exp( (lu-ly)*dist) + lu* ( dist*np.exp( (lu-ly)*dist ) + (1- np.exp( (lu-ly)*dist ) ) /(lu-ly) ) )
k2cros = lambda dist:np.exp(ly*dist)*( 1/(lu+ly) + lu/(lu+ly)**2 )
# cross covariance kuy
kuyp = lambda dist:(kyu3(dist)) #t>0 kuy
kuyn = lambda dist:(k1cros(dist)+k2cros(dist)) #t<0 kuy
# cross covariance kyu
kyup = lambda dist:(k1cros(-dist)+k2cros(-dist)) #t>0 kyu
kyun = lambda dist:(kyu3(-dist)) #t<0 kyu
# dk dtheta for UY
dkyu3dtheta2 = lambda dist: np.exp(-lu*dist) * ( (-1)*(lu+ly)**(-2)*(1+lu*dist+lu*(lu+ly)**(-1)) + (lu+ly)**(-1)*(-lu)*(lu+ly)**(-2) )
dkyu3dtheta1 = lambda dist: np.exp(-lu*dist)*(lu+ly)**(-1)* ( (-dist)*(1+dist*lu+lu*(lu+ly)**(-1)) -\
(lu+ly)**(-1)*(1+dist*lu+lu*(lu+ly)**(-1)) +dist+(lu+ly)**(-1)-lu*(lu+ly)**(-2) )
dkcros2dtheta1 = lambda dist: np.exp(ly*dist)* ( -(ly+lu)**(-2) + (ly+lu)**(-2) + (-2)*lu*(lu+ly)**(-3) )
dkcros2dtheta2 = lambda dist: np.exp(ly*dist)*dist* ( (ly+lu)**(-1) + lu*(lu+ly)**(-2) ) + \
np.exp(ly*dist)*( -(lu+ly)**(-2) + lu*(-2)*(lu+ly)**(-3) )
dkcros1dtheta1 = lambda dist: np.exp(ly*dist)*( -(lu-ly)**(-2)*( 1-np.exp((lu-ly)*dist) + lu*dist*np.exp((lu-ly)*dist)+ \
lu*(1-np.exp((lu-ly)*dist))/(lu-ly) ) + (lu-ly)**(-1)*( -np.exp( (lu-ly)*dist )*dist + dist*np.exp( (lu-ly)*dist)+\
lu*dist**2*np.exp((lu-ly)*dist)+(1-np.exp((lu-ly)*dist))/(lu-ly) - lu*np.exp((lu-ly)*dist)*dist/(lu-ly) -\
lu*(1-np.exp((lu-ly)*dist))/(lu-ly)**2 ) )
dkcros1dtheta2 = lambda t: np.exp(ly*t)*t/(lu-ly)*( 1-np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t)+\
lu*(1-np.exp((lu-ly)*t))/(lu-ly) )+\
np.exp(ly*t)/(lu-ly)**2* ( 1-np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t) + lu*( 1-np.exp((lu-ly)*t) )/(lu-ly) )+\
np.exp(ly*t)/(lu-ly)*( np.exp((lu-ly)*t)*t -lu*t*t*np.exp((lu-ly)*t) +lu*t*np.exp((lu-ly)*t)/(lu-ly)+\
lu*( 1-np.exp((lu-ly)*t) )/(lu-ly)**2 )
dkuypdtheta1 = lambda dist:(dkyu3dtheta1(dist)) #t>0 kuy
dkuyndtheta1 = lambda dist:(dkcros1dtheta1(dist)+dkcros2dtheta1(dist)) #t<0 kuy
# cross covariance kyu
dkyupdtheta1 = lambda dist:(dkcros1dtheta1(-dist)+dkcros2dtheta1(-dist)) #t>0 kyu
dkyundtheta1 = lambda dist:(dkyu3dtheta1(-dist)) #t<0 kyu
dkuypdtheta2 = lambda dist:(dkyu3dtheta2(dist)) #t>0 kuy
dkuyndtheta2 = lambda dist:(dkcros1dtheta2(dist)+dkcros2dtheta2(dist)) #t<0 kuy
# cross covariance kyu
dkyupdtheta2 = lambda dist:(dkcros1dtheta2(-dist)+dkcros2dtheta2(-dist)) #t>0 kyu
dkyundtheta2 = lambda dist:(dkyu3dtheta2(-dist)) #t<0 kyu
for i, s1 in enumerate(slices):
for j, s2 in enumerate(slices2):
for ss1 in s1:
for ss2 in s2:
if i==0 and j==0:
#target[ss1,ss2] = kuu(np.abs(rdist[ss1,ss2]))
dktheta1[ss1,ss2] = Vu*UUdtheta1(np.abs(rdist[ss1,ss2]))
dktheta2[ss1,ss2] = 0
dkUdvar[ss1,ss2] = UUdvar(np.abs(rdist[ss1,ss2]))
dkYdvar[ss1,ss2] = 0
dkdubias[ss1,ss2] = 1
elif i==0 and j==1:
########target[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[s1[0],s2[0]]) ) )
#np.where( rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[s1[0],s2[0]]) ) )
#dktheta1[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , self.variance_U*self.variance_Y*dkcrtheta1(np.abs(rdist[ss1,ss2])) ,self.variance_U*self.variance_Y*(dk1theta1(np.abs(rdist[ss1,ss2]))+dk2theta1(np.abs(rdist[ss1,ss2]))) )
#dktheta2[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , self.variance_U*self.variance_Y*dkcrtheta2(np.abs(rdist[ss1,ss2])) ,self.variance_U*self.variance_Y*(dk1theta2(np.abs(rdist[ss1,ss2]))+dk2theta2(np.abs(rdist[ss1,ss2]))) )
dktheta1[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*Vyu*dkuypdtheta1(rdist[ss1,ss2]),Vu*Vyu*dkuyndtheta1(rdist[ss1,ss2]) )
dkUdvar[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vyu*kuyp(rdist[ss1,ss2]), Vyu* kuyn(rdist[ss1,ss2]) )
dktheta2[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*Vyu*dkuypdtheta2(rdist[ss1,ss2])+Vu*dVdly*kuyp(rdist[ss1,ss2]),Vu*Vyu*dkuyndtheta2(rdist[ss1,ss2])+Vu*dVdly*kuyn(rdist[ss1,ss2]) )
dkYdvar[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*dVdVy*kuyp(rdist[ss1,ss2]), Vu*dVdVy* kuyn(rdist[ss1,ss2]) )
dkdubias[ss1,ss2] = 0
elif i==1 and j==1:
#target[ss1,ss2] = kyy(np.abs(rdist[ss1,ss2]))
dktheta1[ss1,ss2] = self.variance_U*self.variance_Y*(dk1theta1(np.abs(rdist[ss1,ss2]))+dk2theta1(np.abs(rdist[ss1,ss2]))+dk3theta1(np.abs(rdist[ss1,ss2])))
dktheta2[ss1,ss2] = self.variance_U*self.variance_Y*(dk1theta2(np.abs(rdist[ss1,ss2])) + dk2theta2(np.abs(rdist[ss1,ss2])) +dk3theta2(np.abs(rdist[ss1,ss2])))
dkUdvar[ss1,ss2] = self.variance_Y*(k1(np.abs(rdist[ss1,ss2]))+k2(np.abs(rdist[ss1,ss2]))+k3(np.abs(rdist[ss1,ss2])) )
dkYdvar[ss1,ss2] = self.variance_U*(k1(np.abs(rdist[ss1,ss2]))+k2(np.abs(rdist[ss1,ss2]))+k3(np.abs(rdist[ss1,ss2])) )
dkdubias[ss1,ss2] = 0
else:
#######target[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , kyup(np.abs(rdist[ss1,ss2])), kyun(np.abs(rdist[s1[0],s2[0]]) ) )
#dktheta1[ss1,ss2] = np.where( rdist[ss1,ss2]>0 ,self.variance_U*self.variance_Y*(dk1theta1(np.abs(rdist[ss1,ss2]))+dk2theta1(np.abs(rdist[ss1,ss2]))) , self.variance_U*self.variance_Y*dkcrtheta1(np.abs(rdist[ss1,ss2])) )
#dktheta2[ss1,ss2] = np.where( rdist[ss1,ss2]>0 ,self.variance_U*self.variance_Y*(dk1theta2(np.abs(rdist[ss1,ss2]))+dk2theta2(np.abs(rdist[ss1,ss2]))) , self.variance_U*self.variance_Y*dkcrtheta2(np.abs(rdist[ss1,ss2])) )
dktheta1[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*Vyu*dkyupdtheta1(rdist[ss1,ss2]),Vu*Vyu*dkyundtheta1(rdist[ss1,ss2]) )
dkUdvar[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vyu*kyup(rdist[ss1,ss2]),Vyu*kyun(rdist[ss1,ss2]))
dktheta2[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*Vyu*dkyupdtheta2(rdist[ss1,ss2])+Vu*dVdly*kyup(rdist[ss1,ss2]),Vu*Vyu*dkyundtheta2(rdist[ss1,ss2])+Vu*dVdly*kyun(rdist[ss1,ss2]) )
dkYdvar[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , Vu*dVdVy*kyup(rdist[ss1,ss2]), Vu*dVdVy*kyun(rdist[ss1,ss2]))
dkdubias[ss1,ss2] = 0
#stop
self.variance_U.gradient = np.sum(dkUdvar * dL_dK) # Vu
self.variance_Y.gradient = np.sum(dkYdvar * dL_dK) # Vy
self.lengthscale_U.gradient = np.sum(dktheta1*(-np.sqrt(3)*self.lengthscale_U**(-2))* dL_dK) #lu
self.lengthscale_Y.gradient = np.sum(dktheta2*(-self.lengthscale_Y**(-2)) * dL_dK) #ly
self.ubias.gradient = np.sum(dkdubias * dL_dK)

267
GPy/kern/_src/ODE_st.py Normal file

@@ -0,0 +1,267 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
import numpy as np
from independent_outputs import index_to_slices
class ODE_st(Kern):
"""
kernel resulting from the first-order-in-time, diffusive spatio-temporal model
-a * d^2y/dx^2 + b * dy/dt + c * y = U(x, t),
where y is given a separable RBF (squared-exponential) prior over space and time.
:param input_dim: the number of input dimensions, has to be equal to three (time, space, output index)
:type input_dim: int
:param a: coefficient of the spatial diffusion term d^2y/dx^2
:type a: float
:param b: coefficient of the temporal derivative dy/dt
:type b: float
:param c: coefficient of y
:type c: float
:param variance_Yt: temporal variance of y
:type variance_Yt: float
:param variance_Yx: spatial variance of y
:type variance_Yx: float
:param lengthscale_Yt: temporal lengthscale of y (lyt = 1/(2*lengthscale_Yt))
:type lengthscale_Yt: float
:param lengthscale_Yx: spatial lengthscale of y (lyx = 1/(2*lengthscale_Yx))
:type lengthscale_Yx: float
:rtype: kernel object
"""
def __init__(self, input_dim, a=1.,b=1., c=1.,variance_Yx=3.,variance_Yt=1.5, lengthscale_Yx=1.5, lengthscale_Yt=1.5, active_dims=None, name='ode_st'):
assert input_dim ==3, "only defined for 3 input dims"
super(ODE_st, self).__init__(input_dim, active_dims, name)
self.variance_Yt = Param('variance_Yt', variance_Yt, Logexp())
self.variance_Yx = Param('variance_Yx', variance_Yx, Logexp())
self.lengthscale_Yt = Param('lengthscale_Yt', lengthscale_Yt, Logexp())
self.lengthscale_Yx = Param('lengthscale_Yx', lengthscale_Yx, Logexp())
self.a= Param('a', a, Logexp())
self.b = Param('b', b, Logexp())
self.c = Param('c', c, Logexp())
self.add_parameters(self.a, self.b, self.c, self.variance_Yt, self.variance_Yx, self.lengthscale_Yt,self.lengthscale_Yx)
def K(self, X, X2=None):
# model : -a d^2y/dx^2 + b dy/dt + c * y = U
# kernel Kyy: spatio-temporal RBF
# vyt: Y temporal variance, vyx: Y spatial variance, lyt: Y temporal lengthscale, lyx: Y spatial lengthscale
# kernel Kuu: doper( doper(Kyy) )
# parameters: a, b, c, lyt, lyx, vyx*vyt
"""Compute the covariance matrix between X and X2."""
X,slices = X[:,:-1],index_to_slices(X[:,-1])
if X2 is None:
X2,slices2 = X,slices
K = np.zeros((X.shape[0], X.shape[0]))
else:
X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
K = np.zeros((X.shape[0], X2.shape[0]))
tdist = (X[:,0][:,None] - X2[:,0][None,:])**2
xdist = (X[:,1][:,None] - X2[:,1][None,:])**2
ttdist = (X[:,0][:,None] - X2[:,0][None,:])
#rdist = [tdist,xdist]
#dist = np.abs(X - X2.T)
vyt = self.variance_Yt
vyx = self.variance_Yx
lyt=1/(2*self.lengthscale_Yt)
lyx=1/(2*self.lengthscale_Yx)
a = self.a ## -a is used in the model, negative diffusion
b = self.b
c = self.c
kyy = lambda tdist,xdist: np.exp(-lyt*(tdist) -lyx*(xdist))
k1 = lambda tdist: (2*lyt - 4*lyt**2 * (tdist) )
k2 = lambda xdist: ( 4*lyx**2 * (xdist) - 2*lyx )
k3 = lambda xdist: ( 3*4*lyx**2 - 6*8*xdist*lyx**3 + 16*xdist**2*lyx**4 )
k4 = lambda ttdist: 2*lyt*(ttdist)
for i, s1 in enumerate(slices):
for j, s2 in enumerate(slices2):
for ss1 in s1:
for ss2 in s2:
if i==0 and j==0:
K[ss1,ss2] = vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
elif i==0 and j==1:
K[ss1,ss2] = (-a*k2(xdist[ss1,ss2]) + b*k4(ttdist[ss1,ss2]) + c)*vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
#K[ss1,ss2]= np.where( rdist[ss1,ss2]>0 , kuyp(np.abs(rdist[ss1,ss2])), kuyn(np.abs(rdist[ss1,ss2]) ) )
#K[ss1,ss2]= np.where( rdist[ss1,ss2]>0 , kuyp(rdist[ss1,ss2]), kuyn(rdist[ss1,ss2] ) )
elif i==1 and j==1:
K[ss1,ss2] = ( b**2*k1(tdist[ss1,ss2]) - 2*a*c*k2(xdist[ss1,ss2]) + a**2*k3(xdist[ss1,ss2]) + c**2 )* vyt*vyx* kyy(tdist[ss1,ss2],xdist[ss1,ss2])
else:
K[ss1,ss2] = (-a*k2(xdist[ss1,ss2]) - b*k4(ttdist[ss1,ss2]) + c)*vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
#K[ss1,ss2]= np.where( rdist[ss1,ss2]>0 , kyup(np.abs(rdist[ss1,ss2])), kyun(np.abs(rdist[ss1,ss2]) ) )
#K[ss1,ss2] = np.where( rdist[ss1,ss2]>0 , kyup(rdist[ss1,ss2]), kyun(rdist[ss1,ss2] ) )
#stop
return K
def Kdiag(self, X):
"""Compute the diagonal of the covariance matrix associated to X."""
vyt = self.variance_Yt
vyx = self.variance_Yx
lyt = 1./(2*self.lengthscale_Yt)
lyx = 1./(2*self.lengthscale_Yx)
a = self.a
b = self.b
c = self.c
## dk^2/dtdt'
k1 = (2*lyt )*vyt*vyx
## dk^2/dx^2
k2 = ( - 2*lyx )*vyt*vyx
## dk^4/dx^2dx'^2
k3 = ( 4*3*lyx**2 )*vyt*vyx
Kdiag = np.zeros(X.shape[0])
slices = index_to_slices(X[:,-1])
for i, ss1 in enumerate(slices):
for s1 in ss1:
if i==0:
Kdiag[s1]+= vyt*vyx
elif i==1:
#i=1
Kdiag[s1]+= b**2*k1 - 2*a*c*k2 + a**2*k3 + c**2*vyt*vyx
#Kdiag[s1]+= Vu*Vy*(k1+k2+k3)
else:
raise ValueError, "invalid input/output index"
return Kdiag
def update_gradients_full(self, dL_dK, X, X2=None):
#def dK_dtheta(self, dL_dK, X, X2, target):
"""derivative of the covariance matrix with respect to the parameters."""
X,slices = X[:,:-1],index_to_slices(X[:,-1])
if X2 is None:
X2,slices2 = X,slices
K = np.zeros((X.shape[0], X.shape[0]))
else:
X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
vyt = self.variance_Yt
vyx = self.variance_Yx
lyt = 1./(2*self.lengthscale_Yt)
lyx = 1./(2*self.lengthscale_Yx)
a = self.a
b = self.b
c = self.c
tdist = (X[:,0][:,None] - X2[:,0][None,:])**2
xdist = (X[:,1][:,None] - X2[:,1][None,:])**2
#rdist = [tdist,xdist]
ttdist = (X[:,0][:,None] - X2[:,0][None,:])
rd=tdist.shape[0]
dka = np.zeros([rd,rd])
dkb = np.zeros([rd,rd])
dkc = np.zeros([rd,rd])
dkYdvart = np.zeros([rd,rd])
dkYdvarx = np.zeros([rd,rd])
dkYdlent = np.zeros([rd,rd])
dkYdlenx = np.zeros([rd,rd])
kyy = lambda tdist,xdist: np.exp(-lyt*(tdist) -lyx*(xdist))
#k1 = lambda tdist: (lyt - lyt**2 * (tdist) )
#k2 = lambda xdist: ( lyx**2 * (xdist) - lyx )
#k3 = lambda xdist: ( 3*lyx**2 - 6*xdist*lyx**3 + xdist**2*lyx**4 )
#k4 = lambda tdist: -lyt*np.sqrt(tdist)
k1 = lambda tdist: (2*lyt - 4*lyt**2 * (tdist) )
k2 = lambda xdist: ( 4*lyx**2 * (xdist) - 2*lyx )
k3 = lambda xdist: ( 3*4*lyx**2 - 6*8*xdist*lyx**3 + 16*xdist**2*lyx**4 )
k4 = lambda ttdist: 2*lyt*(ttdist)
dkyydlyx = lambda tdist,xdist: kyy(tdist,xdist)*(-xdist)
dkyydlyt = lambda tdist,xdist: kyy(tdist,xdist)*(-tdist)
dk1dlyt = lambda tdist: 2. - 4*2.*lyt*tdist
dk2dlyx = lambda xdist: (4.*2.*lyx*xdist -2.)
dk3dlyx = lambda xdist: (6.*4.*lyx - 18.*8*xdist*lyx**2 + 4*16*xdist**2*lyx**3)
dk4dlyt = lambda ttdist: 2*(ttdist)
for i, s1 in enumerate(slices):
for j, s2 in enumerate(slices2):
for ss1 in s1:
for ss2 in s2:
if i==0 and j==0:
dka[ss1,ss2] = 0
dkb[ss1,ss2] = 0
dkc[ss1,ss2] = 0
dkYdvart[ss1,ss2] = vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
dkYdvarx[ss1,ss2] = vyt*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
dkYdlenx[ss1,ss2] = vyt*vyx*dkyydlyx(tdist[ss1,ss2],xdist[ss1,ss2])
dkYdlent[ss1,ss2] = vyt*vyx*dkyydlyt(tdist[ss1,ss2],xdist[ss1,ss2])
elif i==0 and j==1:
dka[ss1,ss2] = -k2(xdist[ss1,ss2])*vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
dkb[ss1,ss2] = k4(ttdist[ss1,ss2])*vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
dkc[ss1,ss2] = vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
#dkYdvart[ss1,ss2] = 0
#dkYdvarx[ss1,ss2] = 0
#dkYdlent[ss1,ss2] = 0
#dkYdlenx[ss1,ss2] = 0
dkYdvart[ss1,ss2] = (-a*k2(xdist[ss1,ss2])+b*k4(ttdist[ss1,ss2])+c)*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
dkYdvarx[ss1,ss2] = (-a*k2(xdist[ss1,ss2])+b*k4(ttdist[ss1,ss2])+c)*vyt*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
dkYdlent[ss1,ss2] = vyt*vyx*dkyydlyt(tdist[ss1,ss2],xdist[ss1,ss2])* (-a*k2(xdist[ss1,ss2])+b*k4(ttdist[ss1,ss2])+c)+\
vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])*b*dk4dlyt(ttdist[ss1,ss2])
dkYdlenx[ss1,ss2] = vyt*vyx*dkyydlyx(tdist[ss1,ss2],xdist[ss1,ss2])*(-a*k2(xdist[ss1,ss2])+b*k4(ttdist[ss1,ss2])+c)+\
vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])*(-a*dk2dlyx(xdist[ss1,ss2]))
elif i==1 and j==1:
dka[ss1,ss2] = (2*a*k3(xdist[ss1,ss2]) - 2*c*k2(xdist[ss1,ss2]))*vyt*vyx* kyy(tdist[ss1,ss2],xdist[ss1,ss2])
dkb[ss1,ss2] = 2*b*k1(tdist[ss1,ss2])*vyt*vyx* kyy(tdist[ss1,ss2],xdist[ss1,ss2])
dkc[ss1,ss2] = (-2*a*k2(xdist[ss1,ss2]) + 2*c )*vyt*vyx* kyy(tdist[ss1,ss2],xdist[ss1,ss2])
dkYdvart[ss1,ss2] = ( b**2*k1(tdist[ss1,ss2]) - 2*a*c*k2(xdist[ss1,ss2]) + a**2*k3(xdist[ss1,ss2]) + c**2 )*vyx* kyy(tdist[ss1,ss2],xdist[ss1,ss2])
dkYdvarx[ss1,ss2] = ( b**2*k1(tdist[ss1,ss2]) - 2*a*c*k2(xdist[ss1,ss2]) + a**2*k3(xdist[ss1,ss2]) + c**2 )*vyt* kyy(tdist[ss1,ss2],xdist[ss1,ss2])
dkYdlent[ss1,ss2] = vyt*vyx*dkyydlyt(tdist[ss1,ss2],xdist[ss1,ss2])*( b**2*k1(tdist[ss1,ss2]) - 2*a*c*k2(xdist[ss1,ss2]) + a**2*k3(xdist[ss1,ss2]) + c**2 ) +\
vyx*vyt*kyy(tdist[ss1,ss2],xdist[ss1,ss2])*b**2*dk1dlyt(tdist[ss1,ss2])
dkYdlenx[ss1,ss2] = vyt*vyx*dkyydlyx(tdist[ss1,ss2],xdist[ss1,ss2])*( b**2*k1(tdist[ss1,ss2]) - 2*a*c*k2(xdist[ss1,ss2]) + a**2*k3(xdist[ss1,ss2]) + c**2 ) +\
vyx*vyt*kyy(tdist[ss1,ss2],xdist[ss1,ss2])* (-2*a*c*dk2dlyx(xdist[ss1,ss2]) + a**2*dk3dlyx(xdist[ss1,ss2]) )
else:
dka[ss1,ss2] = -k2(xdist[ss1,ss2])*vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
dkb[ss1,ss2] = -k4(ttdist[ss1,ss2])*vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
dkc[ss1,ss2] = vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
#dkYdvart[ss1,ss2] = 0
#dkYdvarx[ss1,ss2] = 0
#dkYdlent[ss1,ss2] = 0
#dkYdlenx[ss1,ss2] = 0
dkYdvart[ss1,ss2] = (-a*k2(xdist[ss1,ss2])-b*k4(ttdist[ss1,ss2])+c)*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
dkYdvarx[ss1,ss2] = (-a*k2(xdist[ss1,ss2])-b*k4(ttdist[ss1,ss2])+c)*vyt*kyy(tdist[ss1,ss2],xdist[ss1,ss2])
dkYdlent[ss1,ss2] = vyt*vyx*dkyydlyt(tdist[ss1,ss2],xdist[ss1,ss2])* (-a*k2(xdist[ss1,ss2])-b*k4(ttdist[ss1,ss2])+c)+\
vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])*(-1)*b*dk4dlyt(ttdist[ss1,ss2])
dkYdlenx[ss1,ss2] = vyt*vyx*dkyydlyx(tdist[ss1,ss2],xdist[ss1,ss2])*(-a*k2(xdist[ss1,ss2])-b*k4(ttdist[ss1,ss2])+c)+\
vyt*vyx*kyy(tdist[ss1,ss2],xdist[ss1,ss2])*(-a*dk2dlyx(xdist[ss1,ss2]))
self.a.gradient = np.sum(dka * dL_dK)
self.b.gradient = np.sum(dkb * dL_dK)
self.c.gradient = np.sum(dkc * dL_dK)
self.variance_Yt.gradient = np.sum(dkYdvart * dL_dK) # Vy
self.variance_Yx.gradient = np.sum(dkYdvarx * dL_dK)
self.lengthscale_Yt.gradient = np.sum(dkYdlent*(-0.5*self.lengthscale_Yt**(-2)) * dL_dK) #ly np.sum(dktheta2*(-self.lengthscale_Y**(-2)) * dL_dK)
self.lengthscale_Yx.gradient = np.sum(dkYdlenx*(-0.5*self.lengthscale_Yx**(-2)) * dL_dK)
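# Usage sketch (hedged: assumes ODE_st is exposed on GPy.kern). X has three columns: time,
# space and the output index in the last column; judging from Kdiag above, index 0 selects
# the plain spatio-temporal RBF block (the output y) and index 1 the operator-applied block
# (the forcing term U).
#
#   import numpy as np
#   import GPy
#   t = np.linspace(0., 5., 10)
#   x = np.linspace(0., 1., 10)
#   base = np.column_stack([t, x])
#   X = np.vstack([np.column_stack([base, np.zeros(10)]),   # y rows, index 0
#                  np.column_stack([base, np.ones(10)])])   # U rows, index 1
#   k = GPy.kern.ODE_st(input_dim=3)
#   K = k.K(X)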

165
GPy/kern/_src/ODE_t.py Normal file

@@ -0,0 +1,165 @@
from kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
import numpy as np
from independent_outputs import index_to_slices
class ODE_t(Kern):
def __init__(self, input_dim, a=1., c=1., variance_Yt=3., lengthscale_Yt=1.5, ubias=1., active_dims=None, name='ode_t'):
assert input_dim ==2, "only defined for 2 input dims"
super(ODE_t, self).__init__(input_dim, active_dims, name)
self.variance_Yt = Param('variance_Yt', variance_Yt, Logexp())
self.lengthscale_Yt = Param('lengthscale_Yt', lengthscale_Yt, Logexp())
self.a= Param('a', a, Logexp())
self.c = Param('c', c, Logexp())
self.ubias = Param('ubias', ubias, Logexp())
self.add_parameters(self.a, self.c, self.variance_Yt, self.lengthscale_Yt,self.ubias)
def K(self, X, X2=None):
"""Compute the covariance matrix between X and X2."""
X,slices = X[:,:-1],index_to_slices(X[:,-1])
if X2 is None:
X2,slices2 = X,slices
K = np.zeros((X.shape[0], X.shape[0]))
else:
X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
K = np.zeros((X.shape[0], X2.shape[0]))
tdist = (X[:,0][:,None] - X2[:,0][None,:])**2
ttdist = (X[:,0][:,None] - X2[:,0][None,:])
vyt = self.variance_Yt
lyt=1/(2*self.lengthscale_Yt)
a = -self.a
c = self.c
kyy = lambda tdist: np.exp(-lyt*(tdist))
k1 = lambda tdist: (2*lyt - 4*lyt**2 *(tdist) )
k4 = lambda tdist: 2*lyt*(tdist)
for i, s1 in enumerate(slices):
for j, s2 in enumerate(slices2):
for ss1 in s1:
for ss2 in s2:
if i==0 and j==0:
K[ss1,ss2] = vyt*kyy(tdist[ss1,ss2])
elif i==0 and j==1:
K[ss1,ss2] = (k4(ttdist[ss1,ss2])+1)*vyt*kyy(tdist[ss1,ss2])
#K[ss1,ss2] = (2*lyt*(ttdist[ss1,ss2])+1)*vyt*kyy(tdist[ss1,ss2])
elif i==1 and j==1:
K[ss1,ss2] = ( k1(tdist[ss1,ss2]) + 1. )*vyt* kyy(tdist[ss1,ss2])+self.ubias
else:
K[ss1,ss2] = (-k4(ttdist[ss1,ss2])+1)*vyt*kyy(tdist[ss1,ss2])
#K[ss1,ss2] = (-2*lyt*(ttdist[ss1,ss2])+1)*vyt*kyy(tdist[ss1,ss2])
#stop
return K
def Kdiag(self, X):
vyt = self.variance_Yt
lyt = 1./(2*self.lengthscale_Yt)
a = -self.a
c = self.c
k1 = (2*lyt )*vyt
Kdiag = np.zeros(X.shape[0])
slices = index_to_slices(X[:,-1])
for i, ss1 in enumerate(slices):
for s1 in ss1:
if i==0:
Kdiag[s1]+= vyt
elif i==1:
#i=1
Kdiag[s1]+= k1 + vyt+self.ubias
#Kdiag[s1]+= Vu*Vy*(k1+k2+k3)
else:
raise ValueError, "invalid input/output index"
return Kdiag
def update_gradients_full(self, dL_dK, X, X2=None):
"""derivative of the covariance matrix with respect to the parameters."""
X,slices = X[:,:-1],index_to_slices(X[:,-1])
if X2 is None:
X2,slices2 = X,slices
K = np.zeros((X.shape[0], X.shape[0]))
else:
X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
vyt = self.variance_Yt
lyt = 1./(2*self.lengthscale_Yt)
tdist = (X[:,0][:,None] - X2[:,0][None,:])**2
ttdist = (X[:,0][:,None] - X2[:,0][None,:])
#rdist = [tdist,xdist]
rd=tdist.shape[0]
dka = np.zeros([rd,rd])
dkc = np.zeros([rd,rd])
dkYdvart = np.zeros([rd,rd])
dkYdlent = np.zeros([rd,rd])
dkdubias = np.zeros([rd,rd])
kyy = lambda tdist: np.exp(-lyt*(tdist))
dkyydlyt = lambda tdist: kyy(tdist)*(-tdist)
k1 = lambda tdist: (2*lyt - 4*lyt**2 * (tdist) )
k4 = lambda ttdist: 2*lyt*(ttdist)
dk1dlyt = lambda tdist: 2. - 4*2.*lyt*tdist
dk4dlyt = lambda ttdist: 2*(ttdist)
for i, s1 in enumerate(slices):
for j, s2 in enumerate(slices2):
for ss1 in s1:
for ss2 in s2:
if i==0 and j==0:
dkYdvart[ss1,ss2] = kyy(tdist[ss1,ss2])
dkYdlent[ss1,ss2] = vyt*dkyydlyt(tdist[ss1,ss2])
dkdubias[ss1,ss2] = 0
elif i==0 and j==1:
dkYdvart[ss1,ss2] = (k4(ttdist[ss1,ss2])+1)*kyy(tdist[ss1,ss2])
#dkYdvart[ss1,ss2] = ((2*lyt*ttdist[ss1,ss2])+1)*kyy(tdist[ss1,ss2])
dkYdlent[ss1,ss2] = vyt*dkyydlyt(tdist[ss1,ss2])* (k4(ttdist[ss1,ss2])+1.)+\
vyt*kyy(tdist[ss1,ss2])*(dk4dlyt(ttdist[ss1,ss2]))
#dkYdlent[ss1,ss2] = vyt*dkyydlyt(tdist[ss1,ss2])* (2*lyt*(ttdist[ss1,ss2])+1.)+\
#vyt*kyy(tdist[ss1,ss2])*(2*ttdist[ss1,ss2])
dkdubias[ss1,ss2] = 0
elif i==1 and j==1:
dkYdvart[ss1,ss2] = (k1(tdist[ss1,ss2]) + 1. )* kyy(tdist[ss1,ss2])
dkYdlent[ss1,ss2] = vyt*dkyydlyt(tdist[ss1,ss2])*( k1(tdist[ss1,ss2]) + 1. ) +\
vyt*kyy(tdist[ss1,ss2])*dk1dlyt(tdist[ss1,ss2])
dkdubias[ss1,ss2] = 1
else:
dkYdvart[ss1,ss2] = (-k4(ttdist[ss1,ss2])+1)*kyy(tdist[ss1,ss2])
#dkYdvart[ss1,ss2] = (-2*lyt*(ttdist[ss1,ss2])+1)*kyy(tdist[ss1,ss2])
dkYdlent[ss1,ss2] = vyt*dkyydlyt(tdist[ss1,ss2])* (-k4(ttdist[ss1,ss2])+1.)+\
vyt*kyy(tdist[ss1,ss2])*(-dk4dlyt(ttdist[ss1,ss2]) )
dkdubias[ss1,ss2] = 0
#dkYdlent[ss1,ss2] = vyt*dkyydlyt(tdist[ss1,ss2])* (-2*lyt*(ttdist[ss1,ss2])+1.)+\
#vyt*kyy(tdist[ss1,ss2])*(-2)*(ttdist[ss1,ss2])
self.variance_Yt.gradient = np.sum(dkYdvart * dL_dK)
self.lengthscale_Yt.gradient = np.sum(dkYdlent*(-0.5*self.lengthscale_Yt**(-2)) * dL_dK)
self.ubias.gradient = np.sum(dkdubias * dL_dK)


188
GPy/kern/_src/add.py Normal file

@@ -0,0 +1,188 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import itertools
from ...util.caching import Cache_this
from kern import CombinationKernel
class Add(CombinationKernel):
"""
Add given list of kernels together.
propagates gradients through.
This kernel will take over the active dims of the subkernels passed in.
"""
def __init__(self, subkerns, name='add'):
for i, kern in enumerate(subkerns[:]):
if isinstance(kern, Add):
del subkerns[i]
for part in kern.parts[::-1]:
kern.unlink_parameter(part)
subkerns.insert(i, part)
super(Add, self).__init__(subkerns, name)
@Cache_this(limit=2, force_kwargs=['which_parts'])
def K(self, X, X2=None, which_parts=None):
"""
Add all kernels together.
If a list of parts (of this kernel!) `which_parts` is given, only
the parts of the list are taken to compute the covariance.
"""
if which_parts is None:
which_parts = self.parts
elif not isinstance(which_parts, (list, tuple)):
# if only one part is given
which_parts = [which_parts]
return reduce(np.add, (p.K(X, X2) for p in which_parts))
@Cache_this(limit=2, force_kwargs=['which_parts'])
def Kdiag(self, X, which_parts=None):
if which_parts is None:
which_parts = self.parts
elif not isinstance(which_parts, (list, tuple)):
# if only one part is given
which_parts = [which_parts]
return reduce(np.add, (p.Kdiag(X) for p in which_parts))
def update_gradients_full(self, dL_dK, X, X2=None):
[p.update_gradients_full(dL_dK, X, X2) for p in self.parts if not p.is_fixed]
def update_gradients_diag(self, dL_dK, X):
[p.update_gradients_diag(dL_dK, X) for p in self.parts]
def gradients_X(self, dL_dK, X, X2=None):
"""Compute the gradient of the objective function with respect to X.
:param dL_dK: An array of gradients of the objective function with respect to the covariance function.
:type dL_dK: np.ndarray (num_samples x num_inducing)
:param X: Observed data inputs
:type X: np.ndarray (num_samples x input_dim)
:param X2: Observed data inputs (optional, defaults to X)
:type X2: np.ndarray (num_inducing x input_dim)"""
target = np.zeros(X.shape)
[target.__iadd__(p.gradients_X(dL_dK, X, X2)) for p in self.parts]
return target
def gradients_X_diag(self, dL_dKdiag, X):
target = np.zeros(X.shape)
[target.__iadd__(p.gradients_X_diag(dL_dKdiag, X)) for p in self.parts]
return target
@Cache_this(limit=2, force_kwargs=['which_parts'])
def psi0(self, Z, variational_posterior):
return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts))
@Cache_this(limit=2, force_kwargs=['which_parts'])
def psi1(self, Z, variational_posterior):
return reduce(np.add, (p.psi1(Z, variational_posterior) for p in self.parts))
@Cache_this(limit=2, force_kwargs=['which_parts'])
def psi2(self, Z, variational_posterior):
psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts))
#return psi2
# compute the "cross" terms
from static import White, Bias
from rbf import RBF
#from rbf_inv import RBFInv
from linear import Linear
#ffrom fixed import Fixed
for p1, p2 in itertools.combinations(self.parts, 2):
# i1, i2 = p1.active_dims, p2.active_dims
# white doesn't combine with anything
if isinstance(p1, White) or isinstance(p2, White):
pass
# rbf X bias
#elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)):
elif isinstance(p1, Bias) and isinstance(p2, (RBF, Linear)):
tmp = p2.psi1(Z, variational_posterior).sum(axis=0)
psi2 += p1.variance * (tmp[:,None]+tmp[None,:]) #(tmp[:, :, None] + tmp[:, None, :])
#elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
elif isinstance(p2, Bias) and isinstance(p1, (RBF, Linear)):
tmp = p1.psi1(Z, variational_posterior).sum(axis=0)
psi2 += p2.variance * (tmp[:,None]+tmp[None,:]) #(tmp[:, :, None] + tmp[:, None, :])
elif isinstance(p2, (RBF, Linear)) and isinstance(p1, (RBF, Linear)):
assert np.intersect1d(p1.active_dims, p2.active_dims).size == 0, "only non overlapping kernel dimensions allowed so far"
tmp1 = p1.psi1(Z, variational_posterior)
tmp2 = p2.psi1(Z, variational_posterior)
psi2 += np.einsum('nm,no->mo',tmp1,tmp2)+np.einsum('nm,no->mo',tmp2,tmp1)
#(tmp1[:, :, None] * tmp2[:, None, :]) + (tmp2[:, :, None] * tmp1[:, None, :])
else:
raise NotImplementedError, "psi2 cannot be computed for this kernel"
return psi2
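# Why the cross terms above are needed: for a sum kernel k = k_a + k_b,
#   psi2_{mo} = E_q(X)[ sum_n k(x_n, z_m) k(x_n, z_o) ]
# contains, besides psi2 of each part, the mixed expectations
#   E[ sum_n k_a(x_n, z_m) k_b(x_n, z_o) + k_b(x_n, z_m) k_a(x_n, z_o) ].
# These only reduce to products of psi1 statistics when one part is constant (Bias) or when
# the two parts act on non-overlapping input dimensions (so the expectation factorises),
# which is exactly what the isinstance checks and the active_dims assertion enforce; White
# adds no cross term.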
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
from static import White, Bias
for p1 in self.parts:
#compute the effective dL_dpsi1. Extra terms appear because of the cross terms in psi2!
eff_dL_dpsi1 = dL_dpsi1.copy()
for p2 in self.parts:
if p2 is p1:
continue
if isinstance(p2, White):
continue
elif isinstance(p2, Bias):
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2.
else:# np.setdiff1d(p1.active_dims, ar2, assume_unique): # TODO: Careful, not correct for overlapping active_dims
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2.
p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
def gradients_Z_expectations(self, dL_psi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
from static import White, Bias
target = np.zeros(Z.shape)
for p1 in self.parts:
#compute the effective dL_dpsi1. Extra terms appear because of the cross terms in psi2!
eff_dL_dpsi1 = dL_dpsi1.copy()
for p2 in self.parts:
if p2 is p1:
continue
if isinstance(p2, White):
continue
elif isinstance(p2, Bias):
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2.
else:
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2.
target += p1.gradients_Z_expectations(dL_psi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
return target
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
from static import White, Bias
target_grads = [np.zeros(v.shape) for v in variational_posterior.parameters]
for p1 in self.parameters:
#compute the effective dL_dpsi1. Extra terms appear because of the cross terms in psi2!
eff_dL_dpsi1 = dL_dpsi1.copy()
for p2 in self.parameters:
if p2 is p1:
continue
if isinstance(p2, White):
continue
elif isinstance(p2, Bias):
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.variance * 2.
else:
eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2.
grads = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior)
[np.add(target_grads[i],grads[i],target_grads[i]) for i in xrange(len(grads))]
return target_grads
def add(self, other):
if isinstance(other, Add):
other_params = other.parameters[:]
for p in other_params:
other.unlink_parameter(p)
self.link_parameters(*other_params)
else:
self.link_parameter(other)
self.input_dim, self.active_dims = self.get_input_dim_active_dims(self.parts)
return self
def input_sensitivity(self, summarize=True):
if summarize:
return reduce(np.add, [k.input_sensitivity(summarize) for k in self.parts])
else:
i_s = np.zeros((len(self.parts), self.input_dim))
from operator import setitem
[setitem(i_s, (i, Ellipsis), k.input_sensitivity(summarize)) for i, k in enumerate(self.parts)]
return i_s
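# Usage sketch (hedged: relies on the standard GPy behaviour that summing kernels builds an
# Add):
#
#   import numpy as np
#   import GPy
#   k = GPy.kern.RBF(1) + GPy.kern.Bias(1)       # k is an Add holding both parts
#   X = np.random.randn(5, 1)
#   K = k.K(X)                                   # RBF covariance plus the Bias variance
#   K1 = k.K(X, which_parts=k.parts[0])          # covariance of the RBF part alone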

50
GPy/kern/_src/brownian.py Normal file

@@ -0,0 +1,50 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
import numpy as np
class Brownian(Kern):
"""
Brownian motion in 1D only.
Negative times are treated as a separate (backwards!) Brownian motion.
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance:
:type variance: float
"""
def __init__(self, input_dim=1, variance=1., active_dims=None, name='Brownian'):
assert input_dim==1, "Brownian motion in 1D only"
super(Brownian, self).__init__(input_dim, active_dims, name)
self.variance = Param('variance', variance, Logexp())
self.link_parameters(self.variance)
def K(self,X,X2=None):
if X2 is None:
X2 = X
return self.variance*np.where(np.sign(X)==np.sign(X2.T),np.fmin(np.abs(X),np.abs(X2.T)), 0.)
def Kdiag(self,X):
return self.variance*np.abs(X.flatten())
def update_gradients_full(self, dL_dK, X, X2=None):
if X2 is None:
X2 = X
self.variance.gradient = np.sum(dL_dK * np.where(np.sign(X)==np.sign(X2.T),np.fmin(np.abs(X),np.abs(X2.T)), 0.))
#def update_gradients_diag(self, dL_dKdiag, X):
#self.variance.gradient = np.dot(np.abs(X.flatten()), dL_dKdiag)
#def gradients_X(self, dL_dK, X, X2=None):
#if X2 is None:
#return np.sum(self.variance*dL_dK*np.abs(X),1)[:,None]
#else:
#return np.sum(np.where(np.logical_and(np.abs(X)<np.abs(X2.T), np.sign(X)==np.sign(X2)), self.variance*dL_dK,0.),1)[:,None]
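# Covariance implemented above, written out (directly from K()):
#   k(s, t) = variance * min(|s|, |t|)   if sign(s) == sign(t)
#           = 0                          otherwise,
# i.e. standard Brownian motion for non-negative times, with an independent (time-reversed)
# Brownian motion on the negative half-line.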


@@ -0,0 +1,174 @@
# Copyright (c) 2012, James Hensman and Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kern import Kern
import numpy as np
from scipy import weave
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
from ...util.config import config # for assesing whether to use weave
class Coregionalize(Kern):
"""
Covariance function for intrinsic/linear coregionalization models
This covariance has the form:
.. math::
\mathbf{B} = \mathbf{W}\mathbf{W}^\top + \text{diag}(kappa)
An intrinsic/linear coregionalization covariance function of the form:
.. math::
k_2(x, y)=\mathbf{B} k(x, y)
it is obtained as the tensor product between a covariance function
k(x, y) and B.
:param output_dim: number of outputs to coregionalize
:type output_dim: int
:param rank: number of columns of the W matrix (this parameter is ignored if parameter W is not None)
:type rank: int
:param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalization matrix B
:type W: numpy array of dimensionality (num_outpus, W_columns)
:param kappa: a vector which allows the outputs to behave independently
:type kappa: numpy array of dimensionality (output_dim, )
.. note: see coregionalization examples in GPy.examples.regression for some usage.
"""
def __init__(self, input_dim, output_dim, rank=1, W=None, kappa=None, active_dims=None, name='coregion'):
super(Coregionalize, self).__init__(input_dim, active_dims, name=name)
self.output_dim = output_dim
self.rank = rank
if self.rank>output_dim:
print("Warning: Unusual choice of rank, it should normally be less than the output_dim.")
if W is None:
W = 0.5*np.random.randn(self.output_dim, self.rank)/np.sqrt(self.rank)
else:
assert W.shape==(self.output_dim, self.rank)
self.W = Param('W', W)
if kappa is None:
kappa = 0.5*np.ones(self.output_dim)
else:
assert kappa.shape==(self.output_dim, )
self.kappa = Param('kappa', kappa, Logexp())
self.link_parameters(self.W, self.kappa)
def parameters_changed(self):
self.B = np.dot(self.W, self.W.T) + np.diag(self.kappa)
def K(self, X, X2=None):
if config.getboolean('weave', 'working'):
try:
return self._K_weave(X, X2)
except:
print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
config.set('weave', 'working', 'False')
return self._K_numpy(X, X2)
else:
return self._K_numpy(X, X2)
def _K_numpy(self, X, X2=None):
index = np.asarray(X, dtype=np.int)
if X2 is None:
return self.B[index,index.T]
else:
index2 = np.asarray(X2, dtype=np.int)
return self.B[index,index2.T]
def _K_weave(self, X, X2=None):
"""compute the kernel function using scipy.weave"""
index = np.asarray(X, dtype=np.int)
if X2 is None:
target = np.empty((X.shape[0], X.shape[0]), dtype=np.float64)
code="""
for(int i=0;i<N; i++){
target[i+i*N] = B[index[i]+output_dim*index[i]];
for(int j=0; j<i; j++){
target[j+i*N] = B[index[i]+output_dim*index[j]];
target[i+j*N] = target[j+i*N];
}
}
"""
N, B, output_dim = index.size, self.B, self.output_dim
weave.inline(code, ['target', 'index', 'N', 'B', 'output_dim'])
else:
index2 = np.asarray(X2, dtype=np.int)
target = np.empty((X.shape[0], X2.shape[0]), dtype=np.float64)
code="""
for(int i=0;i<num_inducing; i++){
for(int j=0; j<N; j++){
target[i+j*num_inducing] = B[output_dim*index[j]+index2[i]];
}
}
"""
N, num_inducing, B, output_dim = index.size, index2.size, self.B, self.output_dim
weave.inline(code, ['target', 'index', 'index2', 'N', 'num_inducing', 'B', 'output_dim'])
return target
def Kdiag(self, X):
return np.diag(self.B)[np.asarray(X, dtype=np.int).flatten()]
def update_gradients_full(self, dL_dK, X, X2=None):
index = np.asarray(X, dtype=np.int)
if X2 is None:
index2 = index
else:
index2 = np.asarray(X2, dtype=np.int)
#attempt to use weave for a nasty double indexing loop: fall back to numpy
if config.getboolean('weave', 'working'):
try:
dL_dK_small = self._gradient_reduce_weave(dL_dK, index, index2)
except:
print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
config.set('weave', 'working', 'False')
dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)
else:
dL_dK_small = self._gradient_reduce_numpy(dL_dK, index, index2)
dkappa = np.diag(dL_dK_small)
dL_dK_small += dL_dK_small.T
dW = (self.W[:, None, :]*dL_dK_small[:, :, None]).sum(0)
self.W.gradient = dW
self.kappa.gradient = dkappa
def _gradient_reduce_weave(self, dL_dK, index, index2):
dL_dK_small = np.zeros_like(self.B)
code="""
for(int i=0; i<num_inducing; i++){
for(int j=0; j<N; j++){
dL_dK_small[index[j] + output_dim*index2[i]] += dL_dK[i+j*num_inducing];
}
}
"""
N, num_inducing, output_dim = index.size, index2.size, self.output_dim
weave.inline(code, ['N', 'num_inducing', 'output_dim', 'dL_dK', 'dL_dK_small', 'index', 'index2'])
return dL_dK_small
def _gradient_reduce_numpy(self, dL_dK, index, index2):
index, index2 = index[:,0], index2[:,0]
dL_dK_small = np.zeros_like(self.B)
for i in range(self.output_dim):
tmp1 = dL_dK[index==i]
for j in range(self.output_dim):
dL_dK_small[j,i] = tmp1[:,index2==j].sum()
return dL_dK_small
def update_gradients_diag(self, dL_dKdiag, X):
index = np.asarray(X, dtype=np.int).flatten()
dL_dKdiag_small = np.array([dL_dKdiag[index==i].sum() for i in xrange(self.output_dim)])
self.W.gradient = 2.*self.W*dL_dKdiag_small[:, None]
self.kappa.gradient = dL_dKdiag_small
def gradients_X(self, dL_dK, X, X2=None):
return np.zeros(X.shape)
def gradients_X_diag(self, dL_dKdiag, X):
return np.zeros(X.shape)
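# Usage sketch (hedged: the exact composition syntax varies between GPy versions; active_dims
# must route the index column to this kernel). An intrinsic coregionalization model over two
# outputs can be built as a product with a data kernel:
#
#   import GPy
#   k = GPy.kern.RBF(1, active_dims=[0]) \
#       * GPy.kern.Coregionalize(1, output_dim=2, rank=1, active_dims=[1])
#   # X carries the output index in column 1; the effective covariance is
#   # K_rbf(x, x') * B[i, i'] with B = W W^T + diag(kappa).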


@@ -0,0 +1,202 @@
# Copyright (c) 2012, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kern import Kern, CombinationKernel
import numpy as np
import itertools
def index_to_slices(index):
"""
Take a numpy array of integers (index) and return a nested list of slices, such that the slices describe the start and stop points for each integer in the index.
e.g.
>>> index = np.asarray([0,0,0,1,1,1,2,2,2])
returns
>>> [[slice(0,3,None)],[slice(3,6,None)],[slice(6,9,None)]]
or, a more complicated example
>>> index = np.asarray([0,0,1,1,0,2,2,2,1,1])
returns
>>> [[slice(0,2,None),slice(4,5,None)],[slice(2,4,None),slice(8,10,None)],[slice(5,8,None)]]
"""
if len(index)==0:
return []
#construct the return structure
ind = np.asarray(index,dtype=np.int)
ret = [[] for i in range(ind.max()+1)]
#find the switchpoints
ind_ = np.hstack((ind,ind[0]+ind[-1]+1))
switchpoints = np.nonzero(ind_ - np.roll(ind_,+1))[0]
[ret[ind_i].append(slice(*indexes_i)) for ind_i,indexes_i in zip(ind[switchpoints[:-1]],zip(switchpoints,switchpoints[1:]))]
return ret
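# A quick sanity check of index_to_slices, reproducing the first docstring example:
_slices = index_to_slices(np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2]))
assert _slices == [[slice(0, 3, None)], [slice(3, 6, None)], [slice(6, 9, None)]]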
class IndependentOutputs(CombinationKernel):
"""
A kernel which can represent several independent functions. This kernel
'switches off' parts of the matrix where the output indexes are different.
The index of the functions is given by the last column of the input X; the
rest of the columns of X are passed to the underlying kernel for
computation (in blocks).
:param kernels: either a single kernel, or a list of kernels to work with. If a
list of kernels is given, the values in the index_dim column select which kernel in the list applies.
"""
def __init__(self, kernels, index_dim=-1, name='independ'):
assert isinstance(index_dim, int), "IndependentOutputs kernel is only defined with one input dimension being the index"
if not isinstance(kernels, list):
self.single_kern = True
self.kern = kernels
kernels = [kernels]
else:
self.single_kern = False
self.kern = kernels
super(IndependentOutputs, self).__init__(kernels=kernels, extra_dims=[index_dim], name=name)
self.index_dim = index_dim
def K(self,X ,X2=None):
slices = index_to_slices(X[:,self.index_dim])
kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
if X2 is None:
target = np.zeros((X.shape[0], X.shape[0]))
[[target.__setitem__((s,ss), kern.K(X[s,:], X[ss,:])) for s,ss in itertools.product(slices_i, slices_i)] for kern, slices_i in zip(kerns, slices)]
else:
slices2 = index_to_slices(X2[:,self.index_dim])
target = np.zeros((X.shape[0], X2.shape[0]))
[[target.__setitem__((s,s2), kern.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices_i, slices_j)] for kern, slices_i,slices_j in zip(kerns, slices,slices2)]
return target
def Kdiag(self,X):
slices = index_to_slices(X[:,self.index_dim])
kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
target = np.zeros(X.shape[0])
[[np.copyto(target[s], kern.Kdiag(X[s])) for s in slices_i] for kern, slices_i in zip(kerns, slices)]
return target
def update_gradients_full(self,dL_dK,X,X2=None):
slices = index_to_slices(X[:,self.index_dim])
if self.single_kern:
target = np.zeros(self.kern.size)
kerns = itertools.repeat(self.kern)
else:
kerns = self.kern
target = [np.zeros(kern.size) for kern, _ in zip(kerns, slices)]
def collate_grads(kern, i, dL, X, X2):
kern.update_gradients_full(dL,X,X2)
if self.single_kern: target[:] += kern.gradient
else: target[i][:] += kern.gradient
if X2 is None:
[[collate_grads(kern, i, dL_dK[s,ss], X[s], X[ss]) for s,ss in itertools.product(slices_i, slices_i)] for i,(kern,slices_i) in enumerate(zip(kerns,slices))]
else:
slices2 = index_to_slices(X2[:,self.index_dim])
[[[collate_grads(kern, i, dL_dK[s,s2],X[s],X2[s2]) for s in slices_i] for s2 in slices_j] for i,(kern,slices_i,slices_j) in enumerate(zip(kerns,slices,slices2))]
if self.single_kern: kern.gradient = target
else:[kern.gradient.__setitem__(Ellipsis, target[i]) for i, [kern, _] in enumerate(zip(kerns, slices))]
def gradients_X(self,dL_dK, X, X2=None):
target = np.zeros(X.shape)
kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
if X2 is None:
# TODO: make use of index_to_slices
values = np.unique(X[:,self.index_dim])
slices = [X[:,self.index_dim]==i for i in values]
[target.__setitem__(s, kern.gradients_X(dL_dK[s,s],X[s],None))
for kern, s in zip(kerns, slices)]
#slices = index_to_slices(X[:,self.index_dim])
#[[np.add(target[s], kern.gradients_X(dL_dK[s,s], X[s]), out=target[s])
# for s in slices_i] for kern, slices_i in zip(kerns, slices)]
#import ipdb;ipdb.set_trace()
#[[(np.add(target[s ], kern.gradients_X(dL_dK[s ,ss],X[s ], X[ss]), out=target[s ]),
# np.add(target[ss], kern.gradients_X(dL_dK[ss,s ],X[ss], X[s ]), out=target[ss]))
# for s, ss in itertools.combinations(slices_i, 2)] for kern, slices_i in zip(kerns, slices)]
else:
values = np.unique(X[:,self.index_dim])
slices = [X[:,self.index_dim]==i for i in values]
slices2 = [X2[:,self.index_dim]==i for i in values]
[target.__setitem__(s, kern.gradients_X(dL_dK[s, :][:, s2],X[s],X2[s2]))
for kern, s, s2 in zip(kerns, slices, slices2)]
# TODO: make work with index_to_slices
#slices = index_to_slices(X[:,self.index_dim])
#slices2 = index_to_slices(X2[:,self.index_dim])
#[[target.__setitem__(s, target[s] + kern.gradients_X(dL_dK[s,s2], X[s], X2[s2])) for s, s2 in itertools.product(slices_i, slices_j)] for kern, slices_i,slices_j in zip(kerns, slices,slices2)]
return target
def gradients_X_diag(self, dL_dKdiag, X):
slices = index_to_slices(X[:,self.index_dim])
kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
target = np.zeros(X.shape)
[[target.__setitem__(s, kern.gradients_X_diag(dL_dKdiag[s],X[s])) for s in slices_i] for kern, slices_i in zip(kerns, slices)]
return target
def update_gradients_diag(self, dL_dKdiag, X):
slices = index_to_slices(X[:,self.index_dim])
kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
if self.single_kern: target = np.zeros(self.kern.size)
else: target = [np.zeros(kern.size) for kern, _ in zip(kerns, slices)]
def collate_grads(kern, i, dL, X):
kern.update_gradients_diag(dL,X)
if self.single_kern: target[:] += kern.gradient
else: target[i][:] += kern.gradient
[[collate_grads(kern, i, dL_dKdiag[s], X[s,:]) for s in slices_i] for i, (kern, slices_i) in enumerate(zip(kerns, slices))]
if self.single_kern: kern.gradient = target
else:[kern.gradient.__setitem__(Ellipsis, target[i]) for i, [kern, _] in enumerate(zip(kerns, slices))]
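# A minimal sketch of how the index column drives the block structure of K (illustrative
# only; the Gaussian function below is just a stand-in for a real kernel's K):
def _toy_K(Xa, Xb):
    return np.exp(-0.5*np.square(Xa[:, :1] - Xb[:, :1].T))
_X = np.hstack([np.random.rand(6, 1), np.repeat([0, 1], 3)[:, None]])  # last column = output index
_Kblocks = np.zeros((6, 6))
for _slices_i in index_to_slices(_X[:, -1]):
    for _s in _slices_i:
        _Kblocks[_s, _s] = _toy_K(_X[_s], _X[_s])   # blocks for differing outputs stay zero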
class Hierarchical(CombinationKernel):
"""
A kernel which can represent a simple hierarchical model.
See Hensman et al 2013, "Hierarchical Bayesian modelling of gene expression time
series across irregularly sampled replicates and clusters"
http://www.biomedcentral.com/1471-2105/14/252
To construct this kernel, you must pass a list of kernels. The first kernel
will be assumed to be the 'base' kernel, and will be computed everywhere.
For every additional kernel, we assume another layer in the hierarchy, with
a corresponding column of the input matrix which indexes which function the
data are in at that level.
For more, see the ipython notebook documentation on Hierarchical
covariances.
"""
def __init__(self, kernels, name='hierarchy'):
assert all([k.input_dim==kernels[0].input_dim for k in kernels])
assert len(kernels) > 1
self.levels = len(kernels) -1
input_max = max([k.input_dim for k in kernels])
super(Hierarchical, self).__init__(kernels=kernels, extra_dims = range(input_max, input_max + len(kernels)-1), name=name)
def K(self,X ,X2=None):
K = self.parts[0].K(X, X2) # compute 'base' kern everywhere
slices = [index_to_slices(X[:,i]) for i in self.extra_dims]
if X2 is None:
[[[np.add(K[s,s], k.K(X[s], None), K[s, s]) for s in slices_i] for slices_i in slices_k] for k, slices_k in zip(self.parts[1:], slices)]
else:
slices2 = [index_to_slices(X2[:,i]) for i in self.extra_dims]
[[[np.add(K[s,ss], k.K(X[s], X2[ss]), K[s, ss]) for s,ss in zip(slices_i, slices_j)] for slices_i, slices_j in zip(slices_k1, slices_k2)] for k, slices_k1, slices_k2 in zip(self.parts[1:], slices, slices2)]
return K
def Kdiag(self,X):
return np.diag(self.K(X))
def gradients_X(self, dL_dK, X, X2=None):
raise NotImplementedError
def update_gradients_full(self,dL_dK,X,X2=None):
slices = [index_to_slices(X[:,i]) for i in self.extra_dims]
if X2 is None:
self.parts[0].update_gradients_full(dL_dK, X, None)
for k, slices_k in zip(self.parts[1:], slices):
target = np.zeros(k.size)
def collate_grads(dL, X, X2, target):
k.update_gradients_full(dL,X,X2)
target += k.gradient
[[collate_grads(dL_dK[s,s], X[s], None, target) for s in slices_i] for slices_i in slices_k]
k.gradient[:] = target
else:
raise NotImplementedError
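# A minimal usage sketch (assumes GPy.kern.RBF and that this class is exposed as
# GPy.kern.Hierarchical, like the other kernels in this package; interactive-session
# style): one base kernel plus one replicate-level kernel, with an extra input column
# giving the replicate index.
import numpy as np
import GPy
t = np.random.rand(12, 1)
replicate = np.repeat([0, 1, 2], 4)[:, None]      # three replicates, contiguous blocks
X = np.hstack([t, replicate])
k = GPy.kern.Hierarchical([GPy.kern.RBF(1), GPy.kern.RBF(1)])
K = k.K(X)   # base RBF everywhere, replicate RBF added only within matching replicates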

GPy/kern/_src/kern.py (new file)
@@ -0,0 +1,280 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import sys
import numpy as np
from ...core.parameterization.parameterized import Parameterized
from kernel_slice_operations import KernCallsViaSlicerMeta
from ...util.caching import Cache_this
from GPy.core.parameterization.observable_array import ObsAr
class Kern(Parameterized):
#===========================================================================
# This adds input slice support. The rather ugly code for slicing can be
# found in kernel_slice_operations
__metaclass__ = KernCallsViaSlicerMeta
#===========================================================================
_support_GPU=False
def __init__(self, input_dim, active_dims, name, useGPU=False, *a, **kw):
"""
The base class for a kernel: a positive definite function
which forms a covariance function (kernel).
input_dim:
is the number of dimensions to work on. Make sure to give the
tight dimensionality of inputs.
You most likely want this to be the integer telling the number of
input dimensions of the kernel.
If this is not an integer (!) we will work on the whole input matrix X,
and not check whether dimensions match or not (!).
active_dims:
are the active dimensions of inputs X we will work on.
All kernels will get sliced Xes as inputs, if active_dims is not None.
Only positive integers are allowed in active_dims!
If active_dims is None, slicing is switched off and all X will be passed through as given.
:param int input_dim: the number of input dimensions to the function
:param array-like|None active_dims: list of indices of the dimensions this kernel works on, or None if no slicing
Do not instantiate.
"""
super(Kern, self).__init__(name=name, *a, **kw)
self.input_dim = int(input_dim)
if active_dims is None:
active_dims = np.arange(input_dim)
self.active_dims = np.atleast_1d(active_dims).astype(int)
assert self.active_dims.size == self.input_dim, "input_dim={} does not match len(active_dim)={}, active_dims={}".format(self.input_dim, self.active_dims.size, self.active_dims)
self._sliced_X = 0
self.useGPU = self._support_GPU and useGPU
self._return_psi2_n_flag = ObsAr(np.zeros(1)).astype(bool)
@property
def return_psi2_n(self):
"""
Flag whether to pass back psi2 as NxMxM or MxM, by summing out N.
"""
return self._return_psi2_n_flag[0]
@return_psi2_n.setter
def return_psi2_n(self, val):
def visit(self):
if isinstance(self, Kern):
self._return_psi2_n_flag[0]=val
self.traverse(visit)
@Cache_this(limit=20)
def _slice_X(self, X):
return X[:, self.active_dims]
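# A pure-numpy illustration of the slicing convention used by _slice_X above
# (illustrative only): the kernel only ever sees the active_dims columns of X.
import numpy as np
_Xfull = np.random.rand(20, 5)
_active_dims = np.array([0, 3])
assert _Xfull[:, _active_dims].shape == (20, 2)   # what K, Kdiag, etc. actually receive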
def K(self, X, X2):
"""
Compute the kernel function.
:param X: the first set of inputs to the kernel
:param X2: (optional) the second set of arguments to the kernel. If X2
is None, this is passed through to the 'part' object, which
handles this as X2 == X.
"""
raise NotImplementedError
def Kdiag(self, X):
raise NotImplementedError
def psi0(self, Z, variational_posterior):
raise NotImplementedError
def psi1(self, Z, variational_posterior):
raise NotImplementedError
def psi2(self, Z, variational_posterior):
raise NotImplementedError
def gradients_X(self, dL_dK, X, X2):
raise NotImplementedError
def gradients_X_diag(self, dL_dKdiag, X):
raise NotImplementedError
def update_gradients_diag(self, dL_dKdiag, X):
""" update the gradients of all parameters when using only the diagonal elements of the covariance matrix"""
raise NotImplementedError
def update_gradients_full(self, dL_dK, X, X2):
"""Set the gradients of all parameters when doing full (N) inference."""
raise NotImplementedError
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
"""
Set the gradients of all parameters when doing inference with
uncertain inputs, using expectations of the kernel.
The essential maths is
dL_d{theta_i} = dL_dpsi0 * dpsi0_d{theta_i} +
dL_dpsi1 * dpsi1_d{theta_i} +
dL_dpsi2 * dpsi2_d{theta_i}
"""
raise NotImplementedError
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
"""
Returns the derivative of the objective wrt Z, using the chain rule
through the expectation variables.
"""
raise NotImplementedError
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
"""
Compute the gradients wrt the parameters of the variational
distribution q(X), chain-ruling via the expectations of the kernel
"""
raise NotImplementedError
def plot(self, x=None, fignum=None, ax=None, title=None, plot_limits=None, resolution=None, **mpl_kwargs):
"""
plot this kernel.
:param x: the value to use for the other kernel argument (kernels are a function of two variables!)
:param fignum: figure number of the plot
:param ax: matplotlib axis to plot on
:param title: the matplotlib title
:param plot_limits: the range over which to plot the kernel
:param resolution: the resolution of the lines used in plotting
:param mpl_kwargs: valid keyword arguments to pass through to matplotlib (e.g. lw=7)
"""
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ...plotting.matplot_dep import kernel_plots
kernel_plots.plot(self, x, fignum, ax, title, plot_limits, resolution, **mpl_kwargs)
def plot_ARD(self, *args, **kw):
"""
See :class:`~GPy.plotting.matplot_dep.kernel_plots`
"""
import sys
assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
from ...plotting.matplot_dep import kernel_plots
return kernel_plots.plot_ARD(self,*args,**kw)
def input_sensitivity(self, summarize=True):
"""
Returns the sensitivity for each dimension of this kernel.
"""
return np.zeros(self.input_dim)
def __add__(self, other):
""" Overloading of the '+' operator. for more control, see self.add """
return self.add(other)
def __iadd__(self, other):
return self.add(other)
def add(self, other, name='add'):
"""
Add another kernel to this one.
:param other: the other kernel to be added
:type other: GPy.kern
"""
assert isinstance(other, Kern), "only kernels can be added to kernels..."
from add import Add
return Add([self, other], name=name)
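# Usage sketch for the '+' overload (assumes GPy.kern.RBF and GPy.kern.Bias, defined
# elsewhere in this package; interactive-session style, not part of this module):
import GPy
k_sum = GPy.kern.RBF(1) + GPy.kern.Bias(1)   # builds an Add kernel; its K is the sum of the parts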
def __mul__(self, other):
""" Here we overload the '*' operator. See self.prod for more information"""
return self.prod(other)
def __imul__(self, other):
""" Here we overload the '*' operator. See self.prod for more information"""
return self.prod(other)
def __pow__(self, other):
"""
Shortcut for tensor `prod`.
"""
assert np.all(self.active_dims == range(self.input_dim)), "Can only use kernels, which have their input_dims defined from 0"
assert np.all(other.active_dims == range(other.input_dim)), "Can only use kernels, which have their input_dims defined from 0"
other.active_dims += self.input_dim
return self.prod(other)
def prod(self, other, name='mul'):
"""
Multiply two kernels (either on the same space, or on the tensor
product of the input space).
:param other: the other kernel to be added
:type other: GPy.kern
:param tensor: whether or not to use the tensor space (default is false).
:type tensor: bool
"""
assert isinstance(other, Kern), "only kernels can be multiplied to kernels..."
from prod import Prod
#kernels = []
#if isinstance(self, Prod): kernels.extend(self.parameters)
#else: kernels.append(self)
#if isinstance(other, Prod): kernels.extend(other.parameters)
#else: kernels.append(other)
return Prod([self, other], name)
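# Usage sketch for '*' and '**' (same assumptions as the '+' sketch above): '*' multiplies
# kernels on the same input columns, while '**' first shifts the second kernel's
# active_dims so the product acts on the tensor (concatenated) input space.
import GPy
k_prod = GPy.kern.RBF(1) * GPy.kern.RBF(1)     # both factors act on column 0
k_tensor = GPy.kern.RBF(1) ** GPy.kern.RBF(1)  # second factor shifted to column 1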
def _check_input_dim(self, X):
assert X.shape[1] == self.input_dim, "{} did not specify active_dims and X has wrong shape: X_dim={}, whereas input_dim={}".format(self.name, X.shape[1], self.input_dim)
def _check_active_dims(self, X):
assert X.shape[1] >= len(self.active_dims), "At least {} dimensional X needed, X.shape={!s}".format(len(self.active_dims), X.shape)
class CombinationKernel(Kern):
"""
Abstract super class for combination kernels.
A combination kernel combines (a list of) kernels and works on those.
Examples are the HierarchicalKernel or Add and Prod kernels.
"""
def __init__(self, kernels, name, extra_dims=[]):
"""
Abstract super class for combination kernels.
A combination kernel combines (a list of) kernels and works on those.
Examples are the HierarchicalKernel or Add and Prod kernels.
:param list kernels: List of kernels to combine (can be only one element)
:param str name: name of the combination kernel
:param array-like extra_dims: if needed extra dimensions for the combination kernel to work on
"""
assert all([isinstance(k, Kern) for k in kernels])
extra_dims = np.array(extra_dims, dtype=int)
input_dim, active_dims = self.get_input_dim_active_dims(kernels, extra_dims)
# initialize the kernel with the full input_dim
super(CombinationKernel, self).__init__(input_dim, active_dims, name)
self.extra_dims = extra_dims
self.link_parameters(*kernels)
@property
def parts(self):
return self.parameters
def get_input_dim_active_dims(self, kernels, extra_dims = None):
#active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int))
#active_dims = np.array(np.concatenate((active_dims, extra_dims if extra_dims is not None else [])), dtype=int)
input_dim = reduce(max, (k.active_dims.max() for k in kernels)) + 1
if extra_dims is not None:
input_dim += extra_dims.size
active_dims = np.arange(input_dim)
return input_dim, active_dims
def input_sensitivity(self, summarize=True):
"""
If summarize is true, we want to get the summarized view of the sensitivities,
otherwise put everything into an array with shape (#kernels, input_dim)
in the order of appearance of the kernels in the parameterized object.
"""
raise NotImplementedError("Choose the kernel you want to get the sensitivity for. You need to override the default behaviour for getting the input sensitivity to be able to get the input sensitivity. For sum kernel it is the sum of all sensitivities, TODO: product kernel? Other kernels?, also TODO: shall we return all the sensitivities here in the combination kernel? So we can combine them however we want? This could lead to just plot all the sensitivities here...")
def _check_active_dims(self, X):
return
def _check_input_dim(self, X):
# As combination kernels cannot always know what their inner kernels have as input dims, the check will be done inside them, respectively
return

@@ -0,0 +1,143 @@
'''
Created on 11 Mar 2014
@author: maxz
'''
from ...core.parameterization.parameterized import ParametersChangedMeta
import numpy as np
from functools import wraps
def put_clean(dct, name, func):
if name in dct:
dct['_clean_{}'.format(name)] = dct[name]
dct[name] = func(dct[name])
class KernCallsViaSlicerMeta(ParametersChangedMeta):
def __new__(cls, name, bases, dct):
put_clean(dct, 'K', _slice_K)
put_clean(dct, 'Kdiag', _slice_Kdiag)
put_clean(dct, 'update_gradients_full', _slice_update_gradients_full)
put_clean(dct, 'update_gradients_diag', _slice_update_gradients_diag)
put_clean(dct, 'gradients_X', _slice_gradients_X)
put_clean(dct, 'gradients_X_diag', _slice_gradients_X_diag)
put_clean(dct, 'psi0', _slice_psi)
put_clean(dct, 'psi1', _slice_psi)
put_clean(dct, 'psi2', _slice_psi)
put_clean(dct, 'update_gradients_expectations', _slice_update_gradients_expectations)
put_clean(dct, 'gradients_Z_expectations', _slice_gradients_Z_expectations)
put_clean(dct, 'gradients_qX_expectations', _slice_gradients_qX_expectations)
return super(KernCallsViaSlicerMeta, cls).__new__(cls, name, bases, dct)
class _Slice_wrap(object):
def __init__(self, k, X, X2=None):
self.k = k
self.shape = X.shape
assert X.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X.shape={!s}".format(X.shape)
if X2 is not None:
assert X2.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X2.shape={!s}".format(X2.shape)
if (self.k.active_dims is not None) and (self.k._sliced_X == 0):
self.k._check_active_dims(X)
self.X = self.k._slice_X(X)
self.X2 = self.k._slice_X(X2) if X2 is not None else X2
self.ret = True
else:
self.k._check_input_dim(X)
self.X = X
self.X2 = X2
self.ret = False
def __enter__(self):
self.k._sliced_X += 1
return self
def __exit__(self, *a):
self.k._sliced_X -= 1
def handle_return_array(self, return_val):
if self.ret:
ret = np.zeros(self.shape)
ret[:, self.k.active_dims] = return_val
return ret
return return_val
def _slice_K(f):
@wraps(f)
def wrap(self, X, X2 = None, *a, **kw):
with _Slice_wrap(self, X, X2) as s:
ret = f(self, s.X, s.X2, *a, **kw)
return ret
return wrap
def _slice_Kdiag(f):
@wraps(f)
def wrap(self, X, *a, **kw):
with _Slice_wrap(self, X, None) as s:
ret = f(self, s.X, *a, **kw)
return ret
return wrap
def _slice_update_gradients_full(f):
@wraps(f)
def wrap(self, dL_dK, X, X2=None):
with _Slice_wrap(self, X, X2) as s:
ret = f(self, dL_dK, s.X, s.X2)
return ret
return wrap
def _slice_update_gradients_diag(f):
@wraps(f)
def wrap(self, dL_dKdiag, X):
with _Slice_wrap(self, X, None) as s:
ret = f(self, dL_dKdiag, s.X)
return ret
return wrap
def _slice_gradients_X(f):
@wraps(f)
def wrap(self, dL_dK, X, X2=None):
with _Slice_wrap(self, X, X2) as s:
ret = s.handle_return_array(f(self, dL_dK, s.X, s.X2))
return ret
return wrap
def _slice_gradients_X_diag(f):
@wraps(f)
def wrap(self, dL_dKdiag, X):
with _Slice_wrap(self, X, None) as s:
ret = s.handle_return_array(f(self, dL_dKdiag, s.X))
return ret
return wrap
def _slice_psi(f):
@wraps(f)
def wrap(self, Z, variational_posterior):
with _Slice_wrap(self, Z, variational_posterior) as s:
ret = f(self, s.X, s.X2)
return ret
return wrap
def _slice_update_gradients_expectations(f):
@wraps(f)
def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
with _Slice_wrap(self, Z, variational_posterior) as s:
ret = f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2)
return ret
return wrap
def _slice_gradients_Z_expectations(f):
@wraps(f)
def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
with _Slice_wrap(self, Z, variational_posterior) as s:
ret = s.handle_return_array(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2))
return ret
return wrap
def _slice_gradients_qX_expectations(f):
@wraps(f)
def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
with _Slice_wrap(self, variational_posterior, Z) as s:
ret = list(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X2, s.X))
r2 = ret[:2]
ret[0] = s.handle_return_array(r2[0])
ret[1] = s.handle_return_array(r2[1])
del r2
return ret
return wrap
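# A stripped-down sketch of what these wrappers do (illustrative only): reduce the
# inputs to the kernel's active_dims before calling through, and, for gradients with
# respect to X, scatter the result back into a full-width array via handle_return_array.
def _sketch_slice_K(K_func, active_dims):
    def wrapped(X, X2=None):
        Xs = X[:, active_dims]
        X2s = X2[:, active_dims] if X2 is not None else None
        return K_func(Xs, X2s)
    return wrapped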

GPy/kern/_src/linear.py (new file)
@@ -0,0 +1,177 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from kern import Kern
from ...util.linalg import tdot
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
from ...util.caching import Cache_this
from ...util.config import *
from .psi_comp import PSICOMP_Linear
class Linear(Kern):
"""
Linear kernel
.. math::
k(x,y) = \sum_{i=1}^{\text{input\_dim}} \sigma^2_i x_iy_i
:param input_dim: the number of input dimensions
:type input_dim: int
:param variances: the vector of variances :math:`\sigma^2_i`
:type variances: array or list of the appropriate size (or float if there
is only one variance parameter)
:param ARD: Auto Relevance Determination. If False, the kernel has only one
variance parameter \sigma^2, otherwise there is one variance
parameter per dimension.
:type ARD: Boolean
:rtype: kernel object
"""
def __init__(self, input_dim, variances=None, ARD=False, active_dims=None, name='linear'):
super(Linear, self).__init__(input_dim, active_dims, name)
self.ARD = ARD
if not ARD:
if variances is not None:
variances = np.asarray(variances)
assert variances.size == 1, "Only one variance needed for non-ARD kernel"
else:
variances = np.ones(1)
else:
if variances is not None:
variances = np.asarray(variances)
assert variances.size == self.input_dim, "bad number of variances, need one ARD variance per input_dim"
else:
variances = np.ones(self.input_dim)
self.variances = Param('variances', variances, Logexp())
self.link_parameter(self.variances)
self.psicomp = PSICOMP_Linear()
@Cache_this(limit=2)
def K(self, X, X2=None):
if self.ARD:
if X2 is None:
return tdot(X*np.sqrt(self.variances))
else:
rv = np.sqrt(self.variances)
return np.dot(X*rv, (X2*rv).T)
else:
return self._dot_product(X, X2) * self.variances
@Cache_this(limit=1, ignore_args=(0,))
def _dot_product(self, X, X2=None):
if X2 is None:
return tdot(X)
else:
return np.dot(X, X2.T)
def Kdiag(self, X):
return np.sum(self.variances * np.square(X), -1)
def update_gradients_full(self, dL_dK, X, X2=None):
if self.ARD:
if X2 is None:
#self.variances.gradient = np.array([np.sum(dL_dK * tdot(X[:, i:i + 1])) for i in range(self.input_dim)])
self.variances.gradient = np.einsum('ij,iq,jq->q', dL_dK, X, X)
else:
#product = X[:, None, :] * X2[None, :, :]
#self.variances.gradient = (dL_dK[:, :, None] * product).sum(0).sum(0)
self.variances.gradient = np.einsum('ij,iq,jq->q', dL_dK, X, X2)
else:
self.variances.gradient = np.sum(self._dot_product(X, X2) * dL_dK)
def update_gradients_diag(self, dL_dKdiag, X):
tmp = dL_dKdiag[:, None] * X ** 2
if self.ARD:
self.variances.gradient = tmp.sum(0)
else:
self.variances.gradient = np.atleast_1d(tmp.sum())
def gradients_X(self, dL_dK, X, X2=None):
if X2 is None:
return np.einsum('jq,q,ij->iq', X, 2*self.variances, dL_dK)
else:
#return (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
return np.einsum('jq,q,ij->iq', X2, self.variances, dL_dK)
def gradients_X_diag(self, dL_dKdiag, X):
return 2.*self.variances*dL_dKdiag[:,None]*X
def input_sensitivity(self, summarize=True):
return np.ones(self.input_dim) * self.variances
#---------------------------------------#
# PSI statistics #
#---------------------------------------#
def psi0(self, Z, variational_posterior):
return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[0]
def psi1(self, Z, variational_posterior):
return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[1]
def psi2(self, Z, variational_posterior):
return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[2]
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
dL_dvar = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[0]
if self.ARD:
self.variances.gradient = dL_dvar
else:
self.variances.gradient = dL_dvar.sum()
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[1]
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[2:]
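# A pure-numpy check of the formula in the class docstring, k(x,y) = sum_i sigma_i^2 x_i y_i
# (illustrative only; mirrors the ARD branch of K above):
_X, _X2 = np.random.randn(5, 3), np.random.randn(4, 3)
_variances = np.array([0.5, 1.0, 2.0])
_Klin = np.dot(_X * _variances, _X2.T)
assert np.allclose(_Klin, np.einsum('iq,q,jq->ij', _X, _variances, _X2))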
class LinearFull(Kern):
def __init__(self, input_dim, rank, W=None, kappa=None, active_dims=None, name='linear_full'):
super(LinearFull, self).__init__(input_dim, active_dims, name)
if W is None:
W = np.ones((input_dim, rank))
if kappa is None:
kappa = np.ones(input_dim)
assert W.shape == (input_dim, rank)
assert kappa.shape == (input_dim,)
self.W = Param('W', W)
self.kappa = Param('kappa', kappa, Logexp())
self.link_parameters(self.W, self.kappa)
def K(self, X, X2=None):
P = np.dot(self.W, self.W.T) + np.diag(self.kappa)
return np.einsum('ij,jk,lk->il', X, P, X if X2 is None else X2)
def update_gradients_full(self, dL_dK, X, X2=None):
self.kappa.gradient = np.einsum('ij,ik,kj->j', X, dL_dK, X if X2 is None else X2)
self.W.gradient = np.einsum('ij,kl,ik,lm->jm', X, X if X2 is None else X2, dL_dK, self.W)
self.W.gradient += np.einsum('ij,kl,ik,jm->lm', X, X if X2 is None else X2, dL_dK, self.W)
def Kdiag(self, X):
P = np.dot(self.W, self.W.T) + np.diag(self.kappa)
return np.einsum('ij,jk,ik->i', X, P, X)
def update_gradients_diag(self, dL_dKdiag, X):
self.kappa.gradient = np.einsum('ij,i->j', np.square(X), dL_dKdiag)
self.W.gradient = 2.*np.einsum('ij,ik,jl,i->kl', X, X, self.W, dL_dKdiag)
def gradients_X(self, dL_dK, X, X2=None):
P = np.dot(self.W, self.W.T) + np.diag(self.kappa)
if X2 is None:
return 2.*np.einsum('ij,jk,kl->il', dL_dK, X, P)
else:
return np.einsum('ij,jk,kl->il', dL_dK, X2, P)
def gradients_X_diag(self, dL_dKdiag, X):
P = np.dot(self.W, self.W.T) + np.diag(self.kappa)
return 2.*np.einsum('jk,i,ij->ik', P, dL_dKdiag, X)
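# A pure-numpy check of the LinearFull covariance K = X (W W^T + diag(kappa)) X2^T and its
# diagonal (illustrative only; mirrors K and Kdiag above):
_Xf = np.random.randn(6, 3)
_Wf, _kappaf = np.random.randn(3, 2), np.ones(3)
_Pf = np.dot(_Wf, _Wf.T) + np.diag(_kappaf)
_Kf = np.dot(np.dot(_Xf, _Pf), _Xf.T)
assert np.allclose(np.diag(_Kf), np.einsum('ij,jk,ik->i', _Xf, _Pf, _Xf))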

GPy/kern/_src/mlp.py (new file)
@@ -0,0 +1,129 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
import numpy as np
four_over_tau = 2./np.pi
class MLP(Kern):
"""
Multi layer perceptron kernel (also known as arc sine kernel or neural network kernel)
.. math::
k(x,y) = \\sigma^{2}\\frac{2}{\\pi } \\text{asin} \\left ( \\frac{ \\sigma_w^2 x^\\top y+\\sigma_b^2}{\\sqrt{\\sigma_w^2x^\\top x + \\sigma_b^2 + 1}\\sqrt{\\sigma_w^2 y^\\top y + \\sigma_b^2 +1}} \\right )
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance :math:`\sigma^2`
:type variance: float
:param weight_variance: the vector of the variances of the prior over input weights in the neural network :math:`\sigma^2_w`
:type weight_variance: array or list of the appropriate size (or float if there is only one weight variance parameter)
:param bias_variance: the variance of the prior over bias parameters :math:`\sigma^2_b`
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (i.e. one weight variance parameter \sigma^2_w), otherwise there is one weight variance parameter per dimension.
:type ARD: Boolean
:rtype: Kernpart object
"""
def __init__(self, input_dim, variance=1., weight_variance=1., bias_variance=100., active_dims=None, name='mlp'):
super(MLP, self).__init__(input_dim, active_dims, name)
self.variance = Param('variance', variance, Logexp())
self.weight_variance = Param('weight_variance', weight_variance, Logexp())
self.bias_variance = Param('bias_variance', bias_variance, Logexp())
self.link_parameters(self.variance, self.weight_variance, self.bias_variance)
def K(self, X, X2=None):
self._K_computations(X, X2)
return self.variance*self._K_dvar
def Kdiag(self, X):
"""Compute the diagonal of the covariance matrix for X."""
self._K_diag_computations(X)
return self.variance*self._K_diag_dvar
def update_gradients_full(self, dL_dK, X, X2=None):
"""Derivative of the covariance with respect to the parameters."""
self._K_computations(X, X2)
self.variance.gradient = np.sum(self._K_dvar*dL_dK)
denom3 = self._K_denom**3
base = four_over_tau*self.variance/np.sqrt(1-self._K_asin_arg*self._K_asin_arg)
base_cov_grad = base*dL_dK
if X2 is None:
vec = np.diag(self._K_inner_prod)
self.weight_variance.gradient = ((self._K_inner_prod/self._K_denom
-.5*self._K_numer/denom3
*(np.outer((self.weight_variance*vec+self.bias_variance+1.), vec)
+np.outer(vec,(self.weight_variance*vec+self.bias_variance+1.))))*base_cov_grad).sum()
self.bias_variance.gradient = ((1./self._K_denom
-.5*self._K_numer/denom3
*((vec[None, :]+vec[:, None])*self.weight_variance
+2.*self.bias_variance + 2.))*base_cov_grad).sum()
else:
vec1 = (X*X).sum(1)
vec2 = (X2*X2).sum(1)
self.weight_variance.gradient = ((self._K_inner_prod/self._K_denom
-.5*self._K_numer/denom3
*(np.outer((self.weight_variance*vec1+self.bias_variance+1.), vec2) + np.outer(vec1, self.weight_variance*vec2 + self.bias_variance+1.)))*base_cov_grad).sum()
self.bias_variance.gradient = ((1./self._K_denom
-.5*self._K_numer/denom3
*((vec1[:, None]+vec2[None, :])*self.weight_variance
+ 2*self.bias_variance + 2.))*base_cov_grad).sum()
def update_gradients_diag(self, dL_dKdiag, X):
raise NotImplementedError, "TODO"
def gradients_X(self, dL_dK, X, X2):
"""Derivative of the covariance matrix with respect to X"""
self._K_computations(X, X2)
arg = self._K_asin_arg
numer = self._K_numer
denom = self._K_denom
denom3 = denom*denom*denom
if X2 is not None:
vec2 = (X2*X2).sum(1)*self.weight_variance+self.bias_variance + 1.
return four_over_tau*self.weight_variance*self.variance*((X2[None, :, :]/denom[:, :, None] - vec2[None, :, None]*X[:, None, :]*(numer/denom3)[:, :, None])*(dL_dK/np.sqrt(1-arg*arg))[:, :, None]).sum(1)
else:
vec = (X*X).sum(1)*self.weight_variance+self.bias_variance + 1.
return 2*four_over_tau*self.weight_variance*self.variance*((X[None, :, :]/denom[:, :, None] - vec[None, :, None]*X[:, None, :]*(numer/denom3)[:, :, None])*(dL_dK/np.sqrt(1-arg*arg))[:, :, None]).sum(1)
def gradients_X_diag(self, dL_dKdiag, X):
"""Gradient of diagonal of covariance with respect to X"""
self._K_diag_computations(X)
arg = self._K_diag_asin_arg
denom = self._K_diag_denom
#numer = self._K_diag_numer
return four_over_tau*2.*self.weight_variance*self.variance*X*(1./denom*(1. - arg)*dL_dKdiag/(np.sqrt(1-arg*arg)))[:, None]
def _K_computations(self, X, X2):
"""Pre-computations for the covariance matrix (used for computing the covariance and its gradients."""
if X2 is None:
self._K_inner_prod = np.dot(X,X.T)
self._K_numer = self._K_inner_prod*self.weight_variance + self.bias_variance
vec = np.diag(self._K_numer) + 1.
self._K_denom = np.sqrt(np.outer(vec,vec))
else:
self._K_inner_prod = np.dot(X,X2.T)
self._K_numer = self._K_inner_prod*self.weight_variance + self.bias_variance
vec1 = (X*X).sum(1)*self.weight_variance + self.bias_variance + 1.
vec2 = (X2*X2).sum(1)*self.weight_variance + self.bias_variance + 1.
self._K_denom = np.sqrt(np.outer(vec1,vec2))
self._K_asin_arg = self._K_numer/self._K_denom
self._K_dvar = four_over_tau*np.arcsin(self._K_asin_arg)
def _K_diag_computations(self, X):
"""Pre-computations concerning the diagonal terms (used for computation of diagonal and its gradients)."""
self._K_diag_numer = (X*X).sum(1)*self.weight_variance + self.bias_variance
self._K_diag_denom = self._K_diag_numer+1.
self._K_diag_asin_arg = self._K_diag_numer/self._K_diag_denom
self._K_diag_dvar = four_over_tau*np.arcsin(self._K_diag_asin_arg)
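# A pure-numpy sketch of the arc-sine formula in the class docstring, for a single pair
# of points (illustrative only; mirrors _K_computations above):
def _mlp_k(x, y, variance=1., weight_variance=1., bias_variance=100.):
    num = weight_variance*np.dot(x, y) + bias_variance
    den = np.sqrt((weight_variance*np.dot(x, x) + bias_variance + 1.)
                  * (weight_variance*np.dot(y, y) + bias_variance + 1.))
    return variance*four_over_tau*np.arcsin(num/den)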

@@ -2,12 +2,288 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from kern import Kern
from ...util.linalg import mdot
from ...util.decorators import silence_errors
from ...core.parameterization.param import Param
from ...core.parameterization.transformations import Logexp
class Periodic(Kern):
def __init__(self, input_dim, variance, lengthscale, period, n_freq, lower, upper, active_dims, name):
"""
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance of the Matern kernel
:type variance: float
:param lengthscale: the lengthscale of the Matern kernel
:type lengthscale: np.ndarray of size (input_dim,)
:param period: the period
:type period: float
:param n_freq: the number of frequencies considered for the periodic subspace
:type n_freq: int
:rtype: kernel object
"""
assert input_dim==1, "Periodic kernels are only defined for input_dim=1"
super(Periodic, self).__init__(input_dim, active_dims, name)
self.input_dim = input_dim
self.lower,self.upper = lower, upper
self.n_freq = n_freq
self.n_basis = 2*n_freq
self.variance = Param('variance', np.float64(variance), Logexp())
self.lengthscale = Param('lengthscale', np.float64(lengthscale), Logexp())
self.period = Param('period', np.float64(period), Logexp())
self.link_parameters(self.variance, self.lengthscale, self.period)
def _cos(self, alpha, omega, phase):
def f(x):
return alpha*np.cos(omega*x + phase)
return f
@silence_errors
def _cos_factorization(self, alpha, omega, phase):
r1 = np.sum(alpha*np.cos(phase),axis=1)[:,None]
r2 = np.sum(alpha*np.sin(phase),axis=1)[:,None]
r = np.sqrt(r1**2 + r2**2)
psi = np.where(r1 != 0, (np.arctan(r2/r1) + (r1<0.)*np.pi),np.arcsin(r2))
return r,omega[:,0:1], psi
@silence_errors
def _int_computation(self,r1,omega1,phi1,r2,omega2,phi2):
Gint1 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + 1./(omega1-omega2.T)*( np.sin((omega1-omega2.T)*self.upper+phi1-phi2.T) - np.sin((omega1-omega2.T)*self.lower+phi1-phi2.T) )
Gint2 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + np.cos(phi1-phi2.T)*(self.upper-self.lower)
Gint = np.dot(r1,r2.T)/2 * np.where(np.isnan(Gint1),Gint2,Gint1)
return Gint
def K(self, X, X2=None):
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
if X2 is None:
FX2 = FX
else:
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
return mdot(FX,self.Gi,FX2.T)
def Kdiag(self,X):
return np.diag(self.K(X))
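# A shape-level sketch of the decomposition used by K above (illustrative only): the
# covariance is a finite cosine-basis expansion, K(X, X2) = F(X) G^{-1} F(X2)^T, with F
# the (n_data x n_basis) basis matrix and G the Gram matrix built in the subclasses.
_FX = np.cos(np.random.rand(8, 1)*np.arange(1, 7)[None, :])   # stand-in basis matrix, 6 basis functions
_G = np.eye(6)                                                # stand-in Gram matrix
_Ksketch = mdot(_FX, np.linalg.inv(_G), _FX.T)
assert _Ksketch.shape == (8, 8)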
class PeriodicExponential(Periodic):
"""
Kernel of the periodic subspace (up to a given frequency) of a exponential
(Matern 1/2) RKHS.
Only defined for input_dim=1.
"""
def __init__(self, input_dim=1, variance=1., lengthscale=1., period=2.*np.pi, n_freq=10, lower=0., upper=4*np.pi, active_dims=None, name='periodic_exponential'):
super(PeriodicExponential, self).__init__(input_dim, variance, lengthscale, period, n_freq, lower, upper, active_dims, name)
def parameters_changed(self):
self.a = [1./self.lengthscale, 1.]
self.b = [1]
self.basis_alpha = np.ones((self.n_basis,))
self.basis_omega = (2*np.pi*np.arange(1,self.n_freq+1)/self.period).repeat(2)
self.basis_phi = np.zeros(self.n_freq * 2)
self.basis_phi[::2] = -np.pi/2
self.G = self.Gram_matrix()
self.Gi = np.linalg.inv(self.G)
def Gram_matrix(self):
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega))
Lo = np.column_stack((self.basis_omega,self.basis_omega))
Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2))
r,omega,phi = self._cos_factorization(La,Lo,Lp)
Gint = self._int_computation( r,omega,phi, r,omega,phi)
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
return(self.lengthscale/(2*self.variance) * Gint + 1./self.variance*np.dot(Flower,Flower.T))
@silence_errors
def update_gradients_full(self, dL_dK, X, X2=None):
"""derivative of the covariance matrix with respect to the parameters (shape is N x num_inducing x num_params)"""
if X2 is None: X2 = X
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega))
Lo = np.column_stack((self.basis_omega,self.basis_omega))
Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2))
r,omega,phi = self._cos_factorization(La,Lo,Lp)
Gint = self._int_computation( r,omega,phi, r,omega,phi)
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
#dK_dvar
dK_dvar = 1./self.variance*mdot(FX,self.Gi,FX2.T)
#dK_dlen
da_dlen = [-1./self.lengthscale**2,0.]
dLa_dlen = np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)),da_dlen[1]*self.basis_omega))
r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
dGint_dlen = dGint_dlen + dGint_dlen.T
dG_dlen = 1./2*Gint + self.lengthscale/2*dGint_dlen
dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX2.T)
#dK_dper
dFX_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
dFX2_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X2,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X2)
dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period))
dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi))
r1,omega1,phi1 = self._cos_factorization(dLa_dper,Lo,dLp_dper)
IPPprim1 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
# SIMPLIFY!!! IPPprim1 = (self.upper - self.lower)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
IPPprim2 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + self.upper*np.cos(phi-phi1.T))
IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + self.lower*np.cos(phi-phi1.T))
IPPprim = np.where(np.logical_or(np.isnan(IPPprim1), np.isinf(IPPprim1)), IPPprim2, IPPprim1)
IPPint1 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
IPPint2 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./2*self.upper**2*np.cos(phi-phi1.T)
IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./2*self.lower**2*np.cos(phi-phi1.T)
#IPPint2[0,0] = (self.upper**2 - self.lower**2)*np.cos(phi[0,0])*np.cos(phi1[0,0])
IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period))
dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2))
r2,omega2,phi2 = dLa_dper2.T,Lo[:,0:1],dLp_dper2.T
dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) + self._int_computation(r2,omega2,phi2, r,omega,phi)
dGint_dper = dGint_dper + dGint_dper.T
dFlower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
dG_dper = 1./self.variance*(self.lengthscale/2*dGint_dper + self.b[0]*(np.dot(dFlower_dper,Flower.T)+np.dot(Flower,dFlower_dper.T)))
dK_dper = mdot(dFX_dper,self.Gi,FX2.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX2.T) + mdot(FX,self.Gi,dFX2_dper.T)
self.variance.gradient = np.sum(dK_dvar*dL_dK)
self.lengthscale.gradient = np.sum(dK_dlen*dL_dK)
self.period.gradient = np.sum(dK_dper*dL_dK)
class PeriodicMatern32(Periodic):
"""
Kernel of the periodic subspace (up to a given frequency) of a Matern 3/2 RKHS. Only defined for input_dim=1.
:param input_dim: the number of input dimensions
:type input_dim: int
:param variance: the variance of the Matern kernel
:type variance: float
:param lengthscale: the lengthscale of the Matern kernel
:type lengthscale: np.ndarray of size (input_dim,)
:param period: the period
:type period: float
:param n_freq: the number of frequencies considered for the periodic subspace
:type n_freq: int
:rtype: kernel object
"""
def __init__(self, input_dim=1, variance=1., lengthscale=1., period=2.*np.pi, n_freq=10, lower=0., upper=4*np.pi, active_dims=None, name='periodic_Matern32'):
super(PeriodicMatern32, self).__init__(input_dim, variance, lengthscale, period, n_freq, lower, upper, active_dims, name)
def parameters_changed(self):
self.a = [3./self.lengthscale**2, 2*np.sqrt(3)/self.lengthscale, 1.]
self.b = [1,self.lengthscale**2/3]
self.basis_alpha = np.ones((self.n_basis,))
self.basis_omega = (2*np.pi*np.arange(1,self.n_freq+1)/self.period).repeat(2)
self.basis_phi = np.zeros(self.n_freq * 2)
self.basis_phi[::2] = -np.pi/2
self.G = self.Gram_matrix()
self.Gi = np.linalg.inv(self.G)
def Gram_matrix(self):
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega,self.a[2]*self.basis_omega**2))
Lo = np.column_stack((self.basis_omega,self.basis_omega,self.basis_omega))
Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2,self.basis_phi+np.pi))
r,omega,phi = self._cos_factorization(La,Lo,Lp)
Gint = self._int_computation( r,omega,phi, r,omega,phi)
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
return(self.lengthscale**3/(12*np.sqrt(3)*self.variance) * Gint + 1./self.variance*np.dot(Flower,Flower.T) + self.lengthscale**2/(3.*self.variance)*np.dot(F1lower,F1lower.T))
@silence_errors
def update_gradients_full(self,dL_dK,X,X2):
"""derivative of the covariance matrix with respect to the parameters (shape is num_data x num_inducing x num_params)"""
if X2 is None: X2 = X
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)),self.a[1]*self.basis_omega,self.a[2]*self.basis_omega**2))
Lo = np.column_stack((self.basis_omega,self.basis_omega,self.basis_omega))
Lp = np.column_stack((self.basis_phi,self.basis_phi+np.pi/2,self.basis_phi+np.pi))
r,omega,phi = self._cos_factorization(La,Lo,Lp)
Gint = self._int_computation( r,omega,phi, r,omega,phi)
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
#dK_dvar
dK_dvar = 1./self.variance*mdot(FX,self.Gi,FX2.T)
#dK_dlen
da_dlen = [-6/self.lengthscale**3,-2*np.sqrt(3)/self.lengthscale**2,0.]
db_dlen = [0.,2*self.lengthscale/3.]
dLa_dlen = np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)),da_dlen[1]*self.basis_omega,da_dlen[2]*self.basis_omega**2))
r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
dGint_dlen = dGint_dlen + dGint_dlen.T
dG_dlen = self.lengthscale**2/(4*np.sqrt(3))*Gint + self.lengthscale**3/(12*np.sqrt(3))*dGint_dlen + db_dlen[0]*np.dot(Flower,Flower.T) + db_dlen[1]*np.dot(F1lower,F1lower.T)
dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX2.T)
#dK_dper
dFX_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
dFX2_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X2,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X2)
dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period, -self.a[2]*self.basis_omega**3/self.period))
dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi,self.basis_phi+np.pi*3/2))
r1,omega1,phi1 = self._cos_factorization(dLa_dper,Lo,dLp_dper)
IPPprim1 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
IPPprim2 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + self.upper*np.cos(phi-phi1.T))
IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + self.lower*np.cos(phi-phi1.T))
IPPprim = np.where(np.isnan(IPPprim1),IPPprim2,IPPprim1)
IPPint1 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
IPPint2 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./2*self.upper**2*np.cos(phi-phi1.T)
IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./2*self.lower**2*np.cos(phi-phi1.T)
IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period, -2*self.a[2]*self.basis_omega**2/self.period))
dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi))
r2,omega2,phi2 = self._cos_factorization(dLa_dper2,Lo[:,0:2],dLp_dper2)
dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) + self._int_computation(r2,omega2,phi2, r,omega,phi)
dGint_dper = dGint_dper + dGint_dper.T
dFlower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
dF1lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower)+self._cos(-self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
dG_dper = 1./self.variance*(self.lengthscale**3/(12*np.sqrt(3))*dGint_dper + self.b[0]*(np.dot(dFlower_dper,Flower.T)+np.dot(Flower,dFlower_dper.T)) + self.b[1]*(np.dot(dF1lower_dper,F1lower.T)+np.dot(F1lower,dF1lower_dper.T)))
dK_dper = mdot(dFX_dper,self.Gi,FX2.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX2.T) + mdot(FX,self.Gi,dFX2_dper.T)
self.variance.gradient = np.sum(dK_dvar*dL_dK)
self.lengthscale.gradient = np.sum(dK_dlen*dL_dK)
self.period.gradient = np.sum(dK_dper*dL_dK)
class PeriodicMatern52(Periodic):
""" """
Kernel of the periodic subspace (up to a given frequency) of a Matern 5/2 RKHS. Only defined for input_dim=1. Kernel of the periodic subspace (up to a given frequency) of a Matern 5/2 RKHS. Only defined for input_dim=1.
@ -25,67 +301,21 @@ class PeriodicMatern52(Kernpart):
""" """
def __init__(self,input_dim=1,variance=1.,lengthscale=None,period=2*np.pi,n_freq=10,lower=0.,upper=4*np.pi): def __init__(self, input_dim=1, variance=1., lengthscale=1., period=2.*np.pi, n_freq=10, lower=0., upper=4*np.pi, active_dims=None, name='periodic_Matern52'):
assert input_dim==1, "Periodic kernels are only defined for input_dim=1" super(PeriodicMatern52, self).__init__(input_dim, variance, lengthscale, period, n_freq, lower, upper, active_dims, name)
self.name = 'periodic_Mat52'
self.input_dim = input_dim
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == 1, "Wrong size: only one lengthscale needed"
else:
lengthscale = np.ones(1)
self.lower,self.upper = lower, upper
self.num_params = 3
self.n_freq = n_freq
self.n_basis = 2*n_freq
self._set_params(np.hstack((variance,lengthscale,period)))
def _cos(self,alpha,omega,phase):
def f(x):
return alpha*np.cos(omega*x+phase)
return f
@silence_errors
def _cos_factorization(self,alpha,omega,phase):
r1 = np.sum(alpha*np.cos(phase),axis=1)[:,None]
r2 = np.sum(alpha*np.sin(phase),axis=1)[:,None]
r = np.sqrt(r1**2 + r2**2)
psi = np.where(r1 != 0, (np.arctan(r2/r1) + (r1<0.)*np.pi),np.arcsin(r2))
return r,omega[:,0:1], psi
@silence_errors
def _int_computation(self,r1,omega1,phi1,r2,omega2,phi2):
Gint1 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + 1./(omega1-omega2.T)*( np.sin((omega1-omega2.T)*self.upper+phi1-phi2.T) - np.sin((omega1-omega2.T)*self.lower+phi1-phi2.T) )
Gint2 = 1./(omega1+omega2.T)*( np.sin((omega1+omega2.T)*self.upper+phi1+phi2.T) - np.sin((omega1+omega2.T)*self.lower+phi1+phi2.T)) + np.cos(phi1-phi2.T)*(self.upper-self.lower)
#Gint2[0,0] = 2.*(self.upper-self.lower)*np.cos(phi1[0,0])*np.cos(phi2[0,0])
Gint = np.dot(r1,r2.T)/2 * np.where(np.isnan(Gint1),Gint2,Gint1)
return Gint
def _get_params(self):
"""return the value of the parameters."""
return np.hstack((self.variance,self.lengthscale,self.period))
def _set_params(self,x):
"""set the value of the parameters."""
assert x.size==3
self.variance = x[0]
self.lengthscale = x[1]
self.period = x[2]
def parameters_changed(self):
self.a = [5*np.sqrt(5)/self.lengthscale**3, 15./self.lengthscale**2,3*np.sqrt(5)/self.lengthscale, 1.]
self.b = [9./8, 9*self.lengthscale**4/200., 3*self.lengthscale**2/5., 3*self.lengthscale**2/(5*8.), 3*self.lengthscale**2/(5*8.)]
self.basis_alpha = np.ones((2*self.n_freq,))
self.basis_omega = (2*np.pi*np.arange(1,self.n_freq+1)/self.period).repeat(2)
self.basis_phi = np.zeros(self.n_freq * 2)
self.basis_phi[::2] = -np.pi/2
self.G = self.Gram_matrix()
self.Gi = np.linalg.inv(self.G)
def _get_param_names(self):
"""return parameter names."""
return ['variance','lengthscale','period']
def Gram_matrix(self):
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)), self.a[1]*self.basis_omega, self.a[2]*self.basis_omega**2, self.a[3]*self.basis_omega**3))
Lo = np.column_stack((self.basis_omega, self.basis_omega, self.basis_omega, self.basis_omega))
@@ -99,23 +329,8 @@ class PeriodicMatern52(Kernpart):
lower_terms = self.b[0]*np.dot(Flower,Flower.T) + self.b[1]*np.dot(F2lower,F2lower.T) + self.b[2]*np.dot(F1lower,F1lower.T) + self.b[3]*np.dot(F2lower,Flower.T) + self.b[4]*np.dot(Flower,F2lower.T)
return(3*self.lengthscale**5/(400*np.sqrt(5)*self.variance) * Gint + 1./self.variance*lower_terms)
def K(self,X,X2,target):
"""Compute the covariance matrix between X and X2."""
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
if X2 is None:
FX2 = FX
else:
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
np.add(mdot(FX,self.Gi,FX2.T), target,target)
def Kdiag(self,X,target):
"""Compute the diagonal of the covariance matrix associated to X."""
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
np.add(target,np.diag(mdot(FX,self.Gi,FX.T)),target)
@silence_errors
def update_gradients_full(self, dL_dK, X, X2=None):
"""derivative of the covariance matrix with respect to the parameters (shape is num_data x num_inducing x num_params)"""
if X2 is None: X2 = X
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
FX2 = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X2)
@ -156,14 +371,12 @@ class PeriodicMatern52(Kernpart):
IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2)) IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
IPPprim2 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + self.upper*np.cos(phi-phi1.T)) IPPprim2 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + self.upper*np.cos(phi-phi1.T))
IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + self.lower*np.cos(phi-phi1.T)) IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + self.lower*np.cos(phi-phi1.T))
#IPPprim2[0,0] = 2*(self.upper**2 - self.lower**2)*np.cos(phi[0,0])*np.cos(phi1[0,0])
IPPprim = np.where(np.isnan(IPPprim1),IPPprim2,IPPprim1)
IPPint1 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
IPPint2 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./2*self.upper**2*np.cos(phi-phi1.T)
IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./2*self.lower**2*np.cos(phi-phi1.T)
#IPPint2[0,0] = (self.upper**2 - self.lower**2)*np.cos(phi[0,0])*np.cos(phi1[0,0])
IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period, -2*self.a[2]*self.basis_omega**2/self.period, -3*self.a[3]*self.basis_omega**3/self.period))
@ -186,81 +399,7 @@ class PeriodicMatern52(Kernpart):
dG_dper = 1./self.variance*(3*self.lengthscale**5/(400*np.sqrt(5))*dGint_dper + 0.5*dlower_terms_dper)
dK_dper = mdot(dFX_dper,self.Gi,FX2.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX2.T) + mdot(FX,self.Gi,dFX2_dper.T)
#np.add(target[:,:,0],dK_dvar, target[:,:,0])
target[0] += np.sum(dK_dvar*dL_dK)
#np.add(target[:,:,1],dK_dlen, target[:,:,1])
target[1] += np.sum(dK_dlen*dL_dK)
#np.add(target[:,:,2],dK_dper, target[:,:,2])
target[2] += np.sum(dK_dper*dL_dK)
self.variance.gradient = np.sum(dK_dvar*dL_dK)
self.lengthscale.gradient = np.sum(dK_dlen*dL_dK)
self.period.gradient = np.sum(dK_dper*dL_dK)
@silence_errors
def dKdiag_dtheta(self,dL_dKdiag,X,target):
"""derivative of the diagonal of the covariance matrix with respect to the parameters"""
FX = self._cos(self.basis_alpha[None,:],self.basis_omega[None,:],self.basis_phi[None,:])(X)
La = np.column_stack((self.a[0]*np.ones((self.n_basis,1)), self.a[1]*self.basis_omega, self.a[2]*self.basis_omega**2, self.a[3]*self.basis_omega**3))
Lo = np.column_stack((self.basis_omega, self.basis_omega, self.basis_omega, self.basis_omega))
Lp = np.column_stack((self.basis_phi, self.basis_phi+np.pi/2, self.basis_phi+np.pi, self.basis_phi+np.pi*3/2))
r,omega,phi = self._cos_factorization(La,Lo,Lp)
Gint = self._int_computation( r,omega,phi, r,omega,phi)
Flower = np.array(self._cos(self.basis_alpha,self.basis_omega,self.basis_phi)(self.lower))[:,None]
F1lower = np.array(self._cos(self.basis_alpha*self.basis_omega,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
F2lower = np.array(self._cos(self.basis_alpha*self.basis_omega**2,self.basis_omega,self.basis_phi+np.pi)(self.lower))[:,None]
#dK_dvar
dK_dvar = 1. / self.variance * mdot(FX, self.Gi, FX.T)
#dK_dlen
da_dlen = [-3*self.a[0]/self.lengthscale, -2*self.a[1]/self.lengthscale, -self.a[2]/self.lengthscale, 0.]
db_dlen = [0., 4*self.b[1]/self.lengthscale, 2*self.b[2]/self.lengthscale, 2*self.b[3]/self.lengthscale, 2*self.b[4]/self.lengthscale]
dLa_dlen = np.column_stack((da_dlen[0]*np.ones((self.n_basis,1)), da_dlen[1]*self.basis_omega, da_dlen[2]*self.basis_omega**2, da_dlen[3]*self.basis_omega**3))
r1,omega1,phi1 = self._cos_factorization(dLa_dlen,Lo,Lp)
dGint_dlen = self._int_computation(r1,omega1,phi1, r,omega,phi)
dGint_dlen = dGint_dlen + dGint_dlen.T
dlower_terms_dlen = db_dlen[0]*np.dot(Flower,Flower.T) + db_dlen[1]*np.dot(F2lower,F2lower.T) + db_dlen[2]*np.dot(F1lower,F1lower.T) + db_dlen[3]*np.dot(F2lower,Flower.T) + db_dlen[4]*np.dot(Flower,F2lower.T)
dG_dlen = 15*self.lengthscale**4/(400*np.sqrt(5))*Gint + 3*self.lengthscale**5/(400*np.sqrt(5))*dGint_dlen + dlower_terms_dlen
dK_dlen = -mdot(FX,self.Gi,dG_dlen/self.variance,self.Gi,FX.T)
#dK_dper
dFX_dper = self._cos(-self.basis_alpha[None,:]*self.basis_omega[None,:]/self.period*X ,self.basis_omega[None,:],self.basis_phi[None,:]+np.pi/2)(X)
dLa_dper = np.column_stack((-self.a[0]*self.basis_omega/self.period, -self.a[1]*self.basis_omega**2/self.period, -self.a[2]*self.basis_omega**3/self.period, -self.a[3]*self.basis_omega**4/self.period))
dLp_dper = np.column_stack((self.basis_phi+np.pi/2,self.basis_phi+np.pi,self.basis_phi+np.pi*3/2,self.basis_phi))
r1,omega1,phi1 = self._cos_factorization(dLa_dper,Lo,dLp_dper)
IPPprim1 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi/2))
IPPprim1 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + 1./(omega-omega1.T)*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi/2))
IPPprim2 = self.upper*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi/2) + self.upper*np.cos(phi-phi1.T))
IPPprim2 -= self.lower*(1./(omega+omega1.T)*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi/2) + self.lower*np.cos(phi-phi1.T))
IPPprim = np.where(np.isnan(IPPprim1),IPPprim2,IPPprim1)
IPPint1 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.upper+phi-phi1.T-np.pi)
IPPint1 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + 1./(omega-omega1.T)**2*np.cos((omega-omega1.T)*self.lower+phi-phi1.T-np.pi)
IPPint2 = 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.upper+phi+phi1.T-np.pi) + .5*self.upper**2*np.cos(phi-phi1.T)
IPPint2 -= 1./(omega+omega1.T)**2*np.cos((omega+omega1.T)*self.lower+phi+phi1.T-np.pi) + .5*self.lower**2*np.cos(phi-phi1.T)
IPPint = np.where(np.isnan(IPPint1),IPPint2,IPPint1)
dLa_dper2 = np.column_stack((-self.a[1]*self.basis_omega/self.period, -2*self.a[2]*self.basis_omega**2/self.period, -3*self.a[3]*self.basis_omega**3/self.period))
dLp_dper2 = np.column_stack((self.basis_phi+np.pi/2, self.basis_phi+np.pi, self.basis_phi+np.pi*3/2))
r2,omega2,phi2 = self._cos_factorization(dLa_dper2,Lo[:,0:2],dLp_dper2)
dGint_dper = np.dot(r,r1.T)/2 * (IPPprim - IPPint) + self._int_computation(r2,omega2,phi2, r,omega,phi)
dGint_dper = dGint_dper + dGint_dper.T
dFlower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
dF1lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower)+self._cos(-self.basis_alpha*self.basis_omega/self.period,self.basis_omega,self.basis_phi+np.pi/2)(self.lower))[:,None]
dF2lower_dper = np.array(self._cos(-self.lower*self.basis_alpha*self.basis_omega**3/self.period,self.basis_omega,self.basis_phi+np.pi*3/2)(self.lower) + self._cos(-2*self.basis_alpha*self.basis_omega**2/self.period,self.basis_omega,self.basis_phi+np.pi)(self.lower))[:,None]
dlower_terms_dper = self.b[0] * (np.dot(dFlower_dper,Flower.T) + np.dot(Flower.T,dFlower_dper))
dlower_terms_dper += self.b[1] * (np.dot(dF2lower_dper,F2lower.T) + np.dot(F2lower,dF2lower_dper.T)) - 4*self.b[1]/self.period*np.dot(F2lower,F2lower.T)
dlower_terms_dper += self.b[2] * (np.dot(dF1lower_dper,F1lower.T) + np.dot(F1lower,dF1lower_dper.T)) - 2*self.b[2]/self.period*np.dot(F1lower,F1lower.T)
dlower_terms_dper += self.b[3] * (np.dot(dF2lower_dper,Flower.T) + np.dot(F2lower,dFlower_dper.T)) - 2*self.b[3]/self.period*np.dot(F2lower,Flower.T)
dlower_terms_dper += self.b[4] * (np.dot(dFlower_dper,F2lower.T) + np.dot(Flower,dF2lower_dper.T)) - 2*self.b[4]/self.period*np.dot(Flower,F2lower.T)
dG_dper = 1./self.variance*(3*self.lengthscale**5/(400*np.sqrt(5))*dGint_dper + 0.5*dlower_terms_dper)
dK_dper = 2*mdot(dFX_dper,self.Gi,FX.T) - mdot(FX,self.Gi,dG_dper,self.Gi,FX.T)
target[0] += np.sum(np.diag(dK_dvar)*dL_dKdiag)
target[1] += np.sum(np.diag(dK_dlen)*dL_dKdiag)
target[2] += np.sum(np.diag(dK_dper)*dL_dKdiag)
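The hunks above swap the old flat gradient interface, where dK_dtheta and dKdiag_dtheta accumulated into target[0..2], for the Param-based one in which update_gradients_full writes each parameter's summed gradient to its own .gradient attribute. A minimal sketch of that convention, assuming a toy kernel with a single variance parameter (not GPy code; the Param machinery is stubbed out with a plain attribute):

import numpy as np

class ToyKern(object):
    """Hypothetical fragment illustrating the per-parameter .gradient convention."""
    def __init__(self, variance=1.):
        self.variance = variance            # stands in for a GPy Param object
        self.variance_gradient = 0.         # a real Param exposes .gradient

    def K(self, X, X2=None):
        X2 = X if X2 is None else X2
        return self.variance * np.dot(X, X2.T)

    def update_gradients_full(self, dL_dK, X, X2=None):
        dK_dvar = self.K(X, X2) / self.variance
        self.variance_gradient = np.sum(dK_dvar * dL_dK)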

41
GPy/kern/_src/poly.py Normal file
View file

@ -0,0 +1,41 @@
# Copyright (c) 2014, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
class Poly(Kern):
"""
Polynomial kernel
"""
def __init__(self, input_dim, variance=1., order=3., active_dims=None, name='poly'):
super(Poly, self).__init__(input_dim, active_dims, name)
self.variance = Param('variance', variance, Logexp())
self.link_parameter(self.variance)
self.order=order
def K(self, X, X2=None):
return (self._dot_product(X, X2) + 1.)**self.order * self.variance
def _dot_product(self, X, X2=None):
if X2 is None:
return np.dot(X, X.T)
else:
return np.dot(X, X2.T)
def Kdiag(self, X):
return self.variance*(np.square(X).sum(1) + 1.)**self.order
def update_gradients_full(self, dL_dK, X, X2=None):
self.variance.gradient = np.sum(dL_dK * (self._dot_product(X, X2) + 1.)**self.order)
def update_gradients_diag(self, dL_dKdiag, X):
raise NotImplementedError
def gradients_X(self, dL_dK, X, X2=None):
raise NotImplementedError
def gradients_X_diag(self, dL_dKdiag, X):
raise NotImplementedError
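A hedged usage sketch for the Poly kernel above, assuming it is re-exported through GPy.kern like the other kernel parts (the data X is made up):

import numpy as np
import GPy

X = np.random.randn(20, 2)
k = GPy.kern.Poly(input_dim=2, variance=1., order=3.)
K = k.K(X)                      # variance * (X X^T + 1)**order, shape (20, 20)
Kdiag = k.Kdiag(X)              # diagonal only, without forming the full matrix
assert np.allclose(np.diag(K), Kdiag)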

66
GPy/kern/_src/prod.py Normal file
View file

@ -0,0 +1,66 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from kern import CombinationKernel
from ...util.caching import Cache_this
import itertools
class Prod(CombinationKernel):
"""
Computes the product of a list of kernels
:param kernels: the kernels to multiply together
:type kernels: list of Kern
:param name: name of the resulting kernel
:type name: str
:rtype: kernel object
"""
def __init__(self, kernels, name='mul'):
for i, kern in enumerate(kernels[:]):
if isinstance(kern, Prod):
del kernels[i]
for part in kern.parts[::-1]:
kern.unlink_parameter(part)
kernels.insert(i, part)
super(Prod, self).__init__(kernels, name)
@Cache_this(limit=2, force_kwargs=['which_parts'])
def K(self, X, X2=None, which_parts=None):
if which_parts is None:
which_parts = self.parts
elif not isinstance(which_parts, (list, tuple)):
# if only one part is given
which_parts = [which_parts]
return reduce(np.multiply, (p.K(X, X2) for p in which_parts))
@Cache_this(limit=2, force_kwargs=['which_parts'])
def Kdiag(self, X, which_parts=None):
if which_parts is None:
which_parts = self.parts
return reduce(np.multiply, (p.Kdiag(X) for p in which_parts))
def update_gradients_full(self, dL_dK, X, X2=None):
k = self.K(X,X2)*dL_dK
for p in self.parts:
p.update_gradients_full(k/p.K(X,X2),X,X2)
def update_gradients_diag(self, dL_dKdiag, X):
k = self.Kdiag(X)*dL_dKdiag
for p in self.parts:
p.update_gradients_diag(k/p.Kdiag(X),X)
def gradients_X(self, dL_dK, X, X2=None):
target = np.zeros(X.shape)
k = self.K(X,X2)*dL_dK
for p in self.parts:
target += p.gradients_X(k/p.K(X,X2),X,X2)
return target
def gradients_X_diag(self, dL_dKdiag, X):
target = np.zeros(X.shape)
k = self.Kdiag(X)*dL_dKdiag
for p in self.parts:
target += p.gradients_X_diag(k/p.Kdiag(X),X)
return target
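A hedged usage sketch, assuming the usual GPy convention that multiplying two kernel objects with '*' builds a Prod over them (RBF and Bias are existing GPy kernels; the data is made up):

import numpy as np
import GPy

X = np.random.randn(10, 1)
k = GPy.kern.RBF(1) * GPy.kern.Bias(1)       # a Prod([RBF, Bias], name='mul')
K = k.K(X)
assert np.allclose(K, GPy.kern.RBF(1).K(X) * GPy.kern.Bias(1).K(X))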

View file

@ -0,0 +1,55 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from ....core.parameterization.parameter_core import Pickleable
from GPy.util.caching import Cache_this
from ....core.parameterization import variational
import rbf_psi_comp
import ssrbf_psi_comp
import sslinear_psi_comp
import linear_psi_comp
class PSICOMP_RBF(Pickleable):
@Cache_this(limit=2, ignore_args=(0,))
def psicomputations(self, variance, lengthscale, Z, variational_posterior):
if isinstance(variational_posterior, variational.NormalPosterior):
return rbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior)
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
return ssrbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior)
else:
raise ValueError, "unknown distribution received for psi-statistics"
@Cache_this(limit=2, ignore_args=(0,1,2,3))
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
if isinstance(variational_posterior, variational.NormalPosterior):
return rbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior)
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
return ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior)
else:
raise ValueError, "unknown distribution received for psi-statistics"
def _setup_observers(self):
pass
class PSICOMP_Linear(Pickleable):
@Cache_this(limit=2, ignore_args=(0,))
def psicomputations(self, variance, Z, variational_posterior):
if isinstance(variational_posterior, variational.NormalPosterior):
return linear_psi_comp.psicomputations(variance, Z, variational_posterior)
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
return sslinear_psi_comp.psicomputations(variance, Z, variational_posterior)
else:
raise ValueError, "unknown distribution received for psi-statistics"
@Cache_this(limit=2, ignore_args=(0,1,2,3))
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior):
if isinstance(variational_posterior, variational.NormalPosterior):
return linear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior)
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
return sslinear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior)
else:
raise ValueError, "unknown distribution received for psi-statistics"
def _setup_observers(self):
pass
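A minimal illustration (not GPy API) of the isinstance-based dispatch used by PSICOMP_RBF and PSICOMP_Linear above: one front-end call is routed to the dense-Gaussian or spike-and-slab implementation depending on the type of the variational posterior.

class NormalPosterior(object): pass
class SpikeAndSlabPosterior(object): pass

def psicomputations(variance, lengthscale, Z, posterior):
    if isinstance(posterior, NormalPosterior):
        return "rbf_psi_comp path"            # q(X) is Gaussian
    elif isinstance(posterior, SpikeAndSlabPosterior):
        return "ssrbf_psi_comp path"          # q(X) carries a Bernoulli spike
    raise ValueError("unknown distribution received for psi-statistics")

assert psicomputations(1., 1., None, NormalPosterior()) == "rbf_psi_comp path"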

View file

@ -0,0 +1,77 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
The package for the Psi statistics computation of the linear kernel for Bayesian GPLVM
"""
import numpy as np
from ....util.linalg import tdot
def psicomputations(variance, Z, variational_posterior):
"""
Compute psi-statistics for the linear kernel
"""
# here are the "statistics" for psi0, psi1 and psi2
# Produced intermediate results:
# psi0 N
# psi1 NxM
# psi2 MxM
mu = variational_posterior.mean
S = variational_posterior.variance
psi0 = (variance*(np.square(mu)+S)).sum(axis=1)
psi1 = np.dot(mu,(variance*Z).T)
psi2 = np.dot(S.sum(axis=0)*np.square(variance)*Z,Z.T)+ tdot(psi1.T)
return psi0, psi1, psi2
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior):
mu = variational_posterior.mean
S = variational_posterior.variance
dL_dvar, dL_dmu, dL_dS, dL_dZ = _psi2computations(dL_dpsi2, variance, Z, mu, S)
# Compute for psi0 and psi1
mu2S = np.square(mu)+S
dL_dpsi0_var = dL_dpsi0[:,None]*variance[None,:]
dL_dpsi1_mu = np.dot(dL_dpsi1.T,mu)
dL_dvar += (dL_dpsi0[:,None]*mu2S).sum(axis=0)+ (dL_dpsi1_mu*Z).sum(axis=0)
dL_dmu += 2.*dL_dpsi0_var*mu+np.dot(dL_dpsi1,Z)*variance
dL_dS += dL_dpsi0_var
dL_dZ += dL_dpsi1_mu*variance
return dL_dvar, dL_dZ, dL_dmu, dL_dS
def _psi2computations(dL_dpsi2, variance, Z, mu, S):
"""
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
"""
# here are the "statistics" for psi1 and psi2
# Produced intermediate results:
# _psi2_dvariance Q
# _psi2_dZ MxQ
# _psi2_dmu NxQ
# _psi2_dS NxQ
variance2 = np.square(variance)
common_sum = np.dot(mu,(variance*Z).T)
Z_expect = (np.dot(dL_dpsi2,Z)*Z).sum(axis=0)
dL_dpsi2T = dL_dpsi2+dL_dpsi2.T
common_expect = np.dot(common_sum,np.dot(dL_dpsi2T,Z))
Z2_expect = np.inner(common_sum,dL_dpsi2T)
Z1_expect = np.dot(dL_dpsi2T,Z)
dL_dvar = 2.*S.sum(axis=0)*variance*Z_expect+(common_expect*mu).sum(axis=0)
dL_dmu = common_expect*variance
dL_dS = np.empty(S.shape)
dL_dS[:] = Z_expect*variance2
dL_dZ = variance2*S.sum(axis=0)*Z1_expect+np.dot(Z2_expect.T,variance*mu)
return dL_dvar, dL_dmu, dL_dS, dL_dZ
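The closed forms above are expectations of the linear kernel under q(x_n) = N(mu_n, diag(S_n)): psi0_n = E[k(x_n,x_n)], psi1_nm = E[k(x_n,Z_m)] and psi2 = sum_n E[k(Z,x_n)k(x_n,Z)]. A hedged Monte-Carlo check of psi0 and psi1 with made-up shapes (standalone NumPy, not GPy code):

import numpy as np

rng = np.random.RandomState(0)
Q, M = 2, 3
variance = rng.rand(Q) + 0.5                 # ARD variances of the linear kernel
Z = rng.randn(M, Q)
mu, S = rng.randn(1, Q), rng.rand(1, Q)      # a single latent point, N = 1

x = mu + np.sqrt(S) * rng.randn(100000, Q)   # samples from q(x)
psi0_mc = np.mean(np.sum(variance * x * x, axis=1))
psi1_mc = np.mean(np.dot(x, (variance * Z).T), axis=0)
# psi0_mc ~ (variance*(mu**2 + S)).sum();  psi1_mc ~ np.dot(mu, (variance*Z).T)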

View file

@ -0,0 +1,161 @@
"""
The module for psi-statistics for RBF kernel
"""
import numpy as np
from GPy.util.caching import Cacher
def psicomputations(variance, lengthscale, Z, variational_posterior):
"""
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
"""
# here are the "statistics" for psi0, psi1 and psi2
# Produced intermediate results:
# _psi1 NxM
mu = variational_posterior.mean
S = variational_posterior.variance
psi0 = np.empty(mu.shape[0])
psi0[:] = variance
psi1 = _psi1computations(variance, lengthscale, Z, mu, S)
psi2 = _psi2computations(variance, lengthscale, Z, mu, S).sum(axis=0)
return psi0, psi1, psi2
def __psi1computations(variance, lengthscale, Z, mu, S):
"""
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
"""
# here are the "statistics" for psi1
# Produced intermediate results:
# _psi1 NxM
lengthscale2 = np.square(lengthscale)
# psi1
_psi1_logdenom = np.log(S/lengthscale2+1.).sum(axis=-1) # N
_psi1_log = (_psi1_logdenom[:,None]+np.einsum('nmq,nq->nm',np.square(mu[:,None,:]-Z[None,:,:]),1./(S+lengthscale2)))/(-2.)
_psi1 = variance*np.exp(_psi1_log)
return _psi1
def __psi2computations(variance, lengthscale, Z, mu, S):
"""
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
"""
# here are the "statistics" for psi2
# Produced intermediate results:
# _psi2 MxM
lengthscale2 = np.square(lengthscale)
_psi2_logdenom = np.log(2.*S/lengthscale2+1.).sum(axis=-1)/(-2.) # N
_psi2_exp1 = (np.square(Z[:,None,:]-Z[None,:,:])/lengthscale2).sum(axis=-1)/(-4.) #MxM
Z_hat = (Z[:,None,:]+Z[None,:,:])/2. #MxMxQ
denom = 1./(2.*S+lengthscale2)
_psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+2.*np.einsum('nq,moq,nq->nmo',mu,Z_hat,denom)-np.einsum('moq,nq->nmo',np.square(Z_hat),denom)
_psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2)
return _psi2
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
ARD = (len(lengthscale)!=1)
dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance)
dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance)
dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
dL_dlengscale = dl_psi1 + dl_psi2
if not ARD:
dL_dlengscale = dL_dlengscale.sum()
dL_dmu = dmu_psi1 + dmu_psi2
dL_dS = dS_psi1 + dS_psi2
dL_dZ = dZ_psi1 + dZ_psi2
return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS
def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S):
"""
dL_dpsi1 - NxM
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
"""
# here are the "statistics" for psi1
# Produced intermediate results: dL_dparams w.r.t. psi1
# _dL_dvariance 1
# _dL_dlengthscale Q
# _dL_dZ MxQ
# _dL_dgamma NxQ
# _dL_dmu NxQ
# _dL_dS NxQ
lengthscale2 = np.square(lengthscale)
_psi1 = _psi1computations(variance, lengthscale, Z, mu, S)
Lpsi1 = dL_dpsi1*_psi1
Zmu = Z[None,:,:]-mu[:,None,:] # NxMxQ
denom = 1./(S+lengthscale2)
Zmu2_denom = np.square(Zmu)*denom[:,None,:] #NxMxQ
_dL_dvar = Lpsi1.sum()/variance
_dL_dmu = np.einsum('nm,nmq,nq->nq',Lpsi1,Zmu,denom)
_dL_dS = np.einsum('nm,nmq,nq->nq',Lpsi1,(Zmu2_denom-1.),denom)/2.
_dL_dZ = -np.einsum('nm,nmq,nq->mq',Lpsi1,Zmu,denom)
_dL_dl = np.einsum('nm,nmq,nq->q',Lpsi1,(Zmu2_denom+(S/lengthscale2)[:,None,:]),denom*lengthscale)
return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
"""
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
dL_dpsi2 - MxM
"""
# here are the "statistics" for psi2
# Produced the derivatives w.r.t. psi2:
# _dL_dvariance 1
# _dL_dlengthscale Q
# _dL_dZ MxQ
# _dL_dgamma NxQ
# _dL_dmu NxQ
# _dL_dS NxQ
lengthscale2 = np.square(lengthscale)
denom = 1./(2*S+lengthscale2)
denom2 = np.square(denom)
_psi2 = _psi2computations(variance, lengthscale, Z, mu, S) # NxMxM
Lpsi2 = dL_dpsi2*_psi2 # dL_dpsi2 is MxM, using broadcast to multiply N out
Lpsi2sum = np.einsum('nmo->n',Lpsi2) #N
Lpsi2Z = np.einsum('nmo,oq->nq',Lpsi2,Z) #NxQ
Lpsi2Z2 = np.einsum('nmo,oq,oq->nq',Lpsi2,Z,Z) #NxQ
Lpsi2Z2p = np.einsum('nmo,mq,oq->nq',Lpsi2,Z,Z) #NxQ
Lpsi2Zhat = Lpsi2Z
Lpsi2Zhat2 = (Lpsi2Z2+Lpsi2Z2p)/2
_dL_dvar = Lpsi2sum.sum()*2/variance
_dL_dmu = (-2*denom) * (mu*Lpsi2sum[:,None]-Lpsi2Zhat)
_dL_dS = (2*np.square(denom))*(np.square(mu)*Lpsi2sum[:,None]-2*mu*Lpsi2Zhat+Lpsi2Zhat2) - denom*Lpsi2sum[:,None]
_dL_dZ = -np.einsum('nmo,oq->oq',Lpsi2,Z)/lengthscale2+np.einsum('nmo,oq->mq',Lpsi2,Z)/lengthscale2+ \
2*np.einsum('nmo,nq,nq->mq',Lpsi2,mu,denom) - np.einsum('nmo,nq,mq->mq',Lpsi2,denom,Z) - np.einsum('nmo,oq,nq->mq',Lpsi2,Z,denom)
_dL_dl = 2*lengthscale* ((S/lengthscale2*denom+np.square(mu*denom))*Lpsi2sum[:,None]+(Lpsi2Z2-Lpsi2Z2p)/(2*np.square(lengthscale2))-
(2*mu*denom2)*Lpsi2Zhat+denom2*Lpsi2Zhat2).sum(axis=0)
return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
_psi1computations = Cacher(__psi1computations, limit=1)
_psi2computations = Cacher(__psi2computations, limit=1)
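For reference, __psi1computations above evaluates, in log space, psi1[n,m] = variance * prod_q (S_nq/l_q^2 + 1)^(-1/2) * exp(-(mu_nq - Z_mq)^2 / (2*(S_nq + l_q^2))). A hedged scalar check of that factorised form for a single (n, m) pair, with made-up inputs:

import numpy as np

rng = np.random.RandomState(1)
Q = 3
variance = 2.0
lengthscale = rng.rand(Q) + 0.5
mu_n, S_n, Z_m = rng.randn(Q), rng.rand(Q), rng.randn(Q)

l2 = lengthscale**2
psi1_nm = variance
for q in range(Q):
    psi1_nm *= np.exp(-(mu_n[q] - Z_m[q])**2 / (2.*(S_n[q] + l2[q]))) \
               / np.sqrt(S_n[q]/l2[q] + 1.)
# psi1_nm should equal variance*np.exp(_psi1_log) for this row and column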

View file

@ -0,0 +1,411 @@
"""
The module for psi-statistics for RBF kernel
"""
import numpy as np
from ....util.caching import Cache_this
from . import PSICOMP_RBF
from ....util import gpu_init
try:
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
from ....util.linalg_gpu import sum_axis
except:
pass
gpu_code = """
// define THREADNUM
#define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
#define IDX_NMM(n,m1,m2) ((m2*M+m1)*N+n)
#define IDX_NQ(n,q) (q*N+n)
#define IDX_NM(n,m) (m*N+n)
#define IDX_MQ(m,q) (q*M+m)
#define IDX_MM(m1,m2) (m2*M+m1)
#define IDX_NQB(n,q,b) ((b*Q+q)*N+n)
#define IDX_QB(q,b) (b*Q+q)
// Divide data evenly
__device__ void divide_data(int total_data, int psize, int pidx, int *start, int *end) {
int residue = (total_data)%psize;
if(pidx<residue) {
int size = total_data/psize+1;
*start = size*pidx;
*end = *start+size;
} else {
int size = total_data/psize;
*start = size*pidx+residue;
*end = *start+size;
}
}
__device__ void reduce_sum(double* array, int array_size) {
int s;
if(array_size >= blockDim.x) {
for(int i=blockDim.x+threadIdx.x; i<array_size; i+= blockDim.x) {
array[threadIdx.x] += array[i];
}
array_size = blockDim.x;
}
__syncthreads();
for(int i=1; i<=array_size;i*=2) {s=i;}
if(threadIdx.x < array_size-s) {array[threadIdx.x] += array[s+threadIdx.x];}
__syncthreads();
for(s=s/2;s>=1;s=s/2) {
if(threadIdx.x < s) {array[threadIdx.x] += array[s+threadIdx.x];}
__syncthreads();
}
}
__global__ void compDenom(double *log_denom1, double *log_denom2, double *l, double *S, int N, int Q)
{
int n_start, n_end;
divide_data(N, gridDim.x, blockIdx.x, &n_start, &n_end);
for(int i=n_start*Q+threadIdx.x; i<n_end*Q; i+=blockDim.x) {
int n=i/Q;
int q=i%Q;
double Snq = S[IDX_NQ(n,q)];
double lq = l[q]*l[q];
log_denom1[IDX_NQ(n,q)] = log(Snq/lq+1.);
log_denom2[IDX_NQ(n,q)] = log(2.*Snq/lq+1.);
}
}
__global__ void psi1computations(double *psi1, double *log_denom1, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
{
int m_start, m_end;
divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
for(int m=m_start; m<m_end; m++) {
for(int n=threadIdx.x; n<N; n+= blockDim.x) {
double log_psi1 = 0;
for(int q=0;q<Q;q++) {
double muZ = mu[IDX_NQ(n,q)]-Z[IDX_MQ(m,q)];
double Snq = S[IDX_NQ(n,q)];
double lq = l[q]*l[q];
log_psi1 += (muZ*muZ/(Snq+lq)+log_denom1[IDX_NQ(n,q)])/(-2.);
}
psi1[IDX_NM(n,m)] = var*exp(log_psi1);
}
}
}
__global__ void psi2computations(double *psi2, double *psi2n, double *log_denom2, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
{
int psi2_idx_start, psi2_idx_end;
__shared__ double psi2_local[THREADNUM];
divide_data((M+1)*M/2, gridDim.x, blockIdx.x, &psi2_idx_start, &psi2_idx_end);
for(int psi2_idx=psi2_idx_start; psi2_idx<psi2_idx_end; psi2_idx++) {
int m1 = int((sqrt(8.*psi2_idx+1.)-1.)/2.);
int m2 = psi2_idx - (m1+1)*m1/2;
psi2_local[threadIdx.x] = 0;
for(int n=threadIdx.x;n<N;n+=blockDim.x) {
double log_psi2_n = 0;
for(int q=0;q<Q;q++) {
double dZ = Z[IDX_MQ(m1,q)] - Z[IDX_MQ(m2,q)];
double muZhat = mu[IDX_NQ(n,q)]- (Z[IDX_MQ(m1,q)]+Z[IDX_MQ(m2,q)])/2.;
double Snq = S[IDX_NQ(n,q)];
double lq = l[q]*l[q];
log_psi2_n += dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) + log_denom2[IDX_NQ(n,q)]/(-2.);
}
double exp_psi2_n = exp(log_psi2_n);
psi2n[IDX_NMM(n,m1,m2)] = var*var*exp_psi2_n;
if(m1!=m2) { psi2n[IDX_NMM(n,m2,m1)] = var*var*exp_psi2_n;}
psi2_local[threadIdx.x] += exp_psi2_n;
}
__syncthreads();
reduce_sum(psi2_local, THREADNUM);
if(threadIdx.x==0) {
psi2[IDX_MM(m1,m2)] = var*var*psi2_local[0];
if(m1!=m2) { psi2[IDX_MM(m2,m1)] = var*var*psi2_local[0]; }
}
__syncthreads();
}
}
__global__ void psi1compDer(double *dvar, double *dl, double *dZ, double *dmu, double *dS, double *dL_dpsi1, double *psi1, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
{
int m_start, m_end;
__shared__ double g_local[THREADNUM];
divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
int P = int(ceil(double(N)/THREADNUM));
double dvar_local = 0;
for(int q=0;q<Q;q++) {
double lq_sqrt = l[q];
double lq = lq_sqrt*lq_sqrt;
double dl_local = 0;
for(int p=0;p<P;p++) {
int n = p*THREADNUM + threadIdx.x;
double dmu_local = 0;
double dS_local = 0;
double Snq,mu_nq;
if(n<N) {Snq = S[IDX_NQ(n,q)]; mu_nq=mu[IDX_NQ(n,q)];}
for(int m=m_start; m<m_end; m++) {
if(n<N) {
double lpsi1 = psi1[IDX_NM(n,m)]*dL_dpsi1[IDX_NM(n,m)];
if(q==0) {dvar_local += lpsi1;}
double Zmu = Z[IDX_MQ(m,q)] - mu_nq;
double denom = Snq+lq;
double Zmu2_denom = Zmu*Zmu/denom;
dmu_local += lpsi1*Zmu/denom;
dS_local += lpsi1*(Zmu2_denom-1.)/denom;
dl_local += lpsi1*(Zmu2_denom+Snq/lq)/denom;
g_local[threadIdx.x] = -lpsi1*Zmu/denom;
}
__syncthreads();
reduce_sum(g_local, p<P-1?THREADNUM:N-(P-1)*THREADNUM);
if(threadIdx.x==0) {dZ[IDX_MQ(m,q)] += g_local[0];}
}
if(n<N) {
dmu[IDX_NQB(n,q,blockIdx.x)] += dmu_local;
dS[IDX_NQB(n,q,blockIdx.x)] += dS_local/2.;
}
__threadfence_block();
}
g_local[threadIdx.x] = dl_local*lq_sqrt;
__syncthreads();
reduce_sum(g_local, THREADNUM);
if(threadIdx.x==0) {dl[IDX_QB(q,blockIdx.x)] += g_local[0];}
}
g_local[threadIdx.x] = dvar_local;
__syncthreads();
reduce_sum(g_local, THREADNUM);
if(threadIdx.x==0) {dvar[blockIdx.x] += g_local[0]/var;}
}
__global__ void psi2compDer(double *dvar, double *dl, double *dZ, double *dmu, double *dS, double *dL_dpsi2, double *psi2n, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
{
int m_start, m_end;
__shared__ double g_local[THREADNUM];
divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
int P = int(ceil(double(N)/THREADNUM));
double dvar_local = 0;
for(int q=0;q<Q;q++) {
double lq_sqrt = l[q];
double lq = lq_sqrt*lq_sqrt;
double dl_local = 0;
for(int p=0;p<P;p++) {
int n = p*THREADNUM + threadIdx.x;
double dmu_local = 0;
double dS_local = 0;
double Snq,mu_nq;
if(n<N) {Snq = S[IDX_NQ(n,q)]; mu_nq=mu[IDX_NQ(n,q)];}
for(int m1=m_start; m1<m_end; m1++) {
g_local[threadIdx.x] = 0;
for(int m2=0;m2<M;m2++) {
if(n<N) {
double lpsi2 = psi2n[IDX_NMM(n,m1,m2)]*dL_dpsi2[IDX_MM(m1,m2)];
if(q==0) {dvar_local += lpsi2;}
double dZ = Z[IDX_MQ(m1,q)] - Z[IDX_MQ(m2,q)];
double muZhat = mu_nq - (Z[IDX_MQ(m1,q)] + Z[IDX_MQ(m2,q)])/2.;
double denom = 2.*Snq+lq;
double muZhat2_denom = muZhat*muZhat/denom;
dmu_local += lpsi2*muZhat/denom;
dS_local += lpsi2*(2.*muZhat2_denom-1.)/denom;
dl_local += lpsi2*((Snq/lq+muZhat2_denom)/denom+dZ*dZ/(4.*lq*lq));
g_local[threadIdx.x] += 2.*lpsi2*(muZhat/denom-dZ/(2*lq));
}
}
__syncthreads();
reduce_sum(g_local, p<P-1?THREADNUM:N-(P-1)*THREADNUM);
if(threadIdx.x==0) {dZ[IDX_MQ(m1,q)] += g_local[0];}
}
if(n<N) {
dmu[IDX_NQB(n,q,blockIdx.x)] += -2.*dmu_local;
dS[IDX_NQB(n,q,blockIdx.x)] += dS_local;
}
__threadfence_block();
}
g_local[threadIdx.x] = dl_local*2.*lq_sqrt;
__syncthreads();
reduce_sum(g_local, THREADNUM);
if(threadIdx.x==0) {dl[IDX_QB(q,blockIdx.x)] += g_local[0];}
}
g_local[threadIdx.x] = dvar_local;
__syncthreads();
reduce_sum(g_local, THREADNUM);
if(threadIdx.x==0) {dvar[blockIdx.x] += g_local[0]*2/var;}
}
"""
class PSICOMP_RBF_GPU(PSICOMP_RBF):
def __init__(self, threadnum=128, blocknum=15, GPU_direct=False):
self.GPU_direct = GPU_direct
self.gpuCache = None
self.threadnum = threadnum
self.blocknum = blocknum
module = SourceModule("#define THREADNUM "+str(self.threadnum)+"\n"+gpu_code)
self.g_psi1computations = module.get_function('psi1computations')
self.g_psi1computations.prepare('PPdPPPPiii')
self.g_psi2computations = module.get_function('psi2computations')
self.g_psi2computations.prepare('PPPdPPPPiii')
self.g_psi1compDer = module.get_function('psi1compDer')
self.g_psi1compDer.prepare('PPPPPPPdPPPPiii')
self.g_psi2compDer = module.get_function('psi2compDer')
self.g_psi2compDer.prepare('PPPPPPPdPPPPiii')
self.g_compDenom = module.get_function('compDenom')
self.g_compDenom.prepare('PPPPii')
def __deepcopy__(self, memo):
s = PSICOMP_RBF_GPU(threadnum=self.threadnum, blocknum=self.blocknum, GPU_direct=self.GPU_direct)
memo[id(self)] = s
return s
def _initGPUCache(self, N, M, Q):
if self.gpuCache == None:
self.gpuCache = {
'l_gpu' :gpuarray.empty((Q,),np.float64,order='F'),
'Z_gpu' :gpuarray.empty((M,Q),np.float64,order='F'),
'mu_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
'S_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
'psi1_gpu' :gpuarray.empty((N,M),np.float64,order='F'),
'psi2_gpu' :gpuarray.empty((M,M),np.float64,order='F'),
'psi2n_gpu' :gpuarray.empty((N,M,M),np.float64,order='F'),
'dL_dpsi1_gpu' :gpuarray.empty((N,M),np.float64,order='F'),
'dL_dpsi2_gpu' :gpuarray.empty((M,M),np.float64,order='F'),
'log_denom1_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
'log_denom2_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
# derivatives
'dvar_gpu' :gpuarray.empty((self.blocknum,),np.float64, order='F'),
'dl_gpu' :gpuarray.empty((Q,self.blocknum),np.float64, order='F'),
'dZ_gpu' :gpuarray.empty((M,Q),np.float64, order='F'),
'dmu_gpu' :gpuarray.empty((N,Q,self.blocknum),np.float64, order='F'),
'dS_gpu' :gpuarray.empty((N,Q,self.blocknum),np.float64, order='F'),
# grad
'grad_l_gpu' :gpuarray.empty((Q,),np.float64, order='F'),
'grad_mu_gpu' :gpuarray.empty((N,Q,),np.float64, order='F'),
'grad_S_gpu' :gpuarray.empty((N,Q,),np.float64, order='F'),
}
else:
assert N==self.gpuCache['mu_gpu'].shape[0]
assert M==self.gpuCache['Z_gpu'].shape[0]
assert Q==self.gpuCache['l_gpu'].shape[0]
def sync_params(self, lengthscale, Z, mu, S):
if len(lengthscale)==1:
self.gpuCache['l_gpu'].fill(lengthscale)
else:
self.gpuCache['l_gpu'].set(np.asfortranarray(lengthscale))
self.gpuCache['Z_gpu'].set(np.asfortranarray(Z))
self.gpuCache['mu_gpu'].set(np.asfortranarray(mu))
self.gpuCache['S_gpu'].set(np.asfortranarray(S))
N,Q = self.gpuCache['S_gpu'].shape
# t=self.g_compDenom(self.gpuCache['log_denom1_gpu'],self.gpuCache['log_denom2_gpu'],self.gpuCache['l_gpu'],self.gpuCache['S_gpu'], np.int32(N), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
# print 'g_compDenom '+str(t)
self.g_compDenom.prepared_call((self.blocknum,1),(self.threadnum,1,1), self.gpuCache['log_denom1_gpu'].gpudata,self.gpuCache['log_denom2_gpu'].gpudata,self.gpuCache['l_gpu'].gpudata,self.gpuCache['S_gpu'].gpudata, np.int32(N), np.int32(Q))
def reset_derivative(self):
self.gpuCache['dvar_gpu'].fill(0.)
self.gpuCache['dl_gpu'].fill(0.)
self.gpuCache['dZ_gpu'].fill(0.)
self.gpuCache['dmu_gpu'].fill(0.)
self.gpuCache['dS_gpu'].fill(0.)
self.gpuCache['grad_l_gpu'].fill(0.)
self.gpuCache['grad_mu_gpu'].fill(0.)
self.gpuCache['grad_S_gpu'].fill(0.)
def get_dimensions(self, Z, variational_posterior):
return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1]
@Cache_this(limit=1, ignore_args=(0,))
def psicomputations(self, variance, lengthscale, Z, variational_posterior):
"""
Z - MxQ
mu - NxQ
S - NxQ
"""
N,M,Q = self.get_dimensions(Z, variational_posterior)
self._initGPUCache(N,M,Q)
self.sync_params(lengthscale, Z, variational_posterior.mean, variational_posterior.variance)
psi1_gpu = self.gpuCache['psi1_gpu']
psi2_gpu = self.gpuCache['psi2_gpu']
psi2n_gpu = self.gpuCache['psi2n_gpu']
l_gpu = self.gpuCache['l_gpu']
Z_gpu = self.gpuCache['Z_gpu']
mu_gpu = self.gpuCache['mu_gpu']
S_gpu = self.gpuCache['S_gpu']
log_denom1_gpu = self.gpuCache['log_denom1_gpu']
log_denom2_gpu = self.gpuCache['log_denom2_gpu']
psi0 = np.empty((N,))
psi0[:] = variance
self.g_psi1computations.prepared_call((self.blocknum,1),(self.threadnum,1,1),psi1_gpu.gpudata, log_denom1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
self.g_psi2computations.prepared_call((self.blocknum,1),(self.threadnum,1,1),psi2_gpu.gpudata, psi2n_gpu.gpudata, log_denom2_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
# t = self.g_psi1computations(psi1_gpu, log_denom1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
# print 'g_psi1computations '+str(t)
# t = self.g_psi2computations(psi2_gpu, psi2n_gpu, log_denom2_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
# print 'g_psi2computations '+str(t)
if self.GPU_direct:
return psi0, psi1_gpu, psi2_gpu
else:
return psi0, psi1_gpu.get(), psi2_gpu.get()
@Cache_this(limit=1, ignore_args=(0,1,2,3))
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
ARD = (len(lengthscale)!=1)
N,M,Q = self.get_dimensions(Z, variational_posterior)
psi1_gpu = self.gpuCache['psi1_gpu']
psi2n_gpu = self.gpuCache['psi2n_gpu']
l_gpu = self.gpuCache['l_gpu']
Z_gpu = self.gpuCache['Z_gpu']
mu_gpu = self.gpuCache['mu_gpu']
S_gpu = self.gpuCache['S_gpu']
dvar_gpu = self.gpuCache['dvar_gpu']
dl_gpu = self.gpuCache['dl_gpu']
dZ_gpu = self.gpuCache['dZ_gpu']
dmu_gpu = self.gpuCache['dmu_gpu']
dS_gpu = self.gpuCache['dS_gpu']
grad_l_gpu = self.gpuCache['grad_l_gpu']
grad_mu_gpu = self.gpuCache['grad_mu_gpu']
grad_S_gpu = self.gpuCache['grad_S_gpu']
if self.GPU_direct:
dL_dpsi1_gpu = dL_dpsi1
dL_dpsi2_gpu = dL_dpsi2
dL_dpsi0_sum = dL_dpsi0.get().sum() #gpuarray.sum(dL_dpsi0).get()
else:
dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu']
dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu']
dL_dpsi1_gpu.set(np.asfortranarray(dL_dpsi1))
dL_dpsi2_gpu.set(np.asfortranarray(dL_dpsi2))
dL_dpsi0_sum = dL_dpsi0.sum()
self.reset_derivative()
# t=self.g_psi1compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi1_gpu,psi1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
# print 'g_psi1compDer '+str(t)
# t=self.g_psi2compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi2_gpu,psi2n_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
# print 'g_psi2compDer '+str(t)
self.g_psi1compDer.prepared_call((self.blocknum,1),(self.threadnum,1,1),dvar_gpu.gpudata,dl_gpu.gpudata,dZ_gpu.gpudata,dmu_gpu.gpudata,dS_gpu.gpudata,dL_dpsi1_gpu.gpudata,psi1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
self.g_psi2compDer.prepared_call((self.blocknum,1),(self.threadnum,1,1),dvar_gpu.gpudata,dl_gpu.gpudata,dZ_gpu.gpudata,dmu_gpu.gpudata,dS_gpu.gpudata,dL_dpsi2_gpu.gpudata,psi2n_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
dL_dvar = dL_dpsi0_sum + dvar_gpu.get().sum()#gpuarray.sum(dvar_gpu).get()
sum_axis(grad_mu_gpu,dmu_gpu,N*Q,self.blocknum)
dL_dmu = grad_mu_gpu.get()
sum_axis(grad_S_gpu,dS_gpu,N*Q,self.blocknum)
dL_dS = grad_S_gpu.get()
dL_dZ = dZ_gpu.get()
if ARD:
sum_axis(grad_l_gpu,dl_gpu,Q,self.blocknum)
dL_dlengscale = grad_l_gpu.get()
else:
dL_dlengscale = dl_gpu.get().sum() #gpuarray.sum(dl_gpu).get()
return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS
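The psi1computations CUDA kernel above accumulates, per (n, m) entry, the same log-space terms as the CPU path before exponentiating and scaling by var. A hedged NumPy reference of that computation (assumed shapes: Z is MxQ, mu and S are NxQ; not part of the GPU module):

import numpy as np

def psi1_reference(var, l, Z, mu, S):
    l2 = l**2                                            # (Q,)
    log_denom1 = np.log(S/l2 + 1.)                       # (N, Q)
    d2 = np.square(mu[:, None, :] - Z[None, :, :])       # (N, M, Q)
    log_psi1 = -0.5*(d2/(S[:, None, :] + l2) + log_denom1[:, None, :]).sum(-1)
    return var*np.exp(log_psi1)                          # (N, M)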

View file

@ -0,0 +1,92 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
The package for the Psi statistics computation of the linear kernel for SSGPLVM
"""
from ....util.linalg import tdot
import numpy as np
def psicomputations(variance, Z, variational_posterior):
"""
Compute psi-statistics for ss-linear kernel
"""
# here are the "statistics" for psi0, psi1 and psi2
# Produced intermediate results:
# psi0 N
# psi1 NxM
# psi2 MxM
mu = variational_posterior.mean
S = variational_posterior.variance
gamma = variational_posterior.binary_prob
psi0 = (gamma*(np.square(mu)+S)*variance).sum(axis=-1)
psi1 = np.inner(variance*gamma*mu,Z)
psi2 = np.inner(np.square(variance)*(gamma*((1-gamma)*np.square(mu)+S)).sum(axis=0)*Z,Z)+tdot(psi1.T)
return psi0, psi1, psi2
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior):
mu = variational_posterior.mean
S = variational_posterior.variance
gamma = variational_posterior.binary_prob
dL_dvar, dL_dgamma, dL_dmu, dL_dS, dL_dZ = _psi2computations(dL_dpsi2, variance, Z, mu, S, gamma)
# Compute for psi0 and psi1
mu2S = np.square(mu)+S
dL_dvar += np.einsum('n,nq,nq->q',dL_dpsi0,gamma,mu2S) + np.einsum('nm,nq,mq,nq->q',dL_dpsi1,gamma,Z,mu)
dL_dgamma += np.einsum('n,q,nq->nq',dL_dpsi0,variance,mu2S) + np.einsum('nm,q,mq,nq->nq',dL_dpsi1,variance,Z,mu)
dL_dmu += np.einsum('n,nq,q,nq->nq',dL_dpsi0,gamma,2.*variance,mu) + np.einsum('nm,nq,q,mq->nq',dL_dpsi1,gamma,variance,Z)
dL_dS += np.einsum('n,nq,q->nq',dL_dpsi0,gamma,variance)
dL_dZ += np.einsum('nm,nq,q,nq->mq',dL_dpsi1,gamma, variance,mu)
return dL_dvar, dL_dZ, dL_dmu, dL_dS, dL_dgamma
def _psi2computations(dL_dpsi2, variance, Z, mu, S, gamma):
"""
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
"""
# here are the "statistics" for psi1 and psi2
# Produced intermediate results:
# _psi2_dvariance Q
# _psi2_dZ MxQ
# _psi2_dgamma NxQ
# _psi2_dmu NxQ
# _psi2_dS NxQ
mu2 = np.square(mu)
gamma2 = np.square(gamma)
variance2 = np.square(variance)
mu2S = mu2+S # NxQ
gvm = np.einsum('nq,nq,q->nq',gamma,mu,variance)
common_sum = np.einsum('nq,mq->nm',gvm,Z)
# common_sum = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu) # NxM
Z_expect = np.einsum('mo,mq,oq->q',dL_dpsi2,Z,Z)
dL_dpsi2T = dL_dpsi2+dL_dpsi2.T
tmp = np.einsum('mo,oq->mq',dL_dpsi2T,Z)
common_expect = np.einsum('mq,nm->nq',tmp,common_sum)
# common_expect = np.einsum('mo,mq,no->nq',dL_dpsi2+dL_dpsi2.T,Z,common_sum)
Z2_expect = np.einsum('om,nm->no',dL_dpsi2T,common_sum)
Z1_expect = np.einsum('om,mq->oq',dL_dpsi2T,Z)
dL_dvar = np.einsum('nq,q,q->q',2.*(gamma*mu2S-gamma2*mu2),variance,Z_expect)+\
np.einsum('nq,nq,nq->q',common_expect,gamma,mu)
dL_dgamma = np.einsum('q,q,nq->nq',Z_expect,variance2,(mu2S-2.*gamma*mu2))+\
np.einsum('nq,q,nq->nq',common_expect,variance,mu)
dL_dmu = np.einsum('q,q,nq,nq->nq',Z_expect,variance2,mu,2.*(gamma-gamma2))+\
np.einsum('nq,nq,q->nq',common_expect,gamma,variance)
dL_dS = np.einsum('q,nq,q->nq',Z_expect,gamma,variance2)
# dL_dZ = 2.*(np.einsum('om,nq,q,mq,nq->oq',dL_dpsi2,gamma,variance2,Z,(mu2S-gamma*mu2))+np.einsum('om,nq,q,nq,nm->oq',dL_dpsi2,gamma,variance,mu,common_sum))
dL_dZ = Z1_expect*np.einsum('nq,q,nq->q',gamma,variance2,(mu2S-gamma*mu2))+np.einsum('nq,q,nq,nm->mq',gamma,variance,mu,Z2_expect)
return dL_dvar, dL_dgamma, dL_dmu, dL_dS, dL_dZ
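Under the spike-and-slab posterior each latent coordinate is x_q = b_q * w_q with b_q ~ Bernoulli(gamma_q) and w_q ~ N(mu_q, S_q), so E[x_q] = gamma*mu and E[x_q^2] = gamma*(mu^2 + S); the psi0, psi1 and psi2 expressions above follow from these moments. A hedged Monte-Carlo check of psi0 with made-up inputs (standalone NumPy, not GPy code):

import numpy as np

rng = np.random.RandomState(2)
Q = 3
variance = rng.rand(Q) + 0.5
mu, S, gamma = rng.randn(1, Q), rng.rand(1, Q), rng.rand(1, Q)

b = rng.rand(100000, Q) < gamma                       # Bernoulli spike
x = b * (mu + np.sqrt(S) * rng.randn(100000, Q))      # spike-and-slab samples
psi0_mc = np.mean(np.sum(variance * x * x, axis=1))
# psi0_mc ~ (gamma*(mu**2 + S)*variance).sum(), matching psicomputations above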

View file

@ -0,0 +1,394 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
The package for the psi statistics computation
"""
import numpy as np
try:
from scipy import weave
def _psicomputations(variance, lengthscale, Z, variational_posterior):
"""
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
"""
# here are the "statistics" for psi0, psi1 and psi2
# Produced intermediate results:
# _psi1 NxM
mu = variational_posterior.mean
S = variational_posterior.variance
N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
l2 = np.square(lengthscale)
log_denom1 = np.log(S/l2+1)
log_denom2 = np.log(2*S/l2+1)
log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
variance = float(variance)
psi0 = np.empty(N)
psi0[:] = variance
psi1 = np.empty((N,M))
psi2n = np.empty((N,M,M))
from ....util.misc import param_to_array
S = param_to_array(S)
mu = param_to_array(mu)
Z = param_to_array(Z)
support_code = """
#include <math.h>
"""
code = """
for(int n=0; n<N; n++) {
for(int m1=0;m1<M;m1++) {
double log_psi1=0;
for(int m2=0;m2<=m1;m2++) {
double log_psi2_n=0;
for(int q=0;q<Q;q++) {
double Snq = S(n,q);
double lq = l2(q);
double Zm1q = Z(m1,q);
double Zm2q = Z(m2,q);
if(m2==0) {
// Compute Psi_1
double muZ = mu(n,q)-Z(m1,q);
double psi1_exp1 = log_gamma(n,q) - (muZ*muZ/(Snq+lq) +log_denom1(n,q))/2.;
double psi1_exp2 = log_gamma1(n,q) -Zm1q*Zm1q/(2.*lq);
log_psi1 += (psi1_exp1>psi1_exp2)?psi1_exp1+log1p(exp(psi1_exp2-psi1_exp1)):psi1_exp2+log1p(exp(psi1_exp1-psi1_exp2));
}
// Compute Psi_2
double muZhat = mu(n,q) - (Zm1q+Zm2q)/2.;
double Z2 = Zm1q*Zm1q+ Zm2q*Zm2q;
double dZ = Zm1q - Zm2q;
double psi2_exp1 = dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2(n,q)/2. + log_gamma(n,q);
double psi2_exp2 = log_gamma1(n,q) - Z2/(2.*lq);
log_psi2_n += (psi2_exp1>psi2_exp2)?psi2_exp1+log1p(exp(psi2_exp2-psi2_exp1)):psi2_exp2+log1p(exp(psi2_exp1-psi2_exp2));
}
double exp_psi2_n = exp(log_psi2_n);
psi2n(n,m1,m2) = variance*variance*exp_psi2_n;
if(m1!=m2) { psi2n(n,m2,m1) = variance*variance*exp_psi2_n;}
}
psi1(n,m1) = variance*exp(log_psi1);
}
}
"""
weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz)
psi2 = psi2n.sum(axis=0)
return psi0,psi1,psi2,psi2n
from GPy.util.caching import Cacher
psicomputations = Cacher(_psicomputations, limit=1)
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
ARD = (len(lengthscale)!=1)
_,psi1,_,psi2n = psicomputations(variance, lengthscale, Z, variational_posterior)
mu = variational_posterior.mean
S = variational_posterior.variance
N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
l2 = np.square(lengthscale)
log_denom1 = np.log(S/l2+1)
log_denom2 = np.log(2*S/l2+1)
log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
gamma, gamma1 = variational_posterior.gamma_probabilities()
variance = float(variance)
dvar = np.zeros(1)
dmu = np.zeros((N,Q))
dS = np.zeros((N,Q))
dgamma = np.zeros((N,Q))
dl = np.zeros(Q)
dZ = np.zeros((M,Q))
dvar += np.sum(dL_dpsi0)
from ....util.misc import param_to_array
S = param_to_array(S)
mu = param_to_array(mu)
Z = param_to_array(Z)
support_code = """
#include <math.h>
"""
code = """
for(int n=0; n<N; n++) {
for(int m1=0;m1<M;m1++) {
double log_psi1=0;
for(int m2=0;m2<M;m2++) {
double log_psi2_n=0;
for(int q=0;q<Q;q++) {
double Snq = S(n,q);
double lq = l2(q);
double Zm1q = Z(m1,q);
double Zm2q = Z(m2,q);
double gnq = gamma(n,q);
double g1nq = gamma1(n,q);
double mu_nq = mu(n,q);
if(m2==0) {
// Compute Psi_1
double lpsi1 = psi1(n,m1)*dL_dpsi1(n,m1);
if(q==0) {dvar(0) += lpsi1/variance;}
double Zmu = Zm1q - mu_nq;
double denom = Snq+lq;
double Zmu2_denom = Zmu*Zmu/denom;
double exp1 = log_gamma(n,q)-(Zmu*Zmu/(Snq+lq)+log_denom1(n,q))/(2.);
double exp2 = log_gamma1(n,q)-Zm1q*Zm1q/(2.*lq);
double d_exp1,d_exp2;
if(exp1>exp2) {
d_exp1 = 1.;
d_exp2 = exp(exp2-exp1);
} else {
d_exp1 = exp(exp1-exp2);
d_exp2 = 1.;
}
double exp_sum = d_exp1+d_exp2;
dmu(n,q) += lpsi1*Zmu*d_exp1/(denom*exp_sum);
dS(n,q) += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum)/2.;
dgamma(n,q) += lpsi1*(d_exp1*g1nq-d_exp2*gnq)/exp_sum;
dl(q) += lpsi1*((Zmu2_denom+Snq/lq)/denom*d_exp1+Zm1q*Zm1q/(lq*lq)*d_exp2)/(2.*exp_sum);
dZ(m1,q) += lpsi1*(-Zmu/denom*d_exp1-Zm1q/lq*d_exp2)/exp_sum;
}
// Compute Psi_2
double lpsi2 = psi2n(n,m1,m2)*dL_dpsi2(m1,m2);
if(q==0) {dvar(0) += lpsi2*2/variance;}
double dZm1m2 = Zm1q - Zm2q;
double Z2 = Zm1q*Zm1q+Zm2q*Zm2q;
double muZhat = mu_nq - (Zm1q + Zm2q)/2.;
double denom = 2.*Snq+lq;
double muZhat2_denom = muZhat*muZhat/denom;
double exp1 = dZm1m2*dZm1m2/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2(n,q)/2. + log_gamma(n,q);
double exp2 = log_gamma1(n,q) - Z2/(2.*lq);
double d_exp1,d_exp2;
if(exp1>exp2) {
d_exp1 = 1.;
d_exp2 = exp(exp2-exp1);
} else {
d_exp1 = exp(exp1-exp2);
d_exp2 = 1.;
}
double exp_sum = d_exp1+d_exp2;
dmu(n,q) += -2.*lpsi2*muZhat/denom*d_exp1/exp_sum;
dS(n,q) += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum;
dgamma(n,q) += lpsi2*(d_exp1*g1nq-d_exp2*gnq)/exp_sum;
dl(q) += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZm1m2*dZm1m2/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum;
dZ(m1,q) += 2.*lpsi2*((muZhat/denom-dZm1m2/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum;
}
}
}
}
"""
weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','gamma1','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz)
dl *= 2.*lengthscale
if not ARD:
dl = dl.sum()
return dvar, dl, dZ, dmu, dS, dgamma
except:
def psicomputations(variance, lengthscale, Z, variational_posterior):
"""
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
"""
# here are the "statistics" for psi0, psi1 and psi2
# Produced intermediate results:
# _psi1 NxM
mu = variational_posterior.mean
S = variational_posterior.variance
gamma = variational_posterior.binary_prob
psi0 = np.empty(mu.shape[0])
psi0[:] = variance
psi1 = _psi1computations(variance, lengthscale, Z, mu, S, gamma)
psi2 = _psi2computations(variance, lengthscale, Z, mu, S, gamma)
return psi0, psi1, psi2
def _psi1computations(variance, lengthscale, Z, mu, S, gamma):
"""
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
"""
# here are the "statistics" for psi1
# Produced intermediate results:
# _psi1 NxM
lengthscale2 = np.square(lengthscale)
# psi1
_psi1_denom = S[:, None, :] / lengthscale2 + 1. # Nx1xQ
_psi1_denom_sqrt = np.sqrt(_psi1_denom) #Nx1xQ
_psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ
_psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom) # NxMxQ
_psi1_common = gamma[:,None,:] / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #Nx1xQ
_psi1_exponent1 = np.log(gamma[:,None,:]) - (_psi1_dist_sq + np.log(_psi1_denom))/2. # NxMxQ
_psi1_exponent2 = np.log(1.-gamma[:,None,:]) - (np.square(Z[None,:,:])/lengthscale2)/2. # NxMxQ
_psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2)
_psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ
_psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM
_psi1 = variance * np.exp(_psi1_exp_sum) # NxM
return _psi1
def _psi2computations(variance, lengthscale, Z, mu, S, gamma):
"""
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
"""
# here are the "statistics" for psi2
# Produced intermediate results:
# _psi2 MxM
lengthscale2 = np.square(lengthscale)
_psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
_psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
_psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q
_psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ
# psi2
_psi2_denom = 2.*S[:, None, None, :] / lengthscale2 + 1. # Nx1x1xQ
_psi2_denom_sqrt = np.sqrt(_psi2_denom)
_psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q
_psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom)
_psi2_common = gamma[:,None,None,:]/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # Nx1x1xQ
_psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom)+np.log(gamma[:,None,None,:]) #N,M,M,Q
_psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ
_psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2)
_psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max))
_psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM
_psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM
return _psi2
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
ARD = (len(lengthscale)!=1)
dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1, dgamma_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2, dgamma_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
dL_dlengscale = dl_psi1 + dl_psi2
if not ARD:
dL_dlengscale = dL_dlengscale.sum()
dL_dgamma = dgamma_psi1 + dgamma_psi2
dL_dmu = dmu_psi1 + dmu_psi2
dL_dS = dS_psi1 + dS_psi2
dL_dZ = dZ_psi1 + dZ_psi2
return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma
def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, gamma):
"""
dL_dpsi1 - NxM
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
"""
# here are the "statistics" for psi1
# Produced intermediate results: dL_dparams w.r.t. psi1
# _dL_dvariance 1
# _dL_dlengthscale Q
# _dL_dZ MxQ
# _dL_dgamma NxQ
# _dL_dmu NxQ
# _dL_dS NxQ
lengthscale2 = np.square(lengthscale)
# psi1
_psi1_denom = S / lengthscale2 + 1. # NxQ
_psi1_denom_sqrt = np.sqrt(_psi1_denom) #NxQ
_psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ
_psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom[:,None,:]) # NxMxQ
_psi1_common = gamma / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #NxQ
_psi1_exponent1 = np.log(gamma[:,None,:]) -0.5 * (_psi1_dist_sq + np.log(_psi1_denom[:, None,:])) # NxMxQ
_psi1_exponent2 = np.log(1.-gamma[:,None,:]) -0.5 * (np.square(Z[None,:,:])/lengthscale2) # NxMxQ
_psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2)
_psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ
_psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM
_psi1_exp_dist_sq = np.exp(-0.5*_psi1_dist_sq) # NxMxQ
_psi1_exp_Z = np.exp(-0.5*np.square(Z[None,:,:])/lengthscale2) # 1xMxQ
_psi1_q = variance * np.exp(_psi1_exp_sum[:,:,None] - _psi1_exponent) # NxMxQ
_psi1 = variance * np.exp(_psi1_exp_sum) # NxM
_dL_dvariance = np.einsum('nm,nm->',dL_dpsi1, _psi1)/variance # 1
_dL_dgamma = np.einsum('nm,nmq,nmq->nq',dL_dpsi1, _psi1_q, (_psi1_exp_dist_sq/_psi1_denom_sqrt[:,None,:]-_psi1_exp_Z)) # NxQ
_dL_dmu = np.einsum('nm, nmq, nmq, nmq, nq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_dist,_psi1_common) # NxQ
_dL_dS = np.einsum('nm,nmq,nmq,nq,nmq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_common,(_psi1_dist_sq-1.))/2. # NxQ
_dL_dZ = np.einsum('nm,nmq,nmq->mq',dL_dpsi1,_psi1_q, (- _psi1_common[:,None,:] * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z))
_dL_dlengthscale = lengthscale* np.einsum('nm,nmq,nmq->q',dL_dpsi1,_psi1_q,(_psi1_common[:,None,:]*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + (1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z))
return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma
def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, gamma):
"""
Z - MxQ
mu - NxQ
S - NxQ
gamma - NxQ
dL_dpsi2 - MxM
"""
# here are the "statistics" for psi2
# Produced the derivatives w.r.t. psi2:
# _dL_dvariance 1
# _dL_dlengthscale Q
# _dL_dZ MxQ
# _dL_dgamma NxQ
# _dL_dmu NxQ
# _dL_dS NxQ
lengthscale2 = np.square(lengthscale)
_psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
_psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
_psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q
_psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ
# psi2
_psi2_denom = 2.*S / lengthscale2 + 1. # NxQ
_psi2_denom_sqrt = np.sqrt(_psi2_denom)
_psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q
_psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom[:,None,None,:])
_psi2_common = gamma/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # NxQ
_psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom[:,None,None,:])+np.log(gamma[:,None,None,:]) #N,M,M,Q
_psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ
_psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2)
_psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max))
_psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM
_psi2_q = variance*variance * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ
_psi2_exp_dist_sq = np.exp(-_psi2_Zdist_sq -_psi2_mudist_sq) # NxMxMxQ
_psi2_exp_Z = np.exp(-0.5*_psi2_Z_sq_sum) # MxMxQ
_psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM
_dL_dvariance = np.einsum('mo,mo->',dL_dpsi2,_psi2)*2./variance
_dL_dgamma = np.einsum('mo,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,(_psi2_exp_dist_sq/_psi2_denom_sqrt[:,None,None,:] - _psi2_exp_Z))
_dL_dmu = -2.*np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,_psi2_common,_psi2_mudist,_psi2_exp_dist_sq)
_dL_dS = np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q, _psi2_common, (2.*_psi2_mudist_sq-1.), _psi2_exp_dist_sq)
_dL_dZ = 2.*np.einsum('mo,nmoq,nmoq->mq',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(-_psi2_Zdist*_psi2_denom[:,None,None,:]+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z))
_dL_dlengthscale = 2.*lengthscale* np.einsum('mo,nmoq,nmoq->q',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z))
return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma
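Both the weave path and the NumPy fallback above combine the slab term (exp1) and the spike term (exp2) in log space with the max trick, i.e. a numerically stable log-sum-exp, before summing over q and exponentiating. A minimal standalone sketch of that step (the names exp1 and exp2 follow the code above):

import numpy as np

def logaddexp_pair(exp1, exp2):
    m = np.maximum(exp1, exp2)
    return m + np.log(np.exp(exp1 - m) + np.exp(exp2 - m))   # == np.logaddexp

# e.g. psi1 = variance * np.exp(logaddexp_pair(exp1, exp2).sum(axis=-1))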

View file

@ -0,0 +1,474 @@
"""
The module for psi-statistics for RBF kernel for Spike-and-Slab GPLVM
"""
import numpy as np
from ....util.caching import Cache_this
from . import PSICOMP_RBF
from ....util import gpu_init
try:
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
from ....util.linalg_gpu import sum_axis
except:
pass
gpu_code = """
// define THREADNUM
#define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
#define IDX_NMM(n,m1,m2) ((m2*M+m1)*N+n)
#define IDX_NQ(n,q) (q*N+n)
#define IDX_NM(n,m) (m*N+n)
#define IDX_MQ(m,q) (q*M+m)
#define IDX_MM(m1,m2) (m2*M+m1)
#define IDX_NQB(n,q,b) ((b*Q+q)*N+n)
#define IDX_QB(q,b) (b*Q+q)
// Divide data evenly
__device__ void divide_data(int total_data, int psize, int pidx, int *start, int *end) {
int residue = (total_data)%psize;
if(pidx<residue) {
int size = total_data/psize+1;
*start = size*pidx;
*end = *start+size;
} else {
int size = total_data/psize;
*start = size*pidx+residue;
*end = *start+size;
}
}
__device__ void reduce_sum(double* array, int array_size) {
int s;
if(array_size >= blockDim.x) {
for(int i=blockDim.x+threadIdx.x; i<array_size; i+= blockDim.x) {
array[threadIdx.x] += array[i];
}
array_size = blockDim.x;
}
__syncthreads();
for(int i=1; i<=array_size;i*=2) {s=i;}
if(threadIdx.x < array_size-s) {array[threadIdx.x] += array[s+threadIdx.x];}
__syncthreads();
for(s=s/2;s>=1;s=s/2) {
if(threadIdx.x < s) {array[threadIdx.x] += array[s+threadIdx.x];}
__syncthreads();
}
}
__global__ void compDenom(double *log_denom1, double *log_denom2, double *log_gamma, double*log_gamma1, double *gamma, double *l, double *S, int N, int Q)
{
int n_start, n_end;
divide_data(N, gridDim.x, blockIdx.x, &n_start, &n_end);
for(int i=n_start*Q+threadIdx.x; i<n_end*Q; i+=blockDim.x) {
int n=i/Q;
int q=i%Q;
double Snq = S[IDX_NQ(n,q)];
double lq = l[q]*l[q];
double gnq = gamma[IDX_NQ(n,q)];
log_denom1[IDX_NQ(n,q)] = log(Snq/lq+1.);
log_denom2[IDX_NQ(n,q)] = log(2.*Snq/lq+1.);
log_gamma[IDX_NQ(n,q)] = log(gnq);
log_gamma1[IDX_NQ(n,q)] = log(1.-gnq);
}
}
__global__ void psi1computations(double *psi1, double *log_denom1, double *log_gamma, double*log_gamma1, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
{
int m_start, m_end;
divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
for(int m=m_start; m<m_end; m++) {
for(int n=threadIdx.x; n<N; n+= blockDim.x) {
double log_psi1 = 0;
for(int q=0;q<Q;q++) {
double Zmq = Z[IDX_MQ(m,q)];
double muZ = mu[IDX_NQ(n,q)]-Zmq;
double Snq = S[IDX_NQ(n,q)];
double lq = l[q]*l[q];
double exp1 = log_gamma[IDX_NQ(n,q)]-(muZ*muZ/(Snq+lq)+log_denom1[IDX_NQ(n,q)])/(2.);
double exp2 = log_gamma1[IDX_NQ(n,q)]-Zmq*Zmq/(2.*lq);
log_psi1 += (exp1>exp2)?exp1+log1p(exp(exp2-exp1)):exp2+log1p(exp(exp1-exp2));
}
psi1[IDX_NM(n,m)] = var*exp(log_psi1);
}
}
}
__global__ void psi2computations(double *psi2, double *psi2n, double *log_denom2, double *log_gamma, double*log_gamma1, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
{
int psi2_idx_start, psi2_idx_end;
__shared__ double psi2_local[THREADNUM];
divide_data((M+1)*M/2, gridDim.x, blockIdx.x, &psi2_idx_start, &psi2_idx_end);
for(int psi2_idx=psi2_idx_start; psi2_idx<psi2_idx_end; psi2_idx++) {
int m1 = int((sqrt(8.*psi2_idx+1.)-1.)/2.);
int m2 = psi2_idx - (m1+1)*m1/2;
psi2_local[threadIdx.x] = 0;
for(int n=threadIdx.x;n<N;n+=blockDim.x) {
double log_psi2_n = 0;
for(int q=0;q<Q;q++) {
double Zm1q = Z[IDX_MQ(m1,q)];
double Zm2q = Z[IDX_MQ(m2,q)];
double dZ = Zm1q - Zm2q;
double muZhat = mu[IDX_NQ(n,q)]- (Zm1q+Zm2q)/2.;
double Z2 = Zm1q*Zm1q+Zm2q*Zm2q;
double Snq = S[IDX_NQ(n,q)];
double lq = l[q]*l[q];
double exp1 = dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2[IDX_NQ(n,q)]/2. + log_gamma[IDX_NQ(n,q)];
double exp2 = log_gamma1[IDX_NQ(n,q)] - Z2/(2.*lq);
log_psi2_n += (exp1>exp2)?exp1+log1p(exp(exp2-exp1)):exp2+log1p(exp(exp1-exp2));
}
double exp_psi2_n = exp(log_psi2_n);
psi2n[IDX_NMM(n,m1,m2)] = var*var*exp_psi2_n;
if(m1!=m2) { psi2n[IDX_NMM(n,m2,m1)] = var*var*exp_psi2_n;}
psi2_local[threadIdx.x] += exp_psi2_n;
}
__syncthreads();
reduce_sum(psi2_local, THREADNUM);
if(threadIdx.x==0) {
psi2[IDX_MM(m1,m2)] = var*var*psi2_local[0];
if(m1!=m2) { psi2[IDX_MM(m2,m1)] = var*var*psi2_local[0]; }
}
__syncthreads();
}
}
__global__ void psi1compDer(double *dvar, double *dl, double *dZ, double *dmu, double *dS, double *dgamma, double *dL_dpsi1, double *psi1, double *log_denom1, double *log_gamma, double*log_gamma1, double var, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q)
{
int m_start, m_end;
__shared__ double g_local[THREADNUM];
divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
int P = int(ceil(double(N)/THREADNUM));
double dvar_local = 0;
for(int q=0;q<Q;q++) {
double lq_sqrt = l[q];
double lq = lq_sqrt*lq_sqrt;
double dl_local = 0;
for(int p=0;p<P;p++) {
int n = p*THREADNUM + threadIdx.x;
double dmu_local = 0;
double dS_local = 0;
double dgamma_local = 0;
double Snq,mu_nq,gnq,log_gnq,log_gnq1,log_de;
if(n<N) {Snq = S[IDX_NQ(n,q)]; mu_nq=mu[IDX_NQ(n,q)]; gnq = gamma[IDX_NQ(n,q)];
log_gnq = log_gamma[IDX_NQ(n,q)]; log_gnq1 = log_gamma1[IDX_NQ(n,q)];
log_de = log_denom1[IDX_NQ(n,q)];}
for(int m=m_start; m<m_end; m++) {
if(n<N) {
double lpsi1 = psi1[IDX_NM(n,m)]*dL_dpsi1[IDX_NM(n,m)];
if(q==0) {dvar_local += lpsi1;}
double Zmq = Z[IDX_MQ(m,q)];
double Zmu = Zmq - mu_nq;
double denom = Snq+lq;
double Zmu2_denom = Zmu*Zmu/denom;
double exp1 = log_gnq-(Zmu*Zmu/(Snq+lq)+log_de)/(2.);
double exp2 = log_gnq1-Zmq*Zmq/(2.*lq);
double d_exp1,d_exp2;
if(exp1>exp2) {
d_exp1 = 1.;
d_exp2 = exp(exp2-exp1);
} else {
d_exp1 = exp(exp1-exp2);
d_exp2 = 1.;
}
double exp_sum = d_exp1+d_exp2;
dmu_local += lpsi1*Zmu*d_exp1/(denom*exp_sum);
dS_local += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum);
dgamma_local += lpsi1*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
dl_local += lpsi1*((Zmu2_denom+Snq/lq)/denom*d_exp1+Zmq*Zmq/(lq*lq)*d_exp2)/(2.*exp_sum);
g_local[threadIdx.x] = lpsi1*(-Zmu/denom*d_exp1-Zmq/lq*d_exp2)/exp_sum;
}
__syncthreads();
reduce_sum(g_local, p<P-1?THREADNUM:N-(P-1)*THREADNUM);
if(threadIdx.x==0) {dZ[IDX_MQ(m,q)] += g_local[0];}
}
if(n<N) {
dmu[IDX_NQB(n,q,blockIdx.x)] += dmu_local;
dS[IDX_NQB(n,q,blockIdx.x)] += dS_local/2.;
dgamma[IDX_NQB(n,q,blockIdx.x)] += dgamma_local;
}
__threadfence_block();
}
g_local[threadIdx.x] = dl_local*2.*lq_sqrt;
__syncthreads();
reduce_sum(g_local, THREADNUM);
if(threadIdx.x==0) {dl[IDX_QB(q,blockIdx.x)] += g_local[0];}
}
g_local[threadIdx.x] = dvar_local;
__syncthreads();
reduce_sum(g_local, THREADNUM);
if(threadIdx.x==0) {dvar[blockIdx.x] += g_local[0]/var;}
}
__global__ void psi2compDer(double *dvar, double *dl, double *dZ, double *dmu, double *dS, double *dgamma, double *dL_dpsi2, double *psi2n, double *log_denom2, double *log_gamma, double*log_gamma1, double var, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q)
{
int m_start, m_end;
__shared__ double g_local[THREADNUM];
divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
int P = int(ceil(double(N)/THREADNUM));
double dvar_local = 0;
for(int q=0;q<Q;q++) {
double lq_sqrt = l[q];
double lq = lq_sqrt*lq_sqrt;
double dl_local = 0;
for(int p=0;p<P;p++) {
int n = p*THREADNUM + threadIdx.x;
double dmu_local = 0;
double dS_local = 0;
double dgamma_local = 0;
double Snq,mu_nq,gnq,log_gnq,log_gnq1,log_de;
if(n<N) {Snq = S[IDX_NQ(n,q)]; mu_nq=mu[IDX_NQ(n,q)]; gnq = gamma[IDX_NQ(n,q)];
log_gnq = log_gamma[IDX_NQ(n,q)]; log_gnq1 = log_gamma1[IDX_NQ(n,q)];
log_de = log_denom2[IDX_NQ(n,q)];}
for(int m1=m_start; m1<m_end; m1++) {
g_local[threadIdx.x] = 0;
for(int m2=0;m2<M;m2++) {
if(n<N) {
double lpsi2 = psi2n[IDX_NMM(n,m1,m2)]*dL_dpsi2[IDX_MM(m1,m2)];
if(q==0) {dvar_local += lpsi2;}
double Zm1q = Z[IDX_MQ(m1,q)];
double Zm2q = Z[IDX_MQ(m2,q)];
double dZ = Zm1q - Zm2q;
double Z2 = Zm1q*Zm1q+Zm2q*Zm2q;
double muZhat = mu_nq - (Zm1q + Zm2q)/2.;
double denom = 2.*Snq+lq;
double muZhat2_denom = muZhat*muZhat/denom;
double exp1 = dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_de/2. + log_gnq;
double exp2 = log_gnq1 - Z2/(2.*lq);
double d_exp1,d_exp2;
if(exp1>exp2) {
d_exp1 = 1.;
d_exp2 = exp(exp2-exp1);
} else {
d_exp1 = exp(exp1-exp2);
d_exp2 = 1.;
}
double exp_sum = d_exp1+d_exp2;
dmu_local += lpsi2*muZhat/denom*d_exp1/exp_sum;
dS_local += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum;
dgamma_local += lpsi2*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
dl_local += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZ*dZ/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum;
g_local[threadIdx.x] += 2.*lpsi2*((muZhat/denom-dZ/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum;
}
}
__syncthreads();
reduce_sum(g_local, p<P-1?THREADNUM:N-(P-1)*THREADNUM);
if(threadIdx.x==0) {dZ[IDX_MQ(m1,q)] += g_local[0];}
}
if(n<N) {
dmu[IDX_NQB(n,q,blockIdx.x)] += -2.*dmu_local;
dS[IDX_NQB(n,q,blockIdx.x)] += dS_local;
dgamma[IDX_NQB(n,q,blockIdx.x)] += dgamma_local;
}
__threadfence_block();
}
g_local[threadIdx.x] = dl_local*2.*lq_sqrt;
__syncthreads();
reduce_sum(g_local, THREADNUM);
if(threadIdx.x==0) {dl[IDX_QB(q,blockIdx.x)] += g_local[0];}
}
g_local[threadIdx.x] = dvar_local;
__syncthreads();
reduce_sum(g_local, THREADNUM);
if(threadIdx.x==0) {dvar[blockIdx.x] += g_local[0]*2/var;}
}
"""
class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
def __init__(self, threadnum=128, blocknum=15, GPU_direct=False):
self.GPU_direct = GPU_direct
self.gpuCache = None
self.threadnum = threadnum
self.blocknum = blocknum
module = SourceModule("#define THREADNUM "+str(self.threadnum)+"\n"+gpu_code)
self.g_psi1computations = module.get_function('psi1computations')
self.g_psi1computations.prepare('PPPPdPPPPiii')
self.g_psi2computations = module.get_function('psi2computations')
self.g_psi2computations.prepare('PPPPPdPPPPiii')
self.g_psi1compDer = module.get_function('psi1compDer')
self.g_psi1compDer.prepare('PPPPPPPPPPPdPPPPPiii')
self.g_psi2compDer = module.get_function('psi2compDer')
self.g_psi2compDer.prepare('PPPPPPPPPPPdPPPPPiii')
self.g_compDenom = module.get_function('compDenom')
self.g_compDenom.prepare('PPPPPPPii')
def __deepcopy__(self, memo):
s = PSICOMP_SSRBF_GPU(threadnum=self.threadnum, blocknum=self.blocknum, GPU_direct=self.GPU_direct)
memo[id(self)] = s
return s
def _initGPUCache(self, N, M, Q):
if self.gpuCache == None:
self.gpuCache = {
'l_gpu' :gpuarray.empty((Q,),np.float64,order='F'),
'Z_gpu' :gpuarray.empty((M,Q),np.float64,order='F'),
'mu_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
'S_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
'gamma_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
'psi1_gpu' :gpuarray.empty((N,M),np.float64,order='F'),
'psi2_gpu' :gpuarray.empty((M,M),np.float64,order='F'),
'psi2n_gpu' :gpuarray.empty((N,M,M),np.float64,order='F'),
'dL_dpsi1_gpu' :gpuarray.empty((N,M),np.float64,order='F'),
'dL_dpsi2_gpu' :gpuarray.empty((M,M),np.float64,order='F'),
'log_denom1_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
'log_denom2_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
'log_gamma_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
'log_gamma1_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
# derivatives
'dvar_gpu' :gpuarray.empty((self.blocknum,),np.float64, order='F'),
'dl_gpu' :gpuarray.empty((Q,self.blocknum),np.float64, order='F'),
'dZ_gpu' :gpuarray.empty((M,Q),np.float64, order='F'),
'dmu_gpu' :gpuarray.empty((N,Q,self.blocknum),np.float64, order='F'),
'dS_gpu' :gpuarray.empty((N,Q,self.blocknum),np.float64, order='F'),
'dgamma_gpu' :gpuarray.empty((N,Q,self.blocknum),np.float64, order='F'),
# grad
'grad_l_gpu' :gpuarray.empty((Q,),np.float64, order='F'),
'grad_mu_gpu' :gpuarray.empty((N,Q,),np.float64, order='F'),
'grad_S_gpu' :gpuarray.empty((N,Q,),np.float64, order='F'),
'grad_gamma_gpu' :gpuarray.empty((N,Q,),np.float64, order='F'),
}
else:
assert N==self.gpuCache['mu_gpu'].shape[0]
assert M==self.gpuCache['Z_gpu'].shape[0]
assert Q==self.gpuCache['l_gpu'].shape[0]
def sync_params(self, lengthscale, Z, mu, S, gamma):
if len(lengthscale)==1:
self.gpuCache['l_gpu'].fill(lengthscale)
else:
self.gpuCache['l_gpu'].set(np.asfortranarray(lengthscale))
self.gpuCache['Z_gpu'].set(np.asfortranarray(Z))
self.gpuCache['mu_gpu'].set(np.asfortranarray(mu))
self.gpuCache['S_gpu'].set(np.asfortranarray(S))
self.gpuCache['gamma_gpu'].set(np.asfortranarray(gamma))
N,Q = self.gpuCache['S_gpu'].shape
self.g_compDenom.prepared_call((self.blocknum,1),(self.threadnum,1,1), self.gpuCache['log_denom1_gpu'].gpudata,self.gpuCache['log_denom2_gpu'].gpudata,self.gpuCache['log_gamma_gpu'].gpudata,self.gpuCache['log_gamma1_gpu'].gpudata,self.gpuCache['gamma_gpu'].gpudata,self.gpuCache['l_gpu'].gpudata,self.gpuCache['S_gpu'].gpudata, np.int32(N), np.int32(Q))
def reset_derivative(self):
self.gpuCache['dvar_gpu'].fill(0.)
self.gpuCache['dl_gpu'].fill(0.)
self.gpuCache['dZ_gpu'].fill(0.)
self.gpuCache['dmu_gpu'].fill(0.)
self.gpuCache['dS_gpu'].fill(0.)
self.gpuCache['dgamma_gpu'].fill(0.)
self.gpuCache['grad_l_gpu'].fill(0.)
self.gpuCache['grad_mu_gpu'].fill(0.)
self.gpuCache['grad_S_gpu'].fill(0.)
self.gpuCache['grad_gamma_gpu'].fill(0.)
def get_dimensions(self, Z, variational_posterior):
return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1]
@Cache_this(limit=1, ignore_args=(0,))
def psicomputations(self, variance, lengthscale, Z, variational_posterior):
"""
Z - MxQ
mu - NxQ
S - NxQ
"""
N,M,Q = self.get_dimensions(Z, variational_posterior)
self._initGPUCache(N,M,Q)
self.sync_params(lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
psi1_gpu = self.gpuCache['psi1_gpu']
psi2_gpu = self.gpuCache['psi2_gpu']
psi2n_gpu = self.gpuCache['psi2n_gpu']
l_gpu = self.gpuCache['l_gpu']
Z_gpu = self.gpuCache['Z_gpu']
mu_gpu = self.gpuCache['mu_gpu']
S_gpu = self.gpuCache['S_gpu']
log_denom1_gpu = self.gpuCache['log_denom1_gpu']
log_denom2_gpu = self.gpuCache['log_denom2_gpu']
log_gamma_gpu = self.gpuCache['log_gamma_gpu']
log_gamma1_gpu = self.gpuCache['log_gamma1_gpu']
psi0 = np.empty((N,))
psi0[:] = variance
self.g_psi1computations.prepared_call((self.blocknum,1),(self.threadnum,1,1),psi1_gpu.gpudata, log_denom1_gpu.gpudata, log_gamma_gpu.gpudata, log_gamma1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
self.g_psi2computations.prepared_call((self.blocknum,1),(self.threadnum,1,1),psi2_gpu.gpudata, psi2n_gpu.gpudata, log_denom2_gpu.gpudata, log_gamma_gpu.gpudata, log_gamma1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
if self.GPU_direct:
return psi0, psi1_gpu, psi2_gpu
else:
return psi0, psi1_gpu.get(), psi2_gpu.get()
@Cache_this(limit=1, ignore_args=(0,1,2,3))
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
ARD = (len(lengthscale)!=1)
N,M,Q = self.get_dimensions(Z, variational_posterior)
psi1_gpu = self.gpuCache['psi1_gpu']
psi2n_gpu = self.gpuCache['psi2n_gpu']
l_gpu = self.gpuCache['l_gpu']
Z_gpu = self.gpuCache['Z_gpu']
mu_gpu = self.gpuCache['mu_gpu']
S_gpu = self.gpuCache['S_gpu']
gamma_gpu = self.gpuCache['gamma_gpu']
dvar_gpu = self.gpuCache['dvar_gpu']
dl_gpu = self.gpuCache['dl_gpu']
dZ_gpu = self.gpuCache['dZ_gpu']
dmu_gpu = self.gpuCache['dmu_gpu']
dS_gpu = self.gpuCache['dS_gpu']
dgamma_gpu = self.gpuCache['dgamma_gpu']
grad_l_gpu = self.gpuCache['grad_l_gpu']
grad_mu_gpu = self.gpuCache['grad_mu_gpu']
grad_S_gpu = self.gpuCache['grad_S_gpu']
grad_gamma_gpu = self.gpuCache['grad_gamma_gpu']
log_denom1_gpu = self.gpuCache['log_denom1_gpu']
log_denom2_gpu = self.gpuCache['log_denom2_gpu']
log_gamma_gpu = self.gpuCache['log_gamma_gpu']
log_gamma1_gpu = self.gpuCache['log_gamma1_gpu']
if self.GPU_direct:
dL_dpsi1_gpu = dL_dpsi1
dL_dpsi2_gpu = dL_dpsi2
dL_dpsi0_sum = gpuarray.sum(dL_dpsi0).get()
else:
dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu']
dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu']
dL_dpsi1_gpu.set(np.asfortranarray(dL_dpsi1))
dL_dpsi2_gpu.set(np.asfortranarray(dL_dpsi2))
dL_dpsi0_sum = dL_dpsi0.sum()
self.reset_derivative()
# t=self.g_psi1compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi1_gpu,psi1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
# print 'g_psi1compDer '+str(t)
# t=self.g_psi2compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi2_gpu,psi2n_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
# print 'g_psi2compDer '+str(t)
self.g_psi1compDer.prepared_call((self.blocknum,1),(self.threadnum,1,1),dvar_gpu.gpudata,dl_gpu.gpudata,dZ_gpu.gpudata,dmu_gpu.gpudata,dS_gpu.gpudata,dgamma_gpu.gpudata,dL_dpsi1_gpu.gpudata,psi1_gpu.gpudata, log_denom1_gpu.gpudata, log_gamma_gpu.gpudata, log_gamma1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata,gamma_gpu.gpudata,np.int32(N), np.int32(M), np.int32(Q))
self.g_psi2compDer.prepared_call((self.blocknum,1),(self.threadnum,1,1),dvar_gpu.gpudata,dl_gpu.gpudata,dZ_gpu.gpudata,dmu_gpu.gpudata,dS_gpu.gpudata,dgamma_gpu.gpudata,dL_dpsi2_gpu.gpudata,psi2n_gpu.gpudata, log_denom2_gpu.gpudata, log_gamma_gpu.gpudata, log_gamma1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata,gamma_gpu.gpudata,np.int32(N), np.int32(M), np.int32(Q))
dL_dvar = dL_dpsi0_sum + gpuarray.sum(dvar_gpu).get()
sum_axis(grad_mu_gpu,dmu_gpu,N*Q,self.blocknum)
dL_dmu = grad_mu_gpu.get()
sum_axis(grad_S_gpu,dS_gpu,N*Q,self.blocknum)
dL_dS = grad_S_gpu.get()
sum_axis(grad_gamma_gpu,dgamma_gpu,N*Q,self.blocknum)
dL_dgamma = grad_gamma_gpu.get()
dL_dZ = dZ_gpu.get()
if ARD:
sum_axis(grad_l_gpu,dl_gpu,Q,self.blocknum)
dL_dlengscale = grad_l_gpu.get()
else:
dL_dlengscale = gpuarray.sum(dl_gpu).get()
return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma
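As a rough reference for what psicomputations returns, the sketch below estimates psi1 by Monte Carlo under the spike-and-slab posterior (mean mu, variance S, inclusion probability gamma). It is only an illustrative cross-check and is far slower than the GPU kernels above.
# --- illustrative Monte Carlo cross-check, not part of the diff ---
import numpy as np

def rbf_k(X, Z, variance, lengthscale):
    # Plain RBF kernel between the rows of X and Z.
    r2 = ((X[:, None, :] - Z[None, :, :])**2 / lengthscale**2).sum(-1)
    return variance * np.exp(-0.5 * r2)

def psi1_monte_carlo(variance, lengthscale, Z, mu, S, gamma, n_samples=20000):
    # psi1[n, m] = E_q[k(x_n, z_m)] with x_nq = b_nq * N(mu_nq, S_nq),
    # b_nq ~ Bernoulli(gamma_nq)  (spike-and-slab posterior).
    N, Q = mu.shape
    acc = np.zeros((N, Z.shape[0]))
    for _ in range(n_samples):
        b = np.random.rand(N, Q) < gamma
        x = b * (mu + np.sqrt(S) * np.random.randn(N, Q))
        acc += rbf_k(x, Z, variance, lengthscale)
    return acc / n_samples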

GPy/kern/_src/rbf.py

@ -0,0 +1,71 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from stationary import Stationary
from psi_comp import PSICOMP_RBF
from psi_comp.rbf_psi_gpucomp import PSICOMP_RBF_GPU
from ...util.config import *
class RBF(Stationary):
"""
Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel:
.. math::
k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg)
"""
_support_GPU = True
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='rbf', useGPU=False):
super(RBF, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name, useGPU=useGPU)
if self.useGPU:
self.psicomp = PSICOMP_RBF_GPU()
else:
self.psicomp = PSICOMP_RBF()
def K_of_r(self, r):
return self.variance * np.exp(-0.5 * r**2)
def dK_dr(self, r):
return -r*self.K_of_r(r)
def __getstate__(self):
dc = super(RBF, self).__getstate__()
if self.useGPU:
dc['psicomp'] = PSICOMP_RBF()
return dc
def __setstate__(self, state):
return super(RBF, self).__setstate__(state)
def spectrum(self, omega):
assert self.input_dim == 1 #TODO: higher dim spectra?
return self.variance*np.sqrt(2*np.pi)*self.lengthscale*np.exp(-self.lengthscale**2*omega**2/2)
#---------------------------------------#
# PSI statistics #
#---------------------------------------#
def psi0(self, Z, variational_posterior):
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[0]
def psi1(self, Z, variational_posterior):
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[1]
def psi2(self, Z, variational_posterior):
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[2]
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
dL_dvar, dL_dlengscale = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[:2]
self.variance.gradient = dL_dvar
self.lengthscale.gradient = dL_dlengscale
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[2]
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[3:]
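A minimal usage sketch for the kernel below, assuming GPy is importable and exposes it as GPy.kern.RBF (useGPU=True additionally requires a working pycuda setup):
# --- illustrative usage, not part of the diff ---
import numpy as np
import GPy

X = np.random.randn(100, 2)
k = GPy.kern.RBF(input_dim=2, variance=1.0, lengthscale=0.5, ARD=False)
K = k.K(X)                                   # 100 x 100 covariance matrix
assert np.allclose(np.diag(K), k.Kdiag(X))   # diagonal equals the variance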

GPy/kern/_src/splitKern.py

@ -0,0 +1,204 @@
"""
Kernels for functions sharing a common value at a split point Xp: within each index group the base kernel is used directly, while cross-covariances between the two groups are routed through Xp (see SplitKern_cross).
"""
import numpy as np
from kern import Kern,CombinationKernel
from .independent_outputs import index_to_slices
import itertools
class DiffGenomeKern(Kern):
def __init__(self, kernel, idx_p, Xp, index_dim=-1, name='DiffGenomeKern'):
self.idx_p = idx_p
self.index_dim=index_dim
self.kern = SplitKern(kernel,Xp, index_dim=index_dim)
super(DiffGenomeKern, self).__init__(input_dim=kernel.input_dim+1, active_dims=None, name=name)
self.add_parameter(self.kern)
def K(self, X, X2=None):
assert X2==None
K = self.kern.K(X,X2)
if self.idx_p<=0 or self.idx_p>X.shape[0]/2:
return K
slices = index_to_slices(X[:,self.index_dim])
idx_start = slices[1][0].start
idx_end = idx_start+self.idx_p
K_c = K[idx_start:idx_end,idx_start:idx_end].copy()
K[idx_start:idx_end,:] = K[:self.idx_p,:]
K[:,idx_start:idx_end] = K[:,:self.idx_p]
K[idx_start:idx_end,idx_start:idx_end] = K_c
return K
def Kdiag(self,X):
Kdiag = self.kern.Kdiag(X)
if self.idx_p<=0 or self.idx_p>X.shape[0]/2:
return Kdiag
slices = index_to_slices(X[:,self.index_dim])
idx_start = slices[1][0].start
idx_end = idx_start+self.idx_p
Kdiag[idx_start:idx_end] = Kdiag[:self.idx_p]
return Kdiag
def update_gradients_full(self,dL_dK,X,X2=None):
assert X2==None
if self.idx_p<=0 or self.idx_p>X.shape[0]/2:
self.kern.update_gradients_full(dL_dK, X)
return
slices = index_to_slices(X[:,self.index_dim])
idx_start = slices[1][0].start
idx_end = idx_start+self.idx_p
self.kern.update_gradients_full(dL_dK[idx_start:idx_end,:], X[:self.idx_p],X)
grad_p1 = self.kern.gradient.copy()
self.kern.update_gradients_full(dL_dK[:,idx_start:idx_end], X, X[:self.idx_p])
grad_p2 = self.kern.gradient.copy()
self.kern.update_gradients_full(dL_dK[idx_start:idx_end,idx_start:idx_end], X[:self.idx_p],X[idx_start:idx_end])
grad_p3 = self.kern.gradient.copy()
self.kern.update_gradients_full(dL_dK[idx_start:idx_end,idx_start:idx_end], X[idx_start:idx_end], X[:self.idx_p])
grad_p4 = self.kern.gradient.copy()
self.kern.update_gradients_full(dL_dK[idx_start:idx_end,:], X[idx_start:idx_end],X)
grad_n1 = self.kern.gradient.copy()
self.kern.update_gradients_full(dL_dK[:,idx_start:idx_end], X, X[idx_start:idx_end])
grad_n2 = self.kern.gradient.copy()
self.kern.update_gradients_full(dL_dK[idx_start:idx_end,idx_start:idx_end], X[idx_start:idx_end], X[idx_start:idx_end])
grad_n3 = self.kern.gradient.copy()
self.kern.update_gradients_full(dL_dK, X)
self.kern.gradient += grad_p1+grad_p2-grad_p3-grad_p4-grad_n1-grad_n2+2*grad_n3
def update_gradients_diag(self, dL_dKdiag, X):
pass
class SplitKern(CombinationKernel):
def __init__(self, kernel, Xp, index_dim=-1, name='SplitKern'):
assert isinstance(index_dim, int), "The index dimension must be an integer!"
self.kern = kernel
self.kern_cross = SplitKern_cross(kernel,Xp)
super(SplitKern, self).__init__(kernels=[self.kern, self.kern_cross], extra_dims=[index_dim], name=name)
self.index_dim = index_dim
def K(self,X ,X2=None):
slices = index_to_slices(X[:,self.index_dim])
assert len(slices)<=2, 'The Split kernel only supports two different indices'
if X2 is None:
target = np.zeros((X.shape[0], X.shape[0]))
# diagonal blocks
[[target.__setitem__((s,ss), self.kern.K(X[s,:], X[ss,:])) for s,ss in itertools.product(slices_i, slices_i)] for slices_i in slices]
if len(slices)>1:
# cross blocks
[target.__setitem__((s,ss), self.kern_cross.K(X[s,:], X[ss,:])) for s,ss in itertools.product(slices[0], slices[1])]
# cross blocks
[target.__setitem__((s,ss), self.kern_cross.K(X[s,:], X[ss,:])) for s,ss in itertools.product(slices[1], slices[0])]
else:
slices2 = index_to_slices(X2[:,self.index_dim])
assert len(slices2)<=2, 'The Split kernel only supports two different indices'
target = np.zeros((X.shape[0], X2.shape[0]))
# diagonal blocks
[[target.__setitem__((s,s2), self.kern.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices[i], slices2[i])] for i in xrange(min(len(slices),len(slices2)))]
if len(slices)>1:
[target.__setitem__((s,s2), self.kern_cross.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices[1], slices2[0])]
if len(slices2)>1:
[target.__setitem__((s,s2), self.kern_cross.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices[0], slices2[1])]
return target
def Kdiag(self,X):
return self.kern.Kdiag(X)
def update_gradients_full(self,dL_dK,X,X2=None):
slices = index_to_slices(X[:,self.index_dim])
target = np.zeros(self.kern.size)
def collate_grads(dL, X, X2, cross=False):
if cross:
self.kern_cross.update_gradients_full(dL,X,X2)
target[:] += self.kern_cross.kern.gradient
else:
self.kern.update_gradients_full(dL,X,X2)
target[:] += self.kern.gradient
if X2 is None:
assert dL_dK.shape==(X.shape[0],X.shape[0])
[[collate_grads(dL_dK[s,ss], X[s], X[ss]) for s,ss in itertools.product(slices_i, slices_i)] for slices_i in slices]
if len(slices)>1:
[collate_grads(dL_dK[s,ss], X[s], X[ss], True) for s,ss in itertools.product(slices[0], slices[1])]
[collate_grads(dL_dK[s,ss], X[s], X[ss], True) for s,ss in itertools.product(slices[1], slices[0])]
else:
assert dL_dK.shape==(X.shape[0],X2.shape[0])
slices2 = index_to_slices(X2[:,self.index_dim])
[[collate_grads(dL_dK[s,s2],X[s],X2[s2]) for s,s2 in itertools.product(slices[i], slices2[i])] for i in xrange(min(len(slices),len(slices2)))]
if len(slices)>1:
[collate_grads(dL_dK[s,s2], X[s], X2[s2], True) for s,s2 in itertools.product(slices[1], slices2[0])]
if len(slices2)>1:
[collate_grads(dL_dK[s,s2], X[s], X2[s2], True) for s,s2 in itertools.product(slices[0], slices2[1])]
self.kern.gradient = target
def update_gradients_diag(self, dL_dKdiag, X):
self.kern.update_gradients_diag(dL_dKdiag, X)
class SplitKern_cross(Kern):
def __init__(self, kernel, Xp, name='SplitKern_cross'):
assert isinstance(kernel, Kern)
self.kern = kernel
if not isinstance(Xp,np.ndarray):
Xp = np.array([[Xp]])
self.Xp = Xp
super(SplitKern_cross, self).__init__(input_dim=kernel.input_dim, active_dims=None, name=name)
def K(self, X, X2=None):
if X2 is None:
return np.dot(self.kern.K(X,self.Xp),self.kern.K(self.Xp,X))/self.kern.K(self.Xp,self.Xp)
else:
return np.dot(self.kern.K(X,self.Xp),self.kern.K(self.Xp,X2))/self.kern.K(self.Xp,self.Xp)
def Kdiag(self, X):
return np.inner(self.kern.K(X,self.Xp),self.kern.K(self.Xp,X).T)/self.kern.K(self.Xp,self.Xp)
def update_gradients_full(self, dL_dK, X, X2=None):
if X2 is None:
X2 = X
k1 = self.kern.K(X,self.Xp)
k2 = self.kern.K(self.Xp,X2)
k3 = self.kern.K(self.Xp,self.Xp)
dL_dk1 = np.einsum('ij,j->i',dL_dK,k2[0])/k3[0,0]
dL_dk2 = np.einsum('ij,i->j',dL_dK,k1[:,0])/k3[0,0]
dL_dk3 = np.einsum('ij,ij->',dL_dK,-np.dot(k1,k2)/(k3[0,0]*k3[0,0]))
self.kern.update_gradients_full(dL_dk1[:,None],X,self.Xp)
grad = self.kern.gradient.copy()
self.kern.update_gradients_full(dL_dk2[None,:],self.Xp,X2)
grad += self.kern.gradient.copy()
self.kern.update_gradients_full(np.array([[dL_dk3]]),self.Xp,self.Xp)
grad += self.kern.gradient.copy()
self.kern.gradient = grad
def update_gradients_diag(self, dL_dKdiag, X):
k1 = self.kern.K(X,self.Xp)
k2 = self.kern.K(self.Xp,X)
k3 = self.kern.K(self.Xp,self.Xp)
dL_dk1 = dL_dKdiag*k2[0]/k3
dL_dk2 = dL_dKdiag*k1[:,0]/k3
dL_dk3 = -dL_dKdiag*(k1[:,0]*k2[0]).sum()/(k3*k3)
self.kern.update_gradients_full(dL_dk1[:,None],X,self.Xp)
grad1 = self.kern.gradient.copy()
self.kern.update_gradients_full(dL_dk2[None,:],self.Xp,X)
grad2 = self.kern.gradient.copy()
self.kern.update_gradients_full(np.array([[dL_dk3]]),self.Xp,self.Xp)
grad3 = self.kern.gradient.copy()
self.kern.gradient = grad1+grad2+grad3
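SplitKern_cross above treats the two regimes as conditionally independent given the function value at the split point Xp, i.e. k_cross(x, x') = k(x, Xp) k(Xp, x') / k(Xp, Xp). A small self-contained numpy sketch of that construction (illustrative only):
# --- illustrative sketch of the cross-covariance construction, not part of the diff ---
import numpy as np

def rbf(a, b, variance=1.0, lengthscale=1.0):
    return variance * np.exp(-0.5 * (a[:, None] - b[None, :])**2 / lengthscale**2)

Xp = np.array([5.0])                  # split point
x1 = np.linspace(0., 4., 5)           # inputs from the first regime
x2 = np.linspace(6., 10., 5)          # inputs from the second regime
# Cross-covariance routed through the split point, as in SplitKern_cross.K:
K_cross = np.dot(rbf(x1, Xp), rbf(Xp, x2)) / rbf(Xp, Xp)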

GPy/kern/_src/static.py

@ -0,0 +1,122 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kern import Kern
import numpy as np
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
class Static(Kern):
def __init__(self, input_dim, variance, active_dims, name):
super(Static, self).__init__(input_dim, active_dims, name)
self.variance = Param('variance', variance, Logexp())
self.link_parameters(self.variance)
def Kdiag(self, X):
ret = np.empty((X.shape[0],), dtype=np.float64)
ret[:] = self.variance
return ret
def gradients_X(self, dL_dK, X, X2=None):
return np.zeros(X.shape)
def gradients_X_diag(self, dL_dKdiag, X):
return np.zeros(X.shape)
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
return np.zeros(Z.shape)
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
return np.zeros(variational_posterior.shape), np.zeros(variational_posterior.shape)
def psi0(self, Z, variational_posterior):
return self.Kdiag(variational_posterior.mean)
def psi1(self, Z, variational_posterior):
return self.K(variational_posterior.mean, Z)
def psi2(self, Z, variational_posterior):
K = self.K(variational_posterior.mean, Z)
return np.einsum('ij,ik->jk',K,K) #K[:,:,None]*K[:,None,:] # NB. more efficient implementations on inheriting classes
def input_sensitivity(self, summarize=True):
if summarize:
return super(Static, self).input_sensitivity(summarize=summarize)
else:
return np.ones(self.input_dim) * self.variance
class White(Static):
def __init__(self, input_dim, variance=1., active_dims=None, name='white'):
super(White, self).__init__(input_dim, variance, active_dims, name)
def K(self, X, X2=None):
if X2 is None:
return np.eye(X.shape[0])*self.variance
else:
return np.zeros((X.shape[0], X2.shape[0]))
def psi2(self, Z, variational_posterior):
return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)
def update_gradients_full(self, dL_dK, X, X2=None):
self.variance.gradient = np.trace(dL_dK)
def update_gradients_diag(self, dL_dKdiag, X):
self.variance.gradient = dL_dKdiag.sum()
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
self.variance.gradient = dL_dpsi0.sum()
class Bias(Static):
def __init__(self, input_dim, variance=1., active_dims=None, name='bias'):
super(Bias, self).__init__(input_dim, variance, active_dims, name)
def K(self, X, X2=None):
shape = (X.shape[0], X.shape[0] if X2 is None else X2.shape[0])
ret = np.empty(shape, dtype=np.float64)
ret[:] = self.variance
return ret
def update_gradients_full(self, dL_dK, X, X2=None):
self.variance.gradient = dL_dK.sum()
def update_gradients_diag(self, dL_dKdiag, X):
self.variance.gradient = dL_dKdiag.sum()
def psi2(self, Z, variational_posterior):
ret = np.empty((Z.shape[0], Z.shape[0]), dtype=np.float64)
ret[:] = self.variance*self.variance*variational_posterior.shape[0]
return ret
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
self.variance.gradient = dL_dpsi0.sum() + dL_dpsi1.sum() + 2.*self.variance*dL_dpsi2.sum()*variational_posterior.shape[0]
class Fixed(Static):
def __init__(self, input_dim, covariance_matrix, variance=1., active_dims=None, name='fixed'):
"""
:param input_dim: the number of input dimensions
:type input_dim: int
:param covariance_matrix: the fixed covariance matrix that is scaled by the variance
:type covariance_matrix: np.ndarray
:param variance: the variance of the kernel
:type variance: float
"""
super(Fixed, self).__init__(input_dim, variance, active_dims, name)
self.fixed_K = covariance_matrix
def K(self, X, X2=None):
return self.variance * self.fixed_K
def Kdiag(self, X):
return self.variance * self.fixed_K.diagonal()
def update_gradients_full(self, dL_dK, X, X2=None):
self.variance.gradient = np.einsum('ij,ij', dL_dK, self.fixed_K)
def update_gradients_diag(self, dL_dKdiag, X):
self.variance.gradient = np.einsum('i,i', dL_dKdiag, self.fixed_K)
def psi2(self, Z, variational_posterior):
return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
self.variance.gradient = dL_dpsi0.sum()
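A quick sketch of the static kernels in use, assuming GPy exposes them as GPy.kern.White and GPy.kern.Bias:
# --- illustrative usage, not part of the diff ---
import numpy as np
import GPy

X = np.random.randn(10, 3)
k_white = GPy.kern.White(3, variance=0.1)
k_bias = GPy.kern.Bias(3, variance=2.0)

assert np.allclose(k_white.K(X), 0.1 * np.eye(10))        # noise only on the diagonal
assert np.allclose(k_bias.K(X), 2.0 * np.ones((10, 10)))  # constant covariance everywhere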

GPy/kern/_src/stationary.py

@ -0,0 +1,484 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
from ...util.linalg import tdot
from ... import util
import numpy as np
from scipy import integrate, weave
from ...util.config import config # for assessing whether to use weave
from ...util.caching import Cache_this
class Stationary(Kern):
"""
Stationary kernels (covariance functions).
Stationary covariance functions depend only on r, where r is defined as
r = \sqrt{ \sum_{q=1}^Q (x_q - x'_q)^2 }
The covariance function k(x, x') can then be written k(r).
In this implementation, r is scaled by the lengthscale parameter(s):
r = \sqrt{ \sum_{q=1}^Q \frac{(x_q - x'_q)^2}{\ell_q^2} }.
By default there is only one lengthscale; separate lengthscales for each
dimension can be enabled by setting ARD=True.
To implement a stationary covariance function using this class, one need
only define the covariance function k(r) and its derivative.
...
def K_of_r(self, r):
return foo
def dK_dr(self, r):
return bar
The lengthscale(s) and variance parameters are added to the structure automatically.
"""
def __init__(self, input_dim, variance, lengthscale, ARD, active_dims, name, useGPU=False):
super(Stationary, self).__init__(input_dim, active_dims, name,useGPU=useGPU)
self.ARD = ARD
if not ARD:
if lengthscale is None:
lengthscale = np.ones(1)
else:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size == 1, "Only 1 lengthscale needed for non-ARD kernel"
else:
if lengthscale is not None:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size in [1, input_dim], "Bad number of lengthscales"
if lengthscale.size != input_dim:
lengthscale = np.ones(input_dim)*lengthscale
else:
lengthscale = np.ones(self.input_dim)
self.lengthscale = Param('lengthscale', lengthscale, Logexp())
self.variance = Param('variance', variance, Logexp())
assert self.variance.size==1
self.link_parameters(self.variance, self.lengthscale)
def K_of_r(self, r):
raise NotImplementedError, "implement the covariance function as a fn of r to use this class"
def dK_dr(self, r):
raise NotImplementedError, "implement derivative of the covariance function wrt r to use this class"
@Cache_this(limit=5, ignore_args=())
def K(self, X, X2=None):
"""
Kernel function applied on inputs X and X2.
In the stationary case there is an inner function depending on the
distances from X to X2, called r.
K(X, X2) = K_of_r(r(X, X2)), where r is the scaled distance between X and X2.
"""
r = self._scaled_dist(X, X2)
return self.K_of_r(r)
@Cache_this(limit=3, ignore_args=())
def dK_dr_via_X(self, X, X2):
#a convenience function, so we can cache dK_dr
return self.dK_dr(self._scaled_dist(X, X2))
def _unscaled_dist(self, X, X2=None):
"""
Compute the Euclidean distance between each row of X and X2, or between
each pair of rows of X if X2 is None.
"""
#X, = self._slice_X(X)
if X2 is None:
Xsq = np.sum(np.square(X),1)
r2 = -2.*tdot(X) + (Xsq[:,None] + Xsq[None,:])
util.diag.view(r2)[:,]= 0. # force diagonal to be zero: sometimes numerically a little negative
r2 = np.clip(r2, 0, np.inf)
return np.sqrt(r2)
else:
#X2, = self._slice_X(X2)
X1sq = np.sum(np.square(X),1)
X2sq = np.sum(np.square(X2),1)
r2 = -2.*np.dot(X, X2.T) + X1sq[:,None] + X2sq[None,:]
r2 = np.clip(r2, 0, np.inf)
return np.sqrt(r2)
@Cache_this(limit=5, ignore_args=())
def _scaled_dist(self, X, X2=None):
"""
Efficiently compute the scaled distance, r.
r = \sqrt( \sum_{q=1}^Q (x_q - x'_q)^2/l_q^2 )
Note that if there is only one lengthscale, l comes outside the sum. In
this case we compute the unscaled distance first (in a separate
function for caching) and divide by lengthscale afterwards
"""
if self.ARD:
if X2 is not None:
X2 = X2 / self.lengthscale
return self._unscaled_dist(X/self.lengthscale, X2)
else:
return self._unscaled_dist(X, X2)/self.lengthscale
def Kdiag(self, X):
ret = np.empty(X.shape[0])
ret[:] = self.variance
return ret
def update_gradients_diag(self, dL_dKdiag, X):
"""
Given the derivative of the objective with respect to the diagonal of
the covariance matrix, compute the derivative wrt the parameters of
this kernel and store it in the <parameter>.gradient field.
See also update_gradients_full
"""
self.variance.gradient = np.sum(dL_dKdiag)
self.lengthscale.gradient = 0.
def update_gradients_full(self, dL_dK, X, X2=None):
"""
Given the derivative of the objective wrt the covariance matrix
(dL_dK), compute the gradient wrt the parameters of this kernel,
and store in the parameters object as e.g. self.variance.gradient
"""
self.variance.gradient = np.einsum('ij,ij,i', self.K(X, X2), dL_dK, 1./self.variance)
#now the lengthscale gradient(s)
dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
if self.ARD:
#rinv = self._inv_dist(X, X2) # this is rather high memory? Should we loop instead?
#d = X[:, None, :] - X2[None, :, :]
#x_xl3 = np.square(d)
#self.lengthscale.gradient = -((dL_dr*rinv)[:,:,None]*x_xl3).sum(0).sum(0)/self.lengthscale**3
tmp = dL_dr*self._inv_dist(X, X2)
if X2 is None: X2 = X
if config.getboolean('weave', 'working'):
try:
self.lengthscale.gradient = self.weave_lengthscale_grads(tmp, X, X2)
except:
print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
config.set('weave', 'working', 'False')
self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in xrange(self.input_dim)])
else:
self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in xrange(self.input_dim)])
else:
r = self._scaled_dist(X, X2)
self.lengthscale.gradient = -np.sum(dL_dr*r)/self.lengthscale
def _inv_dist(self, X, X2=None):
"""
Compute the elementwise inverse of the distance matrix, except on the
diagonal, where we return zero (the distance on the diagonal is zero).
This term appears in derivatives.
"""
dist = self._scaled_dist(X, X2).copy()
return 1./np.where(dist != 0., dist, np.inf)
def weave_lengthscale_grads(self, tmp, X, X2):
"""Use scipy.weave to compute derivatives wrt the lengthscales"""
N,M = tmp.shape
Q = X.shape[1]
if hasattr(X, 'values'):X = X.values
if hasattr(X2, 'values'):X2 = X2.values
grads = np.zeros(self.input_dim)
code = """
double gradq;
for(int q=0; q<Q; q++){
gradq = 0;
for(int n=0; n<N; n++){
for(int m=0; m<M; m++){
gradq += tmp(n,m)*(X(n,q)-X2(m,q))*(X(n,q)-X2(m,q));
}
}
grads(q) = gradq;
}
"""
weave.inline(code, ['tmp', 'X', 'X2', 'grads', 'N', 'M', 'Q'], type_converters=weave.converters.blitz, support_code="#include <math.h>")
return -grads/self.lengthscale**3
def gradients_X(self, dL_dK, X, X2=None):
"""
Given the derivative of the objective wrt K (dL_dK), compute the derivative wrt X
"""
if config.getboolean('weave', 'working'):
try:
return self.gradients_X_weave(dL_dK, X, X2)
except:
print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
config.set('weave', 'working', 'False')
return self.gradients_X_(dL_dK, X, X2)
else:
return self.gradients_X_(dL_dK, X, X2)
def gradients_X_(self, dL_dK, X, X2=None):
invdist = self._inv_dist(X, X2)
dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
tmp = invdist*dL_dr
if X2 is None:
tmp = tmp + tmp.T
X2 = X
#The high-memory numpy way:
#d = X[:, None, :] - X2[None, :, :]
#ret = np.sum(tmp[:,:,None]*d,1)/self.lengthscale**2
#the lower memory way with a loop
ret = np.empty(X.shape, dtype=np.float64)
for q in xrange(self.input_dim):
np.sum(tmp*(X[:,q][:,None]-X2[:,q][None,:]), axis=1, out=ret[:,q])
ret /= self.lengthscale**2
return ret
def gradients_X_weave(self, dL_dK, X, X2=None):
invdist = self._inv_dist(X, X2)
dL_dr = self.dK_dr_via_X(X, X2) * dL_dK
tmp = invdist*dL_dr
if X2 is None:
tmp = tmp + tmp.T
X2 = X
code = """
int n,m,d;
double retnd;
#pragma omp parallel for private(n,d, retnd, m)
for(d=0;d<D;d++){
for(n=0;n<N;n++){
retnd = 0.0;
for(m=0;m<M;m++){
retnd += tmp(n,m)*(X(n,d)-X2(m,d));
}
ret(n,d) = retnd;
}
}
"""
if hasattr(X, 'values'):X = X.values #remove the GPy wrapping to make passing into weave safe
if hasattr(X2, 'values'):X2 = X2.values
ret = np.zeros(X.shape)
N,D = X.shape
N,M = tmp.shape
from scipy import weave
support_code = """
#include <omp.h>
#include <stdio.h>
"""
weave_options = {'headers' : ['<omp.h>'],
'extra_compile_args': ['-fopenmp -O3'], # -march=native'],
'extra_link_args' : ['-lgomp']}
weave.inline(code, ['ret', 'N', 'D', 'M', 'tmp', 'X', 'X2'], type_converters=weave.converters.blitz, support_code=support_code, **weave_options)
return ret/self.lengthscale**2
def gradients_X_diag(self, dL_dKdiag, X):
return np.zeros(X.shape)
def input_sensitivity(self, summarize=True):
return np.ones(self.input_dim)/self.lengthscale**2
class Exponential(Stationary):
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Exponential'):
super(Exponential, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
def K_of_r(self, r):
return self.variance * np.exp(-0.5 * r)
def dK_dr(self, r):
return -0.5*self.K_of_r(r)
class OU(Stationary):
"""
OU kernel:
.. math::
k(r) = \\sigma^2 \exp(- r) \\ \\ \\ \\ \\text{ where } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} }
"""
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='OU'):
super(OU, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
def K_of_r(self, r):
return self.variance * np.exp(-r)
def dK_dr(self,r):
return -1.*self.variance*np.exp(-r)
class Matern32(Stationary):
"""
Matern 3/2 kernel:
.. math::
k(r) = \\sigma^2 (1 + \\sqrt{3} r) \exp(- \sqrt{3} r) \\ \\ \\ \\ \\text{ where } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} }
"""
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Mat32'):
super(Matern32, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
def K_of_r(self, r):
return self.variance * (1. + np.sqrt(3.) * r) * np.exp(-np.sqrt(3.) * r)
def dK_dr(self,r):
return -3.*self.variance*r*np.exp(-np.sqrt(3.)*r)
def Gram_matrix(self, F, F1, F2, lower, upper):
"""
Return the Gram matrix of the vector of functions F with respect to the
RKHS norm. The use of this function is limited to input_dim=1.
:param F: vector of functions
:type F: np.array
:param F1: vector of derivatives of F
:type F1: np.array
:param F2: vector of second derivatives of F
:type F2: np.array
:param lower,upper: boundaries of the input domain
:type lower,upper: floats
"""
assert self.input_dim == 1
def L(x, i):
return(3. / self.lengthscale ** 2 * F[i](x) + 2 * np.sqrt(3) / self.lengthscale * F1[i](x) + F2[i](x))
n = F.shape[0]
G = np.zeros((n, n))
for i in range(n):
for j in range(i, n):
G[i, j] = G[j, i] = integrate.quad(lambda x : L(x, i) * L(x, j), lower, upper)[0]
Flower = np.array([f(lower) for f in F])[:, None]
F1lower = np.array([f(lower) for f in F1])[:, None]
return(self.lengthscale ** 3 / (12.*np.sqrt(3) * self.variance) * G + 1. / self.variance * np.dot(Flower, Flower.T) + self.lengthscale ** 2 / (3.*self.variance) * np.dot(F1lower, F1lower.T))
class Matern52(Stationary):
"""
Matern 5/2 kernel:
.. math::
k(r) = \sigma^2 (1 + \sqrt{5} r + \\frac53 r^2) \exp(- \sqrt{5} r)
"""
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Mat52'):
super(Matern52, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
def K_of_r(self, r):
return self.variance*(1+np.sqrt(5.)*r+5./3*r**2)*np.exp(-np.sqrt(5.)*r)
def dK_dr(self, r):
return self.variance*(10./3*r -5.*r -5.*np.sqrt(5.)/3*r**2)*np.exp(-np.sqrt(5.)*r)
def Gram_matrix(self, F, F1, F2, F3, lower, upper):
"""
Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to input_dim=1.
:param F: vector of functions
:type F: np.array
:param F1: vector of derivatives of F
:type F1: np.array
:param F2: vector of second derivatives of F
:type F2: np.array
:param F3: vector of third derivatives of F
:type F3: np.array
:param lower,upper: boundaries of the input domain
:type lower,upper: floats
"""
assert self.input_dim == 1
def L(x,i):
return(5*np.sqrt(5)/self.lengthscale**3*F[i](x) + 15./self.lengthscale**2*F1[i](x)+ 3*np.sqrt(5)/self.lengthscale*F2[i](x) + F3[i](x))
n = F.shape[0]
G = np.zeros((n,n))
for i in range(n):
for j in range(i,n):
G[i,j] = G[j,i] = integrate.quad(lambda x : L(x,i)*L(x,j),lower,upper)[0]
G_coef = 3.*self.lengthscale**5/(400*np.sqrt(5))
Flower = np.array([f(lower) for f in F])[:,None]
F1lower = np.array([f(lower) for f in F1])[:,None]
F2lower = np.array([f(lower) for f in F2])[:,None]
orig = 9./8*np.dot(Flower,Flower.T) + 9.*self.lengthscale**4/200*np.dot(F2lower,F2lower.T)
orig2 = 3./5*self.lengthscale**2 * ( np.dot(F1lower,F1lower.T) + 1./8*np.dot(Flower,F2lower.T) + 1./8*np.dot(F2lower,Flower.T))
return(1./self.variance* (G_coef*G + orig + orig2))
class ExpQuad(Stationary):
"""
The Exponentiated quadratic covariance function.
.. math::
k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg)
notes::
- Yes, this is exactly the same as the RBF covariance function, but the
RBF implementation also has some features for doing variational kernels
(the psi-statistics).
"""
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='ExpQuad'):
super(ExpQuad, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
def K_of_r(self, r):
return self.variance * np.exp(-0.5 * r**2)
def dK_dr(self, r):
return -r*self.K_of_r(r)
class Cosine(Stationary):
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Cosine'):
super(Cosine, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
def K_of_r(self, r):
return self.variance * np.cos(r)
def dK_dr(self, r):
return -self.variance * np.sin(r)
class RatQuad(Stationary):
"""
Rational Quadratic Kernel
.. math::
k(r) = \sigma^2 \\bigg( 1 + \\frac{r^2}{2} \\bigg)^{- \\alpha}
"""
def __init__(self, input_dim, variance=1., lengthscale=None, power=2., ARD=False, active_dims=None, name='RatQuad'):
super(RatQuad, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
self.power = Param('power', power, Logexp())
self.link_parameters(self.power)
def K_of_r(self, r):
r2 = np.power(r, 2.)
return self.variance*np.power(1. + r2/2., -self.power)
def dK_dr(self, r):
r2 = np.power(r, 2.)
return -self.variance*self.power*r*np.power(1. + r2/2., - self.power - 1.)
def update_gradients_full(self, dL_dK, X, X2=None):
super(RatQuad, self).update_gradients_full(dL_dK, X, X2)
r = self._scaled_dist(X, X2)
r2 = np.power(r, 2.)
dK_dpow = -self.variance * np.power(2., self.power) * np.power(r2 + 2., -self.power) * np.log(0.5*(r2+2.))
grad = np.sum(dL_dK*dK_dpow)
self.power.gradient = grad
def update_gradients_diag(self, dL_dKdiag, X):
super(RatQuad, self).update_gradients_diag(dL_dKdiag, X)
self.power.gradient = 0.
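As the Stationary docstring above explains, a new stationary kernel only needs K_of_r and dK_dr. A hedged sketch of a custom kernel following that recipe (the kernel itself and the import path are illustrative assumptions):
# --- illustrative subclass, not part of the diff ---
import numpy as np
from GPy.kern._src.stationary import Stationary  # path as introduced by this diff

class Cauchy(Stationary):
    """k(r) = variance / (1 + r^2), a heavy-tailed stationary kernel."""
    def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False,
                 active_dims=None, name='Cauchy'):
        super(Cauchy, self).__init__(input_dim, variance, lengthscale, ARD,
                                     active_dims, name)

    def K_of_r(self, r):
        return self.variance / (1. + r**2)

    def dK_dr(self, r):
        # d/dr [variance / (1 + r^2)] = -2 * variance * r / (1 + r^2)^2
        return -2. * self.variance * r / (1. + r**2)**2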

GPy/kern/_src/symbolic.py

@ -0,0 +1,75 @@
# Check Matthew Rocklin's blog post.
import sympy as sym
import numpy as np
from kern import Kern
from ...core.symbolic import Symbolic_core
class Symbolic(Kern, Symbolic_core):
"""
"""
def __init__(self, input_dim, k=None, output_dim=1, name='symbolic', parameters=None, active_dims=None, operators=None, func_modules=[]):
if k is None:
raise ValueError, "You must provide an argument for the covariance function."
Kern.__init__(self, input_dim, active_dims, name=name)
kdiag = k
self.cacheable = ['X', 'Z']
Symbolic_core.__init__(self, {'k':k,'kdiag':kdiag}, cacheable=self.cacheable, derivatives = ['X', 'theta'], parameters=parameters, func_modules=func_modules)
self.output_dim = output_dim
def __add__(self,other):
return spkern(self._sym_k+other._sym_k)
def _set_expressions(self, expressions):
"""This method is overwritten because we need to modify kdiag by substituting z for x. We do this by calling the parent expression method to extract variables from expressions, then subsitute the z variables that are present with x."""
Symbolic_core._set_expressions(self, expressions)
Symbolic_core._set_variables(self, self.cacheable)
# Substitute z with x to obtain kdiag.
for x, z in zip(self.variables['X'], self.variables['Z']):
expressions['kdiag'] = expressions['kdiag'].subs(z, x)
Symbolic_core._set_expressions(self, expressions)
def K(self,X,X2=None):
if X2 is None:
return self.eval_function('k', X=X, Z=X)
else:
return self.eval_function('k', X=X, Z=X2)
def Kdiag(self,X):
d = self.eval_function('kdiag', X=X)
if not d.shape[0] == X.shape[0]:
d = np.tile(d, (X.shape[0], 1))
return d
def gradients_X(self, dL_dK, X, X2=None):
#if self._X is None or X.base is not self._X.base or X2 is not None:
g = self.eval_gradients_X('k', dL_dK, X=X, Z=X2)
if X2 is None:
g *= 2
return g
def gradients_X_diag(self, dL_dK, X):
return self.eval_gradients_X('kdiag', dL_dK, X=X)
def update_gradients_full(self, dL_dK, X, X2=None):
# Need to extract parameters to local variables first
if X2 is None:
# need to double this inside ...
gradients = self.eval_update_gradients('k', dL_dK, X=X)
else:
gradients = self.eval_update_gradients('k', dL_dK, X=X, Z=X2)
for name, val in gradients:
setattr(getattr(self, name), 'gradient', val)
def update_gradients_diag(self, dL_dKdiag, X):
gradients = self.eval_update_gradients('kdiag', dL_dKdiag, X)
for name, val in gradients:
setattr(getattr(self, name), 'gradient', val)
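The _set_expressions override above derives kdiag from k by substituting the Z symbols with the corresponding X symbols. A tiny sympy illustration of that substitution step (the symbol names here are illustrative; the real bookkeeping is done by Symbolic_core):
# --- illustrative sympy sketch, not part of the diff ---
import sympy as sym

x_0, z_0, variance, lengthscale = sym.symbols('x_0 z_0 variance lengthscale', positive=True)
k = variance * sym.exp(-(x_0 - z_0)**2 / (2 * lengthscale**2))  # an RBF-like expression
kdiag = k.subs(z_0, x_0)                                        # diagonal: set z = x
assert sym.simplify(kdiag - variance) == 0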


@ -0,0 +1,61 @@
#include <math.h>
#include <float.h>
#include <stdlib.h>
double DiracDelta(double x){
// TODO: this doesn't seem to be a dirac delta ... should return infinity. Neil
if((x<0.000001) & (x>-0.000001))//go on, laugh at my c++ skills
return 1.0;
else
return 0.0;
};
double DiracDelta(double x,int foo){
return 0.0;
};
double sinc(double x){
if (x==0)
return 1.0;
else
return sin(x)/x;
}
double sinc_grad(double x){
if (x==0)
return 0.0;
else
return (x*cos(x) - sin(x))/(x*x);
}
double erfcx(double x){
double xneg=-sqrt(log(DBL_MAX/2));
double xmax = 1/(sqrt(M_PI)*DBL_MIN);
xmax = DBL_MAX<xmax ? DBL_MAX : xmax;
// Find values where erfcx can be evaluated
double t = 3.97886080735226 / (fabs(x) + 3.97886080735226); // fabs: plain abs() would truncate to int
double u = t-0.5;
double y = (((((((((u * 0.00127109764952614092 + 1.19314022838340944e-4) * u
- 0.003963850973605135) * u - 8.70779635317295828e-4) * u
+ 0.00773672528313526668) * u + 0.00383335126264887303) * u
- 0.0127223813782122755) * u - 0.0133823644533460069) * u
+ 0.0161315329733252248) * u + 0.0390976845588484035) * u + 0.00249367200053503304;
if (x<xneg)
return -INFINITY;
else if (x<0)
return 2*exp(x*x)-y;
else if (x>xmax)
return 0.0;
else
return y;
}
double ln_diff_erf(double x0, double x1){
if (x0==x1)
return INFINITY;
else if(x0<0 && x1>0 || x0>0 && x1<0)
return log(erf(x0)-erf(x1));
else if(x1>0)
return log(erfcx(x1)-erfcx(x0)*exp(x1*x1 - x0*x0))-x1*x1;
else
return log(erfcx(-x0)-erfcx(-x1)*exp(x0*x0 - x1*x1))-x0*x0;
}
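For reference, ln_diff_erf computes log(erf(x0) - erf(x1)); the branches above exist to keep it stable when the naive difference underflows. A hedged scipy sketch of the same quantity (assumes scipy >= 0.12 for erfcx):
# --- illustrative Python cross-check, not part of the diff ---
import numpy as np
from scipy.special import erf, erfcx

def ln_diff_erf_naive(x0, x1):
    # Fine whenever erf(x0) - erf(x1) is not tiny.
    return np.log(erf(x0) - erf(x1))

def ln_diff_erf_stable(x0, x1):
    # For 0 < x1 < x0, mirrors the scaled-complementary-erf branch in the C code.
    return np.log(erfcx(x1) - erfcx(x0) * np.exp(x1**2 - x0**2)) - x1**2

assert np.allclose(ln_diff_erf_naive(2.0, 1.0), ln_diff_erf_stable(2.0, 1.0))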


@ -0,0 +1,9 @@
#include <math.h>
double DiracDelta(double x);
double DiracDelta(double x, int foo);
double sinc(double x);
double sinc_grad(double x);
double erfcx(double x);
double ln_diff_erf(double x0, double x1);


@ -90,7 +90,7 @@ class ODE_1(Kernpart):
np.add(self.varianceU*self.varianceY*(k1+k2+k3), target, target)
- def dK_dtheta(self, dL_dK, X, X2, target):
+ def _param_grad_helper(self, dL_dK, X, X2, target):
"""derivative of the covariance matrix with respect to the parameters."""
if X2 is None: X2 = X
dist = np.abs(X - X2.T)
@ -137,11 +137,7 @@ class ODE_1(Kernpart):
k2 = (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
k3 = np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
dkdvar = k1+k2+k3
- #target[0] dk dvarU
- #target[1] dk dvarY
- #target[2] dk d theta1
- #target[3] dk d theta2
target[0] += np.sum(self.varianceY*dkdvar * dL_dK)
target[1] += np.sum(self.varianceU*dkdvar * dL_dK)
target[2] += np.sum(dktheta1*(-np.sqrt(3)*self.lengthscaleU**(-2)) * dL_dK)


@ -124,7 +124,7 @@ class Eq_ode1(Kernpart):
#target += np.diag(self.B)[np.asarray(index,dtype=np.int).flatten()]
pass
- def dK_dtheta(self,dL_dK,X,X2,target):
+ def _param_grad_helper(self,dL_dK,X,X2,target):
# First extract times and indices.
self._extract_t_indices(X, X2, dL_dK=dL_dK)
@ -193,7 +193,7 @@ class Eq_ode1(Kernpart):
def dKdiag_dtheta(self,dL_dKdiag,index,target):
pass
- def dK_dX(self,dL_dK,X,X2,target):
+ def gradients_X(self,dL_dK,X,X2,target):
pass
def _extract_t_indices(self, X, X2=None, dL_dK=None):


@ -50,7 +50,7 @@ class FiniteDimensional(Kernpart):
def Kdiag(self,X,target):
product = np.diag(self.K(X, X))
np.add(target,product,target)
- def dK_dtheta(self,X,X2,target):
+ def _param_grad_helper(self,X,X2,target):
"""Return shape is NxMx(Ntheta)"""
if X2 is None: X2 = X
FX = np.column_stack([f(X) for f in self.F])

Some files were not shown because too many files have changed in this diff.