merged last devel

beckdaniel 2016-04-11 11:26:46 +01:00
commit 45f692340a
170 changed files with 39094 additions and 4107 deletions


@ -9,20 +9,16 @@ omit = ./GPy/testing/*.py, travis_tests.py, setup.py, ./GPy/__version__.py
exclude_lines = exclude_lines =
# Have to re-enable the standard pragma # Have to re-enable the standard pragma
pragma: no cover pragma: no cover
verbose
# Don't complain about missing debug-only code: # Don't complain about missing debug-only code:
if self\.debug if self\.debug
# Don't complain if tests don't hit defensive assertion code: # Don't complain if tests don't hit defensive assertion code:
raise AssertionError raise
raise NotImplementedError except
raise NotImplemented
except NotImplementedError
except NotImplemented
except AssertionError
except ImportError
pass pass
Not implemented
# Don't complain if non-runnable code isn't run: # Don't complain if non-runnable code isn't run:
if 0: if 0:
@ -31,4 +27,4 @@ exclude_lines =
# Don't fail on python3 catch clauses: # Don't fail on python3 catch clauses:
python3 python3
ignore_errors = True ignore_errors = True


@ -30,6 +30,8 @@ install:
- source install_retry.sh - source install_retry.sh
- pip install codecov - pip install codecov
- pip install pypandoc - pip install pypandoc
- pip install git+git://github.com/BRML/climin.git
- pip install autograd
- python setup.py develop - python setup.py develop
script: script:
@ -56,8 +58,7 @@ deploy:
password: password:
secure: "vMEOlP7DQhFJ7hQAKtKC5hrJXFl5BkUt4nXdosWWiw//Kg8E+PPLg88XPI2gqIosir9wwgtbSBBbbwCxkM6uxRNMpoNR8Ixyv9fmSXp4rLl7bbBY768W7IRXKIBjpuEy2brQjoT+CwDDSzUkckHvuUjJDNRvUv8ab4P/qYO1LG4=" secure: "vMEOlP7DQhFJ7hQAKtKC5hrJXFl5BkUt4nXdosWWiw//Kg8E+PPLg88XPI2gqIosir9wwgtbSBBbbwCxkM6uxRNMpoNR8Ixyv9fmSXp4rLl7bbBY768W7IRXKIBjpuEy2brQjoT+CwDDSzUkckHvuUjJDNRvUv8ab4P/qYO1LG4="
on: on:
tags: false tags: true
branch: devel branch: deploy
server: https://testpypi.python.org/pypi
distributions: $DIST distributions: $DIST
skip_cleanup: true skip_cleanup: true


@ -1 +1 @@
[GPy Authors](https://github.com/SheffieldML/GPy/graphs/contributors) GPy Authors: https://github.com/SheffieldML/GPy/graphs/contributors


@ -28,16 +28,18 @@ from .core.parameterization import Param, Parameterized, ObsAr, transformations
from .__version__ import __version__ from .__version__ import __version__
from numpy.testing import Tester from numpy.testing import Tester
#@nottest
try: with warnings.catch_warnings():
#Get rid of nose dependency by only ignoring if you have nose installed warnings.simplefilter('ignore')
from nose.tools import nottest try:
@nottest #Get rid of nose dependency by only ignoring if you have nose installed
def tests(verbose=10): from nose.tools import nottest
Tester(testing).test(verbose=verbose) @nottest
except: def tests(verbose=10):
def tests(verbose=10): Tester(testing).test(verbose=verbose)
Tester(testing).test(verbose=verbose) except:
def tests(verbose=10):
Tester(testing).test(verbose=verbose)
def load(file_or_path): def load(file_or_path):
""" """


@ -1 +1 @@
__version__ = "0.9.7" __version__ = "1.0.7"


@ -43,4 +43,4 @@ def randomize(self, rand_gen=None, *args, **kwargs):
Model.randomize = randomize Model.randomize = randomize
Param.randomize = randomize Param.randomize = randomize
Parameterized.randomize = randomize Parameterized.randomize = randomize


@ -212,41 +212,18 @@ class GP(Model):
= N(f*| K_{x*x}(K_{xx} + \Sigma)^{-1}Y, K_{x*x*} - K_{xx*}(K_{xx} + \Sigma)^{-1}K_{xx*} = N(f*| K_{x*x}(K_{xx} + \Sigma)^{-1}Y, K_{x*x*} - K_{xx*}(K_{xx} + \Sigma)^{-1}K_{xx*}
\Sigma := \texttt{Likelihood.variance / Approximate likelihood covariance} \Sigma := \texttt{Likelihood.variance / Approximate likelihood covariance}
""" """
if kern is None: mu, var = self.posterior._raw_predict(kern=self.kern if kern is None else kern, Xnew=Xnew, pred_var=self._predictive_variable, full_cov=full_cov)
kern = self.kern
Kx = kern.K(self._predictive_variable, Xnew)
mu = np.dot(Kx.T, self.posterior.woodbury_vector)
if len(mu.shape)==1:
mu = mu.reshape(-1,1)
if full_cov:
Kxx = kern.K(Xnew)
if self.posterior.woodbury_inv.ndim == 2:
var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
elif self.posterior.woodbury_inv.ndim == 3: # Missing data
var = np.empty((Kxx.shape[0],Kxx.shape[1],self.posterior.woodbury_inv.shape[2]))
from ..util.linalg import mdot
for i in range(var.shape[2]):
var[:, :, i] = (Kxx - mdot(Kx.T, self.posterior.woodbury_inv[:, :, i], Kx))
var = var
else:
Kxx = kern.Kdiag(Xnew)
if self.posterior.woodbury_inv.ndim == 2:
var = (Kxx - np.sum(np.dot(self.posterior.woodbury_inv.T, Kx) * Kx, 0))[:,None]
elif self.posterior.woodbury_inv.ndim == 3: # Missing data
var = np.empty((Kxx.shape[0],self.posterior.woodbury_inv.shape[2]))
for i in range(var.shape[1]):
var[:, i] = (Kxx - (np.sum(np.dot(self.posterior.woodbury_inv[:, :, i].T, Kx) * Kx, 0)))
var = var
#add in the mean function
if self.mean_function is not None: if self.mean_function is not None:
mu += self.mean_function.f(Xnew) mu += self.mean_function.f(Xnew)
return mu, var return mu, var
def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None, likelihood=None): def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None, likelihood=None, include_likelihood=True):
""" """
Predict the function(s) at the new point(s) Xnew. Predict the function(s) at the new point(s) Xnew. This includes the likelihood
variance added to the predicted underlying function (usually referred to as f).
In order to predict without adding in the likelihood give
`include_likelihood=False`, or refer to self.predict_noiseless().
:param Xnew: The points at which to make a prediction :param Xnew: The points at which to make a prediction
:type Xnew: np.ndarray (Nnew x self.input_dim) :type Xnew: np.ndarray (Nnew x self.input_dim)
@ -256,6 +233,8 @@ class GP(Model):
:param Y_metadata: metadata about the predicting point to pass to the likelihood :param Y_metadata: metadata about the predicting point to pass to the likelihood
:param kern: The kernel to use for prediction (defaults to the model :param kern: The kernel to use for prediction (defaults to the model
kern). this is useful for examining e.g. subprocesses. kern). this is useful for examining e.g. subprocesses.
:param bool include_likelihood: Whether or not to add likelihood noise to the predicted underlying latent function f.
:returns: (mean, var): :returns: (mean, var):
mean: posterior mean, a Numpy array, Nnew x self.input_dim mean: posterior mean, a Numpy array, Nnew x self.input_dim
var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
@ -270,11 +249,40 @@ class GP(Model):
if self.normalizer is not None: if self.normalizer is not None:
mu, var = self.normalizer.inverse_mean(mu), self.normalizer.inverse_variance(var) mu, var = self.normalizer.inverse_mean(mu), self.normalizer.inverse_variance(var)
# now push through likelihood if include_likelihood:
if likelihood is None: # now push through likelihood
likelihood = self.likelihood if likelihood is None:
mean, var = likelihood.predictive_values(mu, var, full_cov, Y_metadata=Y_metadata) likelihood = self.likelihood
return mean, var mu, var = likelihood.predictive_values(mu, var, full_cov, Y_metadata=Y_metadata)
return mu, var
def predict_noiseless(self, Xnew, full_cov=False, Y_metadata=None, kern=None):
"""
Convenience function to predict the underlying function of the GP (often
referred to as f) without adding the likelihood variance to the
predictive distribution.
This is most likely what you want to use for your predictions.
:param Xnew: The points at which to make a prediction
:type Xnew: np.ndarray (Nnew x self.input_dim)
:param full_cov: whether to return the full covariance matrix, or just
the diagonal
:type full_cov: bool
:param Y_metadata: metadata about the predicting point to pass to the likelihood
:param kern: The kernel to use for prediction (defaults to the model
kern). this is useful for examining e.g. subprocesses.
:returns: (mean, var):
mean: posterior mean, a Numpy array, Nnew x self.input_dim
var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew.
This is to allow for different normalizations of the output dimensions.
Note: If you want the predictive quantiles (e.g. 95% confidence interval) use :py:func:"~GPy.core.gp.GP.predict_quantiles".
"""
return self.predict(Xnew, full_cov, Y_metadata, kern, None, False)
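A minimal usage sketch of the predict / predict_noiseless split introduced above (the data, kernel and hyperparameter values below are made up for illustration; only the method names and the include_likelihood argument come from the code above):
import numpy as np
import GPy
X = np.random.uniform(0., 10., (50, 1))
Y = np.sin(X) + np.random.randn(50, 1) * 0.1
m = GPy.models.GPRegression(X, Y)  # Gaussian likelihood
m.optimize()
Xnew = np.linspace(0., 10., 5)[:, None]
mu_y, var_y = m.predict(Xnew)            # includes the likelihood variance by default
mu_f, var_f = m.predict_noiseless(Xnew)  # posterior on the latent f only
# The posterior means agree; the predictive variance differs by the noise variance.
assert np.allclose(mu_y, mu_f)
assert np.all(var_y >= var_f)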
def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None, kern=None, likelihood=None): def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None, kern=None, likelihood=None):
""" """
@ -395,9 +403,9 @@ class GP(Model):
var_jac = compute_cov_inner(self.posterior.woodbury_inv) var_jac = compute_cov_inner(self.posterior.woodbury_inv)
return mean_jac, var_jac return mean_jac, var_jac
def predict_wishard_embedding(self, Xnew, kern=None, mean=True, covariance=True): def predict_wishart_embedding(self, Xnew, kern=None, mean=True, covariance=True):
""" """
Predict the wishard embedding G of the GP. This is the density of the Predict the wishart embedding G of the GP. This is the density of the
input of the GP defined by the probabilistic function mapping f. input of the GP defined by the probabilistic function mapping f.
G = J_mean.T*J_mean + output_dim*J_cov. G = J_mean.T*J_mean + output_dim*J_cov.
@ -425,15 +433,26 @@ class GP(Model):
G += Sigma G += Sigma
return G return G
def predict_magnification(self, Xnew, kern=None, mean=True, covariance=True): def predict_wishard_embedding(self, Xnew, kern=None, mean=True, covariance=True):
warnings.warn("Wrong naming, use predict_wishart_embedding instead. Will be removed in future versions!", DeprecationWarning)
return self.predict_wishart_embedding(Xnew, kern, mean, covariance)
def predict_magnification(self, Xnew, kern=None, mean=True, covariance=True, dimensions=None):
""" """
Predict the magnification factor as Predict the magnification factor as
sqrt(det(G)) sqrt(det(G))
for each point N in Xnew for each point N in Xnew.
:param bool mean: whether to include the mean of the wishart embedding.
:param bool covariance: whether to include the covariance of the wishart embedding.
:param array-like dimensions: which dimensions of the input space to use [defaults to self.get_most_significant_input_dimensions()[:2]]
""" """
G = self.predict_wishard_embedding(Xnew, kern, mean, covariance) G = self.predict_wishart_embedding(Xnew, kern, mean, covariance)
if dimensions is None:
dimensions = self.get_most_significant_input_dimensions()[:2]
G = G[:, dimensions][:,:,dimensions]
from ..util.linalg import jitchol from ..util.linalg import jitchol
mag = np.empty(Xnew.shape[0]) mag = np.empty(Xnew.shape[0])
for n in range(Xnew.shape[0]): for n in range(Xnew.shape[0]):
@ -513,21 +532,23 @@ class GP(Model):
def get_most_significant_input_dimensions(self, which_indices=None): def get_most_significant_input_dimensions(self, which_indices=None):
return self.kern.get_most_significant_input_dimensions(which_indices) return self.kern.get_most_significant_input_dimensions(which_indices)
def optimize(self, optimizer=None, start=None, **kwargs): def optimize(self, optimizer=None, start=None, messages=False, max_iters=1000, ipython_notebook=True, clear_after_finish=False, **kwargs):
""" """
Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors. Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
kwargs are passed to the optimizer. They can be: kwargs are passed to the optimizer. They can be:
:param max_f_eval: maximum number of function evaluations :param max_iters: maximum number of function evaluations
:type max_f_eval: int :type max_iters: int
:messages: whether to display during optimisation :messages: whether to display during optimisation
:type messages: bool :type messages: bool
:param optimizer: which optimizer to use (defaults to self.preferred optimizer), a range of optimisers can be found in :module:`~GPy.inference.optimization`, they include 'scg', 'lbfgs', 'tnc'. :param optimizer: which optimizer to use (defaults to self.preferred optimizer), a range of optimisers can be found in :module:`~GPy.inference.optimization`, they include 'scg', 'lbfgs', 'tnc'.
:type optimizer: string :type optimizer: string
:param bool ipython_notebook: whether to use ipython notebook widgets or not.
:param bool clear_after_finish: if in ipython notebook, we can clear the widgets after optimization.
""" """
self.inference_method.on_optimization_start() self.inference_method.on_optimization_start()
try: try:
super(GP, self).optimize(optimizer, start, **kwargs) super(GP, self).optimize(optimizer, start, messages, max_iters, ipython_notebook, clear_after_finish, **kwargs)
except KeyboardInterrupt: except KeyboardInterrupt:
print("KeyboardInterrupt caught, calling on_optimization_end() to round things up") print("KeyboardInterrupt caught, calling on_optimization_end() to round things up")
self.inference_method.on_optimization_end() self.inference_method.on_optimization_end()
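A short sketch of calling the new explicit optimize signature (the model m is assumed to exist; the argument values are illustrative):
m.optimize(optimizer='lbfgs', messages=True, max_iters=500)
# Outside a notebook, or to avoid notebook widgets, keep the progress output plain:
m.optimize(messages=True, ipython_notebook=False)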


@ -45,4 +45,4 @@ class Model(ParamzModel, Priorizable):
(including the MAP prior), so we return it here. If your model is not (including the MAP prior), so we return it here. If your model is not
probabilistic, just return your *negative* gradient here! probabilistic, just return your *negative* gradient here!
""" """
return -(self._log_likelihood_gradients() + self._log_prior_gradients()) return -(self._log_likelihood_gradients() + self._log_prior_gradients())


@ -3,7 +3,7 @@
from .param import Param from .param import Param
from .parameterized import Parameterized from .parameterized import Parameterized
from paramz import transformations from . import transformations
from paramz.core import lists_and_dicts, index_operations, observable_array, observable from paramz.core import lists_and_dicts, index_operations, observable_array, observable
from paramz import ties_and_remappings, ObsAr from paramz import ties_and_remappings, ObsAr


@ -7,4 +7,4 @@ from paramz.transformations import __fixed__
import logging, numpy as np import logging, numpy as np
class Param(Param, Priorizable): class Param(Param, Priorizable):
pass pass


@ -49,4 +49,4 @@ class Parameterized(Parameterized, Priorizable):
If you want to operate on all parameters use m[''] to wildcard select all paramters If you want to operate on all parameters use m[''] to wildcard select all paramters
and concatenate them. Printing m[''] will result in printing of all parameters in detail. and concatenate them. Printing m[''] will result in printing of all parameters in detail.
""" """
pass pass


@ -1,4 +1,5 @@
# Copyright (c) 2014, Max Zwiessele, James Hensman # Copyright (c) 2014, Max Zwiessele, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
from paramz.transformations import * from paramz.transformations import *
from paramz.transformations import __fixed__


@ -44,7 +44,7 @@ class SparseGP(GP):
#pick a sensible inference method #pick a sensible inference method
if inference_method is None: if inference_method is None:
if isinstance(likelihood, likelihoods.Gaussian): if isinstance(likelihood, likelihoods.Gaussian):
inference_method = var_dtc.VarDTC(limit=1) inference_method = var_dtc.VarDTC(limit=3)
else: else:
#inference_method = ?? #inference_method = ??
raise NotImplementedError("what to do what to do?") raise NotImplementedError("what to do what to do?")
@ -113,85 +113,3 @@ class SparseGP(GP):
self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X) self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X)
self._Zgrad = self.Z.gradient.copy() self._Zgrad = self.Z.gradient.copy()
def _raw_predict(self, Xnew, full_cov=False, kern=None):
"""
Make a prediction for the latent function values.
For deterministic (certain) inputs we give back a full_cov of shape NxN;
if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is off,
we take only the diagonal elements across N.
For uncertain inputs, the SparseGP bound cannot predict the full covariance matrix full_cov for now.
The implementation of that will follow. However, for each dimension the
covariance changes, so if full_cov is False (standard), we return the variance
for each dimension [NxD].
"""
if kern is None: kern = self.kern
if not isinstance(Xnew, VariationalPosterior):
# Kx = kern.K(self._predictive_variable, Xnew)
# mu = np.dot(Kx.T, self.posterior.woodbury_vector)
# if full_cov:
# Kxx = kern.K(Xnew)
# if self.posterior.woodbury_inv.ndim == 2:
# var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
# elif self.posterior.woodbury_inv.ndim == 3:
# var = np.empty((Kxx.shape[0],Kxx.shape[1],self.posterior.woodbury_inv.shape[2]))
# for i in range(var.shape[2]):
# var[:, :, i] = (Kxx - mdot(Kx.T, self.posterior.woodbury_inv[:, :, i], Kx))
# var = var
# else:
# Kxx = kern.Kdiag(Xnew)
# if self.posterior.woodbury_inv.ndim == 2:
# var = (Kxx - np.sum(np.dot(self.posterior.woodbury_inv.T, Kx) * Kx, 0))[:,None]
# elif self.posterior.woodbury_inv.ndim == 3:
# var = np.empty((Kxx.shape[0],self.posterior.woodbury_inv.shape[2]))
# for i in range(var.shape[1]):
# var[:, i] = (Kxx - (np.sum(np.dot(self.posterior.woodbury_inv[:, :, i].T, Kx) * Kx, 0)))
# var = var
# #add in the mean function
# if self.mean_function is not None:
# mu += self.mean_function.f(Xnew)
mu, var = super(SparseGP, self)._raw_predict(Xnew, full_cov, kern)
else:
psi0_star = kern.psi0(self._predictive_variable, Xnew)
psi1_star = kern.psi1(self._predictive_variable, Xnew)
psi2_star = kern.psi2n(self._predictive_variable, Xnew)
la = self.posterior.woodbury_vector
mu = np.dot(psi1_star, la) # TODO: dimensions?
N,M,D = psi0_star.shape[0],psi1_star.shape[1], la.shape[1]
if full_cov:
raise NotImplementedError("Full covariance for Sparse GP predicted with uncertain inputs not implemented yet.")
var = np.zeros((Xnew.shape[0], la.shape[1], la.shape[1]))
di = np.diag_indices(la.shape[1])
else:
tmp = psi2_star - psi1_star[:,:,None]*psi1_star[:,None,:]
var = (tmp.reshape(-1,M).dot(la).reshape(N,M,D)*la[None,:,:]).sum(1) + psi0_star[:,None]
if self.posterior.woodbury_inv.ndim==2:
var += -psi2_star.reshape(N,-1).dot(self.posterior.woodbury_inv.flat)[:,None]
else:
var += -psi2_star.reshape(N,-1).dot(self.posterior.woodbury_inv.reshape(-1,D))
assert np.all(var>=-1e-5), "The predicted variance goes negative!: "+str(var)
var = np.clip(var,1e-15,np.inf)
# for i in range(Xnew.shape[0]):
# _mu, _var = Xnew.mean.values[[i]], Xnew.variance.values[[i]]
# psi2_star = kern.psi2(self._predictive_variable, NormalPosterior(_mu, _var))
# tmp = (psi2_star[:, :] - psi1_star[[i]].T.dot(psi1_star[[i]]))
#
# var_ = mdot(la.T, tmp, la)
# p0 = psi0_star[i]
# t = np.atleast_3d(self.posterior.woodbury_inv)
# t2 = np.trace(t.T.dot(psi2_star), axis1=1, axis2=2)
#
# if full_cov:
# var_[di] += p0
# var_[di] += -t2
# var[i] = var_
# else:
# var[i] = np.diag(var_)+p0-t2
return mu, var


@ -0,0 +1,26 @@
import GPy
import numpy as np
import matplotlib.pyplot as plt
import GPy.models.state_space_model as SS_model
X = np.linspace(0, 10, 2000)[:, None]
Y = np.sin(X) + np.random.randn(*X.shape)*0.1
kernel1 = GPy.kern.Matern32(X.shape[1])
m1 = GPy.models.GPRegression(X,Y, kernel1)
print(m1)
m1.optimize(optimizer='bfgs',messages=True)
print(m1)
kernel2 = GPy.kern.sde_Matern32(X.shape[1])
#m2 = SS_model.StateSpace(X,Y, kernel2)
m2 = GPy.models.StateSpace(X,Y, kernel2)
print(m2)
m2.optimize(optimizer='bfgs',messages=True)
print(m2)


@ -1,14 +1,13 @@
# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt). # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
from .posterior import Posterior from .posterior import PosteriorExact as Posterior
from ...util.linalg import pdinv, dpotrs, tdot from ...util.linalg import pdinv, dpotrs, tdot
from ...util import diag from ...util import diag
import numpy as np import numpy as np
from . import LatentFunctionInference from . import LatentFunctionInference
log_2_pi = np.log(2*np.pi) log_2_pi = np.log(2*np.pi)
class ExactGaussianInference(LatentFunctionInference): class ExactGaussianInference(LatentFunctionInference):
""" """
An object for inference when the likelihood is Gaussian. An object for inference when the likelihood is Gaussian.


@ -2,7 +2,8 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np import numpy as np
from ...util.linalg import pdinv, dpotrs, dpotri, symmetrify, jitchol from ...util.linalg import pdinv, dpotrs, dpotri, symmetrify, jitchol, dtrtrs, tdot
from GPy.core.parameterization.variational import VariationalPosterior
class Posterior(object): class Posterior(object):
""" """
@ -187,3 +188,85 @@ class Posterior(object):
if self._K_chol is None: if self._K_chol is None:
self._K_chol = jitchol(self._K) self._K_chol = jitchol(self._K)
return self._K_chol return self._K_chol
def _raw_predict(self, kern, Xnew, pred_var, full_cov=False):
woodbury_vector = self.woodbury_vector
woodbury_inv = self.woodbury_inv
if not isinstance(Xnew, VariationalPosterior):
Kx = kern.K(pred_var, Xnew)
mu = np.dot(Kx.T, woodbury_vector)
if len(mu.shape)==1:
mu = mu.reshape(-1,1)
if full_cov:
Kxx = kern.K(Xnew)
if woodbury_inv.ndim == 2:
var = Kxx - np.dot(Kx.T, np.dot(woodbury_inv, Kx))
elif woodbury_inv.ndim == 3: # Missing data
var = np.empty((Kxx.shape[0],Kxx.shape[1],woodbury_inv.shape[2]))
from ...util.linalg import mdot
for i in range(var.shape[2]):
var[:, :, i] = (Kxx - mdot(Kx.T, woodbury_inv[:, :, i], Kx))
var = var
else:
Kxx = kern.Kdiag(Xnew)
if woodbury_inv.ndim == 2:
var = (Kxx - np.sum(np.dot(woodbury_inv.T, Kx) * Kx, 0))[:,None]
elif woodbury_inv.ndim == 3: # Missing data
var = np.empty((Kxx.shape[0],woodbury_inv.shape[2]))
for i in range(var.shape[1]):
var[:, i] = (Kxx - (np.sum(np.dot(woodbury_inv[:, :, i].T, Kx) * Kx, 0)))
var = var
else:
psi0_star = kern.psi0(pred_var, Xnew)
psi1_star = kern.psi1(pred_var, Xnew)
psi2_star = kern.psi2n(pred_var, Xnew)
la = woodbury_vector
mu = np.dot(psi1_star, la) # TODO: dimensions?
N,M,D = psi0_star.shape[0],psi1_star.shape[1], la.shape[1]
if full_cov:
raise NotImplementedError("Full covariance for Sparse GP predicted with uncertain inputs not implemented yet.")
var = np.zeros((Xnew.shape[0], la.shape[1], la.shape[1]))
di = np.diag_indices(la.shape[1])
else:
tmp = psi2_star - psi1_star[:,:,None]*psi1_star[:,None,:]
var = (tmp.reshape(-1,M).dot(la).reshape(N,M,D)*la[None,:,:]).sum(1) + psi0_star[:,None]
if woodbury_inv.ndim==2:
var += -psi2_star.reshape(N,-1).dot(woodbury_inv.flat)[:,None]
else:
var += -psi2_star.reshape(N,-1).dot(woodbury_inv.reshape(-1,D))
var = np.clip(var,1e-15,np.inf)
return mu, var
class PosteriorExact(Posterior):
def _raw_predict(self, kern, Xnew, pred_var, full_cov=False):
Kx = kern.K(pred_var, Xnew)
mu = np.dot(Kx.T, self.woodbury_vector)
if len(mu.shape)==1:
mu = mu.reshape(-1,1)
if full_cov:
Kxx = kern.K(Xnew)
if self._woodbury_chol.ndim == 2:
tmp = dtrtrs(self._woodbury_chol, Kx)[0]
var = Kxx - tdot(tmp.T)
elif self._woodbury_chol.ndim == 3: # Missing data
var = np.empty((Kxx.shape[0],Kxx.shape[1],self._woodbury_chol.shape[2]))
for i in range(var.shape[2]):
tmp = dtrtrs(self._woodbury_chol[:,:,i], Kx)[0]
var[:, :, i] = (Kxx - tdot(tmp.T))
var = var
else:
Kxx = kern.Kdiag(Xnew)
if self._woodbury_chol.ndim == 2:
tmp = dtrtrs(self._woodbury_chol, Kx)[0]
var = (Kxx - np.square(tmp).sum(0))[:,None]
elif self._woodbury_chol.ndim == 3: # Missing data
var = np.empty((Kxx.shape[0],self._woodbury_chol.shape[2]))
for i in range(var.shape[1]):
tmp = dtrtrs(self._woodbury_chol[:,:,i], Kx)[0]
var[:, i] = (Kxx - np.square(tmp).sum(0))
var = var
return mu, var
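A hedged numpy/scipy sketch of why the Cholesky-based route in PosteriorExact (dtrtrs + tdot) yields the same predictive variance as the explicit woodbury_inv product used by the generic Posterior; all names below are local stand-ins, not GPy internals:
import numpy as np
from scipy.linalg import cholesky, solve_triangular
rng = np.random.RandomState(0)
A = rng.randn(6, 6)
W = np.dot(A, A.T) + 6 * np.eye(6)   # stand-in for the (SPD) woodbury matrix
W_chol = cholesky(W, lower=True)     # stand-in for self._woodbury_chol
Kx = rng.randn(6, 4)
Kxx = np.eye(4)
# Route 1: explicit inverse, as in Posterior._raw_predict
var_inv = Kxx - np.dot(Kx.T, np.dot(np.linalg.inv(W), Kx))
# Route 2: triangular solve against the Cholesky factor, as in PosteriorExact
tmp = solve_triangular(W_chol, Kx, lower=True)
var_chol = Kxx - np.dot(tmp.T, tmp)
assert np.allclose(var_inv, var_chol)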


@ -21,7 +21,7 @@ class VarDTC_minibatch(LatentFunctionInference):
""" """
const_jitter = 1e-8 const_jitter = 1e-8
def __init__(self, batchsize=None, limit=1, mpi_comm=None): def __init__(self, batchsize=None, limit=3, mpi_comm=None):
self.batchsize = batchsize self.batchsize = batchsize
self.mpi_comm = mpi_comm self.mpi_comm = mpi_comm


@ -37,16 +37,14 @@ class Metropolis_Hastings(object):
def sample(self, Ntotal=10000, Nburn=1000, Nthin=10, tune=True, tune_throughout=False, tune_interval=400): def sample(self, Ntotal=10000, Nburn=1000, Nthin=10, tune=True, tune_throughout=False, tune_interval=400):
current = self.model.optimizer_array current = self.model.optimizer_array
fcurrent = self.model.log_likelihood() + self.model.log_prior() + \ fcurrent = self.model.log_likelihood() + self.model.log_prior()
self.model._log_det_jacobian()
accepted = np.zeros(Ntotal,dtype=np.bool) accepted = np.zeros(Ntotal,dtype=np.bool)
for it in range(Ntotal): for it in range(Ntotal):
print("sample %d of %d\r"%(it,Ntotal),end="\t") print("sample %d of %d\r"%(it+1,Ntotal),end="")
sys.stdout.flush() sys.stdout.flush()
prop = np.random.multivariate_normal(current, self.cov*self.scale*self.scale) prop = np.random.multivariate_normal(current, self.cov*self.scale*self.scale)
self.model.optimizer_array = prop self.model.optimizer_array = prop
fprop = self.model.log_likelihood() + self.model.log_prior() + \ fprop = self.model.log_likelihood() + self.model.log_prior()
self.model._log_det_jacobian()
if fprop>fcurrent:#sample accepted, going 'uphill' if fprop>fcurrent:#sample accepted, going 'uphill'
accepted[it] = True accepted[it] = True


@ -1,5 +1,8 @@
from paramz.optimization import stochastics, Optimizer from paramz.optimization import Optimizer
from . import stochastics
from paramz.optimization import * from paramz.optimization import *
import sys import sys
sys.modules['GPy.inference.optimization.stochastics'] = stochastics sys.modules['GPy.inference.optimization.stochastics'] = stochastics
sys.modules['GPy.inference.optimization.Optimizer'] = Optimizer sys.modules['GPy.inference.optimization.Optimizer'] = Optimizer


@ -0,0 +1,119 @@
#===============================================================================
# Copyright (c) 2015, Max Zwiessele
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of paramax nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#===============================================================================
class StochasticStorage(object):
'''
This is a container for holding the stochastic parameters,
such as subset indices or step length and so on.
self.d has to be a list of lists:
[dimension indices, nan indices for those dimensions]
so that the minibatches can be used as efficiently as possible.
'''
def __init__(self, model):
"""
Initialize this stochastic container using the given model
"""
def do_stochastics(self):
"""
Update the internal state to the next batch of the stochastic
descent algorithm.
"""
pass
def reset(self):
"""
Reset the state of this stochastics generator.
"""
class SparseGPMissing(StochasticStorage):
def __init__(self, model, batchsize=1):
"""
Here we want to loop over all dimensions every time.
Thus, we just make sure the loop goes over self.d every time.
We try to group dimensions with the same missing-data pattern into batches,
which speeds up calculations significantly.
"""
import numpy as np
self.Y = model.Y_normalized
bdict = {}
#For N > 1000 array2string default crops
opt = np.get_printoptions()
np.set_printoptions(threshold=np.inf)
for d in range(self.Y.shape[1]):
inan = np.isnan(self.Y)[:, d]
arr_str = np.array2string(inan, np.inf, 0, True, '', formatter={'bool':lambda x: '1' if x else '0'})
try:
bdict[arr_str][0].append(d)
except:
bdict[arr_str] = [[d], ~inan]
np.set_printoptions(**opt)
self.d = bdict.values()
class SparseGPStochastics(StochasticStorage):
"""
For the sparse gp we need to store the dimension we are in,
and the indices corresponding to those
"""
def __init__(self, model, batchsize=1, missing_data=True):
self.batchsize = batchsize
self.output_dim = model.Y.shape[1]
self.Y = model.Y_normalized
self.missing_data = missing_data
self.reset()
self.do_stochastics()
def do_stochastics(self):
import numpy as np
if self.batchsize == 1:
self.current_dim = (self.current_dim+1)%self.output_dim
self.d = [[[self.current_dim], np.isnan(self.Y[:, self.current_dim]) if self.missing_data else None]]
else:
self.d = np.random.choice(self.output_dim, size=self.batchsize, replace=False)
bdict = {}
if self.missing_data:
opt = np.get_printoptions()
np.set_printoptions(threshold=np.inf)
for d in self.d:
inan = np.isnan(self.Y[:, d])
arr_str = np.array2string(inan,np.inf, 0,True, '',formatter={'bool':lambda x: '1' if x else '0'})
try:
bdict[arr_str][0].append(d)
except:
bdict[arr_str] = [[d], ~inan]
np.set_printoptions(**opt)
self.d = bdict.values()
else:
self.d = [[self.d, None]]
def reset(self):
self.current_dim = -1
self.d = None


@ -10,7 +10,7 @@ from .src.add import Add
from .src.prod import Prod from .src.prod import Prod
from .src.rbf import RBF from .src.rbf import RBF
from .src.linear import Linear, LinearFull from .src.linear import Linear, LinearFull
from .src.static import Bias, White, Fixed from .src.static import Bias, White, Fixed, WhiteHeteroscedastic
from .src.brownian import Brownian from .src.brownian import Brownian
from .src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine from .src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine
from .src.mlp import MLP from .src.mlp import MLP
@ -28,4 +28,12 @@ from .src.trunclinear import TruncLinear,TruncLinear_inf
from .src.splitKern import SplitKern,DEtime from .src.splitKern import SplitKern,DEtime
from .src.splitKern import DEtime as DiffGenomeKern from .src.splitKern import DEtime as DiffGenomeKern
from .src.spline import Spline from .src.spline import Spline
from .src.basis_funcs import LogisticBasisFuncKernel, LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel from .src.basis_funcs import LogisticBasisFuncKernel, LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel
from .src.sde_matern import sde_Matern32
from .src.sde_matern import sde_Matern52
from .src.sde_linear import sde_Linear
from .src.sde_standard_periodic import sde_StdPeriodic
from .src.sde_static import sde_White, sde_Bias
from .src.sde_stationary import sde_RBF,sde_Exponential,sde_RatQuad
from .src.sde_brownian import sde_Brownian


@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-
"""
Classes in this module enhance Brownian motion covariance function with the
Stochastic Differential Equation (SDE) functionality.
"""
from .brownian import Brownian
import numpy as np
class sde_Brownian(Brownian):
"""
Class provides extra functionality to transfer this covariance function into
SDE form.
Brownian motion kernel:
.. math::
k(x,y) = \sigma^2 min(x,y)
"""
def sde_update_gradient_full(self, gradients):
"""
Update gradient in the order in which parameters are represented in the
kernel
"""
self.variance.gradient = gradients[0]
def sde(self):
"""
Return the state space representation of the covariance.
"""
variance = float(self.variance.values) # the variance parameter of the Brownian motion kernel
F = np.array( ((0,1.0),(0,0) ))
L = np.array( ((1.0,),(0,)) )
Qc = np.array( ((variance,),) )
H = np.array( ((1.0,0),) )
Pinf = np.array( ( (0, -0.5*variance ), (-0.5*variance, 0) ) )
#P0 = Pinf.copy()
P0 = np.zeros((2,2))
#Pinf = np.array( ( (t0, 1.0), (1.0, 1.0/t0) ) ) * variance
dF = np.zeros((2,2,1))
dQc = np.ones( (1,1,1) )
dPinf = np.zeros((2,2,1))
dPinf[:,:,0] = np.array( ( (0, -0.5), (-0.5, 0) ) )
#dP0 = dPinf.copy()
dP0 = np.zeros((2,2,1))
return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)


@ -0,0 +1,64 @@
# -*- coding: utf-8 -*-
"""
Classes in this module enhance Linear covariance function with the
Stochastic Differential Equation (SDE) functionality.
"""
from .linear import Linear
import numpy as np
class sde_Linear(Linear):
"""
Class provides extra functionality to transfer this covariance function into
SDE form.
Linear kernel:
.. math::
k(x,y) = \sum_{i=1}^{input dim} \sigma^2_i x_iy_i
"""
def __init__(self, input_dim, X, variances=None, ARD=False, active_dims=None, name='linear'):
"""
Modify the init method, because one extra parameter is required. X - points
on the X axis.
"""
super(sde_Linear, self).__init__(input_dim, variances, ARD, active_dims, name)
self.t0 = np.min(X)
def sde_update_gradient_full(self, gradients):
"""
Update gradient in the order in which parameters are represented in the
kernel
"""
self.variances.gradient = gradients[0]
def sde(self):
"""
Return the state space representation of the covariance.
"""
variance = float(self.variances.values) # this is the initial variance in Bayesian linear regression
t0 = float(self.t0)
F = np.array( ((0,1.0),(0,0) ))
L = np.array( ((0,),(1.0,)) )
Qc = np.zeros((1,1))
H = np.array( ((1.0,0),) )
Pinf = np.zeros((2,2))
P0 = np.array( ( (t0**2, t0), (t0, 1) ) ) * variance
dF = np.zeros((2,2,1))
dQc = np.zeros( (1,1,1) )
dPinf = np.zeros((2,2,1))
dP0 = np.zeros((2,2,1))
dP0[:,:,0] = P0 / variance
return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)

GPy/kern/_src/sde_matern.py Normal file (135 lines)

@ -0,0 +1,135 @@
# -*- coding: utf-8 -*-
"""
Classes in this module enhance Matern covariance functions with the
Stochastic Differential Equation (SDE) functionality.
"""
from .stationary import Matern32
from .stationary import Matern52
import numpy as np
class sde_Matern32(Matern32):
"""
Class provides extra functionality to transfer this covariance function into
SDE form.
Matern 3/2 kernel:
.. math::
k(r) = \sigma^2 (1 + \sqrt{3} r) \exp(- \sqrt{3} r) \\ \\ \\ \\ \text{ where } r = \sqrt{\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\ell_i^2} }
"""
def sde_update_gradient_full(self, gradients):
"""
Update gradient in the order in which parameters are represented in the
kernel
"""
self.variance.gradient = gradients[0]
self.lengthscale.gradient = gradients[1]
def sde(self):
"""
Return the state space representation of the covariance.
"""
variance = float(self.variance.values)
lengthscale = float(self.lengthscale.values)
foo = np.sqrt(3.)/lengthscale
F = np.array(((0, 1.0), (-foo**2, -2*foo)))
L = np.array(( (0,), (1.0,) ))
Qc = np.array(((12.*np.sqrt(3) / lengthscale**3 * variance,),))
H = np.array(((1.0, 0),))
Pinf = np.array(((variance, 0.0), (0.0, 3.*variance/(lengthscale**2))))
P0 = Pinf.copy()
# Allocate space for the derivatives
dF = np.empty([F.shape[0],F.shape[1],2])
dQc = np.empty([Qc.shape[0],Qc.shape[1],2])
dPinf = np.empty([Pinf.shape[0],Pinf.shape[1],2])
# The partial derivatives
dFvariance = np.zeros((2,2))
dFlengthscale = np.array(((0,0), (6./lengthscale**3,2*np.sqrt(3)/lengthscale**2)))
dQcvariance = np.array((12.*np.sqrt(3)/lengthscale**3))
dQclengthscale = np.array((-3*12*np.sqrt(3)/lengthscale**4*variance))
dPinfvariance = np.array(((1,0),(0,3./lengthscale**2)))
dPinflengthscale = np.array(((0,0), (0,-6*variance/lengthscale**3)))
# Combine the derivatives
dF[:,:,0] = dFvariance
dF[:,:,1] = dFlengthscale
dQc[:,:,0] = dQcvariance
dQc[:,:,1] = dQclengthscale
dPinf[:,:,0] = dPinfvariance
dPinf[:,:,1] = dPinflengthscale
dP0 = dPinf.copy()
return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
class sde_Matern52(Matern52):
"""
Class provides extra functionality to transfer this covariance function into
SDE form.
Matern 5/2 kernel:
.. math::
k(r) = \sigma^2 (1 + \sqrt{5} r + \frac{5}{3}r^2) \exp(- \sqrt{5} r) \\ \\ \\ \\ \text{ where } r = \sqrt{\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\ell_i^2} }
"""
def sde_update_gradient_full(self, gradients):
"""
Update gradient in the order in which parameters are represented in the
kernel
"""
self.variance.gradient = gradients[0]
self.lengthscale.gradient = gradients[1]
def sde(self):
"""
Return the state space representation of the covariance.
"""
variance = float(self.variance.values)
lengthscale = float(self.lengthscale.values)
lamda = np.sqrt(5.0)/lengthscale
kappa = 5.0/3.0*variance/lengthscale**2
F = np.array(((0, 1,0), (0, 0, 1), (-lamda**3, -3.0*lamda**2, -3*lamda)))
L = np.array(((0,),(0,),(1,)))
Qc = np.array((((variance*400.0*np.sqrt(5.0)/3.0/lengthscale**5),),))
H = np.array(((1,0,0),))
Pinf = np.array(((variance,0,-kappa), (0, kappa, 0), (-kappa, 0, 25.0*variance/lengthscale**4)))
P0 = Pinf.copy()
# Allocate space for the derivatives
dF = np.empty((3,3,2))
dQc = np.empty((1,1,2))
dPinf = np.empty((3,3,2))
# The partial derivatives
dFvariance = np.zeros((3,3))
dFlengthscale = np.array(((0,0,0),(0,0,0),(15.0*np.sqrt(5.0)/lengthscale**4,
30.0/lengthscale**3, 3*np.sqrt(5.0)/lengthscale**2)))
dQcvariance = np.array((((400*np.sqrt(5)/3/lengthscale**5,),)))
dQclengthscale = np.array((((-variance*2000*np.sqrt(5)/3/lengthscale**6,),)))
dPinf_variance = Pinf/variance
kappa2 = -2.0*kappa/lengthscale
dPinf_lengthscale = np.array(((0,0,-kappa2),(0,kappa2,0),(-kappa2,
0,-100*variance/lengthscale**5)))
# Combine the derivatives
dF[:,:,0] = dFvariance
dF[:,:,1] = dFlengthscale
dQc[:,:,0] = dQcvariance
dQc[:,:,1] = dQclengthscale
dPinf[:,:,0] = dPinf_variance
dPinf[:,:,1] = dPinf_lengthscale
dP0 = dPinf.copy()
return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
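A minimal sketch of inspecting the state-space form returned by the new sde() methods, using sde_Matern32 (the hyperparameter values are illustrative; the expected shapes follow from the code above):
import numpy as np
import GPy
k = GPy.kern.sde_Matern32(1, variance=1.5, lengthscale=0.7)
F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0 = k.sde()
# Matern 3/2 has a two-dimensional state, and the derivative stacks carry
# one slice per kernel parameter (variance, lengthscale).
assert F.shape == (2, 2) and H.shape == (1, 2) and dF.shape == (2, 2, 2)
# The stationary covariance reproduces the kernel variance in the observed state.
assert np.isclose(Pinf[0, 0], 1.5)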


@ -0,0 +1,178 @@
# -*- coding: utf-8 -*-
"""
Classes in this module enhance Matern covariance functions with the
Stochastic Differential Equation (SDE) functionality.
"""
from .standard_periodic import StdPeriodic
import numpy as np
import scipy as sp
from scipy import special as special
class sde_StdPeriodic(StdPeriodic):
"""
Class provides extra functionality to transfer this covariance function into
SDE form.
Standard Periodic kernel:
.. math::
k(x,y) = \theta_1 \exp \left[ - \frac{1}{2} \sum_{i=1}^{input\_dim}
\left( \frac{\sin(\frac{\pi}{\lambda_i} (x_i - y_i) )}{l_i} \right)^2 \right]
"""
def sde_update_gradient_full(self, gradients):
"""
Update gradient in the order in which parameters are represented in the
kernel
"""
self.variance.gradient = gradients[0]
self.wavelengths.gradient = gradients[1]
self.lengthscales.gradient = gradients[2]
def sde(self):
"""
Return the state space representation of the covariance.
! Note: one must constrain the lengthscale not to drop below 0.25;
below this the Bessel functions of the first kind grow very large.
! Note: one must also keep the wavelength from becoming very small, because then
the gradients wrt the wavelength become unstable.
However, this might depend on the data. For a test example with
300 data points the lower limit is 0.15.
"""
# Params to use: (in that order)
#self.variance
#self.wavelengths
#self.lengthscales
N = 7 # approximation order
w0 = 2*np.pi/self.wavelengths # frequency
lengthscales = 2*self.lengthscales
[q2,dq2l] = seriescoeff(N,lengthscales,self.variance)
# lengthscale is multiplied by 2 because of slightly different
# formula for periodic covariance function.
# For the same reason:
dq2l = 2*dq2l
if np.any( np.isfinite(q2) == False):
raise ValueError("SDE periodic covariance error 1")
if np.any( np.isfinite(dq2l) == False):
raise ValueError("SDE periodic covariance error 2")
F = np.kron(np.diag(range(0,N+1)),np.array( ((0, -w0), (w0, 0)) ) )
L = np.eye(2*(N+1))
Qc = np.zeros((2*(N+1), 2*(N+1)))
P_inf = np.kron(np.diag(q2),np.eye(2))
H = np.kron(np.ones((1,N+1)),np.array((1,0)) )
P0 = P_inf.copy()
# Derivatives
dF = np.empty((F.shape[0], F.shape[1], 3))
dQc = np.empty((Qc.shape[0], Qc.shape[1], 3))
dP_inf = np.empty((P_inf.shape[0], P_inf.shape[1], 3))
# Derivatives wrt self.variance
dF[:,:,0] = np.zeros(F.shape)
dQc[:,:,0] = np.zeros(Qc.shape)
dP_inf[:,:,0] = P_inf / self.variance
# Derivatives self.wavelengths
dF[:,:,1] = np.kron(np.diag(range(0,N+1)),np.array( ((0, w0), (-w0, 0)) ) / self.wavelengths );
dQc[:,:,1] = np.zeros(Qc.shape)
dP_inf[:,:,1] = np.zeros(P_inf.shape)
# Derivatives self.lengthscales
dF[:,:,2] = np.zeros(F.shape)
dQc[:,:,2] = np.zeros(Qc.shape)
dP_inf[:,:,2] = np.kron(np.diag(dq2l),np.eye(2))
dP0 = dP_inf.copy()
return (F, L, Qc, H, P_inf, P0, dF, dQc, dP_inf, dP0)
def seriescoeff(m=6,lengthScale=1.0,magnSigma2=1.0, true_covariance=False):
"""
Calculate the coefficients q_j^2 for the covariance function
approximation:
k(\tau) = \sum_{j=0}^{+\infty} q_j^2 \cos(j\omega_0 \tau)
Reference is:
[1] Arno Solin and Simo Särkkä (2014). Explicit link between periodic
covariance functions and state space models. In Proceedings of the
Seventeenth International Conference on Artificial Intelligence and
Statistics (AISTATS 2014). JMLR: W&CP, volume 33.
Note! Only the infinite approximation (through Bessel function)
is currently implemented.
Input:
----------------
m: int
Degree of approximation. Default 6.
lengthScale: float
Length scale parameter in the kernel
magnSigma2:float
Multiplier in front of the kernel.
Output:
-----------------
coeffs: array(m+1)
Covariance series coefficients
coeffs_dl: array(m+1)
Derivatives of the coefficients with respect to lengthscale.
"""
if true_covariance:
bb = lambda j,m: (1.0 + np.array((j != 0), dtype=np.float64) ) / (2**(j)) *\
sp.special.binom(j, sp.floor( (j-m)/2.0 * np.array(m<=j, dtype=np.float64) ))*\
np.array(m<=j, dtype=np.float64) *np.array(sp.mod(j-m,2)==0, dtype=np.float64)
M,J = np.meshgrid(range(0,m+1),range(0,m+1))
coeffs = bb(J,M) / sp.misc.factorial(J) * sp.exp( -lengthScale**(-2) ) *\
(lengthScale**(-2))**J *magnSigma2
coeffs_dl = np.sum( coeffs*lengthScale**(-3)*(2.0-2.0*J*lengthScale**2),0)
coeffs = np.sum(coeffs,0)
else:
coeffs = 2*magnSigma2*sp.exp( -lengthScale**(-2) ) * special.iv(range(0,m+1),1.0/lengthScale**(2))
if np.any( np.isfinite(coeffs) == False):
raise ValueError("sde_standard_periodic: Coefficients are not finite!")
#import pdb; pdb.set_trace()
coeffs[0] = 0.5*coeffs[0]
# Derivatives wrt (lengthScale)
coeffs_dl = np.zeros(m+1)
coeffs_dl[1:] = magnSigma2*lengthScale**(-3) * sp.exp(-lengthScale**(-2))*\
(-4*special.iv(range(0,m),lengthScale**(-2)) + 4*(1+np.arange(1,m+1)*lengthScale**(2))*special.iv(range(1,m+1),lengthScale**(-2)) )
# The first element
coeffs_dl[0] = magnSigma2*lengthScale**(-3) * np.exp(-lengthScale**(-2))*\
(2*special.iv(0,lengthScale**(-2)) - 2*special.iv(1,lengthScale**(-2)) )
return coeffs, coeffs_dl
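A hedged sanity check of the series approximation implemented by seriescoeff: at tau = 0 the truncated expansion k(0) = sum_j q_j^2 should approximately recover the kernel variance magnSigma2 (the import path below is an assumption; adjust it to wherever this module lives in the package):
import numpy as np
from GPy.kern._src.sde_standard_periodic import seriescoeff  # path assumed
coeffs, coeffs_dl = seriescoeff(m=6, lengthScale=1.0, magnSigma2=2.0)
assert np.isclose(coeffs.sum(), 2.0, atol=1e-3)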

GPy/kern/_src/sde_static.py Normal file (101 lines)

@ -0,0 +1,101 @@
# -*- coding: utf-8 -*-
"""
Classes in this module enhance Static covariance functions with the
Stochastic Differential Equation (SDE) functionality.
"""
from .static import White
from .static import Bias
import numpy as np
class sde_White(White):
"""
Class provides extra functionality to transfer this covariance function into
SDE form.
White kernel:
.. math::
k(x,y) = \alpha*\delta(x-y)
"""
def sde_update_gradient_full(self, gradients):
"""
Update gradient in the order in which parameters are represented in the
kernel
"""
self.variance.gradient = gradients[0]
def sde(self):
"""
Return the state space representation of the covariance.
"""
variance = float(self.variance.values)
F = np.array( ((-np.inf,),) )
L = np.array( ((1.0,),) )
Qc = np.array( ((variance,),) )
H = np.array( ((1.0,),) )
Pinf = np.array( ((variance,),) )
P0 = Pinf.copy()
dF = np.zeros((1,1,1))
dQc = np.zeros((1,1,1))
dQc[:,:,0] = np.array( ((1.0,),) )
dPinf = np.zeros((1,1,1))
dPinf[:,:,0] = np.array( ((1.0,),) )
dP0 = dPinf.copy()
return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
class sde_Bias(Bias):
"""
Class provides extra functionality to transfer this covariance function into
SDE form.
Bias kernel:
.. math::
k(x,y) = \alpha
"""
def sde_update_gradient_full(self, gradients):
"""
Update gradient in the order in which parameters are represented in the
kernel
"""
self.variance.gradient = gradients[0]
def sde(self):
"""
Return the state space representation of the covariance.
"""
variance = float(self.variance.values)
F = np.array( ((0.0,),))
L = np.array( ((1.0,),))
Qc = np.zeros((1,1))
H = np.array( ((1.0,),))
Pinf = np.zeros((1,1))
P0 = np.array( ((variance,),) )
dF = np.zeros((1,1,1))
dQc = np.zeros((1,1,1))
dPinf = np.zeros((1,1,1))
dP0 = np.zeros((1,1,1))
dP0[:,:,0] = np.array( ((1.0,),) )
return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)


@ -0,0 +1,190 @@
# -*- coding: utf-8 -*-
"""
Classes in this module enhance several stationary covariance functions with the
Stochastic Differential Equation (SDE) functionality.
"""
from .rbf import RBF
from .stationary import Exponential
from .stationary import RatQuad
import numpy as np
import scipy as sp
class sde_RBF(RBF):
"""
Class provides extra functionality to transfer this covariance function into
SDE form.
Radial Basis Function kernel:
.. math::
k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \\ \\ \\ \\ \text{ where } r = \sqrt{\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\ell_i^2} }
"""
def sde_update_gradient_full(self, gradients):
"""
Update gradient in the order in which parameters are represented in the
kernel
"""
self.variance.gradient = gradients[0]
self.lengthscale.gradient = gradients[1]
def sde(self):
"""
Return the state space representation of the covariance.
"""
N = 10# approximation order ( number of terms in exponent series expansion)
roots_rounding_decimals = 6
fn = np.math.factorial(N)
kappa = 1.0/2.0/self.lengthscale**2
Qc = np.array((self.variance*np.sqrt(np.pi/kappa)*fn*(4*kappa)**N,),)
pp = np.zeros((2*N+1,)) # array of polynomial coefficients from higher power to lower
for n in range(0, N+1): # (2N+1) - number of polynomial coefficients
pp[2*(N-n)] = fn*(4.0*kappa)**(N-n)/np.math.factorial(n)*(-1)**n
pp = sp.poly1d(pp)
roots = sp.roots(pp)
neg_real_part_roots = roots[np.round(np.real(roots) ,roots_rounding_decimals) < 0]
aa = sp.poly1d(neg_real_part_roots, r=True).coeffs
F = np.diag(np.ones((N-1,)),1)
F[-1,:] = -aa[-1:0:-1]
L= np.zeros((N,1))
L[N-1,0] = 1
H = np.zeros((1,N))
H[0,0] = 1
# Infinite covariance:
Pinf = sp.linalg.solve_lyapunov(F, -np.dot(L,np.dot( Qc[0,0],L.T)))
Pinf = 0.5*(Pinf + Pinf.T)
# Allocating space for derivatives
dF = np.empty([F.shape[0],F.shape[1],2])
dQc = np.empty([Qc.shape[0],Qc.shape[1],2])
dPinf = np.empty([Pinf.shape[0],Pinf.shape[1],2])
# Derivatives:
dFvariance = np.zeros(F.shape)
dFlengthscale = np.zeros(F.shape)
dFlengthscale[-1,:] = -aa[-1:0:-1]/self.lengthscale * np.arange(-N,0,1)
dQcvariance = Qc/self.variance
dQclengthscale = np.array(((self.variance*np.sqrt(2*np.pi)*fn*2**N*self.lengthscale**(-2*N)*(1-2*N,),)))
dPinf_variance = Pinf/self.variance
lp = Pinf.shape[0]
coeff = np.arange(1,lp+1).reshape(lp,1) + np.arange(1,lp+1).reshape(1,lp) - 2
coeff[np.mod(coeff,2) != 0] = 0
dPinf_lengthscale = -1/self.lengthscale*Pinf*coeff
dF[:,:,0] = dFvariance
dF[:,:,1] = dFlengthscale
dQc[:,:,0] = dQcvariance
dQc[:,:,1] = dQclengthscale
dPinf[:,:,0] = dPinf_variance
dPinf[:,:,1] = dPinf_lengthscale
P0 = Pinf.copy()
dP0 = dPinf.copy()
# Benefits of this are not very sound. Helps only in one case:
# SVD Kalman + RBF kernel
import GPy.models.state_space_main as ssm
(F, L, Qc, H, Pinf, P0, dF, dQc, dPinf,dP0, T) = ssm.balance_ss_model(F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0 )
return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
class sde_Exponential(Exponential):
"""
Class provides extra functionality to transfer this covariance function into
SDE form.
Exponential kernel:
.. math::
k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r \\bigg) \\ \\ \\ \\ \text{ where } r = \sqrt{\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\ell_i^2} }
"""
def sde_update_gradient_full(self, gradients):
"""
Update gradient in the order in which parameters are represented in the
kernel
"""
self.variance.gradient = gradients[0]
self.lengthscale.gradient = gradients[1]
def sde(self):
"""
Return the state space representation of the covariance.
"""
variance = float(self.variance.values)
lengthscale = float(self.lengthscale)
F = np.array(((-1.0/lengthscale,),))
L = np.array(((1.0,),))
Qc = np.array( ((2.0*variance/lengthscale,),) )
H = np.array(((1.0,),))
Pinf = np.array(((variance,),))
P0 = Pinf.copy()
dF = np.zeros((1,1,2));
dQc = np.zeros((1,1,2));
dPinf = np.zeros((1,1,2));
dF[:,:,0] = 0.0
dF[:,:,1] = 1.0/lengthscale**2
dQc[:,:,0] = 2.0/lengthscale
dQc[:,:,1] = -2.0*variance/lengthscale**2
dPinf[:,:,0] = 1.0
dPinf[:,:,1] = 0.0
dP0 = dPinf.copy()
return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
class sde_RatQuad(RatQuad):
"""
Class provides extra functionality to transfer this covariance function into
SDE form.
Rational Quadratic kernel:
.. math::
k(r) = \sigma^2 \\bigg( 1 + \\frac{r^2}{2} \\bigg)^{- \alpha} \\ \\ \\ \\ \text{ where } r = \sqrt{\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\ell_i^2} }
"""
def sde(self):
"""
Return the state space representation of the covariance.
"""
assert False, 'Not Implemented'
# Params to use:
# self.lengthscale
# self.variance
#self.power
#return (F, L, Qc, H, Pinf, dF, dQc, dPinf)


@ -162,4 +162,4 @@ class ODE_t(Kern):
self.lengthscale_Yt.gradient = np.sum(dkYdlent*(-0.5*self.lengthscale_Yt**(-2)) * dL_dK) self.lengthscale_Yt.gradient = np.sum(dkYdlent*(-0.5*self.lengthscale_Yt**(-2)) * dL_dK)
self.ubias.gradient = np.sum(dkdubias * dL_dK) self.ubias.gradient = np.sum(dkdubias * dL_dK)


@ -1 +1 @@
from . import psi_comp from . import psi_comp


@ -19,8 +19,8 @@ class Add(CombinationKernel):
if isinstance(kern, Add): if isinstance(kern, Add):
del subkerns[i] del subkerns[i]
for part in kern.parts[::-1]: for part in kern.parts[::-1]:
kern.unlink_parameter(part) #kern.unlink_parameter(part)
subkerns.insert(i, part) subkerns.insert(i, part.copy())
super(Add, self).__init__(subkerns, name) super(Add, self).__init__(subkerns, name)
self._exact_psicomp = self._check_exact_psicomp() self._exact_psicomp = self._check_exact_psicomp()
@ -37,7 +37,7 @@ class Add(CombinationKernel):
else: else:
return False return False
@Cache_this(limit=2, force_kwargs=['which_parts']) @Cache_this(limit=3, force_kwargs=['which_parts'])
def K(self, X, X2=None, which_parts=None): def K(self, X, X2=None, which_parts=None):
""" """
Add all kernels together. Add all kernels together.
@ -51,7 +51,7 @@ class Add(CombinationKernel):
which_parts = [which_parts] which_parts = [which_parts]
return reduce(np.add, (p.K(X, X2) for p in which_parts)) return reduce(np.add, (p.K(X, X2) for p in which_parts))
@Cache_this(limit=2, force_kwargs=['which_parts']) @Cache_this(limit=3, force_kwargs=['which_parts'])
def Kdiag(self, X, which_parts=None): def Kdiag(self, X, which_parts=None):
if which_parts is None: if which_parts is None:
which_parts = self.parts which_parts = self.parts
@ -98,17 +98,17 @@ class Add(CombinationKernel):
[target.__iadd__(p.gradients_XX_diag(dL_dKdiag, X)) for p in self.parts] [target.__iadd__(p.gradients_XX_diag(dL_dKdiag, X)) for p in self.parts]
return target return target
@Cache_this(limit=1, force_kwargs=['which_parts']) @Cache_this(limit=3, force_kwargs=['which_parts'])
def psi0(self, Z, variational_posterior): def psi0(self, Z, variational_posterior):
if not self._exact_psicomp: return Kern.psi0(self,Z,variational_posterior) if not self._exact_psicomp: return Kern.psi0(self,Z,variational_posterior)
return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts)) return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts))
@Cache_this(limit=1, force_kwargs=['which_parts']) @Cache_this(limit=3, force_kwargs=['which_parts'])
def psi1(self, Z, variational_posterior): def psi1(self, Z, variational_posterior):
if not self._exact_psicomp: return Kern.psi1(self,Z,variational_posterior) if not self._exact_psicomp: return Kern.psi1(self,Z,variational_posterior)
return reduce(np.add, (p.psi1(Z, variational_posterior) for p in self.parts)) return reduce(np.add, (p.psi1(Z, variational_posterior) for p in self.parts))
@Cache_this(limit=1, force_kwargs=['which_parts']) @Cache_this(limit=3, force_kwargs=['which_parts'])
def psi2(self, Z, variational_posterior): def psi2(self, Z, variational_posterior):
if not self._exact_psicomp: return Kern.psi2(self,Z,variational_posterior) if not self._exact_psicomp: return Kern.psi2(self,Z,variational_posterior)
psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts)) psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts))
@ -144,7 +144,7 @@ class Add(CombinationKernel):
raise NotImplementedError("psi2 cannot be computed for this kernel") raise NotImplementedError("psi2 cannot be computed for this kernel")
return psi2 return psi2
@Cache_this(limit=1, force_kwargs=['which_parts']) @Cache_this(limit=3, force_kwargs=['which_parts'])
def psi2n(self, Z, variational_posterior): def psi2n(self, Z, variational_posterior):
if not self._exact_psicomp: return Kern.psi2n(self, Z, variational_posterior) if not self._exact_psicomp: return Kern.psi2n(self, Z, variational_posterior)
psi2 = reduce(np.add, (p.psi2n(Z, variational_posterior) for p in self.parts)) psi2 = reduce(np.add, (p.psi2n(Z, variational_posterior) for p in self.parts))
@ -241,16 +241,20 @@ class Add(CombinationKernel):
[np.add(target_grads[i],grads[i],target_grads[i]) for i in range(len(grads))] [np.add(target_grads[i],grads[i],target_grads[i]) for i in range(len(grads))]
return target_grads return target_grads
def add(self, other): #def add(self, other):
if isinstance(other, Add): # parts = self.parts
other_params = other.parameters[:] # if 0:#isinstance(other, Add):
for p in other_params: # #other_params = other.parameters[:]
other.unlink_parameter(p) # for p in other.parts[:]:
self.link_parameters(*other_params) # other.unlink_parameter(p)
else: # parts.extend(other.parts)
self.link_parameter(other) # #self.link_parameters(*other_params)
self.input_dim, self._all_dims_active = self.get_input_dim_active_dims(self.parts) #
return self # else:
# #self.link_parameter(other)
# parts.append(other)
# #self.input_dim, self._all_dims_active = self.get_input_dim_active_dims(parts)
# return Add([p for p in parts], self.name)
def input_sensitivity(self, summarize=True): def input_sensitivity(self, summarize=True):
if summarize: if summarize:
@ -259,4 +263,94 @@ class Add(CombinationKernel):
i_s[k._all_dims_active] += k.input_sensitivity(summarize) i_s[k._all_dims_active] += k.input_sensitivity(summarize)
return i_s return i_s
else: else:
return super(Add, self).input_sensitivity(summarize) return super(Add, self).input_sensitivity(summarize)
def sde_update_gradient_full(self, gradients):
"""
Update gradient in the order in which parameters are represented in the
kernel
"""
part_start_param_index = 0
for p in self.parts:
if not p.is_fixed:
part_param_num = len(p.param_array) # number of parameters in the part
p.sde_update_gradient_full(gradients[part_start_param_index:(part_start_param_index+part_param_num)])
part_start_param_index += part_param_num
def sde(self):
"""
Support adding kernels for sde representation
"""
import scipy.linalg as la
F = None
L = None
Qc = None
H = None
Pinf = None
P0 = None
dF = None
dQc = None
dPinf = None
dP0 = None
n = 0
nq = 0
nd = 0
# Assign models
for p in self.parts:
(Ft,Lt,Qct,Ht,Pinft,P0t,dFt,dQct,dPinft,dP0t) = p.sde()
F = la.block_diag(F,Ft) if (F is not None) else Ft
L = la.block_diag(L,Lt) if (L is not None) else Lt
Qc = la.block_diag(Qc,Qct) if (Qc is not None) else Qct
H = np.hstack((H,Ht)) if (H is not None) else Ht
Pinf = la.block_diag(Pinf,Pinft) if (Pinf is not None) else Pinft
P0 = la.block_diag(P0,P0t) if (P0 is not None) else P0t
if dF is not None:
dF = np.pad(dF,((0,dFt.shape[0]),(0,dFt.shape[1]),(0,dFt.shape[2])),
'constant', constant_values=0)
dF[-dFt.shape[0]:,-dFt.shape[1]:,-dFt.shape[2]:] = dFt
else:
dF = dFt
if dQc is not None:
dQc = np.pad(dQc,((0,dQct.shape[0]),(0,dQct.shape[1]),(0,dQct.shape[2])),
'constant', constant_values=0)
dQc[-dQct.shape[0]:,-dQct.shape[1]:,-dQct.shape[2]:] = dQct
else:
dQc = dQct
if dPinf is not None:
dPinf = np.pad(dPinf,((0,dPinft.shape[0]),(0,dPinft.shape[1]),(0,dPinft.shape[2])),
'constant', constant_values=0)
dPinf[-dPinft.shape[0]:,-dPinft.shape[1]:,-dPinft.shape[2]:] = dPinft
else:
dPinf = dPinft
if dP0 is not None:
dP0 = np.pad(dP0,((0,dP0t.shape[0]),(0,dP0t.shape[1]),(0,dP0t.shape[2])),
'constant', constant_values=0)
dP0[-dP0t.shape[0]:,-dP0t.shape[1]:,-dP0t.shape[2]:] = dP0t
else:
dP0 = dP0t
n += Ft.shape[0]
nq += Qct.shape[0]
nd += dFt.shape[2]
assert (F.shape[0] == n and F.shape[1]==n), "SDE add: Check of F Dimensions failed"
assert (L.shape[0] == n and L.shape[1]==nq), "SDE add: Check of L Dimensions failed"
assert (Qc.shape[0] == nq and Qc.shape[1]==nq), "SDE add: Check of Qc Dimensions failed"
assert (H.shape[0] == 1 and H.shape[1]==n), "SDE add: Check of H Dimensions failed"
assert (Pinf.shape[0] == n and Pinf.shape[1]==n), "SDE add: Check of Pinf Dimensions failed"
assert (P0.shape[0] == n and P0.shape[1]==n), "SDE add: Check of P0 Dimensions failed"
assert (dF.shape[0] == n and dF.shape[1]==n and dF.shape[2]==nd), "SDE add: Check of dF Dimensions failed"
assert (dQc.shape[0] == nq and dQc.shape[1]==nq and dQc.shape[2]==nd), "SDE add: Check of dQc Dimensions failed"
assert (dPinf.shape[0] == n and dPinf.shape[1]==n and dPinf.shape[2]==nd), "SDE add: Check of dPinf Dimensions failed"
assert (dP0.shape[0] == n and dP0.shape[1]==n and dP0.shape[2]==nd), "SDE add: Check of dP0 Dimensions failed"
return (F,L,Qc,H,Pinf,P0,dF,dQc,dPinf,dP0)
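For orientation, the block-diagonal stacking performed above can be reproduced in a few lines of plain numpy/scipy. This is an illustrative standalone sketch; the helper ou_ss and all numeric values are invented for the example and are not part of the kernel code.

import numpy as np
import scipy.linalg as la

def ou_ss(variance, lengthscale):
    # state-space form of a scalar Ornstein-Uhlenbeck (Exponential) kernel
    F = np.array([[-1.0 / lengthscale]])
    L = np.array([[1.0]])
    Qc = np.array([[2.0 * variance / lengthscale]])
    H = np.array([[1.0]])
    Pinf = np.array([[variance]])
    return F, L, Qc, H, Pinf

F1, L1, Qc1, H1, P1 = ou_ss(1.0, 0.5)
F2, L2, Qc2, H2, P2 = ou_ss(2.0, 3.0)

F = la.block_diag(F1, F2)      # dynamics of the summed kernel stack block-diagonally
L = la.block_diag(L1, L2)
Qc = la.block_diag(Qc1, Qc2)
Pinf = la.block_diag(P1, P2)
H = np.hstack((H1, H2))        # the observation sums the two latent processes

# the variance of the summed kernel at lag zero is the sum of the parts' variances
assert np.isclose(H.dot(Pinf).dot(H.T)[0, 0], 1.0 + 2.0)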

View file

@ -64,7 +64,7 @@ class EQ_ODE2(Kern):
self.W = Param('W', W) self.W = Param('W', W)
self.link_parameters(self.lengthscale, self.C, self.B, self.W) self.link_parameters(self.lengthscale, self.C, self.B, self.W)
@Cache_this(limit=2) @Cache_this(limit=3)
def K(self, X, X2=None): def K(self, X, X2=None):
#This way is not working, indexes are lost after using k._slice_X #This way is not working, indexes are lost after using k._slice_X
#index = np.asarray(X, dtype=np.int) #index = np.asarray(X, dtype=np.int)

View file

@ -56,14 +56,18 @@ class IndependentOutputs(CombinationKernel):
self.single_kern = False self.single_kern = False
self.kern = kernels self.kern = kernels
super(IndependentOutputs, self).__init__(kernels=kernels, extra_dims=[index_dim], name=name) super(IndependentOutputs, self).__init__(kernels=kernels, extra_dims=[index_dim], name=name)
self.index_dim = index_dim # The combination kernel ALWAYS puts the extra dimension last.
# Thus, the index dimension of this kernel is always the last dimension
# after slicing. This is why the index_dim is just the last column:
self.index_dim = -1
def K(self,X ,X2=None): def K(self,X ,X2=None):
slices = index_to_slices(X[:,self.index_dim]) slices = index_to_slices(X[:,self.index_dim])
kerns = itertools.repeat(self.kern) if self.single_kern else self.kern kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
if X2 is None: if X2 is None:
target = np.zeros((X.shape[0], X.shape[0])) target = np.zeros((X.shape[0], X.shape[0]))
[[target.__setitem__((s,ss), kern.K(X[s,:], X[ss,:])) for s,ss in itertools.product(slices_i, slices_i)] for kern, slices_i in zip(kerns, slices)] #[[target.__setitem__((s,ss), kern.K(X[s,:], X[ss,:])) for s,ss in itertools.product(slices_i, slices_i)] for kern, slices_i in zip(kerns, slices)]
[[target.__setitem__((s,ss), kern.K(X[s,:]) if s==ss else kern.K(X[s,:], X[ss,:])) for s,ss in itertools.product(slices_i, slices_i)] for kern, slices_i in zip(kerns, slices)]
else: else:
slices2 = index_to_slices(X2[:,self.index_dim]) slices2 = index_to_slices(X2[:,self.index_dim])
target = np.zeros((X.shape[0], X2.shape[0])) target = np.zeros((X.shape[0], X2.shape[0]))
@ -103,13 +107,10 @@ class IndependentOutputs(CombinationKernel):
target = np.zeros(X.shape) target = np.zeros(X.shape)
kerns = itertools.repeat(self.kern) if self.single_kern else self.kern kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
if X2 is None: if X2 is None:
# TODO: make use of index_to_slices
# FIXME: Broken as X is already sliced out
# print("Warning, gradients_X may not be working, I believe X has already been sliced out by the slicer!")
values = np.unique(X[:,self.index_dim]) values = np.unique(X[:,self.index_dim])
slices = [X[:,self.index_dim]==i for i in values] slices = [X[:,self.index_dim]==i for i in values]
[target.__setitem__(s, kern.gradients_X(dL_dK[s,s],X[s],None)) for kern, s in zip(kerns, slices):
for kern, s in zip(kerns, slices)] target[s] += kern.gradients_X(dL_dK[s, :][:, s],X[s], None)
#slices = index_to_slices(X[:,self.index_dim]) #slices = index_to_slices(X[:,self.index_dim])
#[[np.add(target[s], kern.gradients_X(dL_dK[s,s], X[s]), out=target[s]) #[[np.add(target[s], kern.gradients_X(dL_dK[s,s], X[s]), out=target[s])
# for s in slices_i] for kern, slices_i in zip(kerns, slices)] # for s in slices_i] for kern, slices_i in zip(kerns, slices)]
@ -121,8 +122,8 @@ class IndependentOutputs(CombinationKernel):
values = np.unique(X[:,self.index_dim]) values = np.unique(X[:,self.index_dim])
slices = [X[:,self.index_dim]==i for i in values] slices = [X[:,self.index_dim]==i for i in values]
slices2 = [X2[:,self.index_dim]==i for i in values] slices2 = [X2[:,self.index_dim]==i for i in values]
[target.__setitem__(s, kern.gradients_X(dL_dK[s, :][:, s2],X[s],X2[s2])) for kern, s, s2 in zip(kerns, slices, slices2):
for kern, s, s2 in zip(kerns, slices, slices2)] target[s] += kern.gradients_X(dL_dK[s, :][:, s2],X[s],X2[s2])
# TODO: make work with index_to_slices # TODO: make work with index_to_slices
#slices = index_to_slices(X[:,self.index_dim]) #slices = index_to_slices(X[:,self.index_dim])
#slices2 = index_to_slices(X2[:,self.index_dim]) #slices2 = index_to_slices(X2[:,self.index_dim])
@ -133,7 +134,9 @@ class IndependentOutputs(CombinationKernel):
slices = index_to_slices(X[:,self.index_dim]) slices = index_to_slices(X[:,self.index_dim])
kerns = itertools.repeat(self.kern) if self.single_kern else self.kern kerns = itertools.repeat(self.kern) if self.single_kern else self.kern
target = np.zeros(X.shape) target = np.zeros(X.shape)
[[target.__setitem__(s, kern.gradients_X_diag(dL_dKdiag[s],X[s])) for s in slices_i] for kern, slices_i in zip(kerns, slices)] for kern, slices_i in zip(kerns, slices):
for s in slices_i:
target[s] += kern.gradients_X_diag(dL_dKdiag[s],X[s])
return target return target
def update_gradients_diag(self, dL_dKdiag, X): def update_gradients_diag(self, dL_dKdiag, X):

View file

@ -49,10 +49,11 @@ class Kern(Parameterized):
if active_dims is None: if active_dims is None:
active_dims = np.arange(input_dim) active_dims = np.arange(input_dim)
self.active_dims = active_dims self.active_dims = np.asarray(active_dims, np.int_)
self._all_dims_active = np.atleast_1d(active_dims).astype(int)
assert self._all_dims_active.size == self.input_dim, "input_dim={} does not match len(active_dim)={}, _all_dims_active={}".format(self.input_dim, self._all_dims_active.size, self._all_dims_active) self._all_dims_active = np.atleast_1d(self.active_dims).astype(int)
assert self.active_dims.size == self.input_dim, "input_dim={} does not match len(active_dim)={}".format(self.input_dim, self._all_dims_active.size)
self._sliced_X = 0 self._sliced_X = 0
self.useGPU = self._support_GPU and useGPU self.useGPU = self._support_GPU and useGPU
@ -68,9 +69,12 @@ class Kern(Parameterized):
def _effective_input_dim(self): def _effective_input_dim(self):
return np.size(self._all_dims_active) return np.size(self._all_dims_active)
@Cache_this(limit=20) @Cache_this(limit=3)
def _slice_X(self, X): def _slice_X(self, X):
return X[:, self._all_dims_active] try:
return X[:, self._all_dims_active].astype('float')
except:
return X[:, self._all_dims_active]
def K(self, X, X2): def K(self, X, X2):
""" """
@ -296,12 +300,11 @@ class Kern(Parameterized):
return Prod([self, other], name) return Prod([self, other], name)
def _check_input_dim(self, X): def _check_input_dim(self, X):
assert X.shape[1] == self.input_dim, "{} did not specify _all_dims_active and X has wrong shape: X_dim={}, whereas input_dim={}".format(self.name, X.shape[1], self.input_dim) assert X.shape[1] == self.input_dim, "{} did not specify active_dims and X has wrong shape: X_dim={}, whereas input_dim={}".format(self.name, X.shape[1], self.input_dim)
def _check_active_dims(self, X): def _check_active_dims(self, X):
assert X.shape[1] >= len(self._all_dims_active), "At least {} dimensional X needed, X.shape={!s}".format(len(self._all_dims_active), X.shape) assert X.shape[1] >= len(self._all_dims_active), "At least {} dimensional X needed, X.shape={!s}".format(len(self._all_dims_active), X.shape)
class CombinationKernel(Kern): class CombinationKernel(Kern):
""" """
Abstract super class for combination kernels. Abstract super class for combination kernels.
@ -319,10 +322,18 @@ class CombinationKernel(Kern):
:param array-like extra_dims: if needed extra dimensions for the combination kernel to work on :param array-like extra_dims: if needed extra dimensions for the combination kernel to work on
""" """
assert all([isinstance(k, Kern) for k in kernels]) assert all([isinstance(k, Kern) for k in kernels])
extra_dims = np.array(extra_dims, dtype=int) extra_dims = np.asarray(extra_dims, dtype=int)
input_dim, active_dims = self.get_input_dim_active_dims(kernels, extra_dims)
active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), extra_dims)
input_dim = active_dims.size
# initialize the kernel with the full input_dim # initialize the kernel with the full input_dim
super(CombinationKernel, self).__init__(input_dim, active_dims, name) super(CombinationKernel, self).__init__(input_dim, active_dims, name)
effective_input_dim = reduce(max, (k._all_dims_active.max() for k in kernels)) + 1
self._all_dims_active = np.array(np.concatenate((np.arange(effective_input_dim), extra_dims if extra_dims is not None else [])), dtype=int)
self.extra_dims = extra_dims self.extra_dims = extra_dims
self.link_parameters(*kernels) self.link_parameters(*kernels)
@ -330,16 +341,8 @@ class CombinationKernel(Kern):
def parts(self): def parts(self):
return self.parameters return self.parameters
def get_input_dim_active_dims(self, kernels, extra_dims = None): def _set_all_dims_ative(self):
self.active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int)) self._all_dims_active = np.atleast_1d(self.active_dims).astype(int)
#_all_dims_active = np.array(np.concatenate((_all_dims_active, extra_dims if extra_dims is not None else [])), dtype=int)
input_dim = reduce(max, (k._all_dims_active.max() for k in kernels)) + 1
if extra_dims is not None:
input_dim += extra_dims.size
_all_dims_active = np.arange(input_dim)
return input_dim, _all_dims_active
def input_sensitivity(self, summarize=True): def input_sensitivity(self, summarize=True):
""" """

View file

@ -51,7 +51,7 @@ class Linear(Kern):
self.link_parameter(self.variances) self.link_parameter(self.variances)
self.psicomp = PSICOMP_Linear() self.psicomp = PSICOMP_Linear()
@Cache_this(limit=2) @Cache_this(limit=3)
def K(self, X, X2=None): def K(self, X, X2=None):
if self.ARD: if self.ARD:
if X2 is None: if X2 is None:
@ -62,7 +62,7 @@ class Linear(Kern):
else: else:
return self._dot_product(X, X2) * self.variances return self._dot_product(X, X2) * self.variances
@Cache_this(limit=1, ignore_args=(0,)) @Cache_this(limit=3, ignore_args=(0,))
def _dot_product(self, X, X2=None): def _dot_product(self, X, X2=None):
if X2 is None: if X2 is None:
return tdot(X) return tdot(X)

View file

@ -45,7 +45,7 @@ class MLP(Kern):
self.link_parameters(self.variance, self.weight_variance, self.bias_variance) self.link_parameters(self.variance, self.weight_variance, self.bias_variance)
@Cache_this(limit=20, ignore_args=()) @Cache_this(limit=3, ignore_args=())
def K(self, X, X2=None): def K(self, X, X2=None):
if X2 is None: if X2 is None:
X_denom = np.sqrt(self._comp_prod(X)+1.) X_denom = np.sqrt(self._comp_prod(X)+1.)
@ -57,7 +57,7 @@ class MLP(Kern):
XTX = self._comp_prod(X,X2)/X_denom[:,None]/X2_denom[None,:] XTX = self._comp_prod(X,X2)/X_denom[:,None]/X2_denom[None,:]
return self.variance*four_over_tau*np.arcsin(XTX) return self.variance*four_over_tau*np.arcsin(XTX)
@Cache_this(limit=20, ignore_args=()) @Cache_this(limit=3, ignore_args=())
def Kdiag(self, X): def Kdiag(self, X):
"""Compute the diagonal of the covariance matrix for X.""" """Compute the diagonal of the covariance matrix for X."""
X_prod = self._comp_prod(X) X_prod = self._comp_prod(X)
@ -88,14 +88,14 @@ class MLP(Kern):
"""Gradient of diagonal of covariance with respect to X""" """Gradient of diagonal of covariance with respect to X"""
return self._comp_grads_diag(dL_dKdiag, X)[3] return self._comp_grads_diag(dL_dKdiag, X)[3]
@Cache_this(limit=50, ignore_args=()) @Cache_this(limit=3, ignore_args=())
def _comp_prod(self, X, X2=None): def _comp_prod(self, X, X2=None):
if X2 is None: if X2 is None:
return (np.square(X)*self.weight_variance).sum(axis=1)+self.bias_variance return (np.square(X)*self.weight_variance).sum(axis=1)+self.bias_variance
else: else:
return (X*self.weight_variance).dot(X2.T)+self.bias_variance return (X*self.weight_variance).dot(X2.T)+self.bias_variance
@Cache_this(limit=20, ignore_args=(1,)) @Cache_this(limit=3, ignore_args=(1,))
def _comp_grads(self, dL_dK, X, X2=None): def _comp_grads(self, dL_dK, X, X2=None):
var,w,b = self.variance, self.weight_variance, self.bias_variance var,w,b = self.variance, self.weight_variance, self.bias_variance
K = self.K(X, X2) K = self.K(X, X2)
@ -130,7 +130,7 @@ class MLP(Kern):
dX2 = common.T.dot(X)*w-((common*XTX).sum(axis=0)/(X2_prod+1.))[:,None]*X2*w dX2 = common.T.dot(X)*w-((common*XTX).sum(axis=0)/(X2_prod+1.))[:,None]*X2*w
return dvar, dw, db, dX, dX2 return dvar, dw, db, dX, dX2
@Cache_this(limit=20, ignore_args=(1,)) @Cache_this(limit=3, ignore_args=(1,))
def _comp_grads_diag(self, dL_dKdiag, X): def _comp_grads_diag(self, dL_dKdiag, X):
var,w,b = self.variance, self.weight_variance, self.bias_variance var,w,b = self.variance, self.weight_variance, self.bias_variance
K = self.Kdiag(X) K = self.Kdiag(X)

View file

@ -27,7 +27,7 @@ class Poly(Kern):
_, _, B = self._AB(X, X2) _, _, B = self._AB(X, X2)
return B * self.variance return B * self.variance
@Cache_this(limit=2) @Cache_this(limit=3)
def _AB(self, X, X2=None): def _AB(self, X, X2=None):
if X2 is None: if X2 is None:
dot_prod = np.dot(X, X.T) dot_prod = np.dot(X, X.T)

View file

@ -39,7 +39,7 @@ class Prod(CombinationKernel):
kernels.insert(i, part) kernels.insert(i, part)
super(Prod, self).__init__(kernels, name) super(Prod, self).__init__(kernels, name)
@Cache_this(limit=2, force_kwargs=['which_parts']) @Cache_this(limit=3, force_kwargs=['which_parts'])
def K(self, X, X2=None, which_parts=None): def K(self, X, X2=None, which_parts=None):
if which_parts is None: if which_parts is None:
which_parts = self.parts which_parts = self.parts
@ -48,7 +48,7 @@ class Prod(CombinationKernel):
which_parts = [which_parts] which_parts = [which_parts]
return reduce(np.multiply, (p.K(X, X2) for p in which_parts)) return reduce(np.multiply, (p.K(X, X2) for p in which_parts))
@Cache_this(limit=2, force_kwargs=['which_parts']) @Cache_this(limit=3, force_kwargs=['which_parts'])
def Kdiag(self, X, which_parts=None): def Kdiag(self, X, which_parts=None):
if which_parts is None: if which_parts is None:
which_parts = self.parts which_parts = self.parts
@ -105,3 +105,114 @@ class Prod(CombinationKernel):
return i_s return i_s
else: else:
return super(Prod, self).input_sensitivity(summarize) return super(Prod, self).input_sensitivity(summarize)
def sde_update_gradient_full(self, gradients):
"""
Update gradient in the order in which parameters are represented in the
kernel
"""
part_start_param_index = 0
for p in self.parts:
if not p.is_fixed:
part_param_num = len(p.param_array) # number of parameters in the part
p.sde_update_gradient_full(gradients[part_start_param_index:(part_start_param_index+part_param_num)])
part_start_param_index += part_param_num
def sde(self):
"""
"""
F = np.array((0,), ndmin=2)
L = np.array((1,), ndmin=2)
Qc = np.array((1,), ndmin=2)
H = np.array((1,), ndmin=2)
Pinf = np.array((1,), ndmin=2)
P0 = np.array((1,), ndmin=2)
dF = None
dQc = None
dPinf = None
dP0 = None
# Assign models
for p in self.parts:
(Ft,Lt,Qct,Ht,P_inft, P0t, dFt,dQct,dP_inft,dP0t) = p.sde()
# check derivative dimensions ->
number_of_parameters = len(p.param_array)
assert dFt.shape[2] == number_of_parameters, "Dynamic matrix derivative shape is wrong"
assert dQct.shape[2] == number_of_parameters, "Diffusion matrix derivative shape is wrong"
assert dP_inft.shape[2] == number_of_parameters, "Infinite covariance matrix derivative shape is wrong"
# check derivative dimensions <-
# exception for periodic kernel
if (p.name == 'std_periodic'):
Qct = P_inft
dQct = dP_inft
dF = dkron(F,dF,Ft,dFt,'sum')
dQc = dkron(Qc,dQc,Qct,dQct,'prod')
dPinf = dkron(Pinf,dPinf,P_inft,dP_inft,'prod')
dP0 = dkron(P0,dP0,P0t,dP0t,'prod')
F = np.kron(F,np.eye(Ft.shape[0])) + np.kron(np.eye(F.shape[0]),Ft)
L = np.kron(L,Lt)
Qc = np.kron(Qc,Qct)
Pinf = np.kron(Pinf,P_inft)
P0 = np.kron(P0,P_inft)
H = np.kron(H,Ht)
return (F,L,Qc,H,Pinf,P0,dF,dQc,dPinf,dP0)
def dkron(A,dA,B,dB, operation='prod'):
"""
Function computes the derivative of Kronecker product A*B
(or Kronecker sum A+B).
Input:
-----------------------
A: 2D matrix
Some matrix
dA: 3D (or 2D matrix)
Derivatives of A
B: 2D matrix
Some matrix
dB: 3D (or 2D matrix)
Derivatives of B
operation: str 'prod' or 'sum'
Which operation is considered. If the operation is 'sum' it is assumed
that A and B are square matrices.
Output:
dC: 3D matrix
Derivative of Kronecker product A*B (or Kronecker sum A+B)
"""
if dA is None:
dA_param_num = 0
dA = np.zeros((A.shape[0], A.shape[1],1))
else:
dA_param_num = dA.shape[2]
if dB is None:
dB_param_num = 0
dB = np.zeros((B.shape[0], B.shape[1],1))
else:
dB_param_num = dB.shape[2]
# Space allocation for derivative matrix
dC = np.zeros((A.shape[0]*B.shape[0], A.shape[1]*B.shape[1], dA_param_num + dB_param_num))
for k in range(dA_param_num):
if operation == 'prod':
dC[:,:,k] = np.kron(dA[:,:,k],B);
else:
dC[:,:,k] = np.kron(dA[:,:,k],np.eye( B.shape[0] ))
for k in range(dB_param_num):
if operation == 'prod':
dC[:,:,dA_param_num+k] = np.kron(A,dB[:,:,k])
else:
dC[:,:,dA_param_num+k] = np.kron(np.eye( A.shape[0] ),dB[:,:,k])
return dC
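A quick standalone sanity check of dkron, assuming the function defined above is in scope; when B carries no parameters the product rule reduces to dC/dtheta = kron(dA/dtheta, B).

import numpy as np

A = np.array([[1.0, 2.0], [3.0, 4.0]])
dA = np.zeros((2, 2, 1))
dA[:, :, 0] = np.eye(2)        # pretend dA/dtheta = I for a single parameter theta
B = np.array([[0.5, 0.0], [1.0, 2.0]])

dC = dkron(A, dA, B, None, operation='prod')   # dB=None: B has no parameters
assert dC.shape == (4, 4, 1)
assert np.allclose(dC[:, :, 0], np.kron(dA[:, :, 0], B))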

View file

@ -21,7 +21,7 @@ from .gaussherm import PSICOMP_GH
from . import rbf_psi_comp, linear_psi_comp, ssrbf_psi_comp, sslinear_psi_comp from . import rbf_psi_comp, linear_psi_comp, ssrbf_psi_comp, sslinear_psi_comp
class PSICOMP_RBF(PSICOMP): class PSICOMP_RBF(PSICOMP):
@Cache_this(limit=10, ignore_args=(0,)) @Cache_this(limit=3, ignore_args=(0,))
def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False): def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
variance, lengthscale = kern.variance, kern.lengthscale variance, lengthscale = kern.variance, kern.lengthscale
if isinstance(variational_posterior, variational.NormalPosterior): if isinstance(variational_posterior, variational.NormalPosterior):
@ -31,7 +31,7 @@ class PSICOMP_RBF(PSICOMP):
else: else:
raise ValueError("unknown distribution received for psi-statistics") raise ValueError("unknown distribution received for psi-statistics")
@Cache_this(limit=10, ignore_args=(0,2,3,4)) @Cache_this(limit=3, ignore_args=(0,2,3,4))
def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
variance, lengthscale = kern.variance, kern.lengthscale variance, lengthscale = kern.variance, kern.lengthscale
if isinstance(variational_posterior, variational.NormalPosterior): if isinstance(variational_posterior, variational.NormalPosterior):
@ -43,7 +43,7 @@ class PSICOMP_RBF(PSICOMP):
class PSICOMP_Linear(PSICOMP): class PSICOMP_Linear(PSICOMP):
@Cache_this(limit=10, ignore_args=(0,)) @Cache_this(limit=3, ignore_args=(0,))
def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False): def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
variances = kern.variances variances = kern.variances
if isinstance(variational_posterior, variational.NormalPosterior): if isinstance(variational_posterior, variational.NormalPosterior):
@ -53,7 +53,7 @@ class PSICOMP_Linear(PSICOMP):
else: else:
raise ValueError("unknown distribution received for psi-statistics") raise ValueError("unknown distribution received for psi-statistics")
@Cache_this(limit=10, ignore_args=(0,2,3,4)) @Cache_this(limit=3, ignore_args=(0,2,3,4))
def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
variances = kern.variances variances = kern.variances
if isinstance(variational_posterior, variational.NormalPosterior): if isinstance(variational_posterior, variational.NormalPosterior):

View file

@ -27,7 +27,7 @@ class PSICOMP_GH(PSICOMP):
def _setup_observers(self): def _setup_observers(self):
pass pass
@Cache_this(limit=10, ignore_args=(0,)) @Cache_this(limit=3, ignore_args=(0,))
def comp_K(self, Z, qX): def comp_K(self, Z, qX):
if self.Xs is None or self.Xs.shape != qX.mean.shape: if self.Xs is None or self.Xs.shape != qX.mean.shape:
from paramz import ObsAr from paramz import ObsAr
@ -38,7 +38,7 @@ class PSICOMP_GH(PSICOMP):
self.Xs[i] = self.locs[i]*S_sq+mu self.Xs[i] = self.locs[i]*S_sq+mu
return self.Xs return self.Xs
@Cache_this(limit=10, ignore_args=(0,)) @Cache_this(limit=3, ignore_args=(0,))
def psicomputations(self, kern, Z, qX, return_psi2_n=False): def psicomputations(self, kern, Z, qX, return_psi2_n=False):
mu, S = qX.mean.values, qX.variance.values mu, S = qX.mean.values, qX.variance.values
N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1] N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
@ -62,7 +62,7 @@ class PSICOMP_GH(PSICOMP):
psi2 += self.weights[i]* tdot(Kfu.T) psi2 += self.weights[i]* tdot(Kfu.T)
return psi0, psi1, psi2 return psi0, psi1, psi2
@Cache_this(limit=10, ignore_args=(0, 2,3,4)) @Cache_this(limit=3, ignore_args=(0, 2,3,4))
def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, qX): def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, qX):
mu, S = qX.mean.values, qX.variance.values mu, S = qX.mean.values, qX.variance.values
if self.cache_K: Xs = self.comp_K(Z, qX) if self.cache_K: Xs = self.comp_K(Z, qX)

View file

@ -132,5 +132,5 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
_psi1computations = Cacher(__psi1computations, limit=5) _psi1computations = Cacher(__psi1computations, limit=3)
_psi2computations = Cacher(__psi2computations, limit=5) _psi2computations = Cacher(__psi2computations, limit=3)

View file

@ -5,7 +5,6 @@ The module for psi-statistics for RBF kernel
import numpy as np import numpy as np
from paramz.caching import Cache_this from paramz.caching import Cache_this
from . import PSICOMP_RBF from . import PSICOMP_RBF
from ....util import gpu_init
gpu_code = """ gpu_code = """
// define THREADNUM // define THREADNUM
@ -238,8 +237,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
self.fall_back = PSICOMP_RBF() self.fall_back = PSICOMP_RBF()
from pycuda.compiler import SourceModule from pycuda.compiler import SourceModule
from ....util.gpu_init import initGPU import GPy.util.gpu_init
initGPU()
self.GPU_direct = GPU_direct self.GPU_direct = GPU_direct
self.gpuCache = None self.gpuCache = None
@ -326,7 +324,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
except: except:
return self.fall_back.psicomputations(kern, Z, variational_posterior, return_psi2_n) return self.fall_back.psicomputations(kern, Z, variational_posterior, return_psi2_n)
@Cache_this(limit=10, ignore_args=(0,)) @Cache_this(limit=3, ignore_args=(0,))
def _psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False): def _psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
""" """
Z - MxQ Z - MxQ
@ -371,7 +369,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
except: except:
return self.fall_back.psiDerivativecomputations(kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior) return self.fall_back.psiDerivativecomputations(kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior)
@Cache_this(limit=10, ignore_args=(0,2,3,4)) @Cache_this(limit=3, ignore_args=(0,2,3,4))
def _psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def _psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
# resolve the requirement of dL_dpsi2 to be symmetric # resolve the requirement of dL_dpsi2 to be symmetric
if len(dL_dpsi2.shape)==2: dL_dpsi2 = (dL_dpsi2+dL_dpsi2.T)/2 if len(dL_dpsi2.shape)==2: dL_dpsi2 = (dL_dpsi2+dL_dpsi2.T)/2

View file

@ -88,7 +88,7 @@ try:
return psi0,psi1,psi2,psi2n return psi0,psi1,psi2,psi2n
from GPy.util.caching import Cacher from GPy.util.caching import Cacher
psicomputations = Cacher(_psicomputations, limit=1) psicomputations = Cacher(_psicomputations, limit=3)
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
ARD = (len(lengthscale)!=1) ARD = (len(lengthscale)!=1)

View file

@ -7,7 +7,6 @@ import numpy as np
from paramz.caching import Cache_this from paramz.caching import Cache_this
from . import PSICOMP_RBF from . import PSICOMP_RBF
gpu_code = """ gpu_code = """
// define THREADNUM // define THREADNUM
@ -287,8 +286,7 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
def __init__(self, threadnum=128, blocknum=15, GPU_direct=False): def __init__(self, threadnum=128, blocknum=15, GPU_direct=False):
from pycuda.compiler import SourceModule from pycuda.compiler import SourceModule
from ....util.gpu_init import initGPU import GPy.util.gpu_init
initGPU()
self.GPU_direct = GPU_direct self.GPU_direct = GPU_direct
self.gpuCache = None self.gpuCache = None
@ -375,7 +373,7 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
def get_dimensions(self, Z, variational_posterior): def get_dimensions(self, Z, variational_posterior):
return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1] return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1]
@Cache_this(limit=1, ignore_args=(0,)) @Cache_this(limit=3, ignore_args=(0,))
def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False): def psicomputations(self, kern, Z, variational_posterior, return_psi2_n=False):
""" """
Z - MxQ Z - MxQ
@ -409,7 +407,7 @@ class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
else: else:
return psi0, psi1_gpu.get(), psi2_gpu.get() return psi0, psi1_gpu.get(), psi2_gpu.get()
@Cache_this(limit=1, ignore_args=(0,2,3,4)) @Cache_this(limit=3, ignore_args=(0,2,3,4))
def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def psiDerivativecomputations(self, kern, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
variance, lengthscale = kern.variance, kern.lengthscale variance, lengthscale = kern.variance, kern.lengthscale
from ....util.linalg_gpu import sum_axis from ....util.linalg_gpu import sum_axis

View file

@ -0,0 +1,59 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2015, Alex Grigorevskiy, Arno Solin
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
Classes in this module enhance Brownian motion covariance function with the
Stochastic Differential Equation (SDE) functionality.
"""
from .brownian import Brownian
import numpy as np
class sde_Brownian(Brownian):
"""
Class provides extra functionality to transfer this covariance function into
SDE form.
Brownian motion kernel:
.. math::
k(x,y) = \sigma^2 min(x,y)
"""
def sde_update_gradient_full(self, gradients):
"""
Update gradient in the order in which parameters are represented in the
kernel
"""
self.variance.gradient = gradients[0]
def sde(self):
"""
Return the state space representation of the covariance.
"""
variance = float(self.variance.values) # this is the initial variance in Bayesian linear regression
F = np.array( ((0,1.0),(0,0) ))
L = np.array( ((1.0,),(0,)) )
Qc = np.array( ((variance,),) )
H = np.array( ((1.0,0),) )
Pinf = np.array( ( (0, -0.5*variance ), (-0.5*variance, 0) ) )
#P0 = Pinf.copy()
P0 = np.zeros((2,2))
#Pinf = np.array( ( (t0, 1.0), (1.0, 1.0/t0) ) ) * variance
dF = np.zeros((2,2,1))
dQc = np.ones( (1,1,1) )
dPinf = np.zeros((2,2,1))
dPinf[:,:,0] = np.array( ( (0, -0.5), (-0.5, 0) ) )
#dP0 = dPinf.copy()
dP0 = np.zeros((2,2,1))
return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
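As a standalone numeric check (plain numpy, illustrative values only) that these matrices encode Brownian motion when P0 = 0: integrating the covariance ODE dP/dt = F P + P F' + L Qc L' gives Var[f(t)] = variance * t at the observed component.

import numpy as np

variance = 2.0
F = np.array([[0.0, 1.0], [0.0, 0.0]])
L = np.array([[1.0], [0.0]])
Qc = np.array([[variance]])
H = np.array([[1.0, 0.0]])

P = np.zeros((2, 2))           # P0 = 0: the process starts pinned at zero
dt, T = 1e-3, 1.0
for _ in range(int(round(T / dt))):
    # Euler step of the Lyapunov ODE for the state covariance
    P = P + dt * (F.dot(P) + P.dot(F.T) + L.dot(Qc).dot(L.T))

assert np.isclose(H.dot(P).dot(H.T)[0, 0], variance * T, rtol=1e-2)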

View file

@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2015, Alex Grigorevskiy, Arno Solin
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
Classes in this module enhance Linear covariance function with the
Stochastic Differential Equation (SDE) functionality.
"""
from .linear import Linear
import numpy as np
class sde_Linear(Linear):
"""
Class provides extra functionality to transfer this covariance function into
SDE form.
Linear kernel:
.. math::
k(x,y) = \sum_{i=1}^{input dim} \sigma^2_i x_iy_i
"""
def __init__(self, input_dim, X, variances=None, ARD=False, active_dims=None, name='linear'):
"""
Modify the init method, because one extra parameter is required. X - points
on the X axis.
"""
super(sde_Linear, self).__init__(input_dim, variances, ARD, active_dims, name)
self.t0 = np.min(X)
def sde_update_gradient_full(self, gradients):
"""
Update gradient in the order in which parameters are represented in the
kernel
"""
self.variances.gradient = gradients[0]
def sde(self):
"""
Return the state space representation of the covariance.
"""
variance = float(self.variances.values) # this is the initial variance in Bayesian linear regression
t0 = float(self.t0)
F = np.array( ((0,1.0),(0,0) ))
L = np.array( ((0,),(1.0,)) )
Qc = np.zeros((1,1))
H = np.array( ((1.0,0),) )
Pinf = np.zeros((2,2))
P0 = np.array( ( (t0**2, t0), (t0, 1) ) ) * variance
dF = np.zeros((2,2,1))
dQc = np.zeros( (1,1,1) )
dPinf = np.zeros((2,2,1))
dP0 = np.zeros((2,2,1))
dP0[:,:,0] = P0 / variance
return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
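A standalone check (plain numpy, illustrative values only) that this noise-free model, started at time t0 with covariance P0, reproduces the linear kernel: with the transition matrix Phi(tau) = expm(F*tau) one gets H Phi(t-t0) P0 Phi(s-t0)' H' = variance * t * s.

import numpy as np

variance, t0 = 0.5, 1.0
H = np.array([[1.0, 0.0]])
P0 = variance * np.array([[t0**2, t0], [t0, 1.0]])

def Phi(tau):
    # closed-form expm(F * tau) for F = [[0, 1], [0, 0]]
    return np.array([[1.0, tau], [0.0, 1.0]])

for t, s in [(1.5, 2.0), (3.0, 1.0), (t0, t0)]:
    k_ss = H.dot(Phi(t - t0)).dot(P0).dot(Phi(s - t0).T).dot(H.T)[0, 0]
    assert np.isclose(k_ss, variance * t * s)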

137
GPy/kern/src/sde_matern.py Normal file
View file

@ -0,0 +1,137 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2015, Alex Grigorevskiy, Arno Solin
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
Classes in this module enhance Matern covariance functions with the
Stochastic Differential Equation (SDE) functionality.
"""
from .stationary import Matern32
from .stationary import Matern52
import numpy as np
class sde_Matern32(Matern32):
"""
Class provides extra functionality to transfer this covariance function into
SDE form.
Matern 3/2 kernel:
.. math::
k(r) = \sigma^2 (1 + \sqrt{3} r) \exp(- \sqrt{3} r) \\ \\ \\ \\ \text{ where } r = \sqrt{\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\ell_i^2} }
"""
def sde_update_gradient_full(self, gradients):
"""
Update gradient in the order in which parameters are represented in the
kernel
"""
self.variance.gradient = gradients[0]
self.lengthscale.gradient = gradients[1]
def sde(self):
"""
Return the state space representation of the covariance.
"""
variance = float(self.variance.values)
lengthscale = float(self.lengthscale.values)
foo = np.sqrt(3.)/lengthscale
F = np.array(((0, 1.0), (-foo**2, -2*foo)))
L = np.array(( (0,), (1.0,) ))
Qc = np.array(((12.*np.sqrt(3) / lengthscale**3 * variance,),))
H = np.array(((1.0, 0),))
Pinf = np.array(((variance, 0.0), (0.0, 3.*variance/(lengthscale**2))))
P0 = Pinf.copy()
# Allocate space for the derivatives
dF = np.empty([F.shape[0],F.shape[1],2])
dQc = np.empty([Qc.shape[0],Qc.shape[1],2])
dPinf = np.empty([Pinf.shape[0],Pinf.shape[1],2])
# The partial derivatives
dFvariance = np.zeros((2,2))
dFlengthscale = np.array(((0,0), (6./lengthscale**3,2*np.sqrt(3)/lengthscale**2)))
dQcvariance = np.array((12.*np.sqrt(3)/lengthscale**3))
dQclengthscale = np.array((-3*12*np.sqrt(3)/lengthscale**4*variance))
dPinfvariance = np.array(((1,0),(0,3./lengthscale**2)))
dPinflengthscale = np.array(((0,0), (0,-6*variance/lengthscale**3)))
# Combine the derivatives
dF[:,:,0] = dFvariance
dF[:,:,1] = dFlengthscale
dQc[:,:,0] = dQcvariance
dQc[:,:,1] = dQclengthscale
dPinf[:,:,0] = dPinfvariance
dPinf[:,:,1] = dPinflengthscale
dP0 = dPinf.copy()
return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
class sde_Matern52(Matern52):
"""
Class provides extra functionality to transfer this covariance function into
SDE form.
Matern 5/2 kernel:
.. math::
k(r) = \sigma^2 (1 + \sqrt{5} r + \frac{5}{3}r^2) \exp(- \sqrt{5} r) \\ \\ \\ \\ \text{ where } r = \sqrt{\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\ell_i^2} }
"""
def sde_update_gradient_full(self, gradients):
"""
Update gradient in the order in which parameters are represented in the
kernel
"""
self.variance.gradient = gradients[0]
self.lengthscale.gradient = gradients[1]
def sde(self):
"""
Return the state space representation of the covariance.
"""
variance = float(self.variance.values)
lengthscale = float(self.lengthscale.values)
lamda = np.sqrt(5.0)/lengthscale
kappa = 5.0/3.0*variance/lengthscale**2
F = np.array(((0, 1,0), (0, 0, 1), (-lamda**3, -3.0*lamda**2, -3*lamda)))
L = np.array(((0,),(0,),(1,)))
Qc = np.array((((variance*400.0*np.sqrt(5.0)/3.0/lengthscale**5),),))
H = np.array(((1,0,0),))
Pinf = np.array(((variance,0,-kappa), (0, kappa, 0), (-kappa, 0, 25.0*variance/lengthscale**4)))
P0 = Pinf.copy()
# Allocate space for the derivatives
dF = np.empty((3,3,2))
dQc = np.empty((1,1,2))
dPinf = np.empty((3,3,2))
# The partial derivatives
dFvariance = np.zeros((3,3))
dFlengthscale = np.array(((0,0,0),(0,0,0),(15.0*np.sqrt(5.0)/lengthscale**4,
30.0/lengthscale**3, 3*np.sqrt(5.0)/lengthscale**2)))
dQcvariance = np.array((((400*np.sqrt(5)/3/lengthscale**5,),)))
dQclengthscale = np.array((((-variance*2000*np.sqrt(5)/3/lengthscale**6,),)))
dPinf_variance = Pinf/variance
kappa2 = -2.0*kappa/lengthscale
dPinf_lengthscale = np.array(((0,0,-kappa2),(0,kappa2,0),(-kappa2,
0,-100*variance/lengthscale**5)))
# Combine the derivatives
dF[:,:,0] = dFvariance
dF[:,:,1] = dFlengthscale
dQc[:,:,0] = dQcvariance
dQc[:,:,1] = dQclengthscale
dPinf[:,:,0] = dPinf_variance
dPinf[:,:,1] = dPinf_lengthscale
dP0 = dPinf.copy()
return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
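Standalone check (plain numpy, illustrative values only) that the Matern-3/2 Pinf above is the stationary state covariance, i.e. it solves the Lyapunov equation F P + P F' + L Qc L' = 0.

import numpy as np

variance, lengthscale = 1.5, 0.7
lam = np.sqrt(3.0) / lengthscale
F = np.array([[0.0, 1.0], [-lam**2, -2.0 * lam]])
L = np.array([[0.0], [1.0]])
Qc = np.array([[12.0 * np.sqrt(3.0) / lengthscale**3 * variance]])
Pinf = np.array([[variance, 0.0], [0.0, 3.0 * variance / lengthscale**2]])

residual = F.dot(Pinf) + Pinf.dot(F.T) + L.dot(Qc).dot(L.T)
assert np.allclose(residual, 0.0)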

View file

@ -0,0 +1,180 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2015, Alex Grigorevskiy, Arno Solin
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
Classes in this module enhance Matern covariance functions with the
Stochastic Differential Equation (SDE) functionality.
"""
from .standard_periodic import StdPeriodic
import numpy as np
import scipy as sp
from scipy import special as special
class sde_StdPeriodic(StdPeriodic):
"""
Class provides extra functionality to transfer this covariance function into
SDE form.
Standard Periodic kernel:
.. math::
k(x,y) = \theta_1 \exp \left[ - \frac{1}{2} \sum_{i=1}^{input\_dim}
\left( \frac{\sin(\frac{\pi}{\lambda_i} (x_i - y_i))}{l_i} \right)^2 \right]
"""
def sde_update_gradient_full(self, gradients):
"""
Update gradient in the order in which parameters are represented in the
kernel
"""
self.variance.gradient = gradients[0]
self.period.gradient = gradients[1]
self.lengthscale.gradient = gradients[2]
def sde(self):
"""
Return the state space representation of the covariance.
! Note: one must constrain the lengthscale not to drop below 0.25;
below that the Bessel functions of the first kind grow very large.
! Note: one must also keep the period from becoming very small, because
then the gradients with respect to the period become unstable.
This may depend on the data; for a test example with
300 data points the lower limit is 0.15.
"""
# Params to use: (in that order)
#self.variance
#self.period
#self.lengthscale
N = 7 # approximation order
w0 = 2*np.pi/self.period # frequency
lengthscale = 2*self.lengthscale
[q2,dq2l] = seriescoeff(N,lengthscale,self.variance)
# lengthscale is multiplied by 2 because of slightly different
# formula for periodic covariance function.
# For the same reason:
dq2l = 2*dq2l
if np.any( np.isfinite(q2) == False):
raise ValueError("SDE periodic covariance error 1")
if np.any( np.isfinite(dq2l) == False):
raise ValueError("SDE periodic covariance error 2")
F = np.kron(np.diag(range(0,N+1)),np.array( ((0, -w0), (w0, 0)) ) )
L = np.eye(2*(N+1))
Qc = np.zeros((2*(N+1), 2*(N+1)))
P_inf = np.kron(np.diag(q2),np.eye(2))
H = np.kron(np.ones((1,N+1)),np.array((1,0)) )
P0 = P_inf.copy()
# Derivatives
dF = np.empty((F.shape[0], F.shape[1], 3))
dQc = np.empty((Qc.shape[0], Qc.shape[1], 3))
dP_inf = np.empty((P_inf.shape[0], P_inf.shape[1], 3))
# Derivatives wrt self.variance
dF[:,:,0] = np.zeros(F.shape)
dQc[:,:,0] = np.zeros(Qc.shape)
dP_inf[:,:,0] = P_inf / self.variance
# Derivatives self.period
dF[:,:,1] = np.kron(np.diag(range(0,N+1)),np.array( ((0, w0), (-w0, 0)) ) / self.period );
dQc[:,:,1] = np.zeros(Qc.shape)
dP_inf[:,:,1] = np.zeros(P_inf.shape)
# Derivatives self.lengthscales
dF[:,:,2] = np.zeros(F.shape)
dQc[:,:,2] = np.zeros(Qc.shape)
dP_inf[:,:,2] = np.kron(np.diag(dq2l),np.eye(2))
dP0 = dP_inf.copy()
return (F, L, Qc, H, P_inf, P0, dF, dQc, dP_inf, dP0)
def seriescoeff(m=6,lengthScale=1.0,magnSigma2=1.0, true_covariance=False):
"""
Calculate the coefficients q_j^2 for the covariance function
approximation:
k(\tau) = \sum_{j=0}^{+\infty} q_j^2 \cos(j\omega_0 \tau)
Reference is:
[1] Arno Solin and Simo Särkkä (2014). Explicit link between periodic
covariance functions and state space models. In Proceedings of the
Seventeenth International Conference on Artificial Intelligence and
Statistics (AISTATS 2014). JMLR: W&CP, volume 33.
Note! Only the infinite approximation (through Bessel function)
is currently implemented.
Input:
----------------
m: int
Degree of approximation. Default 6.
lengthScale: float
Length scale parameter in the kernel
magnSigma2: float
Multiplier in front of the kernel.
Output:
-----------------
coeffs: array(m+1)
Covariance series coefficients
coeffs_dl: array(m+1)
Derivatives of the coefficients with respect to lengthscale.
"""
if true_covariance:
bb = lambda j,m: (1.0 + np.array((j != 0), dtype=np.float64) ) / (2**(j)) *\
sp.special.binom(j, sp.floor( (j-m)/2.0 * np.array(m<=j, dtype=np.float64) ))*\
np.array(m<=j, dtype=np.float64) *np.array(sp.mod(j-m,2)==0, dtype=np.float64)
M,J = np.meshgrid(range(0,m+1),range(0,m+1))
coeffs = bb(J,M) / sp.misc.factorial(J) * sp.exp( -lengthScale**(-2) ) *\
(lengthScale**(-2))**J *magnSigma2
coeffs_dl = np.sum( coeffs*lengthScale**(-3)*(2.0-2.0*J*lengthScale**2),0)
coeffs = np.sum(coeffs,0)
else:
coeffs = 2*magnSigma2*sp.exp( -lengthScale**(-2) ) * special.iv(range(0,m+1),1.0/lengthScale**(2))
if np.any( np.isfinite(coeffs) == False):
raise ValueError("sde_standard_periodic: Coefficients are not finite!")
#import pdb; pdb.set_trace()
coeffs[0] = 0.5*coeffs[0]
# Derivatives wrt (lengthScale)
coeffs_dl = np.zeros(m+1)
coeffs_dl[1:] = magnSigma2*lengthScale**(-3) * sp.exp(-lengthScale**(-2))*\
(-4*special.iv(range(0,m),lengthScale**(-2)) + 4*(1+np.arange(1,m+1)*lengthScale**(2))*special.iv(range(1,m+1),lengthScale**(-2)) )
# The first element
coeffs_dl[0] = magnSigma2*lengthScale**(-3) * np.exp(-lengthScale**(-2))*\
(2*special.iv(0,lengthScale**(-2)) - 2*special.iv(1,lengthScale**(-2)) )
return coeffs, coeffs_dl
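Standalone check (plain numpy/scipy, illustrative values only) of the series approximation: computing the coefficients as in the default branch above, with the lengthscale doubled as sde() does, the truncated cosine series reproduces the standard periodic covariance sigma^2 exp(-0.5 sin^2(pi tau / T) / l^2) to high accuracy.

import numpy as np
from scipy import special

variance, lengthscale, period = 1.0, 1.0, 2.0
N = 7
ell = 2.0 * lengthscale                      # sde() doubles the lengthscale before computing the coefficients
coeffs = 2.0 * variance * np.exp(-ell**-2) * special.iv(np.arange(N + 1), ell**-2)
coeffs[0] *= 0.5                             # same halving of the first coefficient as above

w0 = 2.0 * np.pi / period
tau = np.linspace(0.0, period, 50)
series = sum(coeffs[j] * np.cos(j * w0 * tau) for j in range(N + 1))
exact = variance * np.exp(-0.5 * np.sin(np.pi * tau / period)**2 / lengthscale**2)
assert np.allclose(series, exact, atol=1e-6)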

103
GPy/kern/src/sde_static.py Normal file
View file

@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2015, Alex Grigorevskiy, Arno Solin
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
Classes in this module enhance Static covariance functions with the
Stochastic Differential Equation (SDE) functionality.
"""
from .static import White
from .static import Bias
import numpy as np
class sde_White(White):
"""
Class provides extra functionality to transfer this covariance function into
SDE form.
White kernel:
.. math::
k(x,y) = \alpha*\delta(x-y)
"""
def sde_update_gradient_full(self, gradients):
"""
Update gradient in the order in which parameters are represented in the
kernel
"""
self.variance.gradient = gradients[0]
def sde(self):
"""
Return the state space representation of the covariance.
"""
variance = float(self.variance.values)
F = np.array( ((-np.inf,),) )
L = np.array( ((1.0,),) )
Qc = np.array( ((variance,),) )
H = np.array( ((1.0,),) )
Pinf = np.array( ((variance,),) )
P0 = Pinf.copy()
dF = np.zeros((1,1,1))
dQc = np.zeros((1,1,1))
dQc[:,:,0] = np.array( ((1.0,),) )
dPinf = np.zeros((1,1,1))
dPinf[:,:,0] = np.array( ((1.0,),) )
dP0 = dPinf.copy()
return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
class sde_Bias(Bias):
"""
Class provides extra functionality to transfer this covariance function into
SDE form.
Bias kernel:
.. math::
k(x,y) = \alpha
"""
def sde_update_gradient_full(self, gradients):
"""
Update gradient in the order in which parameters are represented in the
kernel
"""
self.variance.gradient = gradients[0]
def sde(self):
"""
Return the state space representation of the covariance.
"""
variance = float(self.variance.values)
F = np.array( ((0.0,),))
L = np.array( ((1.0,),))
Qc = np.zeros((1,1))
H = np.array( ((1.0,),))
Pinf = np.zeros((1,1))
P0 = np.array( ((variance,),) )
dF = np.zeros((1,1,1))
dQc = np.zeros((1,1,1))
dPinf = np.zeros((1,1,1))
dP0 = np.zeros((1,1,1))
dP0[:,:,0] = np.array( ((1.0,),) )
return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)

View file

@ -0,0 +1,192 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2015, Alex Grigorevskiy, Arno Solin
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
Classes in this module enhance several stationary covariance functions with the
Stochastic Differential Equation (SDE) functionality.
"""
from .rbf import RBF
from .stationary import Exponential
from .stationary import RatQuad
import numpy as np
import scipy as sp
class sde_RBF(RBF):
"""
Class provides extra functionality to transfer this covariance function into
SDE form.
Radial Basis Function kernel:
.. math::
k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \\ \\ \\ \\ \text{ where } r = \sqrt{\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\ell_i^2} }
"""
def sde_update_gradient_full(self, gradients):
"""
Update gradient in the order in which parameters are represented in the
kernel
"""
self.variance.gradient = gradients[0]
self.lengthscale.gradient = gradients[1]
def sde(self):
"""
Return the state space representation of the covariance.
"""
N = 10 # approximation order (number of terms in the exponential series expansion)
roots_rounding_decimals = 6
fn = np.math.factorial(N)
kappa = 1.0/2.0/self.lengthscale**2
Qc = np.array((self.variance*np.sqrt(np.pi/kappa)*fn*(4*kappa)**N,),)
pp = np.zeros((2*N+1,)) # array of polynomial coefficients from higher power to lower
for n in range(0, N+1): # (2N+1) - number of polynomial coefficients
pp[2*(N-n)] = fn*(4.0*kappa)**(N-n)/np.math.factorial(n)*(-1)**n
pp = sp.poly1d(pp)
roots = sp.roots(pp)
neg_real_part_roots = roots[np.round(np.real(roots) ,roots_rounding_decimals) < 0]
aa = sp.poly1d(neg_real_part_roots, r=True).coeffs
F = np.diag(np.ones((N-1,)),1)
F[-1,:] = -aa[-1:0:-1]
L= np.zeros((N,1))
L[N-1,0] = 1
H = np.zeros((1,N))
H[0,0] = 1
# Infinite covariance:
Pinf = sp.linalg.solve_lyapunov(F, -np.dot(L,np.dot( Qc[0,0],L.T)))
Pinf = 0.5*(Pinf + Pinf.T)
# Allocating space for derivatives
dF = np.empty([F.shape[0],F.shape[1],2])
dQc = np.empty([Qc.shape[0],Qc.shape[1],2])
dPinf = np.empty([Pinf.shape[0],Pinf.shape[1],2])
# Derivatives:
dFvariance = np.zeros(F.shape)
dFlengthscale = np.zeros(F.shape)
dFlengthscale[-1,:] = -aa[-1:0:-1]/self.lengthscale * np.arange(-N,0,1)
dQcvariance = Qc/self.variance
dQclengthscale = np.array(((self.variance*np.sqrt(2*np.pi)*fn*2**N*self.lengthscale**(-2*N)*(1-2*N,),)))
dPinf_variance = Pinf/self.variance
lp = Pinf.shape[0]
coeff = np.arange(1,lp+1).reshape(lp,1) + np.arange(1,lp+1).reshape(1,lp) - 2
coeff[np.mod(coeff,2) != 0] = 0
dPinf_lengthscale = -1/self.lengthscale*Pinf*coeff
dF[:,:,0] = dFvariance
dF[:,:,1] = dFlengthscale
dQc[:,:,0] = dQcvariance
dQc[:,:,1] = dQclengthscale
dPinf[:,:,0] = dPinf_variance
dPinf[:,:,1] = dPinf_lengthscale
P0 = Pinf.copy()
dP0 = dPinf.copy()
# The benefits of this balancing are not clearly established; it helps only in one case:
# SVD Kalman + RBF kernel
import GPy.models.state_space_main as ssm
(F, L, Qc, H, Pinf, P0, dF, dQc, dPinf,dP0, T) = ssm.balance_ss_model(F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0 )
return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
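The construction above can be checked with a standalone sketch (plain numpy/scipy, illustrative values; it mirrors the code but is not the class itself): build the even polynomial, keep the stable roots, form the companion-form dynamics, solve the Lyapunov equation for Pinf, and confirm that the reproduced marginal variance H Pinf H' is close to sigma^2.

import numpy as np
from math import factorial
from scipy.linalg import solve_continuous_lyapunov

variance, lengthscale, N = 1.0, 1.0, 10
fn = factorial(N)
kappa = 1.0 / (2.0 * lengthscale**2)
Qc = variance * np.sqrt(np.pi / kappa) * fn * (4 * kappa)**N

pp = np.zeros(2 * N + 1)                   # polynomial coefficients, highest power first
for n in range(N + 1):
    pp[2 * (N - n)] = fn * (4.0 * kappa)**(N - n) / factorial(n) * (-1)**n
roots = np.roots(pp)
stable = roots[np.real(roots) < 0]         # keep the stable half of the spectrum
aa = np.poly1d(stable, r=True).coeffs.real

F = np.diag(np.ones(N - 1), 1)             # companion-form dynamics
F[-1, :] = -aa[-1:0:-1]
L = np.zeros((N, 1)); L[-1, 0] = 1.0
H = np.zeros((1, N)); H[0, 0] = 1.0

Pinf = solve_continuous_lyapunov(F, -Qc * L.dot(L.T))
assert np.isclose(H.dot(Pinf).dot(H.T)[0, 0], variance, rtol=1e-2)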
class sde_Exponential(Exponential):
"""
Class provides extra functionality to transfer this covariance function into
SDE form.
Exponential kernel:
.. math::
k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r \\bigg) \\ \\ \\ \\ \text{ where } r = \sqrt{\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\ell_i^2} }
"""
def sde_update_gradient_full(self, gradients):
"""
Update gradient in the order in which parameters are represented in the
kernel
"""
self.variance.gradient = gradients[0]
self.lengthscale.gradient = gradients[1]
def sde(self):
"""
Return the state space representation of the covariance.
"""
variance = float(self.variance.values)
lengthscale = float(self.lengthscale)
F = np.array(((-1.0/lengthscale,),))
L = np.array(((1.0,),))
Qc = np.array( ((2.0*variance/lengthscale,),) )
H = np.array(((1.0,),))
Pinf = np.array(((variance,),))
P0 = Pinf.copy()
dF = np.zeros((1,1,2));
dQc = np.zeros((1,1,2));
dPinf = np.zeros((1,1,2));
dF[:,:,0] = 0.0
dF[:,:,1] = 1.0/lengthscale**2
dQc[:,:,0] = 2.0/lengthscale
dQc[:,:,1] = -2.0*variance/lengthscale**2
dPinf[:,:,0] = 1.0
dPinf[:,:,1] = 0.0
dP0 = dPinf.copy()
return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
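Standalone check (plain numpy/scipy, illustrative values only): the scalar model above has autocovariance H expm(F*tau) Pinf H' = variance * exp(-tau / lengthscale), so the state-space form reproduces the Exponential (Ornstein-Uhlenbeck) covariance exactly.

import numpy as np
from scipy.linalg import expm

variance, lengthscale = 0.8, 1.3
F = np.array([[-1.0 / lengthscale]])
H = np.array([[1.0]])
Pinf = np.array([[variance]])

for tau in (0.0, 0.5, 2.0):
    k_ss = H.dot(expm(F * tau)).dot(Pinf).dot(H.T)[0, 0]
    assert np.isclose(k_ss, variance * np.exp(-tau / lengthscale))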
class sde_RatQuad(RatQuad):
"""
Class provides extra functionality to transfer this covariance function into
SDE form.
Rational Quadratic kernel:
.. math::
k(r) = \sigma^2 \\bigg( 1 + \\frac{r^2}{2} \\bigg)^{- \alpha} \\ \\ \\ \\ \text{ where } r = \sqrt{\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\ell_i^2} }
"""
def sde(self):
"""
Return the state space representation of the covariance.
"""
assert False, 'Not Implemented'
# Params to use:
# self.lengthscale
# self.variance
#self.power
#return (F, L, Qc, H, Pinf, dF, dQc, dPinf)

View file

@ -1,6 +1,5 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (c) 2015, Alex Grigorevskiy
# Copyright (c) 2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
""" """
The standard periodic kernel which mentioned in: The standard periodic kernel which mentioned in:
@ -9,7 +8,7 @@ The standard periodic kernel which mentioned in:
The MIT Press, 2005. The MIT Press, 2005.
[2] Introduction to Gaussian processes. D. J. C. MacKay. In C. M. Bishop, editor, [2] Introduction to Gaussian processes. D. J. C. MacKay. In C. M. Bishop, editor,
Neural Networks and Machine Learning, pages 133-165. Springer, 1998. Neural Networks and Machine Learning, pages 133-165. Springer, 1998.
""" """
@ -25,56 +24,56 @@ class StdPeriodic(Kern):
.. math:: .. math::
k(x,y) = \theta_1 \exp \left[ - \frac{1}{2} {}\sum_{i=1}^{input\_dim} k(x,y) = \theta_1 \exp \left[ - \frac{1}{2} \sum_{i=1}^{input\_dim}
\left( \frac{\sin(\frac{\pi}{\lambda_i} (x_i - y_i) )}{l_i} \right)^2 \right] } \left( \frac{\sin(\frac{\pi}{T_i} (x_i - y_i) )}{l_i} \right)^2 \right] }
:param input_dim: the number of input dimensions :param input_dim: the number of input dimensions
:type input_dim: int :type input_dim: int
:param variance: the variance :math:`\theta_1` in the formula above :param variance: the variance :math:`\theta_1` in the formula above
:type variance: float :type variance: float
:param wavelength: the vector of wavelengths :math:`\lambda_i`. If None then 1.0 is assumed. :param period: the vector of periods :math:`\T_i`. If None then 1.0 is assumed.
:type wavelength: array or list of the appropriate size (or float if there is only one wavelength parameter) :type period: array or list of the appropriate size (or float if there is only one period parameter)
:param lengthscale: the vector of lengthscale :math:`\l_i`. If None then 1.0 is assumed. :param lengthscale: the vector of lengthscale :math:`\l_i`. If None then 1.0 is assumed.
:type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter) :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
:param ARD1: Auto Relevance Determination with respect to wavelength. :param ARD1: Auto Relevance Determination with respect to period.
If equal to "False" one single wavelength parameter :math:`\lambda_i` for If equal to "False" one single period parameter :math:`\T_i` for
each dimension is assumed, otherwise there is one lengthscale each dimension is assumed, otherwise there is one lengthscale
parameter per dimension. parameter per dimension.
:type ARD1: Boolean :type ARD1: Boolean
:param ARD2: Auto Relevance Determination with respect to lengthscale. :param ARD2: Auto Relevance Determination with respect to lengthscale.
If equal to "False" one single wavelength parameter :math:`l_i` for If equal to "False" one single lengthscale parameter :math:`l_i` for
each dimension is assumed, otherwise there is one lengthscale each dimension is assumed, otherwise there is one lengthscale
parameter per dimension. parameter per dimension.
:type ARD2: Boolean :type ARD2: Boolean
:param active_dims: indices of dimensions which are used in the computation of the kernel :param active_dims: indices of dimensions which are used in the computation of the kernel
:type wavelength: array or list of the appropriate size :type active_dims: array or list of the appropriate size
:param name: Name of the kernel for output :param name: Name of the kernel for output
:type String :type String
:param useGPU: whether or not to use GPU :param useGPU: whether or not to use GPU
:type Boolean :type Boolean
""" """
def __init__(self, input_dim, variance=1., wavelength=None, lengthscale=None, ARD1=False, ARD2=False, active_dims=None, name='std_periodic',useGPU=False): def __init__(self, input_dim, variance=1., period=None, lengthscale=None, ARD1=False, ARD2=False, active_dims=None, name='std_periodic',useGPU=False):
super(StdPeriodic, self).__init__(input_dim, active_dims, name, useGPU=useGPU) super(StdPeriodic, self).__init__(input_dim, active_dims, name, useGPU=useGPU)
self.input_dim = input_dim self.input_dim = input_dim
self.ARD1 = ARD1 # correspond to wavelengths self.ARD1 = ARD1 # correspond to periods
self.ARD2 = ARD2 # correspond to lengthscales self.ARD2 = ARD2 # correspond to lengthscales
self.name = name self.name = name
if self.ARD1 == False: if self.ARD1 == False:
if wavelength is not None: if period is not None:
wavelength = np.asarray(wavelength) period = np.asarray(period)
assert wavelength.size == 1, "Only one wavelength needed for non-ARD kernel" assert period.size == 1, "Only one period needed for non-ARD kernel"
else: else:
wavelength = np.ones(1) period = np.ones(1)
else: else:
if wavelength is not None: if period is not None:
wavelength = np.asarray(wavelength) period = np.asarray(period)
assert wavelength.size == input_dim, "bad number of wavelengths" assert period.size == input_dim, "bad number of periods"
else: else:
wavelength = np.ones(input_dim) period = np.ones(input_dim)
if self.ARD2 == False: if self.ARD2 == False:
if lengthscale is not None: if lengthscale is not None:
lengthscale = np.asarray(lengthscale) lengthscale = np.asarray(lengthscale)
@ -87,33 +86,33 @@ class StdPeriodic(Kern):
assert lengthscale.size == input_dim, "bad number of lengthscales" assert lengthscale.size == input_dim, "bad number of lengthscales"
else: else:
lengthscale = np.ones(input_dim) lengthscale = np.ones(input_dim)
self.variance = Param('variance', variance, Logexp()) self.variance = Param('variance', variance, Logexp())
assert self.variance.size==1, "Variance size must be one" assert self.variance.size==1, "Variance size must be one"
self.wavelengths = Param('wavelengths', wavelength, Logexp()) self.period = Param('period', period, Logexp())
self.lengthscales = Param('lengthscales', lengthscale, Logexp()) self.lengthscale = Param('lengthscale', lengthscale, Logexp())
self.link_parameters(self.variance, self.wavelengths, self.lengthscales) self.link_parameters(self.variance, self.period, self.lengthscale)
def parameters_changed(self): def parameters_changed(self):
""" """
This functions deals as a callback for each optimization iteration. This functions deals as a callback for each optimization iteration.
If one optimization step was successfull and the parameters If one optimization step was successfull and the parameters
this callback function will be called to be able to update any this callback function will be called to be able to update any
precomputations for the kernel. precomputations for the kernel.
""" """
pass pass
def K(self, X, X2=None): def K(self, X, X2=None):
"""Compute the covariance matrix between X and X2.""" """Compute the covariance matrix between X and X2."""
if X2 is None: if X2 is None:
X2 = X X2 = X
base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.period
exp_dist = np.exp( -0.5* np.sum( np.square( np.sin( base ) / self.lengthscales ), axis = -1 ) ) exp_dist = np.exp( -0.5* np.sum( np.square( np.sin( base ) / self.lengthscale ), axis = -1 ) )
return self.variance * exp_dist return self.variance * exp_dist
@ -125,42 +124,47 @@ class StdPeriodic(Kern):
def update_gradients_full(self, dL_dK, X, X2=None): def update_gradients_full(self, dL_dK, X, X2=None):
"""derivative of the covariance matrix with respect to the parameters.""" """derivative of the covariance matrix with respect to the parameters."""
if X2 is None: if X2 is None:
X2 = X X2 = X
base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.wavelengths base = np.pi * (X[:, None, :] - X2[None, :, :]) / self.period
sin_base = np.sin( base ) sin_base = np.sin( base )
exp_dist = np.exp( -0.5* np.sum( np.square( sin_base / self.lengthscales ), axis = -1 ) ) exp_dist = np.exp( -0.5* np.sum( np.square( sin_base / self.lengthscale ), axis = -1 ) )
dwl = self.variance * (1.0/np.square(self.lengthscales)) * sin_base*np.cos(base) * (base / self.wavelengths) dwl = self.variance * (1.0/np.square(self.lengthscale)) * sin_base*np.cos(base) * (base / self.period)
dl = self.variance * np.square( sin_base) / np.power( self.lengthscales, 3) dl = self.variance * np.square( sin_base) / np.power( self.lengthscale, 3)
self.variance.gradient = np.sum(exp_dist * dL_dK) self.variance.gradient = np.sum(exp_dist * dL_dK)
#target[0] += np.sum( exp_dist * dL_dK) #target[0] += np.sum( exp_dist * dL_dK)
if self.ARD1: # different wavelengths if self.ARD1: # different periods
self.wavelengths.gradient = (dwl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0) self.period.gradient = (dwl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
else: # same wavelengths else: # same period
self.wavelengths.gradient = np.sum(dwl.sum(-1) * exp_dist * dL_dK) self.period.gradient = np.sum(dwl.sum(-1) * exp_dist * dL_dK)
if self.ARD2: # different lengthscales if self.ARD2: # different lengthscales
self.lengthscales.gradient = (dl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0) self.lengthscale.gradient = (dl * exp_dist[:,:,None] * dL_dK[:, :, None]).sum(0).sum(0)
else: # same lengthscales else: # same lengthscales
self.lengthscales.gradient = np.sum(dl.sum(-1) * exp_dist * dL_dK) self.lengthscale.gradient = np.sum(dl.sum(-1) * exp_dist * dL_dK)
def update_gradients_diag(self, dL_dKdiag, X): def update_gradients_diag(self, dL_dKdiag, X):
"""derivative of the diagonal of the covariance matrix with respect to the parameters.""" """derivative of the diagonal of the covariance matrix with respect to the parameters."""
self.variance.gradient = np.sum(dL_dKdiag) self.variance.gradient = np.sum(dL_dKdiag)
self.wavelengths.gradient = 0 self.period.gradient = 0
self.lengthscales.gradient = 0 self.lengthscale.gradient = 0
# def gradients_X(self, dL_dK, X, X2=None): def gradients_X(self, dL_dK, X, X2=None):
# """derivative of the covariance matrix with respect to X.""" K = self.K(X, X2)
# if X2 is None:
# raise NotImplemented("Periodic kernel: dK_dX not implemented") dL_dK = dL_dK+dL_dK.T
# X2 = X
# def gradients_X_diag(self, dL_dKdiag, X): dX = -np.pi*((dL_dK*K)[:,:,None]*np.sin(2*np.pi/self.period*(X[:,None,:] - X2[None,:,:]))/(2.*np.square(self.lengthscale)*self.period)).sum(1)
# return dX
# raise NotImplemented("Periodic kernel: dKdiag_dX not implemented")
def gradients_X_diag(self, dL_dKdiag, X):
return np.zeros(X.shape)
def input_sensitivity(self, summarize=True):
return self.variance*np.ones(self.input_dim)/self.lengthscale**2
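With the parameters now named period and lengthscale, a minimal usage sketch of the standard periodic kernel (constructor keyword names assumed from this diff, not guaranteed elsewhere):
import numpy as np
import GPy
# build a 1D standard periodic kernel; keyword names follow the renaming in this diff
k = GPy.kern.StdPeriodic(input_dim=1, variance=1.0, period=2.0, lengthscale=0.5)
X = np.linspace(0, 4, 5)[:, None]
print(k.K(X))                  # covariance repeats with period 2
print(k.input_sensitivity())   # variance / lengthscale**2 per input dimension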

View file

@ -81,6 +81,52 @@ class White(Static):
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
self.variance.gradient = dL_dpsi0.sum() self.variance.gradient = dL_dpsi0.sum()
class WhiteHeteroscedastic(Static):
def __init__(self, input_dim, num_data, variance=1., active_dims=None, name='white_hetero'):
"""
A heteroscedastic White kernel (nugget/noise).
It defines one variance (nugget) per input sample.
Prediction excludes any noise learnt by this Kernel, so be careful using this kernel.
You can plot the errors learnt by this kernel with something like:
plt.errorbar(m.X, m.Y, yerr=2*np.sqrt(m.kern.white.variance))
"""
super(Static, self).__init__(input_dim, active_dims, name)
self.variance = Param('variance', np.ones(num_data) * variance, Logexp())
self.link_parameters(self.variance)
def Kdiag(self, X):
if X.shape[0] == self.variance.shape[0]:
# If the input has the same number of samples as
# the number of variances, we return the variances
return self.variance
return 0.
def K(self, X, X2=None):
if X2 is None and X.shape[0] == self.variance.shape[0]:
return np.eye(X.shape[0]) * self.variance
else:
return 0.
def psi2(self, Z, variational_posterior):
return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)
def psi2n(self, Z, variational_posterior):
return np.zeros((1, Z.shape[0], Z.shape[0]), dtype=np.float64)
def update_gradients_full(self, dL_dK, X, X2=None):
if X2 is None:
self.variance.gradient = np.diagonal(dL_dK)
else:
self.variance.gradient = 0.
def update_gradients_diag(self, dL_dKdiag, X):
self.variance.gradient = dL_dKdiag
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
self.variance.gradient = dL_dpsi0
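A minimal usage sketch for the new heteroscedastic white kernel, assuming it is exposed as GPy.kern.WhiteHeteroscedastic and added to a smooth kernel; the attribute name white_hetero below is just the kernel's default name:
import numpy as np
import GPy
N = 50
X = np.linspace(0, 10, N)[:, None]
Y = np.sin(X) + np.random.randn(N, 1) * np.linspace(0.05, 0.5, N)[:, None]
# smooth component plus one noise (nugget) variance per training sample
kern = GPy.kern.RBF(1) + GPy.kern.WhiteHeteroscedastic(input_dim=1, num_data=N)
m = GPy.models.GPRegression(X, Y, kern)
m.optimize()
print(m.kern.white_hetero.variance)   # per-sample noise, usable as error bars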
class Bias(Static): class Bias(Static):
def __init__(self, input_dim, variance=1., active_dims=None, name='bias'): def __init__(self, input_dim, variance=1., active_dims=None, name='bias'):
super(Bias, self).__init__(input_dim, variance, active_dims, name) super(Bias, self).__init__(input_dim, variance, active_dims, name)

View file

@ -81,11 +81,11 @@ class Stationary(Kern):
def dK_dr(self, r): def dK_dr(self, r):
raise NotImplementedError("implement derivative of the covariance function wrt r to use this class") raise NotImplementedError("implement derivative of the covariance function wrt r to use this class")
@Cache_this(limit=20, ignore_args=()) @Cache_this(limit=3, ignore_args=())
def dK2_drdr(self, r): def dK2_drdr(self, r):
raise NotImplementedError("implement second derivative of covariance wrt r to use this method") raise NotImplementedError("implement second derivative of covariance wrt r to use this method")
@Cache_this(limit=5, ignore_args=()) @Cache_this(limit=3, ignore_args=())
def K(self, X, X2=None): def K(self, X, X2=None):
""" """
Kernel function applied on inputs X and X2. Kernel function applied on inputs X and X2.
@ -99,6 +99,9 @@ class Stationary(Kern):
@Cache_this(limit=3, ignore_args=()) @Cache_this(limit=3, ignore_args=())
def dK_dr_via_X(self, X, X2): def dK_dr_via_X(self, X, X2):
"""
compute dK/dr evaluated at the scaled distance between X and X2
"""
#a convenience function, so we can cache dK_dr #a convenience function, so we can cache dK_dr
return self.dK_dr(self._scaled_dist(X, X2)) return self.dK_dr(self._scaled_dist(X, X2))
@ -312,11 +315,23 @@ class Exponential(Stationary):
super(Exponential, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name) super(Exponential, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
def K_of_r(self, r): def K_of_r(self, r):
return self.variance * np.exp(-0.5 * r) return self.variance * np.exp(-r)
def dK_dr(self, r): def dK_dr(self, r):
return -0.5*self.K_of_r(r) return -self.K_of_r(r)
# def sde(self):
# """
# Return the state space representation of the covariance.
# """
# F = np.array([[-1/self.lengthscale]])
# L = np.array([[1]])
# Qc = np.array([[2*self.variance/self.lengthscale]])
# H = np.array([[1]])
# Pinf = np.array([[self.variance]])
# # TODO: return the derivatives as well
#
# return (F, L, Qc, H, Pinf)
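The commented-out block is the usual Ornstein-Uhlenbeck state space form of the exponential kernel; as a quick standalone check (assumed toy values), its stationary covariance satisfies the Lyapunov equation F*Pinf + Pinf*F^T + L*Qc*L^T = 0:
import numpy as np
lengthscale, variance = 0.7, 1.3
F = np.array([[-1.0 / lengthscale]])
L = np.array([[1.0]])
Qc = np.array([[2.0 * variance / lengthscale]])
Pinf = np.array([[variance]])
print(np.allclose(F.dot(Pinf) + Pinf.dot(F.T) + L.dot(Qc).dot(L.T), 0.0))   # True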
@ -385,6 +400,41 @@ class Matern32(Stationary):
F1lower = np.array([f(lower) for f in F1])[:, None] F1lower = np.array([f(lower) for f in F1])[:, None]
return(self.lengthscale ** 3 / (12.*np.sqrt(3) * self.variance) * G + 1. / self.variance * np.dot(Flower, Flower.T) + self.lengthscale ** 2 / (3.*self.variance) * np.dot(F1lower, F1lower.T)) return(self.lengthscale ** 3 / (12.*np.sqrt(3) * self.variance) * G + 1. / self.variance * np.dot(Flower, Flower.T) + self.lengthscale ** 2 / (3.*self.variance) * np.dot(F1lower, F1lower.T))
def sde(self):
"""
Return the state space representation of the covariance.
"""
variance = float(self.variance.values)
lengthscale = float(self.lengthscale.values)
foo = np.sqrt(3.)/lengthscale
F = np.array([[0, 1], [-foo**2, -2*foo]])
L = np.array([[0], [1]])
Qc = np.array([[12.*np.sqrt(3) / lengthscale**3 * variance]])
H = np.array([[1, 0]])
Pinf = np.array([[variance, 0],
[0, 3.*variance/(lengthscale**2)]])
# Allocate space for the derivatives
dF = np.empty([F.shape[0],F.shape[1],2])
dQc = np.empty([Qc.shape[0],Qc.shape[1],2])
dPinf = np.empty([Pinf.shape[0],Pinf.shape[1],2])
# The partial derivatives
dFvariance = np.zeros([2,2])
dFlengthscale = np.array([[0,0],
[6./lengthscale**3,2*np.sqrt(3)/lengthscale**2]])
dQcvariance = np.array([12.*np.sqrt(3)/lengthscale**3])
dQclengthscale = np.array([-3*12*np.sqrt(3)/lengthscale**4*variance])
dPinfvariance = np.array([[1,0],[0,3./lengthscale**2]])
dPinflengthscale = np.array([[0,0],
[0,-6*variance/lengthscale**3]])
# Combine the derivatives
dF[:,:,0] = dFvariance
dF[:,:,1] = dFlengthscale
dQc[:,:,0] = dQcvariance
dQc[:,:,1] = dQclengthscale
dPinf[:,:,0] = dPinfvariance
dPinf[:,:,1] = dPinflengthscale
return (F, L, Qc, H, Pinf, dF, dQc, dPinf)
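A quick sanity check of the sde() representation added above: the returned stationary covariance Pinf should satisfy the Lyapunov equation F*Pinf + Pinf*F^T + L*Qc*L^T = 0 (a sketch, not part of the library):
import numpy as np
import GPy
k = GPy.kern.Matern32(1, variance=2.0, lengthscale=0.5)
F, L, Qc, H, Pinf, dF, dQc, dPinf = k.sde()
residual = F.dot(Pinf) + Pinf.dot(F.T) + L.dot(Qc).dot(L.T)
print(np.allclose(residual, 0.0))   # True if the state space form is consistent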
class Matern52(Stationary): class Matern52(Stationary):
""" """
@ -486,18 +536,21 @@ class RatQuad(Stationary):
self.link_parameters(self.power) self.link_parameters(self.power)
def K_of_r(self, r): def K_of_r(self, r):
r2 = np.power(r, 2.) r2 = np.square(r)
return self.variance*np.power(1. + r2/2., -self.power) # return self.variance*np.power(1. + r2/2., -self.power)
return self.variance*np.exp(-self.power*np.log1p(r2/2.))
def dK_dr(self, r): def dK_dr(self, r):
r2 = np.power(r, 2.) r2 = np.square(r)
return -self.variance*self.power*r*np.power(1. + r2/2., - self.power - 1.) # return -self.variance*self.power*r*np.power(1. + r2/2., - self.power - 1.)
return -self.variance*self.power*r*np.exp(-(self.power+1)*np.log1p(r2/2.))
def update_gradients_full(self, dL_dK, X, X2=None): def update_gradients_full(self, dL_dK, X, X2=None):
super(RatQuad, self).update_gradients_full(dL_dK, X, X2) super(RatQuad, self).update_gradients_full(dL_dK, X, X2)
r = self._scaled_dist(X, X2) r = self._scaled_dist(X, X2)
r2 = np.power(r, 2.) r2 = np.square(r)
dK_dpow = -self.variance * np.power(2., self.power) * np.power(r2 + 2., -self.power) * np.log(0.5*(r2+2.)) # dK_dpow = -self.variance * np.power(2., self.power) * np.power(r2 + 2., -self.power) * np.log(0.5*(r2+2.))
dK_dpow = -self.variance * np.exp(self.power*(np.log(2.)-np.log1p(r2+1)))*np.log1p(r2/2.)
grad = np.sum(dL_dK*dK_dpow) grad = np.sum(dL_dK*dK_dpow)
self.power.gradient = grad self.power.gradient = grad
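The log1p form used above is algebraically identical to the original power form, just better conditioned; a standalone numerical check with assumed values:
import numpy as np
variance, power = 1.5, 2.3
r2 = np.square(np.linspace(0.0, 5.0, 11))
direct = variance * np.power(1. + r2 / 2., -power)
stable = variance * np.exp(-power * np.log1p(r2 / 2.))
print(np.allclose(direct, stable))   # True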

File diff suppressed because it is too large.

View file

@ -54,12 +54,12 @@ class TruncLinear(Kern):
self.add_parameter(self.variances) self.add_parameter(self.variances)
self.add_parameter(self.delta) self.add_parameter(self.delta)
@Cache_this(limit=2) @Cache_this(limit=3)
def K(self, X, X2=None): def K(self, X, X2=None):
XX = self.variances*self._product(X, X2) XX = self.variances*self._product(X, X2)
return XX.sum(axis=-1) return XX.sum(axis=-1)
@Cache_this(limit=2) @Cache_this(limit=3)
def _product(self, X, X2=None): def _product(self, X, X2=None):
if X2 is None: if X2 is None:
X2 = X X2 = X
@ -149,12 +149,12 @@ class TruncLinear_inf(Kern):
self.add_parameter(self.variances) self.add_parameter(self.variances)
# @Cache_this(limit=2) # @Cache_this(limit=3)
def K(self, X, X2=None): def K(self, X, X2=None):
tmp = self._product(X, X2) tmp = self._product(X, X2)
return (self.variances*tmp).sum(axis=-1) return (self.variances*tmp).sum(axis=-1)
# @Cache_this(limit=2) # @Cache_this(limit=3)
def _product(self, X, X2=None): def _product(self, X, X2=None):
if X2 is None: if X2 is None:
X2 = X X2 = X

View file

@ -27,9 +27,6 @@ class Binomial(Likelihood):
super(Binomial, self).__init__(gp_link, 'Binomial') super(Binomial, self).__init__(gp_link, 'Binomial')
def conditional_mean(self, gp, Y_metadata):
return self.gp_link(gp)*Y_metadata['trials']
def pdf_link(self, inv_link_f, y, Y_metadata): def pdf_link(self, inv_link_f, y, Y_metadata):
""" """
Likelihood function given inverse link of f. Likelihood function given inverse link of f.
@ -109,7 +106,7 @@ class Binomial(Likelihood):
N = Y_metadata['trials'] N = Y_metadata['trials']
return -y/np.square(inv_link_f) - (N-y)/np.square(1-inv_link_f) return -y/np.square(inv_link_f) - (N-y)/np.square(1-inv_link_f)
def samples(self, gp, Y_metadata=None): def samples(self, gp, Y_metadata=None, **kw):
""" """
Returns a set of samples of observations based on a given value of the latent variable. Returns a set of samples of observations based on a given value of the latent variable.
@ -123,3 +120,32 @@ class Binomial(Likelihood):
def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None): def exact_inference_gradients(self, dL_dKdiag,Y_metadata=None):
pass pass
def variational_expectations(self, Y, m, v, gh_points=None, Y_metadata=None):
if isinstance(self.gp_link, link_functions.Probit):
if gh_points is None:
gh_x, gh_w = self._gh_points()
else:
gh_x, gh_w = gh_points
gh_w = gh_w / np.sqrt(np.pi)
shape = m.shape
C = np.atleast_1d(Y_metadata['trials'])
m,v,Y, C = m.flatten(), v.flatten(), Y.flatten()[:,None], C.flatten()[:,None]
X = gh_x[None,:]*np.sqrt(2.*v[:,None]) + m[:,None]
p = std_norm_cdf(X)
p = np.clip(p, 1e-9, 1.-1e-9) # for numerical stability
N = std_norm_pdf(X)
#TODO: missing nchoosek coefficient! use gammaln?
F = (Y*np.log(p) + (C-Y)*np.log(1.-p)).dot(gh_w)
NoverP = N/p
NoverP_ = N/(1.-p)
dF_dm = (Y*NoverP - (C-Y)*NoverP_).dot(gh_w)
dF_dv = -0.5* ( Y*(NoverP**2 + NoverP*X) + (C-Y)*(NoverP_**2 - NoverP_*X) ).dot(gh_w)
return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape), None
else:
raise NotImplementedError
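The method above evaluates the probit-Binomial variational expectation by Gauss-Hermite quadrature; a standalone sketch of the same idea with assumed toy values (the binomial coefficient is dropped, as flagged in the TODO):
import numpy as np
from scipy.stats import norm
m, v, y, trials = 0.3, 0.5, 4, 10
gh_x, gh_w = np.polynomial.hermite.hermgauss(20)
f = gh_x * np.sqrt(2. * v) + m                     # quadrature nodes for N(m, v)
p = np.clip(norm.cdf(f), 1e-9, 1. - 1e-9)          # probit link, clipped for stability
logpdf = y * np.log(p) + (trials - y) * np.log(1. - p)
print((gh_w / np.sqrt(np.pi)).dot(logpdf))         # approx. E[log p(y|f)]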

View file

@ -22,3 +22,5 @@ from .gp_var_gauss import GPVariationalGaussianApproximation
from .one_vs_all_classification import OneVsAllClassification from .one_vs_all_classification import OneVsAllClassification
from .one_vs_all_sparse_classification import OneVsAllSparseClassification from .one_vs_all_sparse_classification import OneVsAllSparseClassification
from .dpgplvm import DPBayesianGPLVM from .dpgplvm import DPBayesianGPLVM
from .state_space_model import StateSpace

View file

@ -61,7 +61,7 @@ class BayesianGPLVM(SparseGP_MPI):
else: else:
from ..inference.latent_function_inference.var_dtc import VarDTC from ..inference.latent_function_inference.var_dtc import VarDTC
self.logger.debug("creating inference_method var_dtc") self.logger.debug("creating inference_method var_dtc")
inference_method = VarDTC(limit=1 if not missing_data else Y.shape[1]) inference_method = VarDTC(limit=3 if not missing_data else Y.shape[1])
if isinstance(inference_method,VarDTC_minibatch): if isinstance(inference_method,VarDTC_minibatch):
inference_method.mpi_comm = mpi_comm inference_method.mpi_comm = mpi_comm

View file

@ -40,12 +40,13 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
Z = np.random.permutation(X.copy())[:num_inducing] Z = np.random.permutation(X.copy())[:num_inducing]
assert Z.shape[1] == X.shape[1] assert Z.shape[1] == X.shape[1]
if X_variance == False: if X_variance is False:
self.logger.info('no variance on X, activating sparse GPLVM') self.logger.info('no variance on X, activating sparse GPLVM')
X = Param("latent space", X) X = Param("latent space", X)
elif X_variance is None: else:
self.logger.info("initializing latent space variance ~ uniform(0,.1)") if X_variance is None:
X_variance = np.random.uniform(0,.1,X.shape) self.logger.info("initializing latent space variance ~ uniform(0,.1)")
X_variance = np.random.uniform(0,.1,X.shape)
self.variational_prior = NormalPrior() self.variational_prior = NormalPrior()
X = NormalPosterior(X, X_variance) X = NormalPosterior(X, X_variance)
@ -61,7 +62,7 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
if inference_method is None: if inference_method is None:
from ..inference.latent_function_inference.var_dtc import VarDTC from ..inference.latent_function_inference.var_dtc import VarDTC
self.logger.debug("creating inference_method var_dtc") self.logger.debug("creating inference_method var_dtc")
inference_method = VarDTC(limit=1 if not missing_data else Y.shape[1]) inference_method = VarDTC(limit=3 if not missing_data else Y.shape[1])
super(BayesianGPLVMMiniBatch,self).__init__(X, Y, Z, kernel, likelihood=likelihood, super(BayesianGPLVMMiniBatch,self).__init__(X, Y, Z, kernel, likelihood=likelihood,
name=name, inference_method=inference_method, name=name, inference_method=inference_method,
@ -71,13 +72,13 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
self.X = X self.X = X
self.link_parameter(self.X, 0) self.link_parameter(self.X, 0)
def set_X_gradients(self, X, X_grad): #def set_X_gradients(self, X, X_grad):
"""Set the gradients of the posterior distribution of X in its specific form.""" # """Set the gradients of the posterior distribution of X in its specific form."""
X.mean.gradient, X.variance.gradient = X_grad # X.mean.gradient, X.variance.gradient = X_grad
def get_X_gradients(self, X): #def get_X_gradients(self, X):
"""Get the gradients of the posterior distribution of X in its specific form.""" # """Get the gradients of the posterior distribution of X in its specific form."""
return X.mean.gradient, X.variance.gradient # return X.mean.gradient, X.variance.gradient
def _outer_values_update(self, full_values): def _outer_values_update(self, full_values):
""" """
@ -106,7 +107,7 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
super(BayesianGPLVMMiniBatch,self).parameters_changed() super(BayesianGPLVMMiniBatch,self).parameters_changed()
kl_fctr = self.kl_factr kl_fctr = self.kl_factr
if kl_fctr > 0: if kl_fctr > 0 and self.has_uncertain_inputs():
Xgrad = self.X.gradient.copy() Xgrad = self.X.gradient.copy()
self.X.gradient[:] = 0 self.X.gradient[:] = 0
self.variational_prior.update_gradients_KL(self.X) self.variational_prior.update_gradients_KL(self.X)
@ -122,8 +123,8 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
if self.missing_data or not self.stochastics: if self.missing_data or not self.stochastics:
self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X) self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)
elif self.stochastics: else: #self.stochastics is given:
d = self.output_dim d = self.output_dim
self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)*self.stochastics.batchsize/d self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)*self.stochastics.batchsize/d
self._Xgrad = self.X.gradient.copy() self._Xgrad = self.X.gradient.copy()

View file

@ -28,7 +28,7 @@ class GPVariationalGaussianApproximation(GP):
self.beta = Param('beta', np.ones(num_data)) self.beta = Param('beta', np.ones(num_data))
inf = VarGauss(self.alpha, self.beta) inf = VarGauss(self.alpha, self.beta)
super(GPVariationalGaussianApproximation, self).__init__(X, Y, kernel, likelihood, name='VarGP', inference_method=inf) super(GPVariationalGaussianApproximation, self).__init__(X, Y, kernel, likelihood, name='VarGP', inference_method=inf, Y_metadata=Y_metadata)
self.link_parameter(self.alpha) self.link_parameter(self.alpha)
self.link_parameter(self.beta) self.link_parameter(self.beta)

View file

@ -41,4 +41,4 @@ class GPLVM(GP):
def parameters_changed(self): def parameters_changed(self):
super(GPLVM, self).parameters_changed() super(GPLVM, self).parameters_changed()
self.X.gradient = self.kern.gradients_X(self.grad_dict['dL_dK'], self.X, None) self.X.gradient = self.kern.gradients_X(self.grad_dict['dL_dK'], self.X, None)

View file

@ -127,8 +127,6 @@ class MRD(BayesianGPLVMMiniBatch):
self.unlink_parameter(self.likelihood) self.unlink_parameter(self.likelihood)
self.unlink_parameter(self.kern) self.unlink_parameter(self.kern)
del self.kern
del self.likelihood
self.num_data = Ylist[0].shape[0] self.num_data = Ylist[0].shape[0]
if isinstance(batchsize, int): if isinstance(batchsize, int):
@ -156,7 +154,11 @@ class MRD(BayesianGPLVMMiniBatch):
self.link_parameter(spgp, i+2) self.link_parameter(spgp, i+2)
self.bgplvms.append(spgp) self.bgplvms.append(spgp)
self.posterior = None b = self.bgplvms[0]
self.posterior = b.posterior
self.kern = b.kern
self.likelihood = b.likelihood
self.logger.info("init done") self.logger.info("init done")
def parameters_changed(self): def parameters_changed(self):
@ -236,7 +238,7 @@ class MRD(BayesianGPLVMMiniBatch):
# sharex=sharex, sharey=sharey) # sharex=sharex, sharey=sharey)
# return fig # return fig
def plot_scales(self, titles=None, fig_kwargs=dict(figsize=None, tight_layout=True), **kwargs): def plot_scales(self, titles=None, fig_kwargs={}, **kwargs):
""" """
Plot input sensitivity for all datasets, to see which input dimensions are Plot input sensitivity for all datasets, to see which input dimensions are
significant for which dataset. significant for which dataset.
@ -252,12 +254,9 @@ class MRD(BayesianGPLVMMiniBatch):
M = len(self.bgplvms) M = len(self.bgplvms)
fig = pl().figure(rows=1, cols=M, **fig_kwargs) fig = pl().figure(rows=1, cols=M, **fig_kwargs)
plots = {}
for c in range(M): for c in range(M):
canvas = self.bgplvms[c].kern.plot_ARD(title=titles[c], figure=fig, col=c+1, **kwargs) canvas = self.bgplvms[c].kern.plot_ARD(title=titles[c], figure=fig, col=c+1, **kwargs)
plots[titles[c]] = canvas return canvas
pl().show_canvas(canvas)
return plots
def plot_latent(self, labels=None, which_indices=None, def plot_latent(self, labels=None, which_indices=None,
resolution=60, legend=True, resolution=60, legend=True,

View file

@ -41,11 +41,12 @@ class SparseGPMiniBatch(SparseGP):
def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None, def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
name='sparse gp', Y_metadata=None, normalizer=False, name='sparse gp', Y_metadata=None, normalizer=False,
missing_data=False, stochastic=False, batchsize=1): missing_data=False, stochastic=False, batchsize=1):
self._update_stochastics = False
# pick a sensible inference method # pick a sensible inference method
if inference_method is None: if inference_method is None:
if isinstance(likelihood, likelihoods.Gaussian): if isinstance(likelihood, likelihoods.Gaussian):
inference_method = var_dtc.VarDTC(limit=1 if not missing_data else Y.shape[1]) inference_method = var_dtc.VarDTC(limit=3 if not missing_data else Y.shape[1])
else: else:
#inference_method = ?? #inference_method = ??
raise NotImplementedError("what to do what to do?") raise NotImplementedError("what to do what to do?")
@ -73,7 +74,14 @@ class SparseGPMiniBatch(SparseGP):
logger.info("Adding Z as parameter") logger.info("Adding Z as parameter")
self.link_parameter(self.Z, index=0) self.link_parameter(self.Z, index=0)
self.posterior = None self.posterior = None
def optimize(self, optimizer=None, start=None, **kwargs):
try:
self._update_stochastics = True
SparseGP.optimize(self, optimizer=optimizer, start=start, **kwargs)
finally:
self._update_stochastics = False
def has_uncertain_inputs(self): def has_uncertain_inputs(self):
return isinstance(self.X, VariationalPosterior) return isinstance(self.X, VariationalPosterior)
@ -226,16 +234,16 @@ class SparseGPMiniBatch(SparseGP):
woodbury_inv = self.posterior._woodbury_inv woodbury_inv = self.posterior._woodbury_inv
woodbury_vector = self.posterior._woodbury_vector woodbury_vector = self.posterior._woodbury_vector
if not self.stochastics: #if not self.stochastics:
m_f = lambda i: "Inference with missing_data: {: >7.2%}".format(float(i+1)/self.output_dim) # m_f = lambda i: "Inference with missing_data: {: >7.2%}".format(float(i+1)/self.output_dim)
message = m_f(-1) # message = m_f(-1)
print(message, end=' ') # print(message, end=' ')
for d, ninan in self.stochastics.d: for d, ninan in self.stochastics.d:
if not self.stochastics: #if not self.stochastics:
print(' '*(len(message)) + '\r', end=' ') # print(' '*(len(message)) + '\r', end=' ')
message = m_f(d) # message = m_f(d)
print(message, end=' ') # print(message, end=' ')
psi0ni = self.psi0[ninan] psi0ni = self.psi0[ninan]
psi1ni = self.psi1[ninan] psi1ni = self.psi1[ninan]
@ -262,8 +270,8 @@ class SparseGPMiniBatch(SparseGP):
woodbury_vector[:, d] = posterior.woodbury_vector woodbury_vector[:, d] = posterior.woodbury_vector
self._log_marginal_likelihood += log_marginal_likelihood self._log_marginal_likelihood += log_marginal_likelihood
if not self.stochastics: #if not self.stochastics:
print('') # print('')
if self.posterior is None: if self.posterior is None:
self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
@ -314,6 +322,8 @@ class SparseGPMiniBatch(SparseGP):
if self.missing_data: if self.missing_data:
self._outer_loop_for_missing_data() self._outer_loop_for_missing_data()
elif self.stochastics: elif self.stochastics:
if self._update_stochastics:
self.stochastics.do_stochastics()
self._outer_loop_without_missing_data() self._outer_loop_without_missing_data()
else: else:
self.posterior, self._log_marginal_likelihood, self.grad_dict = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata) self.posterior, self._log_marginal_likelihood, self.grad_dict = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata)

View file

@ -30,7 +30,7 @@ class SparseGPRegression(SparseGP_MPI):
""" """
def __init__(self, X, Y, kernel=None, Z=None, num_inducing=10, X_variance=None, normalizer=None, mpi_comm=None): def __init__(self, X, Y, kernel=None, Z=None, num_inducing=10, X_variance=None, normalizer=None, mpi_comm=None, name='sparse_gp'):
num_data, input_dim = X.shape num_data, input_dim = X.shape
# kern defaults to rbf (plus white for stability) # kern defaults to rbf (plus white for stability)
@ -55,11 +55,11 @@ class SparseGPRegression(SparseGP_MPI):
else: else:
infr = VarDTC() infr = VarDTC()
SparseGP_MPI.__init__(self, X, Y, Z, kernel, likelihood, inference_method=infr, normalizer=normalizer, mpi_comm=mpi_comm) SparseGP_MPI.__init__(self, X, Y, Z, kernel, likelihood, inference_method=infr, normalizer=normalizer, mpi_comm=mpi_comm, name=name)
def parameters_changed(self): def parameters_changed(self):
from ..inference.latent_function_inference.var_dtc_parallel import update_gradients_sparsegp,VarDTC_minibatch from ..inference.latent_function_inference.var_dtc_parallel import update_gradients_sparsegp,VarDTC_minibatch
if isinstance(self.inference_method,VarDTC_minibatch): if isinstance(self.inference_method,VarDTC_minibatch):
update_gradients_sparsegp(self, mpi_comm=self.mpi_comm) update_gradients_sparsegp(self, mpi_comm=self.mpi_comm)
else: else:
super(SparseGPRegression, self).parameters_changed() super(SparseGPRegression, self).parameters_changed()

View file

@ -4,6 +4,7 @@
import sys import sys
from .sparse_gp_regression import SparseGPRegression from .sparse_gp_regression import SparseGPRegression
from ..core import Param
class SparseGPLVM(SparseGPRegression): class SparseGPLVM(SparseGPRegression):
""" """
@ -21,7 +22,9 @@ class SparseGPLVM(SparseGPRegression):
if X is None: if X is None:
from ..util.initialization import initialize_latent from ..util.initialization import initialize_latent
X, fracs = initialize_latent(init, input_dim, Y) X, fracs = initialize_latent(init, input_dim, Y)
X = Param('latent space', X)
SparseGPRegression.__init__(self, X, Y, kernel=kernel, num_inducing=num_inducing) SparseGPRegression.__init__(self, X, Y, kernel=kernel, num_inducing=num_inducing)
self.link_parameter(self.X, 0)
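With the latent locations wrapped in a Param and linked as the first parameter, they are optimized alongside the kernel; a minimal usage sketch (constructor signature assumed from this diff):
import numpy as np
import GPy
Y = np.random.randn(100, 5)
m = GPy.models.SparseGPLVM(Y, input_dim=2, num_inducing=10)
m.optimize()
print(m.X.shape)   # (100, 2): the learnt latent locations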
def parameters_changed(self): def parameters_changed(self):
super(SparseGPLVM, self).parameters_changed() super(SparseGPLVM, self).parameters_changed()

View file

@ -94,6 +94,86 @@ class IBPPrior(VariationalPrior):
variational_posterior.tau.gradient[:,0] = -((tau[:,0]-gamma-ad)*polygamma(1,tau[:,0])+common) variational_posterior.tau.gradient[:,0] = -((tau[:,0]-gamma-ad)*polygamma(1,tau[:,0])+common)
variational_posterior.tau.gradient[:,1] = -((tau[:,1]+gamma-2)*polygamma(1,tau[:,1])+common) variational_posterior.tau.gradient[:,1] = -((tau[:,1]+gamma-2)*polygamma(1,tau[:,1])+common)
class SLVMPosterior(SpikeAndSlabPosterior):
'''
The SpikeAndSlab distribution for variational approximations.
'''
def __init__(self, means, variances, binary_prob, tau=None, name='latent space'):
"""
binary_prob : the probability of the distribution on the slab part.
"""
from paramz.transformations import Logexp
super(SLVMPosterior, self).__init__(means, variances, binary_prob, group_spike=False, name=name)
self.tau = Param("tau_", np.ones((self.gamma.shape[1],2)), Logexp())
self.link_parameter(self.tau)
def set_gradients(self, grad):
self.mean.gradient, self.variance.gradient, self.gamma.gradient, self.tau.gradient = grad
def __getitem__(self, s):
if isinstance(s, (int, slice, tuple, list, np.ndarray)):
import copy
n = self.__new__(self.__class__, self.name)
dc = self.__dict__.copy()
dc['mean'] = self.mean[s]
dc['variance'] = self.variance[s]
dc['binary_prob'] = self.binary_prob[s]
dc['tau'] = self.tau
dc['parameters'] = copy.copy(self.parameters)
n.__dict__.update(dc)
n.parameters[dc['mean']._parent_index_] = dc['mean']
n.parameters[dc['variance']._parent_index_] = dc['variance']
n.parameters[dc['binary_prob']._parent_index_] = dc['binary_prob']
n.parameters[dc['tau']._parent_index_] = dc['tau']
n._gradient_array_ = None
oversize = self.size - self.mean.size - self.variance.size - self.gamma.size - self.tau.size
n.size = n.mean.size + n.variance.size + n.gamma.size+ n.tau.size + oversize
n.ndim = n.mean.ndim
n.shape = n.mean.shape
n.num_data = n.mean.shape[0]
n.input_dim = n.mean.shape[1] if n.ndim != 1 else 1
return n
else:
return super(SLVMPosterior, self).__getitem__(s)
class SLVMPrior(VariationalPrior):
def __init__(self, input_dim, alpha =1., beta=1., Z=None, name='SLVMPrior', **kw):
super(SLVMPrior, self).__init__(name=name, **kw)
self.input_dim = input_dim
self.variance = 1.
self.alpha = alpha
self.beta = beta
self.Z = Z
if Z is not None:
assert np.all(np.unique(Z)==np.array([0,1]))
def KL_divergence(self, variational_posterior):
mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma.values, variational_posterior.tau.values
var_mean = np.square(mu)/self.variance
var_S = (S/self.variance - np.log(S))
part1 = (gamma* (np.log(self.variance)-1. +var_mean + var_S)).sum()/2.
from scipy.special import betaln,digamma
part2 = (gamma*np.log(gamma)).sum() + ((1.-gamma)*np.log(1.-gamma)).sum() + betaln(self.alpha,self.beta)*self.input_dim \
-betaln(tau[:,0], tau[:,1]).sum() + ((tau[:,0]-(gamma*self.Z).sum(0)-self.alpha)*digamma(tau[:,0])).sum() + \
((tau[:,1]-((1-gamma)*self.Z).sum(0)-self.beta)*digamma(tau[:,1])).sum() + ((self.Z.sum(0)+self.alpha+self.beta-tau[:,0]-tau[:,1])*digamma(tau.sum(axis=1))).sum()
return part1+part2
def update_gradients_KL(self, variational_posterior):
mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma.values, variational_posterior.tau.values
variational_posterior.mean.gradient -= gamma*mu/self.variance
variational_posterior.variance.gradient -= (1./self.variance - 1./S) * gamma /2.
from scipy.special import digamma,polygamma
dgamma = np.log(gamma/(1.-gamma))+ (digamma(tau[:,1])-digamma(tau[:,0]))*self.Z
variational_posterior.binary_prob.gradient -= dgamma+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
common = (self.Z.sum(0)+self.alpha+self.beta-tau[:,0]-tau[:,1])*polygamma(1,tau.sum(axis=1))
variational_posterior.tau.gradient[:,0] = -((tau[:,0]-(gamma*self.Z).sum(0)-self.alpha)*polygamma(1,tau[:,0])+common)
variational_posterior.tau.gradient[:,1] = -((tau[:,1]-((1-gamma)*self.Z).sum(0)-self.beta)*polygamma(1,tau[:,1])+common)
class SSGPLVM(SparseGP_MPI): class SSGPLVM(SparseGP_MPI):
""" """
Spike-and-Slab Gaussian Process Latent Variable Model Spike-and-Slab Gaussian Process Latent Variable Model
@ -107,7 +187,7 @@ class SSGPLVM(SparseGP_MPI):
""" """
def __init__(self, Y, input_dim, X=None, X_variance=None, Gamma=None, init='PCA', num_inducing=10, def __init__(self, Y, input_dim, X=None, X_variance=None, Gamma=None, init='PCA', num_inducing=10,
Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike_and_Slab GPLVM', group_spike=False, IBP=False, alpha=2., tau=None, mpi_comm=None, pi=None, learnPi=False,normalizer=False, sharedX=False, variational_prior=None,**kwargs): Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike_and_Slab GPLVM', group_spike=False, IBP=False,SLVM=False, alpha=2., beta=2., connM=None, tau=None, mpi_comm=None, pi=None, learnPi=False,normalizer=False, sharedX=False, variational_prior=None,**kwargs):
self.group_spike = group_spike self.group_spike = group_spike
self.init = init self.init = init
@ -152,6 +232,9 @@ class SSGPLVM(SparseGP_MPI):
if IBP: if IBP:
self.variational_prior = IBPPrior(input_dim=input_dim, alpha=alpha) if variational_prior is None else variational_prior self.variational_prior = IBPPrior(input_dim=input_dim, alpha=alpha) if variational_prior is None else variational_prior
X = IBPPosterior(X, X_variance, gamma, tau=tau,sharedX=sharedX) X = IBPPosterior(X, X_variance, gamma, tau=tau,sharedX=sharedX)
elif SLVM:
self.variational_prior = SLVMPrior(input_dim=input_dim, alpha=alpha, beta=beta, Z=connM) if variational_prior is None else variational_prior
X = SLVMPosterior(X, X_variance, gamma, tau=tau)
else: else:
self.variational_prior = SpikeAndSlabPrior(pi=pi,learnPi=learnPi, group_spike=group_spike) if variational_prior is None else variational_prior self.variational_prior = SpikeAndSlabPrior(pi=pi,learnPi=learnPi, group_spike=group_spike) if variational_prior is None else variational_prior
X = SpikeAndSlabPosterior(X, X_variance, gamma, group_spike=group_spike,sharedX=sharedX) X = SpikeAndSlabPosterior(X, X_variance, gamma, group_spike=group_spike,sharedX=sharedX)

GPy/models/state_space.py (new file, 745 lines)
View file

@ -0,0 +1,745 @@
# Copyright (c) 2013, Arno Solin.
# Licensed under the BSD 3-clause license (see LICENSE.txt)
#
# This implementation of converting GPs to state space models is based on the article:
#
# @article{Sarkka+Solin+Hartikainen:2013,
# author = {Simo S\"arkk\"a and Arno Solin and Jouni Hartikainen},
# year = {2013},
# title = {Spatiotemporal learning via infinite-dimensional {B}ayesian filtering and smoothing},
# journal = {IEEE Signal Processing Magazine},
# volume = {30},
# number = {4},
# pages = {51--61}
# }
#
import numpy as np
from scipy import linalg
from ..core import Model
from .. import kern
from GPy.plotting.matplot_dep.models_plots import gpplot
from GPy.plotting.matplot_dep.base_plots import x_frame1D
from GPy.plotting.matplot_dep import Tango
import pylab as pb
from GPy.core.parameterization.param import Param
class StateSpace(Model):
def __init__(self, X, Y, kernel=None, sigma2=1.0, name='StateSpace'):
super(StateSpace, self).__init__(name=name)
self.num_data, input_dim = X.shape
assert input_dim==1, "State space methods for time only"
num_data_Y, self.output_dim = Y.shape
assert num_data_Y == self.num_data, "X and Y data don't match"
assert self.output_dim == 1, "State space methods for single outputs only"
# Make sure the observations are ordered in time
sort_index = np.argsort(X[:,0])
self.X = X[sort_index]
self.Y = Y[sort_index]
# Noise variance
self.sigma2 = Param('Gaussian_noise', sigma2)
self.link_parameter(self.sigma2)
# Default kernel
if kernel is None:
self.kern = kern.Matern32(1)
else:
self.kern = kernel
self.link_parameter(self.kern)
self.sigma2.constrain_positive()
# Assert that the kernel is supported
if not hasattr(self.kern, 'sde'):
raise NotImplementedError('SDE must be implemented for the kernel being used')
#assert self.kern.sde() not False, "This kernel is not supported for state space estimation"
def parameters_changed(self):
"""
Parameters have now changed
"""
# Get the model matrices from the kernel
(F,L,Qc,H,Pinf,dF,dQc,dPinf) = self.kern.sde()
# Use the Kalman filter to evaluate the likelihood
self._log_marginal_likelihood = self.kf_likelihood(F,L,Qc,H,self.sigma2,Pinf,self.X.T,self.Y.T)
gradients = self.compute_gradients()
self.sigma2.gradient_full[:] = gradients[-1]
self.kern.gradient_full[:] = gradients[:-1]
def log_likelihood(self):
return self._log_marginal_likelihood
def compute_gradients(self):
# Get the model matrices from the kernel
(F,L,Qc,H,Pinf,dFt,dQct,dPinft) = self.kern.sde()
# Allocate space for the full partial derivative matrices
dF = np.zeros([dFt.shape[0],dFt.shape[1],dFt.shape[2]+1])
dQc = np.zeros([dQct.shape[0],dQct.shape[1],dQct.shape[2]+1])
dPinf = np.zeros([dPinft.shape[0],dPinft.shape[1],dPinft.shape[2]+1])
# Assign the values for the kernel function
dF[:,:,:-1] = dFt
dQc[:,:,:-1] = dQct
dPinf[:,:,:-1] = dPinft
# The sigma2 derivative
dR = np.zeros([1,1,dF.shape[2]])
dR[:,:,-1] = 1
# Calculate the likelihood gradients
gradients = self.kf_likelihood_g(F,L,Qc,H,self.sigma2,Pinf,dF,dQc,dPinf,dR,self.X.T,self.Y.T)
return gradients
def predict_raw(self, Xnew, Ynew=None, filteronly=False):
# Set defaults
if Ynew is None:
Ynew = self.Y
# Make a single matrix containing training and testing points
X = np.vstack((self.X, Xnew))
Y = np.vstack((Ynew, np.nan*np.zeros(Xnew.shape)))
# Sort the matrix (save the order)
_, return_index, return_inverse = np.unique(X,True,True)
X = X[return_index]
Y = Y[return_index]
# Get the model matrices from the kernel
(F,L,Qc,H,Pinf,dF,dQc,dPinf) = self.kern.sde()
# Run the Kalman filter
(M, P) = self.kalman_filter(F,L,Qc,H,self.sigma2,Pinf,X.T,Y.T)
# Run the Rauch-Tung-Striebel smoother
if not filteronly:
(M, P) = self.rts_smoother(F,L,Qc,X.T,M,P)
# Put the data back in the original order
M = M[:,return_inverse]
P = P[:,:,return_inverse]
# Only return the values for Xnew
M = M[:,self.num_data:]
P = P[:,:,self.num_data:]
# Calculate the mean and variance
m = H.dot(M).T
V = np.tensordot(H[0],P,(0,0))
V = np.tensordot(V,H[0],(0,0))
V = V[:,None]
# Return the posterior of the state
return (m, V)
def predict(self, Xnew, filteronly=False):
# Run the Kalman filter to get the state
(m, V) = self.predict_raw(Xnew,filteronly=filteronly)
# Add the noise variance to the state variance
V += self.sigma2
# Lower and upper bounds
lower = m - 2*np.sqrt(V)
upper = m + 2*np.sqrt(V)
# Return mean and variance
return (m, V, lower, upper)
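A minimal usage sketch of the state space model, assuming it is exposed as GPy.models.StateSpace and used with a kernel that implements sde() (e.g. the Matern32 above):
import numpy as np
import GPy
X = np.linspace(0, 10, 200)[:, None]
Y = np.sin(X) + 0.1 * np.random.randn(*X.shape)
m = GPy.models.StateSpace(X, Y, kernel=GPy.kern.Matern32(1))
m.optimize()
mu, V, lower, upper = m.predict(np.linspace(0, 12, 50)[:, None])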
def plot(self, plot_limits=None, levels=20, samples=0, fignum=None,
ax=None, resolution=None, plot_raw=False, plot_filter=False,
linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
# Deal with optional parameters
if ax is None:
fig = pb.figure(num=fignum)
ax = fig.add_subplot(111)
# Define the frame on which to plot
resolution = resolution or 200
Xgrid, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
# Make a prediction on the frame and plot it
if plot_raw:
m, v = self.predict_raw(Xgrid,filteronly=plot_filter)
lower = m - 2*np.sqrt(v)
upper = m + 2*np.sqrt(v)
Y = self.Y
else:
m, v, lower, upper = self.predict(Xgrid,filteronly=plot_filter)
Y = self.Y
# Plot the values
gpplot(Xgrid, m, lower, upper, axes=ax, edgecol=linecol, fillcol=fillcol)
ax.plot(self.X, self.Y, 'kx', mew=1.5)
# Optionally plot some samples
if samples:
if plot_raw:
Ysim = self.posterior_samples_f(Xgrid, samples)
else:
Ysim = self.posterior_samples(Xgrid, samples)
for yi in Ysim.T:
ax.plot(Xgrid, yi, Tango.colorsHex['darkBlue'], linewidth=0.25)
# Set the limits of the plot to some sensible values
ymin, ymax = min(np.append(Y.flatten(), lower.flatten())), max(np.append(Y.flatten(), upper.flatten()))
ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
ax.set_xlim(xmin, xmax)
ax.set_ylim(ymin, ymax)
def prior_samples_f(self,X,size=10):
# Sort the matrix (save the order)
(_, return_index, return_inverse) = np.unique(X,True,True)
X = X[return_index]
# Get the model matrices from the kernel
(F,L,Qc,H,Pinf,dF,dQc,dPinf) = self.kern.sde()
# Allocate space for results
Y = np.empty((size,X.shape[0]))
# Simulate random draws
#for j in range(0,size):
# Y[j,:] = H.dot(self.simulate(F,L,Qc,Pinf,X.T))
Y = self.simulate(F,L,Qc,Pinf,X.T,size)
# Only observations
Y = np.tensordot(H[0],Y,(0,0))
# Reorder simulated values
Y = Y[:,return_inverse]
# Return trajectory
return Y.T
def posterior_samples_f(self,X,size=10):
# Sort the matrix (save the order)
(_, return_index, return_inverse) = np.unique(X,True,True)
X = X[return_index]
# Get the model matrices from the kernel
(F,L,Qc,H,Pinf,dF,dQc,dPinf) = self.kern.sde()
# Run smoother on original data
(m,V) = self.predict_raw(X)
# Simulate random draws from the GP prior
y = self.prior_samples_f(np.vstack((self.X, X)),size)
# Allocate space for sample trajectories
Y = np.empty((size,X.shape[0]))
# Run the RTS smoother on each of these values
for j in range(0,size):
yobs = y[0:self.num_data,j:j+1] + np.sqrt(self.sigma2)*np.random.randn(self.num_data,1)
(m2,V2) = self.predict_raw(X,Ynew=yobs)
Y[j,:] = m.T + y[self.num_data:,j].T - m2.T
# Reorder simulated values
Y = Y[:,return_inverse]
# Return posterior sample trajectories
return Y.T
def posterior_samples(self, X, size=10):
# Make samples of f
Y = self.posterior_samples_f(X,size)
# Add noise
Y += np.sqrt(self.sigma2)*np.random.randn(Y.shape[0],Y.shape[1])
# Return trajectory
return Y
def kalman_filter(self,F,L,Qc,H,R,Pinf,X,Y):
# KALMAN_FILTER - Run the Kalman filter for a given model and data
# Allocate space for results
MF = np.empty((F.shape[0],Y.shape[1]))
PF = np.empty((F.shape[0],F.shape[0],Y.shape[1]))
# Initialize
MF[:,-1] = np.zeros(F.shape[0])
PF[:,:,-1] = Pinf.copy()
# Time step lengths
dt = np.empty(X.shape)
dt[:,0] = X[:,1]-X[:,0]
dt[:,1:] = np.diff(X)
# Solve the LTI SDE for these time steps
As, Qs, index = self.lti_disc(F,L,Qc,dt)
# Kalman filter
for k in range(0,Y.shape[1]):
# Form discrete-time model
#(A, Q) = self.lti_disc(F,L,Qc,dt[:,k])
A = As[:,:,index[k]];
Q = Qs[:,:,index[k]];
# Prediction step
MF[:,k] = A.dot(MF[:,k-1])
PF[:,:,k] = A.dot(PF[:,:,k-1]).dot(A.T) + Q
# Update step (only if there is data)
if not np.isnan(Y[:,k]):
if Y.shape[0]==1:
K = PF[:,:,k].dot(H.T)/(H.dot(PF[:,:,k]).dot(H.T) + R)
else:
LL = linalg.cho_factor(H.dot(PF[:,:,k]).dot(H.T) + R)
K = linalg.cho_solve(LL, H.dot(PF[:,:,k].T)).T
MF[:,k] += K.dot(Y[:,k]-H.dot(MF[:,k]))
PF[:,:,k] -= K.dot(H).dot(PF[:,:,k])
# Return values
return (MF, PF)
def rts_smoother(self,F,L,Qc,X,MS,PS):
# RTS_SMOOTHER - Run the RTS smoother for a given model and data
# Time step lengths
dt = np.empty(X.shape)
dt[:,0] = X[:,1]-X[:,0]
dt[:,1:] = np.diff(X)
# Solve the LTI SDE for these time steps
As, Qs, index = self.lti_disc(F,L,Qc,dt)
# Sequentially smooth states starting from the end
for k in range(2,X.shape[1]+1):
# Form discrete-time model
#(A, Q) = self.lti_disc(F,L,Qc,dt[:,1-k])
A = As[:,:,index[1-k]];
Q = Qs[:,:,index[1-k]];
# Smoothing step
LL = linalg.cho_factor(A.dot(PS[:,:,-k]).dot(A.T)+Q)
G = linalg.cho_solve(LL,A.dot(PS[:,:,-k])).T
MS[:,-k] += G.dot(MS[:,1-k]-A.dot(MS[:,-k]))
PS[:,:,-k] += G.dot(PS[:,:,1-k]-A.dot(PS[:,:,-k]).dot(A.T)-Q).dot(G.T)
# Return
return (MS, PS)
def kf_likelihood(self,F,L,Qc,H,R,Pinf,X,Y):
# Evaluate marginal likelihood
# Initialize
lik = 0
m = np.zeros((F.shape[0],1))
P = Pinf.copy()
# Time step lengths
dt = np.empty(X.shape)
dt[:,0] = X[:,1]-X[:,0]
dt[:,1:] = np.diff(X)
# Solve the LTI SDE for these time steps
As, Qs, index = self.lti_disc(F,L,Qc,dt)
# Kalman filter for likelihood evaluation
for k in range(0,Y.shape[1]):
# Form discrete-time model
#(A,Q) = self.lti_disc(F,L,Qc,dt[:,k])
A = As[:,:,index[k]];
Q = Qs[:,:,index[k]];
# Prediction step
m = A.dot(m)
P = A.dot(P).dot(A.T) + Q
# Update step only if there is data
if not np.isnan(Y[:,k]):
v = Y[:,k]-H.dot(m)
if Y.shape[0]==1:
S = H.dot(P).dot(H.T) + R
K = P.dot(H.T)/S
lik -= 0.5*np.log(S)
lik -= 0.5*v.shape[0]*np.log(2*np.pi)
lik -= 0.5*v*v/S
else:
LL, isupper = linalg.cho_factor(H.dot(P).dot(H.T) + R)
lik -= np.sum(np.log(np.diag(LL)))
lik -= 0.5*v.shape[0]*np.log(2*np.pi)
lik -= 0.5*linalg.cho_solve((LL, isupper),v).dot(v)
K = linalg.cho_solve((LL, isupper), H.dot(P.T)).T
m += K.dot(v)
P -= K.dot(H).dot(P)
# Return likelihood
return lik[0,0]
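In the scalar branch, the three terms accumulated per step are exactly the Gaussian log-density of the innovation v under variance S; a quick standalone check with assumed values:
import numpy as np
from scipy.stats import norm
v, S = 0.7, 2.3   # innovation and innovation variance
term = -0.5*np.log(S) - 0.5*np.log(2.*np.pi) - 0.5*v*v/S
print(np.isclose(term, norm.logpdf(v, 0.0, np.sqrt(S))))   # True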
def kf_likelihood_g(self,F,L,Qc,H,R,Pinf,dF,dQc,dPinf,dR,X,Y):
# Evaluate marginal likelihood gradient
# State dimension, number of data points and number of parameters
n = F.shape[0]
steps = Y.shape[1]
nparam = dF.shape[2]
# Time steps
t = X.squeeze()
# Allocate space
e = 0
eg = np.zeros(nparam)
# Set up
m = np.zeros([n,1])
P = Pinf.copy()
dm = np.zeros([n,nparam])
dP = dPinf.copy()
mm = m.copy()
PP = P.copy()
# Initial dt
dt = -np.Inf
# Allocate space for expm results
AA = np.zeros([2*n, 2*n, nparam])
FF = np.zeros([2*n, 2*n])
# Loop over all observations
for k in range(0,steps):
# The previous time step
dt_old = dt;
# The time discretization step length
if k>0:
dt = t[k]-t[k-1]
else:
dt = 0
# Loop through all parameters (Kalman filter prediction step)
for j in range(0,nparam):
# Should we recalculate the matrix exponential?
if abs(dt-dt_old) > 1e-9:
# The first matrix for the matrix factor decomposition
FF[:n,:n] = F
FF[n:,:n] = dF[:,:,j]
FF[n:,n:] = F
# Solve the matrix exponential
AA[:,:,j] = linalg.expm3(FF*dt)
# Solve the differential equation
foo = AA[:,:,j].dot(np.vstack([m, dm[:,j:j+1]]))
mm = foo[:n,:]
dm[:,j:j+1] = foo[n:,:]
# The discrete-time dynamical model
if j==0:
A = AA[:n,:n,j]
Q = Pinf - A.dot(Pinf).dot(A.T)
PP = A.dot(P).dot(A.T) + Q
# The derivatives of A and Q
dA = AA[n:,:n,j]
dQ = dPinf[:,:,j] - dA.dot(Pinf).dot(A.T) \
- A.dot(dPinf[:,:,j]).dot(A.T) - A.dot(Pinf).dot(dA.T)
# The derivatives of P
dP[:,:,j] = dA.dot(P).dot(A.T) + A.dot(dP[:,:,j]).dot(A.T) \
+ A.dot(P).dot(dA.T) + dQ
# Set predicted m and P
m = mm
P = PP
# Start the Kalman filter update step and precalculate variables
S = H.dot(P).dot(H.T) + R
# We should calculate the Cholesky factor if S is a matrix
# [LS,notposdef] = chol(S,'lower');
# The Kalman filter update (S is scalar)
HtiS = H.T/S
iS = 1/S
K = P.dot(HtiS)
v = Y[:,k]-H.dot(m)
vtiS = v.T/S
# Loop through all parameters (Kalman filter update step derivative)
for j in range(0,nparam):
# Innovation covariance derivative
dS = H.dot(dP[:,:,j]).dot(H.T) + dR[:,:,j];
# Evaluate the energy derivative for j
eg[j] = eg[j] \
- .5*np.sum(iS*dS) \
+ .5*H.dot(dm[:,j:j+1]).dot(vtiS.T) \
+ .5*vtiS.dot(dS).dot(vtiS.T) \
+ .5*vtiS.dot(H.dot(dm[:,j:j+1]))
# Kalman filter update step derivatives
dK = dP[:,:,j].dot(HtiS) - P.dot(HtiS).dot(dS)/S
dm[:,j:j+1] = dm[:,j:j+1] + dK.dot(v) - K.dot(H).dot(dm[:,j:j+1])
dKSKt = dK.dot(S).dot(K.T)
dP[:,:,j] = dP[:,:,j] - dKSKt - K.dot(dS).dot(K.T) - dKSKt.T
# Evaluate the energy
# e = e - .5*S.shape[0]*np.log(2*np.pi) - np.sum(np.log(np.diag(LS))) - .5*vtiS.dot(v);
e = e - .5*S.shape[0]*np.log(2*np.pi) - np.sum(np.log(np.sqrt(S))) - .5*vtiS.dot(v)
# Finish Kalman filter update step
m = m + K.dot(v)
P = P - K.dot(S).dot(K.T)
# Make sure the covariances stay symmetric
P = (P+P.T)/2
dP = (dP + dP.transpose([1,0,2]))/2
# raise NameError('Debug me')
# Return the gradient
return eg
def kf_likelihood_g_notstable(self,F,L,Qc,H,R,Pinf,dF,dQc,dPinf,dR,X,Y):
# Evaluate marginal likelihood gradient
# State dimension, number of data points and number of parameters
steps = Y.shape[1]
nparam = dF.shape[2]
n = F.shape[0]
# Time steps
t = X.squeeze()
# Allocate space
e = 0
eg = np.zeros(nparam)
# Set up
Z = np.zeros(F.shape)
QC = L.dot(Qc).dot(L.T)
m = np.zeros([n,1])
P = Pinf.copy()
dm = np.zeros([n,nparam])
dP = dPinf.copy()
mm = m.copy()
PP = P.copy()
# Initial dt
dt = -np.Inf
# Allocate space for expm results
AA = np.zeros([2*F.shape[0], 2*F.shape[0], nparam])
AAA = np.zeros([4*F.shape[0], 4*F.shape[0], nparam])
FF = np.zeros([2*F.shape[0], 2*F.shape[0]])
FFF = np.zeros([4*F.shape[0], 4*F.shape[0]])
# Loop over all observations
for k in range(0,steps):
# The previous time step
dt_old = dt;
# The time discretization step length
if k>0:
dt = t[k]-t[k-1]
else:
dt = t[1]-t[0]
# Loop through all parameters (Kalman filter prediction step)
for j in range(0,nparam):
# Should we recalculate the matrix exponential?
if abs(dt-dt_old) > 1e-9:
# The first matrix for the matrix factor decomposition
FF[:n,:n] = F
FF[n:,:n] = dF[:,:,j]
FF[n:,n:] = F
# Solve the matrix exponential
AA[:,:,j] = linalg.expm3(FF*dt)
# Solve using matrix fraction decomposition
foo = AA[:,:,j].dot(np.vstack([m, dm[:,j:j+1]]))
# Pick the parts
mm = foo[:n,:]
dm[:,j:j+1] = foo[n:,:]
# Should we recalculate the matrix exponential?
if abs(dt-dt_old) > 1e-9:
# Define W and G
W = L.dot(dQc[:,:,j]).dot(L.T)
G = dF[:,:,j];
# The second matrix for the matrix factor decomposition
FFF[:n,:n] = F
FFF[2*n:-n,:n] = G
FFF[:n, n:2*n] = QC
FFF[n:2*n, n:2*n] = -F.T
FFF[2*n:-n,n:2*n] = W
FFF[-n:, n:2*n] = -G.T
FFF[2*n:-n,2*n:-n] = F
FFF[2*n:-n,-n:] = QC
FFF[-n:,-n:] = -F.T
# Solve the matrix exponential
AAA[:,:,j] = linalg.expm3(FFF*dt)
# Solve using matrix fraction decomposition
foo = AAA[:,:,j].dot(np.vstack([P, np.eye(n), dP[:,:,j], np.zeros([n,n])]))
# Pick the parts
C = foo[:n, :]
D = foo[n:2*n, :]
dC = foo[2*n:-n,:]
dD = foo[-n:, :]
# The prediction step covariance (PP = C/D)
if j==0:
PP = linalg.solve(D.T,C.T).T
PP = (PP + PP.T)/2
# Solve dP for j (C/D == P_{k|k-1})
dP[:,:,j] = linalg.solve(D.T,(dC - PP.dot(dD)).T).T
# Set predicted m and P
m = mm
P = PP
# Start the Kalman filter update step and precalculate variables
S = H.dot(P).dot(H.T) + R
# We should calculate the Cholesky factor if S is a matrix
# [LS,notposdef] = chol(S,'lower');
# The Kalman filter update (S is scalar)
HtiS = H.T/S
iS = 1/S
K = P.dot(HtiS)
v = Y[:,k]-H.dot(m)
vtiS = v.T/S
# Loop through all parameters (Kalman filter update step derivative)
for j in range(0,nparam):
# Innovation covariance derivative
dS = H.dot(dP[:,:,j]).dot(H.T) + dR[:,:,j];
# Evaluate the energy derivative for j
eg[j] = eg[j] \
- .5*np.sum(iS*dS) \
+ .5*H.dot(dm[:,j:j+1]).dot(vtiS.T) \
+ .5*vtiS.dot(dS).dot(vtiS.T) \
+ .5*vtiS.dot(H.dot(dm[:,j:j+1]))
# Kalman filter update step derivatives
dK = dP[:,:,j].dot(HtiS) - P.dot(HtiS).dot(dS)/S
dm[:,j:j+1] = dm[:,j:j+1] + dK.dot(v) - K.dot(H).dot(dm[:,j:j+1])
dKSKt = dK.dot(S).dot(K.T)
dP[:,:,j] = dP[:,:,j] - dKSKt - K.dot(dS).dot(K.T) - dKSKt.T
# Evaluate the energy
# e = e - .5*S.shape[0]*np.log(2*np.pi) - np.sum(np.log(np.diag(LS))) - .5*vtiS.dot(v);
e = e - .5*S.shape[0]*np.log(2*np.pi) - np.sum(np.log(np.sqrt(S))) - .5*vtiS.dot(v)
# Finish Kalman filter update step
m = m + K.dot(v)
P = P - K.dot(S).dot(K.T)
# Make sure the covariances stay symmetric
P = (P+P.T)/2
dP = (dP + dP.transpose([1,0,2]))/2
# raise NameError('Debug me')
# Report
#print e
#print eg
# Return the gradient
return eg
def simulate(self,F,L,Qc,Pinf,X,size=1):
# Simulate a trajectory using the state space model
# Allocate space for results
f = np.zeros((F.shape[0],size,X.shape[1]))
# Initial state
f[:,:,1] = np.linalg.cholesky(Pinf).dot(np.random.randn(F.shape[0],size))
# Time step lengths
dt = np.empty(X.shape)
dt[:,0] = X[:,1]-X[:,0]
dt[:,1:] = np.diff(X)
# Solve the LTI SDE for these time steps
As, Qs, index = self.lti_disc(F,L,Qc,dt)
# Sweep through remaining time points
for k in range(1,X.shape[1]):
# Form discrete-time model
A = As[:,:,index[1-k]]
Q = Qs[:,:,index[1-k]]
# Draw the state
f[:,:,k] = A.dot(f[:,:,k-1]) + np.dot(np.linalg.cholesky(Q),np.random.randn(A.shape[0],size))
# Return values
return f
def lti_disc(self,F,L,Qc,dt):
# Discrete-time solution to the LTI SDE
# Dimensionality
n = F.shape[0]
index = 0
# Check for numbers of time steps
if dt.flatten().shape[0]==1:
# The covariance matrix by matrix fraction decomposition
Phi = np.zeros((2*n,2*n))
Phi[:n,:n] = F
Phi[:n,n:] = L.dot(Qc).dot(L.T)
Phi[n:,n:] = -F.T
AB = linalg.expm(Phi*dt).dot(np.vstack((np.zeros((n,n)),np.eye(n))))
Q = linalg.solve(AB[n:,:].T,AB[:n,:].T)
# The dynamical model
A = linalg.expm(F*dt)
# Return
return A, Q
# Optimize for cases where time steps occur repeatedly
else:
# Time discretizations (round to 14 decimals to avoid problems)
dt, _, index = np.unique(np.round(dt,14),True,True)
# Allocate space for A and Q
A = np.empty((n,n,dt.shape[0]))
Q = np.empty((n,n,dt.shape[0]))
# Call this function for each dt
for j in range(0,dt.shape[0]):
A[:,:,j], Q[:,:,j] = self.lti_disc(F,L,Qc,dt[j])
# Return
return A, Q, index
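For a stationary model the discretized noise covariance from the matrix fraction decomposition should agree with Q = Pinf - A*Pinf*A^T (the identity also used in the gradient code above); a standalone check with the Matern32 matrices and assumed values:
import numpy as np
from scipy import linalg
l, s2, dt = 0.5, 2.0, 0.1
lam = np.sqrt(3.) / l
F = np.array([[0., 1.], [-lam**2, -2.*lam]])
L = np.array([[0.], [1.]])
Qc = np.array([[12.*np.sqrt(3.)/l**3 * s2]])
Pinf = np.array([[s2, 0.], [0., 3.*s2/l**2]])
n = F.shape[0]
Phi = np.zeros((2*n, 2*n))
Phi[:n, :n] = F
Phi[:n, n:] = L.dot(Qc).dot(L.T)
Phi[n:, n:] = -F.T
AB = linalg.expm(Phi*dt).dot(np.vstack((np.zeros((n, n)), np.eye(n))))
Q = linalg.solve(AB[n:, :].T, AB[:n, :].T)
A = linalg.expm(F*dt)
print(np.allclose(Q, Pinf - A.dot(Pinf).dot(A.T)))   # True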

File diff suppressed because it is too large.

View file

@ -0,0 +1,964 @@
# -*- coding: utf-8 -*-
"""
Contains some cython code for state space modelling.
"""
import numpy as np
cimport numpy as np
import scipy as sp
cimport cython
#from libc.math cimport isnan # for nan checking in kalman filter cycle
cdef extern from "numpy/npy_math.h":
bint npy_isnan(double x)
DTYPE = np.float64
DTYPE_int = np.int64
ctypedef np.float64_t DTYPE_t
ctypedef np.int64_t DTYPE_int_t
# Template class for dynamic callables
cdef class Dynamic_Callables_Cython:
cpdef f_a(self, int k, np.ndarray[DTYPE_t, ndim=2] m, np.ndarray[DTYPE_t, ndim=2] A):
raise NotImplementedError("(cython) f_a is not implemented!")
cpdef Ak(self, int k, np.ndarray[DTYPE_t, ndim=2] m, np.ndarray[DTYPE_t, ndim=2] P): # returns state iteration matrix
raise NotImplementedError("(cython) Ak is not implemented!")
cpdef Qk(self, int k):
raise NotImplementedError("(cython) Qk is not implemented!")
cpdef Q_srk(self, int k):
raise NotImplementedError("(cython) Q_srk is not implemented!")
cpdef dAk(self, int k):
raise NotImplementedError("(cython) dAk is not implemented!")
cpdef dQk(self, int k):
raise NotImplementedError("(cython) dQk is not implemented!")
cpdef reset(self, bint compute_derivatives = False):
raise NotImplementedError("(cython) reset is not implemented!")
# Template class for measurement callables
cdef class Measurement_Callables_Cython:
cpdef f_h(self, int k, np.ndarray[DTYPE_t, ndim=2] m_pred, np.ndarray[DTYPE_t, ndim=2] Hk):
raise NotImplementedError("(cython) f_h is not implemented!")
cpdef Hk(self, int k, np.ndarray[DTYPE_t, ndim=2] m_pred, np.ndarray[DTYPE_t, ndim=2] P_pred): # returns measurement matrix
raise NotImplementedError("(cython) Hk is not implemented!")
cpdef Rk(self, int k):
raise NotImplementedError("(cython) Rk is not implemented!")
cpdef R_isrk(self, int k):
raise NotImplementedError("(cython) R_isrk is not implemented!")
cpdef dHk(self, int k):
raise NotImplementedError("(cython) dHk is not implemented!")
cpdef dRk(self, int k):
raise NotImplementedError("(cython) dRk is not implemented!")
cpdef reset(self,compute_derivatives = False):
raise NotImplementedError("(cython) reset is not implemented!")
cdef class R_handling_Cython(Measurement_Callables_Cython):
"""
This class handles the noise matrix R.
"""
cdef:
np.ndarray R
np.ndarray index
int R_time_var_index
np.ndarray dR
bint svd_each_time
dict R_square_root
def __init__(self, np.ndarray[DTYPE_t, ndim=3] R, np.ndarray[DTYPE_t, ndim=2] index,
int R_time_var_index, int p_unique_R_number, np.ndarray[DTYPE_t, ndim=3] dR = None):
"""
Input:
---------------
R - array with noise on various steps. The result of preprocessing
the noise input.
index - for each step of Kalman filter contains the corresponding index
in the array.
R_time_var_index - another index in the array R. Computed earlier and passed here.
unique_R_number - number of unique noise matrices below which square roots
are cached and above which they are computed each time.
dR: 3D array[:, :, param_num]
derivative of R. Derivative is supported only when R do not change over time
Output:
--------------
Object which has the necessary functions:
Rk(k)
dRk(k)
R_isrk(k)
"""
self.R = R
self.index = index
self.R_time_var_index = R_time_var_index
self.dR = dR
cdef int unique_len = len(np.unique(index))
if (unique_len > p_unique_R_number):
self.svd_each_time = True
else:
self.svd_each_time = False
self.R_square_root = {}
cpdef Rk(self, int k):
return self.R[:,:, <int>self.index[self.R_time_var_index, k]]
cpdef dRk(self,int k):
if self.dR is None:
raise ValueError("dR derivative is None")
return self.dR # the same derivative on each iteration
cpdef R_isrk(self, int k):
"""
Function returns the inverse square root of R matrix on step k.
"""
cdef int ind = <int>self.index[self.R_time_var_index, k]
cdef np.ndarray[DTYPE_t, ndim=2] R = self.R[:,:, ind ]
cdef np.ndarray[DTYPE_t, ndim=2] inv_square_root
cdef np.ndarray[DTYPE_t, ndim=2] U
cdef np.ndarray[DTYPE_t, ndim=1] S
cdef np.ndarray[DTYPE_t, ndim=2] Vh
if (R.shape[0] == 1): # 1-D case is handled more simply. No storage
# of the result, just compute it each time.
inv_square_root = np.sqrt( 1.0/R )
else:
if self.svd_each_time:
U,S,Vh = sp.linalg.svd( R,full_matrices=False, compute_uv=True,
overwrite_a=False,check_finite=True)
inv_square_root = U * 1.0/np.sqrt(S)
else:
if ind in self.R_square_root:
inv_square_root = self.R_square_root[ind]
else:
U,S,Vh = sp.linalg.svd( R,full_matrices=False, compute_uv=True,
overwrite_a=False,check_finite=True)
inv_square_root = U * 1.0/np.sqrt(S)
self.R_square_root[ind] = inv_square_root
return inv_square_root
cdef class Std_Measurement_Callables_Cython(R_handling_Cython):
cdef:
np.ndarray H
int H_time_var_index
np.ndarray dH
def __init__(self, np.ndarray[DTYPE_t, ndim=3] H, int H_time_var_index,
np.ndarray[DTYPE_t, ndim=3] R, np.ndarray[DTYPE_t, ndim=2] index, int R_time_var_index,
int unique_R_number, np.ndarray[DTYPE_t, ndim=3] dH = None,
np.ndarray[DTYPE_t, ndim=3] dR=None):
super(Std_Measurement_Callables_Cython,self).__init__(R, index, R_time_var_index, unique_R_number,dR)
self.H = H
self.H_time_var_index = H_time_var_index
self.dH = dH
cpdef f_h(self, int k, np.ndarray[DTYPE_t, ndim=2] m, np.ndarray[DTYPE_t, ndim=2] H):
"""
function (k, x_{k}, H_{k}). Measurement function.
k (iteration number), starts at 0
x_{k} state
H_{k} Jacobian matrices of f_h. In the linear case it is exactly H_{k}.
"""
return np.dot(H, m)
cpdef Hk(self, int k, np.ndarray[DTYPE_t, ndim=2] m_pred, np.ndarray[DTYPE_t, ndim=2] P_pred): # returns measurement matrix
"""
function (k, m, P) returns the Jacobian of the measurement function; it is
passed into f_h.
k (iteration number), starts at 0
m: point where Jacobian is evaluated
P: parameter for Jacobian, usually covariance matrix.
"""
return self.H[:,:, <int>self.index[self.H_time_var_index, k]]
cpdef dHk(self,int k):
if self.dH is None:
raise ValueError("dH derivative is None")
return self.dH # the same derivative on each iteration
cdef class Q_handling_Cython(Dynamic_Callables_Cython):
cdef:
np.ndarray Q
np.ndarray index
int Q_time_var_index
np.ndarray dQ
dict Q_square_root
bint svd_each_time
def __init__(self, np.ndarray[DTYPE_t, ndim=3] Q, np.ndarray[DTYPE_t, ndim=2] index,
int Q_time_var_index, int p_unique_Q_number, np.ndarray[DTYPE_t, ndim=3] dQ = None):
"""
Input:
---------------
Q - array with noise on various steps. The result of preprocessing
the noise input.
index - for each step of Kalman filter contains the corresponding index
in the array.
Q_time_var_index - another index in the array Q. Computed earlier and passed here.
unique_Q_number - number of unique noise matrices below which square roots
are cached and above which they are computed each time.
dQ: 3D array[:, :, param_num]
derivative of Q. Derivative is supported only when Q does not change over time
Output:
--------------
Object which has three necessary functions:
Qk(k)
dQk(k)
Q_srk(k)
"""
self.Q = Q
self.index = index
self.Q_time_var_index = Q_time_var_index
self.dQ = dQ
cdef int unique_len = len(np.unique(index))
if (unique_len > p_unique_Q_number):
self.svd_each_time = True
else:
self.svd_each_time = False
self.Q_square_root = {}
cpdef Qk(self, int k):
"""
function (k). Returns noise matrix of dynamic model on iteration k.
k (iteration number). starts at 0
"""
return self.Q[:,:, <int>self.index[self.Q_time_var_index, k]]
cpdef dQk(self, int k):
if self.dQ is None:
raise ValueError("dQ derivative is None")
return self.dQ # the same derivative on each iteration
cpdef Q_srk(self, int k):
"""
function (k). Returns the square root of noise matrix of dynamic model on iteration k.
k (iteration number). starts at 0
This function is implemented to use SVD prediction step.
"""
cdef int ind = <int>self.index[self.Q_time_var_index, k]
cdef np.ndarray[DTYPE_t, ndim=2] Q = self.Q[:,:, ind]
cdef np.ndarray[DTYPE_t, ndim=2] square_root
cdef np.ndarray[DTYPE_t, ndim=2] U
cdef np.ndarray[DTYPE_t, ndim=1] S
cdef np.ndarray[DTYPE_t, ndim=2] Vh
if (Q.shape[0] == 1): # 1-D case is handled more simply: no storage
# of the result, just compute it each time.
square_root = np.sqrt( Q )
else:
if self.svd_each_time:
U,S,Vh = sp.linalg.svd( Q,full_matrices=False, compute_uv=True,
overwrite_a=False,check_finite=True)
square_root = U * np.sqrt(S)
else:
if ind in self.Q_square_root:
square_root = self.Q_square_root[ind]
else:
U,S,Vh = sp.linalg.svd( Q,full_matrices=False, compute_uv=True,
overwrite_a=False,check_finite=True)
square_root = U * np.sqrt(S)
self.Q_square_root[ind] = square_root
return square_root
cdef class Std_Dynamic_Callables_Cython(Q_handling_Cython):
cdef:
np.ndarray A
int A_time_var_index
np.ndarray dA
def __init__(self, np.ndarray[DTYPE_t, ndim=3] A, int A_time_var_index,
np.ndarray[DTYPE_t, ndim=3] Q,
np.ndarray[DTYPE_t, ndim=2] index,
int Q_time_var_index, int unique_Q_number,
np.ndarray[DTYPE_t, ndim=3] dA = None,
np.ndarray[DTYPE_t, ndim=3] dQ=None):
super(Std_Dynamic_Callables_Cython,self).__init__(Q, index, Q_time_var_index, unique_Q_number,dQ)
self.A = A
self.A_time_var_index = A_time_var_index
self.dA = dA
cpdef f_a(self, int k, np.ndarray[DTYPE_t, ndim=2] m, np.ndarray[DTYPE_t, ndim=2] A):
"""
f_a: function (k, x_{k-1}, A_{k}). Dynamic function.
k (iteration number), starts at 0
x_{k-1} State from the previous step
A_{k} Jacobian matrices of f_a. In the linear case it is exactly A_{k}.
"""
return np.dot(A,m)
cpdef Ak(self, int k, np.ndarray[DTYPE_t, ndim=2] m_pred, np.ndarray[DTYPE_t, ndim=2] P_pred): # returns state iteration matrix
"""
function (k, m, P) returns the Jacobian of the dynamic function; it is
passed into f_a.
k (iteration number), starts at 0
m: point where Jacobian is evaluated
P: parameter for Jacobian, usually covariance matrix.
"""
return self.A[:,:, <int>self.index[self.A_time_var_index, k]]
cpdef dAk(self, int k):
if self.dA is None:
raise ValueError("dA derivative is None")
return self.dA # the same derivative on each iteration
cpdef reset(self, bint compute_derivatives=False):
"""
For reusing this object e.g. in the smoother computation. It makes sense
because the necessary matrices have already been computed for all
time steps.
"""
return self
cdef class AQcompute_batch_Cython(Q_handling_Cython):
"""
Class for calculating the matrices A, Q, dA, dQ of the discrete Kalman filter
from the matrices F, L, Qc, P_inf, dF, dQc, dP_inf of the continuous state
equation. dt - time steps.
It has the same interface as AQcompute_once.
It computes matrices for all time steps. This object is used when there
are not too many different time steps (controlled by an internal variable)
and storing all the matrices does not take too much memory.
Since all the matrices are computed together, this object can be used
in the smoother without repeating the computations.
"""
#def __init__(self, F,L,Qc,dt,compute_derivatives=False, grad_params_no=None, P_inf=None, dP_inf=None, dF = None, dQc=None):
cdef:
np.ndarray As
np.ndarray Qs
np.ndarray dAs
np.ndarray dQs
np.ndarray reconstruct_indices
long total_size_of_data
dict Q_svd_dict
int last_k
def __init__(self, np.ndarray[DTYPE_t, ndim=3] As, np.ndarray[DTYPE_t, ndim=3] Qs,
np.ndarray[DTYPE_int_t, ndim=1] reconstruct_indices,
np.ndarray[DTYPE_t, ndim=4] dAs=None,
np.ndarray[DTYPE_t, ndim=4] dQs=None):
"""
Constructor. All necessary parameters are passed here and stored
in the object.
Input:
-------------------
As, Qs: 3D arrays
    Precomputed discrete-time transition and noise matrices, one slice
    per unique time step.
reconstruct_indices: 1D array
    For each Kalman filter step, the index of the corresponding slice
    in As and Qs.
dAs, dQs: 4D arrays
    Derivatives of As and Qs with respect to the parameters (optional).
Output:
-------------------
"""
self.As = As
self.Qs = Qs
self.dAs = dAs
self.dQs = dQs
self.reconstruct_indices = reconstruct_indices
self.total_size_of_data = self.As.nbytes + self.Qs.nbytes +\
(self.dAs.nbytes if (self.dAs is not None) else 0) +\
(self.dQs.nbytes if (self.dQs is not None) else 0) +\
(self.reconstruct_indices.nbytes if (self.reconstruct_indices is not None) else 0)
self.Q_svd_dict = {}
self.last_k = 0
# !!!Print statistics! Which object is created
# !!!Print statistics! Print sizes of matrices
cpdef f_a(self, int k, np.ndarray[DTYPE_t, ndim=2] m, np.ndarray[DTYPE_t, ndim=2] A):
"""
Dynamic model
"""
return np.dot(A, m) # default dynamic model
cpdef reset(self, bint compute_derivatives=False):
"""
For reusing this object e.g. in the smoother computation. It makes sense
because the necessary matrices have already been computed for all
time steps.
"""
return self
cpdef Ak(self,int k, np.ndarray[DTYPE_t, ndim=2] m, np.ndarray[DTYPE_t, ndim=2] P):
self.last_k = k
return self.As[:,:, <int>self.reconstruct_indices[k]]
cpdef Qk(self,int k):
self.last_k = k
return self.Qs[:,:, <int>self.reconstruct_indices[k]]
cpdef dAk(self, int k):
self.last_k = k
return self.dAs[:,:, :, <int>self.reconstruct_indices[k]]
cpdef dQk(self, int k):
self.last_k = k
return self.dQs[:,:, :, <int>self.reconstruct_indices[k]]
cpdef Q_srk(self, int k):
"""
Square root of the noise matrix Q
"""
cdef int matrix_index = <int>self.reconstruct_indices[k]
cdef np.ndarray[DTYPE_t, ndim=2] square_root
cdef np.ndarray[DTYPE_t, ndim=2] U
cdef np.ndarray[DTYPE_t, ndim=1] S
cdef np.ndarray[DTYPE_t, ndim=2] Vh
if matrix_index in self.Q_svd_dict:
square_root = self.Q_svd_dict[matrix_index]
else:
U,S,Vh = sp.linalg.svd( self.Qs[:,:, matrix_index],
full_matrices=False, compute_uv=True,
overwrite_a=False, check_finite=False)
square_root = U * np.sqrt(S)
self.Q_svd_dict[matrix_index] = square_root
return square_root
# def return_last(self):
# """
# Function returns last available matrices.
# """
#
# if (self.last_k is None):
# raise ValueError("Matrices are not computed.")
# else:
# ind = self.reconstruct_indices[self.last_k]
# A = self.As[:,:, ind]
# Q = self.Qs[:,:, ind]
# dA = self.dAs[:,:, :, ind]
# dQ = self.dQs[:,:, :, ind]
#
# return self.last_k, A, Q, dA, dQ
@cython.boundscheck(False)
def _kalman_prediction_step_SVD_Cython(long k, np.ndarray[DTYPE_t, ndim=2] p_m , tuple p_P,
Dynamic_Callables_Cython p_dynamic_callables,
bint calc_grad_log_likelihood=False,
np.ndarray[DTYPE_t, ndim=3] p_dm = None,
np.ndarray[DTYPE_t, ndim=3] p_dP = None):
"""
Discrete-time prediction function
Input:
k:int
Iteration No. Starts at 0. Total number of iterations equal to the
number of measurements.
p_m: matrix of size (state_dim, time_series_no)
Mean value from the previous step. For "multiple time series mode"
it is matrix, second dimension of which correspond to different
time series.
p_P: tuple (Prev_cov, S, V)
Covariance matrix from the previous step and its SVD decomposition.
Prev_cov = V * S * V.T The tuple is (Prev_cov, S, V)
p_dynamic_callables: Dynamic_Callables_Cython
    Object providing the dynamic model on this step:
    f_a(k, x_{k-1}, A_k) - dynamic function, Ak(k, m, P) - its Jacobian,
    Qk(k) - noise matrix of the dynamic model, Q_srk(k) - its square root,
    dAk(k) and dQk(k) - derivatives of A and Q with respect to the parameters.
calc_grad_log_likelihood: boolean
Whether to calculate gradient of the marginal likelihood
of the state-space model. If true then the next parameter must
provide the extra parameters for gradient calculation.
p_dm: 3D array (state_dim, time_series_no, parameters_no)
Mean derivatives from the previous step. For "multiple time series mode"
it is 3D array, second dimension of which correspond to different
time series.
p_dP: 3D array (state_dim, state_dim, parameters_no)
Mean derivatives from the previous step
grad_calc_params_1: List or None
List with derivatives. The first component is 'f_dA' - function(k)
which returns the derivative of A. The second element is 'f_dQ'
- function(k). Function which returns the derivative of Q.
Output:
----------------------------
m_pred, P_pred, dm_pred, dP_pred: matrices, 3D objects
Results of the prediction steps.
"""
# covariance from the previous step: p_prev_cov = V_old * diag(S_old) * V_old.T
cdef np.ndarray[DTYPE_t, ndim=2] Prev_cov = p_P[0]
cdef np.ndarray[DTYPE_t, ndim=1] S_old = p_P[1]
cdef np.ndarray[DTYPE_t, ndim=2] V_old = p_P[2]
#p_prev_cov_tst = np.dot(p_V, (p_S * p_V).T) # reconstructed covariance from the previous step
# index correspond to values from previous iteration.
cdef np.ndarray[DTYPE_t, ndim=2] A = p_dynamic_callables.Ak(k,p_m,Prev_cov) # state transition matrix (or Jacobian)
cdef np.ndarray[DTYPE_t, ndim=2] Q = p_dynamic_callables.Qk(k) # state noise matrx. This is necessary for the square root calculation (next step)
cdef np.ndarray[DTYPE_t, ndim=2] Q_sr = p_dynamic_callables.Q_srk(k)
# Prediction step ->
cdef np.ndarray[DTYPE_t, ndim=2] m_pred = p_dynamic_callables.f_a(k, p_m, A) # predicted mean
# covariance prediction has changed:
cdef np.ndarray[DTYPE_t, ndim=2] svd_1_matr = np.vstack( ( (np.sqrt(S_old)* np.dot(A,V_old)).T , Q_sr.T) )
res = sp.linalg.svd( svd_1_matr,full_matrices=False, compute_uv=True,
overwrite_a=False,check_finite=True)
# (U,S,Vh)
cdef np.ndarray[DTYPE_t, ndim=2] U = res[0]
cdef np.ndarray[DTYPE_t, ndim=1] S = res[1]
cdef np.ndarray[DTYPE_t, ndim=2] Vh = res[2]
# predicted variance computed by the regular method. For testing
#P_pred_tst = A.dot(Prev_cov).dot(A.T) + Q
cdef np.ndarray[DTYPE_t, ndim=2] V_new = Vh.T
cdef np.ndarray[DTYPE_t, ndim=1] S_new = S**2
cdef np.ndarray[DTYPE_t, ndim=2] P_pred = np.dot(V_new * S_new, V_new.T) # prediction covariance
#tuple P_pred = (P_pred, S_new, Vh.T)
# Prediction step <-
# derivatives
cdef np.ndarray[DTYPE_t, ndim=3] dA_all_params
cdef np.ndarray[DTYPE_t, ndim=3] dQ_all_params
cdef np.ndarray[DTYPE_t, ndim=3] dm_pred
cdef np.ndarray[DTYPE_t, ndim=3] dP_pred
cdef int param_number
cdef int j
cdef tuple ret
cdef np.ndarray[DTYPE_t, ndim=2] dA
cdef np.ndarray[DTYPE_t, ndim=2] dQ
if calc_grad_log_likelihood:
dA_all_params = p_dynamic_callables.dAk(k) # derivatives of A wrt parameters
dQ_all_params = p_dynamic_callables.dQk(k) # derivatives of Q wrt parameters
param_number = p_dP.shape[2]
# p_dm, p_dP - derivatives from the previous step
dm_pred = np.empty((p_dm.shape[0], p_dm.shape[1], p_dm.shape[2]), dtype = DTYPE)
dP_pred = np.empty((p_dP.shape[0], p_dP.shape[1], p_dP.shape[2]), dtype = DTYPE)
for j in range(param_number):
dA = dA_all_params[:,:,j]
dQ = dQ_all_params[:,:,j]
dm_pred[:,:,j] = np.dot(dA, p_m) + np.dot(A, p_dm[:,:,j])
# prediction step derivatives for current parameter:
dP_pred[:,:,j] = np.dot( dA ,np.dot(Prev_cov, A.T))
dP_pred[:,:,j] += dP_pred[:,:,j].T
dP_pred[:,:,j] += np.dot( A ,np.dot( p_dP[:,:,j] , A.T)) + dQ
dP_pred[:,:,j] = 0.5*(dP_pred[:,:,j] + dP_pred[:,:,j].T) #symmetrize
else:
dm_pred = None
dP_pred = None
ret = (P_pred, S_new, Vh.T)
return m_pred, ret, dm_pred, dP_pred
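# Illustrative sketch (not part of the original module): the stacked-matrix SVD used
# in the prediction step above reproduces the standard covariance propagation
#     P_pred = A * P_prev * A.T + Q
# while only ever working with square-root factors. Only numpy/scipy are assumed;
# the matrices are arbitrary examples.
def _demo_svd_covariance_prediction():
    import numpy as np
    import scipy.linalg
    rnd = np.random.RandomState(0)
    A = rnd.randn(3, 3)                                   # state transition matrix
    B1 = rnd.randn(3, 3)
    P_prev = np.dot(B1, B1.T) + np.eye(3)                 # previous covariance (SPD)
    B2 = rnd.randn(3, 3)
    Q = np.dot(B2, B2.T) + np.eye(3)                      # dynamic-model noise (SPD)
    S_old, V_old = np.linalg.eigh(P_prev)                 # P_prev = V_old*diag(S_old)*V_old.T
    Q_sr = np.linalg.cholesky(Q)                          # any factor with Q_sr*Q_sr.T = Q
    stacked = np.vstack(((np.sqrt(S_old) * np.dot(A, V_old)).T, Q_sr.T))
    _, s, Vh = scipy.linalg.svd(stacked, full_matrices=False)
    P_pred = np.dot(Vh.T * s**2, Vh)                      # equals stacked.T.dot(stacked)
    assert np.allclose(P_pred, np.dot(A, np.dot(P_prev, A.T)) + Q)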
@cython.boundscheck(False)
def _kalman_update_step_SVD_Cython(long k, np.ndarray[DTYPE_t, ndim=2] p_m, tuple p_P,
Measurement_Callables_Cython p_measurement_callables,
np.ndarray[DTYPE_t, ndim=2] measurement,
bint calc_log_likelihood= False,
bint calc_grad_log_likelihood=False,
np.ndarray[DTYPE_t, ndim=3] p_dm = None,
np.ndarray[DTYPE_t, ndim=3] p_dP = None):
"""
Input:
k: int
Iteration No. Starts at 0. Total number of iterations equal to the
number of measurements.
p_m: matrix of size (state_dim, time_series_no)
    Mean value from the prediction step. For "multiple time series mode"
    it is a matrix whose second dimension corresponds to different
    time series.
p_P: tuple (P_pred, S, V)
Covariance matrix from the prediction step and its SVD decomposition.
P_pred = V * S * V.T The tuple is (P_pred, S, V)
p_measurement_callables: Measurement_Callables_Cython
    Object providing the measurement model on this step:
    f_h(k, x_k, H_k) - measurement function, Hk(k, m, P) - its Jacobian,
    Rk(k) - noise matrix of the measurement equation, R_isrk(k) - its
    inverse square root, dHk(k) and dRk(k) - derivatives of H and R.
measurement: (measurement_dim, time_series_no) matrix
One measurement used on the current update step. For
"multiple time series mode" it is matrix, second dimension of
which correspond to different time series.
calc_log_likelihood: boolean
Whether to calculate marginal likelihood of the state-space model.
calc_grad_log_likelihood: boolean
Whether to calculate gradient of the marginal likelihood
of the state-space model. If true then the next parameter must
provide the extra parameters for gradient calculation.
p_dm: 3D array (state_dim, time_series_no, parameters_no)
Mean derivatives from the prediction step. For "multiple time series mode"
it is 3D array, second dimension of which correspond to different
time series.
p_dP: array
Covariance derivatives from the prediction step.
grad_calc_params_2: List or None
List with derivatives. The first component is 'f_dH' - function(k)
which returns the derivative of H. The second element is 'f_dR'
- function(k). Function which returns the derivative of R.
Output:
----------------------------
m_upd, P_upd, dm_upd, dP_upd: matrices, 3D objects
Results of the prediction steps.
log_likelihood_update: double or 1D array
Update to the log_likelihood from this step
d_log_likelihood_update: (grad_params_no, time_series_no) matrix
Update to the gradient of log_likelihood, "multiple time series mode"
adds extra columns to the gradient.
"""
cdef np.ndarray[DTYPE_t, ndim=2] m_pred = p_m # from prediction step
#P_pred,S_pred,V_pred = p_P # from prediction step
cdef np.ndarray[DTYPE_t, ndim=2] P_pred = p_P[0]
cdef np.ndarray[DTYPE_t, ndim=1] S_pred = p_P[1]
cdef np.ndarray[DTYPE_t, ndim=2] V_pred = p_P[2]
cdef np.ndarray[DTYPE_t, ndim=2] H = p_measurement_callables.Hk(k, m_pred, P_pred)
cdef np.ndarray[DTYPE_t, ndim=2] R = p_measurement_callables.Rk(k)
cdef np.ndarray[DTYPE_t, ndim=2] R_isr =p_measurement_callables.R_isrk(k) # square root of the inverse of R matrix
cdef int time_series_no = p_m.shape[1] # number of time serieses
cdef np.ndarray[DTYPE_t, ndim=2] log_likelihood_update # log_likelihood_update=None;
# Update step (only if there is data)
#if not np.any(np.isnan(measurement)): # TODO: if some dimensions are missing, do the computations properly for the others.
cdef np.ndarray[DTYPE_t, ndim=2] v = measurement-p_measurement_callables.f_h(k, m_pred, H)
cdef np.ndarray[DTYPE_t, ndim=2] svd_2_matr = np.vstack( ( np.dot( R_isr.T, np.dot(H, V_pred)) , np.diag( 1.0/np.sqrt(S_pred) ) ) )
res = sp.linalg.svd( svd_2_matr,full_matrices=False, compute_uv=True,
overwrite_a=False,check_finite=True)
#(U,S,Vh)
cdef np.ndarray[DTYPE_t, ndim=2] U = res[0]
cdef np.ndarray[DTYPE_t, ndim=1] S_svd = res[1]
cdef np.ndarray[DTYPE_t, ndim=2] Vh = res[2]
# P_upd = U_upd S_upd**2 U_upd.T
cdef np.ndarray[DTYPE_t, ndim=2] U_upd = np.dot(V_pred, Vh.T)
cdef np.ndarray[DTYPE_t, ndim=1] S_upd = (1.0/S_svd)**2
cdef np.ndarray[DTYPE_t, ndim=2] P_upd = np.dot(U_upd * S_upd, U_upd.T) # update covariance
#P_upd = (P_upd,S_upd,U_upd) # tuple to pass to the next step
# still need to compute S and K for derivative computation
cdef np.ndarray[DTYPE_t, ndim=2] S = H.dot(P_pred).dot(H.T) + R
cdef np.ndarray[DTYPE_t, ndim=2] K
cdef bint measurement_dim_gt_one = False
if measurement.shape[0]==1: # measurements are one dimensional
if (S < 0):
raise ValueError("Kalman Filter Update SVD: S is negative step %i" % k )
#import pdb; pdb.set_trace()
K = P_pred.dot(H.T) / S
if calc_log_likelihood:
log_likelihood_update = -0.5 * ( np.log(2*np.pi) + np.log(S) +
v*v / S)
#log_likelihood_update = log_likelihood_update[0,0] # to make int
if np.any(np.isnan(log_likelihood_update)): # some member in P_pred is None.
raise ValueError("Nan values in likelihood update!")
else:
log_likelihood_update = None
#LL = None; islower = None
else:
measurement_dim_gt_one = True
raise ValueError("""Measurement dimension larger then 1 is currently not supported""")
# Old method of computing updated covariance (for testing) ->
#P_upd_tst = K.dot(S).dot(K.T)
#P_upd_tst = 0.5*(P_upd_tst + P_upd_tst.T)
#P_upd_tst = P_pred - P_upd_tst# this update matrix is symmetric
# Old method of computing updated covariance (for testing) <-
cdef np.ndarray[DTYPE_t, ndim=3] dm_upd # dm_upd=None;
cdef np.ndarray[DTYPE_t, ndim=3] dP_upd # dP_upd=None;
cdef np.ndarray[DTYPE_t, ndim=2] d_log_likelihood_update # d_log_likelihood_update=None
cdef np.ndarray[DTYPE_t, ndim=3] dm_pred_all_params
cdef np.ndarray[DTYPE_t, ndim=3] dP_pred_all_params
cdef int param_number
cdef np.ndarray[DTYPE_t, ndim=3] dH_all_params
cdef np.ndarray[DTYPE_t, ndim=3] dR_all_params
cdef int param
cdef np.ndarray[DTYPE_t, ndim=2] dH, dR, dm_pred, dP_pred, dv, dS, tmp1, tmp2, tmp3, dK, tmp5
cdef tuple ret
if calc_grad_log_likelihood:
dm_pred_all_params = p_dm # derivatives of the prediction phase
dP_pred_all_params = p_dP
param_number = p_dP.shape[2]
dH_all_params = p_measurement_callables.dHk(k)
dR_all_params = p_measurement_callables.dRk(k)
dm_upd = np.empty((dm_pred_all_params.shape[0], dm_pred_all_params.shape[1], dm_pred_all_params.shape[2]), dtype = DTYPE)
dP_upd = np.empty((dP_pred_all_params.shape[0], dP_pred_all_params.shape[1], dP_pred_all_params.shape[2]), dtype = DTYPE)
# first dimension - parameter number, second - time series number
d_log_likelihood_update = np.empty((param_number,time_series_no), dtype = DTYPE)
for param in range(param_number):
dH = dH_all_params[:,:,param]
dR = dR_all_params[:,:,param]
dm_pred = dm_pred_all_params[:,:,param]
dP_pred = dP_pred_all_params[:,:,param]
# Terms in the likelihood derivatives
dv = - np.dot( dH, m_pred) - np.dot( H, dm_pred)
dS = np.dot(dH, np.dot( P_pred, H.T))
dS += dS.T
dS += np.dot(H, np.dot( dP_pred, H.T)) + dR
# TODO: maybe symmetrize dS
tmp1 = H.T / S
tmp2 = dH.T / S
tmp3 = dS.T / S
dK = np.dot( dP_pred, tmp1) + np.dot( P_pred, tmp2) - \
np.dot( P_pred, np.dot( tmp1, tmp3 ) )
# terms required for the next step, save this for each parameter
dm_upd[:,:,param] = dm_pred + np.dot(dK, v) + np.dot(K, dv)
dP_upd[:,:,param] = -np.dot(dK, np.dot(S, K.T))
dP_upd[:,:,param] += dP_upd[:,:,param].T
dP_upd[:,:,param] += dP_pred - np.dot(K , np.dot( dS, K.T))
dP_upd[:,:,param] = 0.5*(dP_upd[:,:,param] + dP_upd[:,:,param].T) #symmetrize
# computing the likelihood change for each parameter:
tmp5 = v / S
d_log_likelihood_update[param,:] = -(0.5*np.sum(np.diag(tmp3)) + \
np.sum(tmp5*dv, axis=0) - 0.5 * np.sum(tmp5 * np.dot(dS, tmp5), axis=0) )
# Compute the actual updates for mean of the states. Variance update
# is computed earlier.
else:
dm_upd = None
dP_upd = None
d_log_likelihood_update = None
m_upd = m_pred + K.dot( v )
ret = (P_upd,S_upd,U_upd)
return m_upd, ret, log_likelihood_update, dm_upd, dP_upd, d_log_likelihood_update
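# Illustrative sketch (not part of the original module): in the scalar-measurement
# branch above, the likelihood increment -0.5*(log(2*pi) + log(S) + v**2/S) is the
# log-density of the innovation v under N(0, S). Only numpy/scipy are assumed;
# the numbers are arbitrary examples.
def _demo_scalar_likelihood_update():
    import numpy as np
    from scipy import stats
    v_ex, S_ex = 0.7, 2.3                                 # innovation and its variance
    manual = -0.5 * (np.log(2 * np.pi) + np.log(S_ex) + v_ex ** 2 / S_ex)
    assert np.isclose(manual, stats.norm.logpdf(v_ex, loc=0.0, scale=np.sqrt(S_ex)))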
@cython.boundscheck(False)
def _cont_discr_kalman_filter_raw_Cython(int state_dim, Dynamic_Callables_Cython p_dynamic_callables,
Measurement_Callables_Cython p_measurement_callables, X, Y,
np.ndarray[DTYPE_t, ndim=2] m_init=None, np.ndarray[DTYPE_t, ndim=2] P_init=None,
p_kalman_filter_type='regular',
bint calc_log_likelihood=False,
bint calc_grad_log_likelihood=False,
int grad_params_no=0,
np.ndarray[DTYPE_t, ndim=3] dm_init=None,
np.ndarray[DTYPE_t, ndim=3] dP_init=None):
cdef int steps_no = Y.shape[0] # number of steps in the Kalman Filter
cdef int time_series_no = Y.shape[2] # multiple time series mode
# Allocate space for results
# Mean estimations. Initial values will be included
cdef np.ndarray[DTYPE_t, ndim=3] M = np.empty(((steps_no+1),state_dim,time_series_no), dtype=DTYPE)
M[0,:,:] = m_init # Initialize mean values
# Variance estimations. Initial values will be included
cdef np.ndarray[DTYPE_t, ndim=3] P = np.empty(((steps_no+1),state_dim,state_dim))
P_init = 0.5*( P_init + P_init.T) # symmetrize initial covariance. In some unstable cases this is useful
P[0,:,:] = P_init # Initialize initial covariance matrix
cdef np.ndarray[DTYPE_t, ndim=2] U
cdef np.ndarray[DTYPE_t, ndim=1] S
cdef np.ndarray[DTYPE_t, ndim=2] Vh
U,S,Vh = sp.linalg.svd( P_init,full_matrices=False, compute_uv=True,
overwrite_a=False,check_finite=True)
S[ (S==0) ] = 1e-17 # allows running the algorithm with a singular initial variance
cdef tuple P_upd = (P_init, S,U)
#log_likelihood = 0
#grad_log_likelihood = np.zeros((grad_params_no,1))
cdef np.ndarray[DTYPE_t, ndim=2] log_likelihood = np.zeros((1, time_series_no), dtype = DTYPE) #if calc_log_likelihood else None
cdef np.ndarray[DTYPE_t, ndim=2] grad_log_likelihood = np.zeros((grad_params_no, time_series_no), dtype = DTYPE) #if calc_grad_log_likelihood else None
#setting initial values for derivatives update
cdef np.ndarray[DTYPE_t, ndim=3] dm_upd = dm_init
cdef np.ndarray[DTYPE_t, ndim=3] dP_upd = dP_init
# Main loop of the Kalman filter
cdef np.ndarray[DTYPE_t, ndim=2] prev_mean, k_measurment
cdef np.ndarray[DTYPE_t, ndim=2] m_pred, m_upd
cdef tuple P_pred
cdef np.ndarray[DTYPE_t, ndim=3] dm_pred, dP_pred
cdef np.ndarray[DTYPE_t, ndim=2] log_likelihood_update, d_log_likelihood_update
cdef int k
#print "Hi I am cython"
for k in range(0,steps_no):
# In this loop the index for new estimates is (k+1), for old ones (k).
# This happens because the initial values are stored at the 0-th index.
#import pdb; pdb.set_trace()
prev_mean = M[k,:,:] # mean from the previous step
m_pred, P_pred, dm_pred, dP_pred = \
_kalman_prediction_step_SVD_Cython(k, prev_mean ,P_upd, p_dynamic_callables,
calc_grad_log_likelihood, dm_upd, dP_upd)
k_measurment = Y[k,:,:]
if (np.any(np.isnan(k_measurment)) == False):
# if np.any(np.isnan(k_measurment)):
# raise ValueError("Nan measurements are currently not supported")
m_upd, P_upd, log_likelihood_update, dm_upd, dP_upd, d_log_likelihood_update = \
_kalman_update_step_SVD_Cython(k, m_pred , P_pred, p_measurement_callables,
k_measurment, calc_log_likelihood=calc_log_likelihood,
calc_grad_log_likelihood=calc_grad_log_likelihood,
p_dm = dm_pred, p_dP = dP_pred)
else:
if not np.all(np.isnan(k_measurment)):
raise ValueError("""Nan measurements are currently not supported if
they are intermixed with not NaN measurements""")
else:
m_upd = m_pred; P_upd = P_pred; dm_upd = dm_pred; dP_upd = dP_pred
if calc_log_likelihood:
log_likelihood_update = np.zeros((1,time_series_no))
if calc_grad_log_likelihood:
d_log_likelihood_update = np.zeros((grad_params_no,time_series_no))
if calc_log_likelihood:
log_likelihood += log_likelihood_update
if calc_grad_log_likelihood:
grad_log_likelihood += d_log_likelihood_update
M[k+1,:,:] = m_upd # separate mean value for each time series
P[k+1,:,:] = P_upd[0]
return (M, P, log_likelihood, grad_log_likelihood, p_dynamic_callables.reset(False))

File diff suppressed because it is too large

View file

@ -0,0 +1,424 @@
# Copyright (c) 2013, Arno Solin.
# Licensed under the BSD 3-clause license (see LICENSE.txt)
#
# This implementation of converting GPs to state space models is based on the article:
#
# @article{Sarkka+Solin+Hartikainen:2013,
# author = {Simo S\"arkk\"a and Arno Solin and Jouni Hartikainen},
# year = {2013},
# title = {Spatiotemporal learning via infinite-dimensional {B}ayesian filtering and smoothing},
# journal = {IEEE Signal Processing Magazine},
# volume = {30},
# number = {4},
# pages = {51--61}
# }
#
import numpy as np
from scipy import stats
from .. import likelihoods
#from . import state_space_setup as ss_setup
from ..core import Model
from . import state_space_main as ssm
from . import state_space_setup as ss_setup
class StateSpace(Model):
def __init__(self, X, Y, kernel=None, noise_var=1.0, kalman_filter_type = 'regular', use_cython = False, name='StateSpace'):
super(StateSpace, self).__init__(name=name)
if len(X.shape) == 1:
X = np.atleast_2d(X).T
self.num_data, self.input_dim = X.shape
if len(Y.shape) == 1:
Y = np.atleast_2d(Y).T
assert self.input_dim==1, "State space methods are only for 1D data"
if len(Y.shape)==2:
num_data_Y, self.output_dim = Y.shape
ts_number = None
elif len(Y.shape)==3:
num_data_Y, self.output_dim, ts_number = Y.shape
self.ts_number = ts_number
assert num_data_Y == self.num_data, "X and Y data don't match"
assert self.output_dim == 1, "State space methods are for single outputs only"
self.kalman_filter_type = kalman_filter_type
#self.kalman_filter_type = 'svd' # temp test
ss_setup.use_cython = use_cython
#import pdb; pdb.set_trace()
global ssm
#from . import state_space_main as ssm
if (ssm.cython_code_available) and (ssm.use_cython != ss_setup.use_cython):
    try:
        from importlib import reload  # Python 3: reload is not a builtin
    except ImportError:
        pass  # Python 2: the builtin reload is used
    reload(ssm)
# Make sure the observations are ordered in time
sort_index = np.argsort(X[:,0])
self.X = X[sort_index]
self.Y = Y[sort_index]
# Noise variance
self.likelihood = likelihoods.Gaussian(variance=noise_var)
# Default kernel
if kernel is None:
raise ValueError("State-Space Model: the kernel must be provided.")
else:
self.kern = kernel
self.link_parameter(self.kern)
self.link_parameter(self.likelihood)
self.posterior = None
# Assert that the kernel is supported
if not hasattr(self.kern, 'sde'):
raise NotImplementedError('SDE must be implemented for the kernel being used')
#assert self.kern.sde() not False, "This kernel is not supported for state space estimation"
def parameters_changed(self):
"""
Parameters have now changed
"""
#np.set_printoptions(16)
#print(self.param_array)
#import pdb; pdb.set_trace()
# Get the model matrices from the kernel
(F,L,Qc,H,P_inf, P0, dFt,dQct,dP_inft, dP0t) = self.kern.sde()
# necessary parameters
measurement_dim = self.output_dim
grad_params_no = dFt.shape[2]+1 # we also add measurement noise as a parameter
# add measurement noise as a parameter and get the gradient matrices
dF = np.zeros([dFt.shape[0],dFt.shape[1],grad_params_no])
dQc = np.zeros([dQct.shape[0],dQct.shape[1],grad_params_no])
dP_inf = np.zeros([dP_inft.shape[0],dP_inft.shape[1],grad_params_no])
dP0 = np.zeros([dP0t.shape[0],dP0t.shape[1],grad_params_no])
# Assign the values for the kernel function
dF[:,:,:-1] = dFt
dQc[:,:,:-1] = dQct
dP_inf[:,:,:-1] = dP_inft
dP0[:,:,:-1] = dP0t
# The sigma2 derivative
dR = np.zeros([measurement_dim,measurement_dim,grad_params_no])
dR[:,:,-1] = np.eye(measurement_dim)
# Balancing
#(F,L,Qc,H,P_inf,P0, dF,dQc,dP_inf,dP0) = ssm.balance_ss_model(F,L,Qc,H,P_inf,P0, dF,dQc,dP_inf, dP0)
# Use the Kalman filter to evaluate the likelihood
grad_calc_params = {}
grad_calc_params['dP_inf'] = dP_inf
grad_calc_params['dF'] = dF
grad_calc_params['dQc'] = dQc
grad_calc_params['dR'] = dR
grad_calc_params['dP_init'] = dP0
kalman_filter_type = self.kalman_filter_type
# The following code is required because sometimes the shape of self.Y
# becomes 3D even though it must be 2D. The reason has not been discovered.
Y = self.Y
if self.ts_number is None:
Y.shape = (self.num_data,1)
else:
Y.shape = (self.num_data,1,self.ts_number)
(filter_means, filter_covs, log_likelihood,
grad_log_likelihood,SmootherMatrObject) = ssm.ContDescrStateSpace.cont_discr_kalman_filter(F,L,Qc,H,
float(self.Gaussian_noise.variance),P_inf,self.X,Y,m_init=None,
P_init=P0, p_kalman_filter_type = kalman_filter_type, calc_log_likelihood=True,
calc_grad_log_likelihood=True,
grad_params_no=grad_params_no,
grad_calc_params=grad_calc_params)
if np.any( np.isfinite(log_likelihood) == False):
#import pdb; pdb.set_trace()
print("State-Space: NaN values in the log_likelihood")
if np.any( np.isfinite(grad_log_likelihood) == False):
#import pdb; pdb.set_trace()
print("State-Space: NaN values in the grad_log_likelihood")
#print(grad_log_likelihood)
grad_log_likelihood_sum = np.sum(grad_log_likelihood,axis=1)
grad_log_likelihood_sum.shape = (grad_log_likelihood_sum.shape[0],1)
self._log_marginal_likelihood = np.sum( log_likelihood,axis=1 )
self.likelihood.update_gradients(grad_log_likelihood_sum[-1,0])
self.kern.sde_update_gradient_full(grad_log_likelihood_sum[:-1,0])
def log_likelihood(self):
return self._log_marginal_likelihood
def _raw_predict(self, Xnew=None, Ynew=None, filteronly=False, **kw):
"""
Performs the actual prediction for new X points.
Inner function. It is called only from inside this class.
Input:
---------------------
Xnew: vector or (n_points, 1) matrix
    New time points where to evaluate predictions.
Ynew: (n_train_points, ts_no) matrix
    This matrix can substitute the original training points (in order
    to use only the parameters of the model).
filteronly: bool
    Use only the Kalman filter for prediction. In this case the output does
    not coincide with the corresponding Gaussian process.
Output:
--------------------
m: vector
Mean prediction
V: vector
Variance at every point
"""
# Set defaults
if Ynew is None:
Ynew = self.Y
# Make a single matrix containing training and testing points
if Xnew is not None:
X = np.vstack((self.X, Xnew))
Y = np.vstack((Ynew, np.nan*np.zeros(Xnew.shape)))
predict_only_training = False
else:
X = self.X
Y = Ynew
predict_only_training = True
# Sort the matrix (save the order)
_, return_index, return_inverse = np.unique(X,True,True)
X = X[return_index] # TODO they are not used
Y = Y[return_index]
# Get the model matrices from the kernel
(F,L,Qc,H,P_inf, P0, dF,dQc,dP_inf,dP0) = self.kern.sde()
state_dim = F.shape[0]
#Y = self.Y[:, 0,0]
# Run the Kalman filter
#import pdb; pdb.set_trace()
kalman_filter_type = self.kalman_filter_type
(M, P, log_likelihood,
grad_log_likelihood,SmootherMatrObject) = ssm.ContDescrStateSpace.cont_discr_kalman_filter(
F,L,Qc,H,float(self.Gaussian_noise.variance),P_inf,X,Y,m_init=None,
P_init=P0, p_kalman_filter_type = kalman_filter_type,
calc_log_likelihood=False,
calc_grad_log_likelihood=False)
# (filter_means, filter_covs, log_likelihood,
# grad_log_likelihood,SmootherMatrObject) = ssm.ContDescrStateSpace.cont_discr_kalman_filter(F,L,Qc,H,
# float(self.Gaussian_noise.variance),P_inf,self.X,self.Y,m_init=None,
# P_init=P0, p_kalman_filter_type = kalman_filter_type, calc_log_likelihood=True,
# calc_grad_log_likelihood=True,
# grad_params_no=grad_params_no,
# grad_calc_params=grad_calc_params)
# Run the Rauch-Tung-Striebel smoother
if not filteronly:
(M, P) = ssm.ContDescrStateSpace.cont_discr_rts_smoother(state_dim, M, P,
p_dynamic_callables=SmootherMatrObject, X=X, F=F,L=L,Qc=Qc)
# remove initial values
M = M[1:,:,:]
P = P[1:,:,:]
# Put the data back in the original order
M = M[return_inverse,:,:]
P = P[return_inverse,:,:]
# Only return the values for Xnew
if not predict_only_training:
M = M[self.num_data:,:,:]
P = P[self.num_data:,:,:]
# Calculate the mean and variance
# after einsum m has dimension in 3D (sample_num, dim_no,time_series_no)
m = np.einsum('ijl,kj', M, H)# np.dot(M,H.T)
m.shape = (m.shape[0], m.shape[1]) # remove the third dimension
V = np.einsum('ij,ajk,kl', H, P, H.T)
V.shape = (V.shape[0], V.shape[1]) # remove the third dimension
# Return the posterior of the state
return (m, V)
def predict(self, Xnew=None, filteronly=False, include_likelihood=True, **kw):
# Run the Kalman filter to get the state
(m, V) = self._raw_predict(Xnew,filteronly=filteronly)
# Add the noise variance to the state variance
if include_likelihood:
V += float(self.likelihood.variance)
# Lower and upper bounds
#lower = m - 2*np.sqrt(V)
#upper = m + 2*np.sqrt(V)
# Return mean and variance
return m, V
def predict_quantiles(self, Xnew=None, quantiles=(2.5, 97.5), **kw):
mu, var = self._raw_predict(Xnew)
#import pdb; pdb.set_trace()
return [stats.norm.ppf(q/100.)*np.sqrt(var + float(self.Gaussian_noise.variance)) + mu for q in quantiles]
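# Illustrative note (not part of the original file): with the default quantiles
# (2.5, 97.5) the expression above is the usual two-sided 95% predictive interval
# mu +/- 1.96*sqrt(var + noise variance), since stats.norm.ppf(0.975) ~= 1.96.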
# def plot(self, plot_limits=None, levels=20, samples=0, fignum=None,
# ax=None, resolution=None, plot_raw=False, plot_filter=False,
# linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']):
#
# # Deal with optional parameters
# if ax is None:
# fig = pb.figure(num=fignum)
# ax = fig.add_subplot(111)
#
# # Define the frame on which to plot
# resolution = resolution or 200
# Xgrid, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
#
# # Make a prediction on the frame and plot it
# if plot_raw:
# m, v = self.predict_raw(Xgrid,filteronly=plot_filter)
# lower = m - 2*np.sqrt(v)
# upper = m + 2*np.sqrt(v)
# Y = self.Y
# else:
# m, v, lower, upper = self.predict(Xgrid,filteronly=plot_filter)
# Y = self.Y
#
# # Plot the values
# gpplot(Xgrid, m, lower, upper, axes=ax, edgecol=linecol, fillcol=fillcol)
# ax.plot(self.X, self.Y, 'kx', mew=1.5)
#
# # Optionally plot some samples
# if samples:
# if plot_raw:
# Ysim = self.posterior_samples_f(Xgrid, samples)
# else:
# Ysim = self.posterior_samples(Xgrid, samples)
# for yi in Ysim.T:
# ax.plot(Xgrid, yi, Tango.colorsHex['darkBlue'], linewidth=0.25)
#
# # Set the limits of the plot to some sensible values
# ymin, ymax = min(np.append(Y.flatten(), lower.flatten())), max(np.append(Y.flatten(), upper.flatten()))
# ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
# ax.set_xlim(xmin, xmax)
# ax.set_ylim(ymin, ymax)
#
# def prior_samples_f(self,X,size=10):
#
# # Sort the matrix (save the order)
# (_, return_index, return_inverse) = np.unique(X,True,True)
# X = X[return_index]
#
# # Get the model matrices from the kernel
# (F,L,Qc,H,Pinf,dF,dQc,dPinf) = self.kern.sde()
#
# # Allocate space for results
# Y = np.empty((size,X.shape[0]))
#
# # Simulate random draws
# #for j in range(0,size):
# # Y[j,:] = H.dot(self.simulate(F,L,Qc,Pinf,X.T))
# Y = self.simulate(F,L,Qc,Pinf,X.T,size)
#
# # Only observations
# Y = np.tensordot(H[0],Y,(0,0))
#
# # Reorder simulated values
# Y = Y[:,return_inverse]
#
# # Return trajectory
# return Y.T
#
# def posterior_samples_f(self,X,size=10):
#
# # Sort the matrix (save the order)
# (_, return_index, return_inverse) = np.unique(X,True,True)
# X = X[return_index]
#
# # Get the model matrices from the kernel
# (F,L,Qc,H,Pinf,dF,dQc,dPinf) = self.kern.sde()
#
# # Run smoother on original data
# (m,V) = self.predict_raw(X)
#
# # Simulate random draws from the GP prior
# y = self.prior_samples_f(np.vstack((self.X, X)),size)
#
# # Allocate space for sample trajectories
# Y = np.empty((size,X.shape[0]))
#
# # Run the RTS smoother on each of these values
# for j in range(0,size):
# yobs = y[0:self.num_data,j:j+1] + np.sqrt(self.sigma2)*np.random.randn(self.num_data,1)
# (m2,V2) = self.predict_raw(X,Ynew=yobs)
# Y[j,:] = m.T + y[self.num_data:,j].T - m2.T
#
# # Reorder simulated values
# Y = Y[:,return_inverse]
#
# # Return posterior sample trajectories
# return Y.T
#
# def posterior_samples(self, X, size=10):
#
# # Make samples of f
# Y = self.posterior_samples_f(X,size)
#
# # Add noise
# Y += np.sqrt(self.sigma2)*np.random.randn(Y.shape[0],Y.shape[1])
#
# # Return trajectory
# return Y
#
#
# def simulate(self,F,L,Qc,Pinf,X,size=1):
# # Simulate a trajectory using the state space model
#
# # Allocate space for results
# f = np.zeros((F.shape[0],size,X.shape[1]))
#
# # Initial state
# f[:,:,1] = np.linalg.cholesky(Pinf).dot(np.random.randn(F.shape[0],size))
#
# # Time step lengths
# dt = np.empty(X.shape)
# dt[:,0] = X[:,1]-X[:,0]
# dt[:,1:] = np.diff(X)
#
# # Solve the LTI SDE for these time steps
# As, Qs, index = ssm.ContDescrStateSpace.lti_sde_to_descrete(F,L,Qc,dt)
#
# # Sweep through remaining time points
# for k in range(1,X.shape[1]):
#
# # Form discrete-time model
# A = As[:,:,index[1-k]]
# Q = Qs[:,:,index[1-k]]
#
# # Draw the state
# f[:,:,k] = A.dot(f[:,:,k-1]) + np.dot(np.linalg.cholesky(Q),np.random.randn(A.shape[0],size))
#
# # Return values
# return f
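# Illustrative usage sketch appended for documentation purposes (not part of the
# original file). It assumes a kernel exposing .sde(); GPy.kern.sde_Matern32 is used
# as an example and its exact name is an assumption of this sketch.
def _demo_state_space_regression():
    import numpy as np
    import GPy
    X = np.linspace(0, 10, 200)[:, None]
    Y = np.sin(X) + 0.1 * np.random.randn(*X.shape)
    kern = GPy.kern.sde_Matern32(input_dim=1)
    model = StateSpace(X, Y, kernel=kern, kalman_filter_type='svd')
    model.optimize()
    mean, var = model.predict(np.linspace(0, 12, 100)[:, None])
    return mean, var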

View file

@ -0,0 +1,10 @@
# -*- coding: utf-8 -*-
# Copyright (c) 2015, Alex Grigorevskiy
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
This module is intended for the setup of state_space_main module.
The need for this module arose because of the way the state_space_main module
is connected with the Cython code.
"""
use_cython = False

View file

@ -104,4 +104,4 @@ cdict_Alu = {'red' :((0./5,colorsRGB['Aluminium1'][0]/256.,colorsRGB['Aluminium1
(2./5,colorsRGB['Aluminium3'][2]/256.,colorsRGB['Aluminium3'][2]/256.), (2./5,colorsRGB['Aluminium3'][2]/256.,colorsRGB['Aluminium3'][2]/256.),
(3./5,colorsRGB['Aluminium4'][2]/256.,colorsRGB['Aluminium4'][2]/256.), (3./5,colorsRGB['Aluminium4'][2]/256.,colorsRGB['Aluminium4'][2]/256.),
(4./5,colorsRGB['Aluminium5'][2]/256.,colorsRGB['Aluminium5'][2]/256.), (4./5,colorsRGB['Aluminium5'][2]/256.,colorsRGB['Aluminium5'][2]/256.),
(5./5,colorsRGB['Aluminium6'][2]/256.,colorsRGB['Aluminium6'][2]/256.))} (5./5,colorsRGB['Aluminium6'][2]/256.,colorsRGB['Aluminium6'][2]/256.))}

View file

@ -52,6 +52,17 @@ def inject_plotting():
GP.plot_f = gpy_plot.gp_plots.plot_f GP.plot_f = gpy_plot.gp_plots.plot_f
GP.plot_magnification = gpy_plot.latent_plots.plot_magnification GP.plot_magnification = gpy_plot.latent_plots.plot_magnification
from ..models import StateSpace
StateSpace.plot_data = gpy_plot.data_plots.plot_data
StateSpace.plot_data_error = gpy_plot.data_plots.plot_data_error
StateSpace.plot_errorbars_trainset = gpy_plot.data_plots.plot_errorbars_trainset
StateSpace.plot_mean = gpy_plot.gp_plots.plot_mean
StateSpace.plot_confidence = gpy_plot.gp_plots.plot_confidence
StateSpace.plot_density = gpy_plot.gp_plots.plot_density
StateSpace.plot_samples = gpy_plot.gp_plots.plot_samples
StateSpace.plot = gpy_plot.gp_plots.plot
StateSpace.plot_f = gpy_plot.gp_plots.plot_f
from ..core import SparseGP from ..core import SparseGP
SparseGP.plot_inducing = gpy_plot.data_plots.plot_inducing SparseGP.plot_inducing = gpy_plot.data_plots.plot_inducing
@ -107,4 +118,4 @@ try:
lib = config.get('plotting', 'library') lib = config.get('plotting', 'library')
change_plotting_library(lib) change_plotting_library(lib)
except NoOptionError: except NoOptionError:
print("No plotting library was specified in config file. \n{}".format(error_suggestion)) print("No plotting library was specified in config file. \n{}".format(error_suggestion))

View file

@ -235,8 +235,6 @@ def plot_density(self, plot_limits=None, fixed_inputs=None,
Give the Y_metadata in the predict_kw if you need it. Give the Y_metadata in the predict_kw if you need it.
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
:type plot_limits: np.array :type plot_limits: np.array
:param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input dimension i should be set to value v. :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input dimension i should be set to value v.
@ -420,4 +418,4 @@ def _plot(self, canvas, plots, helper_data, helper_prediction, levels, plot_indu
if helper_prediction[2] is not None: if helper_prediction[2] is not None:
plots.update(_plot_samples(self, canvas, helper_data, helper_prediction, projection, "Samples")) plots.update(_plot_samples(self, canvas, helper_data, helper_prediction, projection, "Samples"))
return plots return plots

View file

@ -140,4 +140,4 @@ def plot_covariance(kernel, x=None, label=None,
return pl().add_to_canvas(canvas, plots) return pl().add_to_canvas(canvas, plots)
else: else:
raise NotImplementedError("Cannot plot a kernel with more than two input dimensions") raise NotImplementedError("Cannot plot a kernel with more than two input dimensions")

View file

@ -131,7 +131,9 @@ def plot_latent_inducing(self,
Z = self.Z.values Z = self.Z.values
labels = np.array(['inducing'] * Z.shape[0]) labels = np.array(['inducing'] * Z.shape[0])
scatters = _plot_latent_scatter(canvas, Z, sig_dims, labels, marker, num_samples, projection=projection, **kwargs) kwargs['marker'] = marker
update_not_existing_kwargs(kwargs, pl().defaults.inducing_2d) # @UndefinedVariable
scatters = _plot_latent_scatter(canvas, Z, sig_dims, labels, num_samples=num_samples, projection=projection, **kwargs)
return pl().add_to_canvas(canvas, dict(scatter=scatters), legend=legend) return pl().add_to_canvas(canvas, dict(scatter=scatters), legend=legend)
@ -147,6 +149,7 @@ def _plot_magnification(self, canvas, which_indices, Xgrid,
def plot_function(x): def plot_function(x):
Xtest_full = np.zeros((x.shape[0], Xgrid.shape[1])) Xtest_full = np.zeros((x.shape[0], Xgrid.shape[1]))
Xtest_full[:, which_indices] = x Xtest_full[:, which_indices] = x
mf = self.predict_magnification(Xtest_full, kern=kern, mean=mean, covariance=covariance) mf = self.predict_magnification(Xtest_full, kern=kern, mean=mean, covariance=covariance)
return mf.reshape(resolution, resolution).T return mf.reshape(resolution, resolution).T
imshow_kwargs = update_not_existing_kwargs(imshow_kwargs, pl().defaults.magnification) imshow_kwargs = update_not_existing_kwargs(imshow_kwargs, pl().defaults.magnification)
@ -215,7 +218,12 @@ def _plot_latent(self, canvas, which_indices, Xgrid,
def plot_function(x): def plot_function(x):
Xtest_full = np.zeros((x.shape[0], Xgrid.shape[1])) Xtest_full = np.zeros((x.shape[0], Xgrid.shape[1]))
Xtest_full[:, which_indices] = x Xtest_full[:, which_indices] = x
mf = np.log(self.predict(Xtest_full, kern=kern)[1]) mf = self.predict(Xtest_full, kern=kern)[1]
if mf.shape[1]==self.output_dim:
mf = mf.sum(-1)
else:
mf *= self.output_dim
mf = np.log(mf)
return mf.reshape(resolution, resolution).T return mf.reshape(resolution, resolution).T
imshow_kwargs = update_not_existing_kwargs(imshow_kwargs, pl().defaults.latent) imshow_kwargs = update_not_existing_kwargs(imshow_kwargs, pl().defaults.latent)

View file

@ -194,6 +194,7 @@ def scatter_label_generator(labels, X, visible_dims, marker=None):
x = X[index, input_1] x = X[index, input_1]
y = X[index, input_2] y = X[index, input_2]
z = X[index, input_3] z = X[index, input_3]
yield x, y, z, this_label, index, m yield x, y, z, this_label, index, m
def subsample_X(X, labels, num_samples=1000): def subsample_X(X, labels, num_samples=1000):
@ -385,5 +386,5 @@ def x_frame2D(X,plot_limits=None,resolution=None):
resolution = resolution or 50 resolution = resolution or 50
xx, yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution] xx, yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution]
Xnew = np.vstack((xx.flatten(),yy.flatten())).T Xnew = np.c_[xx.flat, yy.flat]
return Xnew, xx, yy, xmin, xmax return Xnew, xx, yy, xmin, xmax

View file

@ -18,4 +18,4 @@
from .util import align_subplot_array, align_subplots, fewerXticks, removeRightTicks, removeUpperTicks from .util import align_subplot_array, align_subplots, fewerXticks, removeRightTicks, removeUpperTicks
from . import controllers, base_plots from . import controllers, base_plots

View file

@ -1 +1 @@
from .imshow_controller import ImshowController, ImAnnotateController from .imshow_controller import ImshowController, ImAnnotateController

View file

@ -72,4 +72,4 @@ class ImAnnotateController(ImshowController):
text.set_x(x+xoffset) text.set_x(x+xoffset)
text.set_y(y+yoffset) text.set_y(y+yoffset)
text.set_text("{}".format(X[1][j, i])) text.set_text("{}".format(X[1][j, i]))
return view return view

View file

@ -1,21 +1,21 @@
#=============================================================================== #===============================================================================
# Copyright (c) 2015, Max Zwiessele # Copyright (c) 2015, Max Zwiessele
# All rights reserved. # All rights reserved.
# #
# Redistribution and use in source and binary forms, with or without # Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met: # modification, are permitted provided that the following conditions are met:
# #
# * Redistributions of source code must retain the above copyright notice, this # * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer. # list of conditions and the following disclaimer.
# #
# * Redistributions in binary form must reproduce the above copyright notice, # * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation # this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution. # and/or other materials provided with the distribution.
# #
# * Neither the name of GPy nor the names of its # * Neither the name of GPy nor the names of its
# contributors may be used to endorse or promote products derived from # contributors may be used to endorse or promote products derived from
# this software without specific prior written permission. # this software without specific prior written permission.
# #
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@ -34,20 +34,20 @@ from .. import Tango
''' '''
This file is for defaults for the gpy plot, specific to the plotting library. This file is for defaults for the gpy plot, specific to the plotting library.
Create a kwargs dictionary with the right name for the plotting function Create a kwargs dictionary with the right name for the plotting function
you are implementing. If you do not provide defaults, the default behaviour of you are implementing. If you do not provide defaults, the default behaviour of
the plotting library will be used. the plotting library will be used.
In the code, always ise plotting.gpy_plots.defaults to get the defaults, as In the code, always ise plotting.gpy_plots.defaults to get the defaults, as
it gives back an empty default, when defaults are not defined. it gives back an empty default, when defaults are not defined.
''' '''
# Data plots: # Data plots:
data_1d = dict(lw=1.5, marker='x', edgecolor='k') data_1d = dict(lw=1.5, marker='x', color='k')
data_2d = dict(s=35, edgecolors='none', linewidth=0., cmap=cm.get_cmap('hot'), alpha=.5) data_2d = dict(s=35, edgecolors='none', linewidth=0., cmap=cm.get_cmap('hot'), alpha=.5)
inducing_1d = dict(lw=0, s=500, facecolors=Tango.colorsHex['darkRed']) inducing_1d = dict(lw=0, s=500, facecolors=Tango.colorsHex['darkRed'])
inducing_2d = dict(s=14, edgecolors='k', linewidth=.4, facecolors='white', alpha=.5, marker='^') inducing_2d = dict(s=17, edgecolor='k', linewidth=.4, color='white', alpha=.5, marker='^')
inducing_3d = dict(lw=.3, s=500, facecolors='white', edgecolors='k') inducing_3d = dict(lw=.3, s=500, color=Tango.colorsHex['darkRed'], edgecolor='k')
xerrorbar = dict(color='k', fmt='none', elinewidth=.5, alpha=.5) xerrorbar = dict(color='k', fmt='none', elinewidth=.5, alpha=.5)
yerrorbar = dict(color=Tango.colorsHex['darkRed'], fmt='none', elinewidth=.5, alpha=.5) yerrorbar = dict(color=Tango.colorsHex['darkRed'], fmt='none', elinewidth=.5, alpha=.5)
@ -71,5 +71,5 @@ ard = dict(edgecolor='k', linewidth=1.2)
latent = dict(aspect='auto', cmap='Greys', interpolation='bicubic') latent = dict(aspect='auto', cmap='Greys', interpolation='bicubic')
gradient = dict(aspect='auto', cmap='RdBu', interpolation='nearest', alpha=.7) gradient = dict(aspect='auto', cmap='RdBu', interpolation='nearest', alpha=.7)
magnification = dict(aspect='auto', cmap='Greys', interpolation='bicubic') magnification = dict(aspect='auto', cmap='Greys', interpolation='bicubic')
latent_scatter = dict(s=40, linewidth=.2, edgecolor='k', alpha=.9) latent_scatter = dict(s=20, linewidth=.2, edgecolor='k', alpha=.9)
annotation = dict(fontdict=dict(family='sans-serif', weight='light', fontsize=9), zorder=.3, alpha=.7) annotation = dict(fontdict=dict(family='sans-serif', weight='light', fontsize=9), zorder=.3, alpha=.7)

View file

@ -1,5 +1,5 @@
#=============================================================================== #===============================================================================
# Copyright (c) 2015, Max Zwiessele # Copyright (c) 2016, Max Zwiessele, Alan saul
# All rights reserved. # All rights reserved.
# #
# Redistribution and use in source and binary forms, with or without # Redistribution and use in source and binary forms, with or without
@ -116,4 +116,43 @@ def align_subplot_array(axes,xlim=None, ylim=None):
if i<(M*(N-1)): if i<(M*(N-1)):
ax.set_xticks([]) ax.set_xticks([])
else: else:
removeUpperTicks(ax) removeUpperTicks(ax)
def fixed_inputs(model, non_fixed_inputs, fix_routine='median', as_list=True, X_all=False):
"""
Convenience function for returning back fixed_inputs where the other inputs
are fixed using fix_routine
:param model: model
:type model: Model
:param non_fixed_inputs: dimensions of non fixed inputs
:type non_fixed_inputs: list
:param fix_routine: fixing routine to use, 'mean', 'median', 'zero'
:type fix_routine: string
:param as_list: if true, will return a list of tuples with (dimension, fixed_val) otherwise it will create the corresponding X matrix
:type as_list: boolean
"""
from ...inference.latent_function_inference.posterior import VariationalPosterior
f_inputs = []
if hasattr(model, 'has_uncertain_inputs') and model.has_uncertain_inputs():
X = model.X.mean.values.copy()
elif isinstance(model.X, VariationalPosterior):
X = model.X.values.copy()
else:
if X_all:
X = model.X_all.copy()
else:
X = model.X.copy()
for i in range(X.shape[1]):
if i not in non_fixed_inputs:
if fix_routine == 'mean':
    f_inputs.append( (i, np.mean(X[:,i])) )
elif fix_routine == 'median':
    f_inputs.append( (i, np.median(X[:,i])) )
else: # set to zero
    f_inputs.append( (i, 0) )
if not as_list:
X[:,i] = f_inputs[-1][1]
if as_list:
return f_inputs
else:
return X
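# Illustrative usage sketch (not part of the original file): fix every input
# dimension except the first at its median, producing the (dimension, value) list
# that the plotting routines accept as `fixed_inputs`. GPy.models.GPRegression is
# only used to build a throwaway model for the example.
def _demo_fixed_inputs():
    import numpy as np
    import GPy
    X = np.random.randn(50, 3)
    Y = np.sin(X[:, :1]) + 0.05 * np.random.randn(50, 1)
    model = GPy.models.GPRegression(X, Y)
    # e.g. [(1, median of input column 1), (2, median of input column 2)]
    return fixed_inputs(model, non_fixed_inputs=[0], fix_routine='median')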

View file

@ -15,7 +15,9 @@ def plot(parameterized, fignum=None, ax=None, colors=None, figsize=(12, 6)):
if ax is None: if ax is None:
fig = pb.figure(num=fignum, figsize=figsize) fig = pb.figure(num=fignum, figsize=figsize)
if colors is None: if colors is None:
colors = pb.gca()._get_lines.color_cycle from ..Tango import mediumList
from itertools import cycle
colors = cycle(mediumList)
pb.clf() pb.clf()
else: else:
colors = iter(colors) colors = iter(colors)
@ -64,7 +66,9 @@ def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None, side_by_sid
else: else:
fig = pb.figure(num=fignum, figsize=(8, min(12, (2 * parameterized.mean.shape[1])))) fig = pb.figure(num=fignum, figsize=(8, min(12, (2 * parameterized.mean.shape[1]))))
if colors is None: if colors is None:
colors = pb.gca()._get_lines.color_cycle from ..Tango import mediumList
from itertools import cycle
colors = cycle(mediumList)
pb.clf() pb.clf()
else: else:
colors = iter(colors) colors = iter(colors)

View file

@ -73,4 +73,4 @@ latent = dict(colorscale='Greys', reversescale=True, zsmooth='best')
gradient = dict(colorscale='RdBu', opacity=.7) gradient = dict(colorscale='RdBu', opacity=.7)
magnification = dict(colorscale='Greys', zsmooth='best', reversescale=True) magnification = dict(colorscale='Greys', zsmooth='best', reversescale=True)
latent_scatter = dict(marker_kwargs=dict(size='5', opacity=.7)) latent_scatter = dict(marker_kwargs=dict(size='5', opacity=.7))
# annotation = dict(fontdict=dict(family='sans-serif', weight='light', fontsize=9), zorder=.3, alpha=.7) # annotation = dict(fontdict=dict(family='sans-serif', weight='light', fontsize=9), zorder=.3, alpha=.7)

View file

@ -131,14 +131,15 @@ class PlotlyPlots(AbstractPlottingLibrary):
#not matplotlib marker #not matplotlib marker
pass pass
marker_kwargs = marker_kwargs or {} marker_kwargs = marker_kwargs or {}
marker_kwargs.setdefault('symbol', marker) if 'symbol' not in marker_kwargs:
marker_kwargs['symbol'] = marker
if Z is not None: if Z is not None:
return Scatter3d(x=X, y=Y, z=Z, mode='markers', return Scatter3d(x=X, y=Y, z=Z, mode='markers',
showlegend=label is not None, showlegend=label is not None,
marker=Marker(color=color, colorscale=cmap, **marker_kwargs), marker=Marker(color=color, colorscale=cmap, **marker_kwargs),
name=label, **kwargs) name=label, **kwargs)
return Scatter(x=X, y=Y, mode='markers', showlegend=label is not None, return Scatter(x=X, y=Y, mode='markers', showlegend=label is not None,
marker=Marker(color=color, colorscale=cmap, **marker_kwargs or {}), marker=Marker(color=color, colorscale=cmap, **marker_kwargs),
name=label, **kwargs) name=label, **kwargs)
def plot(self, ax, X, Y, Z=None, color=None, label=None, line_kwargs=None, **kwargs): def plot(self, ax, X, Y, Z=None, color=None, label=None, line_kwargs=None, **kwargs):
@ -254,7 +255,7 @@ class PlotlyPlots(AbstractPlottingLibrary):
font=dict(color='white' if np.abs(var) > 0.8 else 'black', size=10), font=dict(color='white' if np.abs(var) > 0.8 else 'black', size=10),
opacity=.5, opacity=.5,
showarrow=False, showarrow=False,
hoverinfo='x')) ))
return imshow, annotations return imshow, annotations
def annotation_heatmap_interact(self, ax, plot_function, extent, label=None, resolution=15, imshow_kwargs=None, **annotation_kwargs): def annotation_heatmap_interact(self, ax, plot_function, extent, label=None, resolution=15, imshow_kwargs=None, **annotation_kwargs):

Binary files not shown (15 image files changed).
Some files were not shown because too many files have changed in this diff