GPy/GPy/models/state_space_main.py

# -*- coding: utf-8 -*-
# Copyright (c) 2015, Alex Grigorevskiy
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
Main functionality for state-space inference.
"""

import collections  # for cheking whether a variable is iterable
import types  # for cheking whether a variable is a function

import numpy as np
import scipy as sp
import scipy.linalg as linalg

try:
    from . import state_space_setup
    setup_available = True
except ImportError as e:
    setup_available = False


print_verbose = False

try:
    import state_space_cython
    cython_code_available = True
    if print_verbose:
        print("state_space: cython is available")
except ImportError as e:
    cython_code_available = False

#cython_code_available = False
# Use cython by default
use_cython = False
if setup_available:
    use_cython = state_space_setup.use_cython

if print_verbose:
    if use_cython:
        print("state_space: cython is used")
    else:
        print("state_space: cython is NOT used")


class Dynamic_Callables_Python(object):

    def f_a(self, k, m, A):
        """
        p_a: function (k, x_{k-1}, A_{k}). Dynamic function.
            k (iteration number), starts at 0
            x_{k-1} State from the previous step
            A_{k} Jacobian matrices of f_a. In the linear case it is exactly
            A_{k}.
        """

        raise NotImplemented("f_a is not implemented!")

    def Ak(self, k, m, P):  # returns state iteration matrix
        """
        function (k, m, P) return Jacobian of dynamic function, it is passed
        into p_a.
            k (iteration number), starts at 0
            m: point where Jacobian is evaluated
            P: parameter for Jacobian, usually covariance matrix.
        """

        raise NotImplemented("Ak is not implemented!")

    def Qk(self, k):
        """
        function (k). Returns noise matrix of dynamic model on iteration k.
                k (iteration number). starts at 0
        """
        raise NotImplemented("Qk is not implemented!")

    def Q_srk(self, k):
        """
        function (k). Returns the square root of noise matrix of dynamic model
        on iteration k.

        k (iteration number). starts at 0

        This function is implemented to use SVD prediction step.
        """

        raise NotImplemented("Q_srk is not implemented!")

    def dAk(self, k):
        """
        function (k). Returns the derivative of A on iteration k.
                k (iteration number). starts at 0
        """
        raise NotImplemented("dAk is not implemented!")

    def dQk(self, k):
        """
        function (k). Returns the derivative of Q on iteration k.
                k (iteration number). starts at 0
        """
        raise NotImplemented("dQk is not implemented!")

    def reset(self, compute_derivatives=False):
        """
        Return the state of this object to the beginning of iteration
        (to k eq. 0).
        """

        raise NotImplemented("reset is not implemented!")

if use_cython:
    Dynamic_Callables_Class = state_space_cython.Dynamic_Callables_Cython
else:
    Dynamic_Callables_Class = Dynamic_Callables_Python


class Measurement_Callables_Python(object):
    def f_h(self, k, m_pred, Hk):
        """
        function (k, x_{k}, H_{k}). Measurement function.
            k (iteration number), starts at 0
            x_{k} state
            H_{k} Jacobian matrices of f_h. In the linear case it is exactly
            H_{k}.
        """

        raise NotImplemented("f_a is not implemented!")

    def Hk(self, k, m_pred, P_pred):  # returns state iteration matrix
        """
        function (k, m, P) return Jacobian of measurement function, it is
            passed into p_h.
            k (iteration number), starts at 0
            m: point where Jacobian is evaluated
            P: parameter for Jacobian, usually covariance matrix.
        """

        raise NotImplemented("Hk is not implemented!")

    def Rk(self, k):
        """
        function (k). Returns noise matrix of measurement equation
            on iteration k.
            k (iteration number). starts at 0
        """
        raise NotImplemented("Rk is not implemented!")

    def R_isrk(self, k):
        """
        function (k). Returns the square root of the noise matrix of
            measurement equation on iteration k.
            k (iteration number). starts at 0

        This function is implemented to use SVD prediction step.
        """

        raise NotImplemented("Q_srk is not implemented!")

    def dHk(self, k):
        """
        function (k). Returns the derivative of H on iteration k.
                k (iteration number). starts at 0
        """
        raise NotImplemented("dAk is not implemented!")

    def dRk(self, k):
        """
        function (k). Returns the derivative of R on iteration k.
                k (iteration number). starts at 0
        """
        raise NotImplemented("dQk is not implemented!")

    def reset(self, compute_derivatives=False):
        """
        Return the state of this object to the beginning of iteration
        (to k eq. 0)
        """

        raise NotImplemented("reset is not implemented!")

if use_cython:
    Measurement_Callables_Class = state_space_cython.\
        Measurement_Callables_Cython
else:
    Measurement_Callables_Class = Measurement_Callables_Python


class R_handling_Python(Measurement_Callables_Class):
    """
    The calss handles noise matrix R.
    """
    def __init__(self, R, index, R_time_var_index, unique_R_number, dR=None):
        """
        Input:
        ---------------
        R - array with noise on various steps. The result of preprocessing
            the noise input.

        index - for each step of Kalman filter contains the corresponding index
                in the array.

        R_time_var_index - another index in the array R. Computed earlier and
                            is passed here.

        unique_R_number - number of unique noise matrices below which square
                roots are cached and above which they are computed each time.

        dR: 3D array[:, :, param_num]
            derivative of R. Derivative is supported only when R do not change
            over time

        Output:
        --------------
        Object which has two necessary functions:
            f_R(k)
            inv_R_square_root(k)
        """
        self.R = R
        self.index = np.asarray(index, np.int_)
        self.R_time_var_index = int(R_time_var_index)
        self.dR = dR

        if (len(np.unique(index)) > unique_R_number):
            self.svd_each_time = True
        else:
            self.svd_each_time = False

        self.R_square_root = {}

    def Rk(self, k):
        return self.R[:, :, self.index[self.R_time_var_index, k]]

    def dRk(self, k):
        if self.dR is None:
            raise ValueError("dR derivative is None")

        return self.dR  # the same dirivative on each iteration

    def R_isrk(self, k):
        """
        Function returns the inverse square root of R matrix on step k.
        """
        ind = int(self.index[self.R_time_var_index, k])
        R = self.R[:, :, ind]

        if (R.shape[0] == 1):  # 1-D case handle simplier. No storage
            # of the result, just compute it each time.
            inv_square_root = np.sqrt(1.0/R)
        else:
            if self.svd_each_time:

                (U, S, Vh) = sp.linalg.svd(R, full_matrices=False,
                                           compute_uv=True, overwrite_a=False,
                                           check_finite=True)

                inv_square_root = U * 1.0/np.sqrt(S)
            else:
                if ind in self.R_square_root:
                    inv_square_root = self.R_square_root[ind]
                else:
                    (U, S, Vh) = sp.linalg.svd(R, full_matrices=False,
                                               compute_uv=True,
                                               overwrite_a=False,
                                               check_finite=True)

                    inv_square_root = U * 1.0/np.sqrt(S)

                    self.R_square_root[ind] = inv_square_root

        return inv_square_root

if use_cython:
    R_handling_Class = state_space_cython.R_handling_Cython
else:
    R_handling_Class = R_handling_Python


class Std_Measurement_Callables_Python(R_handling_Class):

    def __init__(self, H, H_time_var_index, R, index, R_time_var_index,
                 unique_R_number, dH=None, dR=None):
        super(Std_Measurement_Callables_Python,
              self).__init__(R, index, R_time_var_index, unique_R_number, dR)

        self.H = H
        self.H_time_var_index = int(H_time_var_index)
        self.dH = dH

    def f_h(self, k, m, H):
        """
        function (k, x_{k}, H_{k}). Measurement function.
            k (iteration number), starts at 0
            x_{k} state
            H_{k} Jacobian matrices of f_h. In the linear case it is exactly
            H_{k}.
        """

        return np.dot(H, m)

    def Hk(self, k, m_pred, P_pred):  # returns state iteration matrix
        """
        function (k, m, P) return Jacobian of measurement function, it is
            passed into p_h.
            k (iteration number), starts at 0
            m: point where Jacobian is evaluated
            P: parameter for Jacobian, usually covariance matrix.
        """

        return self.H[:, :, self.index[self.H_time_var_index, k]]

    def dHk(self, k):
        if self.dH is None:
            raise ValueError("dH derivative is None")

        return self.dH  # the same dirivative on each iteration

if use_cython:
    Std_Measurement_Callables_Class = state_space_cython.\
                                        Std_Measurement_Callables_Cython
else:
    Std_Measurement_Callables_Class = Std_Measurement_Callables_Python


class Q_handling_Python(Dynamic_Callables_Class):

    def __init__(self, Q, index, Q_time_var_index, unique_Q_number, dQ=None):
        """
        Input:
        ---------------
        R - array with noise on various steps. The result of preprocessing
            the noise input.

        index - for each step of Kalman filter contains the corresponding index
                in the array.

        R_time_var_index - another index in the array R. Computed earlier and
                            passed here.

        unique_R_number - number of unique noise matrices below which square
            roots are cached and above which they are computed each time.

        dQ: 3D array[:, :, param_num]
            derivative of Q. Derivative is supported only when Q do not
            change over time

        Output:
        --------------
        Object which has two necessary functions:
            f_R(k)
            inv_R_square_root(k)
        """

        self.Q = Q
        self.index = np.asarray(index, np.int_)
        self.Q_time_var_index = Q_time_var_index
        self.dQ = dQ

        if (len(np.unique(index)) > unique_Q_number):
            self.svd_each_time = True
        else:
            self.svd_each_time = False

        self.Q_square_root = {}

    def Qk(self, k):
        """
        function (k). Returns noise matrix of dynamic model on iteration k.
                k (iteration number). starts at 0
        """
        return self.Q[:, :, self.index[self.Q_time_var_index, k]]

    def dQk(self, k):
        if self.dQ is None:
            raise ValueError("dQ derivative is None")

        return self.dQ  # the same dirivative on each iteration

    def Q_srk(self, k):
        """
        function (k). Returns the square root of noise matrix of dynamic model
                      on iteration k.
            k (iteration number). starts at 0

        This function is implemented to use SVD prediction step.
        """
        ind = self.index[self.Q_time_var_index, k]
        Q = self.Q[:, :, ind]

        if (Q.shape[0] == 1):  # 1-D case handle simplier. No storage
            # of the result, just compute it each time.
            square_root = np.sqrt(Q)
        else:
            if self.svd_each_time:

                (U, S, Vh) = sp.linalg.svd(Q, full_matrices=False,
                                           compute_uv=True,
                                           overwrite_a=False,
                                           check_finite=True)

                square_root = U * np.sqrt(S)
            else:

                if ind in self.Q_square_root:
                    square_root = self.Q_square_root[ind]
                else:
                    (U, S, Vh) = sp.linalg.svd(Q, full_matrices=False,
                                               compute_uv=True,
                                               overwrite_a=False,
                                               check_finite=True)

                    square_root = U * np.sqrt(S)

                    self.Q_square_root[ind] = square_root

        return square_root

if use_cython:
    Q_handling_Class = state_space_cython.Q_handling_Cython
else:
    Q_handling_Class = Q_handling_Python


class Std_Dynamic_Callables_Python(Q_handling_Class):

    def __init__(self, A, A_time_var_index, Q, index, Q_time_var_index,
                 unique_Q_number, dA=None, dQ=None):
        super(Std_Dynamic_Callables_Python,
              self).__init__(Q, index, Q_time_var_index, unique_Q_number, dQ)

        self.A = A
        self.A_time_var_index = np.asarray(A_time_var_index, np.int_)
        self.dA = dA

    def f_a(self, k, m, A):
        """
            f_a: function (k, x_{k-1}, A_{k}). Dynamic function.
            k (iteration number), starts at 0
            x_{k-1} State from the previous step
            A_{k} Jacobian matrices of f_a. In the linear case it is exactly
            A_{k}.
        """
        return np.dot(A, m)

    def Ak(self, k, m_pred, P_pred):  # returns state iteration matrix
        """
        function (k, m, P) return Jacobian of measurement function, it is
            passed into p_h.
            k (iteration number), starts at 0
            m: point where Jacobian is evaluated
            P: parameter for Jacobian, usually covariance matrix.
        """

        return self.A[:, :, self.index[self.A_time_var_index, k]]

    def dAk(self, k):
        if self.dA is None:
            raise ValueError("dA derivative is None")

        return self.dA  # the same dirivative on each iteration

    def reset(self, compute_derivatives=False):
        """
        Return the state of this object to the beginning of iteration
        (to k eq. 0)
        """

        return self

if use_cython:
    Std_Dynamic_Callables_Class = state_space_cython.\
                                  Std_Dynamic_Callables_Cython
else:
    Std_Dynamic_Callables_Class = Std_Dynamic_Callables_Python


class AddMethodToClass(object):

    def __init__(self, func=None, tp='staticmethod'):
        """
        Input:
        --------------
        func: function to add
        tp: string
        Type of the method: normal, staticmethod, classmethod
        """
        if func is None:
            raise ValueError("Function can not be None")

        self.func = func
        self.tp = tp

    def __get__(self, obj, klass=None, *args, **kwargs):

        if self.tp == 'staticmethod':
            return self.func
        elif self.tp == 'normal':
            def newfunc(obj, *args, **kwargs):
                return self.func

        elif self.tp == 'classmethod':
            def newfunc(klass, *args, **kwargs):
                return self.func
        return newfunc


class DescreteStateSpaceMeta(type):
    """
    Substitute necessary methods from cython.
    """

    def __new__(typeclass, name, bases, attributes):
        """
        After thos method the class object is created
        """

        if use_cython:
            if '_kalman_prediction_step_SVD' in attributes:
                attributes['_kalman_prediction_step_SVD'] =\
                                    AddMethodToClass(state_space_cython.
                                        _kalman_prediction_step_SVD_Cython)

            if '_kalman_update_step_SVD' in attributes:
                attributes['_kalman_update_step_SVD'] =\
                                    AddMethodToClass(state_space_cython.
                                        _kalman_update_step_SVD_Cython)

            if '_cont_discr_kalman_filter_raw' in attributes:
                attributes['_cont_discr_kalman_filter_raw'] =\
                                    AddMethodToClass(state_space_cython.
                                        _cont_discr_kalman_filter_raw_Cython)

        return super(DescreteStateSpaceMeta,
                     typeclass).__new__(typeclass, name, bases, attributes)


class DescreteStateSpace(object):
    """
    This class implents state-space inference for linear and non-linear
    state-space models.
    Linear models are:
    x_{k} =  A_{k} * x_{k-1} + q_{k-1};       q_{k-1} ~ N(0, Q_{k-1})
    y_{k} = H_{k} * x_{k} + r_{k};            r_{k-1} ~ N(0, R_{k})

    Nonlinear:
    x_{k} =  f_a(k, x_{k-1}, A_{k}) + q_{k-1};       q_{k-1} ~ N(0, Q_{k-1})
    y_{k} =  f_h(k, x_{k}, H_{k}) + r_{k};           r_{k-1} ~ N(0, R_{k})
    Here f_a and f_h are some functions of k (iteration number), x_{k-1} or
    x_{k} (state value on certain iteration), A_{k} and  H_{k} - Jacobian
    matrices of f_a and f_h respectively. In the linear case they are exactly
    A_{k} and  H_{k}.


    Currently two nonlinear Gaussian filter algorithms are implemented:
    Extended Kalman Filter (EKF), Statistically linearized Filter (SLF), which
    implementations are very similar.

    """
    __metaclass__ = DescreteStateSpaceMeta

    @staticmethod
    def _reshape_input_data(shape, desired_dim=3):
        """
        Static function returns the column-wise shape for for an input shape.

        Input:
        --------------
            shape: tuple
                Shape of an input array, so that it is always a column.

            desired_dim: int
                desired shape of output. For Y data it should be 3
                (sample_no, dimension, ts_no). For X data - 2 (sample_no, 1)
        Output:
        --------------
            new_shape: tuple
                New shape of the measurements array. Idea is that samples are
                along dimension 0, sample dimension - dimension 1, different
                time series - dimension 2.
            old_shape: tuple or None
                If the shape has been modified, return old shape, otherwise
                None.
        """

        if (len(shape) > 3):
            raise ValueError("""Input array is not supposed to be more
                                than 3 dimensional.""")

        if (len(shape) > desired_dim):
            raise ValueError("Input array shape is more than desired shape.")
        elif len(shape) == 1:
            if (desired_dim == 3):
                return ((shape[0], 1, 1), shape)  # last dimension is the
                # time serime_series_no
            elif (desired_dim == 2):
                return ((shape[0], 1), shape)

        elif len(shape) == 2:
            if (desired_dim == 3):
                return ((shape[1], 1, 1), shape) if (shape[0] == 1) else\
                    ((shape[0], shape[1], 1), shape)  # convert to column
                                                      # vector
            elif (desired_dim == 2):
                return ((shape[1], 1), shape) if (shape[0] == 1) else\
                    ((shape[0], shape[1]), None)  # convert to column vector

        else:  # len(shape) == 3
            return (shape, None)  # do nothing

    @classmethod
    def kalman_filter(cls, p_A, p_Q, p_H, p_R, Y, index=None, m_init=None,
                      P_init=None, p_kalman_filter_type='regular',
                      calc_log_likelihood=False,
                      calc_grad_log_likelihood=False, grad_params_no=None,
                      grad_calc_params=None):
        """
        This function implements the basic Kalman Filter algorithm
        These notations for the State-Space model are assumed:
            x_{k} =  A_{k} * x_{k-1} + q_{k-1};       q_{k-1} ~ N(0, Q_{k-1})
            y_{k} = H_{k} * x_{k} + r_{k};            r_{k-1} ~ N(0, R_{k})

        Returns estimated filter distributions x_{k} ~ N(m_{k}, P(k))

        Current Features:
        ----------------------------------------
        1) The function generaly do not modify the passed parameters. If
        it happens then it is an error. There are several exeprions: scalars
        can be modified into a matrix, in some rare cases shapes of
        the derivatives matrices may be changed, it is ignored for now.

        2) Copies of p_A, p_Q, index are created in memory to be used later
        in smoother. References to copies are kept in "matrs_for_smoother"
        return parameter.

        3) Function support "multiple time series mode" which means that exactly
        the same State-Space model is used to filter several sets of measurements.
        In this case third dimension of Y should include these state-space measurements
        Log_likelihood and Grad_log_likelihood have the corresponding dimensions then.

        4) Calculation of Grad_log_likelihood is not supported if matrices A,Q,
        H, or R changes over time. (later may be changed)

        5) Measurement may include missing values. In this case update step is
        not done for this measurement. (later may be changed)

        Input:
        -----------------

        p_A: scalar, square matrix, 3D array
            A_{k} in the model. If matrix then A_{k} = A - constant.
            If it is 3D array then A_{k} = p_A[:,:, index[0,k]]

        p_Q: scalar, square symmetric matrix, 3D array
            Q_{k-1} in the model. If matrix then Q_{k-1} = Q - constant.
            If it is 3D array then Q_{k-1} = p_Q[:,:, index[1,k]]

        p_H: scalar, matrix (measurement_dim, state_dim) , 3D array
            H_{k} in the model. If matrix then H_{k} = H - constant.
            If it is 3D array then H_{k} = p_Q[:,:, index[2,k]]

        p_R: scalar, square symmetric matrix, 3D array
            R_{k} in the model. If matrix then R_{k} = R - constant.
            If it is 3D array then R_{k} = p_R[:,:, index[3,k]]

        Y: matrix or vector or 3D array
            Data. If Y is matrix then samples are along 0-th dimension and
            features along the 1-st. If 3D array then third dimension
            correspond to "multiple time series mode".

        index: vector
            Which indices (on 3-rd dimension) from arrays p_A, p_Q,p_H, p_R to use
            on every time step. If this parameter is None then it is assumed
            that p_A, p_Q, p_H, p_R do not change over time and indices are not needed.
            index[0,:] - correspond to A, index[1,:] - correspond to Q
            index[2,:] - correspond to H, index[3,:] - correspond to R.
            If index.shape[0] == 1, it is assumed that indides for all matrices
            are the same.

        m_init: vector or matrix
            Initial distribution mean. If None it is assumed to be zero.
            For "multiple time series mode" it is matrix, second dimension of
            which correspond to different time series. In regular case ("one
            time series mode") it is a vector.

        P_init: square symmetric matrix or scalar
            Initial covariance of the states. If the parameter is scalar
            then it is assumed that initial covariance matrix is unit matrix
            multiplied by this scalar. If None the unit matrix is used instead.
            "multiple time series mode" does not affect it, since it does not
            affect anything related to state variaces.

        calc_log_likelihood: boolean
            Whether to calculate marginal likelihood of the state-space model.

        calc_grad_log_likelihood: boolean
            Whether to calculate gradient of the marginal likelihood
            of the state-space model. If true then "grad_calc_params" parameter must
            provide the extra parameters for gradient calculation.

        grad_params_no: int
            If previous parameter is true, then this parameters gives the
            total number of parameters in the gradient.

        grad_calc_params: dictionary
            Dictionary with derivatives of model matrices with respect
            to parameters "dA", "dQ", "dH", "dR", "dm_init", "dP_init".
            They can be None, in this case zero matrices (no dependence on parameters)
            is assumed. If there is only one parameter then third dimension is
            automatically added.

        Output:
        --------------

        M: (no_steps+1,state_dim) matrix or (no_steps+1,state_dim, time_series_no) 3D array
            Filter estimates of the state means. In the extra step the initial
            value is included. In the "multiple time series mode" third dimension
            correspond to different timeseries.

        P: (no_steps+1, state_dim, state_dim) 3D array
            Filter estimates of the state covariances. In the extra step the initial
            value is included.

        log_likelihood: double or (1, time_series_no) 3D array.
            If the parameter calc_log_likelihood was set to true, return
            logarithm of marginal likelihood of the state-space model. If
            the parameter was false, return None. In the "multiple time series mode" it is a vector
            providing log_likelihood for each time series.

        grad_log_likelihood: column vector or (grad_params_no, time_series_no) matrix
            If calc_grad_log_likelihood is true, return gradient of log likelihood
            with respect to parameters. It returns it column wise, so in
            "multiple time series mode" gradients for each time series is in the
            corresponding column.

        matrs_for_smoother: dict
            Dictionary with model functions for smoother. The intrinsic model
            functions are computed in this functions and they are returned to
            use in smoother for convenience. They are: 'p_a', 'p_f_A', 'p_f_Q'
            The dictionary contains the same fields.
        """

        #import pdb; pdb.set_trace()

        # Parameters checking ->
        # index
        p_A = np.atleast_1d(p_A)
        p_Q = np.atleast_1d(p_Q)
        p_H = np.atleast_1d(p_H)
        p_R = np.atleast_1d(p_R)

        # Reshape and check measurements:
        Y.shape, old_Y_shape  = cls._reshape_input_data(Y.shape)
        measurement_dim = Y.shape[1]
        time_series_no = Y.shape[2] # multiple time series mode

        if ((len(p_A.shape) == 3) and (len(p_A.shape[2]) != 1)) or\
            ((len(p_Q.shape) == 3) and (len(p_Q.shape[2]) != 1)) or\
            ((len(p_H.shape) == 3) and (len(p_H.shape[2]) != 1)) or\
            ((len(p_R.shape) == 3) and (len(p_R.shape[2]) != 1)):
            model_matrices_chage_with_time = True
        else:
            model_matrices_chage_with_time = False

        # Check index
        old_index_shape = None
        if index is None:
            if (len(p_A.shape) == 3) or (len(p_Q.shape) == 3) or\
                (len(p_H.shape) == 3) or (len(p_R.shape) == 3):
                raise ValueError("Parameter index can not be None for time varying matrices (third dimension is present)")
            else: # matrices do not change in time, so form dummy zero indices.
                index = np.zeros((1,Y.shape[0]))
        else:
            if len(index.shape) == 1:
                index.shape = (1,index.shape[0])
                old_index_shape = (index.shape[0],)

            if (index.shape[1] != Y.shape[0]):
                raise ValueError("Number of measurements must be equal the number of A_{k}, Q_{k}, H_{k}, R_{k}")

        if (index.shape[0] == 1):
            A_time_var_index = 0; Q_time_var_index = 0
            H_time_var_index = 0; R_time_var_index = 0
        elif (index.shape[0] == 4):
            A_time_var_index = 0; Q_time_var_index = 1
            H_time_var_index = 2; R_time_var_index = 3
        else:
            raise ValueError("First Dimension of index must be either 1 or 4.")

        state_dim = p_A.shape[0]
        # Check and make right shape for model matrices. On exit they all are 3 dimensional. Last dimension
        # correspond to change in time.
        (p_A, old_A_shape) = cls._check_SS_matrix(p_A, state_dim, measurement_dim, which='A')
        (p_Q, old_Q_shape) = cls._check_SS_matrix(p_Q, state_dim, measurement_dim, which='Q')
        (p_H, old_H_shape) = cls._check_SS_matrix(p_H, state_dim, measurement_dim, which='H')
        (p_R, old_R_shape) = cls._check_SS_matrix(p_R, state_dim, measurement_dim, which='R')

        # m_init
        if m_init is None:
            m_init = np.zeros((state_dim, time_series_no))
        else:
            m_init = np.atleast_2d(m_init).T

        # P_init
        if P_init is None:
            P_init = np.eye(state_dim)
        elif not isinstance(P_init, collections.Iterable): #scalar
            P_init = P_init*np.eye(state_dim)

        if p_kalman_filter_type not in ('regular', 'svd'):
            raise ValueError("Kalman filer type neither 'regular nor 'svd'.")

        # Functions to pass to the kalman_filter algorithm:
        # Parameters:
        # k - number of Kalman filter iteration
        # m - vector for calculating matrices. Required for EKF. Not used here.

        c_p_A = p_A.copy() # create a copy because this object is passed to the smoother
        c_p_Q = p_Q.copy() # create a copy because this object is passed to the smoother
        c_index = index.copy() # create a copy because this object is passed to the smoother

        if calc_grad_log_likelihood:
            if model_matrices_chage_with_time:
                raise ValueError("When computing likelihood gradient A and Q can not change over time.")

            dA = cls._check_grad_state_matrices(grad_calc_params.get('dA'), state_dim, grad_params_no, which = 'dA')
            dQ = cls._check_grad_state_matrices(grad_calc_params.get('dQ'), state_dim, grad_params_no, which = 'dQ')
            dH = cls._check_grad_measurement_matrices(grad_calc_params.get('dH'), state_dim, grad_params_no, measurement_dim, which = 'dH')
            dR = cls._check_grad_measurement_matrices(grad_calc_params.get('dR'), state_dim, grad_params_no, measurement_dim, which = 'dR')

            dm_init = grad_calc_params.get('dm_init')
            if dm_init is None:
                 # multiple time series mode. Keep grad_params always as a last dimension
                dm_init = np.zeros((state_dim, time_series_no, grad_params_no))

            dP_init = grad_calc_params.get('dP_init')
            if dP_init is None:
                dP_init = np.zeros((state_dim,state_dim,grad_params_no))
        else:
            dA = None
            dQ = None
            dH = None
            dR = None
            dm_init = None
            dP_init = None

        dynamic_callables = Std_Dynamic_Callables_Class(c_p_A, A_time_var_index, c_p_Q, c_index, Q_time_var_index, 20, dA, dQ)
        measurement_callables = Std_Measurement_Callables_Class(p_H, H_time_var_index, p_R, index, R_time_var_index, 20, dH, dR)

        (M, P,log_likelihood, grad_log_likelihood, dynamic_callables) = \
            cls._kalman_algorithm_raw(state_dim, dynamic_callables,
                                    measurement_callables, Y, m_init,
                                    P_init, p_kalman_filter_type = p_kalman_filter_type,
                                    calc_log_likelihood=calc_log_likelihood,
                                    calc_grad_log_likelihood=calc_grad_log_likelihood,
                                    grad_params_no=grad_params_no,
                                    dm_init=dm_init, dP_init=dP_init)

        # restore shapes so that input parameters are unchenged
        if old_index_shape is not None:
            index.shape = old_index_shape

        if old_Y_shape is not None:
            Y.shape = old_Y_shape

        if old_A_shape is not None:
            p_A.shape = old_A_shape

        if old_Q_shape is not None:
            p_Q.shape = old_Q_shape

        if old_H_shape is not None:
            p_H.shape = old_H_shape

        if old_R_shape is not None:
            p_R.shape = old_R_shape
        # Return values

        return (M, P,log_likelihood, grad_log_likelihood, dynamic_callables)

    @classmethod
    def extended_kalman_filter(cls,p_state_dim, p_a, p_f_A, p_f_Q, p_h, p_f_H, p_f_R, Y, m_init=None,
                          P_init=None,calc_log_likelihood=False):

        """
        Extended Kalman Filter

        Input:
        -----------------

        p_state_dim: integer

        p_a: if None - the function from the linear model is assumed. No non-
             linearity in the dynamic is assumed.

             function (k, x_{k-1}, A_{k}). Dynamic function.
             k: (iteration number),
             x_{k-1}: (previous state)
             x_{k}: Jacobian matrices of f_a. In the linear case it is exactly A_{k}.

        p_f_A: matrix - in this case function which returns this matrix is assumed.
               Look at this parameter description in kalman_filter function.

               function (k, m, P) return Jacobian of dynamic function, it is
               passed into p_a.

               k: (iteration number),
               m: point where Jacobian is evaluated
               P: parameter for Jacobian, usually covariance matrix.

        p_f_Q: matrix. In this case function which returns this matrix is asumed.
               Look at this parameter description in kalman_filter function.

               function (k). Returns noise matrix of dynamic model on iteration k.
               k: (iteration number).

        p_h: if None - the function from the linear measurement model is assumed.
             No nonlinearity in the measurement is assumed.

             function (k, x_{k}, H_{k}). Measurement function.
             k: (iteration number),
             x_{k}: (current state)
             H_{k}: Jacobian matrices of f_h. In the linear case it is exactly H_{k}.

        p_f_H: matrix - in this case function which returns this matrix is assumed.
               function (k, m, P) return Jacobian of dynamic function, it is
               passed into p_h.
               k: (iteration number),
               m: point where Jacobian is evaluated
               P: parameter for Jacobian, usually covariance matrix.

        p_f_R: matrix. In this case function which returns this matrix is asumed.
               function (k). Returns noise matrix of measurement equation
               on iteration k.
               k: (iteration number).

        Y: matrix or vector
            Data. If Y is matrix then samples are along 0-th dimension and
            features along the 1-st. May have missing values.

        p_mean: vector
            Initial distribution mean. If None it is assumed to be zero

        P_init: square symmetric matrix or scalar
            Initial covariance of the states. If the parameter is scalar
            then it is assumed that initial covariance matrix is unit matrix
            multiplied by this scalar. If None the unit matrix is used instead.

        calc_log_likelihood: boolean
            Whether to calculate marginal likelihood of the state-space model.
        """

        # Y
        Y.shape, old_Y_shape  =  cls._reshape_input_data(Y.shape)

         # m_init
        if m_init is None:
            m_init = np.zeros((p_state_dim,1))
        else:
            m_init = np.atleast_2d(m_init).T

        # P_init
        if P_init is None:
            P_init = np.eye(p_state_dim)
        elif not isinstance(P_init, collections.Iterable): #scalar
            P_init = P_init*np.eye(p_state_dim)

        if p_a is None:
            p_a = lambda k,m,A: np.dot(A, m)

        old_A_shape = None
        if not isinstance(p_f_A, types.FunctionType): # not a function but array
            p_f_A = np.atleast_1d(p_f_A)
            (p_A, old_A_shape) = cls._check_A_matrix(p_f_A)

            p_f_A = lambda k, m, P: p_A[:,:, 0] # make function
        else:
            if p_f_A(1, m_init, P_init).shape[0] != m_init.shape[0]:
                raise ValueError("p_f_A function returns matrix of wrong size")

        old_Q_shape = None
        if not isinstance(p_f_Q, types.FunctionType): # not a function but array
            p_f_Q = np.atleast_1d(p_f_Q)
            (p_Q, old_Q_shape) = cls._check_Q_matrix(p_f_Q)

            p_f_Q = lambda k: p_Q[:,:, 0] # make function
        else:
            if p_f_Q(1).shape[0] != m_init.shape[0]:
                raise ValueError("p_f_Q function returns matrix of wrong size")

        if p_h is None:
            lambda k,m,H: np.dot(H, m)

        old_H_shape = None
        if not isinstance(p_f_H, types.FunctionType): # not a function but array
            p_f_H = np.atleast_1d(p_f_H)
            (p_H, old_H_shape) = cls._check_H_matrix(p_f_H)

            p_f_H = lambda k, m, P: p_H # make function
        else:
            if p_f_H(1, m_init, P_init).shape[0] != Y.shape[1]:
                raise ValueError("p_f_H function returns matrix of wrong size")

        old_R_shape = None
        if not isinstance(p_f_R, types.FunctionType): # not a function but array
            p_f_R = np.atleast_1d(p_f_R)
            (p_R, old_R_shape) = cls._check_H_matrix(p_f_R)

            p_f_R = lambda k: p_R # make function
        else:
            if p_f_R(1).shape[0] != m_init.shape[0]:
                raise ValueError("p_f_R function returns matrix of wrong size")

#        class dynamic_callables_class(Dynamic_Model_Callables):
#
#            Ak =
#            Qk =


        class measurement_callables_class(R_handling_Class):
            def __init__(self,R, index, R_time_var_index, unique_R_number):
                super(measurement_callables_class,self).__init__(R, index, R_time_var_index, unique_R_number)

            Hk = AddMethodToClass(f_H)
            f_h = AddMethodToClass(f_hl)


        (M, P,log_likelihood, grad_log_likelihood)  = cls._kalman_algorithm_raw(p_state_dim, p_a, p_f_A, p_f_Q, p_h, p_f_H, p_f_R, Y, m_init,
                          P_init, calc_log_likelihood,
                          calc_grad_log_likelihood=False, grad_calc_params=None)

        if old_Y_shape is not None:
            Y.shape = old_Y_shape

        if old_A_shape is not None:
            p_A.shape = old_A_shape

        if old_Q_shape is not None:
            p_Q.shape = old_Q_shape

        if old_H_shape is not None:
            p_H.shape = old_H_shape

        if old_R_shape is not None:
            p_R.shape = old_R_shape

        return (M, P)

    @classmethod
    def _kalman_algorithm_raw(cls,state_dim, p_dynamic_callables, p_measurement_callables, Y, m_init,
                          P_init, p_kalman_filter_type='regular',
                          calc_log_likelihood=False,
                          calc_grad_log_likelihood=False, grad_params_no=None,
                          dm_init=None, dP_init=None):
        """
        General nonlinear filtering algorithm for inference in the state-space
        model:

        x_{k} =  f_a(k, x_{k-1}, A_{k}) + q_{k-1};       q_{k-1} ~ N(0, Q_{k-1})
        y_{k} =  f_h(k, x_{k}, H_{k}) + r_{k};           r_{k-1} ~ N(0, R_{k})

        Returns estimated filter distributions x_{k} ~ N(m_{k}, P(k))

        Current Features:
        ----------------------------------------

        1) Function support "multiple time series mode" which means that exactly
        the same State-Space model is used to filter several sets of measurements.
        In this case third dimension of Y should include these state-space measurements
        Log_likelihood and Grad_log_likelihood have the corresponding dimensions then.

        2) Measurement may include missing values. In this case update step is
        not done for this measurement. (later may be changed)

        Input:
        -----------------
        state_dim: int
            Demensionality of the states

        p_a: function (k, x_{k-1}, A_{k}). Dynamic function.
            k (iteration number),
            x_{k-1}
            A_{k} Jacobian matrices of f_a. In the linear case it is exactly A_{k}.

        p_f_A: function (k, m, P) return Jacobian of dynamic function, it is
            passed into p_a.
            k (iteration number),
            m: point where Jacobian is evaluated
            P: parameter for Jacobian, usually covariance matrix.

        p_f_Q: function (k). Returns noise matrix of dynamic model on iteration k.
            k (iteration number).

        p_h: function (k, x_{k}, H_{k}). Measurement function.
            k (iteration number),
            x_{k}
            H_{k} Jacobian matrices of f_h. In the linear case it is exactly H_{k}.

        p_f_H: function (k, m, P) return Jacobian of dynamic function, it is
            passed into p_h.
            k (iteration number),
            m: point where Jacobian is evaluated
            P: parameter for Jacobian, usually covariance matrix.

        p_f_R: function (k). Returns noise matrix of measurement equation
            on iteration k.
            k (iteration number).

        Y: matrix or vector or 3D array
            Data. If Y is matrix then samples are along 0-th dimension and
            features along the 1-st. If 3D array then third dimension
            correspond to "multiple time series mode".

        m_init: vector or matrix
            Initial distribution mean. For "multiple time series mode"
            it is matrix, second dimension of which correspond to different
            time series. In regular case ("one time series mode") it is a
            vector.

        P_init: matrix or scalar
            Initial covariance of the states. Must be not None
            "multiple time series mode" does not affect it, since it does not
            affect anything related to state variaces.

        p_kalman_filter_type: string


        calc_log_likelihood: boolean
            Whether to calculate marginal likelihood of the state-space model.

        calc_grad_log_likelihood: boolean
            Whether to calculate gradient of the marginal likelihood
            of the state-space model. If true then the next parameter must
            provide the extra parameters for gradient calculation.

        grad_calc_params: dictionary
            Dictionary with derivatives of model matrices with respect
            to parameters "dA", "dQ", "dH", "dR", "dm_init", "dP_init".

        Output:
        --------------

        M: (no_steps+1,state_dim) matrix or (no_steps+1,state_dim, time_series_no) 3D array
            Filter estimates of the state means. In the extra step the initial
            value is included. In the "multiple time series mode" third dimension
            correspond to different timeseries.

        P: (no_steps+1, state_dim, state_dim) 3D array
            Filter estimates of the state covariances. In the extra step the initial
            value is included.

        log_likelihood: double or (1, time_series_no) 3D array.
            If the parameter calc_log_likelihood was set to true, return
            logarithm of marginal likelihood of the state-space model. If
            the parameter was false, return None. In the "multiple time series mode" it is a vector
            providing log_likelihood for each time series.

        grad_log_likelihood: column vector or (grad_params_no, time_series_no) matrix
            If calc_grad_log_likelihood is true, return gradient of log likelihood
            with respect to parameters. It returns it column wise, so in
            "multiple time series mode" gradients for each time series is in the
            corresponding column.

        """

        steps_no = Y.shape[0] # number of steps in the Kalman Filter
        time_series_no = Y.shape[2] # multiple time series mode

        # Allocate space for results
        # Mean estimations. Initial values will be included
        M = np.empty(((steps_no+1),state_dim,time_series_no))
        M[0,:,:] = m_init # Initialize mean values
        # Variance estimations. Initial values will be included
        P = np.empty(((steps_no+1),state_dim,state_dim))
        P_init = 0.5*( P_init + P_init.T) # symmetrize initial covariance. In some ustable cases this is uiseful
        P[0,:,:] = P_init # Initialize initial covariance matrix

        if p_kalman_filter_type == 'svd':
            (U,S,Vh) = sp.linalg.svd( P_init,full_matrices=False, compute_uv=True,
                      overwrite_a=False,check_finite=True)
            S[ (S==0) ] = 1e-17 # allows to run algorithm for singular initial variance
            P_upd = (P_init, S,U)

        log_likelihood = 0 if calc_log_likelihood else None
        grad_log_likelihood = 0 if calc_grad_log_likelihood else None

        #setting initial values for derivatives update
        dm_upd = dm_init
        dP_upd = dP_init
        # Main loop of the Kalman filter
        for k in range(0,steps_no):
            # In this loop index for new estimations is (k+1), old - (k)
            # This happened because initial values are stored at 0-th index.

            prev_mean = M[k,:,:] # mean from the previous step

            if p_kalman_filter_type == 'svd':
                m_pred, P_pred, dm_pred, dP_pred = \
                cls._kalman_prediction_step_SVD(k, prev_mean ,P_upd, p_dynamic_callables,
                    calc_grad_log_likelihood=calc_grad_log_likelihood,
                    p_dm = dm_upd, p_dP = dP_upd)
            else:
                m_pred, P_pred, dm_pred, dP_pred = \
                cls._kalman_prediction_step(k, prev_mean ,P[k,:,:], p_dynamic_callables,
                    calc_grad_log_likelihood=calc_grad_log_likelihood,
                    p_dm = dm_upd, p_dP = dP_upd )

            k_measurment = Y[k,:,:]

            if (np.any(np.isnan(k_measurment)) == False):
                if p_kalman_filter_type == 'svd':
                    m_upd, P_upd, log_likelihood_update, dm_upd, dP_upd, d_log_likelihood_update = \
                    cls._kalman_update_step_SVD(k,  m_pred , P_pred, p_measurement_callables,
                            k_measurment, calc_log_likelihood=calc_log_likelihood,
                            calc_grad_log_likelihood=calc_grad_log_likelihood,
                            p_dm = dm_pred, p_dP = dP_pred )


    #                m_upd, P_upd, log_likelihood_update, dm_upd, dP_upd, d_log_likelihood_update = \
    #                cls._kalman_update_step(k,  m_pred , P_pred[0], f_h, f_H, p_R.f_R, k_measurment,
    #                        calc_log_likelihood=calc_log_likelihood,
    #                        calc_grad_log_likelihood=calc_grad_log_likelihood,
    #                        p_dm = dm_pred, p_dP = dP_pred, grad_calc_params_2 = (dH, dR))
    #
    #                (U,S,Vh) = sp.linalg.svd( P_upd,full_matrices=False, compute_uv=True,
    #                      overwrite_a=False,check_finite=True)
    #                P_upd = (P_upd, S,U)
                else:
                    m_upd, P_upd, log_likelihood_update, dm_upd, dP_upd, d_log_likelihood_update = \
                    cls._kalman_update_step(k,  m_pred , P_pred, p_measurement_callables, k_measurment,
                            calc_log_likelihood=calc_log_likelihood,
                            calc_grad_log_likelihood=calc_grad_log_likelihood,
                            p_dm = dm_pred, p_dP = dP_pred )

            else:
#                if k_measurment.shape != (1,1):
#                    raise ValueError("Nan measurements are currently not supported for \
#                                     multidimensional output and multiple time series.")
#                else:
#                    m_upd = m_pred; P_upd = P_pred; dm_upd = dm_pred; dP_upd = dP_pred
#                    log_likelihood_update = 0.0;
#                    d_log_likelihood_update = 0.0;

                if not np.all(np.isnan(k_measurment)):
                    raise ValueError("""Nan measurements are currently not supported if
                                     they are intermixed with not NaN measurements""")
                else:
                    m_upd = m_pred; P_upd = P_pred; dm_upd = dm_pred; dP_upd = dP_pred
                    if calc_log_likelihood:
                        log_likelihood_update = np.zeros((time_series_no,))
                    if calc_grad_log_likelihood:
                        d_log_likelihood_update = np.zeros((grad_params_no,time_series_no))


            if calc_log_likelihood:
                log_likelihood += log_likelihood_update

            if calc_grad_log_likelihood:
                grad_log_likelihood += d_log_likelihood_update

            M[k+1,:,:] = m_upd # separate mean value for each time series

            if p_kalman_filter_type == 'svd':
                P[k+1,:,:] = P_upd[0]
            else:
                P[k+1,:,:] = P_upd

        # !!!Print statistics! Print sizes of matrices
        # !!!Print statistics! Print iteration time base on another boolean variable
        return (M, P, log_likelihood, grad_log_likelihood, p_dynamic_callables.reset(False))

    @staticmethod
    def _kalman_prediction_step(k, p_m , p_P, p_dyn_model_callable, calc_grad_log_likelihood=False,
                                p_dm = None, p_dP = None):
        """
        Desctrete prediction function

        Input:
            k:int
                Iteration No. Starts at 0. Total number of iterations equal to the
                number of measurements.

            p_m: matrix of size (state_dim, time_series_no)
                Mean value from the previous step. For "multiple time series mode"
                it is matrix, second dimension of which correspond to different
                time series.

            p_P:
                Covariance matrix from the previous step.

            p_dyn_model_callable: class


            calc_grad_log_likelihood: boolean
                Whether to calculate gradient of the marginal likelihood
                of the state-space model. If true then the next parameter must
                provide the extra parameters for gradient calculation.

            p_dm: 3D array (state_dim, time_series_no, parameters_no)
                Mean derivatives from the previous step. For "multiple time series mode"
                it is 3D array, second dimension of which correspond to different
                time series.

            p_dP: 3D array (state_dim, state_dim, parameters_no)
                Mean derivatives from the previous step

        Output:
        ----------------------------
        m_pred, P_pred, dm_pred, dP_pred: metrices, 3D objects
            Results of the prediction steps.

        """

        # index correspond to values from previous iteration.
        A = p_dyn_model_callable.Ak(k,p_m,p_P) # state transition matrix (or Jacobian)
        Q = p_dyn_model_callable.Qk(k) # state noise matrix

        # Prediction step ->
        m_pred = p_dyn_model_callable.f_a(k, p_m, A) # predicted mean
        P_pred = A.dot(p_P).dot(A.T) + Q # predicted variance
        # Prediction step <-

        if calc_grad_log_likelihood:
            dA_all_params = p_dyn_model_callable.dAk(k) # derivatives of A wrt parameters
            dQ_all_params = p_dyn_model_callable.dQk(k) # derivatives of Q wrt parameters

            param_number = p_dP.shape[2]

            # p_dm, p_dP - derivatives form the previoius step
            dm_pred = np.empty(p_dm.shape)
            dP_pred = np.empty(p_dP.shape)

            for j in range(param_number):
                dA = dA_all_params[:,:,j]
                dQ = dQ_all_params[:,:,j]

                dP = p_dP[:,:,j]
                dm = p_dm[:,:,j]
                dm_pred[:,:,j] = np.dot(dA, p_m) + np.dot(A, dm)
                # prediction step derivatives for current parameter:

                dP_pred[:,:,j] = np.dot( dA ,np.dot(p_P, A.T))
                dP_pred[:,:,j] += dP_pred[:,:,j].T
                dP_pred[:,:,j] += np.dot( A ,np.dot(dP, A.T)) + dQ

                dP_pred[:,:,j] = 0.5*(dP_pred[:,:,j] + dP_pred[:,:,j].T) #symmetrize
        else:
            dm_pred = None
            dP_pred = None

        return m_pred, P_pred, dm_pred, dP_pred

    @staticmethod
    def _kalman_prediction_step_SVD(k, p_m , p_P, p_dyn_model_callable, calc_grad_log_likelihood=False,
                                p_dm = None, p_dP = None):
        """
        Desctrete prediction function

        Input:
            k:int
                Iteration No. Starts at 0. Total number of iterations equal to the
                number of measurements.

            p_m: matrix of size (state_dim, time_series_no)
                Mean value from the previous step. For "multiple time series mode"
                it is matrix, second dimension of which correspond to different
                time series.

            p_P: tuple (Prev_cov, S, V)
                Covariance matrix from the previous step and its SVD decomposition.
                Prev_cov = V * S * V.T The tuple is (Prev_cov, S, V)

            p_dyn_model_callable: object

            calc_grad_log_likelihood: boolean
                Whether to calculate gradient of the marginal likelihood
                of the state-space model. If true then the next parameter must
                provide the extra parameters for gradient calculation.

            p_dm: 3D array (state_dim, time_series_no, parameters_no)
                Mean derivatives from the previous step. For "multiple time series mode"
                it is 3D array, second dimension of which correspond to different
                time series.

            p_dP: 3D array (state_dim, state_dim, parameters_no)
                Mean derivatives from the previous step

        Output:
        ----------------------------
        m_pred, P_pred, dm_pred, dP_pred: metrices, 3D objects
            Results of the prediction steps.

        """

        # covariance from the previous step and its SVD decomposition
        # p_prev_cov = v * S * V.T
        Prev_cov, S_old, V_old = p_P
        #p_prev_cov_tst = np.dot(p_V, (p_S * p_V).T) # reconstructed covariance from the previous step

        # index correspond to values from previous iteration.
        A = p_dyn_model_callable.Ak(k,p_m,Prev_cov) # state transition matrix (or Jacobian)
        Q = p_dyn_model_callable.Qk(k) # state noise matrx. This is necessary for the square root calculation (next step)
        Q_sr = p_dyn_model_callable.Q_srk(k)
        # Prediction step ->
        m_pred = p_dyn_model_callable.f_a(k, p_m, A) # predicted mean

        # coavariance prediction have changed:
        svd_1_matr = np.vstack( ( (np.sqrt(S_old)* np.dot(A,V_old)).T , Q_sr.T) )
        (U,S,Vh) = sp.linalg.svd( svd_1_matr,full_matrices=False, compute_uv=True,
                      overwrite_a=False,check_finite=True)

        # predicted variance computed by the regular method. For testing
        #P_pred_tst = A.dot(Prev_cov).dot(A.T) + Q
        V_new = Vh.T
        S_new = S**2

        P_pred = np.dot(V_new * S_new, V_new.T) # prediction covariance
        P_pred = (P_pred, S_new, Vh.T)
        # Prediction step <-

        # derivatives
        if calc_grad_log_likelihood:
            dA_all_params = p_dyn_model_callable.dAk(k) # derivatives of A wrt parameters
            dQ_all_params = p_dyn_model_callable.dQk(k) # derivatives of Q wrt parameters

            param_number = p_dP.shape[2]

            # p_dm, p_dP - derivatives form the previoius step
            dm_pred = np.empty(p_dm.shape)
            dP_pred = np.empty(p_dP.shape)

            for j in range(param_number):
                dA = dA_all_params[:,:,j]
                dQ = dQ_all_params[:,:,j]

                #dP = p_dP[:,:,j]
                #dm = p_dm[:,:,j]
                dm_pred[:,:,j] = np.dot(dA, p_m) + np.dot(A, p_dm[:,:,j])
                # prediction step derivatives for current parameter:


                dP_pred[:,:,j] = np.dot( dA ,np.dot(Prev_cov, A.T))
                dP_pred[:,:,j] += dP_pred[:,:,j].T
                dP_pred[:,:,j] += np.dot( A ,np.dot(p_dP[:,:,j], A.T)) + dQ

                dP_pred[:,:,j] = 0.5*(dP_pred[:,:,j] + dP_pred[:,:,j].T) #symmetrize
        else:
            dm_pred = None
            dP_pred = None

        return m_pred, P_pred, dm_pred, dP_pred

    @staticmethod
    def _kalman_update_step(k,   p_m , p_P, p_meas_model_callable, measurement, calc_log_likelihood= False,
                            calc_grad_log_likelihood=False, p_dm = None, p_dP = None):
        """
        Input:

        k: int
              Iteration No. Starts at 0. Total number of iterations equal to the
              number of measurements.

        m_P: matrix of size (state_dim, time_series_no)
             Mean value from the previous step. For "multiple time series mode"
                it is matrix, second dimension of which correspond to different
                time series.

        p_P:
             Covariance matrix from the prediction step.

        p_meas_model_callable: object

        measurement: (measurement_dim, time_series_no) matrix
            One measurement used on the current update step. For
            "multiple time series mode" it is matrix, second dimension of
            which correspond to different time series.

        calc_log_likelihood: boolean
            Whether to calculate marginal likelihood of the state-space model.

        calc_grad_log_likelihood: boolean
                Whether to calculate gradient of the marginal likelihood
                of the state-space model. If true then the next parameter must
                provide the extra parameters for gradient calculation.

        p_dm: 3D array (state_dim, time_series_no, parameters_no)
                Mean derivatives from the prediction step. For "multiple time series mode"
                it is 3D array, second dimension of which correspond to different
                time series.

        p_dP: array
            Covariance derivatives from the prediction step.

        Output:
        ----------------------------
        m_upd, P_upd, dm_upd, dP_upd: metrices, 3D objects
            Results of the prediction steps.

        log_likelihood_update: double or 1D array
            Update to the log_likelihood from this step

        d_log_likelihood_update: (grad_params_no, time_series_no) matrix
            Update to the gradient of log_likelihood, "multiple time series mode"
            adds extra columns to the gradient.

        """
        #import pdb; pdb.set_trace()

        m_pred = p_m # from prediction step
        P_pred = p_P # from prediction step

        H = p_meas_model_callable.Hk(k, m_pred, P_pred)
        R = p_meas_model_callable.Rk(k)

        time_series_no = p_m.shape[1] # number of time serieses

        log_likelihood_update=None; dm_upd=None; dP_upd=None; d_log_likelihood_update=None
        # Update step (only if there is data)
        #if not np.any(np.isnan(measurement)): # TODO: if some dimensions are missing, do properly computations for other.
        v = measurement-p_meas_model_callable.f_h(k, m_pred, H)
        S = H.dot(P_pred).dot(H.T) + R
        if measurement.shape[0]==1: # measurements are one dimensional
            if (S < 0):
                raise ValueError("Kalman Filter Update: S is negative step %i" % k )
                 #import pdb; pdb.set_trace()

            K = P_pred.dot(H.T) / S
            if calc_log_likelihood:
                log_likelihood_update = -0.5 * ( np.log(2*np.pi) + np.log(S) +
                                    v*v / S)
                #log_likelihood_update = log_likelihood_update[0,0] # to make int
                if np.any(np.isnan(log_likelihood_update)): # some member in P_pred is None.
                    raise ValueError("Nan values in likelihood update!")
            LL = None; islower = None
        else:
            LL,islower = linalg.cho_factor(S)
            K = linalg.cho_solve((LL,islower), H.dot(P_pred.T)).T

            if calc_log_likelihood:
                log_likelihood_update = -0.5 * ( v.shape[0]*np.log(2*np.pi) +
                    2*np.sum( np.log(np.diag(LL)) ) +\
                        np.sum((linalg.cho_solve((LL,islower),v)) * v, axis = 0) ) # diagonal of v.T*S^{-1}*v

        if calc_grad_log_likelihood:
            dm_pred_all_params = p_dm # derivativas of the prediction phase
            dP_pred_all_params = p_dP

            param_number = p_dP.shape[2]

            dH_all_params = p_meas_model_callable.dHk(k)
            dR_all_params = p_meas_model_callable.dRk(k)

            dm_upd = np.empty(dm_pred_all_params.shape)
            dP_upd = np.empty(dP_pred_all_params.shape)

             # firts dimension parameter_no, second - time series number
            d_log_likelihood_update = np.empty((param_number,time_series_no))
            for param in range(param_number):

               dH = dH_all_params[:,:,param]
               dR = dR_all_params[:,:,param]

               dm_pred = dm_pred_all_params[:,:,param]
               dP_pred = dP_pred_all_params[:,:,param]

                # Terms in the likelihood derivatives
               dv = - np.dot( dH, m_pred) -  np.dot( H, dm_pred)
               dS = np.dot(dH, np.dot( P_pred, H.T))
               dS += dS.T
               dS += np.dot(H, np.dot( dP_pred, H.T)) + dR

               # TODO: maybe symmetrize dS

               #dm and dP for the next stem
               if LL is not None: # the state vector is not a scalar
                   tmp1 = linalg.cho_solve((LL,islower), H).T
                   tmp2 = linalg.cho_solve((LL,islower), dH).T
                   tmp3 = linalg.cho_solve((LL,islower), dS).T
               else: # the state vector is a scalar
                   tmp1 = H.T / S
                   tmp2 = dH.T / S
                   tmp3 = dS.T / S

               dK = np.dot( dP_pred, tmp1) + np.dot( P_pred, tmp2) - \
                    np.dot( P_pred, np.dot( tmp1, tmp3 ) )

                # terms required for the next step, save this for each parameter
               dm_upd[:,:,param] = dm_pred + np.dot(dK, v) + np.dot(K, dv)

               dP_upd[:,:,param] = -np.dot(dK, np.dot(S, K.T))
               dP_upd[:,:,param] += dP_upd[:,:,param].T
               dP_upd[:,:,param] += dP_pred - np.dot(K , np.dot( dS, K.T))

               dP_upd[:,:,param] = 0.5*(dP_upd[:,:,param] + dP_upd[:,:,param].T) #symmetrize
                # computing the likelihood change for each parameter:
               if LL is not None: # the state vector is not 1D
                    #tmp4 = linalg.cho_solve((LL,islower), dv)
                   tmp5 = linalg.cho_solve((LL,islower), v)
               else: # the state vector is a scalar
                   #tmp4 = dv / S
                   tmp5 = v / S


               d_log_likelihood_update[param,:] = -(0.5*np.sum(np.diag(tmp3)) + \
                    np.sum(tmp5*dv, axis=0) - 0.5 * np.sum(tmp5 * np.dot(dS, tmp5), axis=0) )
                # Before
                #d_log_likelihood_update[param,0] = -(0.5*np.sum(np.diag(tmp3)) + \
                #np.dot(tmp5.T, dv) - 0.5 * np.dot(tmp5.T ,np.dot(dS, tmp5)) )


        # Compute the actual updates for mean and variance of the states.
        m_upd = m_pred + K.dot( v )

        # Covariance update and ensure it is symmetric
        P_upd = K.dot(S).dot(K.T)
        P_upd = 0.5*(P_upd + P_upd.T)
        P_upd =  P_pred - P_upd# this update matrix is symmetric

        return m_upd, P_upd, log_likelihood_update, dm_upd, dP_upd, d_log_likelihood_update

    @staticmethod
    def _kalman_update_step_SVD(k, p_m , p_P, p_meas_model_callable, measurement, calc_log_likelihood= False,
                            calc_grad_log_likelihood=False, p_dm = None, p_dP = None):
        """
        Input:

        k: int
              Iteration No. Starts at 0. Total number of iterations equal to the
              number of measurements.

        m_P: matrix of size (state_dim, time_series_no)
             Mean value from the previous step. For "multiple time series mode"
                it is matrix, second dimension of which correspond to different
                time series.

        p_P: tuple (P_pred, S, V)
             Covariance matrix from the prediction step and its SVD decomposition.
             P_pred = V * S * V.T The tuple is (P_pred, S, V)

        p_h: function (k, x_{k}, H_{k}). Measurement function.
            k (iteration number), starts at 0
            x_{k} state
            H_{k} Jacobian matrices of f_h. In the linear case it is exactly H_{k}.

        p_f_H: function (k, m, P) return Jacobian of measurement function, it is
            passed into p_h.
            k (iteration number), starts at 0
            m: point where Jacobian is evaluated
            P: parameter for Jacobian, usually covariance matrix.

        p_f_R: function (k). Returns noise matrix of measurement equation
            on iteration k.
            k (iteration number). starts at 0

        p_f_iRsr: function (k). Returns the square root of the noise matrix of
            measurement equation on iteration k.
            k (iteration number). starts at 0

        measurement: (measurement_dim, time_series_no) matrix
            One measurement used on the current update step. For
            "multiple time series mode" it is matrix, second dimension of
            which correspond to different time series.

        calc_log_likelihood: boolean
            Whether to calculate marginal likelihood of the state-space model.

        calc_grad_log_likelihood: boolean
                Whether to calculate gradient of the marginal likelihood
                of the state-space model. If true then the next parameter must
                provide the extra parameters for gradient calculation.

        p_dm: 3D array (state_dim, time_series_no, parameters_no)
                Mean derivatives from the prediction step. For "multiple time series mode"
                it is 3D array, second dimension of which correspond to different
                time series.

        p_dP: array
            Covariance derivatives from the prediction step.

        grad_calc_params_2: List or None
            List with derivatives. The first component is 'f_dH' - function(k)
            which returns the derivative of H. The second element is 'f_dR'
             - function(k). Function which returns the derivative of R.

        Output:
        ----------------------------
        m_upd, P_upd, dm_upd, dP_upd: metrices, 3D objects
            Results of the prediction steps.

        log_likelihood_update: double or 1D array
            Update to the log_likelihood from this step

        d_log_likelihood_update: (grad_params_no, time_series_no) matrix
            Update to the gradient of log_likelihood, "multiple time series mode"
            adds extra columns to the gradient.

        """

        #import pdb; pdb.set_trace()

        m_pred = p_m # from prediction step
        P_pred,S_pred,V_pred = p_P # from prediction step

        H = p_meas_model_callable.Hk(k, m_pred, P_pred)
        R = p_meas_model_callable.Rk(k)
        R_isr = p_meas_model_callable.R_isrk(k) # square root of the inverse of R matrix

        time_series_no = p_m.shape[1] # number of time serieses

        log_likelihood_update=None; dm_upd=None; dP_upd=None; d_log_likelihood_update=None
        # Update step (only if there is data)
        #if not np.any(np.isnan(measurement)): # TODO: if some dimensions are missing, do properly computations for other.
        v = measurement-p_meas_model_callable.f_h(k, m_pred, H)

        svd_2_matr = np.vstack( ( np.dot( R_isr.T, np.dot(H, V_pred)) , np.diag( 1.0/np.sqrt(S_pred) ) ) )

        (U,S,Vh) = sp.linalg.svd( svd_2_matr,full_matrices=False, compute_uv=True,
                     overwrite_a=False,check_finite=True)

         # P_upd = U_upd S_upd**2 U_upd.T
        U_upd = np.dot(V_pred, Vh.T)
        S_upd = (1.0/S)**2

        P_upd = np.dot(U_upd * S_upd, U_upd.T) # update covariance
        P_upd = (P_upd,S_upd,U_upd) # tuple to pass to the next step

         # stil need to compute S and K for derivative computation
        S = H.dot(P_pred).dot(H.T) + R
        if measurement.shape[0]==1: # measurements are one dimensional
            if (S < 0):
                raise ValueError("Kalman Filter Update: S is negative step %i" % k )
                 #import pdb; pdb.set_trace()

            K = P_pred.dot(H.T) / S
            if calc_log_likelihood:
                log_likelihood_update = -0.5 * ( np.log(2*np.pi) + np.log(S) +
                                    v*v / S)
                #log_likelihood_update = log_likelihood_update[0,0] # to make int
                if np.any(np.isnan(log_likelihood_update)): # some member in P_pred is None.
                    raise ValueError("Nan values in likelihood update!")
            LL = None; islower = None
        else:
            LL,islower = linalg.cho_factor(S)
            K = linalg.cho_solve((LL,islower), H.dot(P_pred.T)).T

            if calc_log_likelihood:
                log_likelihood_update = -0.5 * ( v.shape[0]*np.log(2*np.pi) +
                    2*np.sum( np.log(np.diag(LL)) ) +\
                        np.sum((linalg.cho_solve((LL,islower),v)) * v, axis = 0) ) # diagonal of v.T*S^{-1}*v


        # Old  method of computing updated covariance (for testing) ->
        #P_upd_tst = K.dot(S).dot(K.T)
        #P_upd_tst = 0.5*(P_upd_tst + P_upd_tst.T)
        #P_upd_tst =  P_pred - P_upd_tst# this update matrix is symmetric
        # Old  method of computing updated covariance (for testing) <-

        if calc_grad_log_likelihood:
            dm_pred_all_params = p_dm # derivativas of the prediction phase
            dP_pred_all_params = p_dP

            param_number = p_dP.shape[2]

            dH_all_params = p_meas_model_callable.dHk(k)
            dR_all_params = p_meas_model_callable.dRk(k)

            dm_upd = np.empty(dm_pred_all_params.shape)
            dP_upd = np.empty(dP_pred_all_params.shape)

             # firts dimension parameter_no, second - time series number
            d_log_likelihood_update = np.empty((param_number,time_series_no))
            for param in range(param_number):

               dH = dH_all_params[:,:,param]
               dR = dR_all_params[:,:,param]

               dm_pred = dm_pred_all_params[:,:,param]
               dP_pred = dP_pred_all_params[:,:,param]

                # Terms in the likelihood derivatives
               dv = - np.dot( dH, m_pred) -  np.dot( H, dm_pred)
               dS = np.dot(dH, np.dot( P_pred, H.T))
               dS += dS.T
               dS += np.dot(H, np.dot( dP_pred, H.T)) + dR

                # TODO: maybe symmetrize dS

                #dm and dP for the next stem
               if LL is not None: # the state vector is not a scalar
                   tmp1 = linalg.cho_solve((LL,islower), H).T
                   tmp2 = linalg.cho_solve((LL,islower), dH).T
                   tmp3 = linalg.cho_solve((LL,islower), dS).T
               else: # the state vector is a scalar
                   tmp1 = H.T / S
                   tmp2 = dH.T / S
                   tmp3 = dS.T / S

               dK = np.dot( dP_pred, tmp1) + np.dot( P_pred, tmp2) - \
                    np.dot( P_pred, np.dot( tmp1, tmp3 ) )

               # terms required for the next step, save this for each parameter
               dm_upd[:,:,param] = dm_pred + np.dot(dK, v) + np.dot(K, dv)

               dP_upd[:,:,param] = -np.dot(dK, np.dot(S, K.T))
               dP_upd[:,:,param] += dP_upd[:,:,param].T
               dP_upd[:,:,param] += dP_pred - np.dot(K , np.dot( dS, K.T))

               dP_upd[:,:,param] = 0.5*(dP_upd[:,:,param] + dP_upd[:,:,param].T) #symmetrize
               # computing the likelihood change for each parameter:
               if LL is not None: # the state vector is not 1D
                   tmp5 = linalg.cho_solve((LL,islower), v)
               else: # the state vector is a scalar
                   tmp5 = v / S


               d_log_likelihood_update[param,:] = -(0.5*np.sum(np.diag(tmp3)) + \
                   np.sum(tmp5*dv, axis=0) - 0.5 * np.sum(tmp5 * np.dot(dS, tmp5), axis=0) )
                # Before
                #d_log_likelihood_update[param,0] = -(0.5*np.sum(np.diag(tmp3)) + \
                #np.dot(tmp5.T, dv) - 0.5 * np.dot(tmp5.T ,np.dot(dS, tmp5)) )

        # Compute the actual updates for mean of the states. Variance update
        # is computed earlier.
        m_upd = m_pred + K.dot( v )

        return m_upd, P_upd, log_likelihood_update, dm_upd, dP_upd, d_log_likelihood_update

    @staticmethod
    def _rts_smoother_update_step(k, p_m , p_P, p_m_pred, p_P_pred, p_m_prev_step,
                                  p_P_prev_step, p_dynamic_callables):
        """
        Rauch–Tung–Striebel(RTS) update step

        Input:
        -----------------------------
        k: int
              Iteration No. Starts at 0. Total number of iterations equal to the
              number of measurements.

        p_m: matrix of size (state_dim, time_series_no)
             Filter mean on step k

        p_P:  matrix of size (state_dim,state_dim)
             Filter Covariance on step k

        p_m_pred: matrix of size (state_dim, time_series_no)
             Means from the smoother prediction step.

        p_P_pred:
             Covariance from the smoother prediction step.

        p_m_prev_step
            Smoother mean from the previous step.

        p_P_prev_step:
            Smoother covariance from the previous step.

        p_f_A: function (k, m, P) return Jacobian of dynamic function, it is
            passed into p_a.
            k (iteration number), starts at 0
            m: point where Jacobian is evaluated
            P: parameter for Jacobian, usually covariance matrix.

        """

        A = p_dynamic_callables.Ak(k,p_m,p_P) # state transition matrix (or Jacobian)

        tmp = np.dot( A, p_P.T)
        if A.shape[0] == 1: # 1D states
            G = tmp.T / p_P_pred # P[:,:,k] is symmetric
        else:
            try:
                LL,islower = linalg.cho_factor(p_P_pred)
                G = linalg.cho_solve((LL,islower),tmp).T
            except:
                # It happende that p_P_pred has several near zero eigenvalues
                # hence the Cholesky method does not work.
                res = sp.linalg.lstsq(p_P_pred, tmp)
                G = res[0].T

        m_upd = p_m + G.dot( p_m_prev_step-p_m_pred )
        P_upd = p_P + G.dot( p_P_prev_step-p_P_pred).dot(G.T)

        P_upd = 0.5*(P_upd + P_upd.T)

        return m_upd, P_upd, G

    @classmethod
    def rts_smoother(cls,state_dim, p_dynamic_callables, filter_means,
                          filter_covars):
        """
        This function implements Rauch–Tung–Striebel(RTS) smoother algorithm
        based on the results of kalman_filter_raw.
        These notations are the same:
            x_{k} =  A_{k} * x_{k-1} + q_{k-1};       q_{k-1} ~ N(0, Q_{k-1})
            y_{k} = H_{k} * x_{k} + r_{k};            r_{k-1} ~ N(0, R_{k})

        Returns estimated smoother distributions x_{k} ~ N(m_{k}, P(k))

        Input:
        --------------

        p_a: function (k, x_{k-1}, A_{k}). Dynamic function.
                k (iteration number), starts at 0
                x_{k-1} State from the previous step
                A_{k} Jacobian matrices of f_a. In the linear case it is exactly A_{k}.

        p_f_A: function (k, m, P) return Jacobian of dynamic function, it is
            passed into p_a.
            k (iteration number), starts at 0
            m: point where Jacobian is evaluated
            P: parameter for Jacobian, usually covariance matrix.

        p_f_Q: function (k). Returns noise matrix of dynamic model on iteration k.
            k (iteration number). starts at 0

        filter_means: (no_steps+1,state_dim) matrix or (no_steps+1,state_dim, time_series_no) 3D array
            Results of the Kalman Filter means estimation.

        filter_covars: (no_steps+1, state_dim, state_dim) 3D array
            Results of the Kalman Filter covariance estimation.

        Output:
        -------------

        M: (no_steps+1, state_dim) matrix
            Smoothed estimates of the state means

        P: (no_steps+1, state_dim, state_dim) 3D array
            Smoothed estimates of the state covariances
        """

        no_steps = filter_covars.shape[0]-1# number of steps (minus initial covariance)

        M = np.empty(filter_means.shape) # smoothed means
        P = np.empty(filter_covars.shape) # smoothed covars
        #G = np.empty( (no_steps,state_dim,state_dim)  ) # G from the update step of the smoother

        M[-1,:] = filter_means[-1,:]
        P[-1,:,:] = filter_covars[-1,:,:]
        for k in range(no_steps-1,-1,-1):

            m_pred, P_pred, tmp1, tmp2 = \
                    cls._kalman_prediction_step(k, filter_means[k,:],
                                                filter_covars[k,:,:], p_dynamic_callables,
                                                calc_grad_log_likelihood=False)
            p_m = filter_means[k,:]
            if len(p_m.shape)<2:
                p_m.shape = (p_m.shape[0],1)

            p_m_prev_step = M[k+1,:]
            if len(p_m_prev_step.shape)<2:
                p_m_prev_step.shape = (p_m_prev_step.shape[0],1)

            m_upd, P_upd, G_tmp = cls._rts_smoother_update_step(k,
                            p_m ,filter_covars[k,:,:],
                            m_pred, P_pred, p_m_prev_step ,P[k+1,:,:], p_dynamic_callables)

            M[k,:] = m_upd#np.squeeze(m_upd)
            P[k,:,:] = P_upd
            #G[k,:,:] = G_upd.T # store transposed G.
        # Return values

        return (M, P) #, G)

    @staticmethod
    def _EM_gradient(A,Q,H,R,m_init,P_init,measurements, M, P, G, dA, dQ, dH, dR, dm_init, dP_init):
        """
        Gradient computation with the EM algorithm.

        Input:
        -----------------

        M: Means from the smoother
        P: Variances from the smoother
        G: Gains? from the smoother
        """
        import pdb; pdb.set_trace();

        param_number = dA.shape[-1]
        d_log_likelihood_update = np.empty((param_number,1))

        sample_no = measurements.shape[0]
        P_1 = P[1:,:,:] # remove 0-th step
        P_2 = P[0:-1,:,:] # remove 0-th step

        M_1 = M[1:,:] # remove 0-th step
        M_2 = M[0:-1,:] # remove the last step

        Sigma = np.mean(P_1,axis=0) + np.dot(M_1.T, M_1) / sample_no #
        Phi =   np.mean(P_2,axis=0) + np.dot(M_2.T, M_2) / sample_no #

        B = np.dot( measurements.T, M_1 )/ sample_no
        C =   (sp.einsum( 'ijk,ikl', P_1, G) + np.dot(M_1.T, M_2)) / sample_no #

#        C1 = np.zeros( (P_1.shape[1],P_1.shape[1]) )
#        for k in range(P_1.shape[0]):
#            C1 += np.dot(P_1[k,:,:],G[k,:,:]) + sp.outer( M_1[k,:], M_2[k,:] )
#        C1 = C1 / sample_no

        D = np.dot( measurements.T, measurements ) / sample_no

        try:
            P_init_inv = sp.linalg.inv(P_init)

            if np.max( np.abs(P_init_inv)) > 10e13:
                compute_P_init_terms = False
            else:
                compute_P_init_terms = True
        except np.linalg.LinAlgError:
            compute_P_init_terms = False

        try:
            Q_inv = sp.linalg.inv(Q)

            if np.max( np.abs(Q_inv)) > 10e13:
                compute_Q_terms = False
            else:
                compute_Q_terms = True
        except np.linalg.LinAlgError:
            compute_Q_terms = False

        try:
            R_inv = sp.linalg.inv(R)

            if np.max( np.abs(R_inv)) > 10e13:
                compute_R_terms = False
            else:
                compute_R_terms = True
        except np.linalg.LinAlgError:
            compute_R_terms = False


        d_log_likelihood_update = np.zeros((param_number,1))
        for j in range(param_number):
            if compute_P_init_terms:
                d_log_likelihood_update[j,:] -= 0.5 * np.sum(P_init_inv* dP_init[:,:,j].T ) #p #m

                M0_smoothed = M[0]; M0_smoothed.shape = (M0_smoothed.shape[0],1)
                tmp1 = np.dot( dP_init[:,:,j], np.dot( P_init_inv, (P[0,:,:] + sp.outer( (M0_smoothed - m_init), (M0_smoothed - m_init) )) )  ) #p #m
                d_log_likelihood_update[j,:] += 0.5 * np.sum(P_init_inv* tmp1.T )

                tmp2 = sp.outer( dm_init[:,j], M0_smoothed )
                tmp2 += tmp2.T
                d_log_likelihood_update[j,:] += 0.5 * np.sum(P_init_inv* tmp2.T )

            if compute_Q_terms:

                d_log_likelihood_update[j,:] -=  sample_no/2.0 * np.sum(Q_inv* dQ[:,:,j].T ) #m

                tmp1 = np.dot(C,A.T); tmp1 += tmp1.T; tmp1 = Sigma - tmp1 + np.dot(A, np.dot(Phi,A.T)) #m
                tmp1 = np.dot( dQ[:,:,j], np.dot( Q_inv, tmp1) )
                d_log_likelihood_update[j,:] += sample_no/2.0 * np.sum(Q_inv * tmp1.T)

                tmp2 = np.dot( dA[:,:,j], C.T); tmp2 += tmp2.T;
                tmp3 = np.dot(dA[:,:,j], np.dot(Phi,A.T)); tmp3 += tmp3.T
                d_log_likelihood_update[j,:] -= sample_no/2.0 * np.sum(Q_inv.T * (tmp3 - tmp2) )

            if compute_R_terms:
                d_log_likelihood_update[j,:] -=  sample_no/2.0 * np.sum(R_inv* dR[:,:,j].T )

                tmp1 = np.dot(B,H.T); tmp1 += tmp1.T; tmp1 = D - tmp1 + np.dot(H, np.dot(Sigma,H.T))
                tmp1 = np.dot( dR[:,:,j], np.dot( R_inv, tmp1) )
                d_log_likelihood_update[j,:] += sample_no/2.0 * np.sum(R_inv * tmp1.T)

                tmp2 = np.dot( dH[:,:,j], B.T); tmp2 += tmp2.T;
                tmp3 = np.dot(dH[:,:,j], np.dot(Sigma,H.T)); tmp3 += tmp3.T
                d_log_likelihood_update[j,:] -= sample_no/2.0 * np.sum(R_inv.T * (tmp3 - tmp2) )

        return d_log_likelihood_update

    @staticmethod
    def _check_SS_matrix(p_M, state_dim, measurement_dim, which='A'):
        """
        Veryfy that on exit the matrix has appropriate shape for the KF algorithm.

        Input:
            p_M: matrix
                As it is given for the user
            state_dim: int
                State dimensioanlity
            measurement_dim: int
                Measurement dimensionality
            which: string
                One of: 'A', 'Q', 'H', 'R'
        Output:
        ---------------
            p_M: matrix of the right shape

            old_M_shape: tuple
                Old Shape
        """

        old_M_shape = None
        if len(p_M.shape) < 3: # new shape is 3 dimensional
            old_M_shape = p_M.shape # save shape to restore it on exit
            if len(p_M.shape) == 2: # matrix
                p_M.shape = (p_M.shape[0],p_M.shape[1],1)
            elif len(p_M.shape) == 1: # scalar but in array already
                if (p_M.shape[0] != 1):
                    raise ValueError("Matrix %s is an 1D array, while it must be a matrix or scalar", which)
                else:
                    p_M.shape = (1,1,1)

        if (which == 'A') or (which == 'Q'):
            if (p_M.shape[0] != state_dim) or (p_M.shape[1] != state_dim):
                raise ValueError("%s must be a square matrix of size (%i,%i)" % (which, state_dim, state_dim))
        if (which == 'H'):
            if (p_M.shape[0] != measurement_dim) or (p_M.shape[1] != state_dim):
                raise ValueError("H must be of shape (measurement_dim, state_dim) (%i,%i)" % (measurement_dim, state_dim))
        if (which == 'R'):
            if (p_M.shape[0] != measurement_dim) or (p_M.shape[1] != measurement_dim):
                raise ValueError("R must be of shape (measurement_dim, measurement_dim) (%i,%i)" % (measurement_dim, measurement_dim))

        return (p_M,old_M_shape)

    @staticmethod
    def _check_grad_state_matrices(dM, state_dim, grad_params_no, which = 'dA'):
        """
        Function checks (mostly check dimensions) matrices for marginal likelihood
        gradient parameters calculation. It check dA, dQ matrices.

        Input:
        -------------
            dM: None, scaler or 3D matrix
                It is supposed to be (state_dim,state_dim,grad_params_no) matrix.
                If None then zero matrix is assumed. If scalar then the function
                checks consistency with "state_dim" and "grad_params_no".

            state_dim: int
                State dimensionality

            grad_params_no: int
                How many parrameters of likelihood gradient in total.

            which: string
                'dA' or 'dQ'


        Output:
        --------------
            function of (k) which returns the parameters matrix.

        """


        if dM is None:
            dM=np.zeros((state_dim,state_dim,grad_params_no))
        elif isinstance(dM, np.ndarray):
            if state_dim == 1:
                if len(dM.shape) < 3:
                    dM.shape = (1,1,1)
            else:
                if len(dM.shape) < 3:
                    dM.shape = (state_dim,state_dim,1)
        elif isinstance(dM, np.int):
            if state_dim > 1:
                raise ValueError("When computing likelihood gradient wrong %s dimension." % which)
            else:
                dM = np.ones((1,1,1)) * dM

#        if not isinstance(dM, types.FunctionType):
#            f_dM = lambda k: dM
#        else:
#            f_dM = dM

        return dM


    @staticmethod
    def _check_grad_measurement_matrices(dM, state_dim, grad_params_no, measurement_dim, which = 'dH'):
        """
        Function checks (mostly check dimensions) matrices for marginal likelihood
        gradient parameters calculation. It check dH, dR matrices.

        Input:
        -------------
            dM: None, scaler or 3D matrix
                It is supposed to be
                (measurement_dim ,state_dim,grad_params_no) for "dH" matrix.
                (measurement_dim,measurement_dim,grad_params_no) for "dR"

                If None then zero matrix is assumed. If scalar then the function
                checks consistency with "state_dim" and "grad_params_no".

            state_dim: int
                State dimensionality

            grad_params_no: int
                How many parrameters of likelihood gradient in total.

            measurement_dim: int
                Dimensionality of measurements.

            which: string
                'dH' or 'dR'


        Output:
        --------------
            function of (k) which returns the parameters matrix.
        """

        if dM is None:
            if which == 'dH':
                dM=np.zeros((measurement_dim ,state_dim,grad_params_no))
            elif  which == 'dR':
                dM=np.zeros((measurement_dim,measurement_dim,grad_params_no))
        elif isinstance(dM, np.ndarray):
            if state_dim == 1:
                if len(dM.shape) < 3:
                    dM.shape = (1,1,1)
            else:
                if len(dM.shape) < 3:
                     if which == 'dH':
                        dM.shape = (measurement_dim,state_dim,1)
                     elif  which == 'dR':
                        dM.shape = (measurement_dim,measurement_dim,1)
        elif isinstance(dM, np.int):
            if state_dim > 1:
                raise ValueError("When computing likelihood gradient wrong dH dimension.")
            else:
                dM = np.ones((1,1,1)) * dM

#        if not isinstance(dM, types.FunctionType):
#            f_dM = lambda k: dM
#        else:
#            f_dM = dM

        return dM


class Struct(object):
    pass

class ContDescrStateSpace(DescreteStateSpace):
    """
    Class for continuous-discrete Kalman filter. State equation is
    continuous while measurement equation is discrete.

        d x(t)/ dt = F x(t) + L q;        where q~ N(0, Qc)
        y_{t_k} = H_{k} x_{t_k} + r_{k};        r_{k-1} ~ N(0, R_{k})

    """

    class AQcompute_once(Q_handling_Class):
        """
        Class for calculating matrices A, Q, dA, dQ of the discrete Kalman Filter
        from the matrices F, L, Qc, P_ing, dF, dQc, dP_inf of the continuos state
        equation. dt - time steps.

        It has the same interface as AQcompute_batch.

        It computes matrices for only one time step. This object is used when
        there are many different time steps and storing matrices for each of them
        would take too much memory.
        """

        def __init__(self, F,L,Qc,dt,compute_derivatives=False, grad_params_no=None, P_inf=None, dP_inf=None, dF = None, dQc=None):
            """
            Constructor. All necessary parameters are passed here and stored
            in the opject.

            Input:
            -------------------
                F, L, Qc, P_inf : matrices
                    Parameters of corresponding continuous state model
                dt: array
                    All time steps
                compute_derivatives: bool
                    Whether to calculate derivatives

                dP_inf, dF, dQc: 3D array
                    Derivatives if they are required

            Output:
            -------------------
            Nothing
            """
            # Copies are done because this object is used later in smoother
            # and these parameters must not change.
            self.F = F.copy()
            self.L = L.copy()
            self.Qc = Qc.copy()

            self.dt = dt # copy is not taken because dt is internal parameter

            # Parameters are used to calculate derivatives but derivatives
            # are not used in the smoother. Therefore copies are not taken.
            self.P_inf = P_inf
            self.dP_inf = dP_inf
            self.dF = dF
            self.dQc = dQc

            self.compute_derivatives = compute_derivatives
            self.grad_params_no =  grad_params_no


            self.last_k = 0
            self.last_k_computed = False
            self.v_Ak = None
            self.v_Qk = None
            self.v_dAk = None
            self.v_dQk = None

            self.square_root_computed = False
            # !!!Print statistics! Which object is created

        def f_a(self, k,m,A):
            """
            Dynamic model
            """

            return np.dot(A, m) # default dynamic model

        def _recompute_for_new_k(self,k):
            """
            Computes the necessary matrices for an index k and store the results.

            Input:
            ----------------------
            k: int
                Index in the time differences array dt where to compute matrices

            Output:
            ----------------------
                Ak,Qk, dAk, dQk: matrices and/or 3D arrays
                    A, Q, dA dQ on step k
            """
            if (self.last_k != k) or (self.last_k_computed == False):
                v_Ak,v_Qk, tmp, v_dAk, v_dQk = ContDescrStateSpace.lti_sde_to_descrete(self.F,
                        self.L,self.Qc,self.dt[k],self.compute_derivatives,
                        grad_params_no=self.grad_params_no, P_inf=self.P_inf, dP_inf=self.dP_inf, dF=self.dF, dQc=self.dQc)

                self.last_k = k
                self.last_k_computed = True
                self.v_Ak = v_Ak
                self.v_Qk = v_Qk
                self.v_dAk = v_dAk
                self.v_dQk = v_dQk
                self.Q_square_root_computed = False
            else:
                v_Ak = self.v_Ak
                v_Qk = self.v_Qk
                v_dAk = self.v_dAk
                v_dQk = self.v_dQk

            # !!!Print statistics! Print sizes of matrices

            return v_Ak,v_Qk, v_dAk, v_dQk

        def reset(self, compute_derivatives):
            """
            For reusing this object e.g. in smoother computation. Actually,
            this object can not be reused because it computes the matrices on
            every iteration. But this method is written for keeping the same
            interface with the class AQcompute_batch.
            """

            self.last_k = 0
            self.last_k_computed = False
            self.compute_derivatives = compute_derivatives
            self.Q_square_root_computed = False

            return self

        def Ak(self,k,m,P):
            v_Ak,v_Qk, v_dAk, v_dQk = self._recompute_for_new_k(k)
            return v_Ak

        def Qk(self,k):
            v_Ak,v_Qk, v_dAk, v_dQk = self._recompute_for_new_k(k)
            return v_Qk

        def dAk(self, k):
            v_Ak,v_Qk, v_dAk, v_dQk = self._recompute_for_new_k(k)
            return v_dAk

        def dQk(self, k):
            v_Ak,v_Qk, v_dAk, v_dQk = self._recompute_for_new_k(k)
            return v_dQk

        def Q_srk(self,k):
            """
            Square root of the noise matrix Q
            """

            if ((self.last_k == k) and (self.last_k_computed == True)):
                if not self.Q_square_root_computed:
                    (U, S, Vh) = sp.linalg.svd( self.v_Qk, full_matrices=False, compute_uv=True, overwrite_a=False, check_finite=False)
                    square_root = U * np.sqrt(S)
                    self.square_root_computed = True
                    self.Q_square_root = square_root
                else:
                    square_root = self.Q_square_root
            else:
                raise ValueError("Square root of Q can not be computed")

            return square_root

        def return_last(self):
            """
            Function returns last computed matrices.
            """
            if not self.last_k_computed:
                raise ValueError("Matrices are not computed.")
            else:
                k = self.last_k
                A = self.v_Ak
                Q = self.v_Qk
                dA = self.v_dAk
                dQ = self.v_dQk

            return k, A, Q, dA, dQ

    class AQcompute_batch_Python(Q_handling_Class):
        """
        Class for calculating matrices A, Q, dA, dQ of the discrete Kalman Filter
        from the matrices F, L, Qc, P_ing, dF, dQc, dP_inf of the continuos state
        equation. dt - time steps.

        It has the same interface as AQcompute_once.

        It computes matrices for all time steps. This object is used when
        there are not so many (controlled by internal variable)
        different time steps and storing all the matrices do not take too much memory.

        Since all the matrices are computed all together, this object can be used
        in smoother without repeating the computations.
        """
        def __init__(self, F,L,Qc,dt,compute_derivatives=False, grad_params_no=None, P_inf=None, dP_inf=None, dF = None, dQc=None):
            """
            Constructor. All necessary parameters are passed here and stored
            in the opject.

            Input:
            -------------------
                F, L, Qc, P_inf : matrices
                    Parameters of corresponding continuous state model
                dt: array
                    All time steps
                compute_derivatives: bool
                    Whether to calculate derivatives

                dP_inf, dF, dQc: 3D array
                    Derivatives if they are required

            Output:
            -------------------
            Nothing
            """
            As, Qs, reconstruct_indices, dAs, dQs = ContDescrStateSpace.lti_sde_to_descrete(F,
                        L,Qc,dt,compute_derivatives,
                        grad_params_no=grad_params_no, P_inf=P_inf, dP_inf=dP_inf, dF=dF, dQc=dQc)

            self.As = As
            self.Qs = Qs
            self.dAs = dAs
            self.dQs = dQs
            self.reconstruct_indices = reconstruct_indices
            self.total_size_of_data = self.As.nbytes + self.Qs.nbytes +\
                            (self.dAs.nbytes if (self.dAs is not None) else 0) +\
                            (self.dQs.nbytes if (self.dQs is not None) else 0) +\
                            (self.reconstruct_indices.nbytes if (self.reconstruct_indices is not None) else 0)

            self.Q_svd_dict = {}
            self.last_k = None
             # !!!Print statistics! Which object is created
            # !!!Print statistics! Print sizes of matrices

        def f_a(self, k,m,A):
            """
            Dynamic model
            """
            return np.dot(A, m) # default dynamic model

        def reset(self, compute_derivatives=False):
            """
            For reusing this object e.g. in smoother computation. It makes sence
            because necessary matrices have been already computed for all
            time steps.
            """
            return self

        def Ak(self,k,m,P):
            self.last_k = k
            return self.As[:,:, self.reconstruct_indices[k]]

        def Qk(self,k):
            self.last_k = k
            return self.Qs[:,:, self.reconstruct_indices[k]]

        def dAk(self,k):
            self.last_k = k
            return self.dAs[:,:, :, self.reconstruct_indices[k]]

        def dQk(self,k):
            self.last_k = k
            return self.dQs[:,:, :, self.reconstruct_indices[k]]


        def Q_srk(self,k):
            """
            Square root of the noise matrix Q
            """
            matrix_index = self.reconstruct_indices[k]
            if matrix_index in self.Q_svd_dict:
                square_root = self.Q_svd_dict[matrix_index]
            else:
                (U, S, Vh) = sp.linalg.svd( self.Qs[:,:, matrix_index],
                                        full_matrices=False, compute_uv=True,
                                        overwrite_a=False, check_finite=False)
                square_root = U * np.sqrt(S)
                self.Q_svd_dict[matrix_index] = square_root

            return square_root

        def return_last(self):
            """
            Function returns last available matrices.
            """

            if (self.last_k is None):
                raise ValueError("Matrices are not computed.")
            else:
                ind = self.reconstruct_indices[self.last_k]
                A = self.As[:,:, ind]
                Q = self.Qs[:,:, ind]
                dA = self.dAs[:,:, :, ind]
                dQ = self.dQs[:,:, :, ind]

            return self.last_k, A, Q, dA, dQ

    @classmethod
    def cont_discr_kalman_filter(cls, F, L, Qc, p_H, p_R, P_inf, X, Y, index = None,
                                 m_init=None, P_init=None,
                                 p_kalman_filter_type='regular',
                                 calc_log_likelihood=False,
                                 calc_grad_log_likelihood=False,
                                 grad_params_no=0, grad_calc_params=None):
        """
        This function implements the continuous-discrete Kalman Filter algorithm
        These notations for the State-Space model are assumed:
            d/dt x(t) =  F * x(t) + L * w(t);         w(t) ~ N(0, Qc)
            y_{k} = H_{k} * x_{k} + r_{k};            r_{k-1} ~ N(0, R_{k})

        Returns estimated filter distributions x_{k} ~ N(m_{k}, P(k))

        Current Features:
        ----------------------------------------
        1) The function generaly do not modify the passed parameters. If
        it happens then it is an error. There are several exeprions: scalars
        can be modified into a matrix, in some rare cases shapes of
        the derivatives matrices may be changed, it is ignored for now.

        2) Copies of F,L,Qc are created in memory because they may be used later
        in smoother. References to copies are kept in "AQcomp" object
        return parameter.

        3) Function support "multiple time series mode" which means that exactly
        the same State-Space model is used to filter several sets of measurements.
        In this case third dimension of Y should include these state-space measurements
        Log_likelihood and Grad_log_likelihood have the corresponding dimensions then.

        4) Calculation of Grad_log_likelihood is not supported if matrices
        H, or R changes overf time (with index k). (later may be changed)

        5) Measurement may include missing values. In this case update step is
        not done for this measurement. (later may be changed)

        Input:
        -----------------

        F: (state_dim, state_dim) matrix
            F in the model.

        L: (state_dim, noise_dim) matrix
            L in the model.

        Qc: (noise_dim, noise_dim) matrix
            Q_c in the model.

        p_H: scalar, matrix (measurement_dim, state_dim) , 3D array
            H_{k} in the model. If matrix then H_{k} = H - constant.
            If it is 3D array then H_{k} = p_Q[:,:, index[2,k]]

        p_R: scalar, square symmetric matrix, 3D array
            R_{k} in the model. If matrix then R_{k} = R - constant.
            If it is 3D array then R_{k} = p_R[:,:, index[3,k]]

        P_inf: (state_dim, state_dim) matrix
            State varince matrix on infinity.

        X: 1D array
            Time points of measurements. Needed for converting continuos
            problem to the discrete one.

        Y: matrix or vector or 3D array
            Data. If Y is matrix then samples are along 0-th dimension and
            features along the 1-st. If 3D array then third dimension
            correspond to "multiple time series mode".

        index: vector
            Which indices (on 3-rd dimension) from arrays p_H, p_R to use
            on every time step. If this parameter is None then it is assumed
            that p_H, p_R do not change over time and indices are not needed.
            index[0,:] - correspond to H, index[1,:] - correspond to R
            If index.shape[0] == 1, it is assumed that indides for all matrices
            are the same.

        m_init: vector or matrix
            Initial distribution mean. If None it is assumed to be zero.
            For "multiple time series mode" it is matrix, second dimension of
            which correspond to different time series. In regular case ("one
            time series mode") it is a vector.

        P_init: square symmetric matrix or scalar
            Initial covariance of the states. If the parameter is scalar
            then it is assumed that initial covariance matrix is unit matrix
            multiplied by this scalar. If None the unit matrix is used instead.
            "multiple time series mode" does not affect it, since it does not
            affect anything related to state variaces.

        p_kalman_filter_type: string, one of ('regular', 'svd')
            Which Kalman Filter is used. Regular or SVD. SVD is more numerically
            stable, in particular, Covariace matrices are guarantied to be
            positive semi-definite. However, 'svd' works slower, especially for
            small data due to SVD call overhead.

        calc_log_likelihood: boolean
            Whether to calculate marginal likelihood of the state-space model.

        calc_grad_log_likelihood: boolean
            Whether to calculate gradient of the marginal likelihood
            of the state-space model. If true then "grad_calc_params" parameter must
            provide the extra parameters for gradient calculation.

        grad_params_no: int
            If previous parameter is true, then this parameters gives the
            total number of parameters in the gradient.

        grad_calc_params: dictionary
            Dictionary with derivatives of model matrices with respect
            to parameters "dF", "dL", "dQc", "dH", "dR", "dm_init", "dP_init".
            They can be None, in this case zero matrices (no dependence on parameters)
            is assumed. If there is only one parameter then third dimension is
            automatically added.

        Output:
        --------------

        M: (no_steps+1,state_dim) matrix or (no_steps+1,state_dim, time_series_no) 3D array
            Filter estimates of the state means. In the extra step the initial
            value is included. In the "multiple time series mode" third dimension
            correspond to different timeseries.

        P: (no_steps+1, state_dim, state_dim) 3D array
            Filter estimates of the state covariances. In the extra step the initial
            value is included.

        log_likelihood: double or (1, time_series_no) 3D array.

            If the parameter calc_log_likelihood was set to true, return
            logarithm of marginal likelihood of the state-space model. If
            the parameter was false, return None. In the "multiple time series mode" it is a vector
            providing log_likelihood for each time series.

        grad_log_likelihood: column vector or (grad_params_no, time_series_no) matrix
            If calc_grad_log_likelihood is true, return gradient of log likelihood
            with respect to parameters. It returns it column wise, so in
            "multiple time series mode" gradients for each time series is in the
            corresponding column.

        AQcomp: object
            Contains some pre-computed values for converting continuos model into
            discrete one. It can be used later in the smoothing pahse.
        """

        p_H = np.atleast_1d(p_H)
        p_R = np.atleast_1d(p_R)

        X.shape, old_X_shape  = cls._reshape_input_data(X.shape, 2) # represent as column
        if (X.shape[1] != 1):
            raise ValueError("Only one dimensional X data is supported.")

        Y.shape, old_Y_shape  = cls._reshape_input_data(Y.shape) # represent as column

        state_dim = F.shape[0]
        measurement_dim = Y.shape[1]
        time_series_no = Y.shape[2] # multiple time series mode

        if  ((len(p_H.shape) == 3) and (len(p_H.shape[2]) != 1)) or\
            ((len(p_R.shape) == 3) and (len(p_R.shape[2]) != 1)):
            model_matrices_chage_with_time = True
        else:
            model_matrices_chage_with_time = False

        # Check index
        old_index_shape = None
        if index is None:
            if (len(p_H.shape) == 3) or (len(p_R.shape) == 3):
                raise ValueError("Parameter index can not be None for time varying matrices (third dimension is present)")
            else: # matrices do not change in time, so form dummy zero indices.
                index = np.zeros((1,Y.shape[0]))
        else:
            if len(index.shape) == 1:
                index.shape = (1,index.shape[0])
                old_index_shape = (index.shape[0],)

            if (index.shape[1] != Y.shape[0]):
                raise ValueError("Number of measurements must be equal the number of H_{k}, R_{k}")

        if (index.shape[0] == 1):
            H_time_var_index = 0; R_time_var_index = 0
        elif (index.shape[0] == 4):
            H_time_var_index = 0; R_time_var_index = 1
        else:
            raise ValueError("First Dimension of index must be either 1 or 2.")

        (p_H, old_H_shape) = cls._check_SS_matrix(p_H, state_dim, measurement_dim, which='H')
        (p_R, old_R_shape) = cls._check_SS_matrix(p_R, state_dim, measurement_dim, which='R')

        if m_init is None:
            m_init = np.zeros((state_dim, time_series_no))
        else:
            m_init = np.atleast_2d(m_init).T

        if P_init is None:
            P_init = P_inf.copy()

        if p_kalman_filter_type not in ('regular', 'svd'):
            raise ValueError("Kalman filer type neither 'regular nor 'svd'.")

        # Functions to pass to the kalman_filter algorithm:
        # Parameters:
        # k - number of Kalman filter iteration
        # m - vector for calculating matrices. Required for EKF. Not used here.
        # f_hl = lambda k,m,H: np.dot(H, m)
        # f_H = lambda k,m,P: p_H[:,:, index[H_time_var_index, k]]
        #f_R = lambda k: p_R[:,:, index[R_time_var_index, k]]
        #o_R = R_handling( p_R, index, R_time_var_index, 20)

        if calc_grad_log_likelihood:

            dF = cls._check_grad_state_matrices(grad_calc_params.get('dF'), state_dim, grad_params_no, which = 'dA')
            dQc = cls._check_grad_state_matrices(grad_calc_params.get('dQc'), state_dim, grad_params_no, which = 'dQ')
            dP_inf = cls._check_grad_state_matrices(grad_calc_params.get('dP_inf'), state_dim, grad_params_no, which = 'dA')

            dH = cls._check_grad_measurement_matrices(grad_calc_params.get('dH'), state_dim, grad_params_no, measurement_dim, which = 'dH')
            dR = cls._check_grad_measurement_matrices(grad_calc_params.get('dR'), state_dim, grad_params_no, measurement_dim, which = 'dR')

            dm_init = grad_calc_params.get('dm_init') # Initial values for the Kalman Filter
            if dm_init is None:
                # multiple time series mode. Keep grad_params always as a last dimension
                dm_init = np.zeros( (state_dim, time_series_no, grad_params_no) )

            dP_init = grad_calc_params.get('dP_init') # Initial values for the Kalman Filter
            if dP_init is None:
                dP_init = dP_inf(0).copy() # get the dP_init matrix, because now it is a function

        else:
            dP_inf = None
            dF = None
            dQc = None
            dH = None
            dR = None
            dm_init = None
            dP_init = None

        measurement_callables = Std_Measurement_Callables_Class(p_H, H_time_var_index, p_R, index, R_time_var_index, 20, dH, dR)
        #import pdb; pdb.set_trace()

        dynamic_callables = cls._cont_to_discrete_object(X, F, L, Qc, compute_derivatives=calc_grad_log_likelihood,
                                              grad_params_no=grad_params_no,
                                              P_inf=P_inf, dP_inf=dP_inf, dF = dF, dQc=dQc)

        if print_verbose:
            print("General: run Continuos-Discrete Kalman Filter")
        # Also for dH, dR and probably for all derivatives
        (M, P, log_likelihood, grad_log_likelihood, AQcomp) = cls._cont_discr_kalman_filter_raw(state_dim,
                        dynamic_callables, measurement_callables,
                        X, Y, m_init=m_init, P_init=P_init,
                        p_kalman_filter_type=p_kalman_filter_type,
                        calc_log_likelihood=calc_log_likelihood,
                        calc_grad_log_likelihood=calc_grad_log_likelihood, grad_params_no=grad_params_no,
                        dm_init=dm_init, dP_init=dP_init)

        if old_index_shape is not None:
            index.shape = old_index_shape

        if old_X_shape is not None:
            X.shape = old_X_shape

        if old_Y_shape is not None:
            Y.shape = old_Y_shape

        if old_H_shape is not None:
            p_H.shape = old_H_shape

        if old_R_shape is not None:
            p_R.shape = old_R_shape

        return (M, P, log_likelihood, grad_log_likelihood, AQcomp)

    @classmethod
    def _cont_discr_kalman_filter_raw(cls,state_dim, p_dynamic_callables, p_measurement_callables, X, Y,
                                      m_init, P_init,
                                      p_kalman_filter_type='regular',
                                      calc_log_likelihood=False,
                      calc_grad_log_likelihood=False, grad_params_no=None,
                      dm_init=None, dP_init=None):
        """
        General filtering algorithm for inference in the continuos-discrete
        state-space model:

            d/dt x(t) =  F * x(t) + L * w(t);         w(t) ~ N(0, Qc)
            y_{k} = H_{k} * x_{k} + r_{k};            r_{k-1} ~ N(0, R_{k})

        Returns estimated filter distributions x_{k} ~ N(m_{k}, P(k))

        Current Features:
        ----------------------------------------

        1) Function support "multiple time series mode" which means that exactly
        the same State-Space model is used to filter several sets of measurements.
        In this case third dimension of Y should include these state-space measurements
        Log_likelihood and Grad_log_likelihood have the corresponding dimensions then.

        2) Measurement may include missing values. In this case update step is
        not done for this measurement. (later may be changed)

        Input:
        -----------------
        state_dim: int
            Demensionality of the states

        F: (state_dim, state_dim) matrix
            F in the model.

        L: (state_dim, noise_dim) matrix
            L in the model.

        Qc: (noise_dim, noise_dim) matrix
            Q_c in the model.

        P_inf: (state_dim, state_dim) matrix
            State varince matrix on infinity.

        p_h: function (k, x_{k}, H_{k}). Measurement function.
            k (iteration number),
            x_{k}
            H_{k} Jacobian matrices of f_h. In the linear case it is exactly H_{k}.

        f_H: function (k, m, P) return Jacobian of dynamic function, it is
            passed into p_h.
            k (iteration number),
            m: point where Jacobian is evaluated,
            P: parameter for Jacobian, usually covariance matrix.

        p_f_R: function (k). Returns noise matrix of measurement equation
            on iteration k.
            k (iteration number).

        m_init: vector or matrix
            Initial distribution mean. For "multiple time series mode"
            it is matrix, second dimension of which correspond to different
            time series. In regular case ("one time series mode") it is a
            vector.

        P_init: matrix or scalar
            Initial covariance of the states. Must be not None
            "multiple time series mode" does not affect it, since it does not
            affect anything related to state variaces.

        p_kalman_filter_type: string, one of ('regular', 'svd')
            Which Kalman Filter is used. Regular or SVD. SVD is more numerically
            stable, in particular, Covariace matrices are guarantied to be
            positive semi-definite. However, 'svd' works slower, especially for
            small data due to SVD call overhead.

        calc_log_likelihood: boolean
            Whether to calculate marginal likelihood of the state-space model.

        calc_grad_log_likelihood: boolean
            Whether to calculate gradient of the marginal likelihood
            of the state-space model. If true then the next parameter must
            provide the extra parameters for gradient calculation.

        grad_params_no: int
            Number of gradient parameters

        dP_inf, dF, dQc, dH, dR, dm_init, dP_init: matrices or 3D arrays.
            Necessary parameters for derivatives calculation.

        """

        #import pdb; pdb.set_trace()
        steps_no = Y.shape[0] # number of steps in the Kalman Filter
        time_series_no = Y.shape[2] # multiple time series mode

        # Allocate space for results
        # Mean estimations. Initial values will be included
        M = np.empty(((steps_no+1),state_dim,time_series_no))
        M[0,:,:] = m_init # Initialize mean values
        # Variance estimations. Initial values will be included
        P = np.empty(((steps_no+1),state_dim,state_dim))
        P_init = 0.5*( P_init + P_init.T) # symmetrize initial covariance. In some ustable cases this is uiseful
        P[0,:,:] = P_init # Initialize initial covariance matrix

        #import pdb;pdb.set_trace()
        if p_kalman_filter_type == 'svd':
            (U,S,Vh) = sp.linalg.svd( P_init,full_matrices=False, compute_uv=True,
                      overwrite_a=False,check_finite=True)
            S[ (S==0) ] = 1e-17 # allows to run algorithm for singular initial variance
            P_upd = (P_init, S,U)
        #log_likelihood = 0
        #grad_log_likelihood = np.zeros((grad_params_no,1))
        log_likelihood = 0 if calc_log_likelihood else None
        grad_log_likelihood = 0 if calc_grad_log_likelihood else None

        #setting initial values for derivatives update
        dm_upd = dm_init
        dP_upd = dP_init
        # Main loop of the Kalman filter
        for k in range(0,steps_no):
            # In this loop index for new estimations is (k+1), old - (k)
            # This happened because initial values are stored at 0-th index.
            #import pdb; pdb.set_trace()

            prev_mean = M[k,:,:] # mean from the previous step

            if p_kalman_filter_type == 'svd':
                m_pred, P_pred, dm_pred, dP_pred = \
                cls._kalman_prediction_step_SVD(k, prev_mean ,P_upd, p_dynamic_callables,
                    calc_grad_log_likelihood=calc_grad_log_likelihood,
                    p_dm = dm_upd, p_dP = dP_upd)
            else:
                m_pred, P_pred, dm_pred, dP_pred = \
                cls._kalman_prediction_step(k, prev_mean ,P[k,:,:], p_dynamic_callables,
                    calc_grad_log_likelihood=calc_grad_log_likelihood,
                    p_dm = dm_upd, p_dP = dP_upd )

            #import pdb; pdb.set_trace()
            k_measurment = Y[k,:,:]

            if (np.any(np.isnan(k_measurment)) == False):

                if p_kalman_filter_type == 'svd':
                    m_upd, P_upd, log_likelihood_update, dm_upd, dP_upd, d_log_likelihood_update = \
                    cls._kalman_update_step_SVD(k,  m_pred , P_pred, p_measurement_callables,
                            k_measurment, calc_log_likelihood=calc_log_likelihood,
                            calc_grad_log_likelihood=calc_grad_log_likelihood,
                            p_dm = dm_pred, p_dP = dP_pred )


    #                m_upd, P_upd, log_likelihood_update, dm_upd, dP_upd, d_log_likelihood_update = \
    #                cls._kalman_update_step(k,  m_pred , P_pred[0], f_h, f_H, p_R.f_R, k_measurment,
    #                        calc_log_likelihood=calc_log_likelihood,
    #                        calc_grad_log_likelihood=calc_grad_log_likelihood,
    #                        p_dm = dm_pred, p_dP = dP_pred, grad_calc_params_2 = (dH, dR))
    #
    #                (U,S,Vh) = sp.linalg.svd( P_upd,full_matrices=False, compute_uv=True,
    #                      overwrite_a=False,check_finite=True)
    #                P_upd = (P_upd, S,U)
                else:
                    m_upd, P_upd, log_likelihood_update, dm_upd, dP_upd, d_log_likelihood_update = \
                    cls._kalman_update_step(k,  m_pred , P_pred, p_measurement_callables, k_measurment,
                            calc_log_likelihood=calc_log_likelihood,
                            calc_grad_log_likelihood=calc_grad_log_likelihood,
                            p_dm = dm_pred, p_dP = dP_pred )
            else:
                if k_measurment.shape != (1,1):
                    raise ValueError("Nan measurements are currently not supported for \
                                     multidimensional output and multiple tiem series.")
                else:
                    m_upd = m_pred; P_upd = P_pred; dm_upd = dm_pred; dP_upd = dP_pred
                    log_likelihood_update = 0.0;
                    d_log_likelihood_update = 0.0;


            if calc_log_likelihood:
                log_likelihood += log_likelihood_update

            if calc_grad_log_likelihood:
                grad_log_likelihood += d_log_likelihood_update

            M[k+1,:,:] = m_upd # separate mean value for each time series

            if p_kalman_filter_type == 'svd':
                P[k+1,:,:] = P_upd[0]
            else:
                P[k+1,:,:] = P_upd
            #print("kf it: %i" % k)
            # !!!Print statistics! Print sizes of matrices
            # !!!Print statistics! Print iteration time base on another boolean variable
        return (M, P, log_likelihood, grad_log_likelihood, p_dynamic_callables.reset(False))

    @classmethod
    def cont_discr_rts_smoother(cls,state_dim, filter_means, filter_covars,
                                p_dynamic_callables=None, X=None, F=None,L=None,Qc=None):
        """

        Continuos-discrete Rauch–Tung–Striebel(RTS) smoother.

        This function implements Rauch–Tung–Striebel(RTS) smoother algorithm
        based on the results of _cont_discr_kalman_filter_raw.

        Model:
            d/dt x(t) =  F * x(t) + L * w(t);         w(t) ~ N(0, Qc)
            y_{k} = H_{k} * x_{k} + r_{k};            r_{k-1} ~ N(0, R_{k})

        Returns estimated smoother distributions x_{k} ~ N(m_{k}, P(k))

        Input:
        --------------

        filter_means: (no_steps+1,state_dim) matrix or (no_steps+1,state_dim, time_series_no) 3D array
            Results of the Kalman Filter means estimation.

        filter_covars: (no_steps+1, state_dim, state_dim) 3D array
            Results of the Kalman Filter covariance estimation.

        Dynamic_callables: object or None
            Object form the filter phase which provides functions for computing
            A, Q, dA, dQ fro  discrete model from the continuos model.

         X, F, L, Qc: matrices
             If AQcomp is None, these matrices are used to create this object from scratch.

        Output:
        -------------

        M: (no_steps+1,state_dim) matrix
            Smoothed estimates of the state means

        P: (no_steps+1,state_dim, state_dim) 3D array
            Smoothed estimates of the state covariances
        """

        f_a = lambda k,m,A: np.dot(A, m) # state dynamic model
        if p_dynamic_callables is None: # make this object from scratch
            p_dynamic_callables = cls._cont_to_discrete_object(cls, X, F,L,Qc,f_a,compute_derivatives=False,
                                                  grad_params_no=None, P_inf=None, dP_inf=None, dF = None, dQc=None)

        no_steps = filter_covars.shape[0]-1# number of steps (minus initial covariance)

        M = np.empty(filter_means.shape) # smoothed means
        P = np.empty(filter_covars.shape) # smoothed covars

        if print_verbose:
            print("General: run Continuos-Discrete Kalman Smoother")

        M[-1,:,:] = filter_means[-1,:,:]
        P[-1,:,:] = filter_covars[-1,:,:]
        for k in range(no_steps-1,-1,-1):

            prev_mean = filter_means[k,:] # mean from the previous step
            m_pred, P_pred, tmp1, tmp2 = \
                    cls._kalman_prediction_step(k, prev_mean,
                                                filter_covars[k,:,:], p_dynamic_callables,
                                                calc_grad_log_likelihood=False)
            p_m = filter_means[k,:]
            p_m_prev_step = M[(k+1),:]

            m_upd, P_upd, tmp_G = cls._rts_smoother_update_step(k,
                            p_m ,filter_covars[k,:,:],
                            m_pred, P_pred, p_m_prev_step ,P[(k+1),:,:], p_dynamic_callables)

            M[k,:,:] = m_upd
            P[k,:,:] = P_upd
        # Return values
        return (M, P)

    @classmethod
    def _cont_to_discrete_object(cls, X, F, L, Qc, compute_derivatives=False,
                                 grad_params_no=None,
                                 P_inf=None, dP_inf=None, dF = None, dQc=None):
        """
        Function return the object which is used in Kalman filter and/or
        smoother to obtain matrices A, Q and their derivatives for discrete model
        from the continuous model.

        There are 2 objects AQcompute_once and AQcompute_batch and the function
        returs the appropriate one based on the number of different time steps.

        Input:
        ----------------------
        X, F, L, Qc: matrices
            Continuous model matrices

        f_a: function
            Dynamic Function is attached to the Dynamic_Model_Callables class
        compute_derivatives: boolean
            Whether to compute derivatives

        grad_params_no: int
            Number of parameters in the gradient

        P_inf, dP_inf, dF, dQ: matrices and 3D objects
            Data necessary to compute derivatives.

        Output:
        --------------------------
        AQcomp: object
            Its methods return matrices (and optionally derivatives) for the
            discrete state-space model.

        """

        unique_round_decimals = 10
        threshold_number_of_unique_time_steps = 20 # above which matrices are separately each time
        dt = np.empty((X.shape[0],))
        dt[1:] = np.diff(X[:,0],axis=0)
        dt[0]  = 0#dt[1]
        unique_indices = np.unique(np.round(dt, decimals=unique_round_decimals))
        number_unique_indices = len(unique_indices)

        #import pdb; pdb.set_trace()
        if use_cython:
            class AQcompute_batch(state_space_cython.AQcompute_batch_Cython):
                def __init__(self, F,L,Qc,dt,compute_derivatives=False, grad_params_no=None, P_inf=None, dP_inf=None, dF = None, dQc=None):
                    As, Qs, reconstruct_indices, dAs, dQs = ContDescrStateSpace.lti_sde_to_descrete(F,
                                L,Qc,dt,compute_derivatives,
                                grad_params_no=grad_params_no, P_inf=P_inf, dP_inf=dP_inf, dF=dF, dQc=dQc)

                    super(AQcompute_batch,self).__init__(As, Qs, reconstruct_indices, dAs,dQs)
        else:
            AQcompute_batch = cls.AQcompute_batch_Python

        if number_unique_indices > threshold_number_of_unique_time_steps:
            AQcomp = cls.AQcompute_once(F,L,Qc, dt,compute_derivatives=compute_derivatives,
                                    grad_params_no=grad_params_no, P_inf=P_inf, dP_inf=dP_inf, dF=dF, dQc=dQc)
            if print_verbose:
                print("CDO:  Continue-to-discrete INSTANTANEOUS object is created.")
                print("CDO:  Number of different time steps: %i" % (number_unique_indices,) )

        else:
            AQcomp = AQcompute_batch(F,L,Qc,dt,compute_derivatives=compute_derivatives,
                                    grad_params_no=grad_params_no, P_inf=P_inf, dP_inf=dP_inf, dF=dF, dQc=dQc)
            if print_verbose:
                print("CDO:  Continue-to-discrete BATCH object is created.")
                print("CDO:  Number of different time steps: %i" % (number_unique_indices,) )
                print("CDO:  Total size if its data: %i" % (AQcomp.total_size_of_data,) )

        return AQcomp

    @staticmethod
    def lti_sde_to_descrete(F,L,Qc,dt,compute_derivatives=False,
                            grad_params_no=None, P_inf=None,
                            dP_inf=None, dF = None, dQc=None):
        """
        Linear Time-Invariant Stochastic Differential Equation (LTI SDE):

            dx(t) = F x(t) dt + L d \beta  ,where

                x(t): (vector) stochastic process
                \beta: (vector) Brownian motion process
                F, L: (time invariant) matrices of corresponding dimensions
                Qc: covariance of noise.

        This function rewrites it into the corresponding state-space form:

            x_{k} =  A_{k} * x_{k-1} + q_{k-1};       q_{k-1} ~ N(0, Q_{k-1})


        Input:
        --------------
        F,L: LTI SDE matrices of corresponding dimensions

        Qc: matrix (n,n)
            Covarince between different dimensions of noise \beta.
            n is the dimensionality of the noise.

        dt: double or iterable
            Time difference used on this iteration.
            If dt is iterable, then A and Q_noise are computed for every
            unique dt

        compute_derivatives: boolean
            Whether derivatives of A and Q are required.

        grad_params_no: int
            Number of gradient parameters

        P_inf: (state_dim. state_dim) matrix

        dP_inf

        dF: 3D array
            Derivatives of F

        dQc: 3D array
            Derivatives of Qc

        dR: 3D array
            Derivatives of R

        Output:
        --------------

        A: matrix
            A_{k}. Because we have LTI SDE only dt can affect on matrix
            difference for different k.

        Q_noise: matrix
            Covariance matrix of (vector) q_{k-1}. Only dt can affect the
            matrix difference for different k.

        reconstruct_index: array
            If dt was iterable return three dimensinal arrays A and Q_noise.
            Third dimension of these arrays correspond to unique dt's.
            This reconstruct_index contain indices of the original dt's
            in the uninue dt sequence. A[:,:, reconstruct_index[5]]
            is matrix A of 6-th(indices start from zero) dt in the original
            sequence.
        dA: 3D array
            Derivatives of A

        dQ: 3D array
            Derivatives of Q
        """
        # Dimensionality
        n = F.shape[0]

        if not isinstance(dt, collections.Iterable): # not iterable, scalar

            # The dynamical model
            A  = matrix_exponent(F*dt)
            if np.any( np.isnan(A)):
                A  = linalg.expm3(F*dt)

            # The covariance matrix Q by matrix fraction decomposition ->
            Phi = np.zeros((2*n,2*n))
            Phi[:n,:n] = F
            Phi[:n,n:] = L.dot(Qc).dot(L.T)
            Phi[n:,n:] = -F.T
            AB = matrix_exponent(Phi*dt)
            AB = np.dot(AB, np.vstack((np.zeros((n,n)),np.eye(n))))

            Q_noise_1 = linalg.solve(AB[n:,:].T,AB[:n,:].T)
            Q_noise_2  = P_inf - A.dot(P_inf).dot(A.T)
            # The covariance matrix Q by matrix fraction decomposition <-

            if compute_derivatives:
                dA = np.zeros([n, n, grad_params_no])
                dQ = np.zeros([n, n, grad_params_no])

                #AA  = np.zeros([2*n, 2*n, nparam])
                FF  = np.zeros([2*n, 2*n])
                AA = np.zeros([2*n, 2*n, grad_params_no])

                for p in range(0, grad_params_no):

                    FF[:n,:n] = F
                    FF[n:,:n] = dF[:,:,p]
                    FF[n:,n:] = F

                    # Solve the matrix exponential
                    AA[:,:,p] = matrix_exponent(FF*dt)

                    # Solve the differential equation
                    #foo         = AA[:,:,p].dot(np.vstack([m, dm[:,p]]))
                    #mm          = foo[:n,:]
                    #dm[:,p] = foo[n:,:]

                    # The discrete-time dynamical model*
                    if p==0:
                        A  = AA[:n,:n,p]
                        Q_noise_2  = P_inf - A.dot(P_inf).dot(A.T)
                        Q_noise = Q_noise_2
                        #PP = A.dot(P).dot(A.T) + Q_noise_2

                    # The derivatives of A and Q
                    dA[:,:,p] = AA[n:,:n,p]
                    dQ[:,:,p] = dP_inf[:,:,p] - dA[:,:,p].dot(P_inf).dot(A.T) \
                       - A.dot(dP_inf[:,:,p]).dot(A.T) - A.dot(P_inf).dot(dA[:,:,p].T) # Rewrite not ro multiply two times

            else:
              dA = None
              dQ = None
              Q_noise = Q_noise_2
	      # Innacuracies have been observed when Q_noise_1 was used.

            #Q_noise = Q_noise_1

            # Return
            return A, Q_noise,None, dA, dQ

        else: # iterable, array

            # Time discretizations (round to 14 decimals to avoid problems)
            dt_unique, tmp, reconstruct_index = np.unique(np.round(dt,8),
                                        return_index=True,return_inverse=True)
            del tmp
            # Allocate space for A and Q
            A = np.empty((n,n,dt_unique.shape[0]))
            Q_noise = np.empty((n,n,dt_unique.shape[0]))

            if compute_derivatives:
                dA = np.empty((n,n,grad_params_no,dt_unique.shape[0]))
                dQ = np.empty((n,n,grad_params_no,dt_unique.shape[0]))
            else:
                dA = None
                dQ = None
            # Call this function for each unique dt
            for j in range(0,dt_unique.shape[0]):
                A[:,:,j], Q_noise[:,:,j], tmp1, dA_t, dQ_t = ContDescrStateSpace.lti_sde_to_descrete(F,L,Qc,dt_unique[j],
                    compute_derivatives=compute_derivatives, grad_params_no=grad_params_no, P_inf=P_inf, dP_inf=dP_inf, dF = dF, dQc=dQc)
                if compute_derivatives:
                    dA[:,:,:,j] = dA_t
                    dQ[:,:,:,j] = dQ_t

            # Return
            return A, Q_noise, reconstruct_index, dA, dQ

def matrix_exponent(M):
    """
    The function computes matrix exponent and handles some special cases
    """

    if (M.shape[0] == 1): # 1*1 matrix
        Mexp = np.array( ((np.exp(M[0,0]) ,),) )

    else: # matrix is larger
        method = None
        try:
            Mexp = linalg.expm(M)
            method = 1
        except (Exception,) as e:
            Mexp = linalg.expm3(M)
            method = 2
        finally:
            if np.any(np.isnan(Mexp)):
                if method == 2:
                    raise ValueError("Matrix Exponent is not computed 1")
                else:
                    Mexp = linalg.expm3(M)
                    method = 2
                    if np.any(np.isnan(Mexp)):
                        raise ValueError("Matrix Exponent is not computed 2")

    return Mexp

def balance_matrix(A):
    """
    Balance matrix, i.e. finds such similarity transformation of the original
    matrix A:  A = T * bA * T^{-1}, where norms of columns of bA and of rows of bA
    are as close as possible. It is usually used as a preprocessing step in
    eigenvalue calculation routine. It is useful also for State-Space models.

    See also:
        [1] Beresford N. Parlett and Christian Reinsch (1969). Balancing
            a matrix for calculation of eigenvalues and eigenvectors.
            Numerische Mathematik, 13(4): 293-304.

    Input:
    ----------------------
    A: square matrix
        Matrix to be balanced

    Output:
    ----------------
        bA: matrix
            Balanced matrix

        T: matrix
            Left part of the similarity transformation

        T_inv: matrix
            Right part of the similarity transformation.
    """

    if len(A.shape) != 2 or (A.shape[0] != A.shape[1]):
        raise ValueError('balance_matrix: Expecting square matrix')

    N = A.shape[0] # matrix size

    gebal = sp.linalg.lapack.get_lapack_funcs('gebal',(A,))
    bA, lo, hi, pivscale, info = gebal(A, permute=True, scale=True,overwrite_a=False)
    if info < 0:
        raise ValueError('balance_matrix: Illegal value in %d-th argument of internal gebal ' % -info)
    # calculating the similarity transforamtion:
    def perm_matr(D, c1,c2):
        """
        Function creates the permutation matrix which swaps columns c1 and c2.

        Input:
        --------------
        D: int
            Size of the permutation matrix
        c1: int
            Column 1. Numeration starts from 1...D
        c2: int
            Column 2. Numeration starts from 1...D
        """
        i1 = c1-1; i2 = c2-1 # indices
        P = np.eye(D);
        P[i1,i1] = 0.0; P[i2,i2] = 0.0; # nullify diagonal elements
        P[i1,i2] = 1.0; P[i2,i1] = 1.0

        return P

    P = np.eye(N) # permutation matrix
    if (hi != N-1): # there are row permutations
        for k in range(N-1,hi,-1):
            new_perm = perm_matr(N, k+1, pivscale[k])
            P = np.dot(P,new_perm)
    if (lo != 0):
        for k in range(0,lo,1):
            new_perm = perm_matr(N, k+1, pivscale[k])
            P = np.dot(P,new_perm)
    D = pivscale.copy()
    D[0:lo] = 1.0; D[hi+1:N] = 1.0 # thesee scaling factors must be set to one.
    #D = np.diag(D) # make a diagonal matrix

    T = np.dot(P,np.diag(D)) # similarity transformation in question
    T_inv = np.dot(np.diag(D**(-1)),P.T)

    #print( np.max(A - np.dot(T, np.dot(bA, T_inv) )) )
    return bA.copy(), T, T_inv

def balance_ss_model(F,L,Qc,H,Pinf,P0,dF=None,dQc=None,dPinf=None,dP0=None):
    """
    Balances State-Space model for more numerical stability

    This is based on the following:

     dx/dt = F x + L w
         y = H x

     Let T z = x, which gives

     dz/dt = inv(T) F T z + inv(T) L w
         y = H T z
    """

    bF,T,T_inv = balance_matrix(F)

    bL = np.dot( T_inv, L)
    bQc = Qc # not affected

    bH = np.dot(H, T)

    bPinf = np.dot(T_inv, np.dot(Pinf, T_inv.T))

    #import pdb; pdb.set_trace()
#    LL,islower = linalg.cho_factor(Pinf)
#    inds = np.triu_indices(Pinf.shape[0],k=1)
#    LL[inds] = 0.0
#    bLL = np.dot(T_inv, LL)
#    bPinf = np.dot( bLL, bLL.T)

    bP0 = np.dot(T_inv, np.dot(P0, T_inv.T))

    if dF is not None:
        bdF = np.zeros(dF.shape)
        for i in range(dF.shape[2]):
            bdF[:,:,i] = np.dot( T_inv, np.dot( dF[:,:,i], T))

    else:
        bdF = None

    if dPinf is not None:
        bdPinf = np.zeros(dPinf.shape)
        for i in range(dPinf.shape[2]):
            bdPinf[:,:,i] = np.dot( T_inv, np.dot( dPinf[:,:,i], T_inv.T))

#            LL,islower = linalg.cho_factor(dPinf[:,:,i])
#            inds = np.triu_indices(dPinf[:,:,i].shape[0],k=1)
#            LL[inds] = 0.0
#            bLL = np.dot(T_inv, LL)
#            bdPinf[:,:,i] = np.dot( bLL, bLL.T)


    else:
        bdPinf = None

    if dP0 is not None:
        bdP0 = np.zeros(dP0.shape)
        for i in range(dP0.shape[2]):
            bdP0[:,:,i] = np.dot( T_inv, np.dot( dP0[:,:,i], T_inv.T))
    else:
        bdP0 = None


    bdQc = dQc # not affected

    # (F,L,Qc,H,Pinf,P0,dF,dQc,dPinf,dP0)

    return bF, bL, bQc, bH, bPinf, bP0, bdF, bdQc, bdPinf, bdP0, T