From b8e21057f56fb1333c82c2358a356d4832087da7 Mon Sep 17 00:00:00 2001 From: Alexander Grigorievskiy Date: Mon, 10 Aug 2015 19:40:39 +0300 Subject: [PATCH] UPD: Added SVD Kalman Filter, EM algorithm for gradient calculation (only for discrete KF) --- GPy/kern/_src/sde_matern.py | 8 +- GPy/kern/_src/sde_standard_periodic.py | 16 +- GPy/kern/_src/sde_stationary.py | 11 +- GPy/models/state_space_main.py | 773 +++++++++++++++++++++++-- GPy/models/state_space_new.py | 45 +- 5 files changed, 786 insertions(+), 67 deletions(-) diff --git a/GPy/kern/_src/sde_matern.py b/GPy/kern/_src/sde_matern.py index 102a9200..0ce1cf98 100644 --- a/GPy/kern/_src/sde_matern.py +++ b/GPy/kern/_src/sde_matern.py @@ -38,11 +38,11 @@ class sde_Matern32(Matern32): lengthscale = float(self.lengthscale.values) foo = np.sqrt(3.)/lengthscale - F = np.array(((0, 1), (-foo**2, -2*foo))) - L = np.array(( (0,), (1,) )) + F = np.array(((0, 1.0), (-foo**2, -2*foo))) + L = np.array(( (0,), (1.0,) )) Qc = np.array(((12.*np.sqrt(3) / lengthscale**3 * variance,),)) - H = np.array(((1, 0),)) - Pinf = np.array(((variance, 0), (0, 3.*variance/(lengthscale**2)))) + H = np.array(((1.0, 0),)) + Pinf = np.array(((variance, 0.0), (0.0, 3.*variance/(lengthscale**2)))) P0 = Pinf.copy() # Allocate space for the derivatives diff --git a/GPy/kern/_src/sde_standard_periodic.py b/GPy/kern/_src/sde_standard_periodic.py index 3434cc74..c3df7d92 100644 --- a/GPy/kern/_src/sde_standard_periodic.py +++ b/GPy/kern/_src/sde_standard_periodic.py @@ -56,19 +56,20 @@ class sde_StdPeriodic(StdPeriodic): w0 = 2*np.pi/self.wavelengths # frequency + lengthscales = 2*self.lengthscales - [q2,dq2l] = seriescoeff(N,2*self.lengthscales,self.variance) + [q2,dq2l] = seriescoeff(N,lengthscales,self.variance) # lengthscale is multiplied by 2 because of slightly different # formula for periodic covariance function. 
    # For the same reason: dq2l = 2*dq2l

-        if np.any( np.isnan(q2)):
-            raise ValueError("SDE periodic covariance error1")
+        if np.any( np.isfinite(q2) == False):
+            raise ValueError("SDE periodic covariance error 1")

-        if np.any( np.isnan(dq2l)):
-            raise ValueError("SDE periodic covariance error1")
+        if np.any( np.isfinite(dq2l) == False):
+            raise ValueError("SDE periodic covariance error 2")

         F = np.kron(np.diag(range(0,N+1)),np.array( ((0, -w0), (w0, 0)) ) )
         L = np.eye(2*(N+1))
@@ -159,8 +160,9 @@ def seriescoeff(m=6,lengthScale=1.0,magnSigma2=1.0, true_covariance=False):
     else:
         coeffs = 2*magnSigma2*sp.exp( -lengthScale**(-2) ) * special.iv(range(0,m+1),1.0/lengthScale**(2))

-    if np.any( np.isnan(coeffs)):
-        pass
+    if np.any( np.isfinite(coeffs) == False):
+        raise ValueError("sde_standard_periodic: Coefficients are not finite!")

     coeffs[0] = 0.5*coeffs[0]

     # Derivatives wrt (lengthScale)
diff --git a/GPy/kern/_src/sde_stationary.py b/GPy/kern/_src/sde_stationary.py
index 1dfa97a2..76b5266a 100644
--- a/GPy/kern/_src/sde_stationary.py
+++ b/GPy/kern/_src/sde_stationary.py
@@ -68,7 +68,7 @@ class sde_RBF(RBF):

         # Infinite covariance:
         Pinf = sp.linalg.solve_lyapunov(F, -np.dot(L,np.dot( Qc[0,0],L.T)))
-
+        Pinf = 0.5*(Pinf + Pinf.T)
         # Allocating space for derivatives
         dF = np.empty([F.shape[0],F.shape[1],2])
         dQc = np.empty([Qc.shape[0],Qc.shape[1],2])
@@ -96,13 +96,14 @@ class sde_RBF(RBF):
         dPinf[:,:,0] = dPinf_variance
         dPinf[:,:,1] = dPinf_lengthscale

-        # Benefits of this are unjustified
-        #import GPy.models.state_space_main as ssm
-        #(F, L, Qc, H, Pinf, dF, dQc, dPinf,T) = ssm.balance_ss_model(F, L, Qc, H, Pinf, dF, dQc, dPinf)
-
         P0 = Pinf.copy()
         dP0 = dPinf.copy()

+        # The benefits of this balancing are not well established; it helps
+        # only in one case: the SVD Kalman filter with the RBF kernel.
+        import GPy.models.state_space_main as ssm
+        (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0, T) = ssm.balance_ss_model(F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
+
         return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)

 class sde_Exponential(Exponential):
diff --git a/GPy/models/state_space_main.py b/GPy/models/state_space_main.py
index 066a2aa3..39d5abe2 100644
--- a/GPy/models/state_space_main.py
+++ b/GPy/models/state_space_main.py
@@ -799,7 +801,137 @@ class DescreteStateSpace(object):

         return m_pred, P_pred, dm_pred, dP_pred

+    @staticmethod
+    def _kalman_prediction_step_SVD(k, p_m , p_P, p_a, p_f_A, p_f_Q, p_f_Qsr, calc_grad_log_likelihood=False,
+                                    p_dm = None, p_dP = None, grad_calc_params_1 = None):
+        """
+        Discrete prediction step for the SVD Kalman filter.
+
+        Input:
+            k: int
+                Iteration number, starting at 0. The total number of iterations
+                equals the number of measurements.
+
+            p_m: matrix of size (state_dim, time_series_no)
+                Mean value from the previous step. In "multiple time series mode"
+                it is a matrix whose second dimension corresponds to different
+                time series.
+
+            p_P: tuple (Prev_cov, S, V)
+                Covariance matrix from the previous step and its SVD decomposition,
+                Prev_cov = V * S * V.T. The tuple is (Prev_cov, S, V).
+
+            p_a: function (k, x_{k-1}, A_{k}). Dynamic function.
+                k (iteration number), starts at 0
+                x_{k-1} state from the previous step
+                A_{k} Jacobian matrices of f_a. In the linear case it is exactly A_{k}.
+
+            p_f_A: function (k, m, P) returning the Jacobian of the dynamic
+                function; it is passed into p_a.
+                k (iteration number), starts at 0
+                m: point where the Jacobian is evaluated
+                P: parameter for the Jacobian, usually the covariance matrix.
+
+            p_f_Q: function (k). Returns the noise matrix of the dynamic model
+                on iteration k. k (iteration number), starts at 0.
+
+            p_f_Qsr: function (k). Returns the square root of the noise matrix
+                of the dynamic model on iteration k. k (iteration number), starts at 0.
+
+            calc_grad_log_likelihood: boolean
+                Whether to calculate the gradient of the marginal likelihood
+                of the state-space model. If true then the next parameters must
+                provide the extra information for the gradient calculation.
+
+            p_dm: 3D array (state_dim, time_series_no, parameters_no)
+                Mean derivatives from the previous step. In "multiple time series mode"
+                it is a 3D array whose second dimension corresponds to different
+                time series.
+
+            p_dP: 3D array (state_dim, state_dim, parameters_no)
+                Covariance derivatives from the previous step.
+
+            grad_calc_params_1: list or None
+                List with derivative functions. The first component is 'f_dA' -
+                function(k) which returns the derivative of A. The second element
+                is 'f_dQ' - function(k) which returns the derivative of Q.
+
+        Output:
+        ----------------------------
+        m_pred, P_pred, dm_pred, dP_pred: matrices / 3D arrays
+            Results of the prediction step.
+
+        """
+        if len(p_m.shape)<2:
+            p_m.shape = (p_m.shape[0],1)
+
+        # covariance from the previous step and its SVD decomposition:
+        # Prev_cov = V * S * V.T
+        Prev_cov, S_old, V_old = p_P
+        #p_prev_cov_tst = np.dot(p_V, (p_S * p_V).T) # reconstructed covariance from the previous step
+
+        # indices correspond to values from the previous iteration.
+        A = p_f_A(k,p_m,Prev_cov) # state transition matrix (or Jacobian)
+        Q = p_f_Q(k) # state noise matrix. This is necessary for the square root calculation (next step)
+        Q_sr = p_f_Qsr(k)
+        # Prediction step ->
+        m_pred = p_a(k, p_m, A) # predicted mean
+
+        # the covariance prediction step has changed:
+        svd_1_matr = np.vstack( ( (np.sqrt(S_old)* np.dot(A,V_old)).T , Q_sr.T) )
+        (U,S,Vh) = sp.linalg.svd( svd_1_matr,full_matrices=False, compute_uv=True,
+                                  overwrite_a=False,check_finite=True)
+
+        # predicted variance computed by the regular method.
For testing + #P_pred_tst = A.dot(Prev_cov).dot(A.T) + Q + V_new = Vh.T + S_new = S**2 + + P_pred = np.dot(V_new * S_new, V_new.T) # prediction covariance + P_pred = (P_pred, S_new, Vh.T) + # Prediction step <- + + if (p_m.shape[1] > 1): + multiple_ts_mode = True + else: + multiple_ts_mode = False + + # derivatives + if calc_grad_log_likelihood: + p_f_dA = grad_calc_params_1[0]; dA_all_params = p_f_dA(k) # derivatives of A wrt parameters + p_f_dQ = grad_calc_params_1[1]; dQ_all_params = p_f_dQ(k) # derivatives of Q wrt parameters + + param_number = p_dP.shape[2] + + # p_dm, p_dP - derivatives form the previoius step + dm_pred = np.empty(p_dm.shape) + dP_pred = np.empty(p_dP.shape) + + for j in range(param_number): + dA = dA_all_params[:,:,j] + dQ = dQ_all_params[:,:,j] + + dP = p_dP[:,:,j] + if (multiple_ts_mode == False): + dm = p_dm[:,j]; dm.shape = (dm.shape[0],1) + dm_pred[:,j] = np.squeeze(np.dot(dA, p_m) + np.dot(A, dm)) # dm can 3-dim (dim,ts,variable) + elif (multiple_ts_mode == True): # modification for several time series + dm = p_dm[:,:,j] + dm_pred[:,:,j] = np.dot(dA, p_m) + np.dot(A, dm) + # prediction step derivatives for current parameter: + + + dP_pred[:,:,j] = np.dot( dA ,np.dot(Prev_cov, A.T)) + dP_pred[:,:,j] += dP_pred[:,:,j].T + dP_pred[:,:,j] += np.dot( A ,np.dot(dP, A.T)) + dQ + + dP_pred[:,:,j] = 0.5*(dP_pred[:,:,j] + dP_pred[:,:,j].T) #symmetrize + else: + dm_pred = None + dP_pred = None + + return m_pred, P_pred, dm_pred, dP_pred + @staticmethod def _kalman_update_step(k, p_m , p_P, p_h, p_f_H, p_f_R, measurement, calc_log_likelihood= False, calc_grad_log_likelihood=False, p_dm = None, p_dP = None, grad_calc_params_2 = None): @@ -816,7 +948,7 @@ class DescreteStateSpace(object): time series. p_P: - Covariance matrix from the previous step. + Covariance matrix from the prediction step. p_h: function (k, x_{k}, H_{k}). Measurement function. k (iteration number), starts at 0 @@ -892,6 +1024,10 @@ class DescreteStateSpace(object): v = measurement-p_h(k, m_pred, H) S = H.dot(P_pred).dot(H.T) + R if measurement.shape[0]==1: # measurements are one dimensional + if (S < 0): + raise ValueError("Kalman Filter Update: S is negative step %i" % k ) + #import pdb; pdb.set_trace() + K = P_pred.dot(H.T) / S if calc_log_likelihood: log_likelihood_update = -0.5 * ( np.log(2*np.pi) + np.log(S) + @@ -998,6 +1134,220 @@ class DescreteStateSpace(object): return m_upd, P_upd, log_likelihood_update, dm_upd, dP_upd, d_log_likelihood_update + @staticmethod + def _kalman_update_step_SVD(k, p_m , p_P, p_h, p_f_H, p_f_R, p_f_iRsr, measurement, calc_log_likelihood= False, + calc_grad_log_likelihood=False, p_dm = None, p_dP = None, grad_calc_params_2 = None): + """ + Input: + + k: int + Iteration No. Starts at 0. Total number of iterations equal to the + number of measurements. + + m_P: matrix of size (state_dim, time_series_no) + Mean value from the previous step. For "multiple time series mode" + it is matrix, second dimension of which correspond to different + time series. + + p_P: tuple (P_pred, S, V) + Covariance matrix from the prediction step and its SVD decomposition. + P_pred = V * S * V.T The tuple is (P_pred, S, V) + + p_h: function (k, x_{k}, H_{k}). Measurement function. + k (iteration number), starts at 0 + x_{k} state + H_{k} Jacobian matrices of f_h. In the linear case it is exactly H_{k}. + + p_f_H: function (k, m, P) return Jacobian of dynamic function, it is + passed into p_h. 
+ k (iteration number), starts at 0 + m: point where Jacobian is evaluated + P: parameter for Jacobian, usually covariance matrix. + + p_f_R: function (k). Returns noise matrix of measurement equation + on iteration k. + k (iteration number). starts at 0 + + p_f_iRsr: function (k). Returns the square root of the noise matrix of + measurement equation on iteration k. + k (iteration number). starts at 0 + + measurement: (measurement_dim, time_series_no) matrix + One measurement used on the current update step. For + "multiple time series mode" it is matrix, second dimension of + which correspond to different time series. + + calc_log_likelihood: boolean + Whether to calculate marginal likelihood of the state-space model. + + calc_grad_log_likelihood: boolean + Whether to calculate gradient of the marginal likelihood + of the state-space model. If true then the next parameter must + provide the extra parameters for gradient calculation. + + p_dm: 3D array (state_dim, time_series_no, parameters_no) + Mean derivatives from the prediction step. For "multiple time series mode" + it is 3D array, second dimension of which correspond to different + time series. + + p_dP: array + Covariance derivatives from the prediction step. + + grad_calc_params_2: List or None + List with derivatives. The first component is 'f_dH' - function(k) + which returns the derivative of H. The second element is 'f_dR' + - function(k). Function which returns the derivative of R. + + Output: + ---------------------------- + m_upd, P_upd, dm_upd, dP_upd: metrices, 3D objects + Results of the prediction steps. + + log_likelihood_update: double or 1D array + Update to the log_likelihood from this step + + d_log_likelihood_update: (grad_params_no, time_series_no) matrix + Update to the gradient of log_likelihood, "multiple time series mode" + adds extra columns to the gradient. + + """ + + m_pred = p_m # from prediction step + P_pred,S_pred,V_pred = p_P # from prediction step + + H = p_f_H(k, m_pred, P_pred) + R = p_f_R(k) + R_isr = p_f_iRsr(k) # square root of the inverse of R matrix + + if (p_m.shape[1] > 1): + multiple_ts_mode = True + time_series_no = p_m.shape[1] # number of time serieses + else: + time_series_no = 1 + multiple_ts_mode = False + + log_likelihood_update=None; dm_upd=None; dP_upd=None; d_log_likelihood_update=None + # Update step (only if there is data) + if not np.any(np.isnan(measurement)): # TODO: if some dimensions are missing, do properly computations for other. + v = measurement-p_h(k, m_pred, H) + + svd_2_matr = np.vstack( ( np.dot( R_isr.T, np.dot(H, V_pred)) , np.diag( 1.0/np.sqrt(S_pred) ) ) ) + + (U,S,Vh) = sp.linalg.svd( svd_2_matr,full_matrices=False, compute_uv=True, + overwrite_a=False,check_finite=True) + + # P_upd = U_upd S_upd**2 U_upd.T + U_upd = np.dot(V_pred, Vh.T) + S_upd = (1.0/S)**2 + + P_upd = np.dot(U_upd * S_upd, U_upd.T) # update covariance + P_upd = (P_upd,S_upd,U_upd) # tuple to pass to the next step + + # stil need to compute S and K for derivative computation + S = H.dot(P_pred).dot(H.T) + R + if measurement.shape[0]==1: # measurements are one dimensional + if (S < 0): + raise ValueError("Kalman Filter Update SVD: S is negative step %i" % k ) + #import pdb; pdb.set_trace() + + K = P_pred.dot(H.T) / S + if calc_log_likelihood: + log_likelihood_update = -0.5 * ( np.log(2*np.pi) + np.log(S) + + v*v / S) + #log_likelihood_update = log_likelihood_update[0,0] # to make int + if np.any(np.isnan(log_likelihood_update)): # some member in P_pred is None. 
+ raise ValueError("Nan values in likelihood update!") + LL = None; islower = None + else: + raise ValueError("""Measurement dimension larger then 1 is currently not supported""") + + # Old method of computing updated covariance (for testing) -> + #P_upd_tst = K.dot(S).dot(K.T) + #P_upd_tst = 0.5*(P_upd_tst + P_upd_tst.T) + #P_upd_tst = P_pred - P_upd_tst# this update matrix is symmetric + # Old method of computing updated covariance (for testing) <- + + if calc_grad_log_likelihood: + dm_pred_all_params = p_dm # derivativas of the prediction phase + dP_pred_all_params = p_dP + + param_number = p_dP.shape[2] + + p_f_dH = grad_calc_params_2[0]; dH_all_params = p_f_dH(k) + p_f_dR = grad_calc_params_2[1]; dR_all_params = p_f_dR(k) + + dm_upd = np.empty(dm_pred_all_params.shape) + dP_upd = np.empty(dP_pred_all_params.shape) + + # firts dimension parameter_no, second - time series number + d_log_likelihood_update = np.empty((param_number,time_series_no)) + for param in range(param_number): + + dH = dH_all_params[:,:,param] + dR = dR_all_params[:,:,param] + + if (multiple_ts_mode == False): + dm_pred = dm_pred_all_params[:,param] + else: + dm_pred = dm_pred_all_params[:,:,param] + + + dP_pred = dP_pred_all_params[:,:,param] + + # Terms in the likelihood derivatives + dv = - np.dot( dH, m_pred) - np.dot( H, dm_pred) + dS = np.dot(dH, np.dot( P_pred, H.T)) + dS += dS.T + dS += np.dot(H, np.dot( dP_pred, H.T)) + dR + + # TODO: maybe symmetrize dS + + #dm and dP for the next stem + if LL is not None: # the state vector is not a scalar + tmp1 = linalg.cho_solve((LL,islower), H).T + tmp2 = linalg.cho_solve((LL,islower), dH).T + tmp3 = linalg.cho_solve((LL,islower), dS).T + else: # the state vector is a scalar + tmp1 = H.T / S + tmp2 = dH.T / S + tmp3 = dS.T / S + + dK = np.dot( dP_pred, tmp1) + np.dot( P_pred, tmp2) - \ + np.dot( P_pred, np.dot( tmp1, tmp3 ) ) + + # terms required for the next step, save this for each parameter + if (multiple_ts_mode == False): + dm_upd[:,param] = dm_pred + np.squeeze(np.dot(dK, v) + np.dot(K, dv)) + else: + dm_upd[:,:,param] = dm_pred + np.dot(dK, v) + np.dot(K, dv) + + dP_upd[:,:,param] = -np.dot(dK, np.dot(S, K.T)) + dP_upd[:,:,param] += dP_upd[:,:,param].T + dP_upd[:,:,param] += dP_pred - np.dot(K , np.dot( dS, K.T)) + + dP_upd[:,:,param] = 0.5*(dP_upd[:,:,param] + dP_upd[:,:,param].T) #symmetrize + # computing the likelihood change for each parameter: + if LL is not None: # the state vector is not 1D + #tmp4 = linalg.cho_solve((LL,islower), dv) + tmp5 = linalg.cho_solve((LL,islower), v) + else: # the state vector is a scalar + #tmp4 = dv / S + tmp5 = v / S + + + d_log_likelihood_update[param,:] = -(0.5*np.sum(np.diag(tmp3)) + \ + np.sum(tmp5*dv, axis=0) - 0.5 * np.sum(tmp5 * np.dot(dS, tmp5), axis=0) ) + + # Before + #d_log_likelihood_update[param,0] = -(0.5*np.sum(np.diag(tmp3)) + \ + #np.dot(tmp5.T, dv) - 0.5 * np.dot(tmp5.T ,np.dot(dS, tmp5)) ) + + # Compute the actual updates for mean of the states. Variance update + # is computed earlier. 
m_upd = m_pred + K.dot( v )
+
+        return m_upd, P_upd, log_likelihood_update, dm_upd, dP_upd, d_log_likelihood_update
+
     @staticmethod
     def _rts_smoother_update_step(k, p_m , p_P, p_m_pred, p_P_pred,
                                   p_m_prev_step, p_P_prev_step, p_f_A):
@@ -1065,7 +1415,7 @@ class DescreteStateSpace(object):

         P_upd = 0.5*(P_upd + P_upd.T)

-        return m_upd, P_upd
+        return m_upd, P_upd, G

     @classmethod
     def rts_smoother(cls,state_dim, p_a, p_f_A, p_f_Q, filter_means,
@@ -1112,30 +1462,138 @@ class DescreteStateSpace(object):
             Smoothed estimates of the state covariances
         """

-        no_steps = filter_covars.shape[0] # number of elements in covariance matrix
+        no_steps = filter_covars.shape[0] - 1 # number of steps (minus the initial covariance)

         M = np.empty(filter_means.shape) # smoothed means
         P = np.empty(filter_covars.shape) # smoothed covars
+        #G = np.empty( (no_steps,state_dim,state_dim) ) # G from the update step of the smoother

         M[-1,:] = filter_means[-1,:]
         P[-1,:,:] = filter_covars[-1,:,:]
-        for k in range(no_steps-2,-1,-1):
+        for k in range(no_steps-1,-1,-1):
             m_pred, P_pred, tmp1, tmp2 = \
                 cls._kalman_prediction_step(k, filter_means[k,:],
                         filter_covars[k,:,:], p_a, p_f_A, p_f_Q,
                         calc_grad_log_likelihood=False)

-            m_upd, P_upd = cls._rts_smoother_update_step(k,
+            m_upd, P_upd, G_tmp = cls._rts_smoother_update_step(k,
                         filter_means[k,:] ,filter_covars[k,:,:], m_pred,
                         P_pred, M[k+1,:] ,P[k+1,:,:], p_f_A)

             M[k,:] = np.squeeze(m_upd)
             P[k,:,:] = P_upd
+            #G[k,:,:] = G_tmp.T # store transposed G.
         # Return values
-        return (M, P)
+        return (M, P) #, G)

+    @staticmethod
+    def _EM_gradient(A,Q,H,R,m_init,P_init,measurements, M, P, G, dA, dQ, dH, dR, dm_init, dP_init):
+        """
+        Gradient computation with the EM algorithm.
+
+        Input:
+        -----------------
+
+        M: means from the smoother
+        P: covariances from the smoother
+        G: gain matrices from the smoother
+        """
+        param_number = dA.shape[-1]
+
+        sample_no = measurements.shape[0]
+        P_1 = P[1:,:,:] # remove the 0-th step
+        P_2 = P[0:-1,:,:] # remove the last step
+
+        M_1 = M[1:,:] # remove the 0-th step
+        M_2 = M[0:-1,:] # remove the last step
+
+        Sigma = np.mean(P_1,axis=0) + np.dot(M_1.T, M_1) / sample_no
+        Phi = np.mean(P_2,axis=0) + np.dot(M_2.T, M_2) / sample_no
+
+        B = np.dot( measurements.T, M_1 )/ sample_no
+        C = (sp.einsum( 'ijk,ikl', P_1, G) + np.dot(M_1.T, M_2)) / sample_no
+
+#        C1 = np.zeros( (P_1.shape[1],P_1.shape[1]) )
+#        for k in range(P_1.shape[0]):
+#            C1 += np.dot(P_1[k,:,:],G[k,:,:]) + sp.outer( M_1[k,:], M_2[k,:] )
+#        C1 = C1 / sample_no
+
+        D = np.dot( measurements.T, measurements ) / sample_no
+
+        try:
+            P_init_inv = sp.linalg.inv(P_init)
+
+            if np.max( np.abs(P_init_inv)) > 10e13:
+                compute_P_init_terms = False
+            else:
+                compute_P_init_terms = True
+        except np.linalg.LinAlgError:
+            compute_P_init_terms = False
+
+        try:
+            Q_inv = sp.linalg.inv(Q)
+
+            if np.max( np.abs(Q_inv)) > 10e13:
+                compute_Q_terms = False
+            else:
+                compute_Q_terms = True
+        except np.linalg.LinAlgError:
+            compute_Q_terms = False
+
+        try:
+            R_inv = sp.linalg.inv(R)
+
+            if np.max( np.abs(R_inv)) > 10e13:
+                compute_R_terms = False
+            else:
+                compute_R_terms = True
+        except np.linalg.LinAlgError:
+            compute_R_terms = False
+
+        d_log_likelihood_update = np.zeros((param_number,1))
+        for j in range(param_number):
+            if compute_P_init_terms:
+                d_log_likelihood_update[j,:] -= 0.5 * np.sum(P_init_inv* dP_init[:,:,j].T ) #p #m
+
+                M0_smoothed = M[0]; M0_smoothed.shape = (M0_smoothed.shape[0],1)
+                tmp1 = np.dot( dP_init[:,:,j], np.dot( P_init_inv, (P[0,:,:] + sp.outer( (M0_smoothed - m_init), (M0_smoothed - m_init) )) ) ) #p #m
+                d_log_likelihood_update[j,:] += 0.5 * np.sum(P_init_inv* tmp1.T )
+
+                tmp2 = sp.outer( dm_init[:,j], M0_smoothed )
+                tmp2 += tmp2.T
+                d_log_likelihood_update[j,:] += 0.5 * np.sum(P_init_inv* tmp2.T )
+
+            if compute_Q_terms:
+
+                d_log_likelihood_update[j,:] -= sample_no/2.0 * np.sum(Q_inv* dQ[:,:,j].T ) #m
+
+                tmp1 = np.dot(C,A.T); tmp1 += tmp1.T; tmp1 = Sigma - tmp1 + np.dot(A, np.dot(Phi,A.T)) #m
+                tmp1 = np.dot( dQ[:,:,j], np.dot( Q_inv, tmp1) )
+                d_log_likelihood_update[j,:] += sample_no/2.0 * np.sum(Q_inv * tmp1.T)
+
+                tmp2 = np.dot( dA[:,:,j], C.T); tmp2 += tmp2.T;
+                tmp3 = np.dot(dA[:,:,j], np.dot(Phi,A.T)); tmp3 += tmp3.T
+                d_log_likelihood_update[j,:] -= sample_no/2.0 * np.sum(Q_inv.T * (tmp3 - tmp2) )
+
+            if compute_R_terms:
+                d_log_likelihood_update[j,:] -= sample_no/2.0 * np.sum(R_inv* dR[:,:,j].T )
+
+                tmp1 = np.dot(B,H.T); tmp1 += tmp1.T; tmp1 = D - tmp1 + np.dot(H, np.dot(Sigma,H.T))
+                tmp1 = np.dot( dR[:,:,j], np.dot( R_inv, tmp1) )
+                d_log_likelihood_update[j,:] += sample_no/2.0 * np.sum(R_inv * tmp1.T)
+
+                tmp2 = np.dot( dH[:,:,j], B.T); tmp2 += tmp2.T;
+                tmp3 = np.dot(dH[:,:,j], np.dot(Sigma,H.T)); tmp3 += tmp3.T
+                d_log_likelihood_update[j,:] -= sample_no/2.0 * np.sum(R_inv.T * (tmp3 - tmp2) )
+
+        return d_log_likelihood_update
+
     @staticmethod
     def _check_SS_matrix(p_M, state_dim, measurement_dim, which='A'):
         """
@@ -1300,6 +1758,102 @@ class DescreteStateSpace(object):
 class Struct(object):
     pass

+def inverse_square_root(R, tol=1e-14):
+    """
+    The function computes the square root of the matrix inverse.
+
+    Input:
+    ------------------
+    R - given matrix
+    tol - smallest singular value below which the inversion
+          of S must be handled specially.
+
+    Output:
+    -------------------
+    inv_square_root - square root of the inverse
+    """
+
+    if (R.shape[0] == 1): # matrix R is (1x1)
+        inv_square_root = np.sqrt(1.0/R) # inverse square root of a positive scalar
+    else:
+        (U,S,Vh) = sp.linalg.svd( R,full_matrices=False, compute_uv=True,
+                                  overwrite_a=False,check_finite=True)
+        if np.any(S < tol):
+            raise ValueError("""Inverse Square Root: Measurement noise matrix
+            is singular. Handling is not implemented.""")
+        inv_square_root = U * 1.0/np.sqrt(S)
+
+    return inv_square_root
+
+class R_handling():
+    """
+    The class handles the noise matrix R.
+    """
+    def __init__(self, R, index, R_time_var_index, unique_R_number):
+        """
+        Input:
+        ---------------
+        R - array with noise on various steps. The result of preprocessing
+            the noise input.
+
+        index - for each step of the Kalman filter contains the corresponding
+            index in the array.
+
+        unique_R_number - number of unique noise matrices below which square
+            roots are cached and above which they are computed each time.
+
+        R_time_var_index - another index into the array R. Computed earlier
+            and passed here.
+
+        Output:
+        --------------
+        Object which has two necessary functions:
+            f_R(k)
+            f_iRsr(k)
+        """
+        self.R = R
+        self.index = index
+        self.R_time_var_index = R_time_var_index
+
+        if (len(np.unique(index)) > unique_R_number):
+            self.svd_each_time = True
+        else:
+            self.svd_each_time = False

+        self.R_square_root = {}
+
+    def f_R(self,k):
+        return self.R[:,:, self.index[self.R_time_var_index, k]]
+
+    def f_iRsr(self, k):
+        """
+        Function returns the inverse square root of the R matrix on step k.
+        """
+        R = self.R[:,:, self.index[self.R_time_var_index, k]]
+
+        if (self.R.shape[0] == 1): # 1-D case is handled more simply: no caching
+                                   # of the result, just compute it each time.
+            inv_square_root = np.sqrt( 1.0/R )
+        else:
+            if self.svd_each_time:
+                (U,S,Vh) = sp.linalg.svd( R,full_matrices=False, compute_uv=True,
+                                          overwrite_a=False,check_finite=True)
+
+                inv_square_root = U * 1.0/np.sqrt(S)
+            else:
+                ind = self.index[self.R_time_var_index, k]
+                if ind in self.R_square_root:
+                    inv_square_root = self.R_square_root[ind]
+                else:
+                    (U,S,Vh) = sp.linalg.svd( R,full_matrices=False, compute_uv=True,
+                                              overwrite_a=False,check_finite=True)
+
+                    inv_square_root = U * 1.0/np.sqrt(S)
+
+                    self.R_square_root[ind] = inv_square_root
+
+        return inv_square_root
+
 class ContDescrStateSpace(DescreteStateSpace):
     """
     Class for continuous-discrete Kalman filter. State equation is
@@ -1370,6 +1924,7 @@ class ContDescrStateSpace(DescreteStateSpace):
            self.dAk = None
            self.dQk = None
+            self.Q_square_root_computed = False
            # !!!Print statistics! Which object is created

        def _recompute_for_new_k(self,k):
            """
@@ -1396,6 +1951,7 @@ class ContDescrStateSpace(DescreteStateSpace):
                self.Qk = Qk
                self.dAk = dAk
                self.dQk = dQk
+                self.Q_square_root_computed = False
            else:
                Ak = self.Ak
                Qk = self.Qk
@@ -1417,6 +1973,7 @@ class ContDescrStateSpace(DescreteStateSpace):
            self.last_k = 0
            self.last_k_computed = False
            self.compute_derivatives = compute_derivatives
+            self.Q_square_root_computed = False

            return self
@@ -1433,10 +1990,42 @@ class ContDescrStateSpace(DescreteStateSpace):
            return dAk

        def f_dQ(self, k):
-            Ak,Qk, dAk, dQk = self._recompute_for_new_k(k)
+            Ak,Qk, dAk, dQk = self._recompute_for_new_k(k)

            return dQk
+
+        def f_Qsr(self,k):
+            """
+            Square root of the noise matrix Q.
+            """
+            if ((self.last_k == k) and (self.last_k_computed == True)):
+                if not self.Q_square_root_computed:
+                    (U, S, Vh) = sp.linalg.svd( self.Qk, full_matrices=False, compute_uv=True, overwrite_a=False, check_finite=False)
+                    square_root = U * np.sqrt(S)
+                    self.Q_square_root_computed = True
+                    self.Q_square_root = square_root
+                else:
+                    square_root = self.Q_square_root
+            else:
+                raise ValueError("Square root of Q cannot be computed")
+
+            return square_root
+
+        def return_last(self):
+            """
+            Function returns the last computed matrices.
+            """
+            if not self.last_k_computed:
+                raise ValueError("Matrices are not computed.")
+            else:
+                k = self.last_k
+                A = self.Ak
+                Q = self.Qk
+                dA = self.dAk
+                dQ = self.dQk
+
+            return k, A, Q, dA, dQ
+
    class AQcompute_batch(object):
        """
        Class for calculating matrices A, Q, dA, dQ of the discrete Kalman Filter
@@ -1486,6 +2075,9 @@ class ContDescrStateSpace(DescreteStateSpace):
                        (self.dAs.nbytes if (self.dAs is not None) else 0) +\
                        (self.dQs.nbytes if (self.dQs is not None) else 0) +\
                        (self.reconstruct_indices.nbytes if (self.reconstruct_indices is not None) else 0)
+
+            self.Q_svd_dict = {}
+            self.last_k = None
            # !!!Print statistics! Which object is created
            # !!!Print statistics! Print sizes of matrices

        def reset(self, compute_derivatives):
            """
@@ -1497,21 +2089,61 @@ class ContDescrStateSpace(DescreteStateSpace):
            return self

        def f_A(self,k,m,P):
+            self.last_k = k
            return self.As[:,:, self.reconstruct_indices[k]]

        def f_Q(self,k):
+            self.last_k = k
            return self.Qs[:,:, self.reconstruct_indices[k]]

-        def f_dA(self,k):
+        def f_dA(self,k):
+            self.last_k = k
            return self.dAs[:,:, :, self.reconstruct_indices[k]]

        def f_dQ(self,k):
+            self.last_k = k
            return self.dQs[:,:, :, self.reconstruct_indices[k]]
-
+
+        def f_Qsr(self,k):
+            """
+            Square root of the noise matrix Q.
+            """
+            matrix_index = self.reconstruct_indices[k]
+            if matrix_index in self.Q_svd_dict:
+                square_root = self.Q_svd_dict[matrix_index]
+            else:
+                (U, S, Vh) = sp.linalg.svd( self.Qs[:,:, matrix_index],
+                                            full_matrices=False, compute_uv=True,
+                                            overwrite_a=False, check_finite=False)
+                square_root = U * np.sqrt(S)
+                self.Q_svd_dict[matrix_index] = square_root
+
+            return square_root
+
+        def return_last(self):
+            """
+            Function returns the last available matrices.
+ """ + + if (self.last_k is None): + raise ValueError("Matrices are not computed.") + else: + ind = self.reconstruct_indices[self.last_k] + A = self.As[:,:, ind] + Q = self.Qs[:,:, ind] + dA = self.dAs[:,:, :, ind] + dQ = self.dQs[:,:, :, ind] + + return self.last_k, A, Q, dA, dQ + @classmethod def cont_discr_kalman_filter(cls, F, L, Qc, p_H, p_R, P_inf, X, Y, index = None, - m_init=None, P_init=None, calc_log_likelihood=False, - calc_grad_log_likelihood=False, grad_params_no=None, grad_calc_params=None): + m_init=None, P_init=None, + p_kalman_filter_type='regular', + calc_log_likelihood=False, + calc_grad_log_likelihood=False, + grad_params_no=None, grad_calc_params=None): """ This function implements the continuous-discrete Kalman Filter algorithm These notations for the State-Space model are assumed: @@ -1594,7 +2226,13 @@ class ContDescrStateSpace(DescreteStateSpace): multiplied by this scalar. If None the unit matrix is used instead. "multiple time series mode" does not affect it, since it does not affect anything related to state variaces. - + + p_kalman_filter_type: string, one of ('regular', 'svd') + Which Kalman Filter is used. Regular or SVD. SVD is more numerically + stable, in particular, Covariace matrices are guarantied to be + positive semi-definite. However, 'svd' works slower, especially for + small data due to SVD call overhead. + calc_log_likelihood: boolean Whether to calculate marginal likelihood of the state-space model. @@ -1703,14 +2341,20 @@ class ContDescrStateSpace(DescreteStateSpace): if P_init is None: P_init = P_inf.copy() - + + + + if p_kalman_filter_type not in ('regular', 'svd'): + raise ValueError("Kalman filer type neither 'regular nor 'svd'.") + # Functions to pass to the kalman_filter algorithm: # Parameters: # k - number of Kalman filter iteration # m - vector for calculating matrices. Required for EKF. Not used here. 
f_h = lambda k,m,H: np.dot(H, m) f_H = lambda k,m,P: p_H[:,:, index[H_time_var_index, k]] - f_R = lambda k: p_R[:,:, index[R_time_var_index, k]] + #f_R = lambda k: p_R[:,:, index[R_time_var_index, k]] + o_R = R_handling( p_R, index, R_time_var_index, 20) if calc_grad_log_likelihood: @@ -1746,7 +2390,8 @@ class ContDescrStateSpace(DescreteStateSpace): print("General: run Continuos-Discrete Kalman Filter") # Also for dH, dR and probably for all derivatives (M, P, log_likelihood, grad_log_likelihood, AQcomp) = cls._cont_discr_kalman_filter_raw(state_dim,F, L, Qc, P_inf, - f_h, f_H, f_R, X, Y, m_init=m_init, P_init=P_init, calc_log_likelihood=calc_log_likelihood, + f_h, f_H, o_R, X, Y, m_init=m_init, P_init=P_init, p_kalman_filter_type=p_kalman_filter_type, + calc_log_likelihood=calc_log_likelihood, calc_grad_log_likelihood=calc_grad_log_likelihood, grad_params_no=grad_params_no, dP_inf=dP_inf, dF = dF, dQc=dQc, dH=dH, dR=dR, dm_init=dm_init, dP_init=dP_init) @@ -1768,8 +2413,10 @@ class ContDescrStateSpace(DescreteStateSpace): return (M, P, log_likelihood, grad_log_likelihood, AQcomp) @classmethod - def _cont_discr_kalman_filter_raw(cls,state_dim,F,L,Qc, P_inf, f_h, f_H, f_R, X, Y, - m_init=None, P_init=None, calc_log_likelihood=False, + def _cont_discr_kalman_filter_raw(cls,state_dim,F,L,Qc, P_inf, f_h, f_H, p_R, X, Y, + m_init=None, P_init=None, + p_kalman_filter_type='regular', + calc_log_likelihood=False, calc_grad_log_likelihood=False, grad_params_no=None, dP_inf=None, dF = None, dQc=None, dH=None, dR=None, dm_init=None, dP_init=None): """ @@ -1820,7 +2467,7 @@ class ContDescrStateSpace(DescreteStateSpace): m: point where Jacobian is evaluated, P: parameter for Jacobian, usually covariance matrix. - p_f_R: function (k). Returns noise matrix of measurement equation + p_f_R: function (k). Returns noise matrix of measurement equation on iteration k. k (iteration number). @@ -1835,6 +2482,12 @@ class ContDescrStateSpace(DescreteStateSpace): "multiple time series mode" does not affect it, since it does not affect anything related to state variaces. + p_kalman_filter_type: string, one of ('regular', 'svd') + Which Kalman Filter is used. Regular or SVD. SVD is more numerically + stable, in particular, Covariace matrices are guarantied to be + positive semi-definite. However, 'svd' works slower, especially for + small data due to SVD call overhead. + calc_log_likelihood: boolean Whether to calculate marginal likelihood of the state-space model. @@ -1850,7 +2503,7 @@ class ContDescrStateSpace(DescreteStateSpace): Necessary parameters for derivatives calculation. """ - + steps_no = Y.shape[0] # number of steps in the Kalman Filter if len(Y.shape) == 2: time_series_no = 1 # regular case @@ -1879,8 +2532,15 @@ class ContDescrStateSpace(DescreteStateSpace): M[0,:,:] = m_init # Initialize mean values # Variance estimations. Initial values will be included P = np.empty(((steps_no+1),state_dim,state_dim)) + P_init = 0.5*( P_init + P_init.T) # symmetrize initial covariance. 
In some unstable cases this is useful.
        P[0,:,:] = P_init # Initialize initial covariance matrix

+        if p_kalman_filter_type == 'svd':
+            (U,S,Vh) = sp.linalg.svd( P_init,full_matrices=False, compute_uv=True,
+                                      overwrite_a=False,check_finite=True)
+            S[ (S==0) ] = 1e-17 # allows the algorithm to run with a singular initial variance
+            P_upd = (P_init, S, U)

        #log_likelihood = 0
        #grad_log_likelihood = np.zeros((grad_params_no,1))
        log_likelihood = 0 if calc_log_likelihood else None
@@ -1898,14 +2558,40 @@ class ContDescrStateSpace(DescreteStateSpace):
                k_measurment = Y[k,:].T # measurement as column
            else: # multiple time series mode
                k_measurment = Y[k,:,:]
-
-            m_pred, P_pred, dm_pred, dP_pred = \
-                cls._kalman_prediction_step(k, M[k,:] ,P[k,:,:], f_a, AQcomp.f_A, AQcomp.f_Q,
-                    calc_grad_log_likelihood=calc_grad_log_likelihood,
-                    p_dm = dm_upd, p_dP = dP_upd, grad_calc_params_1 = (AQcomp.f_dA, AQcomp.f_dQ) )

-            m_upd, P_upd, log_likelihood_update, dm_upd, dP_upd, d_log_likelihood_update = \
-                cls._kalman_update_step(k, m_pred , P_pred, f_h, f_H, f_R, k_measurment,
+            if p_kalman_filter_type == 'svd':
+                m_pred, P_pred, dm_pred, dP_pred = \
+                    cls._kalman_prediction_step_SVD(k, M[k,:] ,P_upd, f_a, AQcomp.f_A, AQcomp.f_Q, AQcomp.f_Qsr,
+                        calc_grad_log_likelihood=calc_grad_log_likelihood,
+                        p_dm = dm_upd, p_dP = dP_upd, grad_calc_params_1 = (AQcomp.f_dA, AQcomp.f_dQ) )
+            else:
+                m_pred, P_pred, dm_pred, dP_pred = \
+                    cls._kalman_prediction_step(k, M[k,:] ,P[k,:,:], f_a, AQcomp.f_A, AQcomp.f_Q,
+                        calc_grad_log_likelihood=calc_grad_log_likelihood,
+                        p_dm = dm_upd, p_dP = dP_upd, grad_calc_params_1 = (AQcomp.f_dA, AQcomp.f_dQ) )
+
+            if p_kalman_filter_type == 'svd':
+                m_upd, P_upd, log_likelihood_update, dm_upd, dP_upd, d_log_likelihood_update = \
+                    cls._kalman_update_step_SVD(k, m_pred , P_pred, f_h, f_H, p_R.f_R, p_R.f_iRsr,
+                        k_measurment, calc_log_likelihood=calc_log_likelihood,
+                        calc_grad_log_likelihood=calc_grad_log_likelihood,
+                        p_dm = dm_pred, p_dP = dP_pred, grad_calc_params_2 = (dH, dR))
+
+#                m_upd, P_upd, log_likelihood_update, dm_upd, dP_upd, d_log_likelihood_update = \
+#                    cls._kalman_update_step(k, m_pred , P_pred[0], f_h, f_H, p_R.f_R, k_measurment,
+#                        calc_log_likelihood=calc_log_likelihood,
+#                        calc_grad_log_likelihood=calc_grad_log_likelihood,
+#                        p_dm = dm_pred, p_dP = dP_pred, grad_calc_params_2 = (dH, dR))
+#
+#                (U,S,Vh) = sp.linalg.svd( P_upd,full_matrices=False, compute_uv=True,
+#                                          overwrite_a=False,check_finite=True)
+#                P_upd = (P_upd, S, U)
+            else:
+                m_upd, P_upd, log_likelihood_update, dm_upd, dP_upd, d_log_likelihood_update = \
+                    cls._kalman_update_step(k, m_pred , P_pred, f_h, f_H, p_R.f_R, k_measurment,
                        calc_log_likelihood=calc_log_likelihood,
                        calc_grad_log_likelihood=calc_grad_log_likelihood,
                        p_dm = dm_pred, p_dP = dP_pred, grad_calc_params_2 = (dH, dR))
@@ -1920,8 +2606,11 @@ class ContDescrStateSpace(DescreteStateSpace):
                M[k+1,:] = np.squeeze(m_upd)
            else:
                M[k+1,:,:] = m_upd # separate mean value for each time series
-
-            P[k+1,:,:] = P_upd
+
+            if p_kalman_filter_type == 'svd':
+                P[k+1,:,:] = P_upd[0]
+            else:
+                P[k+1,:,:] = P_upd
            #print("kf it: %i" % k)
            # !!!Print statistics! Print sizes of matrices
            # !!!Print statistics!
Print iteration time base on another boolean variable @@ -1975,7 +2664,7 @@ class ContDescrStateSpace(DescreteStateSpace): f_a = lambda k,m,A: np.dot(A, m) # state dynamic model - no_steps = filter_covars.shape[0] # number + no_steps = filter_covars.shape[0]-1# number of steps (minus initial covariance) M = np.empty(filter_means.shape) # smoothed means P = np.empty(filter_covars.shape) # smoothed covars @@ -1985,14 +2674,14 @@ class ContDescrStateSpace(DescreteStateSpace): M[-1,:] = filter_means[-1,:] P[-1,:,:] = filter_covars[-1,:,:] - for k in range(no_steps-2,-1,-1): + for k in range(no_steps-1,-1,-1): m_pred, P_pred, tmp1, tmp2 = \ cls._kalman_prediction_step(k, filter_means[k,:], filter_covars[k,:,:], f_a, AQcomp.f_A, AQcomp.f_Q, calc_grad_log_likelihood=False) - m_upd, P_upd = cls._rts_smoother_update_step(k, + m_upd, P_upd, tmp_G = cls._rts_smoother_update_step(k, filter_means[k,:] ,filter_covars[k,:,:], m_pred, P_pred, M[(k+1),:] ,P[(k+1),:,:], AQcomp.f_A) @@ -2280,8 +2969,7 @@ def balance_matrix(A): gebal = sp.linalg.lapack.get_lapack_funcs('gebal',(A,)) bA, lo, hi, pivscale, info = gebal(A, permute=True, scale=True,overwrite_a=False) if info < 0: - raise ValueError('balance_matrix: Illegal value in %d-th argument of internal gebal ' % -info) - #import pdb; pdb.set_trace() + raise ValueError('balance_matrix: Illegal value in %d-th argument of internal gebal ' % -info) # calculating the similarity transforamtion: def perm_matr(D, c1,c2): """ @@ -2322,7 +3010,7 @@ def balance_matrix(A): #print( np.max(A - np.dot(T, np.dot(bA, T_inv) )) ) return bA.copy(), T, T_inv -def balance_ss_model(F,L,Qc,H,Pinf,dF=None,dQc=None,dPinf=None): +def balance_ss_model(F,L,Qc,H,Pinf,P0,dF=None,dQc=None,dPinf=None,dP0=None): """ Balances State-Space model for more numerical stability @@ -2353,6 +3041,7 @@ def balance_ss_model(F,L,Qc,H,Pinf,dF=None,dQc=None,dPinf=None): # bLL = np.dot(T_inv, LL) # bPinf = np.dot( bLL, bLL.T) + bP0 = np.dot(T_inv, np.dot(P0, T_inv.T)) if dF is not None: bdF = np.zeros(dF.shape) @@ -2376,7 +3065,17 @@ def balance_ss_model(F,L,Qc,H,Pinf,dF=None,dQc=None,dPinf=None): else: bdPinf = None - + + if dP0 is not None: + bdP0 = np.zeros(dP0.shape) + for i in range(dP0.shape[2]): + bdP0[:,:,i] = np.dot( T_inv, np.dot( dP0[:,:,i], T_inv.T)) + else: + bdP0 = None + + bdQc = dQc # not affected - return bF, bL, bQc, bH, bPinf, bdF, bdQc, bdPinf,T \ No newline at end of file + # (F,L,Qc,H,Pinf,P0,dF,dQc,dPinf,dP0) + + return bF, bL, bQc, bH, bPinf, bP0, bdF, bdQc, bdPinf, bdP0, T \ No newline at end of file diff --git a/GPy/models/state_space_new.py b/GPy/models/state_space_new.py index ca77f6b9..48675fdf 100644 --- a/GPy/models/state_space_new.py +++ b/GPy/models/state_space_new.py @@ -31,7 +31,7 @@ from .. import likelihoods from . 
import state_space_main as ssm class StateSpace(Model): - def __init__(self, X, Y, kernel=None, sigma2=1.0, name='StateSpace'): + def __init__(self, X, Y, kernel=None, noise_var=1.0, kalman_filter_type = 'regular', name='StateSpace'): super(StateSpace, self).__init__(name=name) self.num_data, input_dim = X.shape assert input_dim==1, "State space methods for time only" @@ -42,13 +42,15 @@ class StateSpace(Model): assert num_data_Y == self.num_data, "X and Y data don't match" assert self.output_dim == 1, "State space methods for single outputs only" + self.kalman_filter_type = kalman_filter_type + # Make sure the observations are ordered in time sort_index = np.argsort(X[:,0]) self.X = X[sort_index] self.Y = Y[sort_index] # Noise variance - self.likelihood = likelihoods.Gaussian() + self.likelihood = likelihoods.Gaussian(variance=noise_var) # Default kernel if kernel is None: @@ -68,6 +70,7 @@ class StateSpace(Model): """ Parameters have now changed """ + # Get the model matrices from the kernel (F,L,Qc,H,P_inf, P0, dFt,dQct,dP_inft, dP0t) = self.kern.sde() @@ -92,9 +95,10 @@ class StateSpace(Model): dR = np.zeros([measurement_dim,measurement_dim,grad_params_no]) dR[:,:,-1] = np.eye(measurement_dim) - #(F,L,Qc,H,P_inf,dF,dQc,dP_inf) = ssm.balance_ss_model(F,L,Qc,H,P_inf,dF,dQc,dP_inf) - # Use the Kalman filter to evaluate the likelihood + # Balancing + #(F,L,Qc,H,P_inf,P0, dF,dQc,dP_inf,dP0) = ssm.balance_ss_model(F,L,Qc,H,P_inf,P0, dF,dQc,dP_inf, dP0) + # Use the Kalman filter to evaluate the likelihood grad_calc_params = {} grad_calc_params['dP_inf'] = dP_inf grad_calc_params['dF'] = dF @@ -102,10 +106,12 @@ class StateSpace(Model): grad_calc_params['dR'] = dR grad_calc_params['dP_init'] = dP0 + kalman_filter_type = self.kalman_filter_type + (filter_means, filter_covs, log_likelihood, grad_log_likelihood,SmootherMatrObject) = ssm.ContDescrStateSpace.cont_discr_kalman_filter(F,L,Qc,H, float(self.Gaussian_noise.variance),P_inf,self.X,self.Y,m_init=None, - P_init=P0, calc_log_likelihood=True, + P_init=P0, p_kalman_filter_type = kalman_filter_type, calc_log_likelihood=True, calc_grad_log_likelihood=True, grad_params_no=grad_params_no, grad_calc_params=grad_calc_params) @@ -120,7 +126,7 @@ class StateSpace(Model): def log_likelihood(self): return self._log_marginal_likelihood - def _raw_predict(self, Xnew, Ynew=None, filteronly=False): + def _raw_predict(self, Xnew=None, Ynew=None, filteronly=False): """ Performs the actual prediction for new X points. Inner function. It is called only from inside this class. 
@@ -154,9 +160,15 @@ class StateSpace(Model): Ynew = self.Y # Make a single matrix containing training and testing points - X = np.vstack((self.X, Xnew)) - Y = np.vstack((Ynew, np.nan*np.zeros(Xnew.shape))) - + if Xnew is not None: + X = np.vstack((self.X, Xnew)) + Y = np.vstack((Ynew, np.nan*np.zeros(Xnew.shape))) + predict_only_training = False + else: + X = self.X + Y = Ynew + predict_only_training = True + # Sort the matrix (save the order) _, return_index, return_inverse = np.unique(X,True,True) X = X[return_index] # TODO they are not used @@ -170,10 +182,14 @@ class StateSpace(Model): #Y = self.Y[:, 0,0] # Run the Kalman filter #import pdb; pdb.set_trace() + + kalman_filter_type = self.kalman_filter_type + (M, P, log_likelihood, grad_log_likelihood,SmootherMatrObject) = ssm.ContDescrStateSpace.cont_discr_kalman_filter( F,L,Qc,H,float(self.Gaussian_noise.variance),P_inf,self.X,Y,m_init=None, - P_init=P0, calc_log_likelihood=False, + P_init=P0, p_kalman_filter_type = kalman_filter_type, + calc_log_likelihood=False, calc_grad_log_likelihood=False) # Run the Rauch-Tung-Striebel smoother if not filteronly: @@ -189,8 +205,9 @@ class StateSpace(Model): P = P[return_inverse,:,:] # Only return the values for Xnew - M = M[self.num_data:,:] - P = P[self.num_data:,:,:] + if not predict_only_training: + M = M[self.num_data:,:] + P = P[self.num_data:,:,:] # Calculate the mean and variance m = np.dot(M,H.T) @@ -201,7 +218,7 @@ class StateSpace(Model): # Return the posterior of the state return (m, V) - def predict(self, Xnew, filteronly=False): + def predict(self, Xnew=None, filteronly=False): # Run the Kalman filter to get the state (m, V) = self._raw_predict(Xnew,filteronly=filteronly) @@ -216,7 +233,7 @@ class StateSpace(Model): # Return mean and variance return (m, V, lower, upper) - def predict_quantiles(self, Xnew, quantiles=(2.5, 97.5)): + def predict_quantiles(self, Xnew=None, quantiles=(2.5, 97.5)): mu, var = self._raw_predict(Xnew) #import pdb; pdb.set_trace() return [stats.norm.ppf(q/100.)*np.sqrt(var + float(self.Gaussian_noise.variance)) + mu for q in quantiles]
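
Usage sketch for the new `kalman_filter_type` switch introduced above. This is illustrative only and not part of the patch: the import paths simply follow the files touched by this commit, the toy data and the kernel/noise settings are made up, and it assumes `StateSpace` and `sde_Matern32` are importable from these modules in the GPy tree this patch applies to.

    import numpy as np
    from GPy.kern._src.sde_matern import sde_Matern32       # kernel with an exact SDE form
    from GPy.models.state_space_new import StateSpace       # model modified by this patch

    np.random.seed(0)

    # Toy 1-D time series: a noisy sine observed at random, sorted time points.
    t = np.sort(np.random.uniform(0.0, 10.0, (300, 1)), axis=0)
    y = np.sin(t) + 0.1 * np.random.randn(300, 1)

    kernel = sde_Matern32(input_dim=1)

    # 'svd' selects the new square-root (SVD) Kalman filter;
    # 'regular' keeps the original covariance-form filter.
    model = StateSpace(t, y, kernel=kernel, noise_var=0.1,
                       kalman_filter_type='svd')
    # model.optimize()  # gradients go through the same SVD code path

    print(model.log_likelihood())
    m, V, lower, upper = model.predict(np.linspace(0.0, 10.0, 50)[:, None])

As the docstrings above note, the SVD variant pays an extra SVD per prediction and update step, so on short series the regular filter is usually faster; the payoff is numerical robustness, since the factored covariances stay positive semi-definite even when the state covariance is nearly singular.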