diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 01190101..95a4c220 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -194,7 +194,7 @@ class GP(Model):
     # Make sure to name this variable and the predict functions will "just work"
     # In maths the predictive variable is:
     #         K_{xx} - K_{xp}W_{pp}^{-1}K_{px}
-    #         W_{pp} := \texttt{Woodbury inv}
+    #         W_{pp} := \\texttt{Woodbury inv}
     #         p := _predictive_variable
 
     @property
@@ -298,7 +298,7 @@ class GP(Model):
         .. math::
             p(f*|X*, X, Y) = \\int^{\\inf}_{\\inf} p(f*|f,X*)p(f|X,Y) df
                         = N(f*| K_{x*x}(K_{xx} + \\Sigma)^{-1}Y, K_{x*x*} - K_{xx*}(K_{xx} + \\Sigma)^{-1}K_{xx*}
-            \\Sigma := \texttt{Likelihood.variance / Approximate likelihood covariance}
+            \\Sigma := \\texttt{Likelihood.variance / Approximate likelihood covariance}
         """
         mu, var = self.posterior._raw_predict(kern=self.kern if kern is None else kern, Xnew=Xnew, pred_var=self._predictive_variable, full_cov=full_cov)
         if self.mean_function is not None:
diff --git a/GPy/inference/latent_function_inference/expectation_propagation.py b/GPy/inference/latent_function_inference/expectation_propagation.py
index 2a558b5b..df27d7db 100644
--- a/GPy/inference/latent_function_inference/expectation_propagation.py
+++ b/GPy/inference/latent_function_inference/expectation_propagation.py
@@ -134,10 +134,10 @@ class posteriorParams(posteriorParamsBase):
         B = np.eye(num_data) + Sroot_tilde_K * tau_tilde_root[None,:]
         L = jitchol(B)
         V, _ = dtrtrs(L, Sroot_tilde_K, lower=1)
-        Sigma = K - np.dot(V.T,V) #K - KS^(1/2)BS^(1/2)K = (K^(-1) + \Sigma^(-1))^(-1)
+        Sigma = K - np.dot(V.T,V) #K - KS^(1/2)BS^(1/2)K = (K^(-1) + \\Sigma^(-1))^(-1)
 
         aux_alpha , _ = dpotrs(L, tau_tilde_root * (np.dot(K, ga_approx.v) + mean_prior), lower=1)
-        alpha = ga_approx.v - tau_tilde_root * aux_alpha #(K + Sigma^(\tilde))^(-1) (/mu^(/tilde) - /mu_p)
+        alpha = ga_approx.v - tau_tilde_root * aux_alpha #(K + Sigma^(\\tilde))^(-1) (/mu^(/tilde) - /mu_p)
         mu = np.dot(K, alpha) + mean_prior
 
         return posteriorParams(mu=mu, Sigma=Sigma, L=L)
@@ -151,8 +151,8 @@ class posteriorParamsDTC(posteriorParamsBase):
         DSYR(LLT,Kmn[:,i].copy(),delta_tau)
         L = jitchol(LLT)
         V,info = dtrtrs(L,Kmn,lower=1)
-        self.Sigma_diag = np.maximum(np.sum(V*V,-2), np.finfo(float).eps)  #diag(K_nm (L L^\top)^(-1)) K_mn
-        si = np.sum(V.T*V[:,i],-1) #(V V^\top)[:,i]
+        self.Sigma_diag = np.maximum(np.sum(V*V,-2), np.finfo(float).eps)  #diag(K_nm (L L^\\top)^(-1)) K_mn
+        si = np.sum(V.T*V[:,i],-1) #(V V^\\top)[:,i]
         self.mu += (delta_v-delta_tau*self.mu[i])*si
         #mu = np.dot(Sigma, v_tilde)
 
@@ -391,11 +391,11 @@ class EP(EPBase, ExactGaussianInference):
 
 
         aux_alpha , _ = dpotrs(post_params.L, tau_tilde_root * (np.dot(K, ga_approx.v) +  mean_prior), lower=1)
-        alpha = (ga_approx.v - tau_tilde_root * aux_alpha)[:,None] #(K + Sigma^(\tilde))^(-1) (/mu^(/tilde) -  /mu_p)
+        alpha = (ga_approx.v - tau_tilde_root * aux_alpha)[:,None] #(K + Sigma^(\\tilde))^(-1) (/mu^(/tilde) -  /mu_p)
 
         LWi, _ = dtrtrs(post_params.L, np.diag(tau_tilde_root), lower=1)
         Wi = np.dot(LWi.T,LWi)
-        symmetrify(Wi) #(K + Sigma^(\tilde))^(-1)
+        symmetrify(Wi) #(K + Sigma^(\\tilde))^(-1)
 
         dL_dK = 0.5 * (tdot(alpha) - Wi)
         dL_dthetaL = likelihood.ep_gradients(Y, cav_params.tau, cav_params.v, np.diag(dL_dK), Y_metadata=Y_metadata, quad_mode='gh')
@@ -530,7 +530,7 @@ class EPDTC(EPBase, VarDTC):
         #initial values - Gaussian factors
         #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
         LLT0 = Kmm.copy()
-        Lm = jitchol(LLT0) #K_m = L_m L_m^\top
+        Lm = jitchol(LLT0) #K_m = L_m L_m^\\top
         Vm,info = dtrtrs(Lm, Kmn,lower=1)
         # Lmi = dtrtri(Lm)
         # Kmmi = np.dot(Lmi.T,Lmi)
diff --git a/GPy/inference/latent_function_inference/pep.py b/GPy/inference/latent_function_inference/pep.py
index 79706292..cd4ac0cc 100644
--- a/GPy/inference/latent_function_inference/pep.py
+++ b/GPy/inference/latent_function_inference/pep.py
@@ -8,14 +8,14 @@ log_2_pi = np.log(2*np.pi)
 class PEP(LatentFunctionInference):
     '''
     Sparse Gaussian processes using Power-Expectation Propagation
-    for regression: alpha \approx 0 gives VarDTC and alpha = 1 gives FITC
-    
-    Reference: A Unifying Framework for Sparse Gaussian Process Approximation using 
+    for regression: alpha \\approx 0 gives VarDTC and alpha = 1 gives FITC
+
+    Reference: A Unifying Framework for Sparse Gaussian Process Approximation using
     Power Expectation Propagation, https://arxiv.org/abs/1605.07066
-    
+
     '''
     const_jitter = 1e-6
-    
+
     def __init__(self, alpha):
         super(PEP, self).__init__()
         self.alpha = alpha
@@ -69,7 +69,7 @@ class PEP(LatentFunctionInference):
         #compute dL_dR
         Uv = np.dot(U, v)
         dL_dR = 0.5*(np.sum(U*np.dot(U,P), 1) - (1.0+alpha_const_term)/beta_star + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1) \
-            + np.sum(np.square(Uv), 1))*beta_star**2 
+            + np.sum(np.square(Uv), 1))*beta_star**2
 
         # Compute dL_dKmm
         vvT_P = tdot(v.reshape(-1,1)) + P
diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py
index e3cea7ea..e0de2aaa 100644
--- a/GPy/inference/latent_function_inference/posterior.py
+++ b/GPy/inference/latent_function_inference/posterior.py
@@ -82,7 +82,7 @@ class Posterior(object):
         Posterior mean
         $$
         K_{xx}v
-        v := \texttt{Woodbury vector}
+        v := \\texttt{Woodbury vector}
         $$
         """
         if self._mean is None:
@@ -95,7 +95,7 @@ class Posterior(object):
         Posterior covariance
         $$
         K_{xx} - K_{xx}W_{xx}^{-1}K_{xx}
-        W_{xx} := \texttt{Woodbury inv}
+        W_{xx} := \\texttt{Woodbury inv}
         $$
         """
         if self._covariance is None:
@@ -146,8 +146,8 @@ class Posterior(object):
         """
         return $L_{W}$ where L is the lower triangular Cholesky decomposition of the Woodbury matrix
         $$
-        L_{W}L_{W}^{\top} = W^{-1}
-        W^{-1} := \texttt{Woodbury inv}
+        L_{W}L_{W}^{\\top} = W^{-1}
+        W^{-1} := \\texttt{Woodbury inv}
         $$
         """
         if self._woodbury_chol is None:
@@ -179,7 +179,7 @@ class Posterior(object):
         The inverse of the woodbury matrix, in the gaussian likelihood case it is defined as
         $$
         (K_{xx} + \\Sigma_{xx})^{-1}
-        \\Sigma_{xx} := \texttt{Likelihood.variance / Approximate likelihood covariance}
+        \\Sigma_{xx} := \\texttt{Likelihood.variance / Approximate likelihood covariance}
         $$
         """
         if self._woodbury_inv is None:
@@ -201,7 +201,7 @@ class Posterior(object):
         Woodbury vector in the gaussian likelihood case only is defined as
         $$
         (K_{xx} + \\Sigma)^{-1}Y
-        \\Sigma := \texttt{Likelihood.variance / Approximate likelihood covariance}
+        \\Sigma := \\texttt{Likelihood.variance / Approximate likelihood covariance}
         $$
         """
         if self._woodbury_vector is None:
diff --git a/GPy/kern/src/coregionalize.py b/GPy/kern/src/coregionalize.py
index aaaeae72..a7c4f6f7 100644
--- a/GPy/kern/src/coregionalize.py
+++ b/GPy/kern/src/coregionalize.py
@@ -19,18 +19,18 @@ except ImportError:
 
 
 class Coregionalize(Kern):
-    r"""
+    """
     Covariance function for intrinsic/linear coregionalization models
 
     This covariance has the form:
 
     .. math::
-       \mathbf{B} = \mathbf{W}\mathbf{W}^\intercal + \mathrm{diag}(kappa)
+       \\mathbf{B} = \\mathbf{W}\\mathbf{W}^\\intercal + \\mathrm{diag}(kappa)
 
     An intrinsic/linear coregionalization covariance function of the form:
 
     .. math::
-       k_2(x, y)=\mathbf{B} k(x, y)
+       k_2(x, y)=\\mathbf{B} k(x, y)
 
     it is obtained as the tensor product between a covariance function
     k(x, y) and B.
diff --git a/GPy/kern/src/eq_ode1.py b/GPy/kern/src/eq_ode1.py
index 69b5ec54..caedc7a3 100644
--- a/GPy/kern/src/eq_ode1.py
+++ b/GPy/kern/src/eq_ode1.py
@@ -15,7 +15,7 @@ class EQ_ODE1(Kern):
 
     This outputs of this kernel have the form
     .. math::
-       \frac{\text{d}y_j}{\text{d}t} = \\sum_{i=1}^R w_{j,i} u_i(t-\\delta_j) - d_jy_j(t)
+       \\frac{\\text{d}y_j}{\\text{d}t} = \\sum_{i=1}^R w_{j,i} u_i(t-\\delta_j) - d_jy_j(t)
 
     where :math:`R` is the rank of the system, :math:`w_{j,i}` is the sensitivity of the :math:`j`th output to the :math:`i`th latent function, :math:`d_j` is the decay rate of the :math:`j`th output and :math:`u_i(t)` are independent latent Gaussian processes goverened by an exponentiated quadratic covariance.
 
diff --git a/GPy/kern/src/eq_ode2.py b/GPy/kern/src/eq_ode2.py
index e2f251d7..e809b151 100644
--- a/GPy/kern/src/eq_ode2.py
+++ b/GPy/kern/src/eq_ode2.py
@@ -15,7 +15,7 @@ class EQ_ODE2(Kern):
 
     This outputs of this kernel have the form
     .. math::
-       \frac{\text{d}^2y_j(t)}{\text{d}^2t} + C_j\frac{\text{d}y_j(t)}{\text{d}t} + B_jy_j(t) = \\sum_{i=1}^R w_{j,i} u_i(t)
+       \\frac{\\text{d}^2y_j(t)}{\\text{d}^2t} + C_j\\frac{\\text{d}y_j(t)}{\\text{d}t} + B_jy_j(t) = \\sum_{i=1}^R w_{j,i} u_i(t)
 
     where :math:`R` is the rank of the system, :math:`w_{j,i}` is the sensitivity of the :math:`j`th output to the :math:`i`th latent function, :math:`d_j` is the decay rate of the :math:`j`th output and :math:`f_i(t)` and :math:`g_i(t)` are independent latent Gaussian processes goverened by an exponentiated quadratic covariance.
 
diff --git a/GPy/kern/src/linear.py b/GPy/kern/src/linear.py
index 6106a671..1caf93f6 100644
--- a/GPy/kern/src/linear.py
+++ b/GPy/kern/src/linear.py
@@ -121,7 +121,7 @@ class Linear(Kern):
         the returned array is of shape [NxNxQxQ].
 
         ..math:
-            \frac{\\partial^2 K}{\\partial X2 ^2} = - \frac{\\partial^2 K}{\\partial X\\partial X2}
+            \\frac{\\partial^2 K}{\\partial X2 ^2} = - \\frac{\\partial^2 K}{\\partial X\\partial X2}
 
         ..returns:
             dL2_dXdX2:  [NxMxQxQ] for X [NxQ] and X2[MxQ] (X2 is X if, X2 is None)
diff --git a/GPy/kern/src/sde_matern.py b/GPy/kern/src/sde_matern.py
index f896957e..de321f58 100644
--- a/GPy/kern/src/sde_matern.py
+++ b/GPy/kern/src/sde_matern.py
@@ -11,15 +11,15 @@ import numpy as np
 
 class sde_Matern32(Matern32):
     """
-    
+
     Class provide extra functionality to transfer this covariance function into
     SDE forrm.
-    
+
     Matern 3/2 kernel:
 
     .. math::
 
-       k(r) = \\sigma^2 (1 + \\sqrt{3} r) \\exp(- \\sqrt{3} r) \\ \\ \\ \\  \text{ where  } r = \\sqrt{\\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\\ell_i^2} }
+       k(r) = \\sigma^2 (1 + \\sqrt{3} r) \\exp(- \\sqrt{3} r) \\ \\ \\ \\  \\text{ where  } r = \\sqrt{\\sum_{i=1}^{input dim} \\frac{(x_i-y_i)^2}{\\ell_i^2} }
 
     """
     def sde_update_gradient_full(self, gradients):
@@ -27,59 +27,59 @@ class sde_Matern32(Matern32):
         Update gradient in the order in which parameters are represented in the
         kernel
         """
-    
+
         self.variance.gradient = gradients[0]
         self.lengthscale.gradient = gradients[1]
-        
-    def sde(self): 
-        """ 
-        Return the state space representation of the covariance. 
-        """ 
-        
+
+    def sde(self):
+        """
+        Return the state space representation of the covariance.
+        """
+
         variance = float(self.variance.values)
         lengthscale = float(self.lengthscale.values)
-        
-        foo  = np.sqrt(3.)/lengthscale 
-        F    = np.array(((0, 1.0), (-foo**2, -2*foo))) 
+
+        foo  = np.sqrt(3.)/lengthscale
+        F    = np.array(((0, 1.0), (-foo**2, -2*foo)))
         L    = np.array(( (0,), (1.0,) ))
-        Qc   = np.array(((12.*np.sqrt(3) / lengthscale**3 * variance,),)) 
-        H    = np.array(((1.0, 0),)) 
+        Qc   = np.array(((12.*np.sqrt(3) / lengthscale**3 * variance,),))
+        H    = np.array(((1.0, 0),))
         Pinf = np.array(((variance, 0.0), (0.0, 3.*variance/(lengthscale**2))))
         P0 = Pinf.copy()
-        
-        # Allocate space for the derivatives 
+
+        # Allocate space for the derivatives
         dF    = np.empty([F.shape[0],F.shape[1],2])
-        dQc   = np.empty([Qc.shape[0],Qc.shape[1],2]) 
-        dPinf = np.empty([Pinf.shape[0],Pinf.shape[1],2]) 
-        # The partial derivatives 
-        dFvariance       = np.zeros((2,2)) 
-        dFlengthscale    = np.array(((0,0), (6./lengthscale**3,2*np.sqrt(3)/lengthscale**2))) 
-        dQcvariance      = np.array((12.*np.sqrt(3)/lengthscale**3)) 
-        dQclengthscale   = np.array((-3*12*np.sqrt(3)/lengthscale**4*variance)) 
-        dPinfvariance    = np.array(((1,0),(0,3./lengthscale**2))) 
-        dPinflengthscale = np.array(((0,0), (0,-6*variance/lengthscale**3))) 
-        # Combine the derivatives 
-        dF[:,:,0]    = dFvariance 
-        dF[:,:,1]    = dFlengthscale 
-        dQc[:,:,0]   = dQcvariance 
-        dQc[:,:,1]   = dQclengthscale 
-        dPinf[:,:,0] = dPinfvariance 
-        dPinf[:,:,1] = dPinflengthscale 
+        dQc   = np.empty([Qc.shape[0],Qc.shape[1],2])
+        dPinf = np.empty([Pinf.shape[0],Pinf.shape[1],2])
+        # The partial derivatives
+        dFvariance       = np.zeros((2,2))
+        dFlengthscale    = np.array(((0,0), (6./lengthscale**3,2*np.sqrt(3)/lengthscale**2)))
+        dQcvariance      = np.array((12.*np.sqrt(3)/lengthscale**3))
+        dQclengthscale   = np.array((-3*12*np.sqrt(3)/lengthscale**4*variance))
+        dPinfvariance    = np.array(((1,0),(0,3./lengthscale**2)))
+        dPinflengthscale = np.array(((0,0), (0,-6*variance/lengthscale**3)))
+        # Combine the derivatives
+        dF[:,:,0]    = dFvariance
+        dF[:,:,1]    = dFlengthscale
+        dQc[:,:,0]   = dQcvariance
+        dQc[:,:,1]   = dQclengthscale
+        dPinf[:,:,0] = dPinfvariance
+        dPinf[:,:,1] = dPinflengthscale
         dP0 = dPinf.copy()
-        
+
         return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
 
 class sde_Matern52(Matern52):
     """
-    
+
     Class provide extra functionality to transfer this covariance function into
     SDE forrm.
-    
+
     Matern 5/2 kernel:
 
     .. math::
 
-       k(r) = \\sigma^2 (1 + \\sqrt{5} r + \frac{5}{3}r^2) \\exp(- \\sqrt{5} r) \\ \\ \\ \\  \text{ where  } r = \\sqrt{\\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\\ell_i^2} }
+       k(r) = \\sigma^2 (1 + \\sqrt{5} r + \\frac{5}{3}r^2) \\exp(- \\sqrt{5} r) \\ \\ \\ \\  \\text{ where  } r = \\sqrt{\\sum_{i=1}^{input dim} \\frac{(x_i-y_i)^2}{\\ell_i^2} }
 
     """
     def sde_update_gradient_full(self, gradients):
@@ -87,51 +87,51 @@ class sde_Matern52(Matern52):
         Update gradient in the order in which parameters are represented in the
         kernel
         """
-    
+
         self.variance.gradient = gradients[0]
         self.lengthscale.gradient = gradients[1]
-        
-    def sde(self): 
-        """ 
-        Return the state space representation of the covariance. 
-        """ 
-        
+
+    def sde(self):
+        """
+        Return the state space representation of the covariance.
+        """
+
         variance = float(self.variance.values)
         lengthscale = float(self.lengthscale.values)
 
         lamda = np.sqrt(5.0)/lengthscale
-        kappa = 5.0/3.0*variance/lengthscale**2        
-        
+        kappa = 5.0/3.0*variance/lengthscale**2
+
         F = np.array(((0, 1,0), (0, 0, 1), (-lamda**3, -3.0*lamda**2, -3*lamda)))
         L = np.array(((0,),(0,),(1,)))
         Qc = np.array((((variance*400.0*np.sqrt(5.0)/3.0/lengthscale**5),),))
-        H = np.array(((1,0,0),))        
-        
+        H = np.array(((1,0,0),))
+
         Pinf = np.array(((variance,0,-kappa), (0, kappa, 0), (-kappa, 0, 25.0*variance/lengthscale**4)))
         P0 = Pinf.copy()
-        # Allocate space for the derivatives         
-        dF = np.empty((3,3,2))        
-        dQc = np.empty((1,1,2))        
+        # Allocate space for the derivatives
+        dF = np.empty((3,3,2))
+        dQc = np.empty((1,1,2))
         dPinf = np.empty((3,3,2))
-        
-         # The partial derivatives 
+
+         # The partial derivatives
         dFvariance = np.zeros((3,3))
-        dFlengthscale = np.array(((0,0,0),(0,0,0),(15.0*np.sqrt(5.0)/lengthscale**4, 
+        dFlengthscale = np.array(((0,0,0),(0,0,0),(15.0*np.sqrt(5.0)/lengthscale**4,
                                    30.0/lengthscale**3, 3*np.sqrt(5.0)/lengthscale**2)))
         dQcvariance = np.array((((400*np.sqrt(5)/3/lengthscale**5,),)))
-        dQclengthscale = np.array((((-variance*2000*np.sqrt(5)/3/lengthscale**6,),)))        
-        
+        dQclengthscale = np.array((((-variance*2000*np.sqrt(5)/3/lengthscale**6,),)))
+
         dPinf_variance = Pinf/variance
         kappa2 = -2.0*kappa/lengthscale
-        dPinf_lengthscale = np.array(((0,0,-kappa2),(0,kappa2,0),(-kappa2, 
-                                    0,-100*variance/lengthscale**5)))        
-        # Combine the derivatives 
+        dPinf_lengthscale = np.array(((0,0,-kappa2),(0,kappa2,0),(-kappa2,
+                                    0,-100*variance/lengthscale**5)))
+        # Combine the derivatives
         dF[:,:,0] = dFvariance
-        dF[:,:,1] = dFlengthscale        
-        dQc[:,:,0] = dQcvariance         
-        dQc[:,:,1] = dQclengthscale        
+        dF[:,:,1] = dFlengthscale
+        dQc[:,:,0] = dQcvariance
+        dQc[:,:,1] = dQclengthscale
         dPinf[:,:,0] = dPinf_variance
         dPinf[:,:,1] = dPinf_lengthscale
         dP0 = dPinf.copy()
-        
-        return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)  
\ No newline at end of file
+
+        return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
\ No newline at end of file
diff --git a/GPy/kern/src/sde_standard_periodic.py b/GPy/kern/src/sde_standard_periodic.py
index cf6ab30d..add45936 100644
--- a/GPy/kern/src/sde_standard_periodic.py
+++ b/GPy/kern/src/sde_standard_periodic.py
@@ -24,8 +24,8 @@ class sde_StdPeriodic(StdPeriodic):
 
     .. math::
 
-       k(x,y) = \theta_1 \\exp \\left[  - \frac{1}{2} {}\\sum_{i=1}^{input\\_dim}
-       \\left( \frac{\\sin(\frac{\\pi}{\\lambda_i} (x_i - y_i) )}{l_i} \right)^2 \right] }
+       k(x,y) = \\theta_1 \\exp \\left[  - \\frac{1}{2} {}\\sum_{i=1}^{input\\_dim}
+       \\left( \\frac{\\sin(\\frac{\\pi}{\\lambda_i} (x_i - y_i) )}{l_i} \\right)^2 \\right] }
 
     """
 
@@ -177,7 +177,7 @@ def seriescoeff(m=6, lengthScale=1.0, magnSigma2=1.0, true_covariance=False):
     Calculate the coefficients q_j^2 for the covariance function
     approximation:
 
-        k(\tau) =  \\sum_{j=0}^{+\\infty} q_j^2 \\cos(j\\omega_0 \tau)
+        k(\\tau) =  \\sum_{j=0}^{+\\infty} q_j^2 \\cos(j\\omega_0 \\tau)
 
     Reference is:
 
diff --git a/GPy/kern/src/sde_static.py b/GPy/kern/src/sde_static.py
index 93a13b3e..c61660a9 100644
--- a/GPy/kern/src/sde_static.py
+++ b/GPy/kern/src/sde_static.py
@@ -12,63 +12,63 @@ import numpy as np
 
 class sde_White(White):
     """
-    
+
     Class provide extra functionality to transfer this covariance function into
     SDE forrm.
-    
+
     White kernel:
 
     .. math::
 
-       k(x,y) = \alpha*\\delta(x-y)
+       k(x,y) = \\alpha*\\delta(x-y)
 
     """
-    
+
     def sde_update_gradient_full(self, gradients):
         """
         Update gradient in the order in which parameters are represented in the
         kernel
         """
-    
+
         self.variance.gradient = gradients[0]
-        
-    def sde(self): 
-        """ 
-        Return the state space representation of the covariance. 
-        """ 
-        
-        variance = float(self.variance.values) 
-        
+
+    def sde(self):
+        """
+        Return the state space representation of the covariance.
+        """
+
+        variance = float(self.variance.values)
+
         F = np.array( ((-np.inf,),) )
         L = np.array( ((1.0,),)  )
         Qc = np.array( ((variance,),)  )
         H = np.array( ((1.0,),) )
-        
+
         Pinf   = np.array( ((variance,),)  )
-        P0 = Pinf.copy()     
-        
+        P0 = Pinf.copy()
+
         dF = np.zeros((1,1,1))
         dQc = np.zeros((1,1,1))
         dQc[:,:,0]    = np.array( ((1.0,),) )
-        
+
         dPinf = np.zeros((1,1,1))
         dPinf[:,:,0] = np.array( ((1.0,),) )
         dP0 = dPinf.copy()
-        
+
         return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
 
 
 class sde_Bias(Bias):
     """
-    
+
     Class provide extra functionality to transfer this covariance function into
     SDE forrm.
-    
+
     Bias kernel:
 
     .. math::
 
-       k(x,y) = \alpha
+       k(x,y) = \\alpha
 
     """
     def sde_update_gradient_full(self, gradients):
@@ -76,28 +76,28 @@ class sde_Bias(Bias):
         Update gradient in the order in which parameters are represented in the
         kernel
         """
-    
+
         self.variance.gradient = gradients[0]
-        
-    def sde(self): 
-        """ 
-        Return the state space representation of the covariance. 
-        """ 
-        variance = float(self.variance.values) 
-        
+
+    def sde(self):
+        """
+        Return the state space representation of the covariance.
+        """
+        variance = float(self.variance.values)
+
         F = np.array( ((0.0,),))
         L = np.array( ((1.0,),))
         Qc = np.zeros((1,1))
         H = np.array( ((1.0,),))
-        
+
         Pinf   = np.zeros((1,1))
-        P0 = np.array( ((variance,),) )      
-        
+        P0 = np.array( ((variance,),) )
+
         dF = np.zeros((1,1,1))
         dQc    = np.zeros((1,1,1))
-        
+
         dPinf = np.zeros((1,1,1))
         dP0 = np.zeros((1,1,1))
         dP0[:,:,0] = np.array( ((1.0,),) )
-        
+
         return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
\ No newline at end of file
diff --git a/GPy/kern/src/sde_stationary.py b/GPy/kern/src/sde_stationary.py
index 58bb7424..491135c1 100644
--- a/GPy/kern/src/sde_stationary.py
+++ b/GPy/kern/src/sde_stationary.py
@@ -29,7 +29,7 @@ class sde_RBF(RBF):
 
     .. math::
 
-        k(r) = \\sigma^2 \\exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \\ \\ \\ \\  \text{ where  } r = \\sqrt{\\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\\ell_i^2} }
+        k(r) = \\sigma^2 \\exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \\ \\ \\ \\  \\text{ where  } r = \\sqrt{\\sum_{i=1}^{input dim} \\frac{(x_i-y_i)^2}{\\ell_i^2} }
 
     """
 
@@ -102,7 +102,7 @@ class sde_RBF(RBF):
         eps = 1e-12
         if (float(Qc) > 1.0 / eps) or (float(Qc) < eps):
             warnings.warn(
-                """sde_RBF kernel: the noise variance Qc is either very large or very small. 
+                """sde_RBF kernel: the noise variance Qc is either very large or very small.
                                 It influece conditioning of P_inf: {0:e}""".format(
                     float(Qc)
                 )
@@ -204,7 +204,7 @@ class sde_Exponential(Exponential):
 
     .. math::
 
-       k(r) = \\sigma^2 \\exp \\bigg(- \\frac{1}{2} r \\bigg) \\ \\ \\ \\  \text{ where  } r = \\sqrt{\\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\\ell_i^2} }
+       k(r) = \\sigma^2 \\exp \\bigg(- \\frac{1}{2} r \\bigg) \\ \\ \\ \\  \\text{ where  } r = \\sqrt{\\sum_{i=1}^{input dim} \\frac{(x_i-y_i)^2}{\\ell_i^2} }
 
     """
 
@@ -259,7 +259,7 @@ class sde_RatQuad(RatQuad):
 
     .. math::
 
-       k(r) = \\sigma^2 \\bigg( 1 + \\frac{r^2}{2} \\bigg)^{- \alpha} \\ \\ \\ \\  \text{ where  } r = \\sqrt{\\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\\ell_i^2} }
+       k(r) = \\sigma^2 \\bigg( 1 + \\frac{r^2}{2} \\bigg)^{- \\alpha} \\ \\ \\ \\  \\text{ where  } r = \\sqrt{\\sum_{i=1}^{input dim} \\frac{(x_i-y_i)^2}{\\ell_i^2} }
 
     """
 
diff --git a/GPy/kern/src/standard_periodic.py b/GPy/kern/src/standard_periodic.py
index 4baf86ee..6168956a 100644
--- a/GPy/kern/src/standard_periodic.py
+++ b/GPy/kern/src/standard_periodic.py
@@ -24,12 +24,12 @@ class StdPeriodic(Kern):
 
     .. math::
 
-       k(x,y) = \theta_1 \\exp \\left[  - \frac{1}{2} \\sum_{i=1}^{input\\_dim}
-       \\left( \frac{\\sin(\frac{\\pi}{T_i} (x_i - y_i) )}{l_i} \right)^2 \right] }
+       k(x,y) = \\theta_1 \\exp \\left[  - \\frac{1}{2} \\sum_{i=1}^{input\\_dim}
+       \\left( \\frac{\\sin(\\frac{\\pi}{T_i} (x_i - y_i) )}{l_i} \\right)^2 \\right] }
 
     :param input_dim: the number of input dimensions
     :type input_dim: int
-    :param variance: the variance :math:`\theta_1` in the formula above
+    :param variance: the variance :math:`\\theta_1` in the formula above
     :type variance: float
     :param period: the vector of periods :math:`\\T_i`. If None then 1.0 is assumed.
     :type period: array or list of the appropriate size (or float if there is only one period parameter)
@@ -177,7 +177,7 @@ class StdPeriodic(Kern):
         Returns only diagonal elements.
         """
         return np.zeros(X.shape[0])
-    
+
     def dK2_dXdX2(self, X, X2, dimX, dimX2):
         """
         Compute the second derivative of K with respect to:
@@ -578,9 +578,9 @@ class StdPeriodic(Kern):
             X2 = X
         dX = -np.pi*((dL_dK*K)[:,:,None]*np.sin(2*np.pi/self.period*(X[:,None,:] - X2[None,:,:]))/(2.*np.square(self.lengthscale)*self.period)).sum(1)
         return dX
-    
+
     def gradients_X_diag(self, dL_dKdiag, X):
         return np.zeros(X.shape)
-    
+
     def input_sensitivity(self, summarize=True):
         return self.variance*np.ones(self.input_dim)/self.lengthscale**2
diff --git a/GPy/kern/src/stationary.py b/GPy/kern/src/stationary.py
index bb8c6bcb..507684b8 100644
--- a/GPy/kern/src/stationary.py
+++ b/GPy/kern/src/stationary.py
@@ -259,7 +259,7 @@ class Stationary(Kern):
         the returned array is of shape [NxNxQxQ].
 
         ..math:
-            \frac{\\partial^2 K}{\\partial X2 ^2} = - \frac{\\partial^2 K}{\\partial X\\partial X2}
+            \\frac{\\partial^2 K}{\\partial X2 ^2} = - \\frac{\\partial^2 K}{\\partial X\\partial X2}
 
         ..returns:
             dL2_dXdX2:  [NxMxQxQ] in the cov=True case, or [NxMxQ] in the cov=False case,
@@ -295,7 +295,7 @@ class Stationary(Kern):
         Given the derivative of the objective dL_dK, compute the second derivative of K wrt X:
 
         ..math:
-          \frac{\\partial^2 K}{\\partial X\\partial X}
+          \\frac{\\partial^2 K}{\\partial X\\partial X}
 
         ..returns:
             dL2_dXdX: [NxQxQ]
@@ -423,7 +423,7 @@ class OU(Stationary):
 
     .. math::
 
-       k(r) = \\sigma^2 \\exp(- r) \\ \\ \\ \\  \\text{ where  } r = \\sqrt{\\sum_{i=1}^{\text{input_dim}} \\frac{(x_i-y_i)^2}{\\ell_i^2} }
+       k(r) = \\sigma^2 \\exp(- r) \\ \\ \\ \\  \\text{ where  } r = \\sqrt{\\sum_{i=1}^{\\text{input_dim}} \\frac{(x_i-y_i)^2}{\\ell_i^2} }
 
     """
 
diff --git a/GPy/kern/src/todo/eq_ode1.py b/GPy/kern/src/todo/eq_ode1.py
index 7104a8e9..71604a20 100644
--- a/GPy/kern/src/todo/eq_ode1.py
+++ b/GPy/kern/src/todo/eq_ode1.py
@@ -14,23 +14,23 @@ class Eq_ode1(Kernpart):
 
     This outputs of this kernel have the form
     .. math::
-       \frac{\text{d}y_j}{\text{d}t} = \sum_{i=1}^R w_{j,i} f_i(t-\delta_j) +\sqrt{\kappa_j}g_j(t) - d_jy_j(t)
+       \\frac{\\text{d}y_j}{\\text{d}t} = \\sum_{i=1}^R w_{j,i} f_i(t-\\delta_j) +\\sqrt{\\kappa_j}g_j(t) - d_jy_j(t)
 
     where :math:`R` is the rank of the system, :math:`w_{j,i}` is the sensitivity of the :math:`j`th output to the :math:`i`th latent function, :math:`d_j` is the decay rate of the :math:`j`th output and :math:`f_i(t)` and :math:`g_i(t)` are independent latent Gaussian processes goverened by an exponentiated quadratic covariance.
-    
+
     :param output_dim: number of outputs driven by latent function.
     :type output_dim: int
-    :param W: sensitivities of each output to the latent driving function. 
+    :param W: sensitivities of each output to the latent driving function.
     :type W: ndarray (output_dim x rank).
     :param rank: If rank is greater than 1 then there are assumed to be a total of rank latent forces independently driving the system, each with identical covariance.
     :type rank: int
-    :param decay: decay rates for the first order system. 
+    :param decay: decay rates for the first order system.
     :type decay: array of length output_dim.
     :param delay: delay between latent force and output response.
     :type delay: array of length output_dim.
     :param kappa: diagonal term that allows each latent output to have an independent component to the response.
     :type kappa: array of length output_dim.
-    
+
     .. Note: see first order differential equation examples in GPy.examples.regression for some usage.
     """
     def __init__(self,output_dim, W=None, rank=1, kappa=None, lengthscale=1.0,  decay=None, delay=None):
@@ -62,7 +62,7 @@ class Eq_ode1(Kernpart):
         self.is_stationary = False
         self.gaussian_initial = False
         self._set_params(self._get_params())
-        
+
     def _get_params(self):
         param_list = [self.W.flatten()]
         if self.kappa is not None:
@@ -103,11 +103,11 @@ class Eq_ode1(Kernpart):
         param_names += ['decay_%i'%i for i in range(1,self.output_dim)]
         if self.delay is not None:
             param_names += ['delay_%i'%i for i in 1+range(1,self.output_dim)]
-        param_names+= ['lengthscale'] 
+        param_names+= ['lengthscale']
         return param_names
 
     def K(self,X,X2,target):
-        
+
         if X.shape[1] > 2:
             raise ValueError('Input matrix for ode1 covariance should have at most two columns, one containing times, the other output indices')
 
@@ -123,13 +123,13 @@ class Eq_ode1(Kernpart):
     def Kdiag(self,index,target):
         #target += np.diag(self.B)[np.asarray(index,dtype=int).flatten()]
         pass
-    
+
     def _param_grad_helper(self,dL_dK,X,X2,target):
-        
+
         # First extract times and indices.
         self._extract_t_indices(X, X2, dL_dK=dL_dK)
         self._dK_ode_dtheta(target)
-        
+
 
     def _dK_ode_dtheta(self, target):
         """Do all the computations for the ode parts of the covariance function."""
@@ -138,7 +138,7 @@ class Eq_ode1(Kernpart):
         index_ode = self._index[self._index>0]-1
         if self._t2 is None:
             if t_ode.size==0:
-                return        
+                return
             t2_ode = t_ode
             dL_dK_ode = dL_dK_ode[:, self._index>0]
             index2_ode = index_ode
@@ -210,7 +210,7 @@ class Eq_ode1(Kernpart):
         self._index = self._index[self._order]
         self._t = self._t[self._order]
         self._rorder = self._order.argsort() # rorder is for reversing the order
-        
+
         if X2 is None:
             self._t2 = None
             self._index2 = None
@@ -229,7 +229,7 @@ class Eq_ode1(Kernpart):
         if dL_dK is not None:
             self._dL_dK = dL_dK[self._order, :]
             self._dL_dK = self._dL_dK[:, self._order2]
-            
+
     def _K_computations(self, X, X2):
         """Perform main body of computations for the ode1 covariance function."""
         # First extract times and indices.
@@ -253,8 +253,8 @@ class Eq_ode1(Kernpart):
                                                    np.hstack((self._K_ode_eq, self._K_ode))))
         self._K_dvar = self._K_dvar[self._rorder, :]
         self._K_dvar = self._K_dvar[:, self._rorder2]
-        
-        
+
+
         if X2 is None:
             # Matrix giving scales of each output
             self._scale = np.zeros((self._t.size, self._t.size))
@@ -303,7 +303,7 @@ class Eq_ode1(Kernpart):
                 self._K_eq = np.zeros((t_eq.size, t2_eq.size))
                 return
             self._dist2 = np.square(t_eq[:, None] - t2_eq[None, :])
-        
+
         self._K_eq = np.exp(-self._dist2/(2*self.lengthscale*self.lengthscale))
         if self.is_normalized:
             self._K_eq/=(np.sqrt(2*np.pi)*self.lengthscale)
@@ -361,7 +361,7 @@ class Eq_ode1(Kernpart):
             self._K_eq_ode = sK.T
         else:
             self._K_ode_eq = sK
-        
+
     def _K_compute_ode(self):
         # Compute covariances between outputs of the ODE models.
 
@@ -370,7 +370,7 @@ class Eq_ode1(Kernpart):
         if self._t2 is None:
             if t_ode.size==0:
                 self._K_ode = np.zeros((0, 0))
-                return        
+                return
             t2_ode = t_ode
             index2_ode = index_ode
         else:
@@ -379,14 +379,14 @@ class Eq_ode1(Kernpart):
                 self._K_ode = np.zeros((t_ode.size, t2_ode.size))
                 return
             index2_ode = self._index2[self._index2>0]-1
-        
+
         # When index is identical
         h = self._compute_H(t_ode, index_ode, t2_ode, index2_ode, stationary=self.is_stationary)
 
         if self._t2 is None:
             self._K_ode = 0.5 * (h + h.T)
         else:
-            h2 = self._compute_H(t2_ode, index2_ode, t_ode, index_ode, stationary=self.is_stationary)                
+            h2 = self._compute_H(t2_ode, index2_ode, t_ode, index_ode, stationary=self.is_stationary)
             self._K_ode = 0.5 * (h + h2.T)
 
         if not self.is_normalized:
@@ -410,28 +410,28 @@ class Eq_ode1(Kernpart):
             Decay = self.decay[index]
             if self.delay is not None:
                 t = t - self.delay[index]
-            
+
             t_squared = t*t
             half_sigma_decay = 0.5*self.sigma*Decay
             [ln_part_1, sign1] = ln_diff_erfs(half_sigma_decay + t/self.sigma,
                                               half_sigma_decay)
-    
+
             [ln_part_2, sign2] = ln_diff_erfs(half_sigma_decay,
                                               half_sigma_decay - t/self.sigma)
-            
+
             h = (sign1*np.exp(half_sigma_decay*half_sigma_decay
                              + ln_part_1
-                             - log(Decay + D_j)) 
+                             - log(Decay + D_j))
                  - sign2*np.exp(half_sigma_decay*half_sigma_decay
                                 - (Decay + D_j)*t
-                                + ln_part_2 
+                                + ln_part_2
                                 - log(Decay + D_j)))
-    
+
             sigma2 = self.sigma*self.sigma
 
         if update_derivatives:
-        
-            dh_dD_i = ((0.5*Decay*sigma2*(Decay + D_j)-1)*h 
+
+            dh_dD_i = ((0.5*Decay*sigma2*(Decay + D_j)-1)*h
                        + t*sign2*np.exp(
                 half_sigma_decay*half_sigma_decay-(Decay+D_j)*t + ln_part_2
                 )
@@ -439,11 +439,11 @@ class Eq_ode1(Kernpart):
                        (-1 + np.exp(-t_squared/sigma2-Decay*t)
                         + np.exp(-t_squared/sigma2-D_j*t)
                         - np.exp(-(Decay + D_j)*t)))
-        
+
             dh_dD_i = (dh_dD_i/(Decay+D_j)).real
-        
-        
-        
+
+
+
             dh_dD_j = (t*sign2*np.exp(
                 half_sigma_decay*half_sigma_decay-(Decay + D_j)*t+ln_part_2
                 )
@@ -457,7 +457,7 @@ class Eq_ode1(Kernpart):
                           - (-t/sigma2-Decay/2)*np.exp(-t_squared/sigma2 - D_j*t) \
                           - Decay/2*np.exp(-(Decay+D_j)*t))"""
         pass
-    
+
     def _compute_H(self, t, index, t2, index2, update_derivatives=False, stationary=False):
         """Helper function for computing part of the ode1 covariance function.
 
@@ -491,7 +491,7 @@ class Eq_ode1(Kernpart):
         inv_sigma_diff_t = 1./self.sigma*diff_t
         half_sigma_decay_i = 0.5*self.sigma*Decay[:, None]
 
-        ln_part_1, sign1 = ln_diff_erfs(half_sigma_decay_i + t2_mat/self.sigma, 
+        ln_part_1, sign1 = ln_diff_erfs(half_sigma_decay_i + t2_mat/self.sigma,
                                         half_sigma_decay_i - inv_sigma_diff_t,
                                         return_sign=True)
         ln_part_2, sign2 = ln_diff_erfs(half_sigma_decay_i,
@@ -529,7 +529,7 @@ class Eq_ode1(Kernpart):
                 )
                 ))
             self._dh_ddecay = (dh_ddecay/(Decay[:, None]+Decay2[None, :])).real
-            
+
             # Update jth decay gradient
             dh_ddecay2 = (t2_mat*sign2
                          *np.exp(
@@ -539,7 +539,7 @@ class Eq_ode1(Kernpart):
                 )
                          -h)
             self._dh_ddecay2 = (dh_ddecay/(Decay[:, None] + Decay2[None, :])).real
-            
+
             # Update sigma gradient
             self._dh_dsigma = (half_sigma_decay_i*Decay[:, None]*h
                                + 2/(np.sqrt(np.pi)
@@ -547,10 +547,10 @@ class Eq_ode1(Kernpart):
                                *((-diff_t/sigma2-Decay[:, None]/2)
                                  *np.exp(-diff_t*diff_t/sigma2)
                                  + (-t2_mat/sigma2+Decay[:, None]/2)
-                                 *np.exp(-t2_mat*t2_mat/sigma2-Decay[:, None]*t_mat) 
-                                 - (-t_mat/sigma2-Decay[:, None]/2) 
-                                 *np.exp(-t_mat*t_mat/sigma2-Decay2[None, :]*t2_mat) 
+                                 *np.exp(-t2_mat*t2_mat/sigma2-Decay[:, None]*t_mat)
+                                 - (-t_mat/sigma2-Decay[:, None]/2)
+                                 *np.exp(-t_mat*t_mat/sigma2-Decay2[None, :]*t2_mat)
                                  - Decay[:, None]/2
                                  *np.exp(-(Decay[:, None]*t_mat+Decay2[None, :]*t2_mat))))
-                
+
         return h
diff --git a/GPy/likelihoods/bernoulli.py b/GPy/likelihoods/bernoulli.py
index 2e745b9b..b02b6994 100644
--- a/GPy/likelihoods/bernoulli.py
+++ b/GPy/likelihoods/bernoulli.py
@@ -243,7 +243,7 @@ class Bernoulli(Likelihood):
         assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
         #d3logpdf_dlink3 = 2*(y/(inv_link_f**3) - (1-y)/((1-inv_link_f)**3))
         state = np.seterr(divide='ignore')
-        # TODO check y \in {0, 1} or {-1, 1}
+        # TODO check y \\in {0, 1} or {-1, 1}
         d3logpdf_dlink3 = np.where(y==1, 2./(inv_link_f**3), -2./((1.-inv_link_f)**3))
         np.seterr(**state)
         return d3logpdf_dlink3
diff --git a/GPy/mappings/kernel.py b/GPy/mappings/kernel.py
index bde2c6a2..dea86f5a 100644
--- a/GPy/mappings/kernel.py
+++ b/GPy/mappings/kernel.py
@@ -12,13 +12,13 @@ class Kernel(Mapping):
 
     .. math::
 
-       f(\\mathbf{x}) = \\sum_i \alpha_i k(\\mathbf{z}_i, \\mathbf{x})
+       f(\\mathbf{x}) = \\sum_i \\alpha_i k(\\mathbf{z}_i, \\mathbf{x})
 
     or for multple outputs
 
     .. math::
 
-       f_i(\\mathbf{x}) = \\sum_j \alpha_{i,j} k(\\mathbf{z}_i, \\mathbf{x})
+       f_i(\\mathbf{x}) = \\sum_j \\alpha_{i,j} k(\\mathbf{z}_i, \\mathbf{x})
 
 
     :param input_dim: dimension of input.
diff --git a/GPy/models/state_space_main.py b/GPy/models/state_space_main.py
index 0280dafc..630c31b6 100644
--- a/GPy/models/state_space_main.py
+++ b/GPy/models/state_space_main.py
@@ -4002,10 +4002,10 @@ class ContDescrStateSpace(DescreteStateSpace):
         """
         Linear Time-Invariant Stochastic Differential Equation (LTI SDE):
 
-            dx(t) = F x(t) dt + L d \beta  ,where
+            dx(t) = F x(t) dt + L d \\beta  ,where
 
                 x(t): (vector) stochastic process
-                \beta: (vector) Brownian motion process
+                \\beta: (vector) Brownian motion process
                 F, L: (time invariant) matrices of corresponding dimensions
                 Qc: covariance of noise.
 
@@ -4022,7 +4022,7 @@ class ContDescrStateSpace(DescreteStateSpace):
         F,L: LTI SDE matrices of corresponding dimensions
 
         Qc: matrix (n,n)
-            Covarince between different dimensions of noise \beta.
+            Covarince between different dimensions of noise \\beta.
             n is the dimensionality of the noise.
 
         dt: double or iterable
diff --git a/GPy/models/tp_regression.py b/GPy/models/tp_regression.py
index 048f6316..46e223aa 100644
--- a/GPy/models/tp_regression.py
+++ b/GPy/models/tp_regression.py
@@ -185,9 +185,9 @@ class TPRegression(Model):
 
         .. math::
             p(f*|X*, X, Y) = \\int^{\\inf}_{\\inf} p(f*|f,X*)p(f|X,Y) df
-                        = MVN\\left(\nu + N,f*| K_{x*x}(K_{xx})^{-1}Y,
-                        \frac{\nu + \beta - 2}{\nu + N - 2}K_{x*x*} - K_{xx*}(K_{xx})^{-1}K_{xx*}\right)
-            \nu := \texttt{Degrees of freedom}
+                        = MVN\\left(\\nu + N,f*| K_{x*x}(K_{xx})^{-1}Y,
+                        \\frac{\\nu + \\beta - 2}{\\nu + N - 2}K_{x*x*} - K_{xx*}(K_{xx})^{-1}K_{xx*}\\right)
+            \\nu := \\texttt{Degrees of freedom}
         """
         mu, var = self.posterior._raw_predict(kern=self.kern if kern is None else kern, Xnew=Xnew,
                                               pred_var=self._predictive_variable, full_cov=full_cov)
diff --git a/GPy/testing/test_model.py b/GPy/testing/test_model.py
index aba6b2e8..72695a9f 100644
--- a/GPy/testing/test_model.py
+++ b/GPy/testing/test_model.py
@@ -269,8 +269,8 @@ class TestMisc:
         from GPy.core.parameterization.variational import NormalPosterior
 
         X_pred = NormalPosterior(X_pred_mu, X_pred_var)
-        # mu = \int f(x)q(x|mu,S) dx = \int 2x.q(x|mu,S) dx = 2.mu
-        # S = \int (f(x) - m)^2q(x|mu,S) dx = \int f(x)^2 q(x) dx - mu**2 = 4(mu^2 + S) - (2.mu)^2 = 4S
+        # mu = \\int f(x)q(x|mu,S) dx = \\int 2x.q(x|mu,S) dx = 2.mu
+        # S = \\int (f(x) - m)^2q(x|mu,S) dx = \\int f(x)^2 q(x) dx - mu**2 = 4(mu^2 + S) - (2.mu)^2 = 4S
         Y_mu_true = 2 * X_pred_mu
         Y_var_true = 4 * X_pred_var
         Y_mu_pred, Y_var_pred = m.predict_noiseless(X_pred)
diff --git a/GPy/testing/test_rv_transformation.py b/GPy/testing/test_rv_transformation.py
index 403d4c17..a6e4a666 100644
--- a/GPy/testing/test_rv_transformation.py
+++ b/GPy/testing/test_rv_transformation.py
@@ -47,7 +47,7 @@ class TestRVTransformation:
         # ax.hist(phi_s, normed=True, bins=100, alpha=0.25, label='Histogram')
         # ax.plot(phi, kde(phi), '--', linewidth=2, label='Kernel Density Estimation')
         # ax.plot(phi, pdf_phi, ':', linewidth=2, label='Transformed PDF')
-        # ax.set_xlabel(r'transformed $\theta$', fontsize=16)
+        # ax.set_xlabel(r'transformed $\\theta$', fontsize=16)
         # ax.set_ylabel('PDF', fontsize=16)
         # plt.legend(loc='best')
         # plt.show(block=True)