From 22d30d9d39c70f806fe5bcb815cce9c8eb0f8dca Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Mon, 10 Nov 2014 16:24:24 +0000 Subject: [PATCH 1/9] new ssrbf implementation --- GPy/kern/_src/psi_comp/ssrbf_psi_comp.py | 561 +++++++++++++++-------- 1 file changed, 380 insertions(+), 181 deletions(-) diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py index 6302a590..f6a24c86 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py @@ -7,193 +7,392 @@ The package for the psi statistics computation import numpy as np -def psicomputations(variance, lengthscale, Z, variational_posterior): - """ - Z - MxQ - mu - NxQ - S - NxQ - gamma - NxQ - """ - # here are the "statistics" for psi0, psi1 and psi2 - # Produced intermediate results: - # _psi1 NxM - mu = variational_posterior.mean - S = variational_posterior.variance - gamma = variational_posterior.binary_prob +try: + from scipy import weave + + def _psicomputations(variance, lengthscale, Z, variational_posterior): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi0, psi1 and psi2 + # Produced intermediate results: + # _psi1 NxM + mu = variational_posterior.mean + S = variational_posterior.variance + gamma = variational_posterior.binary_prob + + N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1] + l2 = np.square(lengthscale) + log_denom1 = np.log(S/l2+1) + log_denom2 = np.log(2*S/l2+1) + log_gamma = np.log(gamma) + log_gamma1 = np.log(1.-gamma) + variance = float(variance) + psi0 = np.empty(N) + psi0[:] = variance + psi1 = np.empty((N,M)) + psi2n = np.empty((N,M,M)) + + from ....util.misc import param_to_array + S = param_to_array(S) + mu = param_to_array(mu) + gamma = param_to_array(gamma) + Z = param_to_array(Z) + + support_code = """ + #include + """ + code = """ + for(int n=0; npsi1_exp2)?psi1_exp1+log1p(exp(psi1_exp2-psi1_exp1)):psi1_exp2+log1p(exp(psi1_exp1-psi1_exp2)); + } + // Compute Psi_2 + double muZhat = mu(n,q) - (Zm1q+Zm2q)/2.; + double Z2 = Zm1q*Zm1q+ Zm2q*Zm2q; + double dZ = Zm1q - Zm2q; + + double psi2_exp1 = dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2(n,q)/2. 
+ log_gamma(n,q); + double psi2_exp2 = log_gamma1(n,q) - Z2/(2.*lq); + log_psi2_n += (psi2_exp1>psi2_exp2)?psi2_exp1+log1p(exp(psi2_exp2-psi2_exp1)):psi2_exp2+log1p(exp(psi2_exp1-psi2_exp2)); + } + double exp_psi2_n = exp(log_psi2_n); + psi2n(n,m1,m2) = variance*variance*exp_psi2_n; + if(m1!=m2) { psi2n(n,m2,m1) = variance*variance*exp_psi2_n;} + } + psi1(n,m1) = variance*exp(log_psi1); + } + } + """ + weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz) + + psi2 = psi2n.sum(axis=0) + return psi0,psi1,psi2,psi2n + + from GPy.util.caching import Cacher + psicomputations = Cacher(_psicomputations, limit=1) + + def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): + ARD = (len(lengthscale)!=1) + + _,psi1,_,psi2n = psicomputations(variance, lengthscale, Z, variational_posterior) + + mu = variational_posterior.mean + S = variational_posterior.variance + gamma = variational_posterior.binary_prob + N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1] + l2 = np.square(lengthscale) + log_denom1 = np.log(S/l2+1) + log_denom2 = np.log(2*S/l2+1) + log_gamma = np.log(gamma) + log_gamma1 = np.log(1.-gamma) + variance = float(variance) + + dvar = np.zeros(1) + dmu = np.zeros((N,Q)) + dS = np.zeros((N,Q)) + dgamma = np.zeros((N,Q)) + dl = np.zeros(Q) + dZ = np.zeros((M,Q)) + dvar += np.sum(dL_dpsi0) + + from ....util.misc import param_to_array + S = param_to_array(S) + mu = param_to_array(mu) + gamma = param_to_array(gamma) + Z = param_to_array(Z) + + support_code = """ + #include + """ + code = """ + for(int n=0; nexp2) { + d_exp1 = 1.; + d_exp2 = exp(exp2-exp1); + } else { + d_exp1 = exp(exp1-exp2); + d_exp2 = 1.; + } + double exp_sum = d_exp1+d_exp2; + + dmu(n,q) += lpsi1*Zmu*d_exp1/(denom*exp_sum); + dS(n,q) += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum)/2.; + dgamma(n,q) += lpsi1*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum; + dl(q) += lpsi1*((Zmu2_denom+Snq/lq)/denom*d_exp1+Zm1q*Zm1q/(lq*lq)*d_exp2)/(2.*exp_sum); + dZ(m1,q) += lpsi1*(-Zmu/denom*d_exp1-Zm1q/lq*d_exp2)/exp_sum; + } + // Compute Psi_2 + double lpsi2 = psi2n(n,m1,m2)*dL_dpsi2(m1,m2); + if(q==0) {dvar(0) += lpsi2*2/variance;} + + double dZm1m2 = Zm1q - Zm2q; + double Z2 = Zm1q*Zm1q+Zm2q*Zm2q; + double muZhat = mu_nq - (Zm1q + Zm2q)/2.; + double denom = 2.*Snq+lq; + double muZhat2_denom = muZhat*muZhat/denom; + + double exp1 = dZm1m2*dZm1m2/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2(n,q)/2. 
+ log_gamma(n,q); + double exp2 = log_gamma1(n,q) - Z2/(2.*lq); + double d_exp1,d_exp2; + if(exp1>exp2) { + d_exp1 = 1.; + d_exp2 = exp(exp2-exp1); + } else { + d_exp1 = exp(exp1-exp2); + d_exp2 = 1.; + } + double exp_sum = d_exp1+d_exp2; + + dmu(n,q) += -2.*lpsi2*muZhat/denom*d_exp1/exp_sum; + dS(n,q) += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum; + dgamma(n,q) += lpsi2*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum; + dl(q) += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZm1m2*dZm1m2/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum; + dZ(m1,q) += 2.*lpsi2*((muZhat/denom-dZm1m2/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum; + } + } + } + } + """ + weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz) + + dl *= 2.*lengthscale + if not ARD: + dl = dl.sum() + + return dvar, dl, dZ, dmu, dS, dgamma + +except: + + def psicomputations(variance, lengthscale, Z, variational_posterior): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi0, psi1 and psi2 + # Produced intermediate results: + # _psi1 NxM + mu = variational_posterior.mean + S = variational_posterior.variance + gamma = variational_posterior.binary_prob + + psi0 = np.empty(mu.shape[0]) + psi0[:] = variance + psi1 = _psi1computations(variance, lengthscale, Z, mu, S, gamma) + psi2 = _psi2computations(variance, lengthscale, Z, mu, S, gamma) + return psi0, psi1, psi2 - psi0 = np.empty(mu.shape[0]) - psi0[:] = variance - psi1 = _psi1computations(variance, lengthscale, Z, mu, S, gamma) - psi2 = _psi2computations(variance, lengthscale, Z, mu, S, gamma) - return psi0, psi1, psi2 - -def _psi1computations(variance, lengthscale, Z, mu, S, gamma): - """ - Z - MxQ - mu - NxQ - S - NxQ - gamma - NxQ - """ - # here are the "statistics" for psi1 - # Produced intermediate results: - # _psi1 NxM - - lengthscale2 = np.square(lengthscale) - - # psi1 - _psi1_denom = S[:, None, :] / lengthscale2 + 1. # Nx1xQ - _psi1_denom_sqrt = np.sqrt(_psi1_denom) #Nx1xQ - _psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ - _psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom) # NxMxQ - _psi1_common = gamma[:,None,:] / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #Nx1xQ - _psi1_exponent1 = np.log(gamma[:,None,:]) - (_psi1_dist_sq + np.log(_psi1_denom))/2. # NxMxQ - _psi1_exponent2 = np.log(1.-gamma[:,None,:]) - (np.square(Z[None,:,:])/lengthscale2)/2. 
# NxMxQ - _psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2) - _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ - _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM - _psi1 = variance * np.exp(_psi1_exp_sum) # NxM - - return _psi1 - -def _psi2computations(variance, lengthscale, Z, mu, S, gamma): - """ - Z - MxQ - mu - NxQ - S - NxQ - gamma - NxQ - """ - # here are the "statistics" for psi2 - # Produced intermediate results: - # _psi2 MxM + def _psi1computations(variance, lengthscale, Z, mu, S, gamma): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi1 + # Produced intermediate results: + # _psi1 NxM - lengthscale2 = np.square(lengthscale) + lengthscale2 = np.square(lengthscale) - _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q - _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q - _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q - _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ - - # psi2 - _psi2_denom = 2.*S[:, None, None, :] / lengthscale2 + 1. # Nx1x1xQ - _psi2_denom_sqrt = np.sqrt(_psi2_denom) - _psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q - _psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom) - _psi2_common = gamma[:,None,None,:]/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # Nx1x1xQ - _psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom)+np.log(gamma[:,None,None,:]) #N,M,M,Q - _psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ - _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2) - _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max)) - _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM - _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM - - return _psi2 - -def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): - ARD = (len(lengthscale)!=1) + # psi1 + _psi1_denom = S[:, None, :] / lengthscale2 + 1. # Nx1xQ + _psi1_denom_sqrt = np.sqrt(_psi1_denom) #Nx1xQ + _psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ + _psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom) # NxMxQ + _psi1_common = gamma[:,None,:] / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #Nx1xQ + _psi1_exponent1 = np.log(gamma[:,None,:]) - (_psi1_dist_sq + np.log(_psi1_denom))/2. # NxMxQ + _psi1_exponent2 = np.log(1.-gamma[:,None,:]) - (np.square(Z[None,:,:])/lengthscale2)/2. 
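+        # The two exponents implement the spike-and-slab mixture per
+        # dimension: _psi1_exponent1 is the "slab" branch (weight gamma),
+        # i.e. the usual RBF expectation with the input smoothed by S;
+        # _psi1_exponent2 is the "spike" branch (weight 1-gamma), where the
+        # latent coordinate collapses to 0 and the factor reduces to
+        # exp(-Z_q^2 / (2 l_q^2)). The branches are combined below with a
+        # max-shifted log-sum-exp, then summed over q and exponentiated,
+        # forming the product over dimensions without underflow.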
# NxMxQ + _psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2) + _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ + _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM + _psi1 = variance * np.exp(_psi1_exp_sum) # NxM - dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1, dgamma_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2, dgamma_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - - dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2 + return _psi1 - dL_dlengscale = dl_psi1 + dl_psi2 - if not ARD: - dL_dlengscale = dL_dlengscale.sum() - - dL_dgamma = dgamma_psi1 + dgamma_psi2 - dL_dmu = dmu_psi1 + dmu_psi2 - dL_dS = dS_psi1 + dS_psi2 - dL_dZ = dZ_psi1 + dZ_psi2 + def _psi2computations(variance, lengthscale, Z, mu, S, gamma): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi2 + # Produced intermediate results: + # _psi2 MxM + + lengthscale2 = np.square(lengthscale) + + _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q + _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q + _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q + _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ - return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma - -def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, gamma): - """ - dL_dpsi1 - NxM - Z - MxQ - mu - NxQ - S - NxQ - gamma - NxQ - """ - # here are the "statistics" for psi1 - # Produced intermediate results: dL_dparams w.r.t. psi1 - # _dL_dvariance 1 - # _dL_dlengthscale Q - # _dL_dZ MxQ - # _dL_dgamma NxQ - # _dL_dmu NxQ - # _dL_dS NxQ + # psi2 + _psi2_denom = 2.*S[:, None, None, :] / lengthscale2 + 1. # Nx1x1xQ + _psi2_denom_sqrt = np.sqrt(_psi2_denom) + _psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q + _psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom) + _psi2_common = gamma[:,None,None,:]/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # Nx1x1xQ + _psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom)+np.log(gamma[:,None,None,:]) #N,M,M,Q + _psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ + _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2) + _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max)) + _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM + _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM - lengthscale2 = np.square(lengthscale) - - # psi1 - _psi1_denom = S / lengthscale2 + 1. 
# NxQ - _psi1_denom_sqrt = np.sqrt(_psi1_denom) #NxQ - _psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ - _psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom[:,None,:]) # NxMxQ - _psi1_common = gamma / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #NxQ - _psi1_exponent1 = np.log(gamma[:,None,:]) -0.5 * (_psi1_dist_sq + np.log(_psi1_denom[:, None,:])) # NxMxQ - _psi1_exponent2 = np.log(1.-gamma[:,None,:]) -0.5 * (np.square(Z[None,:,:])/lengthscale2) # NxMxQ - _psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2) - _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ - _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM - _psi1_exp_dist_sq = np.exp(-0.5*_psi1_dist_sq) # NxMxQ - _psi1_exp_Z = np.exp(-0.5*np.square(Z[None,:,:])/lengthscale2) # 1xMxQ - _psi1_q = variance * np.exp(_psi1_exp_sum[:,:,None] - _psi1_exponent) # NxMxQ - _psi1 = variance * np.exp(_psi1_exp_sum) # NxM - _dL_dvariance = np.einsum('nm,nm->',dL_dpsi1, _psi1)/variance # 1 - _dL_dgamma = np.einsum('nm,nmq,nmq->nq',dL_dpsi1, _psi1_q, (_psi1_exp_dist_sq/_psi1_denom_sqrt[:,None,:]-_psi1_exp_Z)) # NxQ - _dL_dmu = np.einsum('nm, nmq, nmq, nmq, nq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_dist,_psi1_common) # NxQ - _dL_dS = np.einsum('nm,nmq,nmq,nq,nmq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_common,(_psi1_dist_sq-1.))/2. # NxQ - _dL_dZ = np.einsum('nm,nmq,nmq->mq',dL_dpsi1,_psi1_q, (- _psi1_common[:,None,:] * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z)) - _dL_dlengthscale = lengthscale* np.einsum('nm,nmq,nmq->q',dL_dpsi1,_psi1_q,(_psi1_common[:,None,:]*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + (1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z)) - - return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma - -def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, gamma): - """ - Z - MxQ - mu - NxQ - S - NxQ - gamma - NxQ - dL_dpsi2 - MxM - """ - # here are the "statistics" for psi2 - # Produced the derivatives w.r.t. psi2: - # _dL_dvariance 1 - # _dL_dlengthscale Q - # _dL_dZ MxQ - # _dL_dgamma NxQ - # _dL_dmu NxQ - # _dL_dS NxQ + return _psi2 - lengthscale2 = np.square(lengthscale) + def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): + ARD = (len(lengthscale)!=1) + + dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1, dgamma_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) + dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2, dgamma_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) + + dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2 + + dL_dlengscale = dl_psi1 + dl_psi2 + if not ARD: + dL_dlengscale = dL_dlengscale.sum() + + dL_dgamma = dgamma_psi1 + dgamma_psi2 + dL_dmu = dmu_psi1 + dmu_psi2 + dL_dS = dS_psi1 + dS_psi2 + dL_dZ = dZ_psi1 + dZ_psi2 + + return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma - _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q - _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q - _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q - _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ - - # psi2 - _psi2_denom = 2.*S / lengthscale2 + 1. 
# NxQ - _psi2_denom_sqrt = np.sqrt(_psi2_denom) - _psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q - _psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom[:,None,None,:]) - _psi2_common = gamma/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # NxQ - _psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom[:,None,None,:])+np.log(gamma[:,None,None,:]) #N,M,M,Q - _psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ - _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2) - _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max)) - _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM - _psi2_q = variance*variance * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ - _psi2_exp_dist_sq = np.exp(-_psi2_Zdist_sq -_psi2_mudist_sq) # NxMxMxQ - _psi2_exp_Z = np.exp(-0.5*_psi2_Z_sq_sum) # MxMxQ - _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM - _dL_dvariance = np.einsum('mo,mo->',dL_dpsi2,_psi2)*2./variance - _dL_dgamma = np.einsum('mo,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,(_psi2_exp_dist_sq/_psi2_denom_sqrt[:,None,None,:] - _psi2_exp_Z)) - _dL_dmu = -2.*np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,_psi2_common,_psi2_mudist,_psi2_exp_dist_sq) - _dL_dS = np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q, _psi2_common, (2.*_psi2_mudist_sq-1.), _psi2_exp_dist_sq) - _dL_dZ = 2.*np.einsum('mo,nmoq,nmoq->mq',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(-_psi2_Zdist*_psi2_denom[:,None,None,:]+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z)) - _dL_dlengthscale = 2.*lengthscale* np.einsum('mo,nmoq,nmoq->q',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z)) - - return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma + def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, gamma): + """ + dL_dpsi1 - NxM + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi1 + # Produced intermediate results: dL_dparams w.r.t. psi1 + # _dL_dvariance 1 + # _dL_dlengthscale Q + # _dL_dZ MxQ + # _dL_dgamma NxQ + # _dL_dmu NxQ + # _dL_dS NxQ + + lengthscale2 = np.square(lengthscale) + + # psi1 + _psi1_denom = S / lengthscale2 + 1. 
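+        # The derivative code first reconstructs the psi1 factors and then
+        # builds _psi1_q[n,m,q] = psi1[n,m] with the q-th factor divided out
+        # (a leave-one-out product), so each per-dimension gradient is
+        # obtained by multiplying the derivative of that single factor back
+        # in inside the einsum contractions.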
# NxQ + _psi1_denom_sqrt = np.sqrt(_psi1_denom) #NxQ + _psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ + _psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom[:,None,:]) # NxMxQ + _psi1_common = gamma / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #NxQ + _psi1_exponent1 = np.log(gamma[:,None,:]) -0.5 * (_psi1_dist_sq + np.log(_psi1_denom[:, None,:])) # NxMxQ + _psi1_exponent2 = np.log(1.-gamma[:,None,:]) -0.5 * (np.square(Z[None,:,:])/lengthscale2) # NxMxQ + _psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2) + _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ + _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM + _psi1_exp_dist_sq = np.exp(-0.5*_psi1_dist_sq) # NxMxQ + _psi1_exp_Z = np.exp(-0.5*np.square(Z[None,:,:])/lengthscale2) # 1xMxQ + _psi1_q = variance * np.exp(_psi1_exp_sum[:,:,None] - _psi1_exponent) # NxMxQ + _psi1 = variance * np.exp(_psi1_exp_sum) # NxM + _dL_dvariance = np.einsum('nm,nm->',dL_dpsi1, _psi1)/variance # 1 + _dL_dgamma = np.einsum('nm,nmq,nmq->nq',dL_dpsi1, _psi1_q, (_psi1_exp_dist_sq/_psi1_denom_sqrt[:,None,:]-_psi1_exp_Z)) # NxQ + _dL_dmu = np.einsum('nm, nmq, nmq, nmq, nq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_dist,_psi1_common) # NxQ + _dL_dS = np.einsum('nm,nmq,nmq,nq,nmq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_common,(_psi1_dist_sq-1.))/2. # NxQ + _dL_dZ = np.einsum('nm,nmq,nmq->mq',dL_dpsi1,_psi1_q, (- _psi1_common[:,None,:] * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z)) + _dL_dlengthscale = lengthscale* np.einsum('nm,nmq,nmq->q',dL_dpsi1,_psi1_q,(_psi1_common[:,None,:]*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + (1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z)) + + return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma + + def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, gamma): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + dL_dpsi2 - MxM + """ + # here are the "statistics" for psi2 + # Produced the derivatives w.r.t. psi2: + # _dL_dvariance 1 + # _dL_dlengthscale Q + # _dL_dZ MxQ + # _dL_dgamma NxQ + # _dL_dmu NxQ + # _dL_dS NxQ + + lengthscale2 = np.square(lengthscale) + + _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q + _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q + _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q + _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ + + # psi2 + _psi2_denom = 2.*S / lengthscale2 + 1. 
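+        # Same leave-one-out construction as in _psi1compDer, one rank
+        # higher: _psi2_q[n,m,o,q] is psi2 with the q-th factor divided out,
+        # and the slab (_psi2_exp_dist_sq) and spike (_psi2_exp_Z) branch
+        # derivatives are added back per dimension in the einsums.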
# NxQ + _psi2_denom_sqrt = np.sqrt(_psi2_denom) + _psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q + _psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom[:,None,None,:]) + _psi2_common = gamma/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # NxQ + _psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom[:,None,None,:])+np.log(gamma[:,None,None,:]) #N,M,M,Q + _psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ + _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2) + _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max)) + _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM + _psi2_q = variance*variance * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ + _psi2_exp_dist_sq = np.exp(-_psi2_Zdist_sq -_psi2_mudist_sq) # NxMxMxQ + _psi2_exp_Z = np.exp(-0.5*_psi2_Z_sq_sum) # MxMxQ + _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM + _dL_dvariance = np.einsum('mo,mo->',dL_dpsi2,_psi2)*2./variance + _dL_dgamma = np.einsum('mo,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,(_psi2_exp_dist_sq/_psi2_denom_sqrt[:,None,None,:] - _psi2_exp_Z)) + _dL_dmu = -2.*np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,_psi2_common,_psi2_mudist,_psi2_exp_dist_sq) + _dL_dS = np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q, _psi2_common, (2.*_psi2_mudist_sq-1.), _psi2_exp_dist_sq) + _dL_dZ = 2.*np.einsum('mo,nmoq,nmoq->mq',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(-_psi2_Zdist*_psi2_denom[:,None,None,:]+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z)) + _dL_dlengthscale = 2.*lengthscale* np.einsum('mo,nmoq,nmoq->q',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z)) + + return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma From ca0523c6ee50f957eb237739511eba8265c43e80 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Tue, 11 Nov 2014 09:56:15 +0000 Subject: [PATCH 2/9] [setup] tweaks for submission --- setup.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/setup.py b/setup.py index e3593b3c..c4963bcc 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,7 @@ setup(name = 'GPy', url = "http://sheffieldml.github.com/GPy/", packages = ["GPy.models", "GPy.inference.optimization", + "GPy.inference.mcmc", "GPy.inference", "GPy.inference.latent_function_inference", "GPy.likelihoods", "GPy.mappings", @@ -31,14 +32,20 @@ setup(name = 'GPy', "GPy.plotting.matplot_dep.latent_space_visualizations", "GPy.plotting.matplot_dep", "GPy.plotting"], package_dir={'GPy': 'GPy'}, - package_data = {'GPy': ['defaults.cfg', 'installation.cfg', 'util/data_resources.json', 'util/football_teams.json']}, + package_data = {'GPy': ['defaults.cfg', 'installation.cfg', + 'util/data_resources.json', + 'util/football_teams.json']}, + include_package_data = True, py_modules = ['GPy.__init__'], + test_suite = 'GPy.testing', long_description=read('README.md'), - install_requires=['numpy>=1.8', 'scipy>=0.14'], - extras_require = { - 'docs':['matplotlib>=1.4','Sphinx','ipython'], - }, - classifiers=[ - "License :: OSI Approved :: BSD License"], - zip_safe = False + install_requires=['numpy>=1.7', 'scipy>=0.12'], + extras_require = {'docs':['matplotlib >=1.3','Sphinx','IPython']}, + 
classifiers=['License :: OSI Approved :: BSD License', + 'Natural Language :: English', + 'Operating System :: MacOS :: MacOS X', + 'Operating System :: Microsoft :: Windows', + 'Operating System :: POSIX :: Linux', + 'Programming Language :: Python :: 2.7', + 'Topic :: Scientific/Engineering :: Artificial Intelligence'] ) From 1e6ed9f873f7518dee4490227f17dbe8e16a1bfd Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Tue, 11 Nov 2014 09:56:36 +0000 Subject: [PATCH 3/9] [linalg] ppca with missing data removed --- GPy/util/linalg.py | 99 ---------------------------------------------- 1 file changed, 99 deletions(-) diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py index cef699cf..dffd438a 100644 --- a/GPy/util/linalg.py +++ b/GPy/util/linalg.py @@ -328,105 +328,6 @@ def ppca(Y, Q, iterations=100): pass return np.asarray_chkfinite(exp_x), np.asarray_chkfinite(W) -def ppca_missing_data_at_random(Y, Q, iters=100): - """ - EM implementation of Probabilistic pca for when there is missing data. - - Taken from - - .. math: - \\mathbf{Y} = \mathbf{XW} + \\epsilon \\text{, where} - \\epsilon = \\mathcal{N}(0, \\sigma^2 \mathbf{I}) - - :returns: X, W, sigma^2 - """ - from numpy.ma import dot as madot - import diag - from GPy.util.subarray_and_sorting import common_subarrays - import time - debug = 1 - # Initialise W randomly - N, D = Y.shape - W = np.random.randn(Q, D) * 1e-3 - Y = np.ma.masked_invalid(Y, copy=1) - nu = 1. - #num_obs_i = 1./Y.count() - Ycentered = Y - Y.mean(0) - - X = np.zeros((N,Q)) - cs = common_subarrays(Y.mask) - cr = common_subarrays(Y.mask, 1) - Sigma = np.zeros((N, Q, Q)) - Sigma2 = np.zeros((N, Q, Q)) - mu = np.zeros(D) - """ - if debug: - import matplotlib.pyplot as pylab - fig = pylab.figure("FIT MISSING DATA"); - ax = fig.gca() - ax.cla() - lines = pylab.plot(np.zeros((N,Q)).dot(W)) - """ - W2 = np.zeros((Q,D)) - - for i in range(iters): -# Sigma = np.linalg.solve(diag.add(madot(W,W.T), nu), diag.times(np.eye(Q),nu)) -# exp_x = madot(madot(Ycentered, W.T),Sigma)/nu -# Ycentered = (Y - exp_x.dot(W).mean(0)) -# #import ipdb;ipdb.set_trace() -# #Ycentered = mu -# W = np.linalg.solve(madot(exp_x.T,exp_x) + Sigma, madot(exp_x.T, Ycentered)) -# nu = (((Ycentered - madot(exp_x, W))**2).sum(0) + madot(W.T,madot(Sigma,W)).sum(0)).sum()/N - for csi, (mask, index) in enumerate(cs.iteritems()): - mask = ~np.array(mask) - Sigma2[index, :, :] = nu * np.linalg.inv(diag.add(W2[:,mask].dot(W2[:,mask].T), nu)) - #X[index,:] = madot((Sigma[csi]/nu),madot(W,Ycentered[index].T))[:,0] - X2 = ((Sigma2/nu) * (madot(Ycentered,W2.T).base)[:,:,None]).sum(-1) - mu2 = (Y - X.dot(W)).mean(0) - for n in range(N): - Sigma[n] = nu * np.linalg.inv(diag.add(W[:,~Y.mask[n]].dot(W[:,~Y.mask[n]].T), nu)) - X[n, :] = (Sigma[n]/nu).dot(W[:,~Y.mask[n]].dot(Ycentered[n,~Y.mask[n]].T)) - for d in range(D): - mu[d] = (Y[~Y.mask[:,d], d] - X[~Y.mask[:,d]].dot(W[:, d])).mean() - Ycentered = (Y - mu) - nu3 = 0. - for cri, (mask, index) in enumerate(cr.iteritems()): - mask = ~np.array(mask) - W2[:,index] = np.linalg.solve(X[mask].T.dot(X[mask]) + Sigma[mask].sum(0), madot(X[mask].T, Ycentered[mask,index]))[:,None] - W2[:,index] = np.linalg.solve(X.T.dot(X) + Sigma.sum(0), madot(X.T, Ycentered[:,index])) - #nu += (((Ycentered[mask,index] - X[mask].dot(W[:,index]))**2).sum(0) + W[:,index].T.dot(Sigma[mask].sum(0).dot(W[:,index])).sum(0)).sum() - nu3 += (((Ycentered[index] - X.dot(W[:,index]))**2).sum(0) + W[:,index].T.dot(Sigma.sum(0).dot(W[:,index])).sum(0)).sum() - nu3 /= N - nu = 0. - nu2 = 0. 
- W = np.zeros((Q,D)) - for j in range(D): - W[:,j] = np.linalg.solve(X[~Y.mask[:,j]].T.dot(X[~Y.mask[:,j]]) + Sigma[~Y.mask[:,j]].sum(0), madot(X[~Y.mask[:,j]].T, Ycentered[~Y.mask[:,j],j])) - nu2f = np.tensordot(W[:,j].T, Sigma[~Y.mask[:,j],:,:], [0,1]).dot(W[:,j]) - nu2s = W[:,j].T.dot(Sigma[~Y.mask[:,j],:,:].sum(0).dot(W[:,j])) - nu2 += (((Ycentered[~Y.mask[:,j],j] - X[~Y.mask[:,j],:].dot(W[:,j]))**2) + nu2f).sum() - for i in range(N): - if not Y.mask[i,j]: - nu += ((Ycentered[i,j] - X[i,:].dot(W[:,j]))**2) + W[:,j].T.dot(Sigma[i,:,:].dot(W[:,j])) - nu /= N - nu2 /= N - nu4 = (((Ycentered - X.dot(W))**2).sum(0) + W.T.dot(Sigma.sum(0).dot(W)).sum(0)).sum()/N - import ipdb;ipdb.set_trace() - """ - if debug: - #print Sigma[0] - print "nu:", nu, "sum(X):", X.sum() - pred_y = X.dot(W) - for x, l in zip(pred_y.T, lines): - l.set_ydata(x) - ax.autoscale_view() - ax.set_ylim(pred_y.min(), pred_y.max()) - fig.canvas.draw() - time.sleep(.3) - """ - return np.asarray_chkfinite(X), np.asarray_chkfinite(W), nu - - def tdot_numpy(mat, out=None): return np.dot(mat, mat.T, out) From 07a5e290dea76cafcd063c4f49affe4930cc34af Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Tue, 11 Nov 2014 10:20:34 +0000 Subject: [PATCH 4/9] [updates] starting to extract out standalone modules --- GPy/core/gp.py | 8 +- GPy/core/parameterization/observable.py | 69 ++++++++++ GPy/core/parameterization/observable_array.py | 5 +- GPy/core/parameterization/parameter_core.py | 124 ++---------------- GPy/core/parameterization/updateable.py | 63 +++++++++ GPy/util/caching.py | 4 +- 6 files changed, 149 insertions(+), 124 deletions(-) create mode 100644 GPy/core/parameterization/observable.py create mode 100644 GPy/core/parameterization/updateable.py diff --git a/GPy/core/gp.py b/GPy/core/gp.py index 7835f8b9..0b59c0ea 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -96,7 +96,7 @@ class GP(Model): def set_XY(self, X=None, Y=None): """ Set the input / output of the model - + :param X: input observations :param Y: output observations """ @@ -384,16 +384,16 @@ class GP(Model): print "KeyboardInterrupt caught, calling on_optimization_end() to round things up" self.inference_method.on_optimization_end() raise - + def infer_newX(self, Y_new, optimize=True, ): """ Infer the distribution of X for the new observed data *Y_new*. - + :param Y_new: the new observed data for inference :type Y_new: numpy.ndarray :param optimize: whether to optimize the location of new X (True by default) :type optimize: boolean - :return: a tuple containing the posterior estimation of X and the model that optimize X + :return: a tuple containing the posterior estimation of X and the model that optimize X :rtype: (GPy.core.parameterization.variational.VariationalPosterior or numpy.ndarray, GPy.core.Model) """ from ..inference.latent_function_inference.inferenceX import infer_newX diff --git a/GPy/core/parameterization/observable.py b/GPy/core/parameterization/observable.py new file mode 100644 index 00000000..ad55b44c --- /dev/null +++ b/GPy/core/parameterization/observable.py @@ -0,0 +1,69 @@ +''' +Created on 30 Oct 2014 + +@author: maxz +''' + + +class Observable(object): + """ + Observable pattern for parameterization. + + This Object allows for observers to register with self and a (bound!) function + as an observer. Every time the observable changes, it sends a notification with + self as only argument to all its observers. 
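+
+    A minimal usage sketch (``listener.on_change`` stands for any bound
+    method accepting ``(changed, which)``; both names are placeholders)::
+
+        m.add_observer(listener, listener.on_change, priority=0)
+        m.notify_observers()  # invokes listener.on_change(m, which=m)
+        m.remove_observer(listener)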
+ """ + def __init__(self, *args, **kwargs): + super(Observable, self).__init__() + from lists_and_dicts import ObserverList + self.observers = ObserverList() + + def add_observer(self, observer, callble, priority=0): + """ + Add an observer `observer` with the callback `callble` + and priority `priority` to this observers list. + """ + self.observers.add(priority, observer, callble) + + def remove_observer(self, observer, callble=None): + """ + Either (if callble is None) remove all callables, + which were added alongside observer, + or remove callable `callble` which was added alongside + the observer `observer`. + """ + to_remove = [] + for poc in self.observers: + _, obs, clble = poc + if callble is not None: + if (obs is observer) and (callble == clble): + to_remove.append(poc) + else: + if obs is observer: + to_remove.append(poc) + for r in to_remove: + self.observers.remove(*r) + + def notify_observers(self, which=None, min_priority=None): + """ + Notifies all observers. Which is the element, which kicked off this + notification loop. The first argument will be self, the second `which`. + + NOTE: notifies only observers with priority p > min_priority! + ^^^^^^^^^^^^^^^^ + :param min_priority: only notify observers with priority > min_priority + if min_priority is None, notify all observers in order + """ + if which is None: + which = self + if min_priority is None: + [callble(self, which=which) for _, _, callble in self.observers] + else: + for p, _, callble in self.observers: + if p <= min_priority: + break + callble(self, which=which) + + def change_priority(self, observer, callble, priority): + self.remove_observer(observer, callble) + self.add_observer(observer, callble, priority) \ No newline at end of file diff --git a/GPy/core/parameterization/observable_array.py b/GPy/core/parameterization/observable_array.py index 1253caae..4a7bdf85 100644 --- a/GPy/core/parameterization/observable_array.py +++ b/GPy/core/parameterization/observable_array.py @@ -1,10 +1,11 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) -__updated__ = '2014-10-29' +__updated__ = '2014-11-11' import numpy as np -from parameter_core import Observable, Pickleable +from parameter_core import Pickleable +from observable import Observable class ObsAr(np.ndarray, Pickleable, Observable): """ diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index f8f7f7cc..4048fbae 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -17,6 +17,7 @@ from transformations import Transformation,Logexp, NegativeLogexp, Logistic, __f import numpy as np import re import logging +from updateable import Updateable class HierarchyError(Exception): """ @@ -40,115 +41,6 @@ def adjust_name_for_printing(name): return '' -class Observable(object): - """ - Observable pattern for parameterization. - - This Object allows for observers to register with self and a (bound!) function - as an observer. Every time the observable changes, it sends a notification with - self as only argument to all its observers. 
- """ - _updates = True - def __init__(self, *args, **kwargs): - super(Observable, self).__init__() - from lists_and_dicts import ObserverList - self.observers = ObserverList() - - @property - def updates(self): - raise DeprecationWarning("updates is now a function, see update(True|False|None)") - - @updates.setter - def updates(self, ups): - raise DeprecationWarning("updates is now a function, see update(True|False|None)") - - def update_model(self, updates=None): - """ - Get or set, whether automatic updates are performed. When updates are - off, the model might be in a non-working state. To make the model work - turn updates on again. - - :param bool|None updates: - - bool: whether to do updates - None: get the current update state - """ - if updates is None: - p = getattr(self, '_highest_parent_', None) - if p is not None: - self._updates = p._updates - return self._updates - assert isinstance(updates, bool), "updates are either on (True) or off (False)" - p = getattr(self, '_highest_parent_', None) - if p is not None: - p._updates = updates - else: - self._updates = updates - self.trigger_update() - - def toggle_update(self): - self.update_model(not self.update()) - - def trigger_update(self, trigger_parent=True): - """ - Update the model from the current state. - Make sure that updates are on, otherwise this - method will do nothing - - :param bool trigger_parent: Whether to trigger the parent, after self has updated - """ - if not self.update_model() or (hasattr(self, "_in_init_") and self._in_init_): - #print "Warning: updates are off, updating the model will do nothing" - return - self._trigger_params_changed(trigger_parent) - - def add_observer(self, observer, callble, priority=0): - """ - Add an observer `observer` with the callback `callble` - and priority `priority` to this observers list. - """ - self.observers.add(priority, observer, callble) - - def remove_observer(self, observer, callble=None): - """ - Either (if callble is None) remove all callables, - which were added alongside observer, - or remove callable `callble` which was added alongside - the observer `observer`. - """ - to_remove = [] - for poc in self.observers: - _, obs, clble = poc - if callble is not None: - if (obs is observer) and (callble == clble): - to_remove.append(poc) - else: - if obs is observer: - to_remove.append(poc) - for r in to_remove: - self.observers.remove(*r) - - def notify_observers(self, which=None, min_priority=None): - """ - Notifies all observers. Which is the element, which kicked off this - notification loop. The first argument will be self, the second `which`. - - NOTE: notifies only observers with priority p > min_priority! - ^^^^^^^^^^^^^^^^ - :param min_priority: only notify observers with priority > min_priority - if min_priority is None, notify all observers in order - """ - if not self.update_model(): - return - if which is None: - which = self - if min_priority is None: - [callble(self, which=which) for _, _, callble in self.observers] - else: - for p, _, callble in self.observers: - if p <= min_priority: - break - callble(self, which=which) class Parentable(object): """ @@ -307,11 +199,11 @@ class Gradcheckable(Pickleable, Parentable): :param float step: the stepsize for the numerical three point gradient estimate. :param float tolerance: the tolerance for the gradient ratio or difference. :param float df_tolerance: the tolerance for df_tolerance - + Note:- - The *dF_ratio* indicates the limit of accuracy of numerical gradients. 
- If it is too small, e.g., smaller than 1e-12, the numerical gradients - are usually not accurate enough for the tests (shown with blue). + The *dF_ratio* indicates the limit of accuracy of numerical gradients. + If it is too small, e.g., smaller than 1e-12, the numerical gradients + are usually not accurate enough for the tests (shown with blue). """ if self.has_parent(): return self._highest_parent_._checkgrad(self, verbose=verbose, step=step, tolerance=tolerance, df_tolerance=df_tolerance) @@ -363,7 +255,7 @@ class Nameable(Gradcheckable): return adjust(self.name) -class Indexable(Nameable, Observable): +class Indexable(Nameable, Updateable): """ Make an object constrainable with Priors and Transformations. TODO: Mappings!! @@ -726,7 +618,7 @@ class OptimizationHandlable(Indexable): fixes[self.constraints[__fixed__]] = FIXED return self._optimizer_copy_[np.logical_and(fixes, self._highest_parent_.tie.getTieFlag(self))] elif self._has_fixes(): - return self._optimizer_copy_[self._fixes_] + return self._optimizer_copy_[self._fixes_] self._optimizer_copy_transformed = True @@ -757,7 +649,7 @@ class OptimizationHandlable(Indexable): #self._highest_parent_.tie.propagate_val() self._optimizer_copy_transformed = False - self._trigger_params_changed() + self.trigger_update() def _get_params_transformed(self): raise DeprecationWarning, "_get|set_params{_optimizer_copy_transformed} is deprecated, use self.optimizer array insetad!" diff --git a/GPy/core/parameterization/updateable.py b/GPy/core/parameterization/updateable.py new file mode 100644 index 00000000..daf07d3c --- /dev/null +++ b/GPy/core/parameterization/updateable.py @@ -0,0 +1,63 @@ +''' +Created on 11 Nov 2014 + +@author: maxz +''' +from observable import Observable + + +class Updateable(Observable): + """ + A model can be updated or not. + Make sure updates can be switched on and off. + """ + _updates = True + def __init__(self, *args, **kwargs): + super(Updateable, self).__init__(*args, **kwargs) + + @property + def updates(self): + raise DeprecationWarning("updates is now a function, see update(True|False|None)") + + @updates.setter + def updates(self, ups): + raise DeprecationWarning("updates is now a function, see update(True|False|None)") + + def update_model(self, updates=None): + """ + Get or set, whether automatic updates are performed. When updates are + off, the model might be in a non-working state. To make the model work + turn updates on again. + + :param bool|None updates: + + bool: whether to do updates + None: get the current update state + """ + if updates is None: + p = getattr(self, '_highest_parent_', None) + if p is not None: + self._updates = p._updates + return self._updates + assert isinstance(updates, bool), "updates are either on (True) or off (False)" + p = getattr(self, '_highest_parent_', None) + if p is not None: + p._updates = updates + self._updates = updates + self.trigger_update() + + def toggle_update(self): + self.update_model(not self.update_model()) + + def trigger_update(self, trigger_parent=True): + """ + Update the model from the current state. 
+ Make sure that updates are on, otherwise this + method will do nothing + + :param bool trigger_parent: Whether to trigger the parent, after self has updated + """ + if not self.update_model() or (hasattr(self, "_in_init_") and self._in_init_): + #print "Warning: updates are off, updating the model will do nothing" + return + self._trigger_params_changed(trigger_parent) diff --git a/GPy/util/caching.py b/GPy/util/caching.py index 85c4caa8..6e954fc7 100644 --- a/GPy/util/caching.py +++ b/GPy/util/caching.py @@ -1,5 +1,5 @@ -from ..core.parameterization.parameter_core import Observable -import collections, weakref, logging +from ..core.parameterization.observable import Observable +import collections, weakref class Cacher(object): def __init__(self, operation, limit=5, ignore_args=(), force_kwargs=()): From 5736caa8fd4fa0eb5435a9e6d11ea0a165abc17e Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Tue, 11 Nov 2014 15:41:35 +0000 Subject: [PATCH 5/9] add the function of saving all the parameters into a HDF5 file --- GPy/core/parameterization/parameter_core.py | 25 +++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index 4048fbae..a8f11606 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -1009,3 +1009,28 @@ class Parameterizable(OptimizationHandlable): updates get passed through. See :py:function:``GPy.core.param.Observable.add_observer`` """ pass + + def save_params_H5(self, filename): + """ + Save all the model parameters into a HDF5 file. + """ + from . import Param + from ...util.misc import param_to_array + def gather_params(self, plist): + if isinstance(self,Param): + plist.append(self) + plist = [] + self.traverse(gather_params, plist) + names = self.parameter_names(adjust_for_printing=True) + try: + import h5py + f = h5py.File(filename,'w') + for p,n in zip(plist,names): + n = n.replace('.','_') + p = param_to_array(p) + d = f.create_dataset(n,p.shape,dtype=p.dtype) + d[:] = p + f.close() + except: + raise 'Fails to write the parameters into a HDF5 file!' + From 0018b5e583f9b8ad661ab0650e3577489ee655d9 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Tue, 11 Nov 2014 15:44:50 +0000 Subject: [PATCH 6/9] rename the save_params_H5 function to be a general function save which can potentially support other file format --- GPy/core/parameterization/parameter_core.py | 27 +++++++++++---------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index a8f11606..6add95b0 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -1010,9 +1010,9 @@ class Parameterizable(OptimizationHandlable): """ pass - def save_params_H5(self, filename): + def save(self, filename, ftype='HDF5'): """ - Save all the model parameters into a HDF5 file. + Save all the model parameters into a file (HDF5 by default). """ from . 
import Param from ...util.misc import param_to_array @@ -1022,15 +1022,16 @@ class Parameterizable(OptimizationHandlable): plist = [] self.traverse(gather_params, plist) names = self.parameter_names(adjust_for_printing=True) - try: - import h5py - f = h5py.File(filename,'w') - for p,n in zip(plist,names): - n = n.replace('.','_') - p = param_to_array(p) - d = f.create_dataset(n,p.shape,dtype=p.dtype) - d[:] = p - f.close() - except: - raise 'Fails to write the parameters into a HDF5 file!' + if ftype=='HDF5': + try: + import h5py + f = h5py.File(filename,'w') + for p,n in zip(plist,names): + n = n.replace('.','_') + p = param_to_array(p) + d = f.create_dataset(n,p.shape,dtype=p.dtype) + d[:] = p + f.close() + except: + raise 'Fails to write the parameters into a HDF5 file!' From a5c3e88f83d618b9cd8bd828b1acafe6b0743ff8 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Wed, 12 Nov 2014 11:34:44 +0000 Subject: [PATCH 7/9] [MRD] init and sim nicer now --- GPy/examples/dimensionality_reduction.py | 88 ++++++++++++------------ GPy/models/bayesian_gplvm_minibatch.py | 13 ++-- GPy/models/mrd.py | 31 ++++----- GPy/models/sparse_gp_minibatch.py | 1 + 4 files changed, 67 insertions(+), 66 deletions(-) diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index 1da855bc..dc0b3dea 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -2,7 +2,7 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as _np -#default_seed = _np.random.seed(123344) +# default_seed = _np.random.seed(123344) def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan=False): """ @@ -28,7 +28,7 @@ def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, (output_dim,)).T # k = GPy.kern.RBF_inv(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim) - #k = GPy.kern.linear(input_dim)# + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001) + # k = GPy.kern.linear(input_dim)# + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001) # k = GPy.kern.RBF(input_dim, ARD = False) + GPy.kern.white(input_dim, 0.00001) # k = GPy.kern.RBF(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.RBF(input_dim, .3, _np.ones(input_dim) * .2, ARD=True) # k = GPy.kern.RBF(input_dim, .5, 2., ARD=0) + GPy.kern.RBF(input_dim, .3, .2, ARD=0) @@ -40,30 +40,30 @@ def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan if nan: m.inference_method = GPy.inference.latent_function_inference.var_dtc.VarDTCMissingData() - m.Y[_np.random.binomial(1,p,size=(Y.shape)).astype(bool)] = _np.nan + m.Y[_np.random.binomial(1, p, size=(Y.shape)).astype(bool)] = _np.nan m.parameters_changed() #=========================================================================== # randomly obstruct data with percentage p #=========================================================================== - #m2 = GPy.models.BayesianGPLVMWithMissingData(Y_obstruct, input_dim, kernel=k, num_inducing=num_inducing) - #m.lengthscales = lengthscales + # m2 = GPy.models.BayesianGPLVMWithMissingData(Y_obstruct, input_dim, kernel=k, num_inducing=num_inducing) + # m.lengthscales = lengthscales if plot: import matplotlib.pyplot as pb m.plot() pb.title('PCA initialisation') - #m2.plot() - #pb.title('PCA initialisation') + # m2.plot() + # pb.title('PCA initialisation') if optimize: 
m.optimize('scg', messages=verbose) - #m2.optimize('scg', messages=verbose) + # m2.optimize('scg', messages=verbose) if plot: m.plot() pb.title('After optimisation') - #m2.plot() - #pb.title('After optimisation') + # m2.plot() + # pb.title('After optimisation') return m @@ -169,7 +169,7 @@ def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, data = GPy.util.datasets.oil() - kernel = GPy.kern.RBF(Q, 1., 1./_np.random.uniform(0,1,(Q,)), ARD=True)# + GPy.kern.Bias(Q, _np.exp(-2)) + kernel = GPy.kern.RBF(Q, 1., 1. / _np.random.uniform(0, 1, (Q,)), ARD=True) # + GPy.kern.Bias(Q, _np.exp(-2)) Y = data['X'][:N] m = GPy.models.BayesianGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing, **k) m.data_labels = data['Y'][:N].argmax(axis=1) @@ -180,8 +180,8 @@ def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, if plot: fig, (latent_axes, sense_axes) = plt.subplots(1, 2) m.plot_latent(ax=latent_axes, labels=m.data_labels) - data_show = GPy.plotting.matplot_dep.visualize.vector_show((m.Y[0,:])) - lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean.values[0:1,:], # @UnusedVariable + data_show = GPy.plotting.matplot_dep.visualize.vector_show((m.Y[0, :])) + lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean.values[0:1, :], # @UnusedVariable m, data_show, latent_axes=latent_axes, sense_axes=sense_axes, labels=m.data_labels) raw_input('Press enter to finish') plt.close(fig) @@ -195,7 +195,7 @@ def ssgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40 _np.random.seed(0) data = pods.datasets.oil() - kernel = GPy.kern.RBF(Q, 1., 1./_np.random.uniform(0,1,(Q,)), ARD=True)# + GPy.kern.Bias(Q, _np.exp(-2)) + kernel = GPy.kern.RBF(Q, 1., 1. / _np.random.uniform(0, 1, (Q,)), ARD=True) # + GPy.kern.Bias(Q, _np.exp(-2)) Y = data['X'][:N] m = GPy.models.SSGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing, **k) m.data_labels = data['Y'][:N].argmax(axis=1) @@ -206,8 +206,8 @@ def ssgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40 if plot: fig, (latent_axes, sense_axes) = plt.subplots(1, 2) m.plot_latent(ax=latent_axes, labels=m.data_labels) - data_show = GPy.plotting.matplot_dep.visualize.vector_show((m.Y[0,:])) - lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean.values[0:1,:], # @UnusedVariable + data_show = GPy.plotting.matplot_dep.visualize.vector_show((m.Y[0, :])) + lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean.values[0:1, :], # @UnusedVariable m, data_show, latent_axes=latent_axes, sense_axes=sense_axes, labels=m.data_labels) raw_input('Press enter to finish') plt.close(fig) @@ -219,10 +219,12 @@ def _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim=False): import numpy as np np.random.seed(3000) - k = GPy.kern.Matern32(Q_signal, 10., lengthscale=1+(np.random.uniform(1,6,Q_signal)), ARD=1) - t = np.c_[[np.linspace(-1,5,N) for _ in range(Q_signal)]].T + k = GPy.kern.Matern32(Q_signal, 1., lengthscale=(np.random.uniform(1, 6, Q_signal)), ARD=1) + for i in range(Q_signal): + k += GPy.kern.PeriodicExponential(1, variance=1., active_dims=[i], period=3., lower=-2, upper=6) + t = np.c_[[np.linspace(-1, 5, N) for _ in range(Q_signal)]].T K = k.K(t) - s2, s1, s3, sS = np.random.multivariate_normal(np.zeros(K.shape[0]), K, size=(4))[:,:,None] + s2, s1, s3, sS = np.random.multivariate_normal(np.zeros(K.shape[0]), K, size=(4))[:, :, None] Y1, Y2, Y3, S1, S2, S3 = _generate_high_dimensional_output(D1, D2, D3, s1, s2, 
s3, sS) @@ -243,7 +245,7 @@ def _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim=False): ax.legend() for i, Y in enumerate(Ylist): ax = fig.add_subplot(2, len(Ylist), len(Ylist) + 1 + i) - ax.imshow(Y, aspect='auto', cmap=cm.gray) # @UndefinedVariable + ax.imshow(Y, aspect='auto', cmap=cm.gray) # @UndefinedVariable ax.set_title("Y{}".format(i + 1)) plt.draw() plt.tight_layout() @@ -255,7 +257,7 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim=False): x = _np.linspace(0, 4 * _np.pi, N)[:, None] s1 = _np.vectorize(lambda x: _np.sin(x)) - s2 = _np.vectorize(lambda x: _np.cos(x)**2) + s2 = _np.vectorize(lambda x: _np.cos(x) ** 2) s3 = _np.vectorize(lambda x:-_np.exp(-_np.cos(2 * x))) sS = _np.vectorize(lambda x: _np.cos(x)) @@ -288,7 +290,7 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim=False): ax.legend() for i, Y in enumerate(Ylist): ax = fig.add_subplot(2, len(Ylist), len(Ylist) + 1 + i) - ax.imshow(Y, aspect='auto', cmap=cm.gray) # @UndefinedVariable + ax.imshow(Y, aspect='auto', cmap=cm.gray) # @UndefinedVariable ax.set_title("Y{}".format(i + 1)) plt.draw() plt.tight_layout() @@ -323,10 +325,10 @@ def bgplvm_simulation(optimize=True, verbose=1, D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 45, 3, 9 _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim) Y = Ylist[0] - k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) - #k = kern.RBF(Q, ARD=True, lengthscale=10.) + k = kern.Linear(Q, ARD=True) # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) + # k = kern.RBF(Q, ARD=True, lengthscale=10.) m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k) - m.X.variance[:] = _np.random.uniform(0,.01,m.X.shape) + m.X.variance[:] = _np.random.uniform(0, .01, m.X.shape) m.likelihood.variance = .1 if optimize: @@ -348,10 +350,10 @@ def ssgplvm_simulation(optimize=True, verbose=1, D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 45, 3, 9 _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim) Y = Ylist[0] - k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) - #k = kern.RBF(Q, ARD=True, lengthscale=10.) + k = kern.Linear(Q, ARD=True) # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) + # k = kern.RBF(Q, ARD=True, lengthscale=10.) 
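    # SSGPLVM places a spike-and-slab prior on the latent space; init="pca"
    # seeds the latent means with PCA, and the posterior binary probabilities
    # switch individual dimensions on or off.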
m = SSGPLVM(Y, Q, init="pca", num_inducing=num_inducing, kernel=k) - m.X.variance[:] = _np.random.uniform(0,.01,m.X.shape) + m.X.variance[:] = _np.random.uniform(0, .01, m.X.shape) m.likelihood.variance = .1 if optimize: @@ -365,7 +367,7 @@ def ssgplvm_simulation(optimize=True, verbose=1, def bgplvm_simulation_missing_data(optimize=True, verbose=1, plot=True, plot_sim=False, - max_iters=2e4, + max_iters=2e4, percent_missing=.1, ): from GPy import kern from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch @@ -373,9 +375,9 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1, D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 400, 3, 4 _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim) Y = Ylist[0] - k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) + k = kern.Linear(Q, ARD=True) # + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) - inan = _np.random.binomial(1, .2, size=Y.shape).astype(bool) # 80% missing data + inan = _np.random.binomial(1, percent_missing, size=Y.shape).astype(bool) # 80% missing data Ymissing = Y.copy() Ymissing[inan] = _np.nan @@ -401,11 +403,11 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw): D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5 _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim) - #Ylist = [Ylist[0]] + # Ylist = [Ylist[0]] k = kern.Linear(Q, ARD=True) m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing, kernel=k, initx="PCA_concat", initz='permute', **kw) - m['.*noise'] = [Y.var()/40. for Y in Ylist] + m['.*noise'] = [Y.var() / 40. for Y in Ylist] if optimize: print "Optimizing Model:" @@ -422,7 +424,7 @@ def mrd_simulation_missing_data(optimize=True, verbose=True, plot=True, plot_sim D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5 _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim) - #Ylist = [Ylist[0]] + # Ylist = [Ylist[0]] k = kern.Linear(Q, ARD=True) inanlist = [] @@ -553,7 +555,7 @@ def bcgplvm_stick(kernel=None, optimize=True, verbose=True, plot=True): data = pods.datasets.osu_run1() # optimize - back_kernel=GPy.kern.RBF(data['Y'].shape[1], lengthscale=5.) + back_kernel = GPy.kern.RBF(data['Y'].shape[1], lengthscale=5.) 
mapping = GPy.mappings.Kernel(X=data['Y'], output_dim=2, kernel=back_kernel) m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping) if optimize: m.optimize(messages=verbose, max_f_eval=10000) @@ -563,7 +565,7 @@ def bcgplvm_stick(kernel=None, optimize=True, verbose=True, plot=True): y = m.likelihood.Y[0, :] data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect']) GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) - #raw_input('Press enter to finish') + # raw_input('Press enter to finish') return m @@ -627,7 +629,7 @@ def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose Y = data['Y'] Y_mean = Y.mean(0) Y_std = Y.std(0) - m = GPy.models.GPLVM((Y-Y_mean)/Y_std, 2) + m = GPy.models.GPLVM((Y - Y_mean) / Y_std, 2) if optimize: m.optimize(messages=verbose, max_f_eval=10000) if plot: @@ -651,17 +653,17 @@ def ssgplvm_simulation_linear(): x = np.empty(Q) dies = np.random.rand(Q) for q in xrange(Q): - if dies[q]'.format(hex(id(i)))) self.Z.gradient[:] += b.full_values['Zgrad'] - grad_dict = b.grad_dict + grad_dict = b.full_values - if isinstance(self.X, VariationalPosterior): - dL_dmean, dL_dS = b.kern.gradients_qX_expectations( - grad_dict['dL_dpsi0'], - grad_dict['dL_dpsi1'], - grad_dict['dL_dpsi2'], - self.Z, self.X) - self.X.mean.gradient += dL_dmean - self.X.variance.gradient += dL_dS - - else: - #gradients wrt kernel - self.X.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'], self.X, self.Z) + self.X.mean.gradient += grad_dict['meangrad'] + self.X.variance.gradient += grad_dict['vargrad'] if isinstance(self.X, VariationalPosterior): # update for the KL divergence @@ -238,7 +235,7 @@ class MRD(BayesianGPLVMMiniBatch): pass if axes is None: ax = fig.add_subplot(1, len(self.bgplvms), i + 1, sharex=sharex_ax, sharey=sharey_ax) - elif isinstance(axes, (tuple, list)): + elif isinstance(axes, (tuple, list, np.ndarray)): ax = axes[i] else: raise ValueError("Need one axes per latent dimension input_dim") @@ -286,7 +283,7 @@ class MRD(BayesianGPLVMMiniBatch): titles = [r'${}$'.format(name) for name in self.names] ymax = reduce(max, [np.ceil(max(g.kern.input_sensitivity())) for g in self.bgplvms]) def plotf(i, g, ax): - ax.set_ylim([0,ymax]) + #ax.set_ylim([0,ymax]) return g.kern.plot_ARD(ax=ax, title=titles[i], *args, **kwargs) fig = self._handle_plotting(fignum, ax, plotf, sharex=sharex, sharey=sharey) return fig diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py index 00929462..ec2e28f5 100644 --- a/GPy/models/sparse_gp_minibatch.py +++ b/GPy/models/sparse_gp_minibatch.py @@ -56,6 +56,7 @@ Created on 3 Nov 2014 raise NotImplementedError, "what to do what to do?" print "defaulting to ", inference_method, "for latent function inference" + self.kl_factr = 1. 
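+        # kl_factr scales the KL(q(X)||p(X)) contribution to the bound;
+        # the default 1. leaves the standard variational objective unchanged.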
From 3c642a5600d8dca0f9ea12c5b3c521f46e09e03d Mon Sep 17 00:00:00 2001
From: Max Zwiessele
Date: Wed, 12 Nov 2014 15:08:38 +0000
Subject: [PATCH 8/9] [MRD] updates and nicer plotting

---
 GPy/__init__.py                                 |  9 +++++++--
 GPy/core/model.py                               | 12 ++++++++----
 GPy/core/parameterization/transformations.py    |  7 +++++++
 GPy/models/bayesian_gplvm_minibatch.py          |  2 ++
 GPy/models/mrd.py                               |  2 +-
 GPy/plotting/matplot_dep/dim_reduction_plots.py |  2 +-
 6 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/GPy/__init__.py b/GPy/__init__.py
index 99a91fe8..5e091170 100644
--- a/GPy/__init__.py
+++ b/GPy/__init__.py
@@ -39,6 +39,11 @@ def load(file_path):
     :param file_name: path/to/file.pickle
     """
     import cPickle as pickle
-    with open(file_path, 'rb') as f:
-        m = pickle.load(f)
+    try:
+        with open(file_path, 'rb') as f:
+            m = pickle.load(f)
+    except Exception: # retry with the pure-Python pickle if cPickle cannot load the file
+        import pickle as pickle
+        with open(file_path, 'rb') as f:
+            m = pickle.load(f)
     return m

diff --git a/GPy/core/model.py b/GPy/core/model.py
index 029b8fc8..ac5a9732 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -238,6 +238,10 @@ class Model(Parameterized):
         if self.size == 0:
             print 'nothing to optimize'

+        if not self.update_model():
+            print "setting updates on again"
+            self.update_model(True)
+
         if start == None:
             start = self.optimizer_array

@@ -279,10 +283,10 @@ class Model(Parameterized):
         Note:-
            The gradient is considered correct if the ratio of the analytical and numerical gradients is within a small tolerance of unity.
-
-           The *dF_ratio* indicates the limit of numerical accuracy of numerical gradients.
-           If it is too small, e.g., smaller than 1e-12, the numerical gradients are usually
-           not accurate enough for the tests (shown with blue).
+
+           The *dF_ratio* indicates the limit of numerical accuracy of numerical gradients.
+           If it is too small, e.g., smaller than 1e-12, the numerical gradients are usually
+           not accurate enough for the tests (shown with blue).
         """
         x = self.optimizer_array.copy()

diff --git a/GPy/core/parameterization/transformations.py b/GPy/core/parameterization/transformations.py
index 01b0f324..291076a1 100644
--- a/GPy/core/parameterization/transformations.py
+++ b/GPy/core/parameterization/transformations.py
@@ -131,6 +131,13 @@ class NormalTheta(Transformation):
     def __str__(self):
         return "theta"

+    def __getstate__(self):
+        return [self.mu_indices, self.var_indices]
+
+    def __setstate__(self, state):
+        self.mu_indices = state[0]
+        self.var_indices = state[1]
+
 class NormalNaturalAntti(NormalTheta):
     _instances = []
     _logexp = Logexp()
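Aside: the __getstate__/__setstate__ pair above lets the transformation survive pickling by storing only its two index arrays. The same pattern in a self-contained toy (illustrative class, not GPy code):

    import pickle
    import numpy as np

    class IndexedTransform(object):
        def __init__(self, mu_indices, var_indices):
            self.mu_indices = mu_indices
            self.var_indices = var_indices

        def __getstate__(self):
            # pickle only what is needed to rebuild the object
            return [self.mu_indices, self.var_indices]

        def __setstate__(self, state):
            self.mu_indices = state[0]
            self.var_indices = state[1]

    t = pickle.loads(pickle.dumps(IndexedTransform(np.arange(3), np.arange(3, 6))))
    assert (t.var_indices == np.arange(3, 6)).all()  # round trip preserved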
diff --git a/GPy/models/bayesian_gplvm_minibatch.py b/GPy/models/bayesian_gplvm_minibatch.py
index 5b7d3162..80abba59 100644
--- a/GPy/models/bayesian_gplvm_minibatch.py
+++ b/GPy/models/bayesian_gplvm_minibatch.py
@@ -54,6 +54,8 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
             self.variational_prior = NormalPrior()
             X = NormalPosterior(X, X_variance)

+        self.kl_factr = 1.
+
         if inference_method is None:
             from ..inference.latent_function_inference.var_dtc import VarDTC
             self.logger.debug("creating inference_method var_dtc")
diff --git a/GPy/models/mrd.py b/GPy/models/mrd.py
index 91d34aa2..645cdf88 100644
--- a/GPy/models/mrd.py
+++ b/GPy/models/mrd.py
@@ -100,7 +100,7 @@ class MRD(BayesianGPLVMMiniBatch):
         self.logger.info("building kernels")
         if kernel is None:
             from ..kern import RBF
-            kernels = [RBF(input_dim, ARD=1, lengthscale=fracs[i]) for i in range(len(Ylist))]
+            kernels = [RBF(input_dim, ARD=1, lengthscale=1./fracs[i]) for i in range(len(Ylist))]
         elif isinstance(kernel, Kern):
             kernels = []
             for i in range(len(Ylist)):
diff --git a/GPy/plotting/matplot_dep/dim_reduction_plots.py b/GPy/plotting/matplot_dep/dim_reduction_plots.py
index 7ec0b59d..25a1166f 100644
--- a/GPy/plotting/matplot_dep/dim_reduction_plots.py
+++ b/GPy/plotting/matplot_dep/dim_reduction_plots.py
@@ -162,7 +162,7 @@ def plot_latent(model, labels=None, which_indices=None,
         else:
             x = X[index, input_1]
             y = X[index, input_2]
-        ax.scatter(x, y, marker=m, s=s, c=Tango.nextMedium(), label=this_label, linewidth=.5, edgecolor='k', alpha=.9)
+        ax.scatter(x, y, marker=m, s=s, c=Tango.nextMedium(), label=this_label, linewidth=.2, edgecolor='k', alpha=.9)

     ax.set_xlabel('latent dimension %i' % input_1)
     ax.set_ylabel('latent dimension %i' % input_2)

From 7f7b0da7b9e6dd873323781c2497fc166a57c74c Mon Sep 17 00:00:00 2001
From: Zhenwen Dai
Date: Wed, 12 Nov 2014 20:19:56 +0000
Subject: [PATCH 9/9] spike and slab binary variable numerical enhancement

---
 GPy/core/parameterization/variational.py | 30 ++++++++++++++++++++----
 GPy/kern/_src/psi_comp/ssrbf_psi_comp.py | 20 +++++++---------
 GPy/models/ss_gplvm.py                   |  5 +---
 3 files changed, 34 insertions(+), 21 deletions(-)
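Aside: the core numerical device of this patch is to store the spike-and-slab probabilities as unconstrained logits and to evaluate sigmoid(x), sigmoid(-x) and their logarithms without overflow: np.exp overflows float64 near 709, and log sigmoid(x) tends to x as x goes to minus infinity, so outside a clipped range the logit itself is already the answer. A standalone sketch of the technique (illustrative helpers, not the GPy implementation; the actual cut-offs appear in gamma_probabilities and gamma_log_prob below):

    import numpy as np

    def sigmoid_pair(x):
        # (sigmoid(x), sigmoid(-x)) without overflowing np.exp
        p = np.zeros_like(x)
        m = x > -710.
        p[m] = 1. / (1. + np.exp(-x[m]))  # saturates to exactly 0 elsewhere
        p1 = np.zeros_like(x)
        m1 = x < 710.
        p1[m1] = 1. / (1. + np.exp(x[m1]))
        return p, p1

    def log_sigmoid_pair(x):
        # (log sigmoid(x), log sigmoid(-x)); log sigmoid(x) ~ x for x << 0
        lp = x.copy()
        m = x > -40.
        lp[m] = -np.log1p(np.exp(-x[m]))
        lp1 = -x
        m1 = lp1 > -40.
        lp1[m1] = -np.log1p(np.exp(-lp1[m1]))
        return lp, lp1

    x = np.array([-800., -5., 0., 5., 800.])
    p, p1 = sigmoid_pair(x)
    lp, lp1 = log_sigmoid_pair(x)
    assert np.allclose(p + p1, 1.)  # the pair always sums to one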
diff --git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py
index e2a24008..bb7678ff 100644
--- a/GPy/core/parameterization/variational.py
+++ b/GPy/core/parameterization/variational.py
@@ -8,6 +8,8 @@ import numpy as np
 from parameterized import Parameterized
 from param import Param
 from transformations import Logexp, Logistic,__fixed__
+from GPy.util.misc import param_to_array
+from GPy.util.caching import Cache_this

 class VariationalPrior(Parameterized):
     def __init__(self, name='latent space', **kw):
@@ -48,7 +50,8 @@ class SpikeAndSlabPrior(VariationalPrior):
     def KL_divergence(self, variational_posterior):
         mu = variational_posterior.mean
         S = variational_posterior.variance
-        gamma = variational_posterior.binary_prob
+        gamma,gamma1 = variational_posterior.gamma_probabilities()
+        log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
         if len(self.pi.shape)==2:
             idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
             pi = self.pi[idx]
@@ -57,20 +60,21 @@ class SpikeAndSlabPrior(VariationalPrior):

         var_mean = np.square(mu)/self.variance
         var_S = (S/self.variance - np.log(S))
-        var_gamma = (gamma*np.log(gamma/pi)).sum()+((1-gamma)*np.log((1-gamma)/(1-pi))).sum()
+        var_gamma = (gamma*(log_gamma-np.log(pi))).sum()+(gamma1*(log_gamma1-np.log(1-pi))).sum()
         return var_gamma+ (gamma* (np.log(self.variance)-1. +var_mean + var_S)).sum()/2.

     def update_gradients_KL(self, variational_posterior):
         mu = variational_posterior.mean
         S = variational_posterior.variance
-        gamma = variational_posterior.binary_prob
+        gamma,gamma1 = variational_posterior.gamma_probabilities()
+        log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
         if len(self.pi.shape)==2:
             idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
             pi = self.pi[idx]
         else:
             pi = self.pi
-        gamma.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+        variational_posterior.binary_prob.gradient -= (np.log((1-pi)/pi)+log_gamma-log_gamma1+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.)*gamma*gamma1
         mu.gradient -= gamma*mu/self.variance
         S.gradient -= (1./self.variance - 1./S) * gamma /2.
         if self.learnPi:
@@ -158,8 +162,24 @@ class SpikeAndSlabPosterior(VariationalPosterior):
         binary_prob : the probability of the distribution on the slab part.
         """
         super(SpikeAndSlabPosterior, self).__init__(means, variances, name)
-        self.gamma = Param("binary_prob",binary_prob, Logistic(1e-10,1.-1e-10))
+        self.gamma = Param("binary_prob",binary_prob)
         self.link_parameter(self.gamma)
+
+    @Cache_this(limit=5)
+    def gamma_probabilities(self):
+        prob = np.zeros_like(param_to_array(self.gamma))
+        prob[self.gamma>-710] = 1./(1.+np.exp(-self.gamma[self.gamma>-710]))
+        prob1 = np.zeros_like(param_to_array(self.gamma))
+        prob1[self.gamma<710] = 1./(1.+np.exp(self.gamma[self.gamma<710]))
+        return prob, prob1
+
+    @Cache_this(limit=5)
+    def gamma_log_prob(self):
+        loggamma = param_to_array(self.gamma).copy()
+        loggamma[loggamma>-40] = -np.log1p(np.exp(-loggamma[loggamma>-40]))
+        loggamma1 = -param_to_array(self.gamma)
+        loggamma1[loggamma1>-40] = -np.log1p(np.exp(-loggamma1[loggamma1>-40]))
+        return loggamma,loggamma1

     def set_gradients(self, grad):
         self.mean.gradient, self.variance.gradient, self.gamma.gradient = grad
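Aside: the gradient updates in the kernel code below change because gamma is now a logit. With p = sigmoid(x), the chain rule gives dp/dx = p*(1-p) = gamma*gamma1, which is why update_gradients_KL multiplies by gamma*gamma1 and why d_exp1/gnq-d_exp2/(1.-gnq) becomes d_exp1*g1nq-d_exp2*gnq in the weave kernels (the same expression multiplied through by gnq*g1nq). A quick numerical check of that identity:

    import numpy as np

    def sigmoid(x):
        return 1. / (1. + np.exp(-x))

    x = np.linspace(-4., 4., 9)
    eps = 1e-6
    numeric = (sigmoid(x + eps) - sigmoid(x - eps)) / (2. * eps)
    analytic = sigmoid(x) * sigmoid(-x)  # p * (1 - p)
    assert np.allclose(numeric, analytic, atol=1e-8)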
diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py
index f6a24c86..18a4d751 100644
--- a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py
+++ b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py
@@ -22,14 +22,12 @@ try:
         # _psi1 NxM
         mu = variational_posterior.mean
         S = variational_posterior.variance
-        gamma = variational_posterior.binary_prob

         N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
         l2 = np.square(lengthscale)
         log_denom1 = np.log(S/l2+1)
         log_denom2 = np.log(2*S/l2+1)
-        log_gamma = np.log(gamma)
-        log_gamma1 = np.log(1.-gamma)
+        log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
         variance = float(variance)
         psi0 = np.empty(N)
         psi0[:] = variance
@@ -39,7 +37,6 @@ try:
         from ....util.misc import param_to_array
         S = param_to_array(S)
         mu = param_to_array(mu)
-        gamma = param_to_array(gamma)
         Z = param_to_array(Z)

         support_code = """
@@ -82,7 +79,7 @@ try:
             }
         }
         """
-        weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz)
+        weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz)

         psi2 = psi2n.sum(axis=0)
         return psi0,psi1,psi2,psi2n
@@ -97,13 +94,12 @@ try:

         mu = variational_posterior.mean
         S = variational_posterior.variance
-        gamma = variational_posterior.binary_prob

         N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1]
         l2 = np.square(lengthscale)
         log_denom1 = np.log(S/l2+1)
         log_denom2 = np.log(2*S/l2+1)
-        log_gamma = np.log(gamma)
-        log_gamma1 = np.log(1.-gamma)
+        log_gamma,log_gamma1 = variational_posterior.gamma_log_prob()
+        gamma, gamma1 = variational_posterior.gamma_probabilities()
         variance = float(variance)

         dvar = np.zeros(1)
@@ -117,7 +113,6 @@ try:
         from ....util.misc import param_to_array
         S = param_to_array(S)
         mu = param_to_array(mu)
-        gamma = param_to_array(gamma)
         Z = param_to_array(Z)

         support_code = """
@@ -135,6 +130,7 @@ try:
                     double Zm1q = Z(m1,q);
                     double Zm2q = Z(m2,q);
                     double gnq = gamma(n,q);
+                    double g1nq = gamma1(n,q);
                     double mu_nq = mu(n,q);

                     if(m2==0) {
@@ -160,7 +156,7 @@ try:

                         dmu(n,q) += lpsi1*Zmu*d_exp1/(denom*exp_sum);
                         dS(n,q) += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum)/2.;
-                        dgamma(n,q) += lpsi1*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
+                        dgamma(n,q) += lpsi1*(d_exp1*g1nq-d_exp2*gnq)/exp_sum;
                         dl(q) += lpsi1*((Zmu2_denom+Snq/lq)/denom*d_exp1+Zm1q*Zm1q/(lq*lq)*d_exp2)/(2.*exp_sum);
                         dZ(m1,q) += lpsi1*(-Zmu/denom*d_exp1-Zm1q/lq*d_exp2)/exp_sum;
                     }
@@ -188,7 +184,7 @@ try:

                     dmu(n,q) += -2.*lpsi2*muZhat/denom*d_exp1/exp_sum;
                     dS(n,q) += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum;
-                    dgamma(n,q) += lpsi2*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
+                    dgamma(n,q) += lpsi2*(d_exp1*g1nq-d_exp2*gnq)/exp_sum;
                     dl(q) += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZm1m2*dZm1m2/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum;
                     dZ(m1,q) += 2.*lpsi2*((muZhat/denom-dZm1m2/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum;
                 }
             }
         }
         """
-        weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz)
+        weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','gamma1','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz)

         dl *= 2.*lengthscale
         if not ARD:
diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py
index 04006d84..a61ad2a0 100644
--- a/GPy/models/ss_gplvm.py
+++ b/GPy/models/ss_gplvm.py
@@ -39,10 +39,7 @@ class SSGPLVM(SparseGP_MPI):
             X_variance = np.random.uniform(0,.1,X.shape)

         if Gamma is None:
-            gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation
-            gamma[:] = 0.5 + 0.1 * np.random.randn(X.shape[0], input_dim)
-            gamma[gamma>1.-1e-9] = 1.-1e-9
-            gamma[gamma<1e-9] = 1e-9
+            gamma = np.random.randn(X.shape[0], input_dim)
         else:
             gamma = Gamma.copy()
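Aside: with the new parameterization the gamma array above holds logits, so an unconstrained N(0,1) initialization places the inclusion probabilities softly around 0.5 and the old clipping into (0, 1) is no longer needed. A probability p corresponds to the logit log(p/(1-p)), so a Gamma array supplied in probability space would need mapping through the logit first. An illustration of the correspondence (helper names are made up):

    import numpy as np

    def sigmoid(x):
        return 1. / (1. + np.exp(-x))

    def logit(p):
        return np.log(p) - np.log1p(-p)  # log(p / (1 - p))

    # N(0,1) logits -> probabilities strictly inside (0, 1), centred on 0.5
    gamma = np.random.randn(400, 4)
    probs = sigmoid(gamma)
    assert probs.min() > 0. and probs.max() < 1.

    # round trip between probability and logit space
    p = np.array([.1, .5, .9])
    assert np.allclose(sigmoid(logit(p)), p)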